import os
import json
import csv
import ssl
import time
import urllib.request
import urllib.error
import re

# Target site configuration
# BASE_URL is the site root; the two *_API_URL constants address the REST
# routes through the `index.php?rest_route=` query form, so further query
# parameters are appended with `&` (not `?`) by the functions below.
BASE_URL = "https://geopontsproperties.com"
PROPERTIES_API_URL = f"{BASE_URL}/index.php?rest_route=/wp/v2/properties"
MEDIA_API_URL = f"{BASE_URL}/index.php?rest_route=/wp/v2/media"
# Upper bound on the number of property records main() collects before it stops.
TARGET_COUNT = 300
# Value sent as the `per_page` query parameter on each properties listing request.
PER_PAGE = 50

# Setup SSL context to ignore certificate verification errors
# NOTE(security): with hostname checking off and verify_mode set to CERT_NONE
# the server certificate is not validated at all, so responses fetched through
# this context are not protected against interception or tampering.
# check_hostname is switched off first: the ssl module rejects CERT_NONE while
# hostname checking is still enabled.
ssl_ctx = ssl.create_default_context()
ssl_ctx.check_hostname = False
ssl_ctx.verify_mode = ssl.CERT_NONE

def make_request(url):
    """Fetch *url* with a browser-like User-Agent and return the parsed JSON body.

    The request goes through the module-level ``ssl_ctx`` and uses a
    30-second timeout. This is a best-effort helper: an HTTP error status,
    or any other exception raised while fetching, decoding or parsing, is
    reported on stdout and converted into a ``None`` return value.

    Args:
        url: Absolute URL to GET.

    Returns:
        The decoded JSON value, or ``None`` when the fetch or parse failed.
    """
    browser_headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    }
    request = urllib.request.Request(url, headers=browser_headers)
    try:
        with urllib.request.urlopen(request, context=ssl_ctx, timeout=30) as resp:
            raw_body = resp.read()
        return json.loads(raw_body.decode('utf-8'))
    except urllib.error.HTTPError as http_err:
        # Handled separately so the status code and reason appear in the log.
        print(f"\n[HTTP Error] Failed to fetch {url}: {http_err.code} {http_err.reason}")
    except Exception as err:
        print(f"\n[Error] Failed to fetch {url}: {err}")
    # Both failure paths end up here.
    return None

def clean_html(html_text):
    """Convert an HTML fragment to plain text while keeping its line structure.

    List items become ``- `` bullets, each on its own line; paragraph ends
    and ``<br>`` tags become newlines; every other tag is dropped; HTML
    entities are decoded; runs of blank lines collapse to a single blank line.
    Tag matching is case-insensitive and tolerates attributes on ``<li>`` and
    ``<br>``.

    Args:
        html_text: HTML markup, or a falsy value (``None`` / ``""``).

    Returns:
        The plain-text rendering with surrounding whitespace stripped, or
        ``""`` for falsy input.
    """
    if not html_text:
        return ""
    import html

    # Opening list items become bullets. `\b` keeps tags such as <link> from
    # matching, and `[^>]*` accepts attributes (<li class="...">).
    text = re.sub(r'<li\b[^>]*>', '- ', html_text, flags=re.IGNORECASE)
    # Paragraph ends, list-item ends and line breaks become newlines. A closing
    # </li> also swallows the rest of its own source line, so markup that
    # already has one item per line is not turned into a double-spaced list,
    # while items written back-to-back still end up on separate lines.
    text = re.sub(
        r'</p\s*>|</li\s*>[ \t]*\r?\n?|<br\b[^>]*>',
        '\n',
        text,
        flags=re.IGNORECASE,
    )
    # Remove all other HTML tags
    text = re.sub(r'<[^>]*>', '', text)
    # Decode entities only after tag stripping, so escaped markup such as
    # "&lt;b&gt;" survives as literal text instead of being removed as a tag.
    text = html.unescape(text)
    # Collapse any run of blank (or whitespace-only) lines into one blank line
    text = re.sub(r'\n\s*\n+', '\n\n', text)
    return text.strip()

def resolve_images(property_id, featured_media_id):
    """Collect the image URLs for one property, including its featured image.

    First requests the media items whose parent is *property_id* (a single
    page of up to 100 items) and keeps the ``source_url`` of every item whose
    MIME type starts with ``image/``. If *featured_media_id* is a positive ID
    that was not among those kept items, it is fetched individually and, when
    its URL is not already present, placed at the front of the list.

    Args:
        property_id: ID of the property whose attachments are listed.
        featured_media_id: Media ID of the featured image; ``0`` or ``None``
            means there is none.

    Returns:
        A list of image URLs; empty when nothing could be resolved.
    """
    url = f"{MEDIA_API_URL}&parent={property_id}&per_page=100"
    media_items = make_request(url)
    if not isinstance(media_items, list):
        # make_request yields None on failure, and a JSON object (for example
        # an error payload) is not a listing either; treat both as "no media".
        media_items = []

    images = []
    listed_ids = set()
    for item in media_items:
        if not isinstance(item, dict) or 'source_url' not in item:
            continue
        # `or ''` also covers an explicit null mime_type in the payload.
        if (item.get('mime_type') or '').startswith('image/'):
            images.append(item['source_url'])
            listed_ids.add(item.get('id'))

    # Fetch the featured media separately only when the parent listing did not
    # already supply it; this saves one request (and one sleep) per property
    # in the common case without changing the resulting list.
    needs_featured_fetch = (
        bool(featured_media_id)
        and featured_media_id > 0
        and featured_media_id not in listed_ids
    )
    if needs_featured_fetch:
        featured_url = f"{MEDIA_API_URL}/{featured_media_id}"
        # Wait slightly to prevent hammering
        time.sleep(0.1)
        feat_media = make_request(featured_url)
        if isinstance(feat_media, dict) and 'source_url' in feat_media:
            feat_img = feat_media['source_url']
            if feat_img not in images:
                images.insert(0, feat_img)  # Insert at front as featured

    return images

def _first_meta_value(meta, key):
    """Return the first value stored under *key* in a property-meta mapping, or ""."""
    if not isinstance(meta, dict):
        # An empty meta collection may arrive as a JSON array rather than an
        # object (presumably a PHP empty-array artefact); treat it as "no meta".
        return ""
    val = meta.get(key)
    if isinstance(val, (list, tuple)):
        return val[0] if val else ""
    # A bare scalar is returned whole; indexing it would yield only the first
    # character of a string value.
    return val if val else ""

def _parse_property(item):
    """Flatten one property object from the listing response into an output record.

    Reads the rendered title and content, selected ``property_meta`` keys, and
    the embedded taxonomy terms and author. The ``images`` entry is created
    empty so the key keeps its position in the record; the caller fills it in.

    Args:
        item: One decoded JSON object from the properties listing.

    Returns:
        A dict with the record fields in output order.
    """
    import html  # function-scope import, as clean_html does in this file

    # The REST API delivers rendered titles and term names HTML-escaped, so
    # entities are decoded before they go into the plain-text output.
    title = html.unescape((item.get('title') or {}).get('rendered', 'Untitled Property'))
    content_html = (item.get('content') or {}).get('rendered', '')

    # Extract metadata
    meta = item.get('property_meta', {})
    price_raw = (
        _first_meta_value(meta, 'fave_property_price')
        or _first_meta_value(meta, 'property_price')
    )
    address = _first_meta_value(meta, 'fave_property_address')

    # Map embedded taxonomy terms
    embedded = item.get('_embedded') or {}
    taxonomies = {
        'property_type': [],
        'property_status': [],
        'property_feature': [],
        'property_state': [],
        'property_area': [],
        'property_city': [],
    }
    for term_group in embedded.get('wp:term', []):
        for term in term_group:
            if not isinstance(term, dict):
                continue
            tax = term.get('taxonomy')
            name = term.get('name')
            if tax in taxonomies and name:
                taxonomies[tax].append(html.unescape(name))

    def first_term(tax, default=""):
        names = taxonomies[tax]
        return names[0] if names else default

    # Address resolution fallback: explicit address, else "area, city, state",
    # else a fixed default.
    location = address
    if not location:
        loc_parts = [
            part
            for part in (
                first_term('property_area'),
                first_term('property_city'),
                first_term('property_state'),
            )
            if part
        ]
        location = ", ".join(loc_parts) if loc_parts else "Lagos, Nigeria"

    # Resolve author/agent name
    author_name = "Geoponts Properties"
    author_data = embedded.get('author', [])
    if author_data and isinstance(author_data[0], dict):
        author_name = author_data[0].get('name', 'Geoponts Properties')

    return {
        'id': item.get('id'),
        'title': title,
        'slug': item.get('slug', ''),
        'link': item.get('link', ''),
        'price': price_raw,
        'location': location,
        'property_type': first_term('property_type', "Residential"),
        'status': first_term('property_status', "For Sale"),
        'bedrooms': _first_meta_value(meta, 'fave_property_bedrooms'),
        'bathrooms': _first_meta_value(meta, 'fave_property_bathrooms'),
        'size_sqft': _first_meta_value(meta, 'fave_property_size'),
        'latitude': _first_meta_value(meta, 'houzez_geolocation_lat'),
        'longitude': _first_meta_value(meta, 'houzez_geolocation_long'),
        'features': taxonomies['property_feature'],
        'images': [],
        'description': clean_html(content_html),
        'description_html': content_html,
        'agent_name': author_name,
        'agent_phone': _first_meta_value(meta, 'houzez_agent_mobile'),
        'agent_email': _first_meta_value(meta, 'houzez_agent_email'),
    }

def _write_json(properties, json_path):
    """Write the full property records to *json_path* as indented UTF-8 JSON."""
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(properties, f, indent=2, ensure_ascii=False)

def _write_csv(properties, csv_path):
    """Write a flat subset of each record to *csv_path*; list fields are comma-joined."""
    csv_headers = [
        'id', 'title', 'price', 'link', 'location', 'property_type', 'status',
        'bedrooms', 'bathrooms', 'size_sqft', 'features', 'images', 'agent_name',
        'agent_phone', 'agent_email', 'description'
    ]
    list_columns = {'features', 'images'}
    with open(csv_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(csv_headers)
        for p in properties:
            writer.writerow([
                ", ".join(p[col]) if col in list_columns else p[col]
                for col in csv_headers
            ])

def main():
    """Scrape up to TARGET_COUNT properties and save them as JSON and CSV.

    Pages through the properties listing (PER_PAGE items per request, with
    embedded terms and author), builds one record per property, resolves its
    image URLs, and stops when the target is reached or a page comes back
    empty or failed. The results are written to
    ``backend/data/scraped_properties.json`` and ``.csv``; the directory is
    created if it does not exist. Progress is printed to stdout.
    """
    print("Starting Geoponts Properties Scraper...")
    print(f"Target count: {TARGET_COUNT} properties")

    properties = []
    page = 1

    while len(properties) < TARGET_COUNT:
        print(f"\nFetching page {page} of properties...")
        url = f"{PROPERTIES_API_URL}&per_page={PER_PAGE}&page={page}&_embed=1"
        batch = make_request(url)

        # None (request failed), an empty list, or a non-list payload all end the run.
        if not batch or not isinstance(batch, list):
            print("No more properties returned from API. Ending fetch loop.")
            break

        print(f"Received {len(batch)} properties on page {page}.")

        for item in batch:
            if len(properties) >= TARGET_COUNT:
                break
            if not isinstance(item, dict):
                continue

            record = _parse_property(item)
            prop_id = record['id']
            title = record['title']

            # Fetch listing images
            print(f"[{len(properties) + 1}/{TARGET_COUNT}] Resolving images for ID {prop_id}: '{title}'...", end="", flush=True)
            record['images'] = resolve_images(prop_id, item.get('featured_media', 0))
            print(f" resolved {len(record['images'])} images.")

            # Add small sleep to be polite
            time.sleep(0.15)

            properties.append(record)

        page += 1
        # Sleep between pages
        time.sleep(0.5)

    print(f"\nSuccessfully scraped {len(properties)} properties.")

    json_path = "backend/data/scraped_properties.json"
    csv_path = "backend/data/scraped_properties.csv"
    # Create the output directory up front so a long scrape is not lost to a
    # FileNotFoundError on a fresh checkout.
    os.makedirs(os.path.dirname(json_path), exist_ok=True)

    # Save as JSON
    print(f"Writing data to {json_path}...")
    _write_json(properties, json_path)

    # Save as CSV
    print(f"Writing data to {csv_path}...")
    _write_csv(properties, csv_path)

    print("Scraping and compilation completed successfully!")

if __name__ == "__main__":
    main()
