"""Download documents from URLs, or convert webpages to PDF, with retry logic."""

import os
import random
import re
import subprocess
import time
from urllib.parse import unquote, urlparse

import pdfkit
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry


def download_file_main(url, save_dir='downloads', convert_webpage_to_pdf=True, max_retries=3, retry_delay=2):
    """
    Download a file from URL or convert webpage to PDF with retry mechanism.
    Windows-compatible version with wkhtmltopdf path configuration.
    
    Args:
        url (str): URL to download from
        save_dir (str): Directory to save files
        convert_webpage_to_pdf (bool): Whether to convert webpages to PDF
        max_retries (int): Maximum number of retry attempts
        retry_delay (int): Base delay between retries in seconds (uses exponential backoff)
    """
    
    # Skip if not a web URL
    if not url.startswith(('http://', 'https://', 'www.')):
        print(f"Skipping non-web URL: {url}")
        return None

    # Skip known GeM login/authenticated endpoints that cannot be fetched anonymously
    normalized_url = url.strip().lower()
    if (normalized_url == 'https://sso.gem.gov.in/arxsso/oauth/dologin'
            or normalized_url.startswith('https://bidplus.gem.gov.in/bidding/downloadomppdfile/')):
        print(f"Skipping login URL: {url}")
        return None
    
    # Add https:// if URL starts with www.
    if url.startswith('www.'):
        url = 'https://' + url
    
    # Create the save directory if it doesn't exist
    os.makedirs(save_dir, exist_ok=True)
    
    # Create a requests session with a transport-level retry strategy
    # (note: those low-level retries stack with the manual retry loop below)
    session = _create_retry_session(max_retries)
    
    for attempt in range(max_retries + 1):
        try:
            if attempt > 0:
                wait_time = retry_delay * (2 ** (attempt - 1))  # Exponential backoff
                print(f"Retrying in {wait_time} seconds... (Attempt {attempt + 1}/{max_retries + 1})")
                time.sleep(wait_time)
            
            print(f"Accessing: {url}" + (f" (Attempt {attempt + 1})" if attempt > 0 else ""))
            
            # First, make a HEAD request to check content type
            try:
                head_response = session.head(url, allow_redirects=True, timeout=(15, 30))
                content_type = head_response.headers.get('content-type', '').lower()
                head_success = head_response.status_code == 200
            except Exception as e:
                print(f"HEAD request failed: {e}")
                head_success = False
                content_type = ''
            
            # Make a GET request
            response = session.get(url, stream=True, timeout=(15, 30))
            response.raise_for_status()
            
            # Update content type if HEAD failed
            if not head_success:
                content_type = response.headers.get('content-type', '').lower()
            
            # Check if it's a webpage (HTML content). Sniff only the first
            # chunk so large binary responses are not read fully into memory.
            is_webpage = ('text/html' in content_type or
                          'application/xhtml' in content_type)
            if not is_webpage:
                first_chunk = next(response.iter_content(chunk_size=1024), b'')
                is_webpage = first_chunk.strip().lower().startswith((b'<!doctype', b'<html'))

            # Neither branch below reuses this response, so release the connection
            response.close()

            if is_webpage and convert_webpage_to_pdf:
                return _convert_webpage_to_pdf(url, save_dir)
            else:
                return download_file(url, save_dir)
                
        except requests.exceptions.ConnectTimeout as e:
            print(f"Connection timeout on attempt {attempt + 1}: {e}")
            if attempt == max_retries:
                print(f"Failed to access {url} after {max_retries + 1} attempts due to connection timeout")
                return None
                
        except requests.exceptions.ReadTimeout as e:
            print(f"Read timeout on attempt {attempt + 1}: {e}")
            if attempt == max_retries:
                print(f"Failed to access {url} after {max_retries + 1} attempts due to read timeout")
                return None
                
        except requests.exceptions.ConnectionError as e:
            print(f"Connection error on attempt {attempt + 1}: {e}")
            if attempt == max_retries:
                print(f"Failed to access {url} after {max_retries + 1} attempts due to connection error")
                return None
                
        except requests.exceptions.HTTPError as e:
            print(f"HTTP error on attempt {attempt + 1}: {e}")
            # Don't retry client errors that won't resolve on their own
            if e.response is not None and e.response.status_code in (401, 403, 404):
                print(f"Not retrying for HTTP {e.response.status_code} error")
                return None
            if attempt == max_retries:
                print(f"Failed to access {url} after {max_retries + 1} attempts due to HTTP error")
                return None
                
        except requests.exceptions.RequestException as e:
            print(f"Request error on attempt {attempt + 1}: {e}")
            if attempt == max_retries:
                print(f"Failed to access {url} after {max_retries + 1} attempts due to request error")
                return None
                
        except Exception as e:
            print(f"Unexpected error on attempt {attempt + 1}: {e}")
            if attempt == max_retries:
                print(f"Failed to process {url} after {max_retries + 1} attempts due to unexpected error")
                return None
    
    return None

def download_file(url, save_dir='downloads', timeout=30, retries=3):
    """Download a document from a URL into save_dir, retrying on failure.

    Returns the saved file path on success, or None on failure.
    """
    # Skip if not a web URL (simple check)
    if not url.startswith(('http://', 'https://', 'www.')):
        return None

    # Supported document extensions
    SUPPORTED_EXTENSIONS = ['.pdf', '.xlsx', '.xls', '.doc', '.docx']
        
    # Add proper headers to mimic a real browser
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate',
        'DNT': '1',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'none',
        'Cache-Control': 'max-age=0'
    }
    
    # Create session to maintain cookies
    session = requests.Session()
    session.headers.update(headers)
    
    # Extract filename from URL (improved logic)
    parsed_url = urlparse(url)
    filename = os.path.basename(unquote(parsed_url.path))
    
    # If no usable filename, generate one; if the extension is unsupported, force .pdf
    if not filename or '.' not in filename:
        filename = f"downloaded_file_{int(time.time())}.pdf"
    elif not any(filename.lower().endswith(ext) for ext in SUPPORTED_EXTENSIONS):
        filename = f"{filename}.pdf"

    # Create the save directory if it doesn't exist
    os.makedirs(save_dir, exist_ok=True)
    file_path = os.path.join(save_dir, filename)

    # Retry logic
    for attempt in range(retries):
        try:
            print(f"Downloading (attempt {attempt + 1}): {url}")
            
            # Add random delay to avoid rate limiting
            if attempt > 0:
                delay = random.uniform(1, 3)
                print(f"Waiting {delay:.1f} seconds before retry...")
                time.sleep(delay)
            
            # Make the request with proper settings
            response = session.get(
                url, 
                stream=True, 
                timeout=timeout,
                allow_redirects=True,
                verify=True  # SSL verification
            )
            response.raise_for_status()

            # Check if we actually got content
            content_length = response.headers.get('content-length')
            if content_length and int(content_length) < 100:  # Suspiciously small file
                print(f"Warning: File seems very small ({content_length} bytes)")
            
            # Check content type
            content_type = response.headers.get('content-type', '').lower()
            if 'text/html' in content_type and filename.endswith('.pdf'):
                print("Warning: Received HTML content when expecting PDF - might be blocked")
            
            # Download the file
            with open(file_path, 'wb') as file:
                downloaded_size = 0
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:  # Filter out keep-alive chunks
                        file.write(chunk)
                        downloaded_size += len(chunk)
                
                print(f"Downloaded {downloaded_size} bytes")

            # Verify the file was downloaded properly
            if os.path.getsize(file_path) == 0:
                print("Error: Downloaded file is empty")
                os.remove(file_path)
                if attempt < retries - 1:
                    continue
                return None
            
            print(f"Successfully downloaded to: {file_path}")
            return file_path
            
        except requests.exceptions.Timeout:
            print(f"Timeout error on attempt {attempt + 1}")
        except requests.exceptions.RequestException as e:
            print(f"Request failed on attempt {attempt + 1}: {e}")
        except Exception as e:
            print(f"Unexpected error on attempt {attempt + 1}: {e}")
        
        # If this was the last attempt, clean up any partial file
        if attempt == retries - 1 and os.path.exists(file_path):
            os.remove(file_path)
    
    print(f"Failed to download {url} after {retries} attempts")
    return None

def _create_retry_session(max_retries=3):
    """
    Create a requests session with built-in retry strategy.
    """
    session = requests.Session()
    
    # Define retry strategy
    retry_strategy = Retry(
        total=max_retries,
        backoff_factor=1,  # urllib3 sleeps roughly 0, 2, 4, ... seconds between retries
        status_forcelist=[429, 500, 502, 503, 504],  # HTTP status codes to retry
        allowed_methods=["HEAD", "GET", "OPTIONS"]  # requires urllib3 >= 1.26
    )
    
    # Mount the adapter with retry strategy
    adapter = HTTPAdapter(max_retries=retry_strategy)
    session.mount("http://", adapter)
    session.mount("https://", adapter)
    
    # Set common headers to appear more like a real browser
    session.headers.update({
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    })
    
    return session
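
# A minimal sketch of using the retry session on its own (the URL is
# illustrative only, not one this module depends on):
#
#     session = _create_retry_session(max_retries=2)
#     resp = session.get('https://example.com', timeout=(5, 10))
#     resp.raise_for_status()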

def _get_wkhtmltopdf_config():
    """
    Locate the wkhtmltopdf executable (Windows paths, plus a Linux fallback).
    Returns {} if it is on PATH, {'wkhtmltopdf': path} if found at a known
    location, or None if it cannot be found.
    """
    # Common installation paths (Windows defaults, plus a Linux fallback)
    possible_paths = [
        r'C:\Program Files\wkhtmltopdf\bin\wkhtmltopdf.exe',
        r'C:\Program Files (x86)\wkhtmltopdf\bin\wkhtmltopdf.exe',
        r'C:\wkhtmltopdf\bin\wkhtmltopdf.exe',
        '/usr/bin/wkhtmltopdf',
        # Add your custom installation path here if different
    ]
    
    # Check if wkhtmltopdf is in PATH first
    try:
        result = subprocess.run(['wkhtmltopdf', '--version'],
                                capture_output=True, text=True, timeout=5)
        if result.returncode == 0:
            print("Found wkhtmltopdf in system PATH")
            return {}  # No config needed if it's in PATH
    except (subprocess.TimeoutExpired, FileNotFoundError, subprocess.SubprocessError):
        pass
    
    # Check common installation paths
    for path in possible_paths:
        if os.path.exists(path):
            print(f"Found wkhtmltopdf at: {path}")
            return {'wkhtmltopdf': path}
    
    # If not found, let user know
    print("wkhtmltopdf not found in common locations.")
    print("Please either:")
    print("1. Add wkhtmltopdf to your system PATH, or")
    print("2. Update the 'possible_paths' list in the function with your installation path")
    return None

def _convert_webpage_to_pdf(url, save_dir):
    """Convert webpage to PDF using pdfkit with Windows-specific configuration."""
    try:
        # Get wkhtmltopdf configuration
        config = _get_wkhtmltopdf_config()
        if config is None:
            return None
        
        # Create filename from URL
        parsed_url = urlparse(url)
        # Clean the filename by removing special characters
        clean_name = re.sub(r'[^\w\-_]', '_', parsed_url.netloc + parsed_url.path)
        clean_name = re.sub(r'_+', '_', clean_name).strip('_')
        filename = f"webpage_{clean_name}.pdf"
        
        file_path = os.path.join(save_dir, filename)
        
        print(f"Converting webpage to PDF: {filename}")
        
        # Configure pdfkit options for better PDF generation
        options = {
            'page-size': 'A4',
            'margin-top': '0.75in',
            'margin-right': '0.75in',
            'margin-bottom': '0.75in',
            'margin-left': '0.75in',
            'encoding': "UTF-8",
            'no-outline': None,
            'enable-local-file-access': None,
            'javascript-delay': 3000,  # Wait for JS to load
            'load-error-handling': 'ignore',
            'load-media-error-handling': 'ignore',
            'disable-smart-shrinking': None,  # Prevent content shrinking (flag-only option)
            'print-media-type': None,  # Use print CSS (flag-only option)
        }
        
        # Convert webpage to PDF; pass an explicit executable path only when one was found
        if config:
            pdf_config = pdfkit.configuration(wkhtmltopdf=config['wkhtmltopdf'])
        else:
            pdf_config = None  # wkhtmltopdf is on PATH, so let pdfkit locate it
        pdfkit.from_url(url, file_path, options=options, configuration=pdf_config)
        
        print(f"Webpage converted to PDF: {file_path}")
        return file_path
        
    except Exception as e:
        print(f"Failed to convert webpage to PDF: {e}")
        print("Troubleshooting tips:")
        print("1. Ensure wkhtmltopdf is properly installed")
        print("2. Check if the executable path is correct")
        print("3. Try running 'wkhtmltopdf --version' in command prompt")
        return None
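
if __name__ == '__main__':
    # Minimal usage sketch. The URLs below are hypothetical placeholders,
    # not endpoints this module is known to work with; substitute real ones.
    sample_urls = [
        'https://example.com/sample.pdf',  # hypothetical direct document link
        'https://example.com/',            # hypothetical webpage -> PDF conversion
    ]
    for sample_url in sample_urls:
        saved_path = download_file_main(sample_url, save_dir='downloads')
        print(f"Result for {sample_url}: {saved_path}")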
