"""Download documents from URLs, or convert webpages to PDF, with retry logic."""

import os
import random
import re
import subprocess
import time
from urllib.parse import unquote, urlparse

import pdfkit
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry


def download_file_main(url, save_dir='downloads', convert_webpage_to_pdf=True, max_retries=3, retry_delay=2):
    """
    Download a file from URL or convert webpage to PDF with retry mechanism.
    Windows-compatible version with wkhtmltopdf path configuration.
    
    Args:
        url (str): URL to download from
        save_dir (str): Directory to save files
        convert_webpage_to_pdf (bool): Whether to convert webpages to PDF
        max_retries (int): Maximum number of retry attempts
        retry_delay (int): Base delay between retries in seconds (uses exponential backoff)
    """
    
    # Skip if not a web URL
    if not url.startswith(('http://', 'https://', 'www.')):
        print(f"Skipping non-web URL: {url}")
        return None

    # Skip known GeM login/authenticated endpoints that cannot be fetched anonymously
    normalized_url = url.strip().lower()
    if (normalized_url == 'https://sso.gem.gov.in/arxsso/oauth/dologin'
            or normalized_url.startswith('https://bidplus.gem.gov.in/bidding/downloadomppdfile/')):
        print(f"Skipping login URL: {url}")
        return None
    
    # Add https:// if URL starts with www.
    if url.startswith('www.'):
        url = 'https://' + url
    
    # Create the save directory if it doesn't exist
    os.makedirs(save_dir, exist_ok=True)
    
    # Create a requests session with a transport-level retry strategy
    # (note: those low-level retries stack with the manual retry loop below)
    session = _create_retry_session(max_retries)
    
    for attempt in range(max_retries + 1):
        try:
            if attempt > 0:
                wait_time = retry_delay * (2 ** (attempt - 1))  # Exponential backoff
                print(f"Retrying in {wait_time} seconds... (Attempt {attempt + 1}/{max_retries + 1})")
                time.sleep(wait_time)
            
            print(f"Accessing: {url}" + (f" (Attempt {attempt + 1})" if attempt > 0 else ""))
            
            # First, make a HEAD request to check content type
            try:
                head_response = session.head(url, allow_redirects=True, timeout=(15, 30))
                content_type = head_response.headers.get('content-type', '').lower()
                head_success = head_response.status_code == 200
            except Exception as e:
                print(f"HEAD request failed: {e}")
                head_success = False
                content_type = ''
            
            # Make a GET request
            response = session.get(url, stream=True, timeout=(15, 30))
            response.raise_for_status()
            
            # Update content type if HEAD failed
            if not head_success:
                content_type = response.headers.get('content-type', '').lower()
            
            # Check if it's a webpage (HTML content). Sniff only the first
            # chunk so large binary responses are not read fully into memory.
            is_webpage = ('text/html' in content_type or
                          'application/xhtml' in content_type)
            if not is_webpage:
                first_chunk = next(response.iter_content(chunk_size=1024), b'')
                is_webpage = first_chunk.strip().lower().startswith((b'<!doctype', b'<html'))

            # Neither branch below reuses this response, so release the connection
            response.close()

            if is_webpage and convert_webpage_to_pdf:
                return _convert_webpage_to_pdf(url, save_dir)
            else:
                return download_file(url, save_dir)
                
        except requests.exceptions.ConnectTimeout as e:
            print(f"Connection timeout on attempt {attempt + 1}: {e}")
            if attempt == max_retries:
                print(f"Failed to access {url} after {max_retries + 1} attempts due to connection timeout")
                return None
                
        except requests.exceptions.ReadTimeout as e:
            print(f"Read timeout on attempt {attempt + 1}: {e}")
            if attempt == max_retries:
                print(f"Failed to access {url} after {max_retries + 1} attempts due to read timeout")
                return None
                
        except requests.exceptions.ConnectionError as e:
            print(f"Connection error on attempt {attempt + 1}: {e}")
            if attempt == max_retries:
                print(f"Failed to access {url} after {max_retries + 1} attempts due to connection error")
                return None
                
        except requests.exceptions.HTTPError as e:
            print(f"HTTP error on attempt {attempt + 1}: {e}")
            # Don't retry client errors that won't resolve on their own
            if e.response is not None and e.response.status_code in (401, 403, 404):
                print(f"Not retrying for HTTP {e.response.status_code} error")
                return None
            if attempt == max_retries:
                print(f"Failed to access {url} after {max_retries + 1} attempts due to HTTP error")
                return None
                
        except requests.exceptions.RequestException as e:
            print(f"Request error on attempt {attempt + 1}: {e}")
            if attempt == max_retries:
                print(f"Failed to access {url} after {max_retries + 1} attempts due to request error")
                return None
                
        except Exception as e:
            print(f"Unexpected error on attempt {attempt + 1}: {e}")
            if attempt == max_retries:
                print(f"Failed to process {url} after {max_retries + 1} attempts due to unexpected error")
                return None
    
    return None

def download_file(url, save_dir='downloads', timeout=30, retries=3):
    """Download a document from a URL into save_dir, retrying on failure.

    Returns the saved file path on success, or None on failure.
    """
    # Skip if not a web URL (simple check)
    if not url.startswith(('http://', 'https://', 'www.')):
        return None

    # Supported document extensions
    SUPPORTED_EXTENSIONS = ['.pdf', '.xlsx', '.xls', '.doc', '.docx']
        
    # Add proper headers to mimic a real browser
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate',
        'DNT': '1',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'none',
        'Cache-Control': 'max-age=0'
    }
    
    # Create session to maintain cookies
    session = requests.Session()
    session.headers.update(headers)
    
    # Extract filename from URL (improved logic)
    parsed_url = urlparse(url)
    filename = os.path.basename(unquote(parsed_url.path))
    
    # If no usable filename, generate one; if the extension is unsupported, force .pdf
    if not filename or '.' not in filename:
        filename = f"downloaded_file_{int(time.time())}.pdf"
    elif not any(filename.lower().endswith(ext) for ext in SUPPORTED_EXTENSIONS):
        filename = f"{filename}.pdf"

    # Create the save directory if it doesn't exist
    os.makedirs(save_dir, exist_ok=True)
    file_path = os.path.join(save_dir, filename)

    # Retry logic
    for attempt in range(retries):
        try:
            print(f"Downloading (attempt {attempt + 1}): {url}")
            
            # Add random delay to avoid rate limiting
            if attempt > 0:
                delay = random.uniform(1, 3)
                print(f"Waiting {delay:.1f} seconds before retry...")
                time.sleep(delay)
            
            # Make the request with proper settings
            response = session.get(
                url, 
                stream=True, 
                timeout=timeout,
                allow_redirects=True,
                verify=True  # SSL verification
            )
            response.raise_for_status()

            # Check if we actually got content
            content_length = response.headers.get('content-length')
            if content_length and int(content_length) < 100:  # Suspiciously small file
                print(f"Warning: File seems very small ({content_length} bytes)")
            
            # Check content type
            content_type = response.headers.get('content-type', '').lower()
            if 'text/html' in content_type and filename.endswith('.pdf'):
                print("Warning: Received HTML content when expecting PDF - might be blocked")
            
            # Download the file
            with open(file_path, 'wb') as file:
                downloaded_size = 0
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:  # Filter out keep-alive chunks
                        file.write(chunk)
                        downloaded_size += len(chunk)
                
                print(f"Downloaded {downloaded_size} bytes")

            # Verify the file was downloaded properly
            if os.path.getsize(file_path) == 0:
                print("Error: Downloaded file is empty")
                os.remove(file_path)
                if attempt < retries - 1:
                    continue
                return None
            
            print(f"Successfully downloaded to: {file_path}")
            return file_path
            
        except requests.exceptions.Timeout:
            print(f"Timeout error on attempt {attempt + 1}")
        except requests.exceptions.RequestException as e:
            print(f"Request failed on attempt {attempt + 1}: {e}")
        except Exception as e:
            print(f"Unexpected error on attempt {attempt + 1}: {e}")
        
        # If this was the last attempt, clean up any partial file
        if attempt == retries - 1 and os.path.exists(file_path):
            os.remove(file_path)
    
    print(f"Failed to download {url} after {retries} attempts")
    return None

def _create_retry_session(max_retries=3):
    """
    Create a requests session with built-in retry strategy.
    """
    session = requests.Session()
    
    # Define retry strategy
    retry_strategy = Retry(
        total=max_retries,
        backoff_factor=1,  # urllib3 sleeps roughly 0, 2, 4, ... seconds between retries
        status_forcelist=[429, 500, 502, 503, 504],  # HTTP status codes to retry
        allowed_methods=["HEAD", "GET", "OPTIONS"]  # requires urllib3 >= 1.26
    )
    
    # Mount the adapter with retry strategy
    adapter = HTTPAdapter(max_retries=retry_strategy)
    session.mount("http://", adapter)
    session.mount("https://", adapter)
    
    # Set common headers to appear more like a real browser
    session.headers.update({
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    })
    
    return session
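
# A minimal sketch of using the retry session on its own (the URL is
# illustrative only, not one this module depends on):
#
#     session = _create_retry_session(max_retries=2)
#     resp = session.get('https://example.com', timeout=(5, 10))
#     resp.raise_for_status()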

def _get_wkhtmltopdf_config():
    """
    Locate the wkhtmltopdf executable (Windows paths, plus a Linux fallback).
    Returns {} if it is on PATH, {'wkhtmltopdf': path} if found at a known
    location, or None if it cannot be found.
    """
    # Common installation paths (Windows defaults, plus a Linux fallback)
    possible_paths = [
        r'C:\Program Files\wkhtmltopdf\bin\wkhtmltopdf.exe',
        r'C:\Program Files (x86)\wkhtmltopdf\bin\wkhtmltopdf.exe',
        r'C:\wkhtmltopdf\bin\wkhtmltopdf.exe',
        '/usr/bin/wkhtmltopdf',
        # Add your custom installation path here if different
    ]
    
    # Check if wkhtmltopdf is in PATH first
    try:
        result = subprocess.run(['wkhtmltopdf', '--version'],
                                capture_output=True, text=True, timeout=5)
        if result.returncode == 0:
            print("Found wkhtmltopdf in system PATH")
            return {}  # No config needed if it's in PATH
    except (subprocess.TimeoutExpired, FileNotFoundError, subprocess.SubprocessError):
        pass
    
    # Check common installation paths
    for path in possible_paths:
        if os.path.exists(path):
            print(f"Found wkhtmltopdf at: {path}")
            return {'wkhtmltopdf': path}
    
    # If not found, let user know
    print("wkhtmltopdf not found in common locations.")
    print("Please either:")
    print("1. Add wkhtmltopdf to your system PATH, or")
    print("2. Update the 'possible_paths' list in the function with your installation path")
    return None

def _convert_webpage_to_pdf(url, save_dir):
    """Convert webpage to PDF using pdfkit with Windows-specific configuration."""
    try:
        # Get wkhtmltopdf configuration
        config = _get_wkhtmltopdf_config()
        if config is None:
            return None
        
        # Create filename from URL
        parsed_url = urlparse(url)
        # Clean the filename by removing special characters
        clean_name = re.sub(r'[^\w\-_]', '_', parsed_url.netloc + parsed_url.path)
        clean_name = re.sub(r'_+', '_', clean_name).strip('_')
        filename = f"webpage_{clean_name}.pdf"
        
        file_path = os.path.join(save_dir, filename)
        
        print(f"Converting webpage to PDF: {filename}")
        
        # Configure pdfkit options for better PDF generation
        options = {
            'page-size': 'A4',
            'margin-top': '0.75in',
            'margin-right': '0.75in',
            'margin-bottom': '0.75in',
            'margin-left': '0.75in',
            'encoding': "UTF-8",
            'no-outline': None,
            'enable-local-file-access': None,
            'javascript-delay': 3000,  # Wait for JS to load
            'load-error-handling': 'ignore',
            'load-media-error-handling': 'ignore',
            'disable-smart-shrinking': None,  # Prevent content shrinking (flag-only option)
            'print-media-type': None,  # Use print CSS (flag-only option)
        }
        
        # Convert webpage to PDF; pass an explicit executable path only when one was found
        if config:
            pdf_config = pdfkit.configuration(wkhtmltopdf=config['wkhtmltopdf'])
        else:
            pdf_config = None  # wkhtmltopdf is on PATH, so let pdfkit locate it
        pdfkit.from_url(url, file_path, options=options, configuration=pdf_config)
        
        print(f"Webpage converted to PDF: {file_path}")
        return file_path
        
    except Exception as e:
        print(f"Failed to convert webpage to PDF: {e}")
        print("Troubleshooting tips:")
        print("1. Ensure wkhtmltopdf is properly installed")
        print("2. Check if the executable path is correct")
        print("3. Try running 'wkhtmltopdf --version' in command prompt")
        return None
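
if __name__ == '__main__':
    # Minimal usage sketch. The URLs below are hypothetical placeholders,
    # not endpoints this module is known to work with; substitute real ones.
    sample_urls = [
        'https://example.com/sample.pdf',  # hypothetical direct document link
        'https://example.com/',            # hypothetical webpage -> PDF conversion
    ]
    for sample_url in sample_urls:
        saved_path = download_file_main(sample_url, save_dir='downloads')
        print(f"Result for {sample_url}: {saved_path}")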
