import requests
import os, shutil
import time
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from urllib.parse import urlparse, unquote
import random
import json
import re
from typing import Dict, Any, List
from pathlib import Path
import anthropic
import hashlib
import PyPDF2
import file_download_robust as fdr
from document_extractor import extract_documents_text_compatible




# Read the API key from the environment instead of hard-coding a secret in source
ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY", "")


# Initialize Claude client
claude_client = anthropic.Anthropic(api_key=ANTHROPIC_API_KEY)

llm_model = "claude"  # or "gemini"

claude_model = "claude-sonnet-4-20250514"




def download_NIT_file(url, save_dir='downloads', timeout=30, retries=3):
    """Download the NIT document at `url` into `save_dir`; return the saved path or None."""
    # Skip if not a web URL (simple check)
    if not url.startswith(('http://', 'https://', 'www.')):
        return None

    # requests needs a scheme, so prepend one for bare 'www.' URLs
    if url.startswith('www.'):
        url = 'https://' + url

    # Supported document extensions
    SUPPORTED_EXTENSIONS = ['.pdf', '.xlsx', '.xls', '.doc', '.docx']

    # Add proper headers to mimic a real browser
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate',
        'DNT': '1',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'none',
        'Cache-Control': 'max-age=0'
    }
    
    # Create session to maintain cookies
    session = requests.Session()
    session.headers.update(headers)
    
    # Extract filename from URL (improved logic)
    parsed_url = urlparse(url)
    filename = os.path.basename(unquote(parsed_url.path))
    
    # If there is no filename at all, create a timestamped one
    if not filename:
        filename = f"downloaded_file_{int(time.time())}.pdf"
    elif not any(filename.lower().endswith(ext) for ext in SUPPORTED_EXTENSIONS):
        filename = f"{filename}.pdf"

    # Create the save directory if it doesn't exist
    os.makedirs(save_dir, exist_ok=True)
    file_path = os.path.join(save_dir, filename)

    # Retry logic
    for attempt in range(retries):
        try:
            print(f"Downloading (attempt {attempt + 1}): {url}")
            
            # Add random delay to avoid rate limiting
            if attempt > 0:
                delay = random.uniform(1, 3)
                print(f"Waiting {delay:.1f} seconds before retry...")
                time.sleep(delay)
            
            # Make the request with proper settings
            response = session.get(
                url, 
                stream=True, 
                timeout=timeout,
                allow_redirects=True,
                verify=True  # SSL verification
            )
            response.raise_for_status()

            # Check if we actually got content
            content_length = response.headers.get('content-length')
            if content_length and int(content_length) < 100:  # Suspiciously small file
                print(f"Warning: File seems very small ({content_length} bytes)")
            
            # Download the file
            with open(file_path, 'wb') as file:
                downloaded_size = 0
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:  # Filter out keep-alive chunks
                        file.write(chunk)
                        downloaded_size += len(chunk)
                
                print(f"Downloaded {downloaded_size} bytes")

            # Verify the file was downloaded properly
            if os.path.getsize(file_path) == 0:
                print("❌ Error: Downloaded file is empty")
                os.remove(file_path)
                if attempt < retries - 1:
                    continue
                return None
            
            print(f"Successfully downloaded to: {file_path}")
            return file_path
            
        except requests.exceptions.Timeout:
            print(f"Timeout error on attempt {attempt + 1}")
        except requests.exceptions.RequestException as e:
            print(f"Request failed on attempt {attempt + 1}: {e}")
        except Exception as e:
            print(f"Unexpected error on attempt {attempt + 1}: {e}")
        
        # If this was the last attempt, clean up any partial file
        if attempt == retries - 1 and os.path.exists(file_path):
            os.remove(file_path)
    
    print(f"Failed to download {url} after {retries} attempts")
    return None
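

# A minimal sketch (not wired into download_NIT_file) of how the HTTPAdapter and Retry
# imports above could replace the manual retry loop: urllib3 performs the back-off and
# retries transient HTTP errors at the transport level.
def build_retrying_session(total_retries=3, backoff_factor=1.0):
    """Return a requests.Session that retries transient failures automatically."""
    session = requests.Session()
    retry = Retry(
        total=total_retries,
        backoff_factor=backoff_factor,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=["GET", "HEAD"],
    )
    adapter = HTTPAdapter(max_retries=retry)
    session.mount("http://", adapter)
    session.mount("https://", adapter)
    return session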


def extract_links_from_pdf(pdf_path):
    """Extract embedded hyperlinks from PDF files, skipping specific display text"""
    # Define the display texts to skip
    skip_phrases = [
        'attached categories',
        'General Terms and Conditions',
        'Service Level Agreement'
    ]
    
    try:
        with open(pdf_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)

            # Initialize list to store all links
            links = []

            # Extract links from each page
            for page_num, page in enumerate(pdf_reader.pages):
                # Check if we're in a disclaimer section - simple text-based check
                page_text = page.extract_text().lower()
                if "disclaimer" in page_text:
                    continue  # Skip disclaimer sections

                # Access annotations (which include hyperlinks)
                if '/Annots' in page:
                    annotations = page['/Annots']
                    if annotations:
                        # Process each annotation
                        for annotation in annotations:
                            annotation_object = annotation.get_object()
                            # Check if it's a link annotation
                            if annotation_object.get('/Subtype') == '/Link':
                                # Extract the actual URL
                                if '/A' in annotation_object and '/URI' in annotation_object['/A']:
                                    uri = annotation_object['/A']['/URI']
                                    if isinstance(uri, str):
                                        # Check if this link should be skipped based on its display text
                                        skip_link = False
                                        
                                        # Try to get the display text from the annotation
                                        display_text = None
                                        
                                        # Look for text in various annotation properties
                                        if '/Contents' in annotation_object:
                                            display_text = annotation_object['/Contents']
                                        elif '/T' in annotation_object:
                                            display_text = annotation_object['/T']
                                        elif '/TU' in annotation_object:
                                            display_text = annotation_object['/TU']
                                        
                                        # If we still don't have display text, try to extract it from the annotation's text
                                        if not display_text and '/Rect' in annotation_object:
                                            try:
                                                # Get text in the annotation rectangle
                                                rect = annotation_object['/Rect']
                                                # Extract text from the page and look for text near the rectangle
                                                page_text = page.extract_text()
                                                # This is a simplified approach - in practice, you might need more sophisticated text extraction
                                                display_text = page_text
                                            except:
                                                pass
                                        
                                        # Check if display text contains any skip phrases
                                        if display_text and isinstance(display_text, str):
                                            display_text_lower = display_text.lower()
                                            for skip_phrase in skip_phrases:
                                                if skip_phrase.lower() in display_text_lower:
                                                    skip_link = True
                                                    break
                                        
                                        # Only add the link if it shouldn't be skipped
                                        if not skip_link:
                                            links.append({
                                                'url': uri,
                                                'page': page_num + 1
                                            })

                # Optional extra pass: some PDF helpers expose page.get_links();
                # standard PyPDF2 page objects do not, so this usually falls
                # through to the except block below.
                try:
                    page_links = page.get_links()
                    for link in page_links:
                        if hasattr(link, 'url') and link.url:
                            # Check if this link should be skipped
                            skip_link = False
                            
                            # Try to get display text for this link
                            if hasattr(link, 'text'):
                                display_text = link.text
                                if display_text and isinstance(display_text, str):
                                    display_text_lower = display_text.lower()
                                    for skip_phrase in skip_phrases:
                                        if skip_phrase.lower() in display_text_lower:
                                            skip_link = True
                                            break
                            
                            # Only add the link if it shouldn't be skipped
                            if not skip_link:
                                links.append({
                                    'url': link.url,
                                    'page': page_num + 1
                                })
                                
                except (AttributeError, TypeError):
                    # get_links method not available or failed
                    pass

            # Fall back to text extraction for visible URLs if no embedded links
            # were found. This must run while the file is still open because
            # PyPDF2 resolves page content lazily from the underlying stream.
            if not links:
                for page_num, page in enumerate(pdf_reader.pages):
                    page_text = page.extract_text()

                    # Skip disclaimer sections
                    if "disclaimer" in page_text.lower():
                        continue

                    # Extract URLs from text as a fallback
                    # Split text into lines to analyze context
                    lines = page_text.split('\n')

                    for i, line in enumerate(lines):
                        urls = re.findall(r'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+', line)
                        for url in urls:
                            # Check if the line or surrounding lines contain skip phrases
                            skip_url = False

                            # Check current line and previous/next lines for context
                            context_lines = []
                            if i > 0:
                                context_lines.append(lines[i-1])
                            context_lines.append(line)
                            if i < len(lines) - 1:
                                context_lines.append(lines[i+1])

                            context_text = ' '.join(context_lines).lower()

                            for skip_phrase in skip_phrases:
                                if skip_phrase.lower() in context_text:
                                    skip_url = True
                                    break

                            # Only add the URL if it shouldn't be skipped
                            if not skip_url:
                                links.append({
                                    'url': url,
                                    'page': page_num + 1,
                                    'note': 'Extracted from text (not embedded)'
                                })

        # Format the results
        formatted_links = []
        for link in links:
            formatted_links.append(f"{link['url']}")

        if not formatted_links:
            print("No links found in the PDF document")
            return []
        else:
            print(f"Following links found: \n{formatted_links}")

        return list(set(formatted_links))

    except Exception as e:
        print(f"❌ Error extracting links from PDF: {str(e)}")
        return []


def download_linked_documents(bid_dir, links):
    """
    Download additional documents from links found in tender documents
    
    Args:
        bid_dir (str): Directory to save downloaded files to
        links (list): List of links to download
        
    Returns:
        list: List of paths to downloaded files
    """
    downloaded_files = []
    
    for link in links:
        try:
            print(f"Downloading file from link: {link}")
            file_path=fdr.download_file_main(link, bid_dir)
            print(f"Downloaded file from link {link} and saved to path {file_path}")
            downloaded_files.append(file_path)
        except Exception as e:
            print(f"❌ Error downloading {link}: {str(e)}")
    
    print(f"Total downloaded files {len(downloaded_files)} at paths {downloaded_files}")
    return downloaded_files


def calculate_file_hash(file_path):
    """
    Calculate SHA-256 hash of file content
    
    Args:
        file_path (Path): Path to the file
        
    Returns:
        str: Hexadecimal hash of the file content
    """
    sha256_hash = hashlib.sha256()
    
    # Read and update hash in chunks to handle large files efficiently
    with open(file_path, "rb") as f:
        for byte_block in iter(lambda: f.read(4096), b""):
            sha256_hash.update(byte_block)
    
    return sha256_hash.hexdigest()
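
# Example (hypothetical path): calculate_file_hash(Path("downloads/NIT_123.pdf")) returns a
# 64-character hex digest; remove_duplicate_pdfs below groups files by this digest and keeps
# only the first file from each identical group.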

def remove_duplicate_pdfs(directory_path):
    """
    Remove duplicate PDF files from a directory based on content.
    For each set of identical files, the first one found is kept and others are removed.
    
    Args:
        directory_path (str): Path to directory containing PDF files
        
    Returns:
        tuple: (kept_files, removed_files) lists of filenames
    """
    print(f"🔍 Checking for duplicate PDFs in: {directory_path}")
    
    # Get all PDF files in the directory
    pdf_files = list(Path(directory_path).glob("*.pdf"))
    
    if not pdf_files:
        print("No PDF files found")
        return [], []
    
    # Dictionary to store hash -> [file_paths]
    hash_map = {}
    
    # Calculate hash for each file and group by hash
    for file_path in pdf_files:
        print(f"Analyzing: {file_path.name}")
        file_hash = calculate_file_hash(file_path)
        
        if file_hash in hash_map:
            hash_map[file_hash].append(file_path)
        else:
            hash_map[file_hash] = [file_path]
    
    # Keep track of which files were kept and which were removed
    kept_files = []
    removed_files = []
    
    # Process each group of files with the same hash
    for file_hash, file_paths in hash_map.items():
        # Keep the first file
        kept_file = file_paths[0]
        kept_files.append(kept_file)
        
        # Remove all duplicates
        for duplicate in file_paths[1:]:
            try:
                os.remove(duplicate)
                removed_files.append(duplicate)
                print(f"Removed duplicate: {duplicate.name} (same as {kept_file.name})")
            except Exception as e:
                print(f"❌ Error removing {duplicate.name}: {str(e)}")
    
    print(f"Kept {len(kept_files)} unique files, removed {len(removed_files)} duplicates")
    return [f.name for f in kept_files], [f.name for f in removed_files]



def analyze_tender_with_LLM(documents_text):
    """
    Analyze tender documents with the configured LLM (Claude by default)
    
    Args:
        documents_text (dict): Dictionary mapping file paths to their text content
        
    Returns:
        tuple: (dict of extracted information keyed by category, list of raw per-chunk analyses)
    """
    # Information to extract
    info_to_extract = [
        "Eligibility/Qualification Criteria or conditions for bidder",
        "Evaluation criteria or method",
        "Scope of work of the whole project",
        "Amount of EMD fee",
        "Relaxation or preference given to any kind of company or bidder",
        "Payment terms",
        "BOQ requirements",
        "Risks"    
        ]
    
    # Initialize the result dictionary with empty strings
    extracted_info = {item: "" for item in info_to_extract}
    
    # Process all documents, not just the main one
    # First, calculate the total text size to determine chunking strategy
    total_text_size = sum(len(text) for text in documents_text.values())
    print(f"Total text size across all documents: {total_text_size} characters")
    
    # Approach: Process each document separately and then combine the results
    doc_analyses = []
    
    for doc_path, doc_text in documents_text.items():
        doc_name = os.path.basename(doc_path)
        print(f"Analyzing document: {doc_name} ({len(doc_text)} characters)")
        
        # Skip empty documents
        if not doc_text.strip():
            print(f"Skipping empty document: {doc_name}")
            continue
        
        # Create chunks based on document size
        chunk_size = 50000  # Adjust based on LLM's token limits
        doc_chunks = []
        
        if len(doc_text) > chunk_size:
            # Split into chunks, but try to break at paragraph boundaries
            start = 0
            while start < len(doc_text):
                end = start + chunk_size
                
                # Adjust to end at paragraph boundary if possible
                if end < len(doc_text):
                    # Look for double newline (paragraph break) before the cutoff
                    paragraph_end = doc_text.rfind('\n\n', start, end)
                    # If found and not too far from the chunk size, use it
                    if paragraph_end > start + (chunk_size * 0.7):
                        end = paragraph_end
                    else:
                        # Otherwise look for single newline
                        line_end = doc_text.rfind('\n', start, end)
                        if line_end > start + (chunk_size * 0.8):
                            end = line_end
                
                # Add the chunk
                doc_chunks.append(doc_text[start:end])
                start = end
        else:
            # Document is small enough to process in one chunk
            doc_chunks = [doc_text]
        
        print(f"Split document into {len(doc_chunks)} chunks")
        
        # Process each chunk with Claude
        for chunk_idx, chunk in enumerate(doc_chunks):
            prompt = f"""
            You are analyzing tender documents. I'll provide you with a chunk ({chunk_idx+1}/{len(doc_chunks)}) 
            from the document {doc_name}.
            
            Please extract the following information if present in this chunk:
            1. Eligibility/Qualification Criteria or conditions for bidder
            2. Evaluation criteria or method
            3. Scope of work of the whole project
            4. Amount of EMD fee
            5. Relaxation or preference given to any kind of company or bidder
            6. Payment terms
            7. BOQ requirements
            8. Risks
            
            For each category, provide the exact text from the document. If the information isn't in this chunk, 
            just say "Not found in this chunk." Please structure your response clearly with appropriate headers
            for each section. 
            
            For evaluation criteria or method, extract the complete scoring table if specified.

            For Scope of Work please extract all deliverables, key requirements, delivery terms and specifications.
            If there is a list of items/products to be procured given, then specify quantities of each item separately. 
             
            For Risks, please extract all risky clauses like Penalty clauses, Liquidated Damages, Indemnification etc. 
            
            Here is the document chunk:
            {chunk}
            """
            
            try:
                if(llm_model == 'gemini'):
                    # NOTE: the Gemini branch assumes gemini_client and types are
                    # initialized elsewhere (e.g., via the google-genai SDK);
                    # they are not defined in this module.
                    response = gemini_client.models.generate_content(
                        model="gemini-2.5-flash-preview-04-17", #"gemini-2.0-flash",
                        contents=[prompt],
                        config=types.GenerateContentConfig(
                            system_instruction="You are an expert in analyzing tender documents. Extract the requested information accurately.",
                            max_output_tokens=6000,
                            temperature=0.1
                        )
                    )
                    response_text = response.text

                elif(llm_model == 'claude'):
                    # Call Claude API
                    response = claude_client.messages.create(
                        model=claude_model, #"claude-3-7-sonnet-latest", #claude-3-5-haiku-20241022
                        max_tokens=6000,
                        temperature=0,
                        system="You are an expert in analyzing tender documents. Extract the requested information accurately.",
                        messages=[
                            {"role": "user", "content": prompt}
                        ]
                    )
                    
                    # Get the response text
                    response_text = response.content[0].text
                
                # Save this analysis
                doc_analyses.append({
                    "doc_name": doc_name,
                    "chunk_idx": chunk_idx,
                    "response": response_text
                })
                
            except Exception as e:
                print(f"❌ Error analyzing chunk {chunk_idx+1} with Claude: {str(e)}")
    
    # Now process all the analysis responses to extract the information
    # Use a more robust approach to extract information from Claude's responses
    for analysis in doc_analyses:
        response_text = analysis["response"]
        doc_name = analysis["doc_name"]
        chunk_idx = analysis["chunk_idx"]
        
        print(f"Processing analysis of {doc_name} (chunk {chunk_idx+1})")
        
        # Process each category of information to extract
        for idx, item in enumerate(info_to_extract):
            # Possible section headers Claude might use
            section_markers = [
                f"{idx+1}. {item}",  # 1. Eligibility/Qualification Criteria
                f"## {idx+1}. {item}",  # ## 1. Eligibility/Qualification Criteria 
                f"**{idx+1}. {item}**",  # **1. Eligibility/Qualification Criteria**
                f"#{idx+1} {item}",  # #1 Eligibility/Qualification Criteria
                f"{item}:",  # Eligibility/Qualification Criteria:
                f"**{item}**",  # **Eligibility/Qualification Criteria**
                f"## {item}",  # ## Eligibility/Qualification Criteria
                f"### {item}",  # ### Eligibility/Qualification Criteria
                item  # Plain text
            ]
            
            # Find the section
            section_start = -1
            used_marker = ""
            
            for marker in section_markers:
                pos = response_text.find(marker)
                if pos != -1:
                    section_start = pos
                    used_marker = marker
                    break
            
            if section_start == -1:
                # Section not found
                continue
            
            # Find the end of this section (start of next section or end of response)
            section_end = len(response_text)
            
            # Check where the next section starts
            for next_idx, next_item in enumerate(info_to_extract):
                if next_idx <= idx:  # Skip current and previous sections
                    continue
                
                # Check all possible markers for the next section
                for marker in [
                    f"{next_idx+1}. {next_item}", 
                    f"## {next_idx+1}. {next_item}", 
                    f"**{next_idx+1}. {next_item}**",
                    f"#{next_idx+1} {next_item}",
                    f"{next_item}:", 
                    f"**{next_item}**",
                    f"## {next_item}",
                    f"### {next_item}",
                    next_item
                ]:
                    next_pos = response_text.find(marker, section_start)
                    if next_pos != -1 and next_pos < section_end:
                        section_end = next_pos
                        break
            
            # Extract the section content
            section_content = response_text[section_start + len(used_marker):section_end].strip()
            
            # Skip if the content indicates "not found"
            if any(phrase in section_content.lower() for phrase in [
                "not found in this chunk", 
                "not mentioned in this chunk",
                "no information found",
                "not provided in this chunk",
                "not specified in this chunk"
            ]):
                continue
            
            # Add the extracted content to the result
            if section_content:
                # If we already have content for this item, add a separator
                if extracted_info[item]:
                    extracted_info[item] += f"\n\n--- From {doc_name} (chunk {chunk_idx+1}) ---\n"
                else:
                    extracted_info[item] += f"--- From {doc_name} (chunk {chunk_idx+1}) ---\n"
                
                extracted_info[item] += section_content
    
    # Final cleanup - remove any empty sections and format for readability
    for item in info_to_extract:
        if not extracted_info[item]:
            extracted_info[item] = "Not found in any document"
        else:
            # Clean up formatting and remove duplicative information
            lines = extracted_info[item].split('\n')
            cleaned_lines = []
            seen_content = set()
            
            for line in lines:
                # Skip empty lines and source markers at this stage
                if not line.strip() or line.strip().startswith('---'):
                    cleaned_lines.append(line)
                    continue
                
                # Normalize and hash the line for deduplication
                normalized = ' '.join(line.lower().split())
                if normalized not in seen_content and len(normalized) > 5:
                    seen_content.add(normalized)
                    cleaned_lines.append(line)
            
            # Combine back while preserving source markers
            extracted_info[item] = '\n'.join(cleaned_lines)
    
    # print a summary of what was found
    found_items = [item for item, content in extracted_info.items() if content != "Not found in any document"]
    print(f"Successfully extracted information for {len(found_items)} categories:")
    for item in found_items:
        content_preview = extracted_info[item].split('\n', 1)[0]
        print(f"- {item}: {content_preview[:50]}...")
    
    return extracted_info, doc_analyses





def extract_sow_from_bid(bid_url: str) -> Dict[str, Any]:
    """
    Extract scope of work and related details from a bid document URL.

    Downloads the NIT document, follows embedded links to supporting documents,
    removes duplicate PDFs, extracts the text, and analyzes it with the LLM.
    
    Args:
        bid_url: The bid URL to extract information from
        
    Returns:
        Dictionary containing extracted scope of work information
    """
    print(f"Extracting SOW from: {bid_url}")

    bid_directory = './temp_dir/'
    if os.path.exists(bid_directory):
        shutil.rmtree(bid_directory)

    # Convert to Path object for easier handling
    dir_path = Path(bid_directory)

    nit_path = download_NIT_file(bid_url, save_dir=bid_directory, timeout=30, retries=3)
    if not nit_path:
        print(f"❌ Failed to download NIT document from: {bid_url}")
        return {}

    # Collect the full paths of all downloaded files
    file_paths = []
    for item in dir_path.iterdir():
        # Only include files, not directories
        if item.is_file():
            file_paths.append(str(item.absolute()))

    unique_links = extract_links_from_pdf(file_paths[0])

    if unique_links:
        download_linked_documents(bid_directory, unique_links)
        remove_duplicate_pdfs(bid_directory)

    documents_text = extract_documents_text_compatible(bid_directory, bid_directory, ANTHROPIC_API_KEY)
    extracted_info, doc_analyses = analyze_tender_with_LLM(documents_text)

    # The temporary directory is kept for inspection; uncomment to clean it up.
    # if os.path.exists(bid_directory):
    #     shutil.rmtree(bid_directory)
    return extracted_info

def extract_bid_number(bid_number_list: List[str]) -> str:
    """Extract clean bid number from the list"""
    if not bid_number_list:
        return ""
    
    bid_number = bid_number_list[0]
    # Extract parent bid number if it exists in the format
    if "(Parent:" in bid_number:
        # Extract the parent bid number
        parent_match = re.search(r'\(Parent:\s*([^)]+)\)', bid_number)
        if parent_match:
            return parent_match.group(1).strip()
    
    # If no parent, extract the main bid number
    main_match = re.search(r'^([^(]+)', bid_number)
    if main_match:
        return main_match.group(1).strip()
    
    return bid_number
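
# Hypothetical examples of the two bid-number formats extract_bid_number handles:
#   "GEM/2025/B/123456 (Parent: GEM/2025/B/100000)" -> "GEM/2025/B/100000"
#   "GEM/2025/B/123456"                             -> "GEM/2025/B/123456"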

def format_buyer_details(buyer_details: Dict[str, Any]) -> str:
    """Format buyer details into readable sentences"""
    sentences = []
    
    # Handle address (skip if only asterisks)
    address = buyer_details.get("address", "").strip()
    if address and not re.match(r'^\*+', address):
        sentences.append(f"The buyer is located at {address}.")
    
    # Handle ministry
    ministry = buyer_details.get("ministry", "").strip()
    if ministry:
        sentences.append(f"This procurement is by the {ministry}.")
    
    # Handle department
    department = buyer_details.get("department", "").strip()
    if department:
        sentences.append(f"The procuring department is {department}.")
    
    # Handle organisation
    organisation = buyer_details.get("organisation", "").strip()
    if organisation:
        sentences.append(f"The organisation is {organisation}.")
    
    # Handle office (skip if only asterisks)
    office = buyer_details.get("office", "").strip()
    if office and not re.match(r'^\*+', office):
        sentences.append(f"The office is {office}.")
    
    return " ".join(sentences)

def extract_bid_details_info(bid_details: Dict[str, Any]) -> str:
    """Extract relevant information from bid details section"""
    sentences = []
    
    # Handle quantity
    quantity = bid_details.get("quantity", "").strip()
    if quantity:
        sentences.append(f"The total quantity required is {quantity} units.")
    
    # Handle bid opening date
    opening_date = bid_details.get("bidOpeningDate", "").strip()
    if opening_date:
        sentences.append(f"The bid opening date is {opening_date}.")
    
    # Handle buyer details
    buyer_details = bid_details.get("buyerDetails", {})
    if buyer_details:
        buyer_info = format_buyer_details(buyer_details)
        if buyer_info:
            sentences.append(buyer_info)
    
    return " ".join(sentences)

def clean_scope_work_text(text: str) -> str:
    """Clean scope of work text by removing chunk references and formatting"""
    if not text or text.strip() == "Not found in any document":
        return ""
    
    # Remove "--- From xxx.pdf (chunk x) ---" patterns
    text = re.sub(r'--- From [^-]+ ---\s*', '', text)
    
    # Remove extra whitespace and newlines
    text = re.sub(r'\n+', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    
    return text.strip()
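
# For example (hypothetical, hyphen-free file name, since the pattern stops at '-'):
#   clean_scope_work_text("--- From NIT_document.pdf (chunk 1) --- The bidder must ...")
# returns "The bidder must ..." with newlines and runs of spaces collapsed.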

def format_scope_work(scope_work: Dict[str, Any]) -> str:
    """Format scope of work dictionary into readable text"""
    if not scope_work or not isinstance(scope_work, dict):
        return ""
    
    formatted_parts = []
    
    # Define the order and nice names for scope work sections
    scope_sections = {
        "Scope of work of the whole project": "Project Scope",
        "Eligibility/Qualification Criteria or conditions for bidder": "Eligibility Criteria",
        "Evaluation criteria or method": "Evaluation Method",
        "Amount of EMD fee": "EMD Requirements",
        "Relaxation or preference given to any kind of company or bidder": "Purchase Preferences",
        "Payment terms": "Payment Terms",
        "BOQ requirements": "BOQ Requirements",
        "Risks": "Risk Factors"
    }
    
    for key, nice_name in scope_sections.items():
        if key in scope_work:
            cleaned_text = clean_scope_work_text(scope_work[key])
            if cleaned_text:
                formatted_parts.append(f"{nice_name}: {cleaned_text}")
    
    return " ".join(formatted_parts)  


def format_evaluation_info(evaluation: Dict[str, Any]) -> str:
    """Format evaluation information for answer field"""
    if not evaluation:
        return ""
    
    eval_parts = []
    
    # Add sellers information
    sellers = evaluation.get("sellers", [])
    if sellers:
        eval_parts.append(f"Total number of sellers who participated in this bid are {len(sellers)}.")
        
        # Add seller names
        seller_names = [seller.get("sellerName", "") for seller in sellers if seller.get("sellerName")]
        if seller_names:
            eval_parts.append(f"And here are the names of all Participating sellers: {', '.join(seller_names)}.")
        
        # Add winner information if available
        winner_seller = [seller for seller in sellers if seller.get("rank") == "L1"]
        if winner_seller:
            winner_name = winner_seller[0].get("sellerName", "")
            winner_price = winner_seller[0].get("totalPrice", "")
            if winner_name:
                winner_info = f"Out of these sellers, {winner_name} became the L1 Winner"
                if winner_price:
                    winner_info += f" with quoted price of Rs. {winner_price}"
                eval_parts.append(winner_info)

        # Add L2 information if available
        L2_seller = [seller for seller in sellers if seller.get("rank") == "L2"]
        if L2_seller:
            L2_name = L2_seller[0].get("sellerName", "")
            L2_price = L2_seller[0].get("totalPrice", "")
            if L2_name:
                L2_info = f", and {L2_name} became the L2 seller"
                if L2_price:
                    L2_info += f" with quoted price of Rs. {L2_price}"
                eval_parts.append(L2_info)
    
    # Add summary information
    summary = evaluation.get("summary", {})
    if summary:
        total_sellers = summary.get("totalSellers", 0)
        lowest_price = summary.get("lowestPrice", 0)
        highest_price = summary.get("highestPrice", 0)
        
        if total_sellers:
            eval_parts.append(f"Total sellers who qualified in this bid were {total_sellers}.")
        
        if lowest_price and highest_price:
            eval_parts.append(f"The price range for this bid was Rs. {lowest_price} to Rs. {highest_price}.")
    
    return " ".join(eval_parts)

def create_bid_info_string(bid: Dict[str, Any], answer_string: bool = False) -> str:
    """Create comprehensive bid information string"""
    info_parts = []
    
    # 1. Parent bid number
    if(answer_string):
        bid_number = extract_bid_number(bid.get("b_bid_number", []))
        if bid_number:
            info_parts.append("=" * 70)
            info_parts.append(f"Information about GeM Bid ID: {bid_number}.")
    
    # 2. Category information
    bd_category = bid.get("bd_category_name", [])
    if bd_category and bd_category[0]:
        category_text = bd_category[0]
        info_parts.append(f"This bid is for the following categories: {category_text}.")
    
    # 3. High value bid information
    is_high_value = bid.get("is_high_value", [False])
    if is_high_value and is_high_value[0]:
        info_parts.append("This bid is in the category of high value bid.")
    
    # 4. Single packet bid information
    is_single_packet = bid.get("ba_is_single_packet", [0])
    if is_single_packet and is_single_packet[0] == 1:
        info_parts.append("This bid is a single packet bid.")
    
    # 5. Bid details information
    detailed_info = bid.get("detailedInfo", {})
    if detailed_info and "sections" in detailed_info:
        sections = detailed_info["sections"]
        
        # Handle both parent-child and direct bid details
        bid_details = None
        if "bidDetails" in sections:
            if isinstance(sections["bidDetails"], dict):
                # Check if it has parent-child structure
                if "parent" in sections["bidDetails"]:
                    bid_details = sections["bidDetails"]["parent"]
                else:
                    bid_details = sections["bidDetails"]
        
        if bid_details:
            bid_details_info = extract_bid_details_info(bid_details)
            if bid_details_info:
                info_parts.append(bid_details_info)
    
        # 6. Add evaluation information if requested (for answer field)
        if answer_string and "evaluation" in sections:
            evaluation_info = format_evaluation_info(sections["evaluation"])
            if evaluation_info:
                info_parts.append(evaluation_info)
    
    # 7. Scope of work
    scope_work = bid.get("b_scope_work", "")
    if scope_work:
        info_parts.append(f"\nOther important details about the nature of this bid are as following:\n")
        if isinstance(scope_work, dict):
            # Format the dictionary scope of work
            formatted_scope = format_scope_work(scope_work)
            if formatted_scope:
                info_parts.append(formatted_scope)
        elif isinstance(scope_work, str) and scope_work.strip():
            info_parts.append(f"Scope of Work: {scope_work}")
    
    return " ".join(info_parts)

def load_existing_data(file_path: str) -> Dict[str, Any]:
    """Load existing data from file if it exists"""
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            return json.load(file)
    except (FileNotFoundError, json.JSONDecodeError):
        return None

def save_data_safely(data: Any, file_path: str, data_type: str):
    """Save data to file with error handling"""
    try:
        with open(file_path, 'w', encoding='utf-8') as file:
            json.dump(data, file, indent=2, ensure_ascii=False)
        print(f"👉 {data_type} saved to: {file_path}")
    except Exception as e:
        print(f"❌ Error saving {data_type}: {e}")

def is_bid_already_processed(bid: Dict[str, Any], existing_embeddings: List[Dict]) -> bool:
    """Check if bid is already processed by comparing tag names"""
    if not existing_embeddings:
        return False
    
    bid_number = extract_bid_number(bid.get("b_bid_number", []))
    if not bid_number:
        return False
    
    existing_tags = {entry.get("tagName", "") for entry in existing_embeddings}
    return bid_number in existing_tags

def process_bids_for_embeddings(input_file: str, output_file: str, embeddings_file: str, no_of_bids_to_process: int):
    """
    Process bids data to extract scope of work and create embeddings training data
    Updates output files after each bid to prevent data loss
    
    Args:
        input_file: Path to input JSON file (processed_bids_data.json)
        output_file: Path to output JSON file with updated scope of work
        embeddings_file: Path to embeddings training data JSON file
        no_of_bids_to_process: Maximum number of new bids to process in this run
    """
    
    # Load the processed bids data
    try:
        with open(input_file, 'r', encoding='utf-8') as file:
            data = json.load(file)
    except FileNotFoundError:
        print(f"❌ Error: {input_file} not found!")
        return
    except json.JSONDecodeError as e:
        print(f"❌ Error parsing JSON: {e}")
        return
    
    # Load existing output files if they exist
    print("📂 Checking for existing output files...")
    existing_bids_data = load_existing_data(output_file)
    existing_embeddings = load_existing_data(embeddings_file)
    
    if existing_bids_data:
        print(f"📄 Found existing bids data with {len(existing_bids_data.get('bids', []))} bids")
        # Resume from the previously saved output so earlier SOW extractions are not overwritten
        data = existing_bids_data
    
    if existing_embeddings:
        print(f"📊 Found existing embeddings data with {len(existing_embeddings)} entries")
        embeddings_data = existing_embeddings
    else:
        embeddings_data = []
    
    print(f"🚀 Starting processing of {len(data['bids'])} bids...")
    print("=" * 70)
    
    # Process each bid
    processed_bids = 0
    skipped_bids = 0
    
    for i, bid in enumerate(data['bids']):
        try:
            print(f"\n📝 Processing bid {i+1}/{len(data['bids'])}...")
            
            # Check if bid is already processed
            if is_bid_already_processed(bid, embeddings_data):
                print(f"⏭️  Bid already processed, skipping...")
                skipped_bids += 1
                continue
            
            # 1. Extract bid URL
            bid_url = bid.get("b_id", "")
            if not bid_url:
                print(f"⚠️  Warning: No bid URL found for bid {i+1}")
                continue
            
            # 2. Extract scope of work using the function
            print(f"🔍 Extracting SOW from: {bid_url}")
            try:
                extracted_info = extract_sow_from_bid(bid_url)
                bid["b_scope_work"] = extracted_info
                print(f"✅ SOW extracted successfully")
            except Exception as e:
                print(f"⚠️  Warning: Failed to extract SOW for {bid_url}: {e}")
                bid["b_scope_work"] = {"error": f"Failed to extract: {str(e)}"}
            
            # 3. Create bid information strings for embeddings
            # Question field (without evaluation info)
            question_string = create_bid_info_string(bid, answer_string=False)
            
            # Answer field (with evaluation info)
            answer_string = create_bid_info_string(bid, answer_string=True)
            
            # 4. Extract tag name (main bid number)
            tag_name = extract_bid_number(bid.get("b_bid_number", []))
            if not tag_name:
                tag_name = f"BID_{i+1}"
            
            # 5. Create embeddings data entry
            embeddings_entry = {
                "tagName": tag_name,
                "question": [question_string],
                "answer": answer_string,
                "question_neg": []
            }
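            # Shape of one entry (values hypothetical):
            #   {"tagName": "GEM/2025/B/123456",
            #    "question": ["This bid is for the following categories: ..."],
            #    "answer": "Information about GeM Bid ID: GEM/2025/B/123456. ...",
            #    "question_neg": []}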
            
            # 6. Add to embeddings data
            embeddings_data.append(embeddings_entry)
            processed_bids += 1
            
            # 7. Save both files after each bid processing
            print(f"💾 Saving progress...")
            save_data_safely(data, output_file, "Updated bids data")
            save_data_safely(embeddings_data, embeddings_file, "Embeddings training data")
            
            print(f"✅ Bid {i+1} processed successfully (Tag: {tag_name})")

            if (processed_bids >= no_of_bids_to_process):
                break
            
        except Exception as e:
            print(f"❌ Error processing bid {i+1}: {e}")
            print(f"📄 Continuing with next bid...")
            continue
    
    print("\n" + "=" * 70)
    print(f"🎉 Processing completed!")
    print(f"📈 Successfully processed: {processed_bids} new bids")
    print(f"⏭️ Skipped (already processed): {skipped_bids} bids")
    print(f"📊 Total embeddings entries: {len(embeddings_data)}")
    print(f"📁 Output files:")
    print(f"   📄 Bids with SOW: {output_file}")
    print(f"   🧠 Embeddings data: {embeddings_file}")

def main():
    """Main function"""
    input_file = "processed_bids_data_1.json"
    output_file = "bids_with_sow.json"
    embeddings_file = "bids_embeddings_training.json"
    
    print("🚀 Starting bid information extraction and embeddings data creation...")
    print("=" * 70)
    
    process_bids_for_embeddings(input_file, output_file, embeddings_file, 850)

if __name__ == "__main__":
    main()
