"""
Data Formatter Module

This module takes analyzed document data and formats it into the required
output formats: human-readable text format and structured JSON format.
"""

import copy
import json
import logging
import os
import re
from datetime import datetime
from typing import Dict, Any, List, Optional

# Module-level logger, named after this module
logger = logging.getLogger(__name__)

class DataFormatter:
    """
    Data formatter that converts analyzed document data into required output formats.

    Every public method takes ``analysis_results``: a dictionary whose values
    are per-document result dictionaries. The keys read from each result are
    ``doc_type``, ``key_info``, ``file_name``, ``description`` and
    ``confidence``; all of them are optional. The dictionary keys (named
    ``file_path`` by the callers' convention) are never inspected.
    """

    # Substrings of ``doc_type`` that route a document to "otherDocuments".
    _OTHER_DOCUMENT_MARKERS = ("Other:", "Technical:", "HR:")

    def __init__(self):
        """Initialize the data formatter."""
        # Pristine output skeleton. It is deep-copied for every
        # format_to_json() call and must never be mutated in place.
        self.company_form_template = {
            "totalEmployees": "",
            "technicalStaff": "",
            "employeeDetailsSpreadsheet": "",
            "financialDetails": {
                "turnoverDetails": [],
                "itrDetails": [],
                "ca_certificate_details": [],
                "balanceSheetDetails": [],
                "certificate": [],
                "pastExperienceDetails": []
            },
            "otherDocuments": [],
            "defaultKeywords": [],
            "companyName": "",
            "entityType": "",
            "registeredAddress": "",
            "branchOfficeAddress": [],
            "companyWebsite": "",
            "about": "",
            "officePhoneNumber": "",
            "officeEmail": "",
            "officeFax": "",
            "authorizedPersonToSignDocs": "",
            "authorizedPersonDesignation": "",
            "cin": "",
            "pan": "",
            "gst": "",
            "msme": "",
            "typeOfMsme": "",
            "startUpRegistration": {
                "startUpRegistrationNumber": "",
                "file": ""
            },
            "cinFile": "",
            "panFile": "",
            "gstFile": "",
            "msmeFile": "",
            "directorDetails": [],
            "certificate": [],
            "pastExperienceDetails": []
        }

    @staticmethod
    def _doc_type_of(result: Dict[str, Any]) -> str:
        """Return the result's ``doc_type``, or ``""`` when missing or ``None``."""
        doc_type = result.get("doc_type", "")
        return "" if doc_type is None else doc_type

    @staticmethod
    def _key_info_of(result: Dict[str, Any]) -> Dict[str, Any]:
        """Return the result's ``key_info``, or ``{}`` when missing or ``None``."""
        key_info = result.get("key_info", {})
        return {} if key_info is None else key_info

    @staticmethod
    def _confidence_of(result: Dict[str, Any]) -> float:
        """Return the result's ``confidence`` as a float, or ``0.0`` if unusable."""
        try:
            return float(result.get("confidence", 0))
        except (TypeError, ValueError):
            # None or a non-numeric value: treat as "no confidence" rather
            # than aborting the whole report.
            return 0.0

    def extract_financial_info(self, analysis_results: Dict[str, Dict[str, Any]]) -> Dict[str, Any]:
        """
        Extract financial information from analyzed documents.

        Documents are routed by substring match on ``doc_type``, first match
        wins: ITR, balance sheet, CA/turnover certificate, then any other
        ``Financial:`` document (stored under ``certificate``).
        ``turnoverDetails`` is always returned empty.

        Args:
            analysis_results: Dictionary of analysis results

        Returns:
            Dict[str, Any]: Extracted financial information
        """
        financial_info: Dict[str, Any] = {
            "turnoverDetails": [],
            "itrDetails": [],
            "ca_certificate_details": [],
            "balanceSheetDetails": [],
            "certificate": []
        }

        for result in analysis_results.values():
            doc_type = self._doc_type_of(result)
            key_info = self._key_info_of(result)
            file_name = result.get("file_name", "")

            if "Financial:ITR" in doc_type or "Income_Tax_Return" in doc_type:
                financial_info["itrDetails"].append({
                    "itr": key_info.get("total_income", key_info.get("gross_income", "")),
                    "itrYear": key_info.get("assessment_year", key_info.get("year", ""))
                })
            elif "Financial:Balance_Sheet" in doc_type:
                financial_info["balanceSheetDetails"].append({
                    "auditedBalanceSheet": file_name,
                    "auditedBalanceSheetYear": key_info.get("year", key_info.get("financial_year", ""))
                })
            elif "Financial:CA_Certificate" in doc_type or "Turnover_Certificate" in doc_type:
                financial_info["ca_certificate_details"].append({
                    "ca_certificate": file_name,
                    "ca_certificate_description": result.get("description", "")
                })
            elif "Financial:" in doc_type:
                financial_info["certificate"].append({
                    "description": result.get("description", ""),
                    "fileName": file_name
                })

        return financial_info

    def extract_experience_info(self, analysis_results: Dict[str, Dict[str, Any]]) -> List[Dict[str, Any]]:
        """
        Extract past experience information from analyzed documents.

        Only documents whose ``doc_type`` contains ``"Experience:"`` are used.
        Each field is read from its camelCase ``key_info`` key first, then
        from a fallback key, and defaults to ``""``.

        Args:
            analysis_results: Dictionary of analysis results

        Returns:
            List[Dict[str, Any]]: List of past experience details
        """
        experience_details = []

        for result in analysis_results.values():
            if "Experience:" not in self._doc_type_of(result):
                continue

            key_info = self._key_info_of(result)
            experience_details.append({
                "customer": key_info.get("customer", key_info.get("client", "")),
                "clientLocation": key_info.get("clientLocation", key_info.get("location", "")),
                "project": key_info.get("project", key_info.get("project_title", "")),
                "projectScope": key_info.get("projectScope", key_info.get("scope", "")),
                "projectValue": key_info.get("projectValue", key_info.get("value", "")),
                "projectStartDate": key_info.get("projectStartDate", key_info.get("start_date", "")),
                "projectEndDate": key_info.get("projectEndDate", key_info.get("end_date", "")),
                "document": result.get("file_name", "")
            })

        return experience_details

    def extract_legal_info(self, analysis_results: Dict[str, Dict[str, Any]]) -> Dict[str, str]:
        """
        Extract legal and registration information from analyzed documents.

        Only documents whose ``doc_type`` contains ``"Legal:"`` are used. When
        several documents of the same kind are present, the last one seen
        overwrites the earlier values; the company name, by contrast, is
        taken from the first legal document that provides one.

        Args:
            analysis_results: Dictionary of analysis results

        Returns:
            Dict[str, str]: Legal information
        """
        legal_info = {
            "companyName": "",
            "cin": "",
            "pan": "",
            "gst": "",
            "msme": "",
            "cinFile": "",
            "panFile": "",
            "gstFile": "",
            "msmeFile": ""
        }

        for result in analysis_results.values():
            doc_type = self._doc_type_of(result)
            if "Legal:" not in doc_type:
                continue

            key_info = self._key_info_of(result)
            file_name = result.get("file_name", "")

            if "PAN" in doc_type:
                legal_info["pan"] = key_info.get("pan_number", "")
                legal_info["panFile"] = file_name
            elif "GST" in doc_type:
                legal_info["gst"] = key_info.get("gst_number", "")
                legal_info["gstFile"] = file_name
            elif "CIN" in doc_type or "Incorporation" in doc_type:
                legal_info["cin"] = key_info.get("cin", key_info.get("registration_number", ""))
                legal_info["cinFile"] = file_name
            elif "MSME" in doc_type:
                legal_info["msme"] = key_info.get("msme_number", key_info.get("registration_number", ""))
                legal_info["msmeFile"] = file_name

            # Extract company name if available (first non-empty value wins)
            company_name = key_info.get("company_name")
            if company_name and not legal_info["companyName"]:
                legal_info["companyName"] = company_name

        return legal_info

    def extract_certificates_info(self, analysis_results: Dict[str, Dict[str, Any]]) -> List[Dict[str, str]]:
        """
        Extract certificates information from analyzed documents.

        Only documents whose ``doc_type`` contains ``"Certificates:"`` are used.

        Args:
            analysis_results: Dictionary of analysis results

        Returns:
            List[Dict[str, str]]: List of certificates
        """
        return [
            {
                "description": result.get("description", ""),
                "fileName": result.get("file_name", "")
            }
            for result in analysis_results.values()
            if "Certificates:" in self._doc_type_of(result)
        ]

    def extract_other_documents(self, analysis_results: Dict[str, Dict[str, Any]]) -> List[Dict[str, str]]:
        """
        Extract other documents information.

        A document qualifies when its ``doc_type`` contains ``"Other:"``,
        ``"Technical:"`` or ``"HR:"``. The file name is used for both the
        ``name`` and the ``file`` field.

        Args:
            analysis_results: Dictionary of analysis results

        Returns:
            List[Dict[str, str]]: List of other documents
        """
        other_docs = []

        for result in analysis_results.values():
            doc_type = self._doc_type_of(result)
            if not any(marker in doc_type for marker in self._OTHER_DOCUMENT_MARKERS):
                continue

            file_name = result.get("file_name", "")
            other_docs.append({
                "name": file_name,
                "description": result.get("description", ""),
                "file": file_name
            })

        return other_docs

    def format_to_json(self, analysis_results: Dict[str, Dict[str, Any]]) -> Dict[str, Any]:
        """
        Format analyzed data into JSON structure based on company_form.json template.

        Args:
            analysis_results: Dictionary of analysis results

        Returns:
            Dict[str, Any]: Formatted JSON data. The returned structure is
            independent of the template and of results from other calls.
        """
        # Deep copy: a shallow copy would share the nested "financialDetails"
        # dict (and the other nested containers) with the template, so each
        # call would mutate the template and previously returned results.
        formatted_data = copy.deepcopy(self.company_form_template)

        # Extract different types of information
        financial_info = self.extract_financial_info(analysis_results)
        experience_info = self.extract_experience_info(analysis_results)
        legal_info = self.extract_legal_info(analysis_results)
        certificates_info = self.extract_certificates_info(analysis_results)
        other_docs = self.extract_other_documents(analysis_results)

        # Update the formatted data
        formatted_data["financialDetails"].update(financial_info)
        formatted_data["financialDetails"]["pastExperienceDetails"] = experience_info
        formatted_data.update(legal_info)
        formatted_data["certificate"] = certificates_info
        formatted_data["pastExperienceDetails"] = experience_info
        formatted_data["otherDocuments"] = other_docs

        return formatted_data

    def format_to_text(self, analysis_results: Dict[str, Dict[str, Any]]) -> str:
        """
        Format analyzed data into human-readable text format.

        Documents are grouped by category (the part of ``doc_type`` before the
        first ``":"``), in order of first appearance, followed by a summary
        section and a timestamp. A missing, ``None`` or non-numeric
        ``confidence`` is reported as ``0.00`` and counted as zero in the
        average.

        Args:
            analysis_results: Dictionary of analysis results

        Returns:
            str: Formatted text
        """
        text_output = [
            "COMPANY INFORMATION EXTRACTED FROM DOCUMENTS",
            "=" * 60,
            "",
        ]

        # Group documents by category
        categories: Dict[str, List[Dict[str, Any]]] = {}
        for result in analysis_results.values():
            doc_type = result.get("doc_type", "Other:Unknown")
            if doc_type is None:
                # An explicit None is treated like a missing doc_type.
                doc_type = "Other:Unknown"
            category = doc_type.split(":")[0]
            categories.setdefault(category, []).append(result)

        # Format each category
        for category, docs in categories.items():
            text_output.append(f"{category.upper()} DOCUMENTS:")
            text_output.append("-" * 30)

            for doc in docs:
                text_output.append(f"Document: {doc.get('file_name', 'Unknown')}")
                text_output.append(f"Type: {doc.get('doc_type', 'Unknown')}")
                text_output.append(f"Description: {doc.get('description', 'No description')}")

                key_info = doc.get('key_info', {})
                if key_info:
                    text_output.append("Key Information:")
                    # Empty/falsy values are omitted from the listing.
                    text_output.extend(
                        f"  {key}: {value}" for key, value in key_info.items() if value
                    )

                text_output.append(f"Confidence: {self._confidence_of(doc):.2f}")
                text_output.append("")

            text_output.append("")

        # Add summary section
        text_output.append("DOCUMENT ANALYSIS SUMMARY:")
        text_output.append("-" * 30)

        total_docs = len(analysis_results)
        if total_docs > 0:
            confidence_sum = sum(self._confidence_of(result) for result in analysis_results.values())
            avg_confidence = confidence_sum / total_docs
        else:
            # Avoid dividing by zero when there is nothing to summarize.
            avg_confidence = 0.0

        text_output.append(f"Total Documents Analyzed: {total_docs}")
        text_output.append(f"Average Confidence: {avg_confidence:.2f}")
        text_output.append(f"Categories Found: {', '.join(categories.keys())}")
        text_output.append("")

        text_output.append(f"Analysis completed on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

        return "\n".join(text_output)

    def save_formatted_data(self, analysis_results: Dict[str, Dict[str, Any]], output_dir: str) -> Dict[str, str]:
        """
        Save formatted data to files.

        Writes three UTF-8 files into ``output_dir`` (created if necessary):
        ``company_info.json``, ``company_info.txt`` and ``raw_analysis.json``.

        Args:
            analysis_results: Dictionary of analysis results
            output_dir: Directory to save output files

        Returns:
            Dict[str, str]: Dictionary mapping output type (``"json"``,
            ``"text"``, ``"raw_analysis"``) to file path
        """
        os.makedirs(output_dir, exist_ok=True)

        output_files = {}

        # Save JSON format
        json_data = self.format_to_json(analysis_results)
        json_path = os.path.join(output_dir, "company_info.json")
        with open(json_path, 'w', encoding='utf-8') as f:
            json.dump(json_data, f, indent=2, ensure_ascii=False)
        output_files["json"] = json_path

        # Save text format
        text_data = self.format_to_text(analysis_results)
        text_path = os.path.join(output_dir, "company_info.txt")
        with open(text_path, 'w', encoding='utf-8') as f:
            f.write(text_data)
        output_files["text"] = text_path

        # Save raw analysis results
        analysis_path = os.path.join(output_dir, "raw_analysis.json")
        with open(analysis_path, 'w', encoding='utf-8') as f:
            json.dump(analysis_results, f, indent=2, ensure_ascii=False)
        output_files["raw_analysis"] = analysis_path

        logger.info(f"Saved formatted data to {output_dir}")
        logger.info(f"  - JSON format: {json_path}")
        logger.info(f"  - Text format: {text_path}")
        logger.info(f"  - Raw analysis: {analysis_path}")

        return output_files


# Factory function
def create_data_formatter() -> DataFormatter:
    """
    Build a new DataFormatter and hand it back to the caller.

    Returns:
        DataFormatter: A freshly constructed formatter instance
    """
    formatter = DataFormatter()
    return formatter