"""
Main Document Processor

This is the main application that orchestrates the entire document processing pipeline:
1. Recursive document discovery and text extraction
2. Document analysis using Claude LLM
3. Data formatting into required output formats
"""

import os
import sys
import json
import logging
import argparse
from typing import Dict, Any, Optional
from datetime import datetime

# Import our custom modules
from enhanced_document_extractor import create_enhanced_document_extractor
from document_analyzer import create_document_analyzer
from data_formatter import create_data_formatter

# Configure logging: mirror all pipeline messages to both a log file and
# stdout, so runs are observable live and auditable afterwards.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        # Persistent log file, created in the current working directory.
        logging.FileHandler('document_processor.log'),
        # Echo the same records to the console.
        logging.StreamHandler(sys.stdout)
    ]
)
# Module-level logger used by everything in this file.
logger = logging.getLogger(__name__)

class DocumentProcessor:
    """
    Main document processor that orchestrates the entire pipeline.

    The pipeline has three stages:
        1. Recursive document discovery and text extraction.
        2. Document analysis using the Claude LLM.
        3. Formatting of analysis results into the required output files.
    """

    def __init__(self, claude_api_key: str):
        """
        Initialize the document processor and its pipeline components.

        Args:
            claude_api_key (str): API key for Claude.
        """
        self.claude_api_key = claude_api_key
        self.extractor = create_enhanced_document_extractor(claude_api_key)
        self.analyzer = create_document_analyzer(claude_api_key)
        self.formatter = create_data_formatter()

    def process_documents(self, input_dir: str, output_dir: str,
                          use_cache: bool = True) -> Dict[str, Any]:
        """
        Process all documents in the input directory and generate outputs.

        Args:
            input_dir (str): Directory containing documents to process.
            output_dir (str): Directory to save output files (created if missing).
            use_cache (bool): Whether to use caching for extraction.

        Returns:
            Dict[str, Any]: Processing results and statistics. Always contains
            a "success" flag; per-step timings/stats live under "steps", and on
            failure an "error" message is included.
        """
        start_time = datetime.now()
        # Lazy %-style args: formatting is deferred until the record is emitted.
        logger.info("Starting document processing pipeline")
        logger.info("Input directory: %s", input_dir)
        logger.info("Output directory: %s", output_dir)

        # Ensure the output directory exists before any file is written.
        os.makedirs(output_dir, exist_ok=True)

        processing_results: Dict[str, Any] = {
            "start_time": start_time.isoformat(),
            "input_dir": input_dir,
            "output_dir": output_dir,
            "steps": {}
        }

        try:
            # Step 1: Extract text from documents.
            logger.info("Step 1: Extracting text from documents...")
            step1_start = datetime.now()

            # The input directory path doubles as the cache key when caching
            # is enabled; None disables extractor-side caching.
            cache_key = input_dir if use_cache else None
            documents_text, extraction_success, extraction_message = self.extractor.extract_documents_text_recursive(
                input_dir, cache_key=cache_key
            )

            step1_duration = datetime.now() - step1_start
            processing_results["steps"]["extraction"] = {
                "success": extraction_success,
                "message": extraction_message,
                "duration": step1_duration.total_seconds(),
                "documents_found": len(documents_text)
            }

            if not extraction_success:
                logger.error("Text extraction failed: %s", extraction_message)
                processing_results["success"] = False
                processing_results["error"] = extraction_message
                return processing_results

            logger.info("Successfully extracted text from %d documents", len(documents_text))

            # Save the raw extracted text alongside the outputs for debugging.
            extracted_text_path = os.path.join(output_dir, "extracted_text.json")
            with open(extracted_text_path, 'w', encoding='utf-8') as f:
                json.dump(documents_text, f, indent=2, ensure_ascii=False)

            # Step 2: Analyze documents.
            logger.info("Step 2: Analyzing documents with Claude...")
            step2_start = datetime.now()

            analysis_results = self.analyzer.analyze_documents_batch(documents_text)

            step2_duration = datetime.now() - step2_start
            analysis_summary = self.analyzer.get_analysis_summary(analysis_results)

            processing_results["steps"]["analysis"] = {
                "success": True,
                "duration": step2_duration.total_seconds(),
                "documents_analyzed": len(analysis_results),
                "summary": analysis_summary
            }

            logger.info("Successfully analyzed %d documents", len(analysis_results))
            logger.info("Average confidence: %.2f", analysis_summary['avg_confidence'])

            # Step 3: Format and save results.
            logger.info("Step 3: Formatting and saving results...")
            step3_start = datetime.now()

            output_files = self.formatter.save_formatted_data(analysis_results, output_dir)

            step3_duration = datetime.now() - step3_start
            processing_results["steps"]["formatting"] = {
                "success": True,
                "duration": step3_duration.total_seconds(),
                "output_files": output_files
            }

            # Final processing results. Capture "now" once so end_time and
            # total_duration are consistent with each other.
            end_time = datetime.now()
            total_duration = end_time - start_time
            processing_results["success"] = True
            processing_results["end_time"] = end_time.isoformat()
            processing_results["total_duration"] = total_duration.total_seconds()
            processing_results["output_files"] = output_files

            logger.info("Document processing completed successfully in %s", total_duration)
            logger.info("Output files saved to: %s", output_dir)

            return processing_results

        except Exception as e:
            # logger.exception records the full traceback, unlike logger.error.
            logger.exception("Error during document processing: %s", e)
            processing_results["success"] = False
            processing_results["error"] = str(e)
            processing_results["end_time"] = datetime.now().isoformat()
            return processing_results

    def process_single_document(self, file_path: str, output_dir: str) -> Dict[str, Any]:
        """
        Process a single document file.

        Args:
            file_path (str): Path to the document file.
            output_dir (str): Directory to save output files (created if missing).

        Returns:
            Dict[str, Any]: On success: {"success": True, "analysis_result": ...,
            "output_files": ...}; on failure: {"success": False, "error": ...}.
        """
        logger.info("Processing single document: %s", file_path)

        if not os.path.exists(file_path):
            return {"success": False, "error": f"File not found: {file_path}"}

        # Match process_documents: make sure the output directory exists.
        os.makedirs(output_dir, exist_ok=True)

        try:
            # Extract text from the single file.
            text = self.extractor.extract_text_from_file(file_path)

            # The extractor signals failure by returning an "Error..." string.
            if text.startswith("Error"):
                return {"success": False, "error": text}

            # Analyze the document with Claude.
            analysis_result = self.analyzer.analyze_document(text, file_path)

            # The formatter expects a {file_path: result} mapping.
            analysis_results = {file_path: analysis_result}
            output_files = self.formatter.save_formatted_data(analysis_results, output_dir)

            return {
                "success": True,
                "analysis_result": analysis_result,
                "output_files": output_files
            }

        except Exception as e:
            logger.exception("Error processing single document: %s", e)
            return {"success": False, "error": str(e)}

    def get_supported_file_types(self) -> Dict[str, list]:
        """
        Get the list of supported file types.

        Returns:
            Dict[str, list]: Dictionary of supported file extensions by category.
        """
        return self.extractor.supported_extensions


# Example usage functions
def process_directory(input_dir: str, output_dir: str, api_key: str):
    """
    Example function showing how to process a directory of documents.

    Args:
        input_dir (str): Directory containing documents
        output_dir (str): Directory to save outputs
        api_key (str): Claude API key
    """
    # Build the processor and run the full pipeline over the directory.
    results = DocumentProcessor(api_key).process_documents(input_dir, output_dir)

    # Report the outcome on stdout.
    if not results["success"]:
        print(f"Processing failed: {results['error']}")
    else:
        print("Processing completed successfully!")
        print(f"Output files: {results['output_files']}")

    return results

def process_single_file(file_path: str, output_dir: str, api_key: str):
    """
    Example function showing how to process a single document.

    Args:
        file_path (str): Path to document file
        output_dir (str): Directory to save outputs
        api_key (str): Claude API key

    Returns:
        Dict[str, Any]: The processing results from
        DocumentProcessor.process_single_document (consistent with
        process_directory, which also returns its results).
    """
    processor = DocumentProcessor(api_key)
    results = processor.process_single_document(file_path, output_dir)

    if results["success"]:
        print("Document processed successfully!")
        print(f"Document type: {results['analysis_result']['doc_type']}")
        print(f"Confidence: {results['analysis_result']['confidence']:.2f}")
    else:
        print(f"Processing failed: {results['error']}")

    # Return results so callers can inspect them programmatically,
    # matching the behavior of process_directory.
    return results