import { Document, DocumentChunk } from "../types";
import { DocumentLoaderFactory } from "../loaders";
import {
  RecursiveTextSplitter,
  TextSplitterConfig,
} from "../splitters/recursive";

/**
 * Options accepted by the `DocumentProcessingPipeline` constructor.
 */
export interface ProcessingPipelineConfig {
  /** Configuration passed to the `RecursiveTextSplitter` that chunks each document. */
  textSplitter: TextSplitterConfig;
  /**
   * Whether the pipeline fills in `metadata.language`, `metadata.description`
   * and `metadata.categories`. The pipeline constructor defaults this to `true`.
   */
  enableMetadataExtraction?: boolean;
  /**
   * Whether the pipeline normalizes whitespace in the document content before
   * splitting. The pipeline constructor defaults this to `true`.
   */
  enableTextCleaning?: boolean;
}

/**
 * Turns raw file content into a chunked `Document`.
 *
 * For each file the pipeline: loads it through `DocumentLoaderFactory`,
 * optionally derives metadata (language, description, categories), optionally
 * normalizes whitespace, and finally splits the content with
 * `RecursiveTextSplitter`, storing the result on `document.chunks`.
 */
export class DocumentProcessingPipeline {
  /** Description used when no paragraph is long enough to summarize the text. */
  private static readonly NO_DESCRIPTION = "No description available";

  /** A paragraph must be longer than this (after trimming) to be used as a description. */
  private static readonly MIN_DESCRIPTION_PARAGRAPH_LENGTH = 50;

  /** Descriptions longer than this are cut and suffixed with "...". */
  private static readonly MAX_DESCRIPTION_LENGTH = 200;

  /** Share of Hangul syllables (relative to total length) above which text counts as Korean. */
  private static readonly KOREAN_RATIO_THRESHOLD = 0.1;

  /**
   * Category -> matchers. String matchers are lower-case and tested as
   * substrings of the lower-cased text. RegExp matchers are tested against the
   * original text; they are used for short acronyms, where a plain substring
   * test would hit unrelated words (e.g. "ai" inside "said").
   */
  private static readonly CATEGORY_KEYWORDS: Readonly<
    Record<string, ReadonlyArray<string | RegExp>>
  > = {
    technology: [
      "기술",
      "소프트웨어",
      "개발",
      /\bai\b/i,
      "머신러닝",
      "technology",
      "software",
      "development",
    ],
    business: [
      "비즈니스",
      "사업",
      "경영",
      "마케팅",
      "business",
      "marketing",
      "strategy",
    ],
    education: ["교육", "학습", "연구", "education", "learning", "research"],
    legal: ["법률", "계약", "규정", "legal", "contract", "regulation"],
  };

  private readonly loaderFactory: DocumentLoaderFactory;
  private readonly textSplitter: RecursiveTextSplitter;
  private readonly config: ProcessingPipelineConfig;

  /**
   * @param config Pipeline options. Both `enable*` flags default to `true`
   *   when omitted or explicitly `undefined`.
   */
  constructor(config: ProcessingPipelineConfig) {
    // `??` instead of spread-over-defaults so that an explicit `undefined`
    // does not silently switch a step off.
    this.config = {
      ...config,
      enableMetadataExtraction: config.enableMetadataExtraction ?? true,
      enableTextCleaning: config.enableTextCleaning ?? true,
    };

    this.loaderFactory = new DocumentLoaderFactory();
    this.textSplitter = new RecursiveTextSplitter(config.textSplitter);
  }

  /**
   * Runs the full pipeline for a single file.
   *
   * @param filePath Path handed to the loader factory and used in error messages.
   * @param content Raw file bytes.
   * @returns The loaded document with `chunks` populated.
   * @throws Error wrapping any failure of a pipeline step, prefixed with the file path.
   */
  async processDocument(filePath: string, content: Buffer): Promise<Document> {
    try {
      // Step 1: Load and parse document
      const document = await this.loaderFactory.loadDocument(filePath, content);

      // Step 2: Extract additional metadata if enabled.
      // Runs before cleaning, so it sees the text as produced by the loader.
      if (this.config.enableMetadataExtraction) {
        await this.extractMetadata(document);
      }

      // Step 3: Clean text if enabled
      if (this.config.enableTextCleaning) {
        this.cleanDocumentText(document);
      }

      // Step 4: Split document into chunks
      document.chunks = await this.textSplitter.splitDocument(document);

      return document;
    } catch (error) {
      throw new Error(
        `Failed to process document ${filePath}: ${String(error)}`
      );
    }
  }

  /**
   * Processes files one after another, in input order.
   *
   * A file that fails does not abort the batch: its error is collected, the
   * collected errors are reported once via `console.warn`, and the file is
   * simply absent from the result.
   *
   * @param files Files to process.
   * @returns Documents for the files that were processed successfully.
   */
  async processBatch(
    files: Array<{ path: string; content: Buffer }>
  ): Promise<Document[]> {
    const results: Document[] = [];
    const errors: Array<{ file: string; error: string }> = [];

    for (const file of files) {
      try {
        results.push(await this.processDocument(file.path, file.content));
      } catch (error) {
        errors.push({ file: file.path, error: String(error) });
      }
    }

    if (errors.length > 0) {
      console.warn("Some files failed to process:", errors);
    }

    return results;
  }

  /** @returns The extensions reported by the underlying loader factory. */
  getSupportedExtensions(): string[] {
    return this.loaderFactory.getSupportedExtensions();
  }

  /**
   * Fills `language`, `description` and `categories` on `document.metadata`
   * from the document's current content.
   */
  private async extractMetadata(document: Document): Promise<void> {
    const { content } = document;

    document.metadata.language = this.detectLanguage(content);
    document.metadata.description = this.extractDescription(content);
    document.metadata.categories = this.extractCategories(content);
  }

  /**
   * Normalizes whitespace in place: every paragraph break (a blank line,
   * possibly containing whitespace) becomes exactly "\n\n", and every other
   * whitespace run becomes a single space.
   */
  private cleanDocumentText(document: Document): void {
    // Paragraphs are separated first and collapsed afterwards. Collapsing
    // `\s+` up front would turn every newline into a space and erase the
    // paragraph structure.
    document.content = document.content
      .replace(/\r\n?/g, "\n")
      .split(/\n\s*\n/)
      .map((paragraph) => paragraph.replace(/\s+/g, " ").trim())
      .filter((paragraph) => paragraph.length > 0)
      .join("\n\n");
  }

  /**
   * Simple heuristic: returns "ko" when Hangul syllables make up more than
   * 10% of the text's length, otherwise "en".
   */
  private detectLanguage(text: string): string {
    const koreanCount = text.match(/[가-힣]/g)?.length ?? 0;
    const threshold =
      text.length * DocumentProcessingPipeline.KOREAN_RATIO_THRESHOLD;

    // Empty text yields 0 > 0 === false and falls through to the default.
    return koreanCount > threshold ? "ko" : "en";
  }

  /**
   * Returns the first paragraph longer than 50 characters, trimmed and cut to
   * 200 characters ("..." is appended only when text was actually cut), or
   * "No description available" when there is no such paragraph.
   */
  private extractDescription(text: string): string {
    const firstMeaningful = text
      .split(/\n\s*\n/)
      .map((paragraph) => paragraph.trim())
      .find(
        (paragraph) =>
          paragraph.length >
          DocumentProcessingPipeline.MIN_DESCRIPTION_PARAGRAPH_LENGTH
      );

    // Explicit check: string concatenation on `undefined` would produce the
    // truthy text "undefined..." and hide the fallback.
    if (firstMeaningful === undefined) {
      return DocumentProcessingPipeline.NO_DESCRIPTION;
    }

    const limit = DocumentProcessingPipeline.MAX_DESCRIPTION_LENGTH;
    return firstMeaningful.length > limit
      ? `${firstMeaningful.substring(0, limit)}...`
      : firstMeaningful;
  }

  /**
   * Keyword-based categorization.
   *
   * @returns Every category with at least one matching keyword, in the order
   *   the categories are declared.
   */
  private extractCategories(text: string): string[] {
    const lowerText = text.toLowerCase();

    return Object.entries(DocumentProcessingPipeline.CATEGORY_KEYWORDS)
      .filter(([, matchers]) =>
        matchers.some((matcher) =>
          typeof matcher === "string"
            ? lowerText.includes(matcher)
            : matcher.test(text)
        )
      )
      .map(([category]) => category);
  }
}
