import { Document } from "../types";
import { BaseDocumentLoader } from "./base";

/**
 * Document loader for PDF files.
 *
 * Text is extracted with a chain of strategies, each one tried only when the
 * previous one throws:
 *   1. `pdfjs-dist`
 *   2. `pdf-parse`
 *   3. a regex scan over the raw bytes (last resort, very lossy)
 *
 * The extracted text is passed through the inherited `cleanText`, and the
 * resulting `Document` carries metadata built by the inherited
 * `createBaseMetadata`, enriched with whatever the PDF info dictionary
 * provided (title, author, creation date).
 */
export class PDFDocumentLoader extends BaseDocumentLoader {
  supportedExtensions = ["pdf"];

  /**
   * Builds a `Document` from the raw bytes of a PDF file.
   *
   * @param filePath - Path of the PDF; used for logging, the document id,
   *   the `source` field and as a title fallback.
   * @param content - Raw bytes of the PDF file.
   * @returns The document with cleaned text and merged metadata.
   * @throws Error prefixed with "Failed to load PDF document:" when fewer
   *   than 10 non-whitespace-trimmed characters could be extracted, or when
   *   any later step fails.
   */
  async load(filePath: string, content: Buffer): Promise<Document> {
    try {
      console.log(
        `[PDF Loader] Processing: ${filePath}, size: ${content.length} bytes`
      );

      const { text: extractedText, metadata } =
        await this.extractWithFallbacks(content);

      // NOTE(review): the placeholder sentences returned by extractBasicText
      // are longer than 10 characters, so they pass this check and end up as
      // document content. Confirm that this is the intended behavior.
      if (!extractedText || extractedText.trim().length < 10) {
        throw new Error("Unable to extract meaningful text from PDF");
      }

      const cleanedText = this.cleanText(extractedText);
      const docMetadata = this.createBaseMetadata(filePath, content.length);

      // Merge metadata extracted from the PDF into the base metadata.
      docMetadata.title =
        metadata.title || this.extractTitleFromFilename(filePath);
      docMetadata.author = metadata.author;
      docMetadata.description = this.extractDescription(cleanedText);

      // PDF info dictionaries store dates as "D:YYYYMMDDHHmmSS+HH'mm'", which
      // `new Date()` cannot parse. Only override the base value when the
      // date could actually be understood.
      const createdAt = this.parsePdfDate(metadata.createdAt);
      if (createdAt) {
        docMetadata.createdAt = createdAt;
      }

      return {
        id: this.generateDocumentId(filePath),
        content: cleanedText,
        metadata: docMetadata,
        source: filePath,
      };
    } catch (error) {
      throw new Error(`Failed to load PDF document: ${error}`);
    }
  }

  /**
   * Runs the extraction strategies in order and returns the result of the
   * first one that does not throw. The raw-byte fallback yields no metadata.
   */
  private async extractWithFallbacks(
    content: Buffer
  ): Promise<{ text: string; metadata: any }> {
    // Method 1: pdfjs-dist
    try {
      const result = await this.extractWithPDFJS(content);
      console.log(
        `[PDF Loader] Successfully extracted ${result.text.length} characters with PDF.js`
      );
      return result;
    } catch (pdfjsError) {
      console.warn(`[PDF Loader] PDF.js failed:`, pdfjsError);
    }

    // Method 2: pdf-parse
    try {
      const result = await this.extractWithPDFParse(content);
      console.log(
        `[PDF Loader] Successfully extracted ${result.text.length} characters with pdf-parse`
      );
      return result;
    } catch (parseError) {
      console.warn(`[PDF Loader] pdf-parse failed:`, parseError);
    }

    // Method 3: regex scan over the raw bytes
    const text = this.extractBasicText(content);
    console.log(
      `[PDF Loader] Using basic text extraction, got ${text.length} characters`
    );
    return { text, metadata: {} };
  }

  /**
   * Extracts text and info-dictionary metadata with `pdfjs-dist`.
   *
   * Pages that fail to extract are logged and skipped; a metadata failure is
   * logged and leaves the metadata object empty.
   *
   * @throws Error prefixed with "PDF.js extraction failed:" when the library
   *   cannot be imported or the document cannot be opened.
   */
  private async extractWithPDFJS(
    content: Buffer
  ): Promise<{ text: string; metadata: any }> {
    try {
      // Imported lazily so the dependency is only loaded when a PDF is read.
      const pdfjsLib = await import("pdfjs-dist");

      // PDF.js expects a plain Uint8Array rather than a Node Buffer.
      const pdfData = new Uint8Array(content);

      const loadingTask = pdfjsLib.getDocument({
        data: pdfData,
        verbosity: 0, // Keep PDF.js console output to a minimum
      });

      try {
        const pdfDocument = await loadingTask.promise;
        let fullText = "";
        const metadata: any = {};

        // Extract metadata
        try {
          const pdfMetadata = await pdfDocument.getMetadata();
          const info = (pdfMetadata as any).info;
          metadata.title = info?.Title;
          metadata.author = info?.Author;
          metadata.subject = info?.Subject;
          metadata.creator = info?.Creator;
          metadata.producer = info?.Producer;
          metadata.createdAt = info?.CreationDate;
          metadata.modifiedAt = info?.ModDate;
        } catch (metaError) {
          console.warn(
            "[PDF Loader] Failed to extract PDF metadata:",
            metaError
          );
        }

        // Extract text from all pages (PDF.js page numbers are 1-based).
        const numPages = pdfDocument.numPages;
        console.log(`[PDF Loader] PDF has ${numPages} pages`);

        for (let pageNumber = 1; pageNumber <= numPages; pageNumber++) {
          try {
            const page = await pdfDocument.getPage(pageNumber);
            const textContent = await page.getTextContent();

            const pageText = textContent.items
              .map((item: any) => {
                // Items may be plain strings or objects exposing the text
                // under `str` or `text`; anything else contributes nothing.
                if (typeof item === "string") return item;
                if (item.str) return item.str;
                if (item.text) return item.text;
                return "";
              })
              .join(" ");

            fullText += pageText + "\n\n";
          } catch (pageError) {
            console.warn(
              `[PDF Loader] Failed to extract page ${pageNumber}:`,
              pageError
            );
          }
        }

        return { text: fullText.trim(), metadata };
      } finally {
        // Release the document and its worker resources whether extraction
        // succeeded or not; a cleanup failure must not mask the real result.
        try {
          await loadingTask.destroy();
        } catch (destroyError) {
          console.warn(
            "[PDF Loader] Failed to release PDF.js resources:",
            destroyError
          );
        }
      }
    } catch (error) {
      throw new Error(
        `PDF.js extraction failed: ${this.describeError(error)}`
      );
    }
  }

  /**
   * Extracts text and info-dictionary metadata with `pdf-parse`.
   *
   * @throws Error prefixed with "pdf-parse extraction failed:" when the
   *   library cannot be imported or parsing fails.
   */
  private async extractWithPDFParse(
    content: Buffer
  ): Promise<{ text: string; metadata: any }> {
    try {
      // Imported lazily; depending on module interop the parse function is
      // either the default export or the module object itself.
      const pdfParse = await import("pdf-parse");
      const parseFn = (pdfParse.default || pdfParse) as any;

      // NOTE(review): `normalizeWhitespace` and `disableCombineTextItems`
      // look like PDF.js getTextContent settings rather than top-level
      // pdf-parse options - confirm they have any effect when passed here.
      const options = {
        max: 0, // No page limit
        normalizeWhitespace: false,
        disableCombineTextItems: false,
        // Pin the bundled PDF.js build that pdf-parse should use.
        version: "v1.10.100",
      };

      const result = await parseFn(content, options);

      const metadata: any = {};
      if (result.info) {
        metadata.title = result.info.Title;
        metadata.author = result.info.Author;
        metadata.subject = result.info.Subject;
        metadata.creator = result.info.Creator;
        metadata.producer = result.info.Producer;
        metadata.createdAt = result.info.CreationDate;
        metadata.modifiedAt = result.info.ModDate;
      }

      return { text: result.text || "", metadata };
    } catch (error) {
      throw new Error(
        `pdf-parse extraction failed: ${this.describeError(error)}`
      );
    }
  }

  /**
   * Last-resort extraction: scans the raw bytes for text-looking fragments.
   *
   * The patterns are tried in order and the first one that matches anything
   * wins. For patterns with a capture group only the captured part is kept,
   * so the `BT`/`ET` operators and the surrounding parentheses do not leak
   * into the result. At most 10000 characters are kept.
   *
   * Never throws: returns a fixed placeholder sentence when fewer than 10
   * characters were found or when the scan itself fails.
   */
  private extractBasicText(content: Buffer): string {
    try {
      // latin1 maps every byte to exactly one character, so nothing is lost
      // or rejected while decoding binary data.
      const text = content.toString("latin1");

      const textPatterns = [
        // Content between BT ... ET text-object operators
        /BT\s+(.+?)\s+ET/g,
        // Text between parentheses (PDF string literals)
        /\(([^)]+)\)/g,
        // Any run of at least 10 printable characters
        /[\x20-\x7E\u00A0-\u00FF\u0100-\u017F\u0180-\u024F\uAC00-\uD7AF\u3131-\u318E\u1100-\u11FF]{10,}/g,
      ];

      let extractedText = "";

      for (const pattern of textPatterns) {
        const fragments: string[] = [];
        let match: RegExpExecArray | null;
        // Every pattern consumes at least one character per match, so this
        // loop always advances.
        while ((match = pattern.exec(text)) !== null) {
          fragments.push(match[1] ?? match[0]);
        }

        if (fragments.length > 0) {
          extractedText = fragments.join(" ").substring(0, 10000);
          break; // Use first successful pattern
        }
      }

      // Collapse whitespace and blank out anything outside printable ASCII
      // and the Hangul ranges.
      extractedText = extractedText
        .replace(/\s+/g, " ")
        .replace(/[^\x20-\x7E\uAC00-\uD7AF\u3131-\u318E\u1100-\u11FF]/g, " ")
        .trim();

      if (extractedText.length < 10) {
        return "PDF 문서를 처리했지만 텍스트를 추출할 수 없습니다. 이미지 기반 PDF이거나 보호된 문서일 수 있습니다.";
      }

      return extractedText;
    } catch (error) {
      console.warn("[PDF Loader] Basic text extraction failed:", error);
      return "PDF 문서 처리 중 오류가 발생했습니다.";
    }
  }

  /**
   * Derives a title from a file path: the last path segment without its
   * extension. Both `/` and `\` are treated as separators so Windows paths
   * do not leak their directories into the title.
   */
  private extractTitleFromFilename(filePath: string): string {
    const filename = filePath.split(/[\\/]/).pop() || filePath;
    return filename.replace(/\.[^/.]+$/, ""); // Remove extension
  }

  /**
   * Uses the first paragraph (blank-line separated) longer than 20 trimmed
   * characters as the description, provided it is also longer than 50
   * characters. The result is cut to 200 characters with a trailing "..."
   * when truncated. Falls back to a fixed label otherwise.
   */
  private extractDescription(text: string): string {
    const paragraphs = text.split("\n\n").filter((p) => p.trim().length > 20);
    const firstParagraph = paragraphs[0]?.trim();

    if (firstParagraph && firstParagraph.length > 50) {
      return (
        firstParagraph.substring(0, 200) +
        (firstParagraph.length > 200 ? "..." : "")
      );
    }

    return "PDF 문서";
  }

  /**
   * Converts a date value from PDF metadata into a `Date`.
   *
   * Understands the PDF date format `D:YYYYMMDDHHmmSSOHH'mm'`, where every
   * field after the year is optional and `O` is `+`, `-` or `Z`. A date
   * without an offset is interpreted as UTC. Strings in any other format are
   * handed to the `Date` constructor.
   *
   * @returns A valid `Date`, or `undefined` when the value is missing or
   *   cannot be parsed.
   */
  private parsePdfDate(raw: unknown): Date | undefined {
    const validOrUndefined = (date: Date): Date | undefined =>
      Number.isNaN(date.getTime()) ? undefined : date;

    if (raw instanceof Date) return validOrUndefined(raw);
    if (typeof raw === "number") return validOrUndefined(new Date(raw));
    if (typeof raw !== "string") return undefined;

    const value = raw.trim();
    if (value === "") return undefined;

    const match =
      /^(?:D:)?(\d{4})(\d{2})?(\d{2})?(\d{2})?(\d{2})?(\d{2})?(?:Z(?:\d{2}'?\d{2}'?)?|([+-])(\d{2})'?(\d{2})?'?)?$/.exec(
        value
      );

    if (match) {
      const [
        ,
        year,
        month = "01",
        day = "01",
        hour = "00",
        minute = "00",
        second = "00",
        sign,
        offsetHours = "00",
        offsetMinutes = "00",
      ] = match;

      let timestamp = Date.UTC(
        Number(year),
        Number(month) - 1,
        Number(day),
        Number(hour),
        Number(minute),
        Number(second)
      );

      if (sign) {
        // The fields are local time at the given offset; shift back to UTC.
        const offsetMs =
          (Number(offsetHours) * 60 + Number(offsetMinutes)) * 60000;
        timestamp += sign === "+" ? -offsetMs : offsetMs;
      }

      const parsed = validOrUndefined(new Date(timestamp));
      if (parsed) return parsed;
    }

    return validOrUndefined(new Date(value));
  }

  /** Returns a readable message for any thrown value, Error or not. */
  private describeError(error: unknown): string {
    return error instanceof Error ? error.message : String(error);
  }
}
