import "./mupdf-workaround.js";
import { Canvas } from "@napi-rs/canvas";
import { Document, type PDFPage } from "mupdf";
import { PdfReaderCommon } from "./pdf-reader-common.js";
import { type CanvasMap, type CompactPageLines, type PageLines, type PageTexts, type PageToonLines, type PdfCompactLineAlgorithm, type PdfReaderOptions, type PdfScannedThreshold } from "./pdf.interface.js";
import { type PaddleOcrService } from "ppu-paddle-ocr";
/**
 * PdfReader class based on mupdfjs for reading and processing PDF documents.
 */
export declare class PdfReader extends PdfReaderCommon {
    /**
     * Reader configuration held by the instance.
     * NOTE(review): presumably derived from the constructor's `options`
     * argument; the implementation is not visible in this declaration — confirm.
     */
    private options;
    /**
     * Declared as the literal `0`.
     * NOTE(review): presumably the index of the first page used as a key in the
     * page-keyed maps returned by this class — confirm against the implementation.
     */
    readonly startIndex = 0;
    /**
     * Creates a new reader.
     * @param options - Optional partial set of reader options; any subset of
     *                  `PdfReaderOptions` may be supplied.
     */
    constructor(options?: Partial<PdfReaderOptions>);
    /**
     * Opens a PDF document from a file path or an ArrayBuffer.
     * @param filename - The file path or ArrayBuffer of the PDF document.
     * @returns The opened mupdf `Document` instance.
     */
    open(filename: string | ArrayBuffer): Document;
    /**
     * Renders all pages of a PDF document into canvases.
     * @param doc - The `Document` to render.
     * @param dpi - The resolution (dots per inch) to render the PDF pages.
     *              Higher values improve OCR accuracy but increase memory usage.
     *              Optional; the default value is not visible in this declaration.
     * @returns A promise resolving to a map of page numbers to Canvas instances,
     *          where each page number corresponds to its rendered canvas
     *          representation.
     */
    renderAll(doc: Document, dpi?: number): Promise<CanvasMap>;
    /**
     * Extracts text from scanned PDF pages using the ppu-paddle-ocr package.
     * @param paddleOcrService - The OCR service instance, specifically from ppu-paddle-ocr, to use for text recognition.
     * @param canvasMap - A map of page numbers to Canvas instances representing rendered PDF pages.
     * @returns A promise resolving to a map of page numbers to extracted text data with OCR results.
     */
    getTextsScanned(paddleOcrService: PaddleOcrService, canvasMap: CanvasMap): Promise<PageTexts>;
    // Private helper; its signature is not exposed in this declaration.
    private getCanvas;
    /**
     * Extracts text from all pages of a PDF document.
     * @param doc - The `Document` to extract text from.
     * @returns A promise resolving to a map of page numbers to extracted text data.
     */
    getTexts(doc: Document): Promise<PageTexts>;
    // Private helpers below; their signatures are not exposed in this
    // declaration, so their exact contracts must be read from the implementation.
    private extractTexts;
    private extractOcrTexts;
    private convertOcrToPdfWords;
    private mapStructureToPdfWord;
    private mergeTextContent;
    private filterTextContent;
    /**
     * Converts extracted text into structured lines.
     * @param pageTexts - The extracted text data from a PDF.
     * @returns A map of page numbers to structured lines.
     */
    getLinesFromTexts(pageTexts: PageTexts): PageLines;
    /**
     * Converts extracted text into TOON format for LLM-friendly input.
     * @param pageTexts - The extracted text data from a PDF.
     * @returns The TOON-formatted output as a `PageToonLines` value.
     */
    getLinesFromTextsInToon(pageTexts: PageTexts): PageToonLines;
    /**
     * Converts extracted text into compact structured lines using a specified algorithm.
     * @param pageTexts - The extracted text data from a PDF.
     * @param algorithm - The algorithm for compacting lines (default: "middleY").
     * @returns A map of page numbers to compact structured lines.
     */
    getCompactLinesFromTexts(pageTexts: PageTexts, algorithm?: PdfCompactLineAlgorithm): CompactPageLines;
    /**
     * Saves rendered canvases as image files.
     * NOTE(review): `canvasMap` is typed as `Map<number, Canvas>` here while the
     * other methods use the `CanvasMap` alias — confirm the two are equivalent.
     * @param canvasMap - The map of canvases to save.
     * @param filename - The base filename for the output images.
     * @param foldername - The folder to save the images in (default: "out").
     * @returns A promise that resolves with no value.
     */
    dumpCanvasMap(canvasMap: Map<number, Canvas>, filename: string, foldername?: string): Promise<void>;
    /**
     * Determines if the PDF document is scanned based on text thresholds.
     * @param pageTexts - The extracted text data from a PDF.
     * @param options - Optional threshold options for scanned detection.
     * @returns True if the document is likely scanned, false otherwise.
     */
    isScanned(pageTexts: PageTexts, options?: PdfScannedThreshold): boolean;
    /**
     * Determines whether an individual PDF page is scanned (as opposed to
     * digital) based on text thresholds.
     * @param pageText - The extracted page text.
     * @param options - Optional threshold options for scanned detection.
     * @returns True if the page is likely scanned, false otherwise.
     */
    isPageScanned(pageText: string, options?: PdfScannedThreshold): boolean;
    /**
     * Rebuilds a scanned PDF by placing invisible text over the original images,
     * making the PDF searchable without altering its visual appearance.
     * @param doc - The `Document` instance to rebuild.
     * @param pageTexts - The extracted text data to overlay.
     * @param options - Rebuild options (optional, default font is Helvetica).
     * @returns A promise resolving to a Uint8Array containing the rebuilt PDF binary data.
     */
    rebuild(doc: Document, pageTexts: PageTexts, options?: {
        fontName?: string;
    }): Promise<Uint8Array>;
    /**
     * Destroys the PDF document instance to free memory.
     * @param doc - The `Document` instance to destroy.
     */
    destroy(doc: Document): void;
    /**
     * Destroys a PDF page instance to free memory.
     * @param page - The PDFPage instance to destroy.
     */
    destroyPage(page: PDFPage): void;
}
