/**
 * BM25 (Best Matching 25) implementation for keyword-based search.
 *
 * BM25 is a probabilistic ranking function used for information retrieval.
 * It ranks documents based on the query terms appearing in each document,
 * taking into account term frequency and document length normalization.
 */
import type { LineRange } from '../line-utils.js';
/**
 * Tuning parameters for the BM25 ranking function.
 *
 * Both fields are optional; the `@default` tag on each field documents the
 * intended fallback value. Accepted by the `BM25Index` constructor.
 */
export interface BM25Config {
    /**
     * Controls term frequency saturation.
     * Higher values give more weight to term frequency.
     * Typical range: 1.2 - 2.0
     * @default 1.5
     */
    k1?: number;
    /**
     * Controls document length normalization.
     * 0 = no length normalization, 1 = full normalization
     * @default 0.75
     */
    b?: number;
}
/**
 * Represents a document in the BM25 index.
 *
 * This is the shape returned by `BM25Index.get`. Besides the raw content it
 * carries pre-computed token data for the document.
 */
export interface BM25Document {
    /** Document identifier */
    id: string;
    /** Document content */
    content: string;
    /** Pre-computed tokens for the document */
    tokens: string[];
    /**
     * Token frequency map (term -> count).
     * NOTE(review): presumably counts occurrences within `tokens` - confirm
     * against the implementation.
     */
    termFrequencies: Map<string, number>;
    /**
     * Total number of tokens.
     * NOTE(review): presumably equal to `tokens.length` - confirm.
     */
    length: number;
    /** Optional metadata; values are `unknown` and must be narrowed before use */
    metadata?: Record<string, unknown>;
}
/**
 * Result from a BM25 search.
 *
 * `BM25Index.search` returns an array of these.
 */
export interface BM25SearchResult {
    /** Document identifier */
    id: string;
    /** Document content */
    content: string;
    /** BM25 score (higher is more relevant) */
    score: number;
    /** Optional metadata; values are `unknown` and must be narrowed before use */
    metadata?: Record<string, unknown>;
    /**
     * Line range where query terms were found (if computed).
     * The `LineRange` type is imported from '../line-utils.js'.
     */
    lineRange?: LineRange;
}
/**
 * Tokenization options.
 *
 * Accepted by `tokenize`, `findLineRange`, the `BM25Index` constructor and
 * `BM25Index.deserialize`. Every field is optional.
 *
 * NOTE(review): only `splitPattern` has its default documented here; the
 * defaults of the other fields are not visible in this declaration file.
 */
export interface TokenizeOptions {
    /** Convert to lowercase */
    lowercase?: boolean;
    /** Remove punctuation */
    removePunctuation?: boolean;
    /** Minimum token length */
    minLength?: number;
    /** Custom stopwords to remove */
    stopwords?: Set<string>;
    /** Custom split pattern (default: /\s+/) */
    splitPattern?: RegExp;
}
/**
 * Default English stopwords.
 *
 * Has the same type as `TokenizeOptions.stopwords`, so it can be passed there
 * directly.
 *
 * NOTE(review): declared as a mutable `Set`; whether the tokenizer falls back
 * to this set when no `stopwords` option is supplied is not visible here -
 * confirm against the implementation.
 */
export declare const DEFAULT_STOPWORDS: Set<string>;
/**
 * Tokenize text into an array of terms.
 *
 * @param text - The text to tokenize
 * @param options - Optional tokenization options (see `TokenizeOptions`)
 * @returns The terms extracted from `text`
 */
export declare function tokenize(text: string, options?: TokenizeOptions): string[];
/**
 * Re-exports of line utilities from '../line-utils.js', so consumers can
 * import them from this module as well.
 */
export { extractLines, extractLinesWithLimit, formatWithLineNumbers, replaceString, StringNotFoundError, StringNotUniqueError, } from '../line-utils.js';
/**
 * Find the line range where query terms appear in content.
 * Returns the range spanning from the first to the last line containing any query term.
 *
 * NOTE(review): the shape of `LineRange` (e.g. whether line numbers are
 * 0- or 1-based) is defined in '../line-utils.js' and is not visible here.
 *
 * @param content - The document content
 * @param queryTerms - Tokenized query terms to find
 * @param options - Tokenization options (should match indexing options)
 * @returns LineRange if terms found, undefined otherwise
 */
export declare function findLineRange(content: string, queryTerms: string[], options?: TokenizeOptions): LineRange | undefined;
/**
 * BM25 Index for keyword-based document retrieval.
 *
 * Documents are added with `add`, queried with `search`, and the index can be
 * converted to and from a JSON-compatible form with `serialize` and
 * `BM25Index.deserialize`.
 */
export declare class BM25Index {
    #private;
    /** BM25 k1 parameter */
    readonly k1: number;
    /** BM25 b parameter */
    readonly b: number;
    /**
     * Create an index.
     *
     * @param config - Optional BM25 parameters (see `BM25Config`)
     * @param tokenizeOptions - Optional tokenization options (see `TokenizeOptions`)
     */
    constructor(config?: BM25Config, tokenizeOptions?: TokenizeOptions);
    /**
     * Add a document to the index.
     *
     * NOTE(review): the behavior when `id` is already present is not visible
     * in this declaration file - confirm against the implementation.
     *
     * @param id - Document identifier
     * @param content - Document content
     * @param metadata - Optional metadata to associate with the document
     */
    add(id: string, content: string, metadata?: Record<string, unknown>): void;
    /**
     * Remove a document from the index.
     *
     * @param id - Identifier of the document to remove
     * @returns A boolean; NOTE(review): presumably `true` when a document was
     * removed and `false` when `id` was not found - confirm
     */
    remove(id: string): boolean;
    /**
     * Clear all documents from the index
     */
    clear(): void;
    /**
     * Search for documents matching the query.
     *
     * NOTE(review): the meaning and defaults of `topK` and `minScore` are
     * inferred from their names only (presumably a result-count cap and a
     * score threshold) - confirm against the implementation.
     *
     * @param query - The query string
     * @param topK - Optional; presumably the maximum number of results
     * @param minScore - Optional; presumably the minimum score a result must have
     * @returns The matching documents as `BM25SearchResult` entries
     */
    search(query: string, topK?: number, minScore?: number): BM25SearchResult[];
    /**
     * Get a document by ID.
     *
     * @param id - Document identifier
     * @returns The document, or `undefined`
     */
    get(id: string): BM25Document | undefined;
    /**
     * Check if a document exists in the index.
     *
     * @param id - Document identifier
     */
    has(id: string): boolean;
    /**
     * Get the number of documents in the index
     */
    get size(): number;
    /**
     * Get all document IDs
     */
    get documentIds(): string[];
    /**
     * Serialize the index to a JSON-compatible object.
     *
     * @returns Data in the `BM25IndexData` format
     */
    serialize(): BM25IndexData;
    /**
     * Deserialize an index from a JSON object.
     *
     * @param data - Data in the `BM25IndexData` format
     * @param tokenizeOptions - Optional tokenization options for the new index
     * @returns A `BM25Index` instance
     */
    static deserialize(data: BM25IndexData, tokenizeOptions?: TokenizeOptions): BM25Index;
}
/**
 * Serialized document format for persistence.
 *
 * Has the same fields as `BM25Document`, except that `termFrequencies` is a
 * plain `Record` rather than a `Map`.
 *
 * This interface is not exported, although the exported `BM25IndexData`
 * references it through its `documents` field.
 */
interface SerializedBM25Document {
    /** Document identifier */
    id: string;
    /** Document content */
    content: string;
    /** Tokens for the document */
    tokens: string[];
    /** Token frequencies as a plain object (term -> count) */
    termFrequencies: Record<string, number>;
    /** Total number of tokens */
    length: number;
    /** Optional metadata */
    metadata?: Record<string, unknown>;
}
/**
 * Serialized index data for persistence.
 *
 * Returned by `BM25Index.serialize` and accepted by `BM25Index.deserialize`.
 */
export interface BM25IndexData {
    /** BM25 k1 parameter */
    k1: number;
    /** BM25 b parameter */
    b: number;
    /** The serialized documents */
    documents: SerializedBM25Document[];
    /**
     * NOTE(review): presumably the average document length in tokens across
     * `documents` - confirm against the implementation.
     */
    avgDocLength: number;
}
//# sourceMappingURL=bm25.d.ts.map