/**
 * NLP Scoring Manager - Jaccard similarity and Shannon entropy for semantic analysis
 *
 * Implements intelligent document similarity scoring using:
 * - Jaccard similarity for vocabulary overlap
 * - Shannon entropy for information density
 * - Combined scoring for meaningful semantic relationships
 *
 * Key insights from analysis:
 * - High Jaccard (>60%) + Moderate entropy (4.5-6.0) = Same technical domain
 * - High Jaccard + Low entropy (<3.0) = Stop word pollution, superficial
 * - Low Jaccard + Similar entropy = Different domains, equally complex
 *
 * Part of Enhanced Capability Index (#1085)
 */
import { IndexConfigManager } from './config/IndexConfig.js';

/**
 * Scoring result with detailed metrics
 */
export interface ScoringResult {
  /** Jaccard similarity of the two compared texts (0 = no overlap, 1 = identical). */
  jaccard: number;
  /** Shannon entropy value reported for the comparison. */
  entropy: number;
  /** Single score combining the Jaccard and entropy signals. */
  combinedScore: number;
  /** Human-readable explanation of how the metrics were interpreted. */
  interpretation: string;
  /** Number of tokens considered (presumably after cleaning/tokenizing; confirm against the implementation). */
  tokenCount: number;
  /** Number of tokens shared by both texts (presumably the intersection size; confirm against the implementation). */
  overlapCount: number;
}

/**
 * Pairwise similarity between two elements
 */
export interface PairwiseSimilarity {
  /** Identifier of the first compared element. */
  element1: string;
  /** Identifier of the second compared element. */
  element2: string;
  /** Scoring metrics for the pair. */
  similarity: ScoringResult;
  /** When the comparison was made (string format is not visible from this declaration). */
  timestamp: string;
}

/**
 * Configuration for scoring algorithm
 */
export interface ScoringConfig {
  /** Minimum token length used by tokenization (exact comparison semantics: see implementation). */
  minTokenLength: number;
  /** Cache entry lifetime (units are not visible from this declaration; presumably milliseconds, TODO confirm). */
  cacheExpiry: number;
  /** Upper bound on the number of cached results. */
  maxCacheSize: number;
  /** Entropy boundaries used when interpreting a score. */
  entropyBands: {
    low: number;
    moderate: number;
    high: number;
  };
  /** Jaccard boundaries used when interpreting a score. */
  jaccardThresholds: {
    low: number;
    moderate: number;
    high: number;
  };
}

/**
 * Scores textual similarity using Jaccard overlap and Shannon entropy,
 * with an internal result cache.
 *
 * This is an ambient declaration: only the public shape is described here;
 * behaviour lives in the corresponding implementation file.
 */
export declare class NLPScoringManager {
  private cache;
  private cacheAccessOrder;
  private config;
  private unicodeValidator;
  private cleanupInterval?;

  /**
   * @param config - Optional overrides for the scoring configuration; any
   *   subset of {@link ScoringConfig} may be supplied.
   * @param indexConfigManager - Optional index configuration manager.
   */
  constructor(config?: Partial<ScoringConfig>, indexConfigManager?: IndexConfigManager);

  /**
   * Clean and tokenize text for analysis
   * Works with any language - no hardcoded stop words
   */
  private cleanAndTokenize;

  /**
   * Calculate Jaccard similarity between two text strings
   *
   * Jaccard = |A ∩ B| / |A ∪ B|
   *
   * Returns value between 0 (no overlap) and 1 (identical)
   *
   * @param text1 - First text to compare.
   * @param text2 - Second text to compare.
   * @returns Similarity in the range 0 to 1.
   */
  calculateJaccard(text1: string, text2: string): number;

  /**
   * Calculate Shannon entropy for text
   *
   * H(X) = -Σ p(x) * log2(p(x))
   *
   * Measures information density/vocabulary richness
   * Higher entropy = more diverse vocabulary
   *
   * @param text - Text to measure.
   * @returns Entropy value for the text.
   */
  calculateEntropy(text: string): number;

  /**
   * Calculate combined relevance score using Jaccard and entropy
   *
   * Interprets the relationship between similarity and complexity
   *
   * @param text1 - First text to compare.
   * @param text2 - Second text to compare.
   * @returns Detailed metrics plus an interpretation for the pair.
   */
  scoreRelevance(text1: string, text2: string): ScoringResult;

  /**
   * Build a pairwise similarity matrix for multiple texts
   *
   * Useful for clustering and relationship discovery
   *
   * @param elements - Texts to compare, keyed by element name.
   * @returns Nested map from element name to (other element name to score).
   *   NOTE(review): the generic arguments were missing from this declaration
   *   and have been restored from the surrounding signatures; verify the
   *   inner value type against the implementation.
   */
  buildSimilarityMatrix(elements: Map<string, string>): Map<string, Map<string, ScoringResult>>;

  /**
   * Find most similar elements to a given text
   *
   * @param targetText - Text to compare every candidate against.
   * @param candidates - Candidate texts, keyed by element name.
   * @param topK - Optional limit on the number of results returned.
   * @returns Candidate names paired with their scores.
   */
  findSimilar(
    targetText: string,
    candidates: Map<string, string>,
    topK?: number,
  ): Array<{
    name: string;
    score: ScoringResult;
  }>;

  /**
   * Extract key terms from text based on entropy contribution
   *
   * Terms that contribute most to entropy are likely important
   *
   * @param text - Text to extract terms from.
   * @param topK - Optional limit on the number of terms returned.
   * @returns The selected terms.
   */
  extractKeyTerms(text: string, topK?: number): string[];

  /**
   * Add result to cache with LRU eviction
   */
  private addToCache;

  /**
   * Update access order for LRU tracking
   */
  private updateAccessOrder;

  /**
   * Clean expired cache entries
   */
  private cleanExpiredCache;

  /**
   * Clear the cache
   */
  clearCache(): void;

  /**
   * Get cache statistics
   *
   * @returns Current entry count and the oldest entry value
   *   (`null` presumably when the cache is empty; confirm against the implementation).
   */
  getCacheStats(): {
    size: number;
    oldestEntry: number | null;
  };

  /**
   * Release resources held by this manager.
   * NOTE(review): presumably stops the periodic cleanup timer; confirm against the implementation.
   */
  dispose(): void;
}
//# sourceMappingURL=NLPScoringManager.d.ts.map