import fs from 'fs';
import path from 'path';

// Lore chunk represents a searchable section of the document
export interface LoreChunk {
  id: string;
  title: string;
  content: string;
  startLine: number;
  endLine: number;
  section: string;
  subsection?: string;
}

// Token entry maps tokens to chunks with frequency
export interface TokenEntry {
  token: string;
  chunks: Array<{
    chunkId: string;
    frequency: number;
    positions: number[]; // Character positions within chunk
  }>;
  totalFrequency: number;
}

// The main lore index structure
export interface LoreIndex {
  chunks: Record<string, LoreChunk>;
  unigrams: Record<string, TokenEntry>; // Single words
  digraphs: Record<string, TokenEntry>; // Two-word phrases
  metadata: {
    totalChunks: number;
    totalTokens: number;
    totalDigraphs: number;
    lastUpdated: string;
    sourceFile: string;
  };
}

// Search result with relevance scoring
export interface SearchResult {
  chunk: LoreChunk;
  score: number;
  matchedTokens: string[];
  context: string; // Snippet with highlighted matches
}

/**
 * Clean and normalize text for tokenization
 */
function cleanText(text: string): string {
  return text
    // Remove markdown formatting
    .replace(/\*\*(.*?)\*\*/g, '$1') // Bold
    .replace(/\*(.*?)\*/g, '$1')     // Italic
    .replace(/`(.*?)`/g, '$1')       // Code
    .replace(/\[(.*?)\]\(.*?\)/g, '$1') // Links
    // Normalize whitespace
    .replace(/\s+/g, ' ')
    .trim();
}

/**
 * Tokenize text into clean words
 */
function tokenize(text: string): string[] {
  return cleanText(text)
    .toLowerCase()
    // Split on word boundaries, keeping apostrophes in contractions
    .split(/[^\w']+/)
    .filter(token =>
      token.length > 1 && // Skip single characters
      !token.match(/^\d+$/) && // Skip pure numbers
      !['the', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by'].includes(token) // Skip common stop words
    );
}

/**
 * Generate digraphs (two-word phrases) from tokens
 */
function generateDigraphs(tokens: string[]): string[] {
  const digraphs: string[] = [];
  for (let i = 0; i < tokens.length - 1; i++) {
    digraphs.push(`${tokens[i]} ${tokens[i + 1]}`);
  }
  return digraphs;
}

/**
 * Parse markdown into structured chunks
 */
function parseMarkdownIntoChunks(content: string): LoreChunk[] {
  const lines = content.split('\n');
  const chunks: LoreChunk[] = [];

  let currentSection = '';
  let currentSubsection = '';
  let chunkStart = 0;
  let chunkContent: string[] = [];
  let chunkTitle = 'Document Start';

  for (let i = 0; i < lines.length; i++) {
    const line = lines[i];

    // Handle headers
    if (line.startsWith('##')) {
      // Finish previous chunk if it has content
      if (chunkContent.length > 0) {
        chunks.push({
          id: `chunk_${chunks.length}`,
          title: chunkTitle,
          content: chunkContent.join('\n').trim(),
          startLine: chunkStart + 1,
          endLine: i,
          section: currentSection,
          subsection: currentSubsection
        });
      }

      // Start new chunk
      if (line.startsWith('### ')) {
        currentSubsection = line.replace('### ', '').trim();
        chunkTitle = currentSubsection;
      } else if (line.startsWith('## ')) {
        currentSection = line.replace('## ', '').trim();
        currentSubsection = '';
        chunkTitle = currentSection;
      }

      chunkStart = i;
      chunkContent = [line];
    } else {
      chunkContent.push(line);
    }
  }

  // Don't forget the last chunk
  if (chunkContent.length > 0) {
    chunks.push({
      id: `chunk_${chunks.length}`,
      title: chunkTitle,
      content: chunkContent.join('\n').trim(),
      startLine: chunkStart + 1,
      endLine: lines.length,
      section: currentSection,
      subsection: currentSubsection
    });
  }

  return chunks;
}

/**
 * Find all positions of a token within text
 */
function findTokenPositions(text: string, token: string): number[] {
  const positions: number[] = [];
  const cleanedText = cleanText(text).toLowerCase();
  const searchToken = token.toLowerCase();

  let startIndex = 0;
  while (true) {
    const index = cleanedText.indexOf(searchToken, startIndex);
    if (index === -1) break;
    positions.push(index);
    startIndex = index + 1;
  }

  return positions;
}

/**
 * Build inverted index for tokens
 */
function buildTokenIndex(chunks: LoreChunk[], isDigraph: boolean = false): Record<string, TokenEntry> {
  const index: Record<string, TokenEntry> = {};

  for (const chunk of chunks) {
    const tokens = tokenize(chunk.content);
    const tokensToProcess = isDigraph ? generateDigraphs(tokens) : tokens;

    // Count frequency of each token in this chunk
    const tokenFrequency: Record<string, number> = {};
    for (const token of tokensToProcess) {
      tokenFrequency[token] = (tokenFrequency[token] || 0) + 1;
    }

    // Add to inverted index
    for (const [token, frequency] of Object.entries(tokenFrequency)) {
      if (!index[token]) {
        index[token] = {
          token,
          chunks: [],
          totalFrequency: 0
        };
      }

      const positions = findTokenPositions(chunk.content, token);

      index[token].chunks.push({
        chunkId: chunk.id,
        frequency,
        positions
      });

      index[token].totalFrequency += frequency;
    }
  }

  return index;
}

/**
 * Build complete lore index from markdown file
 */
export function buildLoreIndex(filePath: string): LoreIndex {
  const content = fs.readFileSync(filePath, 'utf-8');
  const chunks = parseMarkdownIntoChunks(content);

  // Build chunk lookup
  const chunkLookup: Record<string, LoreChunk> = {};
  for (const chunk of chunks) {
    chunkLookup[chunk.id] = chunk;
  }

  // Build token indexes
  const unigrams = buildTokenIndex(chunks, false);
  const digraphs = buildTokenIndex(chunks, true);

  return {
    chunks: chunkLookup,
    unigrams,
    digraphs,
    metadata: {
      totalChunks: chunks.length,
      totalTokens: Object.keys(unigrams).length,
      totalDigraphs: Object.keys(digraphs).length,
      lastUpdated: new Date().toISOString(),
      sourceFile: path.basename(filePath)
    }
  };
}

/**
 * Calculate TF-IDF score for search relevance
 */
function calculateTfIdfScore(
  tokenEntry: TokenEntry,
  chunkId: string,
  totalChunks: number
): number {
  const chunkEntry = tokenEntry.chunks.find(c => c.chunkId === chunkId);
  if (!chunkEntry) return 0;

  // Term Frequency: frequency of term in document
  const tf = chunkEntry.frequency;

  // Inverse Document Frequency: log(total docs / docs containing term)
  const idf = Math.log(totalChunks / tokenEntry.chunks.length);

  return tf * idf;
}

/**
 * Search the lore index for relevant chunks
 */
export function searchLore(
  index: LoreIndex,
  query: string,
  limit: number = 10
): SearchResult[] {
  const queryTokens = tokenize(query);
  const queryDigraphs = generateDigraphs(queryTokens);

  // Collect all relevant chunks with scores
  const chunkScores: Record<string, {
    score: number;
    matchedTokens: Set<string>;
  }> = {};

  // Search unigrams
  for (const token of queryTokens) {
    const tokenEntry = index.unigrams[token];
    if (tokenEntry) {
      for (const chunkEntry of tokenEntry.chunks) {
        if (!chunkScores[chunkEntry.chunkId]) {
          chunkScores[chunkEntry.chunkId] = {
            score: 0,
            matchedTokens: new Set()
          };
        }

        const score = calculateTfIdfScore(tokenEntry, chunkEntry.chunkId, index.metadata.totalChunks);
        chunkScores[chunkEntry.chunkId].score += score;
        chunkScores[chunkEntry.chunkId].matchedTokens.add(token);
      }
    }
  }

  // Search digraphs (give them higher weight)
  for (const digraph of queryDigraphs) {
    const tokenEntry = index.digraphs[digraph];
    if (tokenEntry) {
      for (const chunkEntry of tokenEntry.chunks) {
        if (!chunkScores[chunkEntry.chunkId]) {
          chunkScores[chunkEntry.chunkId] = {
            score: 0,
            matchedTokens: new Set()
          };
        }

        const score = calculateTfIdfScore(tokenEntry, chunkEntry.chunkId, index.metadata.totalChunks) * 2; // Digraph bonus
        chunkScores[chunkEntry.chunkId].score += score;
        chunkScores[chunkEntry.chunkId].matchedTokens.add(digraph);
      }
    }
  }

  // Convert to search results and sort by score
  const results: SearchResult[] = Object.entries(chunkScores)
    .map(([chunkId, { score, matchedTokens }]) => {
      const chunk = index.chunks[chunkId];
      const matchedArray = Array.from(matchedTokens);

      // Generate context snippet (first 200 chars with matches highlighted)
      let context = chunk.content.substring(0, 200);
      for (const token of matchedArray) {
        // Simple highlighting (could be improved)
        const regex = new RegExp(`\\b${token}\\b`, 'gi');
        context = context.replace(regex, `**${token}**`);
      }
      if (chunk.content.length > 200) {
        context += '...';
      }

      return {
        chunk,
        score,
        matchedTokens: matchedArray,
        context
      };
    })
    .sort((a, b) => b.score - a.score)
    .slice(0, limit);

  return results;
}

/**
 * Get lore index statistics for debugging
 */
export function getLoreStats(index: LoreIndex): any {
  const topUnigrams = Object.entries(index.unigrams)
    .sort((a, b) => b[1].totalFrequency - a[1].totalFrequency)
    .slice(0, 20)
    .map(([token, entry]) => ({ token, frequency: entry.totalFrequency }));

  const topDigraphs = Object.entries(index.digraphs)
    .sort((a, b) => b[1].totalFrequency - a[1].totalFrequency)
    .slice(0, 20)
    .map(([token, entry]) => ({ token, frequency: entry.totalFrequency }));

  return {
    metadata: index.metadata,
    topUnigrams,
    topDigraphs,
    sampleChunks: Object.values(index.chunks).slice(0, 3).map(chunk => ({
      id: chunk.id,
      title: chunk.title,
      section: chunk.section,
      lineRange: `${chunk.startLine}-${chunk.endLine}`,
      contentPreview: chunk.content.substring(0, 100) + '...'
    }))
  };
}
