{"version":3,"file":"BM25.cjs","names":[],"sources":["../../../../src/utils/@furkantoprak/bm25/BM25.ts"],"sourcesContent":["/**\n * Adapted from\n * https://github.com/FurkanToprak/OkapiBM25\n *\n * Inlined due to CJS import issues.\n */\n\n/** Gets word count. */\nexport const getWordCount = (corpus: string) => {\n  return ((corpus || \"\").match(/\\w+/g) || []).length;\n};\n\n/** Number of occurences of a word in a string. */\nexport const getTermFrequency = (term: string, corpus: string) => {\n  // Escape any RegExp metacharacters in the term so constructing a RegExp\n  // from user-provided or model-generated queries does not throw an error\n  const escaped = (term || \"\").replace(/[.*+?^${}()|[\\]\\\\]/g, \"\\\\$&\");\n  return ((corpus || \"\").match(new RegExp(escaped, \"g\")) || []).length;\n};\n\n/** Inverse document frequency. */\nexport const getIDF = <T>(term: string, documents: BMInputDocument<T>[]) => {\n  // Number of relevant documents.\n  const relevantDocuments = documents.filter((document) =>\n    document.text.includes(term)\n  ).length;\n  return Math.log(\n    (documents.length - relevantDocuments + 0.5) / (relevantDocuments + 0.5) + 1\n  );\n};\n\nexport interface BMInputDocument<T> {\n  /** The text from the original document */\n  text: string;\n  /** The original document */\n  document: T;\n}\n\n/** Represents a document; useful when sorting results.\n */\nexport interface BMOutputDocument<T> {\n  /** The original document */\n  document: T;\n  /** The score that the document receives. */\n  score: number;\n}\n\n/** Constants that are free parameters used in BM25, specifically when generating inverse document frequency. */\nexport interface BMConstants {\n  /** Free parameter. Is 0.75 by default.  */\n  b?: number;\n  /** Free parameter. Is 1.2 by default. Generally in range [1.2, 2.0] */\n  k1?: number;\n}\n\n/** If returns positive, the sorting results in secondEl coming before firstEl, else, firstEl comes before secondEL  */\nexport type BMSorter<T> = (\n  firstEl: BMOutputDocument<T>,\n  secondEl: BMOutputDocument<T>\n) => number;\n\n/** Implementation of Okapi BM25 algorithm.\n *  @param documents: Collection of documents with text content and associated data.\n *  @param keywords: query terms.\n *  @param constants: Contains free parameters k1 and b. b=0.75 and k1=1.2 by default.\n *  @param sorter: A function that allows you to sort results by a given rule. If not provided, returns results in the original document order.\n */\nexport function BM25<T>(\n  documents: BMInputDocument<T>[],\n  keywords: string[],\n  constants?: BMConstants,\n  sorter?: BMSorter<T>\n): BMOutputDocument<T>[] {\n  const b = constants && constants.b ? constants.b : 0.75;\n  const k1 = constants && constants.k1 ? constants.k1 : 1.2;\n  const documentLengths = documents.map((document) =>\n    getWordCount(document.text)\n  );\n  const averageDocumentLength =\n    documentLengths.reduce((a, b) => a + b, 0) / documents.length;\n  const idfByKeyword = keywords.reduce((obj, keyword) => {\n    obj.set(keyword, getIDF(keyword, documents));\n    return obj;\n  }, new Map<string, number>());\n\n  const scoredDocs = documents.map(({ text, document }, index) => {\n    const score = keywords\n      .map((keyword: string) => {\n        const inverseDocumentFrequency = idfByKeyword.get(keyword);\n        if (inverseDocumentFrequency === undefined) {\n          throw new Error(\"Missing keyword.\");\n        }\n        const termFrequency = getTermFrequency(keyword, text);\n        const documentLength = documentLengths[index];\n        return (\n          (inverseDocumentFrequency * (termFrequency * (k1 + 1))) /\n          (termFrequency +\n            k1 * (1 - b + (b * documentLength) / averageDocumentLength))\n        );\n      })\n      .reduce((a: number, b: number) => a + b, 0);\n    return { score, document } as BMOutputDocument<T>;\n  });\n  // sort the results\n  if (sorter) {\n    return scoredDocs.sort(sorter);\n  }\n  return scoredDocs;\n}\n"],"mappings":";;;;;;;;AAQA,MAAa,gBAAgB,WAAmB;AAC9C,UAAS,UAAU,IAAI,MAAM,OAAO,IAAI,EAAE,EAAE;;;AAI9C,MAAa,oBAAoB,MAAc,WAAmB;CAGhE,MAAM,WAAW,QAAQ,IAAI,QAAQ,uBAAuB,OAAO;AACnE,UAAS,UAAU,IAAI,MAAM,IAAI,OAAO,SAAS,IAAI,CAAC,IAAI,EAAE,EAAE;;;AAIhE,MAAa,UAAa,MAAc,cAAoC;CAE1E,MAAM,oBAAoB,UAAU,QAAQ,aAC1C,SAAS,KAAK,SAAS,KAAK,CAC7B,CAAC;AACF,QAAO,KAAK,KACT,UAAU,SAAS,oBAAoB,OAAQ,oBAAoB,MAAO,EAC5E;;;;;;;;AAuCH,SAAgB,KACd,WACA,UACA,WACA,QACuB;CACvB,MAAM,IAAI,aAAa,UAAU,IAAI,UAAU,IAAI;CACnD,MAAM,KAAK,aAAa,UAAU,KAAK,UAAU,KAAK;CACtD,MAAM,kBAAkB,UAAU,KAAK,aACrC,aAAa,SAAS,KAAK,CAC5B;CACD,MAAM,wBACJ,gBAAgB,QAAQ,GAAG,MAAM,IAAI,GAAG,EAAE,GAAG,UAAU;CACzD,MAAM,eAAe,SAAS,QAAQ,KAAK,YAAY;AACrD,MAAI,IAAI,SAAS,OAAO,SAAS,UAAU,CAAC;AAC5C,SAAO;oBACN,IAAI,KAAqB,CAAC;CAE7B,MAAM,aAAa,UAAU,KAAK,EAAE,MAAM,YAAY,UAAU;AAgB9D,SAAO;GAAE,OAfK,SACX,KAAK,YAAoB;IACxB,MAAM,2BAA2B,aAAa,IAAI,QAAQ;AAC1D,QAAI,6BAA6B,KAAA,EAC/B,OAAM,IAAI,MAAM,mBAAmB;IAErC,MAAM,gBAAgB,iBAAiB,SAAS,KAAK;IACrD,MAAM,iBAAiB,gBAAgB;AACvC,WACG,4BAA4B,iBAAiB,KAAK,OAClD,gBACC,MAAM,IAAI,IAAK,IAAI,iBAAkB;KAEzC,CACD,QAAQ,GAAW,MAAc,IAAI,GAAG,EAAE;GAC7B;GAAU;GAC1B;AAEF,KAAI,OACF,QAAO,WAAW,KAAK,OAAO;AAEhC,QAAO"}