{"version":3,"file":"base.cjs","names":["sha256","Document","UUIDV5_NAMESPACE"],"sources":["../../src/indexing/base.ts"],"sourcesContent":["import { v5 as uuidv5 } from \"uuid\";\nimport { VectorStore } from \"../vectorstores.js\";\nimport { RecordManagerInterface, UUIDV5_NAMESPACE } from \"./record_manager.js\";\nimport { sha256, type HashKeyEncoder } from \"../utils/hash.js\";\nimport { DocumentInterface, Document } from \"../documents/document.js\";\nimport { BaseDocumentLoader } from \"../document_loaders/base.js\";\n\ntype Metadata = Record<string, unknown>;\n\ntype IndexingResult = {\n  numAdded: number;\n  numDeleted: number;\n  numUpdated: number;\n  numSkipped: number;\n};\n\ntype StringOrDocFunc = string | ((doc: DocumentInterface) => string);\n\nexport interface HashedDocumentInterface extends DocumentInterface {\n  uid: string;\n  hash_?: string;\n  contentHash?: string;\n  metadataHash?: string;\n  pageContent: string;\n  metadata: Metadata;\n  calculateHashes(): void;\n  toDocument(): DocumentInterface;\n}\n\ninterface HashedDocumentArgs {\n  pageContent: string;\n  metadata: Metadata;\n  uid: string;\n}\n\n/**\n * HashedDocument is a Document with hashes calculated.\n * Hashes are calculated based on page content and metadata.\n * It is used for indexing.\n */\nexport class _HashedDocument implements HashedDocumentInterface {\n  uid: string;\n\n  hash_?: string;\n\n  contentHash?: string;\n\n  metadataHash?: string;\n\n  pageContent: string;\n\n  metadata: Metadata;\n\n  private keyEncoder: HashKeyEncoder = sha256;\n\n  constructor(fields: HashedDocumentArgs) {\n    this.uid = fields.uid;\n    this.pageContent = fields.pageContent;\n    this.metadata = fields.metadata;\n  }\n\n  makeDefaultKeyEncoder(keyEncoderFn: HashKeyEncoder): void {\n    this.keyEncoder = keyEncoderFn;\n  }\n\n  calculateHashes(): void {\n    const forbiddenKeys = [\"hash_\", \"content_hash\", \"metadata_hash\"];\n\n    for (const key of forbiddenKeys) {\n      if (key in this.metadata) {\n        throw new Error(\n          `Metadata cannot contain key ${key} as it is reserved for internal use. Restricted keys: [${forbiddenKeys.join(\n            \", \"\n          )}]`\n        );\n      }\n    }\n\n    const contentHash = this._hashStringToUUID(this.pageContent);\n\n    try {\n      const metadataHash = this._hashNestedDictToUUID(this.metadata);\n      this.contentHash = contentHash;\n      this.metadataHash = metadataHash;\n    } catch (e) {\n      throw new Error(\n        `Failed to hash metadata: ${e}. Please use a dict that can be serialized using json.`\n      );\n    }\n\n    this.hash_ = this._hashStringToUUID(this.contentHash + this.metadataHash);\n\n    if (!this.uid) {\n      this.uid = this.hash_;\n    }\n  }\n\n  toDocument(): DocumentInterface {\n    return new Document({\n      pageContent: this.pageContent,\n      metadata: this.metadata,\n    });\n  }\n\n  static fromDocument(\n    document: DocumentInterface,\n    uid?: string\n  ): _HashedDocument {\n    const doc = new this({\n      pageContent: document.pageContent,\n      metadata: document.metadata,\n      uid: uid || (document as DocumentInterface & { uid: string }).uid,\n    });\n    doc.calculateHashes();\n    return doc;\n  }\n\n  private _hashStringToUUID(inputString: string): string {\n    const hash_value = this.keyEncoder(inputString);\n    return uuidv5(hash_value, UUIDV5_NAMESPACE);\n  }\n\n  private _hashNestedDictToUUID(data: Record<string, unknown>): string {\n    const serialized_data = JSON.stringify(data, Object.keys(data).sort());\n    const hash_value = this.keyEncoder(serialized_data);\n    return uuidv5(hash_value, UUIDV5_NAMESPACE);\n  }\n}\n\nexport type CleanupMode = \"full\" | \"incremental\";\n\nexport type IndexOptions = {\n  /**\n   * The number of documents to index in one batch.\n   */\n  batchSize?: number;\n  /**\n   * The cleanup mode to use. Can be \"full\", \"incremental\" or undefined.\n   * - **Incremental**: Cleans up all documents that haven't been updated AND\n   *   that are associated with source ids that were seen\n   *   during indexing.\n   *   Clean up is done continuously during indexing helping\n   *   to minimize the probability of users seeing duplicated\n   *   content.\n   * - **Full**: Delete all documents that haven to been returned by the loader.\n   *   Clean up runs after all documents have been indexed.\n   *   This means that users may see duplicated content during indexing.\n   * - **undefined**: Do not delete any documents.\n   */\n  cleanup?: CleanupMode;\n  /**\n   * Optional key that helps identify the original source of the document.\n   * Must either be a string representing the key of the source in the metadata\n   * or a function that takes a document and returns a string representing the source.\n   * **Required when cleanup is incremental**.\n   */\n  sourceIdKey?: StringOrDocFunc;\n  /**\n   * Batch size to use when cleaning up documents.\n   */\n  cleanupBatchSize?: number;\n  /**\n   * Force update documents even if they are present in the\n   * record manager. Useful if you are re-indexing with updated embeddings.\n   */\n  forceUpdate?: boolean;\n};\n\nexport function _batch<T>(size: number, iterable: T[]): T[][] {\n  const batches: T[][] = [];\n  let currentBatch: T[] = [];\n\n  iterable.forEach((item) => {\n    currentBatch.push(item);\n\n    if (currentBatch.length >= size) {\n      batches.push(currentBatch);\n      currentBatch = [];\n    }\n  });\n\n  if (currentBatch.length > 0) {\n    batches.push(currentBatch);\n  }\n\n  return batches;\n}\n\nexport function _deduplicateInOrder(\n  hashedDocuments: HashedDocumentInterface[]\n): HashedDocumentInterface[] {\n  const seen = new Set<string>();\n  const deduplicated: HashedDocumentInterface[] = [];\n\n  for (const hashedDoc of hashedDocuments) {\n    if (!hashedDoc.hash_) {\n      throw new Error(\"Hashed document does not have a hash\");\n    }\n\n    if (!seen.has(hashedDoc.hash_)) {\n      seen.add(hashedDoc.hash_);\n      deduplicated.push(hashedDoc);\n    }\n  }\n  return deduplicated;\n}\n\nexport function _getSourceIdAssigner(\n  sourceIdKey: StringOrDocFunc | null\n): (doc: DocumentInterface) => string | null {\n  if (sourceIdKey === null) {\n    return (_doc: DocumentInterface) => null;\n  } else if (typeof sourceIdKey === \"string\") {\n    return (doc: DocumentInterface) => doc.metadata[sourceIdKey];\n  } else if (typeof sourceIdKey === \"function\") {\n    return sourceIdKey;\n  } else {\n    throw new Error(\n      `sourceIdKey should be null, a string or a function, got ${typeof sourceIdKey}`\n    );\n  }\n}\n\n// oxlint-disable-next-line @typescript-eslint/no-explicit-any\nexport const _isBaseDocumentLoader = (arg: any): arg is BaseDocumentLoader => {\n  if (\n    \"load\" in arg &&\n    typeof arg.load === \"function\" &&\n    \"loadAndSplit\" in arg &&\n    typeof arg.loadAndSplit === \"function\"\n  ) {\n    return true;\n  }\n  return false;\n};\n\ninterface IndexArgs {\n  docsSource: BaseDocumentLoader | DocumentInterface[];\n  recordManager: RecordManagerInterface;\n  vectorStore: VectorStore;\n  options?: IndexOptions;\n}\n\n/**\n * Index data from the doc source into the vector store.\n *\n * Indexing functionality uses a manager to keep track of which documents\n * are in the vector store.\n *\n * This allows us to keep track of which documents were updated, and which\n * documents were deleted, which documents should be skipped.\n *\n * For the time being, documents are indexed using their hashes, and users\n *  are not able to specify the uid of the document.\n *\n * @param {IndexArgs} args\n * @param {BaseDocumentLoader | DocumentInterface[]} args.docsSource The source of documents to index. Can be a DocumentLoader or a list of Documents.\n * @param {RecordManagerInterface} args.recordManager The record manager to use for keeping track of indexed documents.\n * @param {VectorStore} args.vectorStore The vector store to use for storing the documents.\n * @param {IndexOptions | undefined} args.options Options for indexing.\n * @returns {Promise<IndexingResult>}\n */\nexport async function index(args: IndexArgs): Promise<IndexingResult> {\n  const { docsSource, recordManager, vectorStore, options } = args;\n  const {\n    batchSize = 100,\n    cleanup,\n    sourceIdKey,\n    cleanupBatchSize = 1000,\n    forceUpdate = false,\n  } = options ?? {};\n\n  if (cleanup === \"incremental\" && !sourceIdKey) {\n    throw new Error(\n      \"sourceIdKey is required when cleanup mode is incremental. Please provide through 'options.sourceIdKey'.\"\n    );\n  }\n\n  const docs = _isBaseDocumentLoader(docsSource)\n    ? await docsSource.load()\n    : docsSource;\n\n  const sourceIdAssigner = _getSourceIdAssigner(sourceIdKey ?? null);\n\n  const indexStartDt = await recordManager.getTime();\n  let numAdded = 0;\n  let numDeleted = 0;\n  let numUpdated = 0;\n  let numSkipped = 0;\n\n  const batches = _batch<DocumentInterface>(batchSize ?? 100, docs);\n\n  for (const batch of batches) {\n    const hashedDocs = _deduplicateInOrder(\n      batch.map((doc) => _HashedDocument.fromDocument(doc))\n    );\n\n    const sourceIds = hashedDocs.map((doc) => sourceIdAssigner(doc));\n\n    if (cleanup === \"incremental\") {\n      hashedDocs.forEach((_hashedDoc, index) => {\n        const source = sourceIds[index];\n        if (source === null) {\n          throw new Error(\n            \"sourceIdKey must be provided when cleanup is incremental\"\n          );\n        }\n      });\n    }\n\n    const batchExists = await recordManager.exists(\n      hashedDocs.map((doc) => doc.uid)\n    );\n\n    const uids: string[] = [];\n    const docsToIndex: DocumentInterface[] = [];\n    const docsToUpdate: string[] = [];\n    const seenDocs = new Set<string>();\n    hashedDocs.forEach((hashedDoc, i) => {\n      const docExists = batchExists[i];\n      if (docExists) {\n        if (forceUpdate) {\n          seenDocs.add(hashedDoc.uid);\n        } else {\n          docsToUpdate.push(hashedDoc.uid);\n          return;\n        }\n      }\n      uids.push(hashedDoc.uid);\n      docsToIndex.push(hashedDoc.toDocument());\n    });\n\n    if (docsToUpdate.length > 0) {\n      await recordManager.update(docsToUpdate, { timeAtLeast: indexStartDt });\n      numSkipped += docsToUpdate.length;\n    }\n\n    if (docsToIndex.length > 0) {\n      await vectorStore.addDocuments(docsToIndex, { ids: uids });\n      numAdded += docsToIndex.length - seenDocs.size;\n      numUpdated += seenDocs.size;\n    }\n\n    await recordManager.update(\n      hashedDocs.map((doc) => doc.uid),\n      { timeAtLeast: indexStartDt, groupIds: sourceIds }\n    );\n\n    if (cleanup === \"incremental\") {\n      sourceIds.forEach((sourceId) => {\n        if (!sourceId) throw new Error(\"Source id cannot be null\");\n      });\n      const uidsToDelete = await recordManager.listKeys({\n        before: indexStartDt,\n        groupIds: sourceIds,\n      });\n\n      if (uidsToDelete.length > 0) {\n        await vectorStore.delete({ ids: uidsToDelete });\n        await recordManager.deleteKeys(uidsToDelete);\n        numDeleted += uidsToDelete.length;\n      }\n    }\n  }\n\n  if (cleanup === \"full\") {\n    let uidsToDelete = await recordManager.listKeys({\n      before: indexStartDt,\n      limit: cleanupBatchSize,\n    });\n    while (uidsToDelete.length > 0) {\n      await vectorStore.delete({ ids: uidsToDelete });\n      await recordManager.deleteKeys(uidsToDelete);\n      numDeleted += uidsToDelete.length;\n      uidsToDelete = await recordManager.listKeys({\n        before: indexStartDt,\n        limit: cleanupBatchSize,\n      });\n    }\n  }\n\n  return {\n    numAdded,\n    numDeleted,\n    numUpdated,\n    numSkipped,\n  };\n}\n"],"mappings":";;;;;;;;;;;;AAwCA,IAAa,kBAAb,MAAgE;CAC9D;CAEA;CAEA;CAEA;CAEA;CAEA;CAEA,aAAqCA,aAAAA;CAErC,YAAY,QAA4B;AACtC,OAAK,MAAM,OAAO;AAClB,OAAK,cAAc,OAAO;AAC1B,OAAK,WAAW,OAAO;;CAGzB,sBAAsB,cAAoC;AACxD,OAAK,aAAa;;CAGpB,kBAAwB;EACtB,MAAM,gBAAgB;GAAC;GAAS;GAAgB;GAAgB;AAEhE,OAAK,MAAM,OAAO,cAChB,KAAI,OAAO,KAAK,SACd,OAAM,IAAI,MACR,+BAA+B,IAAI,yDAAyD,cAAc,KACxG,KACD,CAAC,GACH;EAIL,MAAM,cAAc,KAAK,kBAAkB,KAAK,YAAY;AAE5D,MAAI;GACF,MAAM,eAAe,KAAK,sBAAsB,KAAK,SAAS;AAC9D,QAAK,cAAc;AACnB,QAAK,eAAe;WACb,GAAG;AACV,SAAM,IAAI,MACR,4BAA4B,EAAE,wDAC/B;;AAGH,OAAK,QAAQ,KAAK,kBAAkB,KAAK,cAAc,KAAK,aAAa;AAEzE,MAAI,CAAC,KAAK,IACR,MAAK,MAAM,KAAK;;CAIpB,aAAgC;AAC9B,SAAO,IAAIC,iBAAAA,SAAS;GAClB,aAAa,KAAK;GAClB,UAAU,KAAK;GAChB,CAAC;;CAGJ,OAAO,aACL,UACA,KACiB;EACjB,MAAM,MAAM,IAAI,KAAK;GACnB,aAAa,SAAS;GACtB,UAAU,SAAS;GACnB,KAAK,OAAQ,SAAiD;GAC/D,CAAC;AACF,MAAI,iBAAiB;AACrB,SAAO;;CAGT,kBAA0B,aAA6B;AAErD,UAAA,GAAA,KAAA,IADmB,KAAK,WAAW,YAAY,EACrBC,uBAAAA,iBAAiB;;CAG7C,sBAA8B,MAAuC;EACnE,MAAM,kBAAkB,KAAK,UAAU,MAAM,OAAO,KAAK,KAAK,CAAC,MAAM,CAAC;AAEtE,UAAA,GAAA,KAAA,IADmB,KAAK,WAAW,gBAAgB,EACzBA,uBAAAA,iBAAiB;;;AA2C/C,SAAgB,OAAU,MAAc,UAAsB;CAC5D,MAAM,UAAiB,EAAE;CACzB,IAAI,eAAoB,EAAE;AAE1B,UAAS,SAAS,SAAS;AACzB,eAAa,KAAK,KAAK;AAEvB,MAAI,aAAa,UAAU,MAAM;AAC/B,WAAQ,KAAK,aAAa;AAC1B,kBAAe,EAAE;;GAEnB;AAEF,KAAI,aAAa,SAAS,EACxB,SAAQ,KAAK,aAAa;AAG5B,QAAO;;AAGT,SAAgB,oBACd,iBAC2B;CAC3B,MAAM,uBAAO,IAAI,KAAa;CAC9B,MAAM,eAA0C,EAAE;AAElD,MAAK,MAAM,aAAa,iBAAiB;AACvC,MAAI,CAAC,UAAU,MACb,OAAM,IAAI,MAAM,uCAAuC;AAGzD,MAAI,CAAC,KAAK,IAAI,UAAU,MAAM,EAAE;AAC9B,QAAK,IAAI,UAAU,MAAM;AACzB,gBAAa,KAAK,UAAU;;;AAGhC,QAAO;;AAGT,SAAgB,qBACd,aAC2C;AAC3C,KAAI,gBAAgB,KAClB,SAAQ,SAA4B;UAC3B,OAAO,gBAAgB,SAChC,SAAQ,QAA2B,IAAI,SAAS;UACvC,OAAO,gBAAgB,WAChC,QAAO;KAEP,OAAM,IAAI,MACR,2DAA2D,OAAO,cACnE;;AAKL,MAAa,yBAAyB,QAAwC;AAC5E,KACE,UAAU,OACV,OAAO,IAAI,SAAS,cACpB,kBAAkB,OAClB,OAAO,IAAI,iBAAiB,WAE5B,QAAO;AAET,QAAO;;;;;;;;;;;;;;;;;;;;;AA6BT,eAAsB,MAAM,MAA0C;CACpE,MAAM,EAAE,YAAY,eAAe,aAAa,YAAY;CAC5D,MAAM,EACJ,YAAY,KACZ,SACA,aACA,mBAAmB,KACnB,cAAc,UACZ,WAAW,EAAE;AAEjB,KAAI,YAAY,iBAAiB,CAAC,YAChC,OAAM,IAAI,MACR,0GACD;CAGH,MAAM,OAAO,sBAAsB,WAAW,GAC1C,MAAM,WAAW,MAAM,GACvB;CAEJ,MAAM,mBAAmB,qBAAqB,eAAe,KAAK;CAElE,MAAM,eAAe,MAAM,cAAc,SAAS;CAClD,IAAI,WAAW;CACf,IAAI,aAAa;CACjB,IAAI,aAAa;CACjB,IAAI,aAAa;CAEjB,MAAM,UAAU,OAA0B,aAAa,KAAK,KAAK;AAEjE,MAAK,MAAM,SAAS,SAAS;EAC3B,MAAM,aAAa,oBACjB,MAAM,KAAK,QAAQ,gBAAgB,aAAa,IAAI,CAAC,CACtD;EAED,MAAM,YAAY,WAAW,KAAK,QAAQ,iBAAiB,IAAI,CAAC;AAEhE,MAAI,YAAY,cACd,YAAW,SAAS,YAAY,UAAU;AAExC,OADe,UAAU,WACV,KACb,OAAM,IAAI,MACR,2DACD;IAEH;EAGJ,MAAM,cAAc,MAAM,cAAc,OACtC,WAAW,KAAK,QAAQ,IAAI,IAAI,CACjC;EAED,MAAM,OAAiB,EAAE;EACzB,MAAM,cAAmC,EAAE;EAC3C,MAAM,eAAyB,EAAE;EACjC,MAAM,2BAAW,IAAI,KAAa;AAClC,aAAW,SAAS,WAAW,MAAM;AAEnC,OADkB,YAAY,GAE5B,KAAI,YACF,UAAS,IAAI,UAAU,IAAI;QACtB;AACL,iBAAa,KAAK,UAAU,IAAI;AAChC;;AAGJ,QAAK,KAAK,UAAU,IAAI;AACxB,eAAY,KAAK,UAAU,YAAY,CAAC;IACxC;AAEF,MAAI,aAAa,SAAS,GAAG;AAC3B,SAAM,cAAc,OAAO,cAAc,EAAE,aAAa,cAAc,CAAC;AACvE,iBAAc,aAAa;;AAG7B,MAAI,YAAY,SAAS,GAAG;AAC1B,SAAM,YAAY,aAAa,aAAa,EAAE,KAAK,MAAM,CAAC;AAC1D,eAAY,YAAY,SAAS,SAAS;AAC1C,iBAAc,SAAS;;AAGzB,QAAM,cAAc,OAClB,WAAW,KAAK,QAAQ,IAAI,IAAI,EAChC;GAAE,aAAa;GAAc,UAAU;GAAW,CACnD;AAED,MAAI,YAAY,eAAe;AAC7B,aAAU,SAAS,aAAa;AAC9B,QAAI,CAAC,SAAU,OAAM,IAAI,MAAM,2BAA2B;KAC1D;GACF,MAAM,eAAe,MAAM,cAAc,SAAS;IAChD,QAAQ;IACR,UAAU;IACX,CAAC;AAEF,OAAI,aAAa,SAAS,GAAG;AAC3B,UAAM,YAAY,OAAO,EAAE,KAAK,cAAc,CAAC;AAC/C,UAAM,cAAc,WAAW,aAAa;AAC5C,kBAAc,aAAa;;;;AAKjC,KAAI,YAAY,QAAQ;EACtB,IAAI,eAAe,MAAM,cAAc,SAAS;GAC9C,QAAQ;GACR,OAAO;GACR,CAAC;AACF,SAAO,aAAa,SAAS,GAAG;AAC9B,SAAM,YAAY,OAAO,EAAE,KAAK,cAAc,CAAC;AAC/C,SAAM,cAAc,WAAW,aAAa;AAC5C,iBAAc,aAAa;AAC3B,kBAAe,MAAM,cAAc,SAAS;IAC1C,QAAQ;IACR,OAAO;IACR,CAAC;;;AAIN,QAAO;EACL;EACA;EACA;EACA;EACD"}