{"version":3,"file":"base.cjs","names":["sha256","fields: HashedDocumentArgs","keyEncoderFn: HashKeyEncoder","Document","document: DocumentInterface","uid?: string","inputString: string","UUIDV5_NAMESPACE","data: Record<string, unknown>","size: number","iterable: T[]","batches: T[][]","currentBatch: T[]","hashedDocuments: HashedDocumentInterface[]","deduplicated: HashedDocumentInterface[]","sourceIdKey: StringOrDocFunc | null","_doc: DocumentInterface","doc: DocumentInterface","arg: any","args: IndexArgs","index","uids: string[]","docsToIndex: DocumentInterface[]","docsToUpdate: string[]"],"sources":["../../src/indexing/base.ts"],"sourcesContent":["import { v5 as uuidv5 } from \"uuid\";\nimport { VectorStore } from \"../vectorstores.js\";\nimport { RecordManagerInterface, UUIDV5_NAMESPACE } from \"./record_manager.js\";\nimport { sha256, type HashKeyEncoder } from \"../utils/hash.js\";\nimport { DocumentInterface, Document } from \"../documents/document.js\";\nimport { BaseDocumentLoader } from \"../document_loaders/base.js\";\n\ntype Metadata = Record<string, unknown>;\n\ntype IndexingResult = {\n  numAdded: number;\n  numDeleted: number;\n  numUpdated: number;\n  numSkipped: number;\n};\n\ntype StringOrDocFunc = string | ((doc: DocumentInterface) => string);\n\nexport interface HashedDocumentInterface extends DocumentInterface {\n  uid: string;\n  hash_?: string;\n  contentHash?: string;\n  metadataHash?: string;\n  pageContent: string;\n  metadata: Metadata;\n  calculateHashes(): void;\n  toDocument(): DocumentInterface;\n}\n\ninterface HashedDocumentArgs {\n  pageContent: string;\n  metadata: Metadata;\n  uid: string;\n}\n\n/**\n * HashedDocument is a Document with hashes calculated.\n * Hashes are calculated based on page content and metadata.\n * It is used for indexing.\n */\nexport class _HashedDocument implements HashedDocumentInterface {\n  uid: string;\n\n  hash_?: string;\n\n  contentHash?: string;\n\n  metadataHash?: string;\n\n  pageContent: string;\n\n  metadata: Metadata;\n\n  private keyEncoder: HashKeyEncoder = sha256;\n\n  constructor(fields: HashedDocumentArgs) {\n    this.uid = fields.uid;\n    this.pageContent = fields.pageContent;\n    this.metadata = fields.metadata;\n  }\n\n  makeDefaultKeyEncoder(keyEncoderFn: HashKeyEncoder): void {\n    this.keyEncoder = keyEncoderFn;\n  }\n\n  calculateHashes(): void {\n    const forbiddenKeys = [\"hash_\", \"content_hash\", \"metadata_hash\"];\n\n    for (const key of forbiddenKeys) {\n      if (key in this.metadata) {\n        throw new Error(\n          `Metadata cannot contain key ${key} as it is reserved for internal use. Restricted keys: [${forbiddenKeys.join(\n            \", \"\n          )}]`\n        );\n      }\n    }\n\n    const contentHash = this._hashStringToUUID(this.pageContent);\n\n    try {\n      const metadataHash = this._hashNestedDictToUUID(this.metadata);\n      this.contentHash = contentHash;\n      this.metadataHash = metadataHash;\n    } catch (e) {\n      throw new Error(\n        `Failed to hash metadata: ${e}. Please use a dict that can be serialized using json.`\n      );\n    }\n\n    this.hash_ = this._hashStringToUUID(this.contentHash + this.metadataHash);\n\n    if (!this.uid) {\n      this.uid = this.hash_;\n    }\n  }\n\n  toDocument(): DocumentInterface {\n    return new Document({\n      pageContent: this.pageContent,\n      metadata: this.metadata,\n    });\n  }\n\n  static fromDocument(\n    document: DocumentInterface,\n    uid?: string\n  ): _HashedDocument {\n    const doc = new this({\n      pageContent: document.pageContent,\n      metadata: document.metadata,\n      uid: uid || (document as DocumentInterface & { uid: string }).uid,\n    });\n    doc.calculateHashes();\n    return doc;\n  }\n\n  private _hashStringToUUID(inputString: string): string {\n    const hash_value = this.keyEncoder(inputString);\n    return uuidv5(hash_value, UUIDV5_NAMESPACE);\n  }\n\n  private _hashNestedDictToUUID(data: Record<string, unknown>): string {\n    const serialized_data = JSON.stringify(data, Object.keys(data).sort());\n    const hash_value = this.keyEncoder(serialized_data);\n    return uuidv5(hash_value, UUIDV5_NAMESPACE);\n  }\n}\n\nexport type CleanupMode = \"full\" | \"incremental\";\n\nexport type IndexOptions = {\n  /**\n   * The number of documents to index in one batch.\n   */\n  batchSize?: number;\n  /**\n   * The cleanup mode to use. Can be \"full\", \"incremental\" or undefined.\n   * - **Incremental**: Cleans up all documents that haven't been updated AND\n   *   that are associated with source ids that were seen\n   *   during indexing.\n   *   Clean up is done continuously during indexing helping\n   *   to minimize the probability of users seeing duplicated\n   *   content.\n   * - **Full**: Delete all documents that haven to been returned by the loader.\n   *   Clean up runs after all documents have been indexed.\n   *   This means that users may see duplicated content during indexing.\n   * - **undefined**: Do not delete any documents.\n   */\n  cleanup?: CleanupMode;\n  /**\n   * Optional key that helps identify the original source of the document.\n   * Must either be a string representing the key of the source in the metadata\n   * or a function that takes a document and returns a string representing the source.\n   * **Required when cleanup is incremental**.\n   */\n  sourceIdKey?: StringOrDocFunc;\n  /**\n   * Batch size to use when cleaning up documents.\n   */\n  cleanupBatchSize?: number;\n  /**\n   * Force update documents even if they are present in the\n   * record manager. Useful if you are re-indexing with updated embeddings.\n   */\n  forceUpdate?: boolean;\n};\n\nexport function _batch<T>(size: number, iterable: T[]): T[][] {\n  const batches: T[][] = [];\n  let currentBatch: T[] = [];\n\n  iterable.forEach((item) => {\n    currentBatch.push(item);\n\n    if (currentBatch.length >= size) {\n      batches.push(currentBatch);\n      currentBatch = [];\n    }\n  });\n\n  if (currentBatch.length > 0) {\n    batches.push(currentBatch);\n  }\n\n  return batches;\n}\n\nexport function _deduplicateInOrder(\n  hashedDocuments: HashedDocumentInterface[]\n): HashedDocumentInterface[] {\n  const seen = new Set<string>();\n  const deduplicated: HashedDocumentInterface[] = [];\n\n  for (const hashedDoc of hashedDocuments) {\n    if (!hashedDoc.hash_) {\n      throw new Error(\"Hashed document does not have a hash\");\n    }\n\n    if (!seen.has(hashedDoc.hash_)) {\n      seen.add(hashedDoc.hash_);\n      deduplicated.push(hashedDoc);\n    }\n  }\n  return deduplicated;\n}\n\nexport function _getSourceIdAssigner(\n  sourceIdKey: StringOrDocFunc | null\n): (doc: DocumentInterface) => string | null {\n  if (sourceIdKey === null) {\n    return (_doc: DocumentInterface) => null;\n  } else if (typeof sourceIdKey === \"string\") {\n    return (doc: DocumentInterface) => doc.metadata[sourceIdKey];\n  } else if (typeof sourceIdKey === \"function\") {\n    return sourceIdKey;\n  } else {\n    throw new Error(\n      `sourceIdKey should be null, a string or a function, got ${typeof sourceIdKey}`\n    );\n  }\n}\n\n// eslint-disable-next-line @typescript-eslint/no-explicit-any\nexport const _isBaseDocumentLoader = (arg: any): arg is BaseDocumentLoader => {\n  if (\n    \"load\" in arg &&\n    typeof arg.load === \"function\" &&\n    \"loadAndSplit\" in arg &&\n    typeof arg.loadAndSplit === \"function\"\n  ) {\n    return true;\n  }\n  return false;\n};\n\ninterface IndexArgs {\n  docsSource: BaseDocumentLoader | DocumentInterface[];\n  recordManager: RecordManagerInterface;\n  vectorStore: VectorStore;\n  options?: IndexOptions;\n}\n\n/**\n * Index data from the doc source into the vector store.\n *\n * Indexing functionality uses a manager to keep track of which documents\n * are in the vector store.\n *\n * This allows us to keep track of which documents were updated, and which\n * documents were deleted, which documents should be skipped.\n *\n * For the time being, documents are indexed using their hashes, and users\n *  are not able to specify the uid of the document.\n *\n * @param {IndexArgs} args\n * @param {BaseDocumentLoader | DocumentInterface[]} args.docsSource The source of documents to index. Can be a DocumentLoader or a list of Documents.\n * @param {RecordManagerInterface} args.recordManager The record manager to use for keeping track of indexed documents.\n * @param {VectorStore} args.vectorStore The vector store to use for storing the documents.\n * @param {IndexOptions | undefined} args.options Options for indexing.\n * @returns {Promise<IndexingResult>}\n */\nexport async function index(args: IndexArgs): Promise<IndexingResult> {\n  const { docsSource, recordManager, vectorStore, options } = args;\n  const {\n    batchSize = 100,\n    cleanup,\n    sourceIdKey,\n    cleanupBatchSize = 1000,\n    forceUpdate = false,\n  } = options ?? {};\n\n  if (cleanup === \"incremental\" && !sourceIdKey) {\n    throw new Error(\n      \"sourceIdKey is required when cleanup mode is incremental. Please provide through 'options.sourceIdKey'.\"\n    );\n  }\n\n  const docs = _isBaseDocumentLoader(docsSource)\n    ? await docsSource.load()\n    : docsSource;\n\n  const sourceIdAssigner = _getSourceIdAssigner(sourceIdKey ?? null);\n\n  const indexStartDt = await recordManager.getTime();\n  let numAdded = 0;\n  let numDeleted = 0;\n  let numUpdated = 0;\n  let numSkipped = 0;\n\n  const batches = _batch<DocumentInterface>(batchSize ?? 100, docs);\n\n  for (const batch of batches) {\n    const hashedDocs = _deduplicateInOrder(\n      batch.map((doc) => _HashedDocument.fromDocument(doc))\n    );\n\n    const sourceIds = hashedDocs.map((doc) => sourceIdAssigner(doc));\n\n    if (cleanup === \"incremental\") {\n      hashedDocs.forEach((_hashedDoc, index) => {\n        const source = sourceIds[index];\n        if (source === null) {\n          throw new Error(\n            \"sourceIdKey must be provided when cleanup is incremental\"\n          );\n        }\n      });\n    }\n\n    const batchExists = await recordManager.exists(\n      hashedDocs.map((doc) => doc.uid)\n    );\n\n    const uids: string[] = [];\n    const docsToIndex: DocumentInterface[] = [];\n    const docsToUpdate: string[] = [];\n    const seenDocs = new Set<string>();\n    hashedDocs.forEach((hashedDoc, i) => {\n      const docExists = batchExists[i];\n      if (docExists) {\n        if (forceUpdate) {\n          seenDocs.add(hashedDoc.uid);\n        } else {\n          docsToUpdate.push(hashedDoc.uid);\n          return;\n        }\n      }\n      uids.push(hashedDoc.uid);\n      docsToIndex.push(hashedDoc.toDocument());\n    });\n\n    if (docsToUpdate.length > 0) {\n      await recordManager.update(docsToUpdate, { timeAtLeast: indexStartDt });\n      numSkipped += docsToUpdate.length;\n    }\n\n    if (docsToIndex.length > 0) {\n      await vectorStore.addDocuments(docsToIndex, { ids: uids });\n      numAdded += docsToIndex.length - seenDocs.size;\n      numUpdated += seenDocs.size;\n    }\n\n    await recordManager.update(\n      hashedDocs.map((doc) => doc.uid),\n      { timeAtLeast: indexStartDt, groupIds: sourceIds }\n    );\n\n    if (cleanup === \"incremental\") {\n      sourceIds.forEach((sourceId) => {\n        if (!sourceId) throw new Error(\"Source id cannot be null\");\n      });\n      const uidsToDelete = await recordManager.listKeys({\n        before: indexStartDt,\n        groupIds: sourceIds,\n      });\n\n      if (uidsToDelete.length > 0) {\n        await vectorStore.delete({ ids: uidsToDelete });\n        await recordManager.deleteKeys(uidsToDelete);\n        numDeleted += uidsToDelete.length;\n      }\n    }\n  }\n\n  if (cleanup === \"full\") {\n    let uidsToDelete = await recordManager.listKeys({\n      before: indexStartDt,\n      limit: cleanupBatchSize,\n    });\n    while (uidsToDelete.length > 0) {\n      await vectorStore.delete({ ids: uidsToDelete });\n      await recordManager.deleteKeys(uidsToDelete);\n      numDeleted += uidsToDelete.length;\n      uidsToDelete = await recordManager.listKeys({\n        before: indexStartDt,\n        limit: cleanupBatchSize,\n      });\n    }\n  }\n\n  return {\n    numAdded,\n    numDeleted,\n    numUpdated,\n    numSkipped,\n  };\n}\n"],"mappings":";;;;;;;;;;;;;AAwCA,IAAa,kBAAb,MAAgE;CAC9D;CAEA;CAEA;CAEA;CAEA;CAEA;CAEA,AAAQ,aAA6BA;CAErC,YAAYC,QAA4B;EACtC,KAAK,MAAM,OAAO;EAClB,KAAK,cAAc,OAAO;EAC1B,KAAK,WAAW,OAAO;CACxB;CAED,sBAAsBC,cAAoC;EACxD,KAAK,aAAa;CACnB;CAED,kBAAwB;EACtB,MAAM,gBAAgB;GAAC;GAAS;GAAgB;EAAgB;AAEhE,OAAK,MAAM,OAAO,cAChB,KAAI,OAAO,KAAK,SACd,OAAM,IAAI,MACR,CAAC,4BAA4B,EAAE,IAAI,uDAAuD,EAAE,cAAc,KACxG,KACD,CAAC,CAAC,CAAC;EAKV,MAAM,cAAc,KAAK,kBAAkB,KAAK,YAAY;AAE5D,MAAI;GACF,MAAM,eAAe,KAAK,sBAAsB,KAAK,SAAS;GAC9D,KAAK,cAAc;GACnB,KAAK,eAAe;EACrB,SAAQ,GAAG;AACV,SAAM,IAAI,MACR,CAAC,yBAAyB,EAAE,EAAE,sDAAsD,CAAC;EAExF;EAED,KAAK,QAAQ,KAAK,kBAAkB,KAAK,cAAc,KAAK,aAAa;AAEzE,MAAI,CAAC,KAAK,KACR,KAAK,MAAM,KAAK;CAEnB;CAED,aAAgC;AAC9B,SAAO,IAAIC,0BAAS;GAClB,aAAa,KAAK;GAClB,UAAU,KAAK;EAChB;CACF;CAED,OAAO,aACLC,UACAC,KACiB;EACjB,MAAM,MAAM,IAAI,KAAK;GACnB,aAAa,SAAS;GACtB,UAAU,SAAS;GACnB,KAAK,OAAQ,SAAiD;EAC/D;EACD,IAAI,iBAAiB;AACrB,SAAO;CACR;CAED,AAAQ,kBAAkBC,aAA6B;EACrD,MAAM,aAAa,KAAK,WAAW,YAAY;AAC/C,sBAAc,YAAYC,wCAAiB;CAC5C;CAED,AAAQ,sBAAsBC,MAAuC;EACnE,MAAM,kBAAkB,KAAK,UAAU,MAAM,OAAO,KAAK,KAAK,CAAC,MAAM,CAAC;EACtE,MAAM,aAAa,KAAK,WAAW,gBAAgB;AACnD,sBAAc,YAAYD,wCAAiB;CAC5C;AACF;AAyCD,SAAgB,OAAUE,MAAcC,UAAsB;CAC5D,MAAMC,UAAiB,CAAE;CACzB,IAAIC,eAAoB,CAAE;CAE1B,SAAS,QAAQ,CAAC,SAAS;EACzB,aAAa,KAAK,KAAK;AAEvB,MAAI,aAAa,UAAU,MAAM;GAC/B,QAAQ,KAAK,aAAa;GAC1B,eAAe,CAAE;EAClB;CACF,EAAC;AAEF,KAAI,aAAa,SAAS,GACxB,QAAQ,KAAK,aAAa;AAG5B,QAAO;AACR;AAED,SAAgB,oBACdC,iBAC2B;CAC3B,MAAM,uBAAO,IAAI;CACjB,MAAMC,eAA0C,CAAE;AAElD,MAAK,MAAM,aAAa,iBAAiB;AACvC,MAAI,CAAC,UAAU,MACb,OAAM,IAAI,MAAM;AAGlB,MAAI,CAAC,KAAK,IAAI,UAAU,MAAM,EAAE;GAC9B,KAAK,IAAI,UAAU,MAAM;GACzB,aAAa,KAAK,UAAU;EAC7B;CACF;AACD,QAAO;AACR;AAED,SAAgB,qBACdC,aAC2C;AAC3C,KAAI,gBAAgB,KAClB,QAAO,CAACC,SAA4B;UAC3B,OAAO,gBAAgB,SAChC,QAAO,CAACC,QAA2B,IAAI,SAAS;UACvC,OAAO,gBAAgB,WAChC,QAAO;KAEP,OAAM,IAAI,MACR,CAAC,wDAAwD,EAAE,OAAO,aAAa;AAGpF;AAGD,MAAa,wBAAwB,CAACC,QAAwC;AAC5E,KACE,UAAU,OACV,OAAO,IAAI,SAAS,cACpB,kBAAkB,OAClB,OAAO,IAAI,iBAAiB,WAE5B,QAAO;AAET,QAAO;AACR;;;;;;;;;;;;;;;;;;;;AA4BD,eAAsB,MAAMC,MAA0C;CACpE,MAAM,EAAE,YAAY,eAAe,aAAa,SAAS,GAAG;CAC5D,MAAM,EACJ,YAAY,KACZ,SACA,aACA,mBAAmB,KACnB,cAAc,OACf,GAAG,WAAW,CAAE;AAEjB,KAAI,YAAY,iBAAiB,CAAC,YAChC,OAAM,IAAI,MACR;CAIJ,MAAM,OAAO,sBAAsB,WAAW,GAC1C,MAAM,WAAW,MAAM,GACvB;CAEJ,MAAM,mBAAmB,qBAAqB,eAAe,KAAK;CAElE,MAAM,eAAe,MAAM,cAAc,SAAS;CAClD,IAAI,WAAW;CACf,IAAI,aAAa;CACjB,IAAI,aAAa;CACjB,IAAI,aAAa;CAEjB,MAAM,UAAU,OAA0B,aAAa,KAAK,KAAK;AAEjE,MAAK,MAAM,SAAS,SAAS;EAC3B,MAAM,aAAa,oBACjB,MAAM,IAAI,CAAC,QAAQ,gBAAgB,aAAa,IAAI,CAAC,CACtD;EAED,MAAM,YAAY,WAAW,IAAI,CAAC,QAAQ,iBAAiB,IAAI,CAAC;AAEhE,MAAI,YAAY,eACd,WAAW,QAAQ,CAAC,YAAYC,YAAU;GACxC,MAAM,SAAS,UAAUA;AACzB,OAAI,WAAW,KACb,OAAM,IAAI,MACR;EAGL,EAAC;EAGJ,MAAM,cAAc,MAAM,cAAc,OACtC,WAAW,IAAI,CAAC,QAAQ,IAAI,IAAI,CACjC;EAED,MAAMC,OAAiB,CAAE;EACzB,MAAMC,cAAmC,CAAE;EAC3C,MAAMC,eAAyB,CAAE;EACjC,MAAM,2BAAW,IAAI;EACrB,WAAW,QAAQ,CAAC,WAAW,MAAM;GACnC,MAAM,YAAY,YAAY;AAC9B,OAAI,UACF,KAAI,aACF,SAAS,IAAI,UAAU,IAAI;QACtB;IACL,aAAa,KAAK,UAAU,IAAI;AAChC;GACD;GAEH,KAAK,KAAK,UAAU,IAAI;GACxB,YAAY,KAAK,UAAU,YAAY,CAAC;EACzC,EAAC;AAEF,MAAI,aAAa,SAAS,GAAG;GAC3B,MAAM,cAAc,OAAO,cAAc,EAAE,aAAa,aAAc,EAAC;GACvE,cAAc,aAAa;EAC5B;AAED,MAAI,YAAY,SAAS,GAAG;GAC1B,MAAM,YAAY,aAAa,aAAa,EAAE,KAAK,KAAM,EAAC;GAC1D,YAAY,YAAY,SAAS,SAAS;GAC1C,cAAc,SAAS;EACxB;EAED,MAAM,cAAc,OAClB,WAAW,IAAI,CAAC,QAAQ,IAAI,IAAI,EAChC;GAAE,aAAa;GAAc,UAAU;EAAW,EACnD;AAED,MAAI,YAAY,eAAe;GAC7B,UAAU,QAAQ,CAAC,aAAa;AAC9B,QAAI,CAAC,SAAU,OAAM,IAAI,MAAM;GAChC,EAAC;GACF,MAAM,eAAe,MAAM,cAAc,SAAS;IAChD,QAAQ;IACR,UAAU;GACX,EAAC;AAEF,OAAI,aAAa,SAAS,GAAG;IAC3B,MAAM,YAAY,OAAO,EAAE,KAAK,aAAc,EAAC;IAC/C,MAAM,cAAc,WAAW,aAAa;IAC5C,cAAc,aAAa;GAC5B;EACF;CACF;AAED,KAAI,YAAY,QAAQ;EACtB,IAAI,eAAe,MAAM,cAAc,SAAS;GAC9C,QAAQ;GACR,OAAO;EACR,EAAC;AACF,SAAO,aAAa,SAAS,GAAG;GAC9B,MAAM,YAAY,OAAO,EAAE,KAAK,aAAc,EAAC;GAC/C,MAAM,cAAc,WAAW,aAAa;GAC5C,cAAc,aAAa;GAC3B,eAAe,MAAM,cAAc,SAAS;IAC1C,QAAQ;IACR,OAAO;GACR,EAAC;EACH;CACF;AAED,QAAO;EACL;EACA;EACA;EACA;CACD;AACF"}