{"version":3,"file":"unstructured.cjs","names":["BaseDocumentLoader","Document","DirectoryLoader","UnknownHandling"],"sources":["../../../src/document_loaders/fs/unstructured.ts"],"sourcesContent":["import type { basename as BasenameT } from \"node:path\";\nimport type { readFile as ReadFileT } from \"node:fs/promises\";\nimport { Document } from \"@langchain/core/documents\";\nimport { getEnv, getEnvironmentVariable } from \"@langchain/core/utils/env\";\nimport { StringWithAutocomplete } from \"@langchain/core/utils/types\";\nimport {\n  DirectoryLoader,\n  UnknownHandling,\n  LoadersMapping,\n} from \"@langchain/classic/document_loaders/fs/directory\";\nimport { BaseDocumentLoader } from \"@langchain/core/document_loaders/base\";\n\nexport const UNSTRUCTURED_API_FILETYPES = [\n  \".txt\",\n  \".text\",\n  \".pdf\",\n  \".docx\",\n  \".doc\",\n  \".jpg\",\n  \".jpeg\",\n  \".eml\",\n  \".html\",\n  \".htm\",\n  \".md\",\n  \".pptx\",\n  \".ppt\",\n  \".msg\",\n  \".rtf\",\n  \".xlsx\",\n  \".xls\",\n  \".odt\",\n  \".epub\",\n];\n\n/**\n * Represents an element returned by the Unstructured API. It has\n * properties for the element type, text content, and metadata.\n */\ntype Element = {\n  type: string;\n  text: string;\n  // this is purposefully loosely typed\n  metadata: {\n    [key: string]: unknown;\n  };\n};\n\n/**\n * Represents the available strategies for the UnstructuredLoader. It can\n * be one of \"hi_res\", \"fast\", \"ocr_only\", or \"auto\".\n */\nexport type UnstructuredLoaderStrategy =\n  | \"hi_res\"\n  | \"fast\"\n  | \"ocr_only\"\n  | \"auto\";\n\n/**\n * Represents the available hi-res models for the UnstructuredLoader. It can\n * be one of \"chipper\".\n */\nexport type HiResModelName = \"chipper\";\n\n/**\n * To enable or disable table extraction for file types other than PDF, set\n * the skipInferTableTypes property in the UnstructuredLoaderOptions object.\n * The skipInferTableTypes property is an array of file types for which table\n * extraction is disabled. For example, to disable table extraction for .docx\n * and .doc files, set the skipInferTableTypes property to [\"docx\", \"doc\"].\n * You can also disable table extraction for all file types other than PDF by\n * setting the skipInferTableTypes property to [].\n */\nexport type SkipInferTableTypes =\n  | \"txt\"\n  | \"text\"\n  | \"pdf\"\n  | \"docx\"\n  | \"doc\"\n  | \"jpg\"\n  | \"jpeg\"\n  | \"eml\"\n  | \"html\"\n  | \"htm\"\n  | \"md\"\n  | \"pptx\"\n  | \"ppt\"\n  | \"msg\"\n  | \"rtf\"\n  | \"xlsx\"\n  | \"xls\"\n  | \"odt\"\n  | \"epub\";\n\n/**\n * Set the chunking_strategy to chunk text into larger or smaller elements. Defaults to None with optional arg of by_title\n */\nexport type ChunkingStrategy = \"None\" | \"by_title\";\n\nexport type UnstructuredLoaderOptions = {\n  apiKey?: string;\n  apiUrl?: string;\n  strategy?: StringWithAutocomplete<UnstructuredLoaderStrategy>;\n  encoding?: string;\n  ocrLanguages?: Array<string>;\n  coordinates?: boolean;\n  pdfInferTableStructure?: boolean;\n  xmlKeepTags?: boolean;\n  skipInferTableTypes?: Array<StringWithAutocomplete<SkipInferTableTypes>>;\n  hiResModelName?: StringWithAutocomplete<HiResModelName>;\n  includePageBreaks?: boolean;\n  chunkingStrategy?: StringWithAutocomplete<ChunkingStrategy>;\n  multiPageSections?: boolean;\n  combineUnderNChars?: number;\n  newAfterNChars?: number;\n  maxCharacters?: number;\n  extractImageBlockTypes?: string[];\n  overlap?: number;\n  overlapAll?: boolean;\n};\n\nexport type UnstructuredDirectoryLoaderOptions = UnstructuredLoaderOptions & {\n  recursive?: boolean;\n  unknown?: UnknownHandling;\n};\n\nexport type UnstructuredMemoryLoaderOptions = {\n  buffer: Buffer;\n  fileName: string;\n};\n\n/**\n * A document loader that uses the Unstructured API to load unstructured\n * documents. It supports both the new syntax with options object and the\n * legacy syntax for backward compatibility. The load() method sends a\n * partitioning request to the Unstructured API and retrieves the\n * partitioned elements. It creates a Document instance for each element\n * and returns an array of Document instances.\n *\n * It accepts either a filepath or an object containing a buffer and a filename\n * as input.\n */\nexport class UnstructuredLoader extends BaseDocumentLoader {\n  public filePath: string;\n\n  private buffer?: Buffer;\n\n  private fileName?: string;\n\n  private apiUrl = \"https://api.unstructured.io/general/v0/general\";\n\n  private apiKey?: string;\n\n  private strategy: StringWithAutocomplete<UnstructuredLoaderStrategy> =\n    \"hi_res\";\n\n  private encoding?: string;\n\n  private ocrLanguages: Array<string> = [];\n\n  private coordinates?: boolean;\n\n  private pdfInferTableStructure?: boolean;\n\n  private xmlKeepTags?: boolean;\n\n  private skipInferTableTypes?: Array<\n    StringWithAutocomplete<SkipInferTableTypes>\n  >;\n\n  private hiResModelName?: StringWithAutocomplete<HiResModelName>;\n\n  private includePageBreaks?: boolean;\n\n  private chunkingStrategy?: StringWithAutocomplete<ChunkingStrategy>;\n\n  private multiPageSections?: boolean;\n\n  private combineUnderNChars?: number;\n\n  private newAfterNChars?: number;\n\n  private maxCharacters?: number;\n\n  private extractImageBlockTypes?: string[];\n\n  private overlap?: number;\n\n  private overlapAll?: boolean;\n\n  constructor(\n    filepathOrBufferOptions: string | UnstructuredMemoryLoaderOptions,\n    unstructuredOptions: UnstructuredLoaderOptions | string = {}\n  ) {\n    super();\n\n    // Temporary shim to avoid breaking existing users\n    // Remove when API keys are enforced by Unstructured and existing code will break anyway\n    const isLegacySyntax = typeof unstructuredOptions === \"string\";\n    const isMemorySyntax = typeof filepathOrBufferOptions === \"object\";\n\n    if (isMemorySyntax) {\n      this.buffer = filepathOrBufferOptions.buffer;\n      this.fileName = filepathOrBufferOptions.fileName;\n    } else if (isLegacySyntax) {\n      this.filePath = unstructuredOptions;\n      this.apiUrl = filepathOrBufferOptions;\n    } else {\n      this.filePath = filepathOrBufferOptions;\n    }\n\n    if (!isLegacySyntax) {\n      const options = unstructuredOptions;\n      this.apiKey =\n        options.apiKey ?? getEnvironmentVariable(\"UNSTRUCTURED_API_KEY\");\n      this.apiUrl =\n        options.apiUrl ??\n        getEnvironmentVariable(\"UNSTRUCTURED_API_URL\") ??\n        this.apiUrl;\n      this.strategy = options.strategy ?? this.strategy;\n      this.encoding = options.encoding;\n      this.ocrLanguages = options.ocrLanguages ?? this.ocrLanguages;\n      this.coordinates = options.coordinates;\n      this.pdfInferTableStructure = options.pdfInferTableStructure;\n      this.xmlKeepTags = options.xmlKeepTags;\n      this.skipInferTableTypes = options.skipInferTableTypes;\n      this.hiResModelName = options.hiResModelName;\n      this.includePageBreaks = options.includePageBreaks;\n      this.chunkingStrategy = options.chunkingStrategy;\n      this.multiPageSections = options.multiPageSections;\n      this.combineUnderNChars = options.combineUnderNChars;\n      this.newAfterNChars = options.newAfterNChars;\n      this.maxCharacters = options.maxCharacters;\n      this.extractImageBlockTypes = options.extractImageBlockTypes;\n      this.overlap = options.overlap;\n      this.overlapAll = options.overlapAll ?? false;\n    }\n  }\n\n  async _partition() {\n    let buffer = this.buffer;\n    let fileName = this.fileName;\n\n    if (!buffer) {\n      const { readFile, basename } = await this.imports();\n\n      buffer = await readFile(this.filePath);\n      fileName = basename(this.filePath);\n\n      // I'm aware this reads the file into memory first, but we have lots of work\n      // to do on then consuming Documents in a streaming fashion anyway, so not\n      // worried about this for now.\n    }\n\n    const formData = new FormData();\n    formData.append(\"files\", new Blob([buffer]), fileName);\n    formData.append(\"strategy\", this.strategy);\n    this.ocrLanguages.forEach((language) => {\n      formData.append(\"ocr_languages\", language);\n    });\n    if (this.encoding) {\n      formData.append(\"encoding\", this.encoding);\n    }\n    if (this.coordinates === true) {\n      formData.append(\"coordinates\", \"true\");\n    }\n    if (this.pdfInferTableStructure === true) {\n      formData.append(\"pdf_infer_table_structure\", \"true\");\n    }\n    if (this.xmlKeepTags === true) {\n      formData.append(\"xml_keep_tags\", \"true\");\n    }\n    if (this.skipInferTableTypes) {\n      formData.append(\n        \"skip_infer_table_types\",\n        JSON.stringify(this.skipInferTableTypes)\n      );\n    }\n    if (this.hiResModelName) {\n      formData.append(\"hi_res_model_name\", this.hiResModelName);\n    }\n    if (this.includePageBreaks) {\n      formData.append(\"include_page_breaks\", \"true\");\n    }\n    if (this.chunkingStrategy) {\n      formData.append(\"chunking_strategy\", this.chunkingStrategy);\n    }\n    if (this.multiPageSections !== undefined) {\n      formData.append(\n        \"multipage_sections\",\n        this.multiPageSections ? \"true\" : \"false\"\n      );\n    }\n    if (this.combineUnderNChars !== undefined) {\n      formData.append(\"combine_under_n_chars\", String(this.combineUnderNChars));\n    }\n    if (this.newAfterNChars !== undefined) {\n      formData.append(\"new_after_n_chars\", String(this.newAfterNChars));\n    }\n    if (this.maxCharacters !== undefined) {\n      formData.append(\"max_characters\", String(this.maxCharacters));\n    }\n\n    if (this.extractImageBlockTypes !== undefined) {\n      formData.append(\n        \"extract_image_block_types\",\n        JSON.stringify(this.extractImageBlockTypes)\n      );\n    }\n\n    if (this.overlap !== undefined) {\n      formData.append(\"overlap\", String(this.overlap));\n    }\n\n    if (this.overlapAll === true) {\n      formData.append(\"overlap_all\", \"true\");\n    }\n\n    const headers = {\n      \"UNSTRUCTURED-API-KEY\": this.apiKey ?? \"\",\n    };\n\n    const response = await fetch(this.apiUrl, {\n      method: \"POST\",\n      body: formData,\n      headers,\n    });\n\n    if (!response.ok) {\n      throw new Error(\n        `Failed to partition file ${this.filePath} with error ${\n          response.status\n        } and message ${await response.text()}`\n      );\n    }\n\n    const elements = await response.json();\n    if (!Array.isArray(elements)) {\n      throw new Error(\n        `Expected partitioning request to return an array, but got ${elements}`\n      );\n    }\n    return elements.filter((el) => typeof el.text === \"string\") as Element[];\n  }\n\n  async load(): Promise<Document[]> {\n    const elements = await this._partition();\n\n    const documents: Document[] = [];\n    for (const element of elements) {\n      const { metadata, text } = element;\n      if (typeof text === \"string\" && text !== \"\") {\n        documents.push(\n          new Document({\n            pageContent: text,\n            metadata: {\n              ...metadata,\n              category: element.type,\n            },\n          })\n        );\n      }\n    }\n\n    return documents;\n  }\n\n  async imports(): Promise<{\n    readFile: typeof ReadFileT;\n    basename: typeof BasenameT;\n  }> {\n    try {\n      const { readFile } = await import(\"node:fs/promises\");\n      const { basename } = await import(\"node:path\");\n      return { readFile, basename };\n    } catch (e) {\n      console.error(e);\n      throw new Error(\n        `Failed to load fs/promises. TextLoader available only on environment 'node'. It appears you are running environment '${getEnv()}'. See https://<link to docs> for alternatives.`\n      );\n    }\n  }\n}\n\n/**\n * A document loader that loads unstructured documents from a directory\n * using the UnstructuredLoader. It creates a UnstructuredLoader instance\n * for each supported file type and passes it to the DirectoryLoader\n * constructor.\n * @example\n * ```typescript\n * const loader = new UnstructuredDirectoryLoader(\"path/to/directory\", {\n *   apiKey: \"MY_API_KEY\",\n * });\n * const docs = await loader.load();\n * ```\n */\nexport class UnstructuredDirectoryLoader extends DirectoryLoader {\n  constructor(\n    directoryPathOrLegacyApiUrl: string,\n    optionsOrLegacyDirectoryPath: UnstructuredDirectoryLoaderOptions | string,\n    legacyOptionRecursive = true,\n    legacyOptionUnknown: UnknownHandling = UnknownHandling.Warn\n  ) {\n    let directoryPath;\n    let options: UnstructuredDirectoryLoaderOptions;\n    // Temporary shim to avoid breaking existing users\n    // Remove when API keys are enforced by Unstructured and existing code will break anyway\n    const isLegacySyntax = typeof optionsOrLegacyDirectoryPath === \"string\";\n    if (isLegacySyntax) {\n      directoryPath = optionsOrLegacyDirectoryPath;\n      options = {\n        apiUrl: directoryPathOrLegacyApiUrl,\n        recursive: legacyOptionRecursive,\n        unknown: legacyOptionUnknown,\n      };\n    } else {\n      directoryPath = directoryPathOrLegacyApiUrl;\n      options = optionsOrLegacyDirectoryPath;\n    }\n    const loader = (p: string) => new UnstructuredLoader(p, options);\n    const loaders = UNSTRUCTURED_API_FILETYPES.reduce(\n      (loadersObject: LoadersMapping, filetype: string) => {\n        loadersObject[filetype] = loader;\n        return loadersObject;\n      },\n      {}\n    );\n    super(directoryPath, loaders, options.recursive, options.unknown);\n  }\n}\n\nexport { UnknownHandling };\n"],"mappings":";;;;;;;;;;;;;AAYA,MAAa,6BAA6B;CACxC;CACA;CACA;CACA;CACA;CACA;CACA;CACA;CACA;CACA;CACA;CACA;CACA;CACA;CACA;CACA;CACA;CACA;CACA;CACD;;;;;;;;;;;;AA6GD,IAAa,qBAAb,cAAwCA,sCAAAA,mBAAmB;CACzD;CAEA;CAEA;CAEA,SAAiB;CAEjB;CAEA,WACE;CAEF;CAEA,eAAsC,EAAE;CAExC;CAEA;CAEA;CAEA;CAIA;CAEA;CAEA;CAEA;CAEA;CAEA;CAEA;CAEA;CAEA;CAEA;CAEA,YACE,yBACA,sBAA0D,EAAE,EAC5D;AACA,SAAO;EAIP,MAAM,iBAAiB,OAAO,wBAAwB;AAGtD,MAFuB,OAAO,4BAA4B,UAEtC;AAClB,QAAK,SAAS,wBAAwB;AACtC,QAAK,WAAW,wBAAwB;aAC/B,gBAAgB;AACzB,QAAK,WAAW;AAChB,QAAK,SAAS;QAEd,MAAK,WAAW;AAGlB,MAAI,CAAC,gBAAgB;GACnB,MAAM,UAAU;AAChB,QAAK,SACH,QAAQ,WAAA,GAAA,0BAAA,wBAAiC,uBAAuB;AAClE,QAAK,SACH,QAAQ,WAAA,GAAA,0BAAA,wBACe,uBAAuB,IAC9C,KAAK;AACP,QAAK,WAAW,QAAQ,YAAY,KAAK;AACzC,QAAK,WAAW,QAAQ;AACxB,QAAK,eAAe,QAAQ,gBAAgB,KAAK;AACjD,QAAK,cAAc,QAAQ;AAC3B,QAAK,yBAAyB,QAAQ;AACtC,QAAK,cAAc,QAAQ;AAC3B,QAAK,sBAAsB,QAAQ;AACnC,QAAK,iBAAiB,QAAQ;AAC9B,QAAK,oBAAoB,QAAQ;AACjC,QAAK,mBAAmB,QAAQ;AAChC,QAAK,oBAAoB,QAAQ;AACjC,QAAK,qBAAqB,QAAQ;AAClC,QAAK,iBAAiB,QAAQ;AAC9B,QAAK,gBAAgB,QAAQ;AAC7B,QAAK,yBAAyB,QAAQ;AACtC,QAAK,UAAU,QAAQ;AACvB,QAAK,aAAa,QAAQ,cAAc;;;CAI5C,MAAM,aAAa;EACjB,IAAI,SAAS,KAAK;EAClB,IAAI,WAAW,KAAK;AAEpB,MAAI,CAAC,QAAQ;GACX,MAAM,EAAE,UAAU,aAAa,MAAM,KAAK,SAAS;AAEnD,YAAS,MAAM,SAAS,KAAK,SAAS;AACtC,cAAW,SAAS,KAAK,SAAS;;EAOpC,MAAM,WAAW,IAAI,UAAU;AAC/B,WAAS,OAAO,SAAS,IAAI,KAAK,CAAC,OAAO,CAAC,EAAE,SAAS;AACtD,WAAS,OAAO,YAAY,KAAK,SAAS;AAC1C,OAAK,aAAa,SAAS,aAAa;AACtC,YAAS,OAAO,iBAAiB,SAAS;IAC1C;AACF,MAAI,KAAK,SACP,UAAS,OAAO,YAAY,KAAK,SAAS;AAE5C,MAAI,KAAK,gBAAgB,KACvB,UAAS,OAAO,eAAe,OAAO;AAExC,MAAI,KAAK,2BAA2B,KAClC,UAAS,OAAO,6BAA6B,OAAO;AAEtD,MAAI,KAAK,gBAAgB,KACvB,UAAS,OAAO,iBAAiB,OAAO;AAE1C,MAAI,KAAK,oBACP,UAAS,OACP,0BACA,KAAK,UAAU,KAAK,oBAAoB,CACzC;AAEH,MAAI,KAAK,eACP,UAAS,OAAO,qBAAqB,KAAK,eAAe;AAE3D,MAAI,KAAK,kBACP,UAAS,OAAO,uBAAuB,OAAO;AAEhD,MAAI,KAAK,iBACP,UAAS,OAAO,qBAAqB,KAAK,iBAAiB;AAE7D,MAAI,KAAK,sBAAsB,KAAA,EAC7B,UAAS,OACP,sBACA,KAAK,oBAAoB,SAAS,QACnC;AAEH,MAAI,KAAK,uBAAuB,KAAA,EAC9B,UAAS,OAAO,yBAAyB,OAAO,KAAK,mBAAmB,CAAC;AAE3E,MAAI,KAAK,mBAAmB,KAAA,EAC1B,UAAS,OAAO,qBAAqB,OAAO,KAAK,eAAe,CAAC;AAEnE,MAAI,KAAK,kBAAkB,KAAA,EACzB,UAAS,OAAO,kBAAkB,OAAO,KAAK,cAAc,CAAC;AAG/D,MAAI,KAAK,2BAA2B,KAAA,EAClC,UAAS,OACP,6BACA,KAAK,UAAU,KAAK,uBAAuB,CAC5C;AAGH,MAAI,KAAK,YAAY,KAAA,EACnB,UAAS,OAAO,WAAW,OAAO,KAAK,QAAQ,CAAC;AAGlD,MAAI,KAAK,eAAe,KACtB,UAAS,OAAO,eAAe,OAAO;EAGxC,MAAM,UAAU,EACd,wBAAwB,KAAK,UAAU,IACxC;EAED,MAAM,WAAW,MAAM,MAAM,KAAK,QAAQ;GACxC,QAAQ;GACR,MAAM;GACN;GACD,CAAC;AAEF,MAAI,CAAC,SAAS,GACZ,OAAM,IAAI,MACR,4BAA4B,KAAK,SAAS,cACxC,SAAS,OACV,eAAe,MAAM,SAAS,MAAM,GACtC;EAGH,MAAM,WAAW,MAAM,SAAS,MAAM;AACtC,MAAI,CAAC,MAAM,QAAQ,SAAS,CAC1B,OAAM,IAAI,MACR,6DAA6D,WAC9D;AAEH,SAAO,SAAS,QAAQ,OAAO,OAAO,GAAG,SAAS,SAAS;;CAG7D,MAAM,OAA4B;EAChC,MAAM,WAAW,MAAM,KAAK,YAAY;EAExC,MAAM,YAAwB,EAAE;AAChC,OAAK,MAAM,WAAW,UAAU;GAC9B,MAAM,EAAE,UAAU,SAAS;AAC3B,OAAI,OAAO,SAAS,YAAY,SAAS,GACvC,WAAU,KACR,IAAIC,0BAAAA,SAAS;IACX,aAAa;IACb,UAAU;KACR,GAAG;KACH,UAAU,QAAQ;KACnB;IACF,CAAC,CACH;;AAIL,SAAO;;CAGT,MAAM,UAGH;AACD,MAAI;GACF,MAAM,EAAE,aAAa,MAAM,OAAO;GAClC,MAAM,EAAE,aAAa,MAAM,OAAO;AAClC,UAAO;IAAE;IAAU;IAAU;WACtB,GAAG;AACV,WAAQ,MAAM,EAAE;AAChB,SAAM,IAAI,MACR,yHAAA,GAAA,0BAAA,SAAgI,CAAC,iDAClI;;;;;;;;;;;;;;;;;AAkBP,IAAa,8BAAb,cAAiDC,iDAAAA,gBAAgB;CAC/D,YACE,6BACA,8BACA,wBAAwB,MACxB,sBAAuCC,iDAAAA,gBAAgB,MACvD;EACA,IAAI;EACJ,IAAI;AAIJ,MADuB,OAAO,iCAAiC,UAC3C;AAClB,mBAAgB;AAChB,aAAU;IACR,QAAQ;IACR,WAAW;IACX,SAAS;IACV;SACI;AACL,mBAAgB;AAChB,aAAU;;EAEZ,MAAM,UAAU,MAAc,IAAI,mBAAmB,GAAG,QAAQ;EAChE,MAAM,UAAU,2BAA2B,QACxC,eAA+B,aAAqB;AACnD,iBAAc,YAAY;AAC1B,UAAO;KAET,EAAE,CACH;AACD,QAAM,eAAe,SAAS,QAAQ,WAAW,QAAQ,QAAQ"}