{"version":3,"file":"pdf.cjs","names":["BaseDocumentLoader","Document"],"sources":["../../../src/document_loaders/web/pdf.ts"],"sourcesContent":["import { Document } from \"@langchain/core/documents\";\nimport { BaseDocumentLoader } from \"@langchain/core/document_loaders/base\";\n\ntype PDFLoaderV1Imports = {\n  isV2: false;\n  getDocument: typeof import(\"pdf-parse/lib/pdf.js/v1.10.100/build/pdf.js\").getDocument;\n  version: typeof import(\"pdf-parse/lib/pdf.js/v1.10.100/build/pdf.js\").version;\n};\n\ntype PDFLoaderV2Imports = {\n  isV2: true;\n  PDFParse: typeof import(\"pdf-parse\").PDFParse;\n};\n\ntype PDFLoaderImportsResult = PDFLoaderV1Imports | PDFLoaderV2Imports;\n\nconst PDF_PARSE_V1_IMPORT_PATH = \"pdf-parse/lib/pdf.js/v1.10.100/build/pdf.js\";\n\n/**\n * A document loader for loading data from PDFs.\n * @example\n * ```typescript\n * const loader = new WebPDFLoader(new Blob());\n * const docs = await loader.load();\n * console.log({ docs });\n * ```\n */\nexport class WebPDFLoader extends BaseDocumentLoader {\n  protected blob: Blob;\n\n  protected splitPages = true;\n\n  private pdfjs: typeof PDFLoaderImports;\n\n  protected parsedItemSeparator: string;\n\n  constructor(\n    blob: Blob,\n    {\n      splitPages = true,\n      pdfjs = PDFLoaderImports,\n      parsedItemSeparator = \"\",\n    } = {}\n  ) {\n    super();\n    this.blob = blob;\n    this.splitPages = splitPages ?? this.splitPages;\n    this.pdfjs = pdfjs;\n    this.parsedItemSeparator = parsedItemSeparator;\n  }\n\n  /**\n   * Loads the contents of the PDF as documents.\n   * @returns An array of Documents representing the retrieved data.\n   */\n  async load(): Promise<Document[]> {\n    const raw = new Uint8Array(await this.blob.arrayBuffer());\n    const pdfjsResult = await this.pdfjs();\n\n    if (pdfjsResult.isV2) {\n      return this.parseWithV2(raw, pdfjsResult.PDFParse);\n    }\n\n    const { getDocument, version } = pdfjsResult;\n    const parsedPdf = await getDocument({\n      data: raw,\n      useWorkerFetch: false,\n      isEvalSupported: false,\n      useSystemFonts: true,\n    }).promise;\n    const meta = await parsedPdf.getMetadata().catch(() => null);\n\n    const documents: Document[] = [];\n\n    for (let i = 1; i <= parsedPdf.numPages; i += 1) {\n      const page = await parsedPdf.getPage(i);\n      const content = await page.getTextContent();\n\n      if (content.items.length === 0) {\n        continue;\n      }\n\n      // Eliminate excessive newlines\n      // Source: https://github.com/albertcui/pdf-parse/blob/7086fc1cc9058545cdf41dd0646d6ae5832c7107/lib/pdf-parse.js#L16\n      let lastY;\n      const textItems = [];\n      for (const item of content.items) {\n        if (\"str\" in item) {\n          if (lastY === item.transform[5] || !lastY) {\n            textItems.push(item.str);\n          } else {\n            textItems.push(`\\n${item.str}`);\n          }\n          lastY = item.transform[5];\n        }\n      }\n      const text = textItems.join(this.parsedItemSeparator);\n\n      documents.push(\n        new Document({\n          pageContent: text,\n          metadata: {\n            pdf: {\n              version,\n              info: meta?.info,\n              metadata: meta?.metadata,\n              totalPages: parsedPdf.numPages,\n            },\n            loc: {\n              pageNumber: i,\n            },\n          },\n        })\n      );\n    }\n\n    if (this.splitPages) {\n      return documents;\n    }\n\n    if (documents.length === 0) {\n      return [];\n    }\n\n    return [\n      new Document({\n        pageContent: documents.map((doc) => doc.pageContent).join(\"\\n\\n\"),\n        metadata: {\n          pdf: {\n            version,\n            info: meta?.info,\n            metadata: meta?.metadata,\n            totalPages: parsedPdf.numPages,\n          },\n        },\n      }),\n    ];\n  }\n\n  private async parseWithV2(\n    raw: Uint8Array,\n    PDFParseClass: typeof import(\"pdf-parse\").PDFParse\n  ): Promise<Document[]> {\n    const parser = new PDFParseClass({ data: raw });\n\n    try {\n      const textResult = await parser.getText();\n      const infoResult = await parser.getInfo();\n\n      const documents: Document[] = [];\n\n      for (const page of textResult.pages) {\n        if (!page.text || page.text.trim().length === 0) {\n          continue;\n        }\n\n        documents.push(\n          new Document({\n            pageContent: page.text,\n            metadata: {\n              pdf: {\n                version: infoResult.metadata?.format || \"unknown\",\n                info: infoResult.info,\n                metadata: infoResult.metadata,\n                totalPages: textResult.total,\n              },\n              loc: {\n                pageNumber: page.num,\n              },\n            },\n          })\n        );\n      }\n\n      if (this.splitPages) {\n        return documents;\n      }\n\n      if (documents.length === 0) {\n        return [];\n      }\n\n      return [\n        new Document({\n          pageContent: documents.map((doc) => doc.pageContent).join(\"\\n\\n\"),\n          metadata: {\n            pdf: {\n              version: infoResult.metadata?.format || \"unknown\",\n              info: infoResult.info,\n              metadata: infoResult.metadata,\n              totalPages: textResult.total,\n            },\n          },\n        }),\n      ];\n    } finally {\n      await parser.destroy();\n    }\n  }\n}\n\nasync function PDFLoaderImports(): Promise<PDFLoaderImportsResult> {\n  try {\n    const pdfParseModule = await import(\"pdf-parse\");\n    if (\"PDFParse\" in pdfParseModule) {\n      return { isV2: true as const, PDFParse: pdfParseModule.PDFParse };\n    }\n  } catch {\n    // Fall back to the pdf-parse v1 import path below.\n  }\n\n  try {\n    const { default: mod } = await import(\n      /* @vite-ignore */ PDF_PARSE_V1_IMPORT_PATH\n    );\n    const { getDocument, version } = mod;\n    return { isV2: false as const, getDocument, version };\n  } catch (e) {\n    console.error(e);\n    throw new Error(\n      \"Failed to load pdf-parse. Please install pdf-parse v1 or v2, e.g. `npm install pdf-parse@^1` or `npm install pdf-parse@^2`.\"\n    );\n  }\n}\n"],"mappings":";;;;;;AAgBA,MAAM,2BAA2B;;;;;;;;;;AAWjC,IAAa,eAAb,cAAkCA,sCAAAA,mBAAmB;CACnD;CAEA,aAAuB;CAEvB;CAEA;CAEA,YACE,MACA,EACE,aAAa,MACb,QAAQ,kBACR,sBAAsB,OACpB,EAAE,EACN;AACA,SAAO;AACP,OAAK,OAAO;AACZ,OAAK,aAAa,cAAc,KAAK;AACrC,OAAK,QAAQ;AACb,OAAK,sBAAsB;;;;;;CAO7B,MAAM,OAA4B;EAChC,MAAM,MAAM,IAAI,WAAW,MAAM,KAAK,KAAK,aAAa,CAAC;EACzD,MAAM,cAAc,MAAM,KAAK,OAAO;AAEtC,MAAI,YAAY,KACd,QAAO,KAAK,YAAY,KAAK,YAAY,SAAS;EAGpD,MAAM,EAAE,aAAa,YAAY;EACjC,MAAM,YAAY,MAAM,YAAY;GAClC,MAAM;GACN,gBAAgB;GAChB,iBAAiB;GACjB,gBAAgB;GACjB,CAAC,CAAC;EACH,MAAM,OAAO,MAAM,UAAU,aAAa,CAAC,YAAY,KAAK;EAE5D,MAAM,YAAwB,EAAE;AAEhC,OAAK,IAAI,IAAI,GAAG,KAAK,UAAU,UAAU,KAAK,GAAG;GAE/C,MAAM,UAAU,OADH,MAAM,UAAU,QAAQ,EAAE,EACZ,gBAAgB;AAE3C,OAAI,QAAQ,MAAM,WAAW,EAC3B;GAKF,IAAI;GACJ,MAAM,YAAY,EAAE;AACpB,QAAK,MAAM,QAAQ,QAAQ,MACzB,KAAI,SAAS,MAAM;AACjB,QAAI,UAAU,KAAK,UAAU,MAAM,CAAC,MAClC,WAAU,KAAK,KAAK,IAAI;QAExB,WAAU,KAAK,KAAK,KAAK,MAAM;AAEjC,YAAQ,KAAK,UAAU;;GAG3B,MAAM,OAAO,UAAU,KAAK,KAAK,oBAAoB;AAErD,aAAU,KACR,IAAIC,0BAAAA,SAAS;IACX,aAAa;IACb,UAAU;KACR,KAAK;MACH;MACA,MAAM,MAAM;MACZ,UAAU,MAAM;MAChB,YAAY,UAAU;MACvB;KACD,KAAK,EACH,YAAY,GACb;KACF;IACF,CAAC,CACH;;AAGH,MAAI,KAAK,WACP,QAAO;AAGT,MAAI,UAAU,WAAW,EACvB,QAAO,EAAE;AAGX,SAAO,CACL,IAAIA,0BAAAA,SAAS;GACX,aAAa,UAAU,KAAK,QAAQ,IAAI,YAAY,CAAC,KAAK,OAAO;GACjE,UAAU,EACR,KAAK;IACH;IACA,MAAM,MAAM;IACZ,UAAU,MAAM;IAChB,YAAY,UAAU;IACvB,EACF;GACF,CAAC,CACH;;CAGH,MAAc,YACZ,KACA,eACqB;EACrB,MAAM,SAAS,IAAI,cAAc,EAAE,MAAM,KAAK,CAAC;AAE/C,MAAI;GACF,MAAM,aAAa,MAAM,OAAO,SAAS;GACzC,MAAM,aAAa,MAAM,OAAO,SAAS;GAEzC,MAAM,YAAwB,EAAE;AAEhC,QAAK,MAAM,QAAQ,WAAW,OAAO;AACnC,QAAI,CAAC,KAAK,QAAQ,KAAK,KAAK,MAAM,CAAC,WAAW,EAC5C;AAGF,cAAU,KACR,IAAIA,0BAAAA,SAAS;KACX,aAAa,KAAK;KAClB,UAAU;MACR,KAAK;OACH,SAAS,WAAW,UAAU,UAAU;OACxC,MAAM,WAAW;OACjB,UAAU,WAAW;OACrB,YAAY,WAAW;OACxB;MACD,KAAK,EACH,YAAY,KAAK,KAClB;MACF;KACF,CAAC,CACH;;AAGH,OAAI,KAAK,WACP,QAAO;AAGT,OAAI,UAAU,WAAW,EACvB,QAAO,EAAE;AAGX,UAAO,CACL,IAAIA,0BAAAA,SAAS;IACX,aAAa,UAAU,KAAK,QAAQ,IAAI,YAAY,CAAC,KAAK,OAAO;IACjE,UAAU,EACR,KAAK;KACH,SAAS,WAAW,UAAU,UAAU;KACxC,MAAM,WAAW;KACjB,UAAU,WAAW;KACrB,YAAY,WAAW;KACxB,EACF;IACF,CAAC,CACH;YACO;AACR,SAAM,OAAO,SAAS;;;;AAK5B,eAAe,mBAAoD;AACjE,KAAI;EACF,MAAM,iBAAiB,MAAM,OAAO;AACpC,MAAI,cAAc,eAChB,QAAO;GAAE,MAAM;GAAe,UAAU,eAAe;GAAU;SAE7D;AAIR,KAAI;EACF,MAAM,EAAE,SAAS,QAAQ,MAAM;;GACV;;EAErB,MAAM,EAAE,aAAa,YAAY;AACjC,SAAO;GAAE,MAAM;GAAgB;GAAa;GAAS;UAC9C,GAAG;AACV,UAAQ,MAAM,EAAE;AAChB,QAAM,IAAI,MACR,8HACD"}