{"version":3,"file":"cheerio.cjs","names":["BaseDocumentLoader","AsyncCaller","Document"],"sources":["../../../src/document_loaders/web/cheerio.ts"],"sourcesContent":["import type {\n  CheerioAPI,\n  CheerioOptions,\n  load as LoadT,\n  SelectorType,\n} from \"cheerio\";\nimport { Document } from \"@langchain/core/documents\";\nimport { AsyncCaller } from \"@langchain/core/utils/async_caller\";\nimport { BaseDocumentLoader } from \"@langchain/core/document_loaders/base\";\nimport type { WebBaseLoaderParams, WebBaseLoader } from \"./html.js\";\n\n/**\n * Represents the parameters for configuring the CheerioWebBaseLoader. It\n * extends the WebBaseLoaderParams interface and adds additional parameters\n * specific to loading with Cheerio.\n */\nexport interface CheerioWebBaseLoaderParams extends WebBaseLoaderParams {\n  /**\n   * The selector to use to extract the text from the document. Defaults to\n   * \"body\".\n   */\n  selector?: SelectorType;\n}\n\n/**\n * A class that extends the BaseDocumentLoader and implements the\n * DocumentLoader interface. It represents a document loader for loading\n * web-based documents using Cheerio.\n * @example\n * ```typescript\n * const loader = new CheerioWebBaseLoader(\"https://exampleurl.com\");\n * const docs = await loader.load();\n * console.log({ docs });\n * ```\n */\nexport class CheerioWebBaseLoader\n  extends BaseDocumentLoader\n  implements WebBaseLoader\n{\n  timeout: number;\n\n  caller: AsyncCaller;\n\n  selector?: SelectorType;\n\n  textDecoder?: TextDecoder;\n\n  headers?: HeadersInit;\n\n  constructor(\n    public webPath: string,\n    fields?: CheerioWebBaseLoaderParams\n  ) {\n    super();\n    const { timeout, selector, textDecoder, headers, ...rest } = fields ?? {};\n    this.timeout = timeout ?? 10000;\n    this.caller = new AsyncCaller(rest);\n    this.selector = selector ?? \"body\";\n    this.textDecoder = textDecoder;\n    this.headers = headers;\n  }\n\n  /**\n   * Fetches web documents from the given array of URLs and loads them using Cheerio.\n   * It returns an array of CheerioAPI instances.\n   * @param urls An array of URLs to fetch and load.\n   * @returns A Promise that resolves to an array of CheerioAPI instances.\n   */\n  static async scrapeAll(\n    urls: string[],\n    caller: AsyncCaller,\n    timeout: number | undefined,\n    textDecoder?: TextDecoder,\n    options?: CheerioOptions & {\n      headers?: HeadersInit;\n    }\n  ): Promise<CheerioAPI[]> {\n    return Promise.all(\n      urls.map((url) =>\n        CheerioWebBaseLoader._scrape(url, caller, timeout, textDecoder, options)\n      )\n    );\n  }\n\n  static async _scrape(\n    url: string,\n    caller: AsyncCaller,\n    timeout: number | undefined,\n    textDecoder?: TextDecoder,\n    options?: CheerioOptions & {\n      headers?: HeadersInit;\n    }\n  ): Promise<CheerioAPI> {\n    const { headers, ...cheerioOptions } = options ?? {};\n    const { load } = await CheerioWebBaseLoader.imports();\n    const response = await caller.call(fetch, url, {\n      signal: timeout ? AbortSignal.timeout(timeout) : undefined,\n      headers,\n    });\n    const html =\n      textDecoder?.decode(await response.arrayBuffer()) ??\n      (await response.text());\n    return load(html, cheerioOptions);\n  }\n\n  /**\n   * Fetches the web document from the webPath and loads it using Cheerio.\n   * It returns a CheerioAPI instance.\n   * @returns A Promise that resolves to a CheerioAPI instance.\n   */\n  async scrape(): Promise<CheerioAPI> {\n    const options = { headers: this.headers };\n    return CheerioWebBaseLoader._scrape(\n      this.webPath,\n      this.caller,\n      this.timeout,\n      this.textDecoder,\n      options\n    );\n  }\n\n  /**\n   * Extracts the text content from the loaded document using the selector\n   * and creates a Document instance with the extracted text and metadata.\n   * It returns an array of Document instances.\n   * @returns A Promise that resolves to an array of Document instances.\n   */\n  async load(): Promise<Document[]> {\n    const $ = await this.scrape();\n    const title = $(\"title\").text();\n    const text = $(this.selector).text();\n    const metadata = { source: this.webPath, title };\n    return [new Document({ pageContent: text, metadata })];\n  }\n\n  /**\n   * A static method that dynamically imports the Cheerio library and\n   * returns the load function. If the import fails, it throws an error.\n   * @returns A Promise that resolves to an object containing the load function from the Cheerio library.\n   */\n  static async imports(): Promise<{\n    load: typeof LoadT;\n  }> {\n    try {\n      const { load } = await import(\"cheerio\");\n      return { load };\n    } catch (e) {\n      console.error(e);\n      throw new Error(\n        \"Please install cheerio as a dependency with, e.g. `pnpm install cheerio`\"\n      );\n    }\n  }\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;AAmCA,IAAa,uBAAb,MAAa,6BACHA,sCAAAA,mBAEV;CACE;CAEA;CAEA;CAEA;CAEA;CAEA,YACE,SACA,QACA;AACA,SAAO;AAHA,OAAA,UAAA;EAIP,MAAM,EAAE,SAAS,UAAU,aAAa,SAAS,GAAG,SAAS,UAAU,EAAE;AACzE,OAAK,UAAU,WAAW;AAC1B,OAAK,SAAS,IAAIC,mCAAAA,YAAY,KAAK;AACnC,OAAK,WAAW,YAAY;AAC5B,OAAK,cAAc;AACnB,OAAK,UAAU;;;;;;;;CASjB,aAAa,UACX,MACA,QACA,SACA,aACA,SAGuB;AACvB,SAAO,QAAQ,IACb,KAAK,KAAK,QACR,qBAAqB,QAAQ,KAAK,QAAQ,SAAS,aAAa,QAAQ,CACzE,CACF;;CAGH,aAAa,QACX,KACA,QACA,SACA,aACA,SAGqB;EACrB,MAAM,EAAE,SAAS,GAAG,mBAAmB,WAAW,EAAE;EACpD,MAAM,EAAE,SAAS,MAAM,qBAAqB,SAAS;EACrD,MAAM,WAAW,MAAM,OAAO,KAAK,OAAO,KAAK;GAC7C,QAAQ,UAAU,YAAY,QAAQ,QAAQ,GAAG,KAAA;GACjD;GACD,CAAC;AAIF,SAAO,KAFL,aAAa,OAAO,MAAM,SAAS,aAAa,CAAC,IAChD,MAAM,SAAS,MAAM,EACN,eAAe;;;;;;;CAQnC,MAAM,SAA8B;EAClC,MAAM,UAAU,EAAE,SAAS,KAAK,SAAS;AACzC,SAAO,qBAAqB,QAC1B,KAAK,SACL,KAAK,QACL,KAAK,SACL,KAAK,aACL,QACD;;;;;;;;CASH,MAAM,OAA4B;EAChC,MAAM,IAAI,MAAM,KAAK,QAAQ;EAC7B,MAAM,QAAQ,EAAE,QAAQ,CAAC,MAAM;AAG/B,SAAO,CAAC,IAAIC,0BAAAA,SAAS;GAAE,aAFV,EAAE,KAAK,SAAS,CAAC,MAAM;GAEM,UADzB;IAAE,QAAQ,KAAK;IAAS;IAAO;GACI,CAAC,CAAC;;;;;;;CAQxD,aAAa,UAEV;AACD,MAAI;GACF,MAAM,EAAE,SAAS,MAAM,OAAO;AAC9B,UAAO,EAAE,MAAM;WACR,GAAG;AACV,WAAQ,MAAM,EAAE;AAChB,SAAM,IAAI,MACR,2EACD"}