{"version":3,"file":"puppeteer.cjs","names":["BaseDocumentLoader","Document"],"sources":["../../../src/document_loaders/web/puppeteer.ts"],"sourcesContent":["import type {\n  launch,\n  WaitForOptions,\n  Page,\n  Browser,\n  PuppeteerLaunchOptions,\n  connect,\n  ConnectOptions,\n} from \"puppeteer\";\n\nimport { Document } from \"@langchain/core/documents\";\nimport { BaseDocumentLoader } from \"@langchain/core/document_loaders/base\";\nimport type { DocumentLoader } from \"@langchain/core/document_loaders/base\";\n\nexport { Page, Browser };\n\nexport type PuppeteerGotoOptions = WaitForOptions & {\n  referer?: string;\n  referrerPolicy?: string;\n};\n\n/**\n * Type representing a function for evaluating JavaScript code on a web\n * page using Puppeteer. It takes a Page and Browser object as parameters\n * and returns a Promise that resolves to a string.\n */\nexport type PuppeteerEvaluate = (\n  page: Page,\n  browser: Browser\n) => Promise<string>;\n\nexport type PuppeteerWebBaseLoaderOptions = {\n  launchOptions?: PuppeteerLaunchOptions & ConnectOptions;\n  gotoOptions?: PuppeteerGotoOptions;\n  evaluate?: PuppeteerEvaluate;\n};\n\n/**\n * Class that extends the BaseDocumentLoader class and implements the\n * DocumentLoader interface. It represents a document loader for scraping\n * web pages using Puppeteer.\n * @example\n * ```typescript\n * const loader = new PuppeteerWebBaseLoader(\"https:exampleurl.com\", {\n *   launchOptions: {\n *     headless: true,\n *   },\n *   gotoOptions: {\n *     waitUntil: \"domcontentloaded\",\n *   },\n * });\n * const screenshot = await loader.screenshot();\n * ```\n */\nexport class PuppeteerWebBaseLoader\n  extends BaseDocumentLoader\n  implements DocumentLoader\n{\n  options: PuppeteerWebBaseLoaderOptions | undefined;\n\n  constructor(\n    public webPath: string,\n    options?: PuppeteerWebBaseLoaderOptions\n  ) {\n    super();\n    this.options = options ?? undefined;\n  }\n\n  static async _scrape(\n    url: string,\n    options?: PuppeteerWebBaseLoaderOptions\n  ): Promise<string> {\n    const { launch, connect } = await PuppeteerWebBaseLoader.imports();\n\n    let browser: Browser;\n\n    if (options?.launchOptions?.browserWSEndpoint) {\n      browser = await connect({\n        browserWSEndpoint: options?.launchOptions?.browserWSEndpoint,\n      });\n    } else {\n      browser = await launch({\n        headless: true,\n        defaultViewport: null,\n        ignoreDefaultArgs: [\"--disable-extensions\"],\n        ...options?.launchOptions,\n      });\n    }\n    const page = await browser.newPage();\n\n    await page.goto(url, {\n      timeout: 180000,\n      waitUntil: \"domcontentloaded\",\n      ...options?.gotoOptions,\n    });\n    const bodyHTML = options?.evaluate\n      ? await options?.evaluate(page, browser)\n      : await page.evaluate(() => document.body.innerHTML);\n\n    await browser.close();\n\n    return bodyHTML;\n  }\n\n  /**\n   * Method that calls the _scrape method to perform the scraping of the web\n   * page specified by the webPath property.\n   * @returns Promise that resolves to the scraped HTML content of the web page.\n   */\n  async scrape(): Promise<string> {\n    return PuppeteerWebBaseLoader._scrape(this.webPath, this.options);\n  }\n\n  /**\n   * Method that calls the scrape method and returns the scraped HTML\n   * content as a Document object.\n   * @returns Promise that resolves to an array of Document objects.\n   */\n  async load(): Promise<Document[]> {\n    const text = await this.scrape();\n\n    const metadata = { source: this.webPath };\n    return [new Document({ pageContent: text, metadata })];\n  }\n\n  /**\n   * Static class method used to screenshot a web page and return\n   * it as a {@link Document} object where  the pageContent property\n   * is the screenshot encoded in base64.\n   *\n   * @param {string} url\n   * @param {PuppeteerWebBaseLoaderOptions} options\n   * @returns {Document} A document object containing the screenshot of the page encoded in base64.\n   */\n  static async _screenshot(\n    url: string,\n    options?: PuppeteerWebBaseLoaderOptions\n  ): Promise<Document> {\n    const { launch, connect } = await PuppeteerWebBaseLoader.imports();\n\n    let browser: Browser;\n    if (options?.launchOptions?.browserWSEndpoint) {\n      browser = await connect({\n        browserWSEndpoint: options?.launchOptions?.browserWSEndpoint,\n      });\n    } else {\n      browser = await launch({\n        headless: true,\n        defaultViewport: null,\n        ignoreDefaultArgs: [\"--disable-extensions\"],\n        ...options?.launchOptions,\n      });\n    }\n    const page = await browser.newPage();\n\n    await page.goto(url, {\n      timeout: 180000,\n      waitUntil: \"domcontentloaded\",\n      ...options?.gotoOptions,\n    });\n    const screenshot = await page.screenshot();\n    const base64 = screenshot.toString(\"base64\");\n    const metadata = { source: url };\n    return new Document({ pageContent: base64, metadata });\n  }\n\n  /**\n   * Screenshot a web page and return it as a {@link Document} object where\n   * the pageContent property is the screenshot encoded in base64.\n   *\n   * @returns {Promise<Document>} A document object containing the screenshot of the page encoded in base64.\n   */\n  async screenshot(): Promise<Document> {\n    return PuppeteerWebBaseLoader._screenshot(this.webPath, this.options);\n  }\n\n  /**\n   * Static method that imports the necessary Puppeteer modules. It returns\n   * a Promise that resolves to an object containing the imported modules.\n   * @returns Promise that resolves to an object containing the imported Puppeteer modules.\n   */\n  static async imports(): Promise<{\n    launch: typeof launch;\n    connect: typeof connect;\n  }> {\n    try {\n      const { launch, connect } = await import(\"puppeteer\");\n\n      return { launch, connect };\n    } catch (e) {\n      console.error(e);\n      throw new Error(\n        \"Please install puppeteer as a dependency with, e.g. `pnpm install puppeteer`\"\n      );\n    }\n  }\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;AAsDA,IAAa,yBAAb,MAAa,+BACHA,sCAAAA,mBAEV;CACE;CAEA,YACE,SACA,SACA;AACA,SAAO;AAHA,OAAA,UAAA;AAIP,OAAK,UAAU,WAAW,KAAA;;CAG5B,aAAa,QACX,KACA,SACiB;EACjB,MAAM,EAAE,QAAQ,YAAY,MAAM,uBAAuB,SAAS;EAElE,IAAI;AAEJ,MAAI,SAAS,eAAe,kBAC1B,WAAU,MAAM,QAAQ,EACtB,mBAAmB,SAAS,eAAe,mBAC5C,CAAC;MAEF,WAAU,MAAM,OAAO;GACrB,UAAU;GACV,iBAAiB;GACjB,mBAAmB,CAAC,uBAAuB;GAC3C,GAAG,SAAS;GACb,CAAC;EAEJ,MAAM,OAAO,MAAM,QAAQ,SAAS;AAEpC,QAAM,KAAK,KAAK,KAAK;GACnB,SAAS;GACT,WAAW;GACX,GAAG,SAAS;GACb,CAAC;EACF,MAAM,WAAW,SAAS,WACtB,MAAM,SAAS,SAAS,MAAM,QAAQ,GACtC,MAAM,KAAK,eAAe,SAAS,KAAK,UAAU;AAEtD,QAAM,QAAQ,OAAO;AAErB,SAAO;;;;;;;CAQT,MAAM,SAA0B;AAC9B,SAAO,uBAAuB,QAAQ,KAAK,SAAS,KAAK,QAAQ;;;;;;;CAQnE,MAAM,OAA4B;AAIhC,SAAO,CAAC,IAAIC,0BAAAA,SAAS;GAAE,aAHV,MAAM,KAAK,QAAQ;GAGU,UADzB,EAAE,QAAQ,KAAK,SAAS;GACW,CAAC,CAAC;;;;;;;;;;;CAYxD,aAAa,YACX,KACA,SACmB;EACnB,MAAM,EAAE,QAAQ,YAAY,MAAM,uBAAuB,SAAS;EAElE,IAAI;AACJ,MAAI,SAAS,eAAe,kBAC1B,WAAU,MAAM,QAAQ,EACtB,mBAAmB,SAAS,eAAe,mBAC5C,CAAC;MAEF,WAAU,MAAM,OAAO;GACrB,UAAU;GACV,iBAAiB;GACjB,mBAAmB,CAAC,uBAAuB;GAC3C,GAAG,SAAS;GACb,CAAC;EAEJ,MAAM,OAAO,MAAM,QAAQ,SAAS;AAEpC,QAAM,KAAK,KAAK,KAAK;GACnB,SAAS;GACT,WAAW;GACX,GAAG,SAAS;GACb,CAAC;AAIF,SAAO,IAAIA,0BAAAA,SAAS;GAAE,cAHH,MAAM,KAAK,YAAY,EAChB,SAAS,SAAS;GAED,UAD1B,EAAE,QAAQ,KAAK;GACqB,CAAC;;;;;;;;CASxD,MAAM,aAAgC;AACpC,SAAO,uBAAuB,YAAY,KAAK,SAAS,KAAK,QAAQ;;;;;;;CAQvE,aAAa,UAGV;AACD,MAAI;GACF,MAAM,EAAE,QAAQ,YAAY,MAAM,OAAO;AAEzC,UAAO;IAAE;IAAQ;IAAS;WACnB,GAAG;AACV,WAAQ,MAAM,EAAE;AAChB,SAAM,IAAI,MACR,+EACD"}