{"version":3,"file":"gitbook.cjs","names":["CheerioWebBaseLoader","Document"],"sources":["../../../src/document_loaders/web/gitbook.ts"],"sourcesContent":["import type { CheerioAPI } from \"cheerio\";\nimport { Document } from \"@langchain/core/documents\";\nimport { CheerioWebBaseLoader } from \"./cheerio.js\";\n\n/**\n * Interface representing the parameters for configuring the\n * GitbookLoader. It has an optional property shouldLoadAllPaths, which\n * indicates whether all paths should be loaded.\n */\ninterface GitbookLoaderParams {\n  shouldLoadAllPaths?: boolean;\n}\n\n/**\n * Class representing a document loader specifically designed for loading\n * documents from Gitbook. It extends the CheerioWebBaseLoader.\n */\nexport class GitbookLoader extends CheerioWebBaseLoader {\n  shouldLoadAllPaths = false;\n\n  private readonly baseUrl: string;\n\n  constructor(\n    public webPath: string,\n    params: GitbookLoaderParams = {}\n  ) {\n    const path =\n      params.shouldLoadAllPaths === true ? `${webPath}/sitemap.xml` : webPath;\n    super(path);\n\n    this.baseUrl = webPath;\n    this.webPath = path;\n\n    this.shouldLoadAllPaths =\n      params.shouldLoadAllPaths ?? this.shouldLoadAllPaths;\n  }\n\n  /**\n   * Method that scrapes the web document using Cheerio and loads the\n   * content based on the value of shouldLoadAllPaths. If shouldLoadAllPaths\n   * is true, it calls the loadAllPaths() method to load all paths.\n   * Otherwise, it calls the loadPath() method to load a single path.\n   * @returns Promise resolving to an array of Document instances.\n   */\n  public async load(): Promise<Document[]> {\n    const $ = await this.scrape();\n\n    if (this.shouldLoadAllPaths === true) {\n      return this.loadAllPaths($);\n    }\n    return this.loadPath($);\n  }\n\n  /**\n   * Private method that loads the content of a single path from the Gitbook\n   * web document. It extracts the page content by selecting all elements\n   * inside the \"main\" element, filters out empty text nodes, and joins the\n   * remaining text nodes with line breaks. It extracts the title by\n   * selecting the first \"h1\" element inside the \"main\" element. It creates\n   * a Document instance with the extracted page content and metadata\n   * containing the source URL and title.\n   * @param $ CheerioAPI instance representing the loaded web document.\n   * @param url Optional string representing the URL of the web document.\n   * @returns Array of Document instances.\n   */\n  private loadPath($: CheerioAPI, url?: string): Document[] {\n    const pageContent = $(\"main *\")\n      .contents()\n      .toArray()\n      .map((element) =>\n        element.type === \"text\" ? $(element).text().trim() : null\n      )\n      .filter((text) => text)\n      .join(\"\\n\");\n\n    const title = $(\"main h1\").first().text().trim();\n\n    return [\n      new Document({\n        pageContent,\n        metadata: { source: url ?? this.webPath, title },\n      }),\n    ];\n  }\n\n  /**\n   * Private method that loads the content of all paths from the Gitbook web\n   * document. It extracts the URLs of all paths from the \"loc\" elements in\n   * the sitemap.xml. It iterates over each URL, scrapes the web document\n   * using the _scrape() method, and calls the loadPath() method to load the\n   * content of each path. It collects all the loaded documents and returns\n   * them as an array.\n   * @param $ CheerioAPI instance representing the loaded web document.\n   * @returns Promise resolving to an array of Document instances.\n   */\n  private async loadAllPaths($: CheerioAPI): Promise<Document[]> {\n    const urls = $(\"loc\")\n      .toArray()\n      .map((element) => $(element).text());\n\n    const documents: Document[] = [];\n    for (const url of urls) {\n      const buildUrl = url.includes(this.baseUrl) ? url : this.baseUrl + url;\n      console.log(`Fetching text from ${buildUrl}`);\n      const html = await GitbookLoader._scrape(\n        buildUrl,\n        this.caller,\n        this.timeout\n      );\n      documents.push(...this.loadPath(html, buildUrl));\n    }\n    console.log(`Fetched ${documents.length} documents.`);\n    return documents;\n  }\n}\n"],"mappings":";;;;;;;;;;AAiBA,IAAa,gBAAb,MAAa,sBAAsBA,qCAAAA,qBAAqB;CACtD,qBAAqB;CAErB;CAEA,YACE,SACA,SAA8B,EAAE,EAChC;EACA,MAAM,OACJ,OAAO,uBAAuB,OAAO,GAAG,QAAQ,gBAAgB;AAClE,QAAM,KAAK;AALJ,OAAA,UAAA;AAOP,OAAK,UAAU;AACf,OAAK,UAAU;AAEf,OAAK,qBACH,OAAO,sBAAsB,KAAK;;;;;;;;;CAUtC,MAAa,OAA4B;EACvC,MAAM,IAAI,MAAM,KAAK,QAAQ;AAE7B,MAAI,KAAK,uBAAuB,KAC9B,QAAO,KAAK,aAAa,EAAE;AAE7B,SAAO,KAAK,SAAS,EAAE;;;;;;;;;;;;;;CAezB,SAAiB,GAAe,KAA0B;EACxD,MAAM,cAAc,EAAE,SAAS,CAC5B,UAAU,CACV,SAAS,CACT,KAAK,YACJ,QAAQ,SAAS,SAAS,EAAE,QAAQ,CAAC,MAAM,CAAC,MAAM,GAAG,KACtD,CACA,QAAQ,SAAS,KAAK,CACtB,KAAK,KAAK;EAEb,MAAM,QAAQ,EAAE,UAAU,CAAC,OAAO,CAAC,MAAM,CAAC,MAAM;AAEhD,SAAO,CACL,IAAIC,0BAAAA,SAAS;GACX;GACA,UAAU;IAAE,QAAQ,OAAO,KAAK;IAAS;IAAO;GACjD,CAAC,CACH;;;;;;;;;;;;CAaH,MAAc,aAAa,GAAoC;EAC7D,MAAM,OAAO,EAAE,MAAM,CAClB,SAAS,CACT,KAAK,YAAY,EAAE,QAAQ,CAAC,MAAM,CAAC;EAEtC,MAAM,YAAwB,EAAE;AAChC,OAAK,MAAM,OAAO,MAAM;GACtB,MAAM,WAAW,IAAI,SAAS,KAAK,QAAQ,GAAG,MAAM,KAAK,UAAU;AACnE,WAAQ,IAAI,sBAAsB,WAAW;GAC7C,MAAM,OAAO,MAAM,cAAc,QAC/B,UACA,KAAK,QACL,KAAK,QACN;AACD,aAAU,KAAK,GAAG,KAAK,SAAS,MAAM,SAAS,CAAC;;AAElD,UAAQ,IAAI,WAAW,UAAU,OAAO,aAAa;AACrD,SAAO"}