{"version":3,"file":"sitemap.cjs","names":["CheerioWebBaseLoader","Document"],"sources":["../../../src/document_loaders/web/sitemap.ts"],"sourcesContent":["import { Document, DocumentInterface } from \"@langchain/core/documents\";\nimport { chunkArray } from \"@langchain/core/utils/chunk_array\";\nimport { CheerioWebBaseLoader, CheerioWebBaseLoaderParams } from \"./cheerio.js\";\n\n/**\n * Interface representing the parameters for initializing a SitemapLoader.\n * @interface SitemapLoaderParams\n * @extends CheerioWebBaseLoaderParams\n */\nexport interface SitemapLoaderParams extends CheerioWebBaseLoaderParams {\n  /**\n   * @property {(string | RegExp)[] | undefined} filterUrls - A list of regexes. Only URLs that match one of the filter URLs will be loaded.\n   * WARNING: The filter URLs are interpreted as regular expressions. Escape special characters if needed.\n   */\n  filterUrls?: (string | RegExp)[];\n  /**\n   * The size to chunk the sitemap URLs into for scraping.\n   * @default {300}\n   */\n  chunkSize?: number;\n}\n\nconst DEFAULT_CHUNK_SIZE = 300;\n\ntype SiteMapElement = {\n  loc: string;\n  changefreq?: string;\n  lastmod?: string;\n  priority?: string;\n};\n\nexport class SitemapLoader\n  extends CheerioWebBaseLoader\n  implements SitemapLoaderParams\n{\n  allowUrlPatterns: (string | RegExp)[] | undefined;\n\n  chunkSize: number;\n\n  constructor(\n    public webPath: string,\n    params: SitemapLoaderParams = {}\n  ) {\n    const paramsWithDefaults = { chunkSize: DEFAULT_CHUNK_SIZE, ...params };\n    let path = webPath.endsWith(\"/\") ? webPath.slice(0, -1) : webPath;\n    // Allow for custom sitemap paths to be passed in with the url.\n    path = path.endsWith(\".xml\") ? path : `${path}/sitemap.xml`;\n    super(path, paramsWithDefaults);\n\n    this.webPath = path;\n    this.allowUrlPatterns = paramsWithDefaults.filterUrls;\n    this.chunkSize = paramsWithDefaults.chunkSize;\n  }\n\n  _checkUrlPatterns(url: string): boolean {\n    if (!this.allowUrlPatterns) {\n      return false;\n    }\n    return !this.allowUrlPatterns.some(\n      (pattern) => !new RegExp(pattern).test(url)\n    );\n  }\n\n  async parseSitemap() {\n    const $ = await CheerioWebBaseLoader._scrape(\n      this.webPath,\n      this.caller,\n      this.timeout,\n      this.textDecoder,\n      {\n        xmlMode: true,\n        xml: true,\n      }\n    );\n\n    const elements: Array<SiteMapElement> = [];\n\n    $(\"url\").each((_, element) => {\n      const loc = $(element).find(\"loc\").text();\n      if (!loc) {\n        return;\n      }\n\n      if (this._checkUrlPatterns(loc)) {\n        return;\n      }\n\n      const changefreq = $(element).find(\"changefreq\").text();\n      const lastmod = $(element).find(\"lastmod\").text();\n      const priority = $(element).find(\"priority\").text();\n\n      elements.push({ loc, changefreq, lastmod, priority });\n    });\n\n    $(\"sitemap\").each((_, element) => {\n      const loc = $(element).find(\"loc\").text();\n      if (!loc) {\n        return;\n      }\n      const changefreq = $(element).find(\"changefreq\").text();\n      const lastmod = $(element).find(\"lastmod\").text();\n      const priority = $(element).find(\"priority\").text();\n\n      elements.push({ loc, changefreq, lastmod, priority });\n    });\n\n    return elements;\n  }\n\n  async _loadSitemapUrls(\n    elements: Array<SiteMapElement>\n  ): Promise<DocumentInterface[]> {\n    const all = await CheerioWebBaseLoader.scrapeAll(\n      elements.map((ele) => ele.loc),\n      this.caller,\n      this.timeout,\n      this.textDecoder\n    );\n    const documents: Array<DocumentInterface> = all.map(($, i) => {\n      if (!elements[i]) {\n        throw new Error(\"Scraped docs and elements not in sync\");\n      }\n      const text = $(this.selector).text();\n      const { loc: source, ...metadata } = elements[i];\n\n      // extract page metadata\n      const description = $(\"meta[name='description']\").attr(\"content\");\n      const title = $(\"meta[property='og:title']\").attr(\"content\");\n      const lang = $(\"meta[property='og:locale']\").attr(\"content\");\n\n      return new Document({\n        pageContent: text,\n        metadata: {\n          ...metadata,\n          description,\n          title,\n          lang,\n          source: source.trim(),\n        },\n      });\n    });\n    return documents;\n  }\n\n  async load(): Promise<Document[]> {\n    const elements = await this.parseSitemap();\n    const chunks = chunkArray(elements, this.chunkSize);\n\n    const documents: DocumentInterface[] = [];\n    for await (const chunk of chunks) {\n      const chunkedDocuments = await this._loadSitemapUrls(chunk);\n      documents.push(...chunkedDocuments);\n    }\n    return documents;\n  }\n}\n"],"mappings":";;;;;;;AAsBA,MAAM,qBAAqB;AAS3B,IAAa,gBAAb,cACUA,qCAAAA,qBAEV;CACE;CAEA;CAEA,YACE,SACA,SAA8B,EAAE,EAChC;EACA,MAAM,qBAAqB;GAAE,WAAW;GAAoB,GAAG;GAAQ;EACvE,IAAI,OAAO,QAAQ,SAAS,IAAI,GAAG,QAAQ,MAAM,GAAG,GAAG,GAAG;AAE1D,SAAO,KAAK,SAAS,OAAO,GAAG,OAAO,GAAG,KAAK;AAC9C,QAAM,MAAM,mBAAmB;AAPxB,OAAA,UAAA;AASP,OAAK,UAAU;AACf,OAAK,mBAAmB,mBAAmB;AAC3C,OAAK,YAAY,mBAAmB;;CAGtC,kBAAkB,KAAsB;AACtC,MAAI,CAAC,KAAK,iBACR,QAAO;AAET,SAAO,CAAC,KAAK,iBAAiB,MAC3B,YAAY,CAAC,IAAI,OAAO,QAAQ,CAAC,KAAK,IAAI,CAC5C;;CAGH,MAAM,eAAe;EACnB,MAAM,IAAI,MAAMA,qCAAAA,qBAAqB,QACnC,KAAK,SACL,KAAK,QACL,KAAK,SACL,KAAK,aACL;GACE,SAAS;GACT,KAAK;GACN,CACF;EAED,MAAM,WAAkC,EAAE;AAE1C,IAAE,MAAM,CAAC,MAAM,GAAG,YAAY;GAC5B,MAAM,MAAM,EAAE,QAAQ,CAAC,KAAK,MAAM,CAAC,MAAM;AACzC,OAAI,CAAC,IACH;AAGF,OAAI,KAAK,kBAAkB,IAAI,CAC7B;GAGF,MAAM,aAAa,EAAE,QAAQ,CAAC,KAAK,aAAa,CAAC,MAAM;GACvD,MAAM,UAAU,EAAE,QAAQ,CAAC,KAAK,UAAU,CAAC,MAAM;GACjD,MAAM,WAAW,EAAE,QAAQ,CAAC,KAAK,WAAW,CAAC,MAAM;AAEnD,YAAS,KAAK;IAAE;IAAK;IAAY;IAAS;IAAU,CAAC;IACrD;AAEF,IAAE,UAAU,CAAC,MAAM,GAAG,YAAY;GAChC,MAAM,MAAM,EAAE,QAAQ,CAAC,KAAK,MAAM,CAAC,MAAM;AACzC,OAAI,CAAC,IACH;GAEF,MAAM,aAAa,EAAE,QAAQ,CAAC,KAAK,aAAa,CAAC,MAAM;GACvD,MAAM,UAAU,EAAE,QAAQ,CAAC,KAAK,UAAU,CAAC,MAAM;GACjD,MAAM,WAAW,EAAE,QAAQ,CAAC,KAAK,WAAW,CAAC,MAAM;AAEnD,YAAS,KAAK;IAAE;IAAK;IAAY;IAAS;IAAU,CAAC;IACrD;AAEF,SAAO;;CAGT,MAAM,iBACJ,UAC8B;AA8B9B,UA7BY,MAAMA,qCAAAA,qBAAqB,UACrC,SAAS,KAAK,QAAQ,IAAI,IAAI,EAC9B,KAAK,QACL,KAAK,SACL,KAAK,YACN,EAC+C,KAAK,GAAG,MAAM;AAC5D,OAAI,CAAC,SAAS,GACZ,OAAM,IAAI,MAAM,wCAAwC;GAE1D,MAAM,OAAO,EAAE,KAAK,SAAS,CAAC,MAAM;GACpC,MAAM,EAAE,KAAK,QAAQ,GAAG,aAAa,SAAS;GAG9C,MAAM,cAAc,EAAE,2BAA2B,CAAC,KAAK,UAAU;GACjE,MAAM,QAAQ,EAAE,4BAA4B,CAAC,KAAK,UAAU;GAC5D,MAAM,OAAO,EAAE,6BAA6B,CAAC,KAAK,UAAU;AAE5D,UAAO,IAAIC,0BAAAA,SAAS;IAClB,aAAa;IACb,UAAU;KACR,GAAG;KACH;KACA;KACA;KACA,QAAQ,OAAO,MAAM;KACtB;IACF,CAAC;IACF;;CAIJ,MAAM,OAA4B;EAEhC,MAAM,UAAA,GAAA,kCAAA,YADW,MAAM,KAAK,cAAc,EACN,KAAK,UAAU;EAEnD,MAAM,YAAiC,EAAE;AACzC,aAAW,MAAM,SAAS,QAAQ;GAChC,MAAM,mBAAmB,MAAM,KAAK,iBAAiB,MAAM;AAC3D,aAAU,KAAK,GAAG,iBAAiB;;AAErC,SAAO"}