{"version":3,"file":"recursive_url.cjs","names":["VirtualConsole","BaseDocumentLoader","AsyncCaller","JSDOM"],"sources":["../../../src/document_loaders/web/recursive_url.ts"],"sourcesContent":["import { JSDOM, VirtualConsole } from \"jsdom\";\nimport { Document } from \"@langchain/core/documents\";\nimport { AsyncCaller } from \"@langchain/core/utils/async_caller\";\nimport { isSameOrigin, validateSafeUrl } from \"@langchain/core/utils/ssrf\";\nimport {\n  BaseDocumentLoader,\n  DocumentLoader,\n} from \"@langchain/core/document_loaders/base\";\n\nconst virtualConsole = new VirtualConsole();\nvirtualConsole.on(\"error\", () => {});\n\nconst MAX_REDIRECTS = 10;\nconst REDIRECT_CODES = new Set([301, 302, 303, 307, 308]);\n\nexport interface RecursiveUrlLoaderOptions {\n  excludeDirs?: string[];\n  extractor?: (text: string) => string;\n  maxDepth?: number;\n  timeout?: number;\n  preventOutside?: boolean;\n  callerOptions?: ConstructorParameters<typeof AsyncCaller>[0];\n}\n\nexport class RecursiveUrlLoader\n  extends BaseDocumentLoader\n  implements DocumentLoader\n{\n  private caller: AsyncCaller;\n\n  private url: string;\n\n  private excludeDirs: string[];\n\n  private extractor: (text: string) => string;\n\n  private maxDepth: number;\n\n  private timeout: number;\n\n  private preventOutside: boolean;\n\n  constructor(url: string, options: RecursiveUrlLoaderOptions) {\n    super();\n\n    this.caller = new AsyncCaller({\n      maxConcurrency: 64,\n      maxRetries: 0,\n      ...options.callerOptions,\n    });\n\n    this.url = url;\n    this.excludeDirs = options.excludeDirs ?? [];\n    this.extractor = options.extractor ?? ((s: string) => s);\n    this.maxDepth = options.maxDepth ?? 2;\n    this.timeout = options.timeout ?? 10000;\n    this.preventOutside = options.preventOutside ?? true;\n  }\n\n  private async fetchWithTimeout(\n    resource: string,\n    options: { timeout: number } & RequestInit\n  ): Promise<Response> {\n    const { timeout, ...rest } = options;\n    let currentUrl = resource;\n\n    for (let i = 0; i <= MAX_REDIRECTS; i++) {\n      validateSafeUrl(currentUrl, { allowHttp: true });\n\n      const response = await this.caller.call(() =>\n        fetch(currentUrl, {\n          ...rest,\n          redirect: \"manual\",\n          signal: AbortSignal.timeout(timeout),\n        })\n      );\n\n      if (REDIRECT_CODES.has(response.status)) {\n        const location = response.headers.get(\"location\");\n        if (!location) {\n          throw new Error(\"Redirect response missing Location header\");\n        }\n        currentUrl = new URL(location, currentUrl).href;\n        continue;\n      }\n\n      return response;\n    }\n\n    throw new Error(`Too many redirects (max ${MAX_REDIRECTS})`);\n  }\n\n  private getChildLinks(html: string, baseUrl: string): Array<string> {\n    const allLinks = Array.from(\n      new JSDOM(html, { virtualConsole }).window.document.querySelectorAll(\"a\")\n    ).map((a) => a.href);\n    const absolutePaths = [];\n    const invalidPrefixes = [\"javascript:\", \"mailto:\", \"#\"];\n    const invalidSuffixes = [\n      \".css\",\n      \".js\",\n      \".ico\",\n      \".png\",\n      \".jpg\",\n      \".jpeg\",\n      \".gif\",\n      \".svg\",\n    ];\n\n    for (const link of allLinks) {\n      if (\n        invalidPrefixes.some((prefix) => link.startsWith(prefix)) ||\n        invalidSuffixes.some((suffix) => link.endsWith(suffix))\n      )\n        continue;\n\n      let standardizedLink: string;\n\n      if (link.startsWith(\"http\")) {\n        standardizedLink = link;\n      } else if (link.startsWith(\"//\")) {\n        const base = new URL(baseUrl);\n        standardizedLink = base.protocol + link;\n      } else {\n        standardizedLink = new URL(link, baseUrl).href;\n      }\n\n      if (this.excludeDirs.some((exDir) => standardizedLink.startsWith(exDir)))\n        continue;\n\n      if (link.startsWith(\"http\")) {\n        const isAllowed = !this.preventOutside || isSameOrigin(link, baseUrl);\n        if (isAllowed) absolutePaths.push(link);\n      } else if (link.startsWith(\"//\")) {\n        const base = new URL(baseUrl);\n        absolutePaths.push(base.protocol + link);\n      } else {\n        const newLink = new URL(link, baseUrl).href;\n        absolutePaths.push(newLink);\n      }\n    }\n\n    return Array.from(new Set(absolutePaths));\n  }\n\n  private extractMetadata(rawHtml: string, url: string) {\n    // oxlint-disable-next-line typescript/no-explicit-any\n    const metadata: Record<string, any> = { source: url };\n    const { document } = new JSDOM(rawHtml, { virtualConsole }).window;\n\n    const title = document.getElementsByTagName(\"title\")[0];\n    if (title) {\n      metadata.title = title.textContent;\n    }\n\n    const description = document.querySelector(\"meta[name=description]\");\n    if (description) {\n      metadata.description = description.getAttribute(\"content\");\n    }\n\n    const html = document.getElementsByTagName(\"html\")[0];\n    if (html) {\n      metadata.language = html.getAttribute(\"lang\");\n    }\n\n    return metadata;\n  }\n\n  private async getUrlAsDoc(url: string): Promise<Document | null> {\n    let res;\n    try {\n      res = await this.fetchWithTimeout(url, { timeout: this.timeout });\n      res = await res.text();\n    } catch {\n      return null;\n    }\n\n    return {\n      pageContent: this.extractor(res),\n      metadata: this.extractMetadata(res, url),\n    };\n  }\n\n  private async getChildUrlsRecursive(\n    inputUrl: string,\n    visited: Set<string> = new Set<string>(),\n    depth = 0\n  ): Promise<Document[]> {\n    if (depth >= this.maxDepth) return [];\n\n    let url = inputUrl;\n    if (!inputUrl.endsWith(\"/\")) url += \"/\";\n\n    const isExcluded = this.excludeDirs.some((exDir) => url.startsWith(exDir));\n    if (isExcluded) return [];\n\n    let res;\n    try {\n      res = await this.fetchWithTimeout(url, { timeout: this.timeout });\n      res = await res.text();\n    } catch {\n      return [];\n    }\n\n    const childUrls: string[] = this.getChildLinks(res, url);\n\n    const results = await Promise.all(\n      childUrls.map((childUrl) =>\n        (async () => {\n          if (visited.has(childUrl)) return null;\n          visited.add(childUrl);\n\n          const childDoc = await this.getUrlAsDoc(childUrl);\n          if (!childDoc) return null;\n\n          if (childUrl.endsWith(\"/\")) {\n            const childUrlResponses = await this.getChildUrlsRecursive(\n              childUrl,\n              visited,\n              depth + 1\n            );\n            return [childDoc, ...childUrlResponses];\n          }\n\n          return [childDoc];\n        })()\n      )\n    );\n\n    return results.flat().filter((docs) => docs !== null) as Document[];\n  }\n\n  async load(): Promise<Document[]> {\n    const rootDoc = await this.getUrlAsDoc(this.url);\n    if (!rootDoc) return [];\n\n    const docs = [rootDoc];\n    docs.push(\n      ...(await this.getChildUrlsRecursive(this.url, new Set([this.url])))\n    );\n    return docs;\n  }\n}\n"],"mappings":";;;;;;;;AASA,MAAM,iBAAiB,IAAIA,MAAAA,gBAAgB;AAC3C,eAAe,GAAG,eAAe,GAAG;AAEpC,MAAM,gBAAgB;AACtB,MAAM,iBAAiB,IAAI,IAAI;CAAC;CAAK;CAAK;CAAK;CAAK;CAAI,CAAC;AAWzD,IAAa,qBAAb,cACUC,sCAAAA,mBAEV;CACE;CAEA;CAEA;CAEA;CAEA;CAEA;CAEA;CAEA,YAAY,KAAa,SAAoC;AAC3D,SAAO;AAEP,OAAK,SAAS,IAAIC,mCAAAA,YAAY;GAC5B,gBAAgB;GAChB,YAAY;GACZ,GAAG,QAAQ;GACZ,CAAC;AAEF,OAAK,MAAM;AACX,OAAK,cAAc,QAAQ,eAAe,EAAE;AAC5C,OAAK,YAAY,QAAQ,eAAe,MAAc;AACtD,OAAK,WAAW,QAAQ,YAAY;AACpC,OAAK,UAAU,QAAQ,WAAW;AAClC,OAAK,iBAAiB,QAAQ,kBAAkB;;CAGlD,MAAc,iBACZ,UACA,SACmB;EACnB,MAAM,EAAE,SAAS,GAAG,SAAS;EAC7B,IAAI,aAAa;AAEjB,OAAK,IAAI,IAAI,GAAG,KAAK,eAAe,KAAK;AACvC,IAAA,GAAA,2BAAA,iBAAgB,YAAY,EAAE,WAAW,MAAM,CAAC;GAEhD,MAAM,WAAW,MAAM,KAAK,OAAO,WACjC,MAAM,YAAY;IAChB,GAAG;IACH,UAAU;IACV,QAAQ,YAAY,QAAQ,QAAQ;IACrC,CAAC,CACH;AAED,OAAI,eAAe,IAAI,SAAS,OAAO,EAAE;IACvC,MAAM,WAAW,SAAS,QAAQ,IAAI,WAAW;AACjD,QAAI,CAAC,SACH,OAAM,IAAI,MAAM,4CAA4C;AAE9D,iBAAa,IAAI,IAAI,UAAU,WAAW,CAAC;AAC3C;;AAGF,UAAO;;AAGT,QAAM,IAAI,MAAM,2BAA2B,cAAc,GAAG;;CAG9D,cAAsB,MAAc,SAAgC;EAClE,MAAM,WAAW,MAAM,KACrB,IAAIC,MAAAA,MAAM,MAAM,EAAE,gBAAgB,CAAC,CAAC,OAAO,SAAS,iBAAiB,IAAI,CAC1E,CAAC,KAAK,MAAM,EAAE,KAAK;EACpB,MAAM,gBAAgB,EAAE;EACxB,MAAM,kBAAkB;GAAC;GAAe;GAAW;GAAI;EACvD,MAAM,kBAAkB;GACtB;GACA;GACA;GACA;GACA;GACA;GACA;GACA;GACD;AAED,OAAK,MAAM,QAAQ,UAAU;AAC3B,OACE,gBAAgB,MAAM,WAAW,KAAK,WAAW,OAAO,CAAC,IACzD,gBAAgB,MAAM,WAAW,KAAK,SAAS,OAAO,CAAC,CAEvD;GAEF,IAAI;AAEJ,OAAI,KAAK,WAAW,OAAO,CACzB,oBAAmB;YACV,KAAK,WAAW,KAAK,CAE9B,oBADa,IAAI,IAAI,QAAQ,CACL,WAAW;OAEnC,oBAAmB,IAAI,IAAI,MAAM,QAAQ,CAAC;AAG5C,OAAI,KAAK,YAAY,MAAM,UAAU,iBAAiB,WAAW,MAAM,CAAC,CACtE;AAEF,OAAI,KAAK,WAAW,OAAO;QACP,CAAC,KAAK,mBAAA,GAAA,2BAAA,cAA+B,MAAM,QAAQ,CACtD,eAAc,KAAK,KAAK;cAC9B,KAAK,WAAW,KAAK,EAAE;IAChC,MAAM,OAAO,IAAI,IAAI,QAAQ;AAC7B,kBAAc,KAAK,KAAK,WAAW,KAAK;UACnC;IACL,MAAM,UAAU,IAAI,IAAI,MAAM,QAAQ,CAAC;AACvC,kBAAc,KAAK,QAAQ;;;AAI/B,SAAO,MAAM,KAAK,IAAI,IAAI,cAAc,CAAC;;CAG3C,gBAAwB,SAAiB,KAAa;EAEpD,MAAM,WAAgC,EAAE,QAAQ,KAAK;EACrD,MAAM,EAAE,aAAa,IAAIA,MAAAA,MAAM,SAAS,EAAE,gBAAgB,CAAC,CAAC;EAE5D,MAAM,QAAQ,SAAS,qBAAqB,QAAQ,CAAC;AACrD,MAAI,MACF,UAAS,QAAQ,MAAM;EAGzB,MAAM,cAAc,SAAS,cAAc,yBAAyB;AACpE,MAAI,YACF,UAAS,cAAc,YAAY,aAAa,UAAU;EAG5D,MAAM,OAAO,SAAS,qBAAqB,OAAO,CAAC;AACnD,MAAI,KACF,UAAS,WAAW,KAAK,aAAa,OAAO;AAG/C,SAAO;;CAGT,MAAc,YAAY,KAAuC;EAC/D,IAAI;AACJ,MAAI;AACF,SAAM,MAAM,KAAK,iBAAiB,KAAK,EAAE,SAAS,KAAK,SAAS,CAAC;AACjE,SAAM,MAAM,IAAI,MAAM;UAChB;AACN,UAAO;;AAGT,SAAO;GACL,aAAa,KAAK,UAAU,IAAI;GAChC,UAAU,KAAK,gBAAgB,KAAK,IAAI;GACzC;;CAGH,MAAc,sBACZ,UACA,0BAAuB,IAAI,KAAa,EACxC,QAAQ,GACa;AACrB,MAAI,SAAS,KAAK,SAAU,QAAO,EAAE;EAErC,IAAI,MAAM;AACV,MAAI,CAAC,SAAS,SAAS,IAAI,CAAE,QAAO;AAGpC,MADmB,KAAK,YAAY,MAAM,UAAU,IAAI,WAAW,MAAM,CAAC,CAC1D,QAAO,EAAE;EAEzB,IAAI;AACJ,MAAI;AACF,SAAM,MAAM,KAAK,iBAAiB,KAAK,EAAE,SAAS,KAAK,SAAS,CAAC;AACjE,SAAM,MAAM,IAAI,MAAM;UAChB;AACN,UAAO,EAAE;;EAGX,MAAM,YAAsB,KAAK,cAAc,KAAK,IAAI;AAyBxD,UAvBgB,MAAM,QAAQ,IAC5B,UAAU,KAAK,cACZ,YAAY;AACX,OAAI,QAAQ,IAAI,SAAS,CAAE,QAAO;AAClC,WAAQ,IAAI,SAAS;GAErB,MAAM,WAAW,MAAM,KAAK,YAAY,SAAS;AACjD,OAAI,CAAC,SAAU,QAAO;AAEtB,OAAI,SAAS,SAAS,IAAI,CAMxB,QAAO,CAAC,UAAU,GALQ,MAAM,KAAK,sBACnC,UACA,SACA,QAAQ,EACT,CACsC;AAGzC,UAAO,CAAC,SAAS;MACf,CACL,CACF,EAEc,MAAM,CAAC,QAAQ,SAAS,SAAS,KAAK;;CAGvD,MAAM,OAA4B;EAChC,MAAM,UAAU,MAAM,KAAK,YAAY,KAAK,IAAI;AAChD,MAAI,CAAC,QAAS,QAAO,EAAE;EAEvB,MAAM,OAAO,CAAC,QAAQ;AACtB,OAAK,KACH,GAAI,MAAM,KAAK,sBAAsB,KAAK,KAAK,IAAI,IAAI,CAAC,KAAK,IAAI,CAAC,CAAC,CACpE;AACD,SAAO"}