{"version":3,"file":"arxiv.cjs","names":["XMLParser","PDFLoader","Document"],"sources":["../../src/utils/arxiv.ts"],"sourcesContent":["/* oxlint-disable typescript/no-explicit-any */\nimport { Document } from \"@langchain/core/documents\";\nimport { XMLParser } from \"fast-xml-parser\";\n\nimport { PDFLoader } from \"../document_loaders/fs/pdf.js\";\n\n// Interface for processed arXiv entry\ninterface ArxivEntry {\n  id: string;\n  title: string;\n  summary: string;\n  published: string;\n  updated: string;\n  authors: string[];\n  pdfUrl: string;\n  links: any[];\n}\n\n// Used to check if the query is an arXiv ID, or a natural language query\nexport function isArXivIdentifier(query: string): boolean {\n  const arxivIdRegex = /^\\d{4}\\.\\d{4,5}(v\\d+)?$|^\\d{7}(\\.\\d+)?(v\\d+)?$/;\n  return arxivIdRegex.test(query.trim());\n}\n\n// Used to fetch direct arXiv articles by IDs (supports multiple IDs)\nexport async function fetchDirectArxivArticle(\n  arxivIds: string\n): Promise<ArxivEntry[]> {\n  try {\n    const idList = arxivIds\n      .split(/[\\s,]+/)\n      .map((id) => id.trim())\n      .filter(Boolean)\n      .join(\",\");\n    const url = `http://export.arxiv.org/api/query?id_list=${idList}`;\n    const response = await fetch(url);\n\n    if (!response.ok) {\n      throw new Error(`HTTP error! status: ${response.status}`);\n    }\n\n    const xml = await response.text();\n\n    const parser = new XMLParser({\n      ignoreAttributes: false,\n      attributeNamePrefix: \"@_\",\n    });\n    const result = parser.parse(xml);\n    let entries = result.feed.entry;\n\n    if (!entries) {\n      return [];\n    }\n\n    // Ensure entries is an array\n    if (!Array.isArray(entries)) {\n      entries = [entries];\n    }\n\n    const processedEntries = entries.map(processEntry);\n\n    return processedEntries;\n  } catch {\n    throw new Error(`Failed to fetch articles with IDs ${arxivIds}`);\n  }\n}\n\n// Used to fetch arXiv results by natural language query with maxResults parameter\nexport async function fetchArxivResultsByQuery(\n  query: string,\n  start = 0,\n  maxResults = 10\n): Promise<ArxivEntry[]> {\n  try {\n    const encodedQuery = encodeURIComponent(query);\n    const url = `http://export.arxiv.org/api/query?search_query=all:${encodedQuery}&start=${start}&max_results=${maxResults}`;\n    const response = await fetch(url);\n\n    if (!response.ok) {\n      throw new Error(`HTTP error! status: ${response.status}`);\n    }\n\n    const xml = await response.text();\n\n    const parser = new XMLParser({\n      ignoreAttributes: false,\n      attributeNamePrefix: \"@_\",\n    });\n    const result = parser.parse(xml);\n    let entries = result.feed.entry;\n\n    if (!entries) {\n      return [];\n    }\n\n    // Ensure entries is an array\n    if (!Array.isArray(entries)) {\n      entries = [entries];\n    }\n\n    const processedEntries = entries.map(processEntry);\n\n    return processedEntries;\n  } catch {\n    throw new Error(`Failed to fetch articles with query \"${query}\"`);\n  }\n}\n\n// Used to search for arXiv articles with a maxResults parameter\nexport async function searchArxiv(\n  query: string,\n  maxResults = 3\n): Promise<ArxivEntry[]> {\n  if (isArXivIdentifier(query)) {\n    return await fetchDirectArxivArticle(query);\n  } else {\n    return await fetchArxivResultsByQuery(query, 0, maxResults);\n  }\n}\n\n// Used to fetch and parse PDF to text\nexport async function fetchAndParsePDF(pdfUrl: string): Promise<string> {\n  try {\n    // Fetch the PDF\n    const response = await fetch(pdfUrl);\n\n    if (!response.ok) {\n      throw new Error(`HTTP error! status: ${response.status}`);\n    }\n\n    const buffer = await response.arrayBuffer();\n\n    // Convert the ArrayBuffer to a Blob\n    const blob = new Blob([buffer], { type: \"application/pdf\" });\n\n    // Use PDFLoader to process the PDF\n    const loader = new PDFLoader(blob, { splitPages: false }); // Pass the Blob\n    const docs: Document[] = await loader.load();\n\n    // Combine all document content into a single string\n    const content = docs.map((doc) => doc.pageContent).join(\"\\n\\n\");\n    return content;\n  } catch {\n    throw new Error(`Failed to fetch or parse PDF from ${pdfUrl}`);\n  }\n}\n\n// Used to load raw text from each search result, and convert to Document instances\nexport async function loadDocsFromResults(\n  results: ArxivEntry[]\n): Promise<Document[]> {\n  const docs: Document[] = [];\n  for (const result of results) {\n    const pdfUrl = result.pdfUrl;\n    try {\n      const pdfContent = await fetchAndParsePDF(pdfUrl);\n      const metadata = {\n        id: result.id,\n        title: result.title,\n        authors: result.authors,\n        published: result.published,\n        updated: result.updated,\n        source: \"arxiv\",\n        url: result.id,\n        summary: result.summary,\n      };\n      const doc = new Document({\n        pageContent: pdfContent,\n        metadata,\n      });\n      docs.push(doc);\n    } catch {\n      throw new Error(`Error loading document from ${pdfUrl}`);\n    }\n  }\n  return docs;\n}\n\n// Used to convert metadata and summaries to Document instances\nexport function getDocsFromSummaries(results: ArxivEntry[]): Document[] {\n  const docs: Document[] = [];\n  for (const result of results) {\n    const metadata = {\n      id: result.id,\n      title: result.title,\n      authors: result.authors,\n      published: result.published,\n      updated: result.updated,\n      source: \"arxiv\",\n      url: result.id,\n    };\n    const doc = new Document({\n      pageContent: result.summary,\n      metadata,\n    });\n    docs.push(doc);\n  }\n  return docs;\n}\n\n// Helper function to process each arXiv entry\nfunction processEntry(entry: any): ArxivEntry {\n  const id = entry.id;\n  const title = entry.title.replace(/\\s+/g, \" \").trim();\n  const summary = entry.summary.replace(/\\s+/g, \" \").trim();\n  const published = entry.published;\n  const updated = entry.updated;\n\n  // Extract authors\n  let authors: string[] = [];\n  if (Array.isArray(entry.author)) {\n    authors = entry.author.map((author: any) => author.name);\n  } else if (entry.author) {\n    authors = [entry.author.name];\n  }\n\n  // Extract links\n  let links: any[] = [];\n  if (Array.isArray(entry.link)) {\n    links = entry.link;\n  } else if (entry.link) {\n    links = [entry.link];\n  }\n\n  // Extract PDF link\n  let pdfUrl = `${id.replace(\"/abs/\", \"/pdf/\")}.pdf`;\n  const pdfLinkObj = links.find((link: any) => link[\"@_title\"] === \"pdf\");\n  if (pdfLinkObj && pdfLinkObj[\"@_href\"]) {\n    pdfUrl = pdfLinkObj[\"@_href\"];\n  }\n\n  return {\n    id,\n    title,\n    summary,\n    published,\n    updated,\n    authors,\n    pdfUrl,\n    links,\n  };\n}\n"],"mappings":";;;;;AAmBA,SAAgB,kBAAkB,OAAwB;AAExD,QADqB,iDACD,KAAK,MAAM,MAAM,CAAC;;AAIxC,eAAsB,wBACpB,UACuB;AACvB,KAAI;EAMF,MAAM,MAAM,6CALG,SACZ,MAAM,SAAS,CACf,KAAK,OAAO,GAAG,MAAM,CAAC,CACtB,OAAO,QAAQ,CACf,KAAK,IAAI;EAEZ,MAAM,WAAW,MAAM,MAAM,IAAI;AAEjC,MAAI,CAAC,SAAS,GACZ,OAAM,IAAI,MAAM,uBAAuB,SAAS,SAAS;EAG3D,MAAM,MAAM,MAAM,SAAS,MAAM;EAOjC,IAAI,UALW,IAAIA,gBAAAA,UAAU;GAC3B,kBAAkB;GAClB,qBAAqB;GACtB,CAAC,CACoB,MAAM,IAAI,CACX,KAAK;AAE1B,MAAI,CAAC,QACH,QAAO,EAAE;AAIX,MAAI,CAAC,MAAM,QAAQ,QAAQ,CACzB,WAAU,CAAC,QAAQ;AAKrB,SAFyB,QAAQ,IAAI,aAAa;SAG5C;AACN,QAAM,IAAI,MAAM,qCAAqC,WAAW;;;AAKpE,eAAsB,yBACpB,OACA,QAAQ,GACR,aAAa,IACU;AACvB,KAAI;EAEF,MAAM,MAAM,sDADS,mBAAmB,MAAM,CACiC,SAAS,MAAM,eAAe;EAC7G,MAAM,WAAW,MAAM,MAAM,IAAI;AAEjC,MAAI,CAAC,SAAS,GACZ,OAAM,IAAI,MAAM,uBAAuB,SAAS,SAAS;EAG3D,MAAM,MAAM,MAAM,SAAS,MAAM;EAOjC,IAAI,UALW,IAAIA,gBAAAA,UAAU;GAC3B,kBAAkB;GAClB,qBAAqB;GACtB,CAAC,CACoB,MAAM,IAAI,CACX,KAAK;AAE1B,MAAI,CAAC,QACH,QAAO,EAAE;AAIX,MAAI,CAAC,MAAM,QAAQ,QAAQ,CACzB,WAAU,CAAC,QAAQ;AAKrB,SAFyB,QAAQ,IAAI,aAAa;SAG5C;AACN,QAAM,IAAI,MAAM,wCAAwC,MAAM,GAAG;;;AAKrE,eAAsB,YACpB,OACA,aAAa,GACU;AACvB,KAAI,kBAAkB,MAAM,CAC1B,QAAO,MAAM,wBAAwB,MAAM;KAE3C,QAAO,MAAM,yBAAyB,OAAO,GAAG,WAAW;;AAK/D,eAAsB,iBAAiB,QAAiC;AACtE,KAAI;EAEF,MAAM,WAAW,MAAM,MAAM,OAAO;AAEpC,MAAI,CAAC,SAAS,GACZ,OAAM,IAAI,MAAM,uBAAuB,SAAS,SAAS;EAG3D,MAAM,SAAS,MAAM,SAAS,aAAa;AAW3C,UAJyB,MADV,IAAIC,gCAAAA,UAHN,IAAI,KAAK,CAAC,OAAO,EAAE,EAAE,MAAM,mBAAmB,CAAC,EAGzB,EAAE,YAAY,OAAO,CAAC,CACnB,MAAM,EAGvB,KAAK,QAAQ,IAAI,YAAY,CAAC,KAAK,OAAO;SAEzD;AACN,QAAM,IAAI,MAAM,qCAAqC,SAAS;;;AAKlE,eAAsB,oBACpB,SACqB;CACrB,MAAM,OAAmB,EAAE;AAC3B,MAAK,MAAM,UAAU,SAAS;EAC5B,MAAM,SAAS,OAAO;AACtB,MAAI;GAYF,MAAM,MAAM,IAAIC,0BAAAA,SAAS;IACvB,aAZiB,MAAM,iBAAiB,OAAO;IAa/C,UAZe;KACf,IAAI,OAAO;KACX,OAAO,OAAO;KACd,SAAS,OAAO;KAChB,WAAW,OAAO;KAClB,SAAS,OAAO;KAChB,QAAQ;KACR,KAAK,OAAO;KACZ,SAAS,OAAO;KACjB;IAIA,CAAC;AACF,QAAK,KAAK,IAAI;UACR;AACN,SAAM,IAAI,MAAM,+BAA+B,SAAS;;;AAG5D,QAAO;;AAIT,SAAgB,qBAAqB,SAAmC;CACtE,MAAM,OAAmB,EAAE;AAC3B,MAAK,MAAM,UAAU,SAAS;EAC5B,MAAM,WAAW;GACf,IAAI,OAAO;GACX,OAAO,OAAO;GACd,SAAS,OAAO;GAChB,WAAW,OAAO;GAClB,SAAS,OAAO;GAChB,QAAQ;GACR,KAAK,OAAO;GACb;EACD,MAAM,MAAM,IAAIA,0BAAAA,SAAS;GACvB,aAAa,OAAO;GACpB;GACD,CAAC;AACF,OAAK,KAAK,IAAI;;AAEhB,QAAO;;AAIT,SAAS,aAAa,OAAwB;CAC5C,MAAM,KAAK,MAAM;CACjB,MAAM,QAAQ,MAAM,MAAM,QAAQ,QAAQ,IAAI,CAAC,MAAM;CACrD,MAAM,UAAU,MAAM,QAAQ,QAAQ,QAAQ,IAAI,CAAC,MAAM;CACzD,MAAM,YAAY,MAAM;CACxB,MAAM,UAAU,MAAM;CAGtB,IAAI,UAAoB,EAAE;AAC1B,KAAI,MAAM,QAAQ,MAAM,OAAO,CAC7B,WAAU,MAAM,OAAO,KAAK,WAAgB,OAAO,KAAK;UAC/C,MAAM,OACf,WAAU,CAAC,MAAM,OAAO,KAAK;CAI/B,IAAI,QAAe,EAAE;AACrB,KAAI,MAAM,QAAQ,MAAM,KAAK,CAC3B,SAAQ,MAAM;UACL,MAAM,KACf,SAAQ,CAAC,MAAM,KAAK;CAItB,IAAI,SAAS,GAAG,GAAG,QAAQ,SAAS,QAAQ,CAAC;CAC7C,MAAM,aAAa,MAAM,MAAM,SAAc,KAAK,eAAe,MAAM;AACvE,KAAI,cAAc,WAAW,UAC3B,UAAS,WAAW;AAGtB,QAAO;EACL;EACA;EACA;EACA;EACA;EACA;EACA;EACA;EACD"}