'use strict';

const pdf_mjs = require('pdfjs-dist/legacy/build/pdf.mjs');
const fs = require('fs');

/**
 * A rectangular run of text assembled from one or more pdf.js text items.
 *
 * Tracks the bounding box (x1,y1)-(x2,y2) of every item added so far, the
 * largest font metrics seen, and the geometry of the most recently added item
 * (prevX/prevY/prevX2/prevY2), which `isAdjacent` uses to decide whether the
 * next item continues this cell.
 */
class Cell {
  /**
   * @param {object} [options] - Parser options. Read here: `lineHeight`
   *   (line-height ratio, falls back to 1.67 when falsy) and `newlines`
   *   (emit "\n" instead of " " at an item's end-of-line).
   */
  constructor(options = {}) {
    this.options = options;
    this.text = "";
    // Bounding box starts inverted so the first item added always shrinks/grows it.
    this.x1 = 9999;
    this.y1 = 9999;
    this.x2 = 0;
    this.y2 = 0;
    // Minimum font metrics; only ever raised by addItem.
    this.fontHeight = 8;
    this.fontWidth = 4;
    this.lineHeightRatio = options.lineHeight || 1.67;
    this.count = 0;
    this.prevX = 0;
    this.prevY = 0;
    this.prevX2 = 0;
    this.prevY2 = 0;
    this.hasSpan = false;
    this.inserted = false;
    this.id = 0;
  }

  /** Line height derived from the largest font height seen so far. */
  get lineHeight() {
    return this.fontHeight * this.lineHeightRatio;
  }

  /**
   * Appends a text item to the cell and grows the bounding box to contain it.
   *
   * @param {object} item - Text item exposing `str`, `hasEOL`, `width`,
   *   `height` and a `transform` array (indices 0, 4 and 5 are read as font
   *   height, x and y respectively).
   * @param {number} index - Index of the item in its page; stored as `id`.
   */
  addItem(item, index) {
    this.count++;
    this.id = index;

    if (item.str) this.text += item.str;
    if (item.hasEOL) this.text += this.options.newlines ? "\n" : " ";

    const x = item.transform[4];
    const y = item.transform[5];
    const right = x + item.width;
    const top = y + item.height;

    if (x < this.x1) this.x1 = x;
    if (y < this.y1) this.y1 = y;
    if (right > this.x2) this.x2 = right;
    if (top > this.y2) this.y2 = top;

    const itemFontHeight = item.transform[0];
    // Average glyph width of this item; 0 when the item carries no text.
    const itemFontWidth = item.str ? item.width / item.str.length : 0;
    if (itemFontHeight > this.fontHeight) this.fontHeight = itemFontHeight;
    if (itemFontWidth > this.fontWidth) this.fontWidth = itemFontWidth;

    this.prevX = x;
    this.prevY = y;
    this.prevX2 = right;
    this.prevY2 = top;
  }

  /**
   * Compares the vertical extent of another cell with this one, using a
   * one-unit tolerance.
   *
   * @param {Cell} cell
   * @returns {number} 1 when `cell` lies entirely above this cell's y2,
   *   -1 when it lies entirely below this cell's y1, otherwise 0.
   */
  isSameLine(cell) {
    if (cell.y1 - 1 > this.y2) return 1;
    if (cell.y2 + 1 < this.y1) return -1;
    return 0;
  }

  /**
   * Tests whether `cell` vertically overlaps this cell. The overlap is
   * rejected when this cell starts further left and is also shorter than
   * `cell`.
   *
   * @param {Cell} cell
   * @returns {boolean}
   */
  isOutputLine(cell) {
    const yOverlaps =
      (cell.y1 >= this.y1 && cell.y1 <= this.y2) ||
      (this.y1 >= cell.y1 && this.y1 <= cell.y2);
    if (!yOverlaps) return false;

    const startsLeftAndShorter =
      this.x1 < cell.x1 && this.y2 - this.y1 < cell.y2 - cell.y1;
    return !startsLeftAndShorter;
  }

  /**
   * Decides whether a text item continues this cell, relative to the last
   * item that was added.
   *
   * @param {object} item - Text item (`transform`, `width`).
   * @returns {boolean} true when the item sits on the same baseline within
   *   one font-width of the previous item's right edge, or, for cells that
   *   contain a span, when it sits roughly one line below and horizontally
   *   overlaps the cell.
   */
  isAdjacent(item) {
    const x = item.transform[4];
    const y = item.transform[5];
    const w = item.width;

    const sameBaseline = Math.abs(y - this.prevY) <= this.lineHeight * 0.125;
    if (sameBaseline && x - this.prevX2 < this.fontWidth) return true;

    if (!this.hasSpan) return false;

    const drop = this.prevY - y;
    const oneLineBelow =
      drop > this.lineHeight * 0.75 && drop <= this.lineHeight * 1.25;
    const xOverlaps =
      (x >= this.x1 && x <= this.x2) || (this.x1 >= x && this.x1 <= x + w);
    return oneLineBelow && xOverlaps;
  }

  /**
   * Reports how a text item lines up with this cell's bounding box
   * (two-unit tolerance on each edge).
   *
   * @param {object} item - Text item (`transform`, `width`, `height`).
   * @returns {{top: boolean, bottom: boolean, left: boolean, right: boolean,
   *   adjacent: boolean}} All flags are false for an empty cell.
   */
  alignment(item) {
    const aligns = {
      top: false,
      bottom: false,
      left: false,
      right: false,
      adjacent: false
    };
    if (this.count === 0) return aligns;

    const x = item.transform[4];
    const y = item.transform[5];

    if (Math.abs(y - this.y1) <= 2) aligns.bottom = true;
    if (Math.abs(y + item.height - this.y2) <= 2) aligns.top = true;
    if (Math.abs(x - this.x1) <= 2) aligns.left = true;
    if (Math.abs(x + item.width - this.x2) <= 2) aligns.right = true;

    // Horizontally adjacent: shares a row and starts near the cell's right edge.
    if ((aligns.top || aligns.bottom) && Math.abs(x - this.x2) < this.fontWidth) {
      aligns.adjacent = true;
    }
    // Vertically adjacent: shares a column and its top touches the cell's bottom.
    // NOTE(review): the vertical gap is compared against fontWidth, not a
    // height-based metric - confirm this is intended.
    if (
      (aligns.left || aligns.right) &&
      Math.abs(y + item.height - this.y1) < this.fontWidth
    ) {
      aligns.adjacent = true;
    }
    return aligns;
  }
}

/**
 * Sorts cells in place, top of page first, according to `options.sort`
 * ("Y1" sorts by descending y1, "Y2" by descending y2; anything else leaves
 * the order untouched).
 *
 * @param {Cell[]} cells
 * @param {object} [options]
 * @returns {Cell[]} The same array.
 */
function sortCells(cells, options) {
  const sort = options == null ? undefined : options.sort;
  if (sort === "Y1") {
    cells.sort((a, b) => b.y1 - a.y1);
  } else if (sort === "Y2") {
    cells.sort((a, b) => b.y2 - a.y2);
  }
  return cells;
}

/**
 * Groups the text items of a tagged ("marked") PDF page into cells, using the
 * marked-content tags (Artifact / P / Span) as cell boundaries.
 *
 * @param {object} page - Page object exposing `getTextContent(params)`.
 * @param {object} options - Parser options; `artifacts` keeps Artifact
 *   content, `sort` selects the output ordering.
 * @returns {Promise<Cell[]>}
 */
const parseMarkedPage = async (page, options) => {
  let cell = null;
  let markedContent = "";
  let paragraph = false;
  let span = false;
  const cells = [];

  const content = await page.getTextContent({
    includeMarkedContent: true,
    disableNormalization: false,
    disableCombineTextItems: false
  });

  for (const [index, item] of content.items.entries()) {
    if (item.type === "beginMarkedContent") {
      if (item.tag === "Artifact") {
        markedContent = "Artifact";
        // Flush whatever was being built before the artifact starts. There may
        // be nothing to flush (e.g. the page opens with an artifact); pushing
        // null here used to crash the sort and the callers' cell accesses.
        if (cell) cells.push(cell);
        cell = null;
      } else {
        console.warn(`unknown content tag: ${item.tag}`);
      }
      continue;
    }

    if (item.type === "beginMarkedContentProps") {
      if (item.tag === "P") {
        markedContent = "P";
        paragraph = true;
      } else if (item.tag === "Span") {
        markedContent = "Span";
        span = true;
      }
      continue;
    }

    if (item.type === "endMarkedContent") {
      if (markedContent === "Artifact") {
        // Artifact text is kept only on request; either way it never merges
        // with the content that follows.
        if (options.artifacts && cell) cells.push(cell);
        cell = null;
      }
      markedContent = "";
      continue;
    }

    if (item.type) {
      console.warn(`Warning: unknown content type: ${item.type}`);
      continue;
    }

    // Plain text item.
    if (item.dir !== "ltr") {
      console.warn(`Warning: text direction is: ${item.dir}`);
    }

    if (paragraph || span) {
      // Skipped items deliberately leave the paragraph/span flags set so they
      // still apply to the next real text item.
      if (item.str === "" && item.width === 0 && paragraph && item.hasEOL) {
        continue;
      }
      const cellFontWidth = cell ? cell.fontWidth : undefined;
      if (item.str === " " && (paragraph || item.width > cellFontWidth)) {
        continue;
      }
      if (cell && cell.count > 0) {
        cell.hasSpan = cell.hasSpan || span;
        if (!cell.isAdjacent(item)) {
          cells.push(cell);
          cell = null;
        }
      }
    }

    if (!cell) cell = new Cell(options);
    cell.addItem(item, index);
    paragraph = false;
    span = false;
  }

  if (cell) cells.push(cell);
  return sortCells(cells, options);
};

/**
 * Groups the text items of an untagged PDF page into cells, using geometric
 * alignment and end-of-line flags as cell boundaries.
 *
 * @param {object} page - Page object exposing `getTextContent(params)`.
 * @param {object} options - Parser options; `sort` selects the output ordering.
 * @returns {Promise<Cell[]>}
 */
const parseLinedPage = async (page, options) => {
  let cell = new Cell(options);
  let wasEOL = false;
  const cells = [];

  const content = await page.getTextContent({ disableNormalization: false });

  for (const [index, item] of content.items.entries()) {
    if (item.dir !== "ltr") console.warn(item.dir);

    const aligns = cell.alignment(item);

    // An item that does not touch the current cell starts a new one.
    if (!aligns.adjacent && cell.count > 0) {
      cells.push(cell);
      cell = new Cell(options);
    }

    // A line break is ignored when the next item still lines up with the cell
    // (same top edge, or stacked directly underneath a shared side edge).
    const continuesCell =
      aligns.top || ((aligns.left || aligns.right) && aligns.adjacent);
    if (wasEOL && continuesCell) {
      wasEOL = false;
    }
    if (wasEOL && cell.count > 0) {
      cells.push(cell);
      cell = new Cell(options);
    }

    // Zero-height items are dropped unless they are a narrow single space.
    const isNarrowSpace = item.str === " " && item.width < cell.fontWidth;
    if (item.height > 0 || isNarrowSpace) {
      cell.addItem(item, index);
    }
    wasEOL = item.hasEOL;
  }

  if (cell.count > 0) cells.push(cell);
  return sortCells(cells, options);
};

/**
 * Expands a comma-separated list of numbers and inclusive ranges
 * (e.g. "1-3,7") into an array of numbers.
 *
 * @param {string} line
 * @returns {number[]}
 * @throws {Error} When a part is not numeric.
 */
function parseLineRange(line) {
  const result = [];
  for (const range of line.split(",")) {
    const [start, end] = range.split("-").map(Number);
    if (Number.isNaN(start) || (end !== undefined && Number.isNaN(end))) {
      throw new Error(`Invalid line range: ${range}`);
    }
    if (end === undefined) {
      result.push(start);
    } else {
      result.push(
        ...Array.from({ length: end - start + 1 }, (_, i) => start + i)
      );
    }
  }
  return result;
}

/**
 * Finds the `options.skip.pageSpecific` entry for a page, if any.
 *
 * @param {object} options
 * @param {number} currentPage
 * @returns {object|undefined}
 */
function findPageRule(options, currentPage) {
  const skip = options.skip;
  const pageSpecific = skip == null ? undefined : skip.pageSpecific;
  if (!pageSpecific) return undefined;
  return pageSpecific.find((rule) => rule.page === currentPage);
}

/**
 * Resolves the 1-based line numbers to skip on a page. Page-specific `lines`
 * replace the global ones; a page-specific entry that does not define `lines`
 * (for instance one that only sets `lastLines`) leaves the global lines in
 * force instead of wiping them.
 *
 * @param {object} options - Options with an optional `skip` section
 *   (`skip.global.lines`, `skip.pageSpecific[].lines`; each either a range
 *   string accepted by `parseLineRange` or an array).
 * @param {number} currentPage
 * @returns {number[]}
 */
function getSkipLines(options, currentPage) {
  const toLines = (lines) =>
    typeof lines === "string" ? parseLineRange(lines) : lines;

  let skipLines = [];

  const skip = options.skip;
  const globalRule = skip == null ? undefined : skip.global;
  if (globalRule && globalRule.lines != null) {
    skipLines = toLines(globalRule.lines);
  }

  const pageRule = findPageRule(options, currentPage);
  if (pageRule && pageRule.lines != null) {
    skipLines = toLines(pageRule.lines);
  }

  return skipLines;
}

/**
 * Tests `text` against `matchText` using the given strategy.
 *
 * @param {string} text
 * @param {string} matchText - Literal text, or a regular-expression source
 *   when `matchType` is "regex".
 * @param {string} matchType - "exact", "contain", "startWith" or "regex".
 * @returns {boolean} false for an unknown `matchType`.
 */
function checkTextMatch(text, matchText, matchType) {
  switch (matchType) {
    case "exact":
      return text === matchText;
    case "contain":
      return text.includes(matchText);
    case "startWith":
      return text.startsWith(matchText);
    case "regex":
      return new RegExp(matchText).test(text);
    default:
      return false;
  }
}

/**
 * Decides whether a cell is removed by one of the text filters.
 *
 * A filter matches when the cell's text matches `value` (strategy `match`,
 * default "contain"). When the filter also has a `nextLine` condition, the
 * following cell must exist and match `nextLine.value` (strategy
 * `nextLine.match`, default "contain").
 *
 * @param {{text: string}} item
 * @param {Array<object>|undefined} textFilters
 * @param {{text: string}|undefined} nextItem
 * @returns {boolean}
 */
function shouldDeleteItem(item, textFilters, nextItem) {
  if (!textFilters) return false;

  return textFilters.some(({ value, match = "contain", nextLine }) => {
    if (!checkTextMatch(item.text, value, match)) return false;
    if (!nextLine) return true;
    if (!nextItem) return false;
    // The look-ahead previously passed `nextLine.match` as the text to look
    // for, so it could never match real content.
    return checkTextMatch(
      nextItem.text,
      nextLine.value,
      nextLine.match || "contain"
    );
  });
}

/**
 * Applies the `options.skip` rules to a page's cells and joins the surviving
 * cell texts with newlines.
 *
 * Order of operations: drop skipped line numbers and text-filter matches,
 * drop the trailing `lastLines` cells (page-specific value wins over global),
 * then drop everything between matching `ranges` start/end markers
 * (markers included).
 *
 * @param {Array<{text: string}>} items - Cells of one page, in output order.
 * @param {object} options
 * @param {number} currentPage
 * @returns {string}
 */
function filter(items, options, currentPage) {
  const skip = options.skip;
  const skipLines = getSkipLines(options, currentPage);
  const textFilters = skip == null ? undefined : skip.text;

  const cells = items.filter((item, index) => {
    // Line numbers are 1-based positions in the unfiltered list.
    if (skipLines != null && skipLines.includes(index + 1)) return false;
    return !shouldDeleteItem(item, textFilters, items[index + 1]);
  });

  const pageRule = findPageRule(options, currentPage);
  const globalRule = skip == null ? undefined : skip.global;
  if (pageRule && pageRule.lastLines) {
    cells.splice(-pageRule.lastLines);
  } else if (globalRule && globalRule.lastLines) {
    cells.splice(-globalRule.lastLines);
  }

  const ranges = skip == null ? undefined : skip.ranges;
  let isSkipping = false;

  const lines = cells.map((item) => {
    if (ranges) {
      for (const range of ranges) {
        if (
          checkTextMatch(
            item.text,
            range.start.value,
            range.start.match || "contain"
          )
        ) {
          isSkipping = true;
        }
        if (
          checkTextMatch(item.text, range.end.value, range.end.match || "contain")
        ) {
          // The end marker itself is dropped, then output resumes.
          isSkipping = false;
          return null;
        }
      }
    }
    return isSkipping ? undefined : item.text.trim();
  });

  return lines.filter(Boolean).join("\n");
}

/**
 * Resolves the `pages` option into a de-duplicated list of valid page numbers.
 *
 * @param {number[]|string} pagesOption - An array of page numbers, "all", or
 *   a comma-separated list of numbers and inclusive ranges ("1-3,5").
 * @param {number} totalPages
 * @returns {number[]} Page numbers within 1..totalPages, in first-seen order.
 */
const parsePagesOption = (pagesOption, totalPages) => {
  const pages = new Set();
  const addIfValid = (page) => {
    if (page >= 1 && page <= totalPages) pages.add(page);
  };

  if (Array.isArray(pagesOption)) {
    pagesOption.forEach(addIfValid);
  } else if (typeof pagesOption === "string") {
    if (pagesOption.trim() === "all") {
      for (let i = 1; i <= totalPages; i++) pages.add(i);
    } else {
      for (const range of pagesOption.split(",")) {
        const [start, end] = range.split("-").map(Number);
        if (end) {
          for (let i = start; i <= end; i++) addIfValid(i);
        } else {
          addIfValid(start);
        }
      }
    }
  }

  return [...pages];
};

/**
 * Extracts text from a PDF file.
 *
 * @param {string} pdfPath - Path of the PDF file (read synchronously).
 * @param {object} [options] - Merged over the defaults
 *   `{ threshold: 5, lineHeight: 1.67, pages: "1", sort: "Y2" }`. Also read:
 *   `password`, `newlines`, `artifacts` and `skip`. `threshold` is accepted
 *   but not read anywhere in this file.
 * @returns {Promise<{getRaw: Function, getText: Function, getPages: Function,
 *   getTextContent: Function}>} Accessors for the raw cells of all processed
 *   pages, the filtered text per processed page, the document's page count,
 *   and the unprocessed item strings per processed page.
 * @throws Re-throws any error raised while loading or parsing, after logging it.
 */
const pdf = async (
  pdfPath,
  options = { threshold: 5, lineHeight: 1.67, pages: "1" }
) => {
  const finalOptions = Object.assign(
    { threshold: 5, lineHeight: 1.67, pages: "1", sort: "Y2" },
    options
  );
  const raw = [];
  const text = [];
  const textContentArray = [];
  const data = new Uint8Array(fs.readFileSync(pdfPath));

  let loadingTask;
  try {
    loadingTask = pdf_mjs.getDocument({
      data,
      standardFontDataUrl: "../node_modules/pdfjs-dist/standard_fonts/",
      verbosity: 0,
      password: finalOptions.password
    });
    const doc = await loadingTask.promise;
    const markInfo = await doc.getMarkInfo();
    const numPages = doc.numPages;
    const pagesToProcess = parsePagesOption(finalOptions.pages, numPages);

    console.log("Extracting...");
    for (const i of pagesToProcess) {
      console.log("Page", i, "of", numPages);
      const page = await doc.getPage(i);

      const textContent = await page.getTextContent();
      textContentArray.push(
        textContent.items.map((item) => item.str).join("\n")
      );

      // Tagged documents carry structure we can use; others are grouped by geometry.
      const isMarked = markInfo != null && markInfo.Marked;
      const cells = isMarked
        ? await parseMarkedPage(page, finalOptions)
        : await parseLinedPage(page, finalOptions);

      raw.push(
        ...cells.map((cell) => ({
          text: cell.text,
          x1: cell.x1,
          y1: cell.y1,
          x2: cell.x2,
          y2: cell.y2,
          fontHeight: cell.fontHeight,
          fontWidth: cell.fontWidth
        }))
      );
      text.push(filter(cells, finalOptions, i));
    }
    console.log("done");

    return {
      getRaw: () => raw,
      getText: () => text,
      getPages: () => numPages,
      getTextContent: () => textContentArray
    };
  } catch (e) {
    console.error("Error reading pdf", e);
    throw e;
  } finally {
    // Release the document and its worker; everything returned above has
    // already been copied into plain arrays and numbers.
    if (loadingTask) await loadingTask.destroy();
  }
};

exports.pdf = pdf;