"use strict"; var __create = Object.create; var __defProp = Object.defineProperty; var __getOwnPropDesc = Object.getOwnPropertyDescriptor; var __getOwnPropNames = Object.getOwnPropertyNames; var __getProtoOf = Object.getPrototypeOf; var __hasOwnProp = Object.prototype.hasOwnProperty; var __export = (target, all) => { for (var name in all) __defProp(target, name, { get: all[name], enumerable: true }); }; var __copyProps = (to, from, except, desc) => { if (from && typeof from === "object" || typeof from === "function") { for (let key of __getOwnPropNames(from)) if (!__hasOwnProp.call(to, key) && key !== except) __defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable }); } return to; }; var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__getProtoOf(mod)) : {}, __copyProps( // If the importer is in node compatibility mode or this is not an ESM // file that has been converted to a CommonJS file using a Babel- // compatible transform (i.e. "__esModule" has not been set), then set // "default" to the CommonJS "module.exports" for node compatibility. isNodeMode || !mod || !mod.__esModule ? __defProp(target, "default", { value: mod, enumerable: true }) : target, mod )); var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod); // src/index.ts var src_exports = {}; __export(src_exports, { classifyDocument: () => classifyDocument }); module.exports = __toCommonJS(src_exports); // src/functions.ts var import_string_similarity_js = require("string-similarity-js"); var Tesseract = __toESM(require("tesseract.js"), 1); // src/convert.ts var pdfjsLib = __toESM(require("pdfjs-dist"), 1); pdfjsLib.GlobalWorkerOptions.workerSrc = "https://cdn.jsdelivr.net/npm/pdfjs-dist@4.4.168/build/pdf.worker.min.mjs"; async function convert(pdfPath, maxNumPages) { try { const loadingTask = pdfjsLib.getDocument(pdfPath); const doc = await loadingTask.promise; const totalPages = Math.min(doc.numPages, maxNumPages); let blobArray = []; for (let i = 1; i <= totalPages; i++) { const page = await doc.getPage(i); const viewport = page.getViewport({ scale: 1.5 }); const canvas = document.createElement("canvas"); const context = canvas.getContext("2d"); if (!context) { throw new Error("Failed to get canvas context"); } canvas.height = viewport.height; canvas.width = viewport.width; const renderContext = { canvasContext: context, viewport }; await page.render(renderContext).promise; const blob = await new Promise((resolve, reject) => { canvas.toBlob( (blob2) => { if (blob2) { console.log("Converted PDF to Blob!"); resolve(blob2); } else { reject(new Error("Failed to convert canvas to blob")); } }, "image/png", 1 ); }); blobArray.push(blob); } return blobArray; } catch (error) { console.error("Failed to load or process PDF:", error); throw error; } } // src/functions.ts function findTargetWords(documentText, targetWords, threshold = 0.75) { const foundTargetWords = []; const documentWords = documentText.split(/\s+/); documentWords.forEach((docWord) => { targetWords.forEach((targetWord) => { const similarity = (0, import_string_similarity_js.stringSimilarity)( docWord.toLowerCase(), targetWord.toLowerCase() ); if (similarity >= threshold) { foundTargetWords.push(targetWord); } }); }); return foundTargetWords; } var defaultDocumentDictionary = { Milit\u00E6rbevis: [ ["f\xF8rstegangstjeneste", "bevis", "avtjent"], ["attest", "f\xF8rstegangstjeneste"], ["fullf\xF8rt", "f\xF8rstegangstjeneste"] ], Politiattest: [["politiattest", "politidistrikt"], ["police certificate"]], Kompetansebevis: [["omfatter", "oppl\xE6ring", "utdanningsprogram"]], Legeerkl\u00E6ring: [["legeerkl\xE6ring", "f\xF8dselsnummer"]], Bostedsattest: [ ["registrerte", "opplysninger", "folkeregisteret"], ["bostedsattest", "bostedsadresse", "registrert"], ["registrert", "adressehistorikk", "folkeregisteret"] ] }; async function classifyDocument(file, options = {}) { const { onProgress, customDocumentDictionary, maxNumPages = Infinity // Default value if not provided } = options; const documentDictionary = { ...defaultDocumentDictionary, ...customDocumentDictionary }; let BlobArray = []; if (file.type === "application/pdf") { const pdfPath = URL.createObjectURL(file); BlobArray = await convert(pdfPath, maxNumPages); console.log("Blob to read set by convert function"); } else if (file.type.startsWith("image/")) { BlobArray.push(file); } else { throw new Error("Unsupported file type. Please provide a PDF or an image."); } let progress = 0; const totalBlobs = Math.min(BlobArray.length, maxNumPages); let fullText = ""; const targetWords = Array.from( new Set(Object.values(documentDictionary).flat(2)) ); try { for (let i = 0; i < totalBlobs; i++) { const text = await ocrBlob(BlobArray[i]); fullText += text; progress = Math.round((i + 1) / totalBlobs * 100); if (onProgress) { onProgress(progress); } const targetWordsFound = findTargetWords(text, targetWords); const classification = determineClassification( targetWordsFound, documentDictionary ); if (classification !== "Ukjent") { return { classification, text: fullText }; } } } catch (error) { console.log("Error in ocr process: ", error); } return { classification: "Ukjent", text: fullText }; } function determineClassification(targetWordsFound, documentDictionary) { for (const [classification, targetWordSets] of Object.entries( documentDictionary )) { for (const targetWords of targetWordSets) { if (targetWords.every((word) => targetWordsFound.includes(word))) { return classification; } } } return "Ukjent"; } async function ocrBlob(blob, onProgress) { const reader = new FileReader(); const worker = await Tesseract.createWorker(["nor", "eng"], 1, { logger: (m) => { if (m.status === "recognizing text" && onProgress) { onProgress(m.progress * 100); } } }); await worker.setParameters({ tessedit_pageseg_mode: Tesseract.PSM.AUTO_OSD }); return new Promise((resolve, reject) => { reader.onload = async () => { const imageDataUrl = reader.result; try { const { data: { text } } = await worker.recognize(imageDataUrl, { rotateAuto: true }); resolve(text); } catch (err) { console.error(err); reject(err); } }; reader.readAsDataURL(blob); }); } // Annotate the CommonJS export names for ESM import in node: 0 && (module.exports = { classifyDocument }); //# sourceMappingURL=index.cjs.map