var __getOwnPropNames = Object.getOwnPropertyNames;

// esbuild-style bundle helper: lazily run a CommonJS module factory on first
// require and cache its exports. `cb` maps the module path to its factory.
var __commonJS = (cb, mod) => function __require() {
  return mod || (0, cb[__getOwnPropNames(cb)[0]])((mod = { exports: {} }).exports, mod), mod.exports;
};

// src/utils.js
var require_utils = __commonJS({
  "src/utils.js"(exports2, module2) {
    /**
     * Convert a glob-like pattern ("**" matches any run of characters) into a
     * case-insensitive RegExp.
     *
     * FIX: escape regex metacharacters FIRST, then expand the (now-escaped)
     * "\*\*" wildcard into ".*". The original replaced "**" with ".*" before
     * escaping, so the inserted ".*" was immediately re-escaped into the
     * literal text "\.\*" and wildcards never matched.
     *
     * @param {string} pattern - glob-like pattern, e.g. "https://example.com/**"
     * @returns {RegExp}
     */
    function patternToRegex(pattern) {
      const escaped = pattern.replace(/[.*+?^$()|[\]\\]/g, "\\$&");
      return new RegExp(escaped.replace(/\\\*\\\*/g, ".*"), "i");
    }

    /**
     * Strip the hash and query string and any trailing slashes so the same
     * page is not crawled twice under cosmetically different URLs.
     * Falls back to the raw input when it is not a parseable URL.
     *
     * @param {string} url
     * @returns {string}
     */
    function normalizeUrl(url) {
      try {
        const urlObj = new URL(url);
        urlObj.hash = "";
        urlObj.search = "";
        return urlObj.toString().replace(/\/+$/, "");
      } catch (error) {
        console.error("Error normalizing URL:", error);
        return url;
      }
    }

    /**
     * True when `url` matches any of the glob-like exclude patterns.
     *
     * FIX: reuses patternToRegex instead of duplicating the (previously
     * buggy, escape-after-expand) inline regex construction.
     *
     * @param {string} url
     * @param {string[]} excludePatterns
     * @returns {boolean}
     */
    function shouldExcludeUrl(url, excludePatterns) {
      return excludePatterns.some((pattern) => patternToRegex(pattern).test(url));
    }

    module2.exports = { patternToRegex, normalizeUrl, shouldExcludeUrl };
  }
});

// src/crawler.js
var require_crawler = __commonJS({
  "src/crawler.js"(exports2, module2) {
    var { chromium } = require("playwright");
    var chalk = require("chalk");
    var { createWriteStream } = require("fs");
    var { execSync } = require("child_process");
    var { Transform } = require("stream");
    var { normalizeUrl, shouldExcludeUrl, patternToRegex } = require_utils();

    // Object-mode Transform that serializes crawl-result objects into a
    // streamed JSON array: "[\n {..},\n {..}\n]".
    var ResultsTransform = class extends Transform {
      constructor(options = {}) {
        options.objectMode = true;
        super(options);
        this.isFirst = true;
        this.resultsCount = 0;
      }

      _transform(chunk, encoding, callback) {
        try {
          let data = "";
          if (this.isFirst) {
            data = "[\n";
            this.isFirst = false;
          } else {
            data = ",\n";
          }
          data += JSON.stringify(chunk, null, 2);
          this.resultsCount++;
          callback(null, data);
        } catch (error) {
          callback(error);
        }
      }

      _flush(callback) {
        // FIX: with zero results the original emitted only "\n]", which is
        // not valid JSON. Open the array here so an empty crawl yields "[\n]".
        if (this.isFirst) {
          this.push("[");
          this.isFirst = false;
        }
        this.push("\n]");
        callback();
      }

      getResultsCount() {
        return this.resultsCount;
      }
    };

    /**
     * Install the Playwright Chromium browser via `npx playwright install`.
     * Reports success/failure through the optional node-style callback
     * (`callback(null, installed)`); never throws.
     */
    function installBrowser(callback) {
      console.log(chalk.blue("Installing Playwright browser..."));
      try {
        execSync("npx playwright install chromium", { stdio: "inherit" });
        console.log(chalk.green("Browser installed successfully"));
        callback?.(null, true);
      } catch (error) {
        console.error(chalk.red("Failed to install browser automatically"));
        console.error(chalk.red("\nPlease run this command manually:"));
        console.error(chalk.blue("\n npx playwright install chromium\n"));
        callback?.(null, false);
      }
    }

    /**
     * Crawl a website breadth/depth-limited per `config` and stream the
     * extracted text content of each page to `config.outputFile` as JSON.
     * Exits the process (code 1) on a fatal error; offers to install the
     * browser when Playwright's executable is missing.
     */
    async function crawlWebsite2(config) {
      console.log(chalk.blue("\nStarting crawler...\n"));
      const visitedUrls = /* @__PURE__ */ new Set();
      let pagesVisited = 0;
      const resultsTransform = new ResultsTransform();
      const outputStream = createWriteStream(config.outputFile);
      resultsTransform.pipe(outputStream);

      // Retry `operation` up to maxRetries times with linear backoff
      // (1s, 2s, ...); rethrows the last error when all attempts fail.
      async function retryOperation(operation, maxRetries = config.maxRetries) {
        let lastError;
        for (let attempt = 1; attempt <= maxRetries; attempt++) {
          try {
            return await operation();
          } catch (error) {
            lastError = error;
            if (attempt < maxRetries) {
              console.log(chalk.yellow(`Retry attempt ${attempt}/${maxRetries}`));
              await new Promise((resolve) => setTimeout(resolve, 1e3 * attempt));
            }
          }
        }
        throw lastError;
      }

      // Visit one page, record its content, then recurse into matching links.
      // Shared `page` is reused for the whole crawl (sequential recursion).
      async function crawlPage(browser, context, page, url, currentLevel = 0) {
        // FIX: normalizeUrl is synchronous; the original pointlessly awaited it.
        const normalizedUrl = normalizeUrl(url);
        if (visitedUrls.has(normalizedUrl) || pagesVisited >= config.maxPages || currentLevel >= config.maxLevels || shouldExcludeUrl(normalizedUrl, config.excludePatterns)) {
          return;
        }
        console.log(chalk.blue(`\u2192 Starting: ${normalizedUrl} (Level: ${currentLevel})`));
        try {
          await retryOperation(async () => {
            await page.goto(url, { waitUntil: "networkidle", timeout: config.timeout });
          });
          // Hide ignored elements so they don't contribute to innerText.
          if (config.ignoreSelectors && config.ignoreSelectors.length > 0) {
            await page.evaluate((selectors) => {
              selectors.forEach((selector) => {
                document.querySelectorAll(selector).forEach((element) => {
                  element.style.display = "none";
                });
              });
            }, config.ignoreSelectors);
          }
          visitedUrls.add(normalizedUrl);
          pagesVisited++;
          const content = await page.$$eval(
            config.selector,
            (elements, ignoreSelectors) => {
              // An element is ignored when it, or any ancestor, matches one
              // of the ignore selectors.
              const shouldIgnore = (element) => {
                let current = element;
                while (current) {
                  if (ignoreSelectors.some((selector) => current.matches && current.matches(selector))) {
                    return true;
                  }
                  current = current.parentElement;
                }
                return false;
              };
              return elements.filter((el) => !shouldIgnore(el)).map((el) => el.innerText);
            },
            config.ignoreSelectors || []
          );
          resultsTransform.write({
            url: normalizedUrl,
            content: content.join("\n"),
            timestamp: (/* @__PURE__ */ new Date()).toISOString(),
            level: currentLevel
          });
          console.log(chalk.green(`\u2713 Saved: ${normalizedUrl}`));
          if (currentLevel < config.maxLevels) {
            // FIX: build the link-match regex with patternToRegex so every
            // "**" is expanded (String.replace("**", ...) only handled the
            // first occurrence) and other metacharacters (e.g. "." in the
            // domain) are escaped; test case-insensitively like the rest of
            // the pattern handling.
            const matchSource = patternToRegex(config.match).source;
            const links = await page.$$eval(
              "a[href]",
              (elements, pattern) => elements.map((el) => el.href).filter((href) => new RegExp(pattern, "i").test(href)),
              matchSource
            );
            for (const link of links) {
              if (pagesVisited >= config.maxPages) break;
              await crawlPage(browser, context, page, link, currentLevel + 1);
            }
          }
        } catch (error) {
          console.log(chalk.red(`\u2717 Error: ${normalizedUrl} - ${error.message}`));
        }
      }

      try {
        const browser = await chromium.launch({ headless: true });
        const context = await browser.newContext({
          viewport: { width: 1920, height: 1080 },
          userAgent: "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
        });
        const page = await context.newPage();
        await crawlPage(browser, context, page, config.url, 0);
        await browser.close();
        resultsTransform.end();
        // Wait for the output file to be fully flushed before reporting.
        await new Promise((resolve) => {
          outputStream.on("finish", () => {
            console.log(chalk.green(`\n\u2713 Crawling complete!\nProcessed ${pagesVisited} pages`));
            console.log(chalk.blue(`\u2713 Results saved to ${config.outputFile}\n`));
            resolve();
          });
        });
      } catch (error) {
        if (error.message.includes("Executable doesn't exist")) {
          console.error(chalk.red("\nBrowser installation required"));
          installBrowser((err, installed) => {
            if (installed) {
              console.log(chalk.blue("\nPlease try running the crawler again."));
            }
          });
        } else {
          console.error(chalk.red("\nCrawling failed:", error.message, "\n"));
        }
        process.exit(1);
      }
    }

    module2.exports = { crawlWebsite: crawlWebsite2, installBrowser };
  }
});

// src/config.js
var require_config = __commonJS({
  "src/config.js"(exports2, module2) {
    var inquirer = require("inquirer");
    var { writeFileSync } = require("fs");
    var chalk = require("chalk");

    var defaultConfig = {
      url: "",
      match: "",
      selector: "",
      maxPages: 50,
      outputFile: "crawltojson.output.json",
      maxRetries: 3,
      maxLevels: 3,
      timeout: 7e3,
      excludePatterns: [
        "**/tag/**",      // Ignore tag pages
        "**/tags/**",     // Ignore tags pages
        "**/#*",          // Ignore anchor links
        "**/search**",    // Ignore search pages
        "**.mp4",         // Ignore mp4 files
        "**/archive/**"   // Ignore archive pages
      ],
      ignoreSelectors: [
        "header",
        "footer",
        "#scrollspy",
        ".hs-cookie-notification-position-bottom", // Cookie notifications
        ".cookie-banner",                          // Cookie banners
        ".popup-overlay",                          // Popups
        ".newsletter-signup",                      // Newsletter popups
        ".chat-widget",                            // Chat widgets
        ".advertisement",                          // Ads
        "#hubspot-messages-iframe-container",      // HubSpot chat
        ".intercom-lightweight-app"                // Intercom chat
      ]
    };

    /**
     * Interactively prompt for crawl settings and write them (merged over
     * defaultConfig) to ./crawltojson.config.json. Exits the process (code 1)
     * when the file cannot be written.
     */
    async function generateConfig2() {
      console.log(chalk.blue("Creating crawltojson configuration file..."));
      const answers = await inquirer.prompt([
        {
          type: "input",
          name: "url",
          message: "What is the starting URL to crawl?",
          validate: (input) => input.startsWith("http") || "Please enter a valid URL"
        },
        {
          type: "input",
          name: "match",
          message: "What URL pattern should be matched? \n(e.g., https://example.com/**)",
          default: (answers2) => {
            const baseUrl = answers2.url.replace(/\/+$/, "");
            return `${baseUrl}/**`;
          }
        },
        {
          type: "input",
          name: "selector",
          message: "What CSS selector should be used to extract content?",
          default: "body"
        },
        {
          type: "number",
          name: "maxPages",
          message: "Maximum number of pages to crawl?",
          default: 50
        },
        {
          type: "input",
          name: "ignoreSelectors",
          message: "Additional selectors to ignore (comma-separated)",
          default: "",
          // Extra selectors are appended to the defaults, never replacing them.
          filter: (input) => input ? defaultConfig.ignoreSelectors.concat(
            input.split(",").map((s) => s.trim()).filter(Boolean)
          ) : defaultConfig.ignoreSelectors
        }
      ]);
      const config = { ...defaultConfig, ...answers };
      try {
        writeFileSync("./crawltojson.config.json", JSON.stringify(config, null, 2));
        console.log(chalk.green("Configuration file created successfully!"));
      } catch (error) {
        console.error(chalk.red("Error creating configuration file:", error.message));
        process.exit(1);
      }
    }

    module2.exports = { generateConfig: generateConfig2 };
  }
});

// src/index.js
var { crawlWebsite } = require_crawler();
var { generateConfig } = require_config();
module.exports = { crawlWebsite, generateConfig };