# Integrating Prysm into Your Project

Prysm is a powerful, structure-aware web scraper that you can integrate into your projects in multiple ways. This guide explains how to use Prysm in your own applications.

## Installation

First, add Prysm to your project:

```bash
# Clone the repository
git clone https://github.com/pinkpixel-dev/prysm.git

# Install dependencies
cd prysm/scraper
npm install
```

## Integration Methods

There are three main ways to use Prysm in your projects:

### 1. Direct Module Import

Import Prysm directly into your Node.js code:

```javascript
const prysm = require('./path/to/prysm/scraper/main_scraper');

/**
 * Scrapes https://example.com with the directly imported Prysm module and
 * prints a short summary of what came back.
 *
 * @returns {Promise<object|undefined>} The scrape result, or `undefined` when
 *   scraping failed (the error is logged instead of being rethrown).
 */
async function scrapeWebsite() {
  const scrapeOptions = {
    maxScrolls: 5,
    bypassCloudflare: true,
    handlePagination: true,
    headless: true,
    // Smart Scan options
    skipAnalysis: false, // Enable Smart Scan (default)
    focused: false, // Use standard mode (default)
    article: false // Use auto-detection (default)
  };

  let page;
  try {
    page = await prysm('https://example.com', scrapeOptions);

    const { title, content, structureType } = page;
    console.log(`Title: ${title}`);
    console.log(`Content items: ${content.length}`);
    console.log(`Structure type: ${structureType}`);

    // Process the scraped data
    // ...
  } catch (error) {
    console.error('Scraping error:', error);
    return undefined;
  }

  return page;
}

scrapeWebsite();
```

### 1a. NPM Package

You can also install Prysm directly from npm:

```bash
npm install @pinkpixel/prysm
```

Then import and use it in your code:

```javascript
const { scrape } = require('@pinkpixel/prysm');

/**
 * Scrapes https://example.com through the npm package's `scrape` export,
 * using focused mode with article extraction forced on.
 *
 * @returns {Promise<object|undefined>} The scrape result; `undefined` if the
 *   scrape threw, in which case the error is written to the console.
 */
async function scrapeWebsite() {
  const targetUrl = 'https://example.com';

  try {
    const scraped = await scrape(targetUrl, {
      maxScrolls: 5,
      bypassCloudflare: true,
      handlePagination: true,
      // Smart Scan options
      skipAnalysis: false, // Enable Smart Scan (default)
      focused: true, // Speed-optimized mode
      article: true // Force article mode
    });

    // Short summary of the scrape
    console.log(`Title: ${scraped.title}`);
    console.log(`Content items: ${scraped.content.length}`);
    console.log(`Structure type: ${scraped.structureType}`);

    return scraped;
  } catch (error) {
    console.error('Scraping error:', error);
    return undefined;
  }
}

scrapeWebsite();
```

### 2. CLI Integration (via shell execution)

Execute Prysm's CLI from your application:

```javascript
const { exec } = require('child_process');
const path = require('path');

function scrapeUrl(url, options = {}) {
  return new Promise((resolve, reject) => {
    const prysmPath = path.join(__dirname, 'path/to/prysm/scraper');

    // Build command with options
    let cmd = `cd ${prysmPath} && npm run start:cli "${url}"`;

    // Add options
    if (options.maxScrolls) cmd += ` --maxScrolls ${options.maxScrolls}`;
    if (options.noHeadless) cmd += ` --noHeadless`;
    if (options.output) cmd += ` --output "${options.output}"`;
    
    // Smart Scan options
    if (options.skipAnalysis) cmd += ` --skipAnalysis`;
    if (options.analyze) cmd += ` --analyze`;
    if (options.focused) cmd += ` --focused`;
    if (options.deep) cmd += ` --deep`;
    if (options.article) cmd += ` --article`;
    if (options.product) cmd += ` --product`;
    if (options.listing) cmd += ` --listing`;

    // Execute the command
    exec(cmd, (error, stdout, stderr) => {
      if (error) {
        reject(error);
        return;
      }

      // Parse the output to get the result file path
      const resultPathMatch = stdout.match(/Full results saved to:\s*([^\s]+)/);
      if (resultPathMatch && resultPathMatch[1]) {
        const resultPath = resultPathMatch[1];

        // Load the results JSON file
        const results = require(resultPath);
        resolve(results);
      } else {
        reject(new Error('Could not find result path in output'));
      }
    });
  });
}

// Usage: scrape one page in focused article mode and print the outcome.
(async () => {
  const cliOptions = {
    maxScrolls: 10,
    focused: true,  // Speed-optimized mode
    article: true   // Force article extraction
  };

  try {
    const results = await scrapeUrl('https://example.com', cliOptions);
    console.log(results);
  } catch (err) {
    console.error(err);
  }
})();
```

### 3. API Integration (RESTful)

Start the API server and communicate with it via HTTP:

```javascript
const axios = require('axios');
const { spawn } = require('child_process');
const path = require('path');

/**
 * Small client for the Prysm REST API that can also launch and stop a local
 * API server process.
 */
class PrysmAPI {
  /**
   * @param {object} [options]
   * @param {string} [options.baseUrl] - API base URL; defaults to
   *   http://localhost:3001 and is replaced by the URL the server announces
   *   when it is started through startServer().
   * @param {number} [options.startTimeout] - Milliseconds startServer() waits
   *   for the server to announce itself (default 10000).
   */
  constructor(options = {}) {
    this.baseUrl = options.baseUrl || 'http://localhost:3001';
    this.startTimeout = options.startTimeout || 10000;
    this.apiProcess = null;
  }

  /**
   * Launches `npm run start:api` in the scraper directory and waits until the
   * server prints its "Prysm API running at" line.
   *
   * @returns {Promise<string>} The base URL announced by the server.
   * @throws {Error} If the process cannot be spawned, exits before it is
   *   ready, or does not announce itself within the start-up timeout. Any
   *   stderr output collected so far is appended to the message.
   */
  async startServer() {
    return new Promise((resolve, reject) => {
      const prysmPath = path.join(__dirname, 'path/to/prysm/scraper');

      const child = spawn('npm', ['run', 'start:api'], {
        cwd: prysmPath,
        stdio: ['ignore', 'pipe', 'pipe']
      });
      this.apiProcess = child;

      let settled = false;
      let output = '';
      let errorOutput = '';

      const succeed = (url) => {
        if (settled) return;
        settled = true;
        clearTimeout(timer);
        resolve(url);
      };

      const fail = (message) => {
        if (settled) return;
        settled = true;
        clearTimeout(timer);
        // Do not leave a half-started server behind.
        child.kill();
        if (this.apiProcess === child) this.apiProcess = null;
        reject(new Error(errorOutput ? `${message}\n${errorOutput}` : message));
      };

      // Timeout if server doesn't start
      const timer = setTimeout(() => {
        fail('API server failed to start within timeout');
      }, this.startTimeout);

      // Wait for server to start. The listener stays attached afterwards so
      // the pipe keeps draining, but buffering stops once the result is known.
      child.stdout.on('data', (data) => {
        if (settled) return;
        output += data.toString();
        if (output.includes('Prysm API running at')) {
          const match = output.match(/http:\/\/localhost:(\d+)/);
          if (match) {
            this.baseUrl = `http://localhost:${match[1]}`;
            succeed(this.baseUrl);
          }
        }
      });

      // Output on stderr is not necessarily fatal (npm prints warnings
      // there), so it is only collected for the error message.
      child.stderr.on('data', (data) => {
        if (!settled) errorOutput += data.toString();
      });

      child.on('error', (error) => {
        fail(`Server error: ${error.message}`);
      });

      child.on('exit', (code) => {
        fail(`API server exited before it was ready (code ${code})`);
      });
    });
  }

  /** Kills the server process started by startServer(), if there is one. */
  async stopServer() {
    if (this.apiProcess) {
      this.apiProcess.kill();
      this.apiProcess = null;
    }
  }

  /**
   * Submits a scrape job.
   *
   * @param {string} url - Page to scrape.
   * @param {object} [options] - Scraper options sent along with the job.
   * @returns {Promise<object>} The response body of POST /api/jobs.
   */
  async createJob(url, options = {}) {
    const response = await axios.post(`${this.baseUrl}/api/jobs`, {
      url,
      options
    });
    return response.data;
  }

  /**
   * @param {string} jobId
   * @returns {Promise<object>} The response body of GET /api/jobs/:jobId.
   */
  async getJobStatus(jobId) {
    const response = await axios.get(`${this.baseUrl}/api/jobs/${jobId}`);
    return response.data;
  }

  /**
   * @param {string} jobId
   * @returns {Promise<*>} The `result` field of GET /api/jobs/:jobId/results.
   */
  async getJobResults(jobId) {
    const response = await axios.get(`${this.baseUrl}/api/jobs/${jobId}/results`);
    return response.data.result;
  }

  /**
   * Polls a job until it reports `completed` or `failed`.
   *
   * @param {string} jobId
   * @param {number} [interval=2000] - Milliseconds between status checks.
   * @param {number} [timeout=300000] - Milliseconds before giving up.
   * @returns {Promise<*>} The job results once the job has completed.
   * @throws {Error} "Job failed: ..." if the job fails, "Job timed out" if
   *   the timeout elapses first.
   */
  async pollUntilComplete(jobId, interval = 2000, timeout = 300000) {
    const startTime = Date.now();

    while (Date.now() - startTime < timeout) {
      const status = await this.getJobStatus(jobId);

      if (status.status === 'completed') {
        return await this.getJobResults(jobId);
      } else if (status.status === 'failed') {
        throw new Error(`Job failed: ${status.error}`);
      }

      // Wait before checking again
      await new Promise(resolve => setTimeout(resolve, interval));
    }

    throw new Error('Job timed out');
  }
}

// Usage
/**
 * End-to-end example: start the API server, submit one job for
 * https://example.com, wait for it to finish and stop the server again.
 *
 * @returns {Promise<*>} The job results, or `undefined` if any step failed
 *   (the error is logged). The server is stopped in either case.
 */
async function scrapeWithAPI() {
  const api = new PrysmAPI();

  const jobOptions = {
    maxScrolls: 5,
    bypassCloudflare: true,
    // Smart Scan options
    skipAnalysis: false,  // Use Smart Scan (default)
    focused: true,        // Speed-optimized mode
    product: true         // Force product page extraction
  };

  try {
    await api.startServer();
    console.log('API server started');

    const { jobId } = await api.createJob('https://example.com', jobOptions);
    console.log(`Job created: ${jobId}`);

    // Block until the job has either completed or failed
    const scraped = await api.pollUntilComplete(jobId);
    console.log('Scraping results:', scraped);

    return scraped;
  } catch (error) {
    console.error('API error:', error);
  } finally {
    await api.stopServer();
  }
}

scrapeWithAPI();
```

## Practical Examples

### Example 1: Scrape Content and Save to Database

```javascript
const mongoose = require('mongoose');
const prysm = require('./path/to/prysm/scraper/main_scraper');

// Define a model for scraped content.
// One document is stored per scraped page; scrapeAndStore() below fills in
// every field from the scrape result and the URL it was given.
const Article = mongoose.model('Article', {
  title: String,         // result.title
  content: [String],     // result.content, stored as an array of strings
  url: String,           // the URL passed to scrapeAndStore()
  scrapedAt: Date,       // set to the current time when the article is created
  structureType: String  // result.structureType
});

/**
 * Scrapes a URL with Prysm and saves the result as an Article document.
 *
 * @param {string} url - Page to scrape.
 * @returns {Promise<object|undefined>} The saved article, or `undefined` when
 *   scraping or saving failed (the error is logged).
 */
async function scrapeAndStore(url) {
  // Connect to MongoDB
  await mongoose.connect('mongodb://localhost:27017/scraped_content');

  try {
    // Scrape the URL
    const result = await prysm(url, {
      maxScrolls: 10,
      bypassCloudflare: true
    });

    // Create a new article
    const article = new Article({
      title: result.title,
      content: result.content,
      url: url,
      scrapedAt: new Date(),
      structureType: result.structureType
    });

    // Save to database
    await article.save();
    console.log(`Saved article: ${result.title}`);

    return article;
  } catch (error) {
    console.error('Error:', error);
  } finally {
    // Wait for the connection to close so the returned promise only settles
    // once cleanup is done and a disconnect failure is not left unhandled.
    await mongoose.disconnect();
  }
}

// Usage
scrapeAndStore('https://example.com/article');
```

### Example 2: Bulk Scraping with Prysm API

```javascript
const fs = require('fs').promises;
const axios = require('axios');

/**
 * Builds a file-system-safe name for a scraped page. The path and query are
 * included so that several pages from the same host get different names.
 */
function fileNameForUrl(pageUrl) {
  const { hostname, pathname, search } = new URL(pageUrl);
  return `${hostname}${pathname}${search}`
    .replace(/[^a-z0-9]+/gi, '_')
    .replace(/^_+|_+$/g, '');
}

/**
 * Scrapes a list of URLs through the Prysm API and writes one JSON file per
 * page into `outDir`.
 *
 * @param {string[]} urlList - Pages to scrape.
 * @param {string} outDir - Directory for the result files; created if missing.
 * @returns {Promise<Array<{url: string, data: *}>>} One entry per job that
 *   completed. Jobs that fail are logged and skipped.
 */
async function bulkScrape(urlList, outDir) {
  // Start API server (using the PrysmAPI class from previous example)
  const prysm = new PrysmAPI();

  try {
    // Started inside the try block so the server is stopped even when
    // start-up itself fails part-way.
    await prysm.startServer();

    await fs.mkdir(outDir, { recursive: true });

    // Create jobs for all URLs
    const jobs = [];
    for (const url of urlList) {
      const job = await prysm.createJob(url, {
        maxScrolls: 5,
        bypassCloudflare: true,
        handlePagination: true
      });
      // Remember the URL here instead of relying on the API response
      // echoing it back.
      jobs.push({ jobId: job.jobId, url });
      console.log(`Created job ${job.jobId} for ${url}`);
    }

    // Wait for all jobs to complete
    const results = [];
    for (const { jobId, url } of jobs) {
      try {
        const result = await prysm.pollUntilComplete(jobId);
        results.push({ url, data: result });

        // Save to file
        await fs.writeFile(
          `${outDir}/${fileNameForUrl(url)}.json`,
          JSON.stringify(result, null, 2)
        );

        console.log(`Completed: ${url}`);
      } catch (err) {
        console.error(`Failed to scrape ${url}:`, err.message);
      }
    }

    return results;
  } finally {
    await prysm.stopServer();
  }
}

// Usage: scrape three pages and report how many of them succeeded.
const urlsToScrape = [
  'https://example.com/page1',
  'https://example.com/page2',
  'https://othersite.com/article'
];

(async () => {
  try {
    const results = await bulkScrape(urlsToScrape, './scraped-data');
    console.log(`Scraped ${results.length} pages`);
  } catch (err) {
    console.error(err);
  }
})();
```

## Handling Scraped Data

Prysm's output format is consistent regardless of how you use it:

```javascript
{
  title: "Page Title",
  content: ["Paragraph 1", "Paragraph 2", ...],
  metadata: { /* page metadata */ },
  structureType: "article", // or "recipe", "product", etc.
  paginationType: "infinite", // pagination type detected
  extractionMethod: "ai", // extraction method used
  url: "https://example.com/page"
}
```

## Tips for Production Use

1. **Error Handling**: Always wrap Prysm calls in try/catch blocks
2. **Resource Management**: Consider adding timeouts and resource limits
3. **Caching**: Implement caching to avoid re-scraping the same URLs
4. **Rate Limiting**: Add delays between requests to avoid overloading target sites
5. **Respect robots.txt**: Check robots.txt before scraping
6. **Proxy Rotation**: For large-scale scraping, rotate proxies to avoid IP blocks

## MCP Integration and Environment Variables

Prysm-LLM is optimized for integration with Large Language Models through the Model Context Protocol (MCP). To configure the output directories when using it with MCP or other automated systems, you can use environment variables.

### Environment Variables

Two environment variables can be used to configure output paths:

- `PRYSM_OUTPUT_DIR` - Sets the main output directory for results (default: ~/prysm/output)
- `PRYSM_IMAGE_OUTPUT_DIR` - Sets the output directory for downloaded images (default: ~/prysm/output/images)

These variables allow you to configure paths without command line arguments, which is especially useful for MCP integration where you may not have direct control over command execution.

### Setting Environment Variables

```javascript
// In Node.js, before importing the scraper
process.env.PRYSM_OUTPUT_DIR = '/custom/path/to/results';
process.env.PRYSM_IMAGE_OUTPUT_DIR = '/custom/path/to/images';

// Then import and use the scraper
const { scrape } = require('@pinkpixel/prysm-llm');
```

In a shell environment:
```bash
export PRYSM_OUTPUT_DIR="/custom/path/to/results"
export PRYSM_IMAGE_OUTPUT_DIR="/custom/path/to/images"
```

### MCP Configuration Example

When configuring Prysm-LLM for use with MCP, it's important to set these environment variables in your integration code:

```javascript
// MCP integration example

// Configure environment variables for MCP.
// They are assigned before the scraper is imported, matching the
// "Setting Environment Variables" section of this guide.
process.env.PRYSM_OUTPUT_DIR = '/tmp/mcp-results';
process.env.PRYSM_IMAGE_OUTPUT_DIR = '/tmp/mcp-images';

// Then import the scraper
const { scrape } = require('@pinkpixel/prysm-llm');

/**
 * Example MCP function: scrape with the smallest scroll budget of the three
 * presets (maxScrolls 5, scrollDelay 1000).
 *
 * @param {string} url - Page to scrape.
 * @param {object} [options] - Extra scraper options; these override the preset.
 * @returns {Promise<*>} Whatever `scrape` resolves with.
 */
async function scrapeFocused(url, options = {}) {
  const preset = { maxScrolls: 5, scrollDelay: 1000 };
  return scrape(url, { ...preset, ...options });
}

/**
 * Example function for MCP integration: the middle preset
 * (maxScrolls 10, scrollDelay 2000).
 *
 * @param {string} url - Page to scrape.
 * @param {object} [options] - Extra scraper options; these override the preset.
 * @returns {Promise<*>} Whatever `scrape` resolves with.
 */
async function scrapeBalanced(url, options = {}) {
  const merged = Object.assign({ maxScrolls: 10, scrollDelay: 2000 }, options);
  return scrape(url, merged);
}

/**
 * Example function for MCP integration: the largest preset
 * (maxScrolls 20, scrollDelay 3000).
 *
 * @param {string} url - Page to scrape.
 * @param {object} [options] - Extra scraper options; these override the preset.
 * @returns {Promise<*>} Whatever `scrape` resolves with.
 */
async function scrapeDeep(url, options = {}) {
  const scrapeOptions = {
    maxScrolls: 20,
    scrollDelay: 3000,
    ...options
  };

  return scrape(url, scrapeOptions);
}
```

### Important Notes for MCP Integration

1. **Output Paths**: Ensure that the MCP environment has write permissions to the specified output directories.
2. **Minimal Output**: Prysm-LLM is designed to minimize console output for clean LLM integration.
3. **Error Handling**: In MCP environments, errors are returned as part of the result object rather than being logged.
4. **Headless Mode**: Always use headless mode (default) in MCP environments.

By following this guide, you should be able to integrate Prysm into your projects effectively and leverage its powerful scraping capabilities.

---

✨ Dream it, Pixel it | Made with ❤️ by Pink Pixel 