/**
 * Content extraction step for the research pipeline
 * Extracts content from URLs found in search results
 */
import { createStep } from '../utils/steps.js';
import { StepOptions } from '../types/pipeline.js';
/**
 * Configuration accepted by the content extraction step.
 *
 * Every field declared here is optional; this declaration does not state
 * which value applies when a field is left out.
 */
export interface ExtractContentOptions extends StepOptions {
    // --- What to extract ---
    /** CSS selectors identifying the content to extract */
    selectors?: string;
    /** Backwards-compatible alias for `selectors` */
    selector?: string;

    // --- Limits ---
    /** Upper bound on the number of URLs to process */
    maxUrls?: number;
    /** Shortest content length that counts as a successful extraction */
    minContentLength?: number;
    /** Longest content length allowed per URL, in characters */
    maxContentLength?: number;

    // --- Fetching ---
    /** Per-URL fetch timeout, in milliseconds */
    timeout?: number;
    /** Retry settings for fetches */
    retry?: {
        /** Base delay between retries, in milliseconds */
        baseDelay?: number;
        /** Upper bound on the number of retries */
        maxRetries?: number;
    };

    // --- Failure policy ---
    /** Whether processing continues when some URLs fail to extract */
    continueOnError?: boolean;
    /** Whether at least one successful extraction is required */
    requireSuccessful?: boolean;

    // --- Output ---
    /** Whether the extracted content is included in the final results */
    includeInResults?: boolean;
}
/**
 * Metadata recorded alongside a piece of extracted content.
 *
 * `wordCount`, `domain`, `statusCode` and `extractedAt` are required;
 * the remaining fields are optional.
 */
export interface ExtractedContentMetadata {
    /** Domain of the website the content came from */
    domain: string;
    /** HTTP status code returned for the response */
    statusCode: number;
    /** MIME type of the content */
    contentType?: string;
    /** Approximate number of words in the content */
    wordCount: number;
    /** Selectors that matched and were used for the extraction */
    matchedSelectors?: string[];
    /** Whether the extraction was complete rather than partial */
    isComplete?: boolean;
    /** Timestamp of the extraction */
    extractedAt: string;
    /** Time the extraction took, in milliseconds */
    extractionTimeMs?: number;
    /** How many retry attempts were made */
    retryAttempts?: number;
}
/**
 * A single piece of extracted content.
 *
 * `url`, `title` and `content` are always present; `extractionDate` and
 * `metadata` are optional.
 */
export interface ExtractedContent {
    /** URL the content was extracted from */
    url: string;
    /** Title of the content */
    title: string;
    /** Extracted text content */
    content: string;
    /** Date of the extraction */
    extractionDate?: string;
    /** Further metadata describing the extraction */
    metadata?: ExtractedContentMetadata;
}
/**
 * Creates a content extraction step for the research pipeline.
 *
 * This is a declaration only (`declare`): it describes the signature, and the
 * implementation is not part of this file.
 *
 * @param options Configuration options for content extraction. The argument
 * may be omitted; the behaviour in that case is not specified by this
 * declaration.
 * @returns A content extraction step for the research pipeline, typed as the
 * return type of `createStep`.
 */
export declare function extractContent(options?: ExtractContentOptions): ReturnType<typeof createStep>;