/**
 * Evaluation type definitions for NeuroLink
 * Provider performance tracking, evaluation configurations, and provider optimization types
 */
import type { LanguageModelV3CallOptions } from "@ai-sdk/provider";
import type { TokenUsage } from "./analytics.js";
import type { GenerateResult } from "./generate.js";
import type { ToolExecution } from "./tools.js";
import type { JsonObject } from "./common.js";
import type { AggregatedScores, EvaluationTraceContext, PipelineConfig, ReportConfig, ReportFormat, ScoreResult, ScorerInput } from "./scorer.js";
/**
 * Evaluation provider identifiers (as specified in the core module refactoring).
 * Only the string literals listed below are accepted.
 */
export type EvaluationProvider =
    | "openai"
    | "anthropic"
    | "vertex"
    | "google-ai"
    | "local";
/**
 * Evaluation modes.
 * `"disabled"` is itself one of the modes rather than being modelled as a separate flag.
 */
export type EvaluationMode =
    | "basic"
    | "detailed"
    | "domain-aware"
    | "disabled";
/**
 * Alert severity levels.
 * `"none"` is an explicit member, so a severity value is always present where this type is required.
 */
export type AlertSeverity =
    | "low"
    | "medium"
    | "high"
    | "none";
/**
 * Response quality evaluation scores - Comprehensive evaluation type.
 *
 * The required members are the four core scores, the off-topic / alert
 * verdict, the reasoning text and the evaluation model/time. Every other
 * member is optional enrichment. The numeric scale of the scores is not
 * fixed by this type.
 */
export type EvaluationData = {
    // --- Core scores (always present) ---
    relevance: number;
    accuracy: number;
    completeness: number;
    overall: number;
    // --- Optional additional scores ---
    domainAlignment?: number;
    terminologyAccuracy?: number;
    toolEffectiveness?: number;
    // --- Optional copies of the evaluated response / query text ---
    responseContent?: string;
    queryContent?: string;
    // --- Verdict and explanation ---
    isOffTopic: boolean;
    alertSeverity: AlertSeverity;
    reasoning: string;
    suggestedImprovements?: string;
    // --- Evaluation bookkeeping ---
    evaluationModel: string;
    /** NOTE(review): unit is not stated here; `EvaluationResult.evaluationTime` in this file is documented as milliseconds — confirm the same applies. */
    evaluationTime: number;
    evaluationDomain?: string;
    /** Typed as a plain `string`, not as the `EvaluationProvider` union declared in this file. */
    evaluationProvider?: string;
    evaluationAttempt?: number;
    /** Settings recorded for the evaluation run. `mode` is a plain `string`, not the `EvaluationMode` union. */
    evaluationConfig?: {
        mode: string;
        fallbackUsed: boolean;
        /** NOTE(review): currency/unit of the estimate is not visible in this file. */
        costEstimate: number;
    };
    /** Description of the domain; all members except `evaluationCriteria` are required once the object is supplied. */
    domainConfig?: {
        domainName: string;
        domainDescription: string;
        keyTerms: string[];
        failurePatterns: string[];
        successPatterns: string[];
        evaluationCriteria?: Record<string, unknown>;
    };
    /** Domain-specific scores and insights. Note that `terminologyAccuracy` also exists as an optional top-level member. */
    domainEvaluation?: {
        domainRelevance: number;
        terminologyAccuracy: number;
        domainExpertise: number;
        domainSpecificInsights: string[];
    };
};
/**
 * Enhanced evaluation context for comprehensive response assessment.
 *
 * Only `userQuery` and `aiResponse` are required; all remaining members are
 * optional supporting information.
 */
export type EvaluationContext = {
    /** The user's query text. */
    userQuery: string;
    /** The AI response text. */
    aiResponse: string;
    /** Free-form extra context; keys and values are not constrained by this type. */
    context?: Record<string, unknown>;
    primaryDomain?: string;
    assistantRole?: string;
    /** Earlier messages of the conversation; `timestamp` is optional and its string format is not fixed here. */
    conversationHistory?: {
        role: "user" | "assistant";
        content: string;
        timestamp?: string;
    }[];
    /** Tool invocations with their (untyped) input and output. */
    toolUsage?: {
        toolName: string;
        input: unknown;
        output: unknown;
        /** NOTE(review): unit is not stated in this file — presumably milliseconds; confirm. */
        executionTime: number;
    }[];
    expectedOutcome?: string;
    evaluationCriteria?: string[];
};
/**
 * Evaluation result type.
 * Extends `EvaluationData` with context-utilization details.
 *
 * `domainAlignment`, `terminologyAccuracy`, `toolEffectiveness`,
 * `isOffTopic`, `alertSeverity` and `reasoning` come from `EvaluationData`
 * with the same types and optionality, so they are intentionally not
 * redeclared here; the resulting type is unchanged.
 */
export type EnhancedEvaluationResult = EvaluationData & {
    /** Flags recording which kinds of context were used. */
    contextUtilization?: {
        conversationUsed: boolean;
        toolsUsed: boolean;
        domainKnowledgeUsed: boolean;
    };
    /** Summary of the context the evaluation was run against. */
    evaluationContext?: {
        domain: string;
        toolsEvaluated: string[];
        conversationTurns: number;
    };
};
/**
 * Evaluation request type as specified in core module refactoring.
 *
 * `content` and `criteria` are required; `context` and `domain` are
 * optional plain strings.
 */
export type EvaluationRequest = {
    /** The text to evaluate. */
    content: string;
    /** Optional context, supplied as a single string (not a structured object). */
    context?: string;
    domain?: string;
    /** Which dimensions to evaluate; see `EvaluationCriteria`. */
    criteria: EvaluationCriteria;
};
/**
 * Evaluation criteria type as specified in core module refactoring.
 *
 * A set of boolean flags, one per dimension. The three core flags are
 * required; `domainSpecific` is optional.
 * NOTE(review): presumably `true` enables the matching dimension — confirm
 * against the consumer of `EvaluationRequest`.
 */
export type EvaluationCriteria = {
    relevance: boolean;
    accuracy: boolean;
    completeness: boolean;
    domainSpecific?: boolean;
};
/**
 * Basic analysis of what the user is trying to achieve with a query.
 * All three members are required.
 */
export type QueryIntentAnalysis = {
    /** Kind of query, e.g. a question or a command; `"unknown"` is an explicit member. */
    type:
        | "question"
        | "command"
        | "greeting"
        | "unknown";
    /** Estimated complexity of the query. */
    complexity:
        | "low"
        | "medium"
        | "high";
    /** Whether answering the query correctly likely required tool use. */
    shouldHaveUsedTools: boolean;
};
/**
 * Represents a single turn in an enhanced conversation history,
 * including tool executions and evaluations for richer context.
 *
 * Unlike the inline history entries of `EvaluationContext`, `timestamp` is
 * required here.
 */
export type EnhancedConversationTurn = {
    /** The role of the speaker, either 'user' or 'assistant'. */
    role: "user" | "assistant";
    /** The content of the message. */
    content: string;
    /** The timestamp of the message, as a string; the format is not fixed by this type. */
    timestamp: string;
    /** Any tools that were executed as part of this turn. */
    toolExecutions?: ToolExecution[];
    /** The evaluation result for this turn, if applicable. */
    evaluation?: EvaluationResult;
};
/**
 * Contains all the rich context needed for a thorough, RAGAS-style evaluation.
 * This object is constructed by the `ContextBuilder` and used by the `RAGASEvaluator`.
 *
 * Every member except `previousEvaluations` is required; the members of
 * `generationParams` are individually optional.
 */
export type EnhancedEvaluationContext = {
    /** The original user query. */
    userQuery: string;
    /** An analysis of the user's query intent. */
    queryAnalysis: QueryIntentAnalysis;
    /** The AI's response that is being evaluated. */
    aiResponse: string;
    /** The AI provider that generated the response. */
    provider: string;
    /** The specific model that generated the response. */
    model: string;
    /** The parameters used for the generation call; the object is required but may be empty. */
    generationParams: {
        temperature?: number;
        maxTokens?: number;
        systemPrompt?: string;
    };
    /** A list of tools that were executed. */
    toolExecutions: ToolExecution[];
    /** The history of the conversation leading up to this turn. */
    conversationHistory: EnhancedConversationTurn[];
    /** The response time of the AI in milliseconds. */
    responseTime: number;
    /** The token usage for the generation. */
    tokenUsage: TokenUsage;
    /** The results of any previous evaluation attempts for this response. */
    previousEvaluations?: EvaluationResult[];
    /** The current attempt number for this evaluation (1-based). */
    attemptNumber: number;
};
/**
 * Represents the result of a single evaluation attempt, based on RAGAS principles.
 *
 * All members are required. Field names differ from `EvaluationData`
 * (`finalScore` / `relevanceScore` / ... here versus `overall` /
 * `relevance` / ... there), so the two shapes are not interchangeable.
 */
export type EvaluationResult = {
    /** The final, overall score for the response, typically from 1 to 10. */
    finalScore: number;
    /** How well the response addresses the user's query. */
    relevanceScore: number;
    /** The factual accuracy of the information in the response. */
    accuracyScore: number;
    /** How completely the response answers the user's query. */
    completenessScore: number;
    /** Whether the final score meets the passing threshold. */
    isPassing: boolean;
    /** Constructive response from the judge LLM on how to improve the response. */
    reasoning: string;
    /** Specific suggestions for improving the response. */
    suggestedImprovements: string;
    /** The raw, unparsed response from the judge LLM. */
    rawEvaluationResponse: string;
    /** The model used to perform the evaluation. */
    evaluationModel: string;
    /** The time taken for the evaluation in milliseconds. */
    evaluationTime: number;
    /** The attempt number for this evaluation. */
    attemptNumber: number;
};
/**
 * Provides detailed information when a response fails quality assurance checks.
 */
export type QualityErrorDetails = {
    /** The history of all evaluation attempts for this response. */
    evaluationHistory: EvaluationResult[];
    /** The final score of the last attempt. */
    finalScore: number;
    /**
     * The total number of evaluation attempts made.
     * NOTE(review): presumably equal to `evaluationHistory.length`; the type does not enforce this.
     */
    attempts: number;
    /** A summary message of the failure. */
    message: string;
};
/**
 * Configuration for the main `Evaluator` class.
 * Every member is optional.
 */
export type EvaluationConfig = {
    /** The minimum score (1-10) for a response to be considered passing. */
    threshold?: number;
    /**
     * The evaluation strategy to use.
     * NOTE(review): the previous comment said only 'ragas' is supported, but
     * the declared union also accepts 'custom' and a `customEvaluator` hook
     * exists below — confirm which statement is current.
     */
    evaluationStrategy?: "ragas" | "custom";
    /** The model to use for the LLM-as-judge evaluation. */
    evaluationModel?: string;
    /** The maximum number of evaluation attempts before failing. */
    maxAttempts?: number;
    /** The provider to use for the evaluation model. */
    provider?: string;
    /**
     * A custom evaluator function to override the default behavior.
     * Receives the call options and the generation result and resolves to
     * both the evaluation result and the context it was computed from.
     */
    customEvaluator?: (options: LanguageModelV3CallOptions, result: GenerateResult) => Promise<{
        evaluationResult: EvaluationResult;
        evalContext: EnhancedEvaluationContext;
    }>;
    /** The score below which a response is considered off-topic. */
    offTopicThreshold?: number;
    /** The score below which a failing response is considered a high severity alert. */
    highSeverityThreshold?: number;
    /** An optional function to generate custom evaluation prompts. */
    promptGenerator?: GetPromptFunction;
};
/**
 * A function that generates the main body of an evaluation prompt.
 *
 * It receives a single object whose five members are all required strings
 * and returns the prompt text synchronously.
 * NOTE(review): `history`, `tools` and `retryInfo` appear to be
 * pre-rendered text rather than structured data — confirm with the caller.
 */
export type GetPromptFunction = (context: {
    userQuery: string;
    history: string;
    tools: string;
    retryInfo: string;
    aiResponse: string;
}) => string;
/**
 * Pipeline execution options.
 * Every member is optional.
 */
export type PipelineExecutionOptions = {
    /** Correlation ID for tracing */
    correlationId?: string;
    /** Custom timeout override. NOTE(review): unit is not stated in this file — confirm. */
    timeout?: number;
    /**
     * Skip specific scorers. Mutually exclusive with onlyScorers.
     * The exclusivity is a documented convention only; this type accepts both being set.
     */
    skipScorers?: string[];
    /**
     * Only run specific scorers. Mutually exclusive with skipScorers.
     * The exclusivity is a documented convention only; this type accepts both being set.
     */
    onlyScorers?: string[];
    /** Additional metadata to attach */
    metadata?: JsonObject;
};
/**
 * Result of one pipeline execution: the aggregated scores plus a record of
 * how the pipeline was configured and what went wrong or was skipped.
 */
export type PipelineResult = AggregatedScores & {
    /** Pipeline configuration used */
    pipelineConfig: PipelineConfig;
    /** Execution options used */
    executionOptions?: PipelineExecutionOptions;
    /** Errors that occurred during execution, one entry per scorer id with the error as text */
    errors: {
        scorerId: string;
        error: string;
    }[];
    /** Scorers that were skipped */
    skippedScorers: string[];
};
/**
 * Input data from which a report is produced.
 */
export type ReportData = {
    /** Report title */
    title: string;
    /** Timestamp, as a number */
    timestamp: number;
    /** Evaluation result: either a full pipeline result or bare aggregated scores */
    result: PipelineResult | AggregatedScores;
    /** Optional custom sections, each with a title and either text or JSON content */
    customSections?: {
        title: string;
        content: string | JsonObject;
    }[];
};
/**
 * Function scorer - a simple function-based scorer.
 *
 * Takes a `ScorerInput` and resolves asynchronously to a score with its
 * reasoning; `metadata` is optional. The score range is not fixed by this type.
 */
export type ScorerFunction = (input: ScorerInput) => Promise<{
    score: number;
    reasoning: string;
    metadata?: JsonObject;
}>;
/**
 * Superset batch progress. `pending` is canonical; `remaining` in the
 * pipeline's batchStrategy was renamed during consolidation (same value).
 *
 * `succeeded` and `estimatedTimeRemaining` are optional; all other members
 * are required.
 */
export type BatchProgress = {
    // --- Item counters ---
    total: number;
    completed: number;
    succeeded?: number;
    failed: number;
    pending: number;
    // --- Derived progress ---
    /** NOTE(review): scale (0-1 vs 0-100) is not stated in this file — confirm. */
    percentComplete: number;
    /** NOTE(review): unit is not stated in this file — confirm. */
    estimatedTimeRemaining?: number;
};
/**
 * Input item for BatchEvaluator.
 * Pairs one generation call (`options`) with its `result` under a caller-supplied `id`.
 */
export type BatchEvaluationItem = {
    /** Identifier of the item; `BatchEvaluationItemResult` carries an `id` member as well. */
    id: string;
    options: LanguageModelV3CallOptions;
    result: GenerateResult;
    /** Optional per-item threshold. NOTE(review): presumably overrides the configured threshold — confirm. */
    threshold?: number;
};
/**
 * Result of a single item in BatchEvaluator.
 *
 * `data` and `error` are both optional and independent of `success` at the
 * type level; the type does not tie them together.
 */
export type BatchEvaluationItemResult = {
    id: string;
    success: boolean;
    data?: EvaluationData;
    /** Failure details. `code` is a plain `string`, not the `EvaluationErrorCode` union. */
    error?: {
        message: string;
        code?: string;
        retryable?: boolean;
    };
    /** NOTE(review): unit is not stated in this file — presumably milliseconds; confirm. */
    duration: number;
    retryCount: number;
};
/**
 * Result of a single item in the pipeline batchStrategy.
 *
 * Items are identified by `index` rather than by an id; `result` and
 * `error` are both optional, and `error` is plain text.
 */
export type BatchItemResult = {
    index: number;
    input: ScorerInput;
    result?: PipelineResult;
    error?: string;
    /** NOTE(review): unit is not stated in this file — presumably milliseconds; confirm. */
    duration: number;
};
/**
 * Superset batch evaluation config. Union of pre-consolidation types
 * (BatchEvaluationConfig in BatchEvaluator, BatchConfig in batchStrategy).
 *
 * Adds batch-only options on top of `EvaluationConfig`; every added member
 * is optional. Two per-item callbacks exist, one for each item-result
 * flavor declared in this file.
 */
export type BatchEvaluationConfig = EvaluationConfig & {
    concurrency?: number;
    continueOnError?: boolean;
    /** Called with a `BatchProgress` snapshot. */
    onProgress?: (progress: BatchProgress) => void;
    maxRetries?: number;
    /** NOTE(review): unit is not stated in this file — confirm. */
    retryDelay?: number;
    /** Per-item callback receiving the BatchEvaluator item-result shape. */
    onItemComplete?: (result: BatchEvaluationItemResult) => void;
    /** NOTE(review): unit is not stated in this file — confirm. */
    batchDelay?: number;
    /** Per-item callback receiving the pipeline batchStrategy item-result shape. */
    onResult?: (result: BatchItemResult) => void;
};
/**
 * Superset batch-result. `results` is a union of both item-result flavors;
 * summary field names chosen from BatchEvaluator (`succeeded`, `passingRate`).
 */
export type BatchEvaluationResult = {
    /**
     * Per-item results. This is a union of two array types, not an array of a
     * union: a given value holds items of a single flavor.
     */
    results: BatchEvaluationItemResult[] | BatchItemResult[];
    /** Aggregate figures for the batch; all members are required. */
    summary: {
        total: number;
        succeeded: number;
        failed: number;
        averageScore: number;
        averageDuration: number;
        totalDuration: number;
        /** NOTE(review): scale (0-1 vs 0-100) is not stated in this file — confirm. */
        passingRate: number;
    };
    allSucceeded?: boolean;
};
/**
 * Statistical summary of evaluation scores.
 * All members are required numbers.
 */
export type ScoreStatistics = {
    min: number;
    max: number;
    mean: number;
    median: number;
    stdDev: number;
    variance: number;
    // NOTE(review): p25/p75/p90/p95 are presumably percentiles; the
    // interpolation method is not visible in this file.
    p25: number;
    p75: number;
    p90: number;
    p95: number;
};
/**
 * Score distribution across ranges.
 *
 * The documented ranges are integer bands from 1 to 10.
 * NOTE(review): how fractional scores (e.g. 3.5) are assigned to a band is
 * not specified in this file — confirm with the aggregator.
 */
export type ScoreDistribution = {
    /** Items scoring 1-3 (poor) */
    poor: number;
    /** Items scoring 4-5 (below average) */
    belowAverage: number;
    /** Items scoring 6-7 (average) */
    average: number;
    /** Items scoring 8-9 (good) */
    good: number;
    /** Items scoring 10 (excellent) */
    excellent: number;
};
/**
 * Trend analysis results.
 * `direction` is a three-way classification; the remaining members are plain numbers.
 */
export type TrendAnalysis = {
    direction:
        | "improving"
        | "declining"
        | "stable";
    slope: number;
    rSquared: number;
    /** NOTE(review): scale (fraction vs percent) is not stated in this file — confirm. */
    percentChange: number;
    movingAverage: number;
};
/**
 * Dimension-specific analysis for RAGAS metrics.
 *
 * Holds one `ScoreStatistics` per dimension (plus `overall`) and the three
 * pairwise correlations between relevance, accuracy and completeness.
 */
export type DimensionAnalysis = {
    relevance: ScoreStatistics;
    accuracy: ScoreStatistics;
    completeness: ScoreStatistics;
    overall: ScoreStatistics;
    /** NOTE(review): the correlation coefficient used is not visible in this file. */
    correlations: {
        relevanceAccuracy: number;
        relevanceCompleteness: number;
        accuracyCompleteness: number;
    };
};
/**
 * Quality alerts summary.
 *
 * Carries counters for `high` and `medium` only; there is no `low` counter
 * even though `AlertSeverity` includes "low".
 */
export type AlertSummary = {
    total: number;
    high: number;
    medium: number;
    offTopic: number;
    /** NOTE(review): scale (0-1 vs 0-100) and denominator are not stated in this file — confirm. */
    alertRate: number;
};
/**
 * Comprehensive aggregation result.
 *
 * Combines the statistics, distribution, per-dimension analysis and alert
 * summary types declared in this file. `sequenceTrend` is the only
 * optional member.
 */
export type AggregationResult = {
    /** Number of aggregated items. */
    count: number;
    statistics: ScoreStatistics;
    distribution: ScoreDistribution;
    dimensions: DimensionAnalysis;
    sequenceTrend?: TrendAnalysis;
    alerts: AlertSummary;
    /** NOTE(review): scale (0-1 vs 0-100) is not stated in this file — confirm. */
    passingRate: number;
    /** NOTE(review): unit is not stated in this file — presumably milliseconds; confirm. */
    avgEvaluationTime: number;
    /** Information about the aggregation run itself. */
    metadata: {
        /** Aggregation time as a string; the format is not fixed by this type. */
        aggregatedAt: string;
        threshold: number;
        evaluationModels: string[];
    };
};
/**
 * Configuration preset for common evaluation scenarios.
 * A named, described bundle around an optional `EvaluationConfig`.
 */
export type EvaluatorPreset = {
    name: string;
    description: string;
    /** Optional: a preset may be declared without any configuration. */
    config?: EvaluationConfig;
};
/**
 * Configuration for evaluation strategies.
 * Every member is optional.
 */
export type EvaluationStrategyConfig = {
    /** Model to use for the evaluation. */
    evaluationModel?: string;
    /** Provider to use for the evaluation model. */
    provider?: string;
    /** Score threshold; the scale is not fixed by this type. */
    threshold?: number;
    /**
     * Optional function that generates the evaluation prompt body.
     * Uses the shared `GetPromptFunction` alias, matching
     * `EvaluationConfig.promptGenerator`.
     */
    promptGenerator?: GetPromptFunction;
    /** Free-form strategy-specific options; keys and values are not constrained by this type. */
    options?: Record<string, unknown>;
};
/**
 * Function that performs evaluation and returns results.
 *
 * Takes the call options, the generation result and an optional strategy
 * config, and resolves to the evaluation result together with the context
 * it was computed from.
 */
export type EvaluationStrategyFunction = (
    options: LanguageModelV3CallOptions,
    result: GenerateResult,
    config?: EvaluationStrategyConfig
) => Promise<{
    evaluationResult: EvaluationResult;
    evalContext: EnhancedEvaluationContext;
}>;
/**
 * Metadata for registered evaluation strategies.
 * Only `defaultModel` and `defaultProvider` are optional.
 */
export type EvaluationStrategyMetadata = {
    name: string;
    description: string;
    /** Whether the strategy requires an LLM. */
    requiresLLM: boolean;
    defaultModel?: string;
    defaultProvider?: string;
    /** Version as a string; the versioning scheme is not fixed by this type. */
    version: string;
    features: string[];
};
/**
 * Canonical evaluation error code.
 * One literal per line so that additions and removals show up as single-line diffs.
 */
export type EvaluationErrorCode =
    | "EVALUATION_FAILED"
    | "PARSE_ERROR"
    | "STRATEGY_NOT_FOUND"
    | "PROVIDER_ERROR"
    | "CONFIGURATION_ERROR"
    | "CUSTOM_EVALUATOR_ERROR"
    | "BATCH_EVALUATION_ERROR"
    | "AGGREGATION_ERROR"
    | "REGISTRY_ERROR"
    | "MAX_RETRIES_EXCEEDED"
    | "TIMEOUT_ERROR"
    | "RATE_LIMIT_ERROR";
/**
 * Extended evaluation context for error details.
 *
 * Every member is optional. The query, response and raw judge output are
 * represented by their lengths only, not by their text.
 */
export type EvaluationErrorContext = {
    userQueryLength?: number;
    aiResponseLength?: number;
    attemptNumber?: number;
    previousScores?: number[];
    strategy?: string;
    evaluationModel?: string;
    provider?: string;
    rawResponseLength?: number;
    /** Free-form extra details; keys and values are not constrained by this type. */
    additionalContext?: Record<string, unknown>;
};
/**
 * Minimal Langfuse client interface for evaluation hooks.
 *
 * Only `score` is required; `trace` and `shutdown` are optional, so callers
 * must check for their presence before invoking them.
 */
export type LangfuseClient = {
    /** Records a named numeric score; resolves asynchronously with an unspecified value. */
    score: (params: {
        name: string;
        value: number;
        traceId?: string;
        observationId?: string;
        comment?: string;
        metadata?: Record<string, unknown>;
    }) => Promise<unknown>;
    /** Creates a trace and synchronously returns an object exposing its `id`. */
    trace?: (params: {
        name: string;
        metadata?: Record<string, unknown>;
        tags?: string[];
    }) => {
        id: string;
    };
    /** Asynchronous shutdown hook. */
    shutdown?: () => Promise<void>;
};
/**
 * Langfuse adapter configuration.
 * `client` is required; all other members are optional.
 * NOTE(review): default values for the optional flags are not visible in
 * this file — see the adapter implementation.
 */
export type LangfuseAdapterConfig = {
    /** The client used to talk to Langfuse. */
    client: LangfuseClient;
    scorePrefix?: string;
    includeMetadata?: boolean;
    tags?: string[];
    sendPipelineScores?: boolean;
    sendScorerScores?: boolean;
};
/**
 * Events emitted by the evaluation pipeline.
 *
 * Maps each event name to its payload shape. Events come in
 * `start` / `end` / `error` triples for both scorers and pipelines. Every
 * payload has a numeric `timestamp` and an optional `traceContext`; `end`
 * payloads add a `result` and a `duration`, `error` payloads carry the
 * error as text.
 */
export type EvaluationEvents = {
    /** Payload of the `scorer:start` event. */
    "scorer:start": {
        scorerId: string;
        scorerName: string;
        timestamp: number;
        traceContext?: EvaluationTraceContext;
    };
    /** Payload of the `scorer:end` event, including the scorer's result. */
    "scorer:end": {
        scorerId: string;
        scorerName: string;
        result: ScoreResult;
        timestamp: number;
        duration: number;
        traceContext?: EvaluationTraceContext;
    };
    /** Payload of the `scorer:error` event. */
    "scorer:error": {
        scorerId: string;
        scorerName: string;
        error: string;
        timestamp: number;
        traceContext?: EvaluationTraceContext;
    };
    /** Payload of the `pipeline:start` event; the only payload with a `correlationId`, which is required here. */
    "pipeline:start": {
        pipelineName: string;
        scorerCount: number;
        timestamp: number;
        correlationId: string;
        traceContext?: EvaluationTraceContext;
    };
    /** Payload of the `pipeline:end` event, including the full pipeline result. */
    "pipeline:end": {
        pipelineName: string;
        result: PipelineResult;
        timestamp: number;
        duration: number;
        traceContext?: EvaluationTraceContext;
    };
    /** Payload of the `pipeline:error` event. */
    "pipeline:error": {
        pipelineName: string;
        error: string;
        timestamp: number;
        traceContext?: EvaluationTraceContext;
    };
};
/**
 * Flat span attribute map used by the evaluation observability layer.
 * Named EvaluationSpanAttributes to disambiguate from the richer telemetry
 * SpanAttributes in span.ts (§Rule 9 domain prefix).
 *
 * Values are restricted to strings, numbers and booleans; nested objects,
 * arrays, `null` and `undefined` are not accepted.
 */
export type EvaluationSpanAttributes = Record<string, string | number | boolean>;
/**
 * Metrics captured per scorer execution.
 * All members are required; they are grouped below by what they measure.
 */
export type ScorerMetrics = {
    // --- Identity ---
    scorerId: string;
    scorerName: string;
    // --- Execution counters ---
    totalExecutions: number;
    successfulExecutions: number;
    failedExecutions: number;
    // --- Pass / fail outcome ---
    passedCount: number;
    failedCount: number;
    /** NOTE(review): scale (0-1 vs 0-100) is not stated in this file — confirm. */
    passRate: number;
    // --- Scores ---
    totalScore: number;
    minScore: number;
    maxScore: number;
    averageScore: number;
    // --- Timing (NOTE(review): units are not stated in this file — confirm) ---
    totalDuration: number;
    averageDuration: number;
    lastExecutionTime: number;
};
/**
 * Metrics captured per evaluation pipeline.
 * All members are required; they are grouped below by what they measure.
 */
export type PipelineMetrics = {
    // --- Identity ---
    pipelineName: string;
    // --- Execution and pass / fail counters ---
    totalExecutions: number;
    passedCount: number;
    failedCount: number;
    /** NOTE(review): scale (0-1 vs 0-100) is not stated in this file — confirm. */
    passRate: number;
    // --- Scores ---
    totalScore: number;
    minScore: number;
    maxScore: number;
    averageScore: number;
    // --- Timing (NOTE(review): units are not stated in this file — confirm) ---
    totalDuration: number;
    averageDuration: number;
    lastExecutionTime: number;
    // --- Per-scorer breakdown ---
    /** Scorer metrics keyed by string. NOTE(review): presumably the scorer id — confirm. */
    scorerMetrics: Map<string, ScorerMetrics>;
};
/**
 * Aggregated metrics across pipelines and scorers.
 * All members are required.
 */
export type AggregatedMetrics = {
    totalEvaluations: number;
    /** NOTE(review): scale (0-1 vs 0-100) is not stated in this file — confirm. */
    overallPassRate: number;
    averageScore: number;
    averageDuration: number;
    /**
     * Score buckets. These bucket names (`excellent` / `good` / `fair` /
     * `poor` / `failing`) differ from those of the `ScoreDistribution` type
     * in this file, so the two are not interchangeable.
     * NOTE(review): bucket boundaries are not visible in this file.
     */
    scoreDistribution: {
        excellent: number;
        good: number;
        fair: number;
        poor: number;
        failing: number;
    };
    /** Per-pipeline metrics keyed by string. NOTE(review): presumably the pipeline name — confirm. */
    pipelineMetrics: Map<string, PipelineMetrics>;
    /** Per-scorer metrics keyed by string. NOTE(review): presumably the scorer id — confirm. */
    scorerMetrics: Map<string, ScorerMetrics>;
    collectionStartTime: number;
    lastUpdateTime: number;
};
/**
 * Generated evaluation report envelope.
 *
 * `format` appears both at the top level and inside `metadata`; this type
 * does not force the two to agree.
 */
export type GeneratedReport = {
    format: ReportFormat;
    /** The rendered report, as a string. */
    content: string;
    metadata: {
        /** Generation time as a number. */
        generatedAt: number;
        format: ReportFormat;
        /** The report configuration. */
        config: ReportConfig;
    };
};
