import { R as ResolvedAxiomConfig } from './config-6PyyriW8.js';
import { c as ScoreWithName, d as ScorerLike, V as ValidateName } from './name-validation.d-BKPGh6r3.js';
import { SerializedError } from 'vitest';
import { Reporter, TestSuite, TestCase, TestModule, TestRunEndReason } from 'vitest/node';
import { c as createAppScope } from './app-scope-BgNUnFZY.js';
import '@opentelemetry/api';
import 'zod';
import './scorers/aggregations.js';

/**
 * Function type for evaluation tasks that process input data and produce output.
 *
 * Used with {@link EvalParams} to define the task that will be evaluated against a dataset.
 * The task output will be scored by functions defined in {@link EvalParams.scorers}.
 *
 * @param input - The input data to process
 * @param expected - The expected output for comparison/validation
 * @returns The task output, a Promise resolving to it, or an AsyncIterable that streams it
 *
 * @example
 * ```typescript
 * const textGenerationTask: EvalTask<string, string, string> = async ({ input, expected }) => {
 *   const result = await generateText({
 *     model: myModel,
 *     prompt: input
 *   });
 *   return result.text;
 * };
 * ```
 */
type EvalTask<TInput, TExpected, TOutput> = (args: {
    input: TInput;
    expected: TExpected;
}) => TOutput | Promise<TOutput> | AsyncIterable<TOutput>;
/**
 * Record type representing a single data point in an evaluation dataset.
 *
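 * @example
 * A minimal sketch of a dataset record (values are illustrative):
 * ```typescript
 * const record: CollectionRecord<string, string> = {
 *   input: 'What is gravity?',
 *   expected: 'Gravity is a fundamental force...',
 *   metadata: { source: 'physics-faq' },
 * };
 * ```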
 */
type CollectionRecord<TInput, TExpected> = {
    /** The input data for the evaluation case */
    input: TInput;
    /** The expected output for comparison/validation */
    expected: TExpected;
    /** Optional metadata for the record */
    metadata?: Record<string, unknown>;
};
/**
 * Configuration parameters for running an evaluation.
 *
 * Used with {@link Eval} to define how an evaluation should be executed.
 * Results are captured in {@link EvalReport} format.
 *
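 * @example
 * A minimal sketch of an evaluation's parameters; `generateAnswer` and `similarityScorer`
 * are placeholders for your own task logic and scorer:
 * ```typescript
 * const params: EvalParams<string, string, string> = {
 *   capability: 'text-generation',
 *   data: [
 *     { input: 'What is gravity?', expected: 'Gravity is a fundamental force...' },
 *   ],
 *   task: async ({ input }) => generateAnswer(input),
 *   scorers: [similarityScorer],
 *   trials: 3,
 * };
 * ```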
 */
type EvalParams<TInput, TExpected, TOutput> = {
    /** Dataset with input/expected pairs for evaluation, or a function that returns one */
    data: readonly CollectionRecord<TInput, TExpected>[] | Promise<readonly CollectionRecord<TInput, TExpected>[]> | (() => readonly CollectionRecord<TInput, TExpected>[] | Promise<readonly CollectionRecord<TInput, TExpected>[]>);
    /** Name of the capability being evaluated */
    capability: string;
    /** Optional step within the capability being evaluated */
    step?: string | undefined;
    /** The task function to evaluate */
    task: EvalTask<TInput, TExpected, TOutput>;
    /** Array of scoring functions to evaluate the task output */
    scorers: ReadonlyArray<ScorerLike<TInput, TExpected, TOutput>>;
    /** Optional metadata for the evaluation */
    metadata?: Record<string, unknown>;
    /** Optional timeout in milliseconds for task execution */
    timeout?: number;
    /** Optional list of flag keys that narrows the flag namespace in scope for this evaluation */
    configFlags?: string[];
    /**
     * Number of times to run each case. Defaults to 1.
     * Each trial runs the task independently, and scores are aggregated per scorer.
     */
    trials?: number;
};
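/**
 * Describes how a runtime flag value was resolved during a case: either a flag that was
 * introduced at runtime, or one whose default value was replaced.
 *
 * @example
 * A sketch of a {@link RuntimeFlagMap} with illustrative flag paths and values:
 * ```typescript
 * const runtimeFlags: RuntimeFlagMap = {
 *   'model.temperature': { kind: 'replaced', value: 0.2, default: 0.7 },
 *   'prompt.style': { kind: 'introduced', value: 'concise' },
 * };
 * ```
 */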
type RuntimeFlagLog = {
    kind: 'introduced';
    value: unknown;
} | {
    kind: 'replaced';
    value: unknown;
    default: unknown;
};
type RuntimeFlagMap = Record<string, RuntimeFlagLog>;
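/**
 * Serialized summary of a completed evaluation run, including baseline, collection,
 * prompt, and per-case results.
 */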
type Evaluation = {
    id: string;
    name: string;
    type: string;
    version: string;
    baseline: {
        id: string | undefined;
        name: string | undefined;
    };
    collection: {
        name: string;
        size: number;
    };
    prompt: {
        model: string;
        params: Record<string, unknown>;
    };
    duration: number;
    status: string;
    traceId: string;
    runAt: string;
    tags: string[];
    user: {
        name: string | undefined;
        email: string | undefined;
    };
    cases: Case[];
    flagConfig?: Record<string, any>;
};
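/**
 * Serialized result for a single evaluation case, including scores and trace identifiers.
 */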
type Case = {
    index: number;
    input: string;
    output: string;
    expected: string;
    duration: string;
    status: string;
    scores: Record<string, {
        name: string;
        value: number;
        metadata: Record<string, any>;
        /** Per-trial scores when running multiple trials */
        trials?: number[];
        /** Aggregation type used (e.g., 'mean', 'pass@k') */
        aggregation?: string;
        /** Threshold for pass-based aggregations */
        threshold?: number;
    }>;
    /** Number of trials run for this case */
    trials?: number;
    runAt: string;
    spanId: string;
    traceId: string;
    task?: Task;
    runtimeFlags?: RuntimeFlagMap;
};
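/**
 * Captured chat details for a task run: request parameters, response finish reasons,
 * and token usage.
 */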
type Chat = {
    operation: string;
    capability: string;
    step: string;
    request: {
        max_token: string;
        model: string;
        temperature: number;
    };
    response: {
        finish_reasons: string;
    };
    usage: {
        input_tokens: number;
        output_tokens: number;
    };
};
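/**
 * Captured execution details for a single task trial, including its output and chat details.
 */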
type Task = {
    name: string;
    output: string;
    trial: number;
    type: string;
    error?: string;
    chat: Chat;
};
/**
 * Complete report for a single evaluation case including results and metadata.
 *
 * Generated for each test case when running {@link Eval} with {@link EvalParams}.
 * Contains all {@link Score} results and execution metadata.
 *
 */
type EvalCaseReport = {
    /** Order/index of this case in the evaluation suite */
    index: number;
    /** Name of the evaluation */
    name: string;
    /** Input data that was provided to the {@link EvalTask} */
    input: string | Record<string, any>;
    /** Output produced by the {@link EvalTask}; undefined when all trials fail before producing output */
    output: string | Record<string, any> | undefined;
    /** Expected output for comparison */
    expected: string | Record<string, any>;
    /** Optional metadata for the case */
    metadata?: Record<string, any>;
    /** Array of {@link Score} results from all scorers that were run */
    scores: Record<string, ScoreWithName>;
    /** Any errors that occurred during evaluation */
    errors: Error[] | null;
    /** Status of the evaluation case */
    status: 'success' | 'fail' | 'pending';
    /** Per-trial errors in order (null for successful trials) */
    trialErrors?: Array<string | null>;
    /** Trial summary stats for reporting */
    trialSummary?: {
        total: number;
        succeeded: number;
        failed: number;
    };
    /** Duration in milliseconds for the entire case */
    duration: number | undefined;
    /** Timestamp when the case started */
    startedAt: number | undefined;
    /** Flags accessed outside of the picked flags scope for this case */
    outOfScopeFlags?: OutOfScopeFlagAccess[];
    /** Flags that are in scope for this evaluation */
    pickedFlags?: string[];
    /** Runtime flags actually used during this case */
    runtimeFlags?: RuntimeFlagMap;
};
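/**
 * A single flag access that occurred outside the picked-flags scope of a case.
 */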
type OutOfScopeFlagAccess = {
    flagPath: string;
    accessedAt: number;
    stackTrace: string[];
};
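/**
 * Aggregated out-of-scope accesses for a single flag path across an evaluation run.
 */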
type OutOfScopeFlag = {
    flagPath: string;
    count: number;
    firstAccessedAt: number;
    lastAccessedAt: number;
    stackTrace: string[];
};
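/**
 * Outcome of registering the evaluation run.
 */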
type RegistrationStatus = {
    status: 'success';
} | {
    status: 'failed';
    error: string;
};
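/**
 * Report for an entire evaluation suite run, including flag configuration and
 * out-of-scope flag access summaries.
 */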
type EvaluationReport = {
    id: string;
    name: string;
    version: string;
    runId: string;
    orgId?: string;
    baseline: Evaluation | undefined;
    /** Flags that are in scope for this evaluation */
    configFlags?: string[];
    /** Full flag configuration for this evaluation run */
    flagConfig?: Record<string, any>;
    /** Summary of all flags accessed outside of picked flags scope across all cases */
    outOfScopeFlags?: OutOfScopeFlag[];
    /** End-of-suite config snapshot for console printing only */
    configEnd?: {
        flags?: Record<string, any>;
        pickedFlags?: string[];
        overrides?: Record<string, any>;
    };
    registrationStatus?: RegistrationStatus;
    /** Number of trials per case (only shown if > 1) */
    trials?: number;
};

declare module 'vitest' {
    interface TestSuiteMeta {
        evaluation: EvaluationReport;
    }
    interface TaskMeta {
        case: EvalCaseReport;
        evaluation: EvaluationReport;
    }
    interface ProvidedContext {
        baseline?: string;
        debug?: boolean;
        list?: boolean;
        overrides?: Record<string, any>;
        axiomConfig?: ResolvedAxiomConfig;
        runId: string;
        consoleUrl?: string;
    }
}
/**
 * Creates and registers an evaluation suite with the given name and parameters.
 *
 * This function sets up a complete evaluation pipeline that will run your {@link EvalTask}
 * against a collection, score the results, and provide detailed {@link EvalCaseReport} reporting.
 *
 * @param name - Human-readable name for the evaluation suite
 * @param params - {@link EvalParams} configuration parameters for the evaluation
 *
 * @example
 * ```typescript
 * import { Eval } from 'axiom/ai/evals';
 *
 * Eval('Text Generation Quality', {
 *   capability: 'capability-name',
 *   data: async () => [
 *     { input: 'Explain photosynthesis', expected: 'Plants convert light to energy...' },
 *     { input: 'What is gravity?', expected: 'Gravity is a fundamental force...' }
 *   ],
 *   task: async ({ input }) => {
 *     const result = await generateText({
 *       model: yourModel,
 *       prompt: input
 *     });
 *     return result.text;
 *   },
 *   scorers: [similarityScorer, factualAccuracyScorer],
 * });
 * ```
 */
declare function Eval<TInput, TExpected, TOutput, Name extends string = string, Capability extends string = string, Step extends string = string>(name: ValidateName<Name>, params: Omit<EvalParams<TInput, TExpected, TOutput>, 'capability' | 'step' | 'scorers'> & {
    capability: ValidateName<Capability>;
    step?: ValidateName<Step> | undefined;
    scorers: ReadonlyArray<ScorerLike<NoInfer<TInput>, NoInfer<TExpected>, TOutput>>;
}): void;

/**
 * Custom Vitest reporter for Axiom AI evaluations.
 *
 * This reporter collects evaluation results and scores from tests
 * and processes them for further analysis and reporting.
 *
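 * @example
 * A sketch of wiring the reporter into a Vitest config; the import path mirrors the
 * other examples in this module and may differ in your setup:
 * ```typescript
 * import { defineConfig } from 'vitest/config';
 * import { AxiomReporter } from 'axiom/ai/evals';
 *
 * export default defineConfig({
 *   test: {
 *     reporters: ['default', new AxiomReporter()],
 *   },
 * });
 * ```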
 */
declare class AxiomReporter implements Reporter {
    startTime: number;
    start: number;
    private _endOfRunConfigEnd;
    private _suiteData;
    private _printedFlagOverrides;
    private _config;
    onTestRunStart(): void;
    onTestSuiteReady(_testSuite: TestSuite): Promise<void>;
    onTestCaseReady(test: TestCase): void;
    onTestSuiteResult(testSuite: TestSuite): Promise<void>;
    onTestRunEnd(_testModules: ReadonlyArray<TestModule>, _errors: ReadonlyArray<SerializedError>, _reason: TestRunEndReason): Promise<void>;
}

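/**
 * Flag and fact context tracked for an evaluation, including the picked-flags scope,
 * out-of-scope accesses, and any overrides. Contexts may be nested via `parent`.
 */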
interface EvalContextData<Flags = any, Facts = any> {
    flags: Partial<Flags>;
    facts: Partial<Facts>;
    configScope?: ReturnType<typeof createAppScope>;
    pickedFlags?: string[];
    outOfScopeFlags?: OutOfScopeFlagAccess[];
    parent?: EvalContextData<Flags, Facts>;
    overrides?: Record<string, any>;
    accessedFlagKeys?: string[];
}

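/**
 * Fluent builder for configuring and running an evaluation.
 *
 * @example
 * A sketch of the fluent API, assuming `builder` is an `EvalBuilder` obtained from your
 * evaluation setup; the flag and model values are illustrative:
 * ```typescript
 * builder
 *   .withFlags({ temperature: 0.2 })
 *   .withModel('gpt-4o-mini')
 *   .withTrials(3)
 *   .withTimeout(30_000)
 *   .run('baseline-comparison');
 * ```
 */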
interface EvalBuilder<AllowedFlags extends Record<string, any> = {}, TInput extends string | Record<string, any> = string, TExpected extends string | Record<string, any> = string, TOutput extends string | Record<string, any> = string> {
    withFlags<F extends Partial<AllowedFlags>>(flags: F): EvalBuilder<AllowedFlags, TInput, TExpected, TOutput>;
    withModel(model: string): EvalBuilder<AllowedFlags, TInput, TExpected, TOutput>;
    withTimeout(timeout: number): EvalBuilder<AllowedFlags, TInput, TExpected, TOutput>;
    /**
     * Set the number of times to run each case.
     * Each trial runs the task independently, and scores are aggregated per scorer.
     */
    withTrials(trials: number): EvalBuilder<AllowedFlags, TInput, TExpected, TOutput>;
    run(suffix?: string): void;
}

export { AxiomReporter, type Case, type Chat, Eval, type EvalBuilder, type EvalContextData, type EvalParams, type EvalTask, type Evaluation, type Task };
