/**
 * GAIA Ensemble Runner — ADR-139
 *
 * Runs a GAIA question through N models in parallel, aggregates answers via
 * majority vote, with a judge-model tiebreak when no consensus is reached.
 *
 * Architecture:
 *   1. Each model runs independently using the full tool harness.
 *   2. Answers are normalised (via normaliseAnswer from gaia-judge.ts).
 *   3. Majority vote: if ≥2 models agree on normalised answer → that wins.
 *   4. Tiebreak: when all answers differ (or N=2 with disagreement), the judge
 *      model picks the best answer with a brief rationale.
 *   5. Abstain: if all models return null/timedOut → failed question.
 *
 * Supported models:
 *   - Claude (claude-sonnet-4-6, etc.) via Anthropic API (gaia-agent.ts)
 *   - Gemini (gemini-2.5-pro, etc.) via Google AI API (gaia-agent-gemini.js compiled)
 *   - OpenRouter (gpt-5, deepseek-v3.2, kimi-k2, etc.) via OpenAI-compatible API
 *     NOTE: OpenRouter requires a funded account — it returns 402 when credits are exhausted.
 *
 * CLI integration:
 *   gaia-bench run --mode=ensemble --models=claude-sonnet-4-6,gemini-2.5-pro,openai/gpt-5
 *
 * Cost model (per question, typical L1 ~10k input + ~3k output tokens):
 *   claude-sonnet-4-6:  ~$0.075
 *   gemini-2.5-pro:     ~$0.043
 *   openai/gpt-5 (OR):  ~$0.043
 *   3-model total:      ~$0.161  (53-Q: ~$8.54, 300-Q: ~$48.30)
 *
 * Refs: ADR-139, ADR-133, ADR-135, #2156
 */
import { GaiaQuestion } from './gaia-loader.js';
/**
 * Outcome of running a single ensemble member (one model) on one question.
 *
 * One of these is collected per model and exposed via `EnsembleResult.models`.
 */
export interface ModelRunResult {
    /** ID of the model that produced this result (e.g. `claude-sonnet-4-6`). */
    model: string;
    /** Answer returned by the model, or `null` when it produced none. */
    finalAnswer: string | null;
    /**
     * Normalised form of the answer; per the file header, majority voting
     * compares these normalised values (normaliseAnswer from gaia-judge.ts).
     * NOTE(review): the value used when `finalAnswer` is `null` is not visible
     * from this declaration — confirm against the implementation.
     */
    normalisedAnswer: string;
    /** Number of turns — presumably agent turns consumed by this run; confirm. */
    turns: number;
    /** Input token count — presumably summed over all turns of this run; confirm. */
    totalInputTokens: number;
    /** Output token count — presumably summed over all turns of this run; confirm. */
    totalOutputTokens: number;
    /** Wall-clock duration, presumably in milliseconds as the `Ms` suffix suggests. */
    wallMs: number;
    /** Estimated cost of this model's run in US dollars. */
    estimatedCostUsd: number;
    /** Optional timeout flag; presumably set when the run hit a time limit — confirm. */
    timedOut?: boolean;
    /** Optional error description; presumably set when the run failed — confirm. */
    error?: string;
}
/**
 * How the ensemble arrived at its final answer (see the architecture notes in
 * the file header):
 *   - `'majority'`       — at least two models agreed on the normalised answer.
 *   - `'judge-tiebreak'` — no consensus; the judge model picked the answer.
 *   - `'abstain'`        — all models returned null / timed out.
 */
export type AggregationMethod = 'majority' | 'judge-tiebreak' | 'abstain';
/**
 * Aggregated outcome of running one question through the whole ensemble.
 */
export interface EnsembleResult {
    /** Identifier of the question this result belongs to. */
    questionId: string;
    /** Answer selected by the aggregation step, or `null` when none was selected. */
    finalAnswer: string | null;
    /** Which aggregation path produced `finalAnswer`. */
    aggregationMethod: AggregationMethod;
    /** Rationale from the judge tiebreak (only set when method is 'judge-tiebreak'). */
    judgeRationale?: string;
    /** Per-model run results that fed the aggregation. */
    models: ModelRunResult[];
    /**
     * Input token total for the question.
     * NOTE(review): presumably the sum over `models`; whether judge-model usage
     * is included is not visible here — confirm.
     */
    totalInputTokens: number;
    /** Output token total for the question (same caveat as `totalInputTokens`). */
    totalOutputTokens: number;
    /** Estimated cost in US dollars for the question (same caveat as `totalInputTokens`). */
    estimatedCostUsd: number;
    /**
     * Wall-clock duration, presumably in milliseconds. Models run in parallel
     * per the file header, so this is likely elapsed time rather than a sum of
     * per-model times — confirm.
     */
    wallMs: number;
}
/**
 * Optional configuration accepted by `runEnsembleQuestion` and
 * `runEnsemblePilot`. Every field may be omitted; the defaults quoted in the
 * member comments are documented values that cannot be verified from this
 * declaration alone.
 */
export interface EnsembleOptions {
    /**
     * Models to use (provider inferred from model ID).
     * NOTE(review): the default model list when omitted is not stated here — confirm.
     */
    models?: string[];
    /** Judge model for tiebreak (default: claude-sonnet-4-6). */
    judgeModel?: string;
    /** Anthropic API key (resolved from env/gcloud if not supplied). */
    anthropicApiKey?: string;
    /** Google AI API key (resolved from env/gcloud if not supplied). */
    geminiApiKey?: string;
    /** OpenRouter API key (resolved from env/gcloud if not supplied). */
    openrouterApiKey?: string;
    /** Per-model max turns (default: 8). */
    maxTurns?: number;
    /** Per-model max tokens per turn (default: 2048). */
    maxTokensPerTurn?: number;
    /** Per-turn timeout in ms (default: 60 000). */
    perTurnTimeoutMs?: number;
}
/**
 * Runs a single GAIA question through the ensemble and aggregates the answers.
 *
 * Per the file header: each model runs independently, answers are normalised,
 * a majority vote decides when at least two models agree, a judge model breaks
 * ties otherwise, and the question is abstained when every model returns
 * null / times out.
 *
 * @param question - The GAIA question to answer.
 * @param options - Optional ensemble configuration (models, judge, keys, limits).
 * @returns A promise resolving to the aggregated result with per-model details.
 */
export declare function runEnsembleQuestion(question: GaiaQuestion, options?: EnsembleOptions): Promise<EnsembleResult>;
/**
 * Summary of an ensemble pilot run over a batch of questions, with a
 * per-question breakdown and cost projections.
 */
export interface EnsemblePilotResult {
    /** Number of questions answered correctly. */
    correct: number;
    /** Number of questions in the run. */
    total: number;
    /** Accuracy — presumably `correct / total`; scale (fraction vs. percent) not visible here, confirm. */
    accuracy: number;
    /** One entry per question in the run. */
    perQuestion: Array<{
        /** Identifier of the GAIA task. */
        taskId: string;
        /** Question text. */
        question: string;
        /** Expected (reference) answer. */
        expected: string;
        /** Answer the ensemble produced, or `null` when it produced none. */
        got: string | null;
        /** Whether `got` was scored as matching `expected`. */
        correct: boolean;
        /** Which aggregation path produced `got`. */
        aggregationMethod: AggregationMethod;
        /** Judge rationale; presumably only present for 'judge-tiebreak' — confirm. */
        judgeRationale?: string;
        /** Estimated cost in US dollars for this question. */
        costUsd: number;
        /** Wall-clock duration for this question, presumably in milliseconds. */
        wallMs: number;
        /** Per-model answers and costs for this question. */
        perModel: Array<{
            /** Model ID. */
            model: string;
            /** That model's answer, or `null` when it produced none. */
            answer: string | null;
            /** Estimated cost in US dollars for that model on this question. */
            costUsd: number;
        }>;
    }>;
    /** Estimated total cost of the run in US dollars. */
    totalCostUsd: number;
    /** Projected cost in US dollars of a 53-question run — presumably extrapolated from the mean per-question cost; confirm. */
    projectedCost53Q: number;
    /** Projected cost in US dollars of a 300-question run — presumably extrapolated the same way; confirm. */
    projectedCost300Q: number;
    /** Mean per-question wall-clock time, presumably in milliseconds. */
    meanWallMs: number;
}
/**
 * Runs the ensemble over a batch of GAIA questions and reports accuracy,
 * cost, and timing, including a per-question breakdown.
 *
 * NOTE(review): whether questions are processed sequentially or concurrently
 * is not visible from this declaration — confirm against the implementation.
 *
 * @param questions - The GAIA questions to evaluate.
 * @param options - Optional ensemble configuration; presumably applied to every question.
 * @returns A promise resolving to the pilot summary.
 */
export declare function runEnsemblePilot(questions: GaiaQuestion[], options?: EnsembleOptions): Promise<EnsemblePilotResult>;
//# sourceMappingURL=gaia-ensemble.d.ts.map