/**
 * GAIA Ensemble Runner — ADR-139
 *
 * Runs a GAIA question through N models in parallel, aggregates answers via
 * majority vote, with a judge-model tiebreak when no consensus is reached.
 *
 * Architecture:
 *   1. Each model runs independently using the full tool harness.
 *   2. Answers are normalised (via normaliseAnswer from gaia-judge.ts).
 *   3. Majority vote: if ≥2 models agree on normalised answer → that wins.
 *   4. Tiebreak: when all answers differ (or N=2 with disagreement), the judge
 *      model picks the best answer with a brief rationale.
 *   5. Abstain: if all models return null/timedOut → failed question.
 *
 * Supported models:
 *   - Claude (claude-sonnet-4-6, etc.) via Anthropic API (gaia-agent.ts)
 *   - Gemini (gemini-2.5-pro, etc.) via Google AI API (gaia-agent-gemini.js compiled)
 *   - OpenRouter (gpt-5, deepseek-v3.2, kimi-k2, etc.) via OpenAI-compatible API
 *     NOTE: OpenRouter requires funded account — returns 402 when credits exhausted.
 *
 * CLI integration:
 *   gaia-bench run --mode=ensemble --models=claude-sonnet-4-6,gemini-2.5-pro,openai/gpt-5
 *
 * Cost model (per question, typical L1 ~10k input + ~3k output tokens):
 *   claude-sonnet-4-6: ~$0.075
 *   gemini-2.5-pro: ~$0.043
 *   openai/gpt-5 (OR): ~$0.043
 *   3-model total: ~$0.161 (53-Q: ~$8.54, 300-Q: ~$48.30)
 *
 * Refs: ADR-139, ADR-133, ADR-135, #2156
 */
import { GaiaQuestion } from './gaia-loader.js';

/** Outcome of running a single question through one model of the ensemble. */
export interface ModelRunResult {
  /** Identifier of the model that produced this result. */
  model: string;
  /** The model's answer, or `null` when it produced none. */
  finalAnswer: string | null;
  /** Normalised form of the answer; this is the value the majority vote compares. */
  normalisedAnswer: string;
  /** Number of turns used by the run. */
  turns: number;
  /** Input tokens summed over the run. */
  totalInputTokens: number;
  /** Output tokens summed over the run. */
  totalOutputTokens: number;
  /** Wall-clock duration of the run, in milliseconds. */
  wallMs: number;
  /** Estimated cost of the run, in USD. */
  estimatedCostUsd: number;
  /** Set when the run timed out. */
  timedOut?: boolean;
  /** Error description, when the run failed. */
  error?: string;
}

/**
 * How the ensemble arrived at its final answer:
 * - `'majority'`: at least two models agreed on the normalised answer.
 * - `'judge-tiebreak'`: no consensus, so the judge model picked the answer.
 * - `'abstain'`: every model returned null or timed out.
 */
export type AggregationMethod = 'majority' | 'judge-tiebreak' | 'abstain';

/** Aggregated outcome of running one question through the whole ensemble. */
export interface EnsembleResult {
  /** Identifier of the question this result belongs to. */
  questionId: string;
  /** The ensemble's chosen answer, or `null` when there is none. */
  finalAnswer: string | null;
  /** Which aggregation path produced `finalAnswer`. */
  aggregationMethod: AggregationMethod;
  /** Rationale from the judge tiebreak (only set when method is 'judge-tiebreak'). */
  judgeRationale?: string;
  /** Per-model results that were aggregated. */
  models: ModelRunResult[];
  /** Total input tokens (presumably summed across `models` — confirm in implementation). */
  totalInputTokens: number;
  /** Total output tokens (presumably summed across `models` — confirm in implementation). */
  totalOutputTokens: number;
  /** Estimated total cost for this question, in USD. */
  estimatedCostUsd: number;
  /** Wall-clock duration for this question, in milliseconds. */
  wallMs: number;
}

/** Configuration for an ensemble run. Every field is optional. */
export interface EnsembleOptions {
  /** Models to use (provider inferred from model ID). */
  models?: string[];
  /** Judge model for tiebreak (default: claude-sonnet-4-6). */
  judgeModel?: string;
  /** Anthropic API key (resolved from env/gcloud if not supplied). */
  anthropicApiKey?: string;
  /** Google AI API key (resolved from env/gcloud if not supplied). */
  geminiApiKey?: string;
  /** OpenRouter API key (resolved from env/gcloud if not supplied). */
  openrouterApiKey?: string;
  /** Per-model max turns (default: 8). */
  maxTurns?: number;
  /** Per-model max tokens per turn (default: 2048). */
  maxTokensPerTurn?: number;
  /** Per-turn timeout in ms (default: 60 000). */
  perTurnTimeoutMs?: number;
}

/**
 * Runs a single GAIA question through the ensemble and aggregates the answers.
 *
 * @param question - The GAIA question to answer.
 * @param options - Optional ensemble configuration.
 * @returns A promise resolving to the aggregated result for the question.
 */
export declare function runEnsembleQuestion(
  question: GaiaQuestion,
  options?: EnsembleOptions,
): Promise<EnsembleResult>;

/** Summary of an ensemble pilot over a set of questions. */
export interface EnsemblePilotResult {
  /** Number of questions answered correctly. */
  correct: number;
  /** Number of questions in the pilot. */
  total: number;
  /** Accuracy of the pilot (presumably `correct / total` — confirm in implementation). */
  accuracy: number;
  /** One entry per question. */
  perQuestion: Array<{
    /** Task identifier of the question. */
    taskId: string;
    /** Question text. */
    question: string;
    /** Expected answer. */
    expected: string;
    /** Answer the ensemble produced, or `null` when there was none. */
    got: string | null;
    /** Whether this question was scored as correct. */
    correct: boolean;
    /** Which aggregation path produced the answer. */
    aggregationMethod: AggregationMethod;
    /** Judge rationale, when one was recorded. */
    judgeRationale?: string;
    /** Cost for this question, in USD. */
    costUsd: number;
    /** Wall-clock duration for this question, in milliseconds. */
    wallMs: number;
    /** Answer and cost for each individual model. */
    perModel: Array<{
      model: string;
      answer: string | null;
      costUsd: number;
    }>;
  }>;
  /** Total cost of the pilot, in USD. */
  totalCostUsd: number;
  /** Projected cost of a 53-question run (presumably in USD, extrapolated from this pilot). */
  projectedCost53Q: number;
  /** Projected cost of a 300-question run (presumably in USD, extrapolated from this pilot). */
  projectedCost300Q: number;
  /** Mean wall-clock duration per question, in milliseconds. */
  meanWallMs: number;
}

/**
 * Runs an ensemble pilot over a list of GAIA questions.
 *
 * @param questions - The questions to run.
 * @param options - Optional ensemble configuration.
 * @returns A promise resolving to the pilot summary, including per-question details.
 */
export declare function runEnsemblePilot(
  questions: GaiaQuestion[],
  options?: EnsembleOptions,
): Promise<EnsemblePilotResult>;
//# sourceMappingURL=gaia-ensemble.d.ts.map