/**
 * GAIA Judge — ADR-133-PR6
 *
 * Two-stage answer scorer for the GAIA benchmark:
 *
 *   Stage 1 — Fast path: normalised exact-match.
 *     Normalise = lowercase + strip surrounding whitespace + strip surrounding
 *     single/double quotes + collapse internal whitespace runs to one space.
 *     Roughly 30 % of GAIA Level-1 answers satisfy this; no API call required.
 *     The question text, when provided, is also used in this stage for
 *     unit-aware numeric matching (see unitAwareNumberMatch).
 *
 *   Stage 2 — LLM-as-judge: when exact-match fails, ask Claude Sonnet whether
 *     the candidate answer is semantically equivalent to the ground truth.
 *     The prompt embeds GAIA's official evaluation guideline (see
 *     https://huggingface.co/datasets/gaia-benchmark/GAIA for full spec).
 *
 * Caching: judgment results are persisted under
 *   ~/.cache/ruflo/gaia/judgments/<hash>.json
 * keyed on (question_id, candidate_answer, model_id, JUDGE_PROMPT_VERSION).
 * Re-running with the same key hits the cache and returns instantly.
 *
 * API pattern: raw fetch() against https://api.anthropic.com/v1/messages —
 * mirrors the pattern established in gaia-agent.ts (ADR-133-PR3).
 *
 * Refs: ADR-133, #2156
 */
/**
 * Outcome of scoring a single candidate answer against its ground truth.
 *
 * This is the value that `judgeAnswer` resolves to. It carries the pass/fail
 * verdict, the scoring path that produced it, and optional judge metadata.
 */
export interface JudgeResult {
    /** Identifier of the judged question (presumably the `id` passed to `judgeAnswer` — confirm). */
    questionId: string;
    /** Pass/fail verdict for the candidate answer. */
    passed: boolean;
    /**
     * Which scoring path produced the verdict. The three literals line up with
     * the fast exact-match stage, the LLM-as-judge stage, and the on-disk
     * judgment cache described in the file header.
     */
    scoringPath: 'exact-match' | 'llm-judge' | 'cache';
    /**
     * The answer that was judged.
     * NOTE(review): `judgeAnswer` accepts `string | null` but this field is a
     * plain `string`; how a `null` candidate is represented here is not
     * visible from this declaration — confirm against the implementation.
     */
    candidateAnswer: string;
    /** The expected (ground-truth) answer the candidate was compared to. */
    groundTruth: string;
    /** Optional explanation of the verdict (presumably supplied by the LLM judge — confirm). */
    judgeReason?: string;
    /** Optional identifier of the judge model (presumably set only when an LLM judgment was involved — confirm). */
    judgeModel?: string;
    /** Optional count of input tokens for the judge call (presumably as reported by the API — confirm). */
    judgeTokensIn?: number;
    /** Optional count of output tokens for the judge call (presumably as reported by the API — confirm). */
    judgeTokensOut?: number;
    /** Optional cost of the judge call; the name suggests US dollars — TODO confirm how it is derived. */
    judgeCostUsd?: number;
}
/**
 * Optional overrides accepted by `judgeAnswer`. Every field may be omitted.
 */
export interface JudgeOptions {
    /** Model used for the LLM-as-judge stage. Default: 'claude-sonnet-4-6' */
    judgeModel?: string;
    /** Directory in which judgment results are cached. Default: '~/.cache/ruflo/gaia/judgments/' */
    cacheDir?: string;
    /**
     * Presumably bypasses the judgment cache when `true`.
     * NOTE(review): whether this skips reads, writes, or both is not visible
     * from this declaration — confirm against the implementation.
     */
    skipCache?: boolean;
    /**
     * Presumably the API key for the judge request to the Anthropic Messages API.
     * NOTE(review): the fallback used when this is omitted (e.g. an
     * environment variable) is not visible here — confirm.
     */
    apiKey?: string;
}
/**
 * Normalise an answer string for exact-match comparison.
 *
 * GAIA normalisation as specified in the dataset paper:
 *   - strip surrounding whitespace
 *   - lowercase
 *   - strip a single pair of surrounding quotes (single or double)
 *   - collapse internal whitespace runs to one space
 *
 * @param raw - The answer text to normalise. `null` and `undefined` are
 *              accepted by the signature.
 *              NOTE(review): the string returned for `null`/`undefined` is
 *              presumably empty, but that is not visible from this
 *              declaration — confirm against the implementation.
 * @returns The normalised string.
 */
export declare function normaliseAnswer(raw: string | null | undefined): string;
/**
 * Attempt to match a candidate numeric answer to an expected answer where the
 * question implies a unit scale.
 *
 * Examples that this catches:
 *   candidate="17000", expected="17", question contains "thousand"
 *     → candidate / 1000 ≈ expected → MATCH
 *   candidate="17", expected="17000", question contains "thousand"
 *     → candidate × 1000 ≈ expected → MATCH (reverse direction)
 *
 * Returns true only when a numeric match is found under one of the scale
 * multipliers mentioned in the question text.  Returns false for non-numeric
 * inputs or when no multiplier matches.
 *
 * NOTE(review): the set of recognised multiplier words and the tolerance
 * behind "≈" are not visible from this declaration — confirm against the
 * implementation before relying on specific values.
 *
 * @param candidate    - The raw string from the model (may include commas/spaces).
 * @param expected     - The raw ground-truth string.
 * @param questionText - The original question (used to detect multiplier words).
 *                       Optional; with no text there is no multiplier to
 *                       detect, so the result is presumably `false` — confirm.
 * @returns `true` when the two values agree under a multiplier implied by the
 *          question, otherwise `false`.
 */
export declare function unitAwareNumberMatch(candidate: string, expected: string, questionText?: string): boolean;
/**
 * Judge a single GAIA answer.
 *
 * The call is asynchronous: the result is delivered through a Promise.
 *
 * NOTE(review): failure behaviour (for example a missing API key or a failed
 * judge request) is not visible from this declaration — confirm whether the
 * Promise rejects or resolves to a failing result.
 *
 * @param question   - Object with `id` (task_id), `expected` (ground truth),
 *                     and optional `questionText` (the full question string,
 *                     used for unit-aware numeric matching in Stage 1).
 * @param candidateAnswer - The answer produced by the agent; `null` counts as a miss.
 * @param options    - Optional overrides (model, cache dir, API key, etc.);
 *                     see `JudgeOptions` for the individual fields.
 * @returns          - Promise resolving to a JudgeResult with pass/fail,
 *                     scoring path, and cost metrics.
 */
export declare function judgeAnswer(question: {
    id: string;
    expected: string;
    questionText?: string;
}, candidateAnswer: string | null, options?: JudgeOptions): Promise<JudgeResult>;
//# sourceMappingURL=gaia-judge.d.ts.map