/**
 * CLAUDE.md Analyzer & Auto-Optimizer
 *
 * Quantifiable, verifiable analysis of CLAUDE.md files.
 * Measures structure quality, coverage, enforceability, and produces
 * a numeric score (0-100) that can be tracked over time.
 *
 * The auto-optimizer takes analysis results and produces a concrete
 * list of changes that would improve the score. Changes can be applied
 * programmatically and the score re-measured to verify improvement.
 *
 * @module @claude-flow/guidance/analyzer
 */
import type { ProofEnvelope } from './proof.js';
/**
 * Score breakdown for a single analysis dimension.
 *
 * One entry of this shape appears per dimension in
 * `AnalysisResult.dimensions`.
 */
export interface DimensionScore {
    /** Dimension name */
    name: string;
    /** Score achieved for this dimension (0-100) */
    score: number;
    /** Maximum possible score */
    max: number;
    /** Weight of this dimension in the composite calculation */
    weight: number;
    /** Human-readable findings */
    findings: string[];
}
/**
 * Complete analysis result, as returned by `analyze()`.
 *
 * Bundles the composite score, its per-dimension breakdown, the raw
 * metrics, and the improvement suggestions.
 */
export interface AnalysisResult {
    /** Composite score 0-100 */
    compositeScore: number;
    /** Letter grade A-F */
    grade: string;
    /** Per-dimension scores */
    dimensions: DimensionScore[];
    /** Structural metrics */
    metrics: AnalysisMetrics;
    /** Actionable improvement suggestions */
    suggestions: Suggestion[];
    /**
     * Timestamp of the analysis.
     * NOTE(review): unit is not stated in this declaration — presumably
     * epoch milliseconds; confirm against the implementation.
     */
    analyzedAt: number;
}
/**
 * Raw metrics extracted from the analyzed file.
 *
 * Exposed on `AnalysisResult.metrics`. Counts are plain numbers and
 * the `has*` fields are presence flags.
 */
export interface AnalysisMetrics {
    /** Total lines */
    totalLines: number;
    /** Non-blank, non-comment lines */
    contentLines: number;
    /** Number of markdown headings */
    headingCount: number;
    /** Number of H2 sections */
    sectionCount: number;
    /** Estimated constitution lines (first section block) */
    constitutionLines: number;
    /** Number of rule-like statements (imperative sentences) */
    ruleCount: number;
    /** Number of code blocks */
    codeBlockCount: number;
    /** Number of NEVER/ALWAYS/MUST statements */
    enforcementStatements: number;
    /** Number of framework/tool mentions */
    toolMentions: number;
    /** Estimated shard count after compilation */
    estimatedShards: number;
    /** Whether a build command is present */
    hasBuildCommand: boolean;
    /** Whether a test command is present */
    hasTestCommand: boolean;
    /** Whether a security section is present */
    hasSecuritySection: boolean;
    /** Whether an architecture section is present */
    hasArchitectureSection: boolean;
    /** Number of lines in the longest section */
    longestSectionLines: number;
    /** Whether the file contains @import directives */
    hasImports: boolean;
    /** Number of domain-specific rules */
    domainRuleCount: number;
}
/**
 * A concrete improvement suggestion.
 *
 * Suggestions are listed on `AnalysisResult.suggestions`, and the ones
 * that were applied are reported by `autoOptimize()`.
 */
export interface Suggestion {
    /** Kind of change to make */
    action: 'add' | 'remove' | 'restructure' | 'split' | 'strengthen';
    /** Priority of the suggestion */
    priority: 'high' | 'medium' | 'low';
    /** Which dimension this improves */
    dimension: string;
    /** Human-readable description */
    description: string;
    /** Estimated score improvement */
    estimatedImprovement: number;
    /** Concrete text to add/modify (if applicable); omitted when there is none */
    patch?: string;
}
/**
 * Before/after benchmark result, as returned by `benchmark()` and embedded
 * in the results of `autoOptimize()` and `optimizeForSize()`.
 */
export interface BenchmarkResult {
    /** Analysis of the "before" content */
    before: AnalysisResult;
    /** Analysis of the "after" content */
    after: AnalysisResult;
    /**
     * Overall score delta between the two analyses.
     * NOTE(review): sign convention is not visible here — presumably
     * after minus before; confirm against the implementation.
     */
    delta: number;
    /** Per-dimension changes reported as improvements */
    improvements: DimensionDelta[];
    /** Per-dimension changes reported as regressions */
    regressions: DimensionDelta[];
}
/**
 * Change in a single dimension's score between two analyses.
 *
 * Exported so that consumers can name the element type of
 * `BenchmarkResult.improvements` and `BenchmarkResult.regressions`.
 */
export interface DimensionDelta {
    /** Name of the dimension this delta refers to */
    dimension: string;
    /** Dimension score in the "before" analysis */
    before: number;
    /** Dimension score in the "after" analysis */
    after: number;
    /**
     * Difference between the two scores.
     * NOTE(review): presumably `after - before`; confirm against the implementation.
     */
    delta: number;
}
/**
 * Context size preset for optimization.
 *
 * Passed via `OptimizeOptions.contextSize`.
 */
export type ContextSize = 'compact' | 'standard' | 'full';
/**
 * Configuration for size-aware optimization (see `optimizeForSize()`).
 *
 * Every field is optional; the defaults applied when a field is omitted
 * are not visible in this declaration.
 */
export interface OptimizeOptions {
    /** Target context size */
    contextSize?: ContextSize;
    /** Optional local overlay content */
    localContent?: string;
    /** Maximum optimization iterations */
    maxIterations?: number;
    /** Target score (stop when reached) */
    targetScore?: number;
    /** HMAC key for proof chain (enables cryptographic proof of optimization) */
    proofKey?: string;
}
/**
 * Result of a headless benchmark via `claude -p`, as returned by
 * `headlessBenchmark()`.
 *
 * `before` and `after` have the same shape, so the two runs can be
 * compared field by field.
 */
export interface HeadlessBenchmarkResult {
    /** Before optimization metrics */
    before: {
        /** Static analysis of the content used for this run */
        analysis: AnalysisResult;
        /**
         * Pass rate of the task suite.
         * NOTE(review): range is not stated here — presumably 0-1; confirm.
         */
        suitePassRate: number;
        /** Number of violations recorded for this run */
        violationCount: number;
        /** Per-task results */
        taskResults: HeadlessTaskResult[];
    };
    /** After optimization metrics */
    after: {
        /** Static analysis of the content used for this run */
        analysis: AnalysisResult;
        /**
         * Pass rate of the task suite.
         * NOTE(review): range is not stated here — presumably 0-1; confirm.
         */
        suitePassRate: number;
        /** Number of violations recorded for this run */
        violationCount: number;
        /** Per-task results */
        taskResults: HeadlessTaskResult[];
    };
    /** Score delta */
    delta: number;
    /** Proof chain with cryptographic verification */
    proofChain: ProofEnvelope[];
    /** Formatted report */
    report: string;
}
/**
 * Result of a single headless task run.
 *
 * Collected in `HeadlessBenchmarkResult.before.taskResults` and
 * `HeadlessBenchmarkResult.after.taskResults`.
 */
export interface HeadlessTaskResult {
    /** Identifier of the task that was run */
    taskId: string;
    /** Prompt used for the task */
    prompt: string;
    /** Whether the task passed */
    passed: boolean;
    /** Violations recorded for this task, as strings */
    violations: string[];
    /** Duration of the run in milliseconds (per the field name) */
    durationMs: number;
}
/**
 * Analyze a CLAUDE.md file and produce quantifiable scores.
 *
 * Scores 6 dimensions (0-100 each), weighted into a composite:
 * - Structure (20%): headings, sections, length, organization
 * - Coverage (20%): build/test/security/architecture/domain
 * - Enforceability (25%): NEVER/ALWAYS statements, concrete rules
 * - Compilability (15%): how well it compiles to constitution + shards
 * - Clarity (10%): code blocks, examples, specificity
 * - Completeness (10%): missing common sections
 *
 * @param content - CLAUDE.md content to analyze
 * @param localContent - Optional local overlay content
 *   (NOTE(review): how it is combined with `content` is not visible here)
 * @returns The composite score, grade, per-dimension scores, metrics and suggestions
 */
export declare function analyze(content: string, localContent?: string): AnalysisResult;
/**
 * Run a before/after benchmark.
 * Returns the delta and per-dimension changes.
 *
 * @param before - Content for the "before" side of the comparison
 * @param after - Content for the "after" side of the comparison
 * @param localContent - Optional local overlay content
 * @returns Both analyses, the overall delta, and per-dimension improvements/regressions
 */
export declare function benchmark(before: string, after: string, localContent?: string): BenchmarkResult;
/**
 * Auto-optimize a CLAUDE.md file by applying high-priority suggestions.
 * Returns the optimized content and the benchmark result.
 *
 * @param content - CLAUDE.md content to optimize
 * @param localContent - Optional local overlay content
 * @param maxIterations - Optional cap on optimization iterations
 *   (the default is not visible in this declaration)
 * @returns `optimized` (the resulting content), `benchmark` (a before/after
 *   comparison), and `appliedSuggestions` (the suggestions that were applied)
 */
export declare function autoOptimize(content: string, localContent?: string, maxIterations?: number): {
    optimized: string;
    benchmark: BenchmarkResult;
    appliedSuggestions: Suggestion[];
};
/**
 * Context-size-aware optimization that restructures content to reach 90%+.
 *
 * Unlike autoOptimize (which only appends), this function:
 * 1. Splits oversized sections into subsections
 * 2. Extracts enforcement prose into list-format rules
 * 3. Trims the constitution to budget
 * 4. Removes redundant content
 * 5. Adds missing coverage sections
 * 6. Applies iterative patch suggestions
 *
 * @param content - CLAUDE.md content
 * @param options - Optimization options with contextSize and targetScore
 * @returns `optimized` (the resulting content), `benchmark` (a before/after
 *   comparison), `appliedSteps` (the steps that were applied, as strings),
 *   and `proof` (the proof chain envelopes)
 */
export declare function optimizeForSize(content: string, options?: OptimizeOptions): {
    optimized: string;
    benchmark: BenchmarkResult;
    appliedSteps: string[];
    proof: ProofEnvelope[];
};
/**
 * Run a headless benchmark using `claude -p` to measure actual agent
 * compliance before and after optimization.
 *
 * Requires `claude` CLI to be installed. Uses the proof chain to create
 * tamper-evident records of each test run.
 *
 * @param originalContent - Original CLAUDE.md
 * @param optimizedContent - Optimized CLAUDE.md
 * @param options - Options including proof key and executor; all fields
 *   are optional and their defaults are not visible in this declaration
 * @returns A promise resolving to the before/after metrics, score delta,
 *   proof chain and formatted report
 */
export declare function headlessBenchmark(originalContent: string, optimizedContent: string, options?: {
    /** Key used for the proof chain */
    proofKey?: string;
    /** Executor used to run the headless commands */
    executor?: IHeadlessExecutor;
    /** Benchmark tasks to run */
    tasks?: HeadlessBenchmarkTask[];
    /** Working directory handed to the executor */
    workDir?: string;
}): Promise<HeadlessBenchmarkResult>;
/**
 * Executor interface for headless claude commands.
 *
 * Accepted through the `executor` option of `headlessBenchmark()`,
 * `validateEffect()` and `abBenchmark()`.
 */
export interface IHeadlessExecutor {
    /**
     * Run a single prompt.
     *
     * @param prompt - Prompt to execute
     * @param workDir - Working directory for the execution
     * @returns A promise resolving to the captured `stdout`, `stderr`
     *   and `exitCode` of the run
     */
    execute(prompt: string, workDir: string): Promise<{
        stdout: string;
        stderr: string;
        exitCode: number;
    }>;
}
/**
 * Content-aware executor that adapts behavior based on CLAUDE.md content.
 *
 * When `validateEffect()` detects this interface, it calls `setContext()`
 * before each phase (before/after) so the executor can vary its responses
 * based on the quality of the loaded CLAUDE.md. This is the key mechanism
 * that makes the empirical validation meaningful — without it, the same
 * executor produces identical adherence for both phases.
 *
 * Extends `IHeadlessExecutor`, so it can be passed wherever a plain
 * headless executor is accepted.
 */
export interface IContentAwareExecutor extends IHeadlessExecutor {
    /**
     * Set the CLAUDE.md content that the executor should use as behavioral context.
     *
     * @param claudeMdContent - CLAUDE.md content for the upcoming phase
     */
    setContext(claudeMdContent: string): void;
}
/**
 * Benchmark task definition for `headlessBenchmark()`.
 *
 * Exported so that callers can type the custom task lists they pass via
 * the `tasks` option of `headlessBenchmark()`.
 */
export interface HeadlessBenchmarkTask {
    /** Task identifier */
    id: string;
    /** Prompt for the task */
    prompt: string;
    /**
     * Strings associated with forbidden output.
     * NOTE(review): presumably a task fails when any of these appears in
     * the output — confirm against the implementation.
     */
    expectForbidden: string[];
    /**
     * Strings associated with required output.
     * NOTE(review): presumably a task fails when any of these is missing
     * from the output — confirm against the implementation.
     */
    expectPresent: string[];
}
/**
 * Format analysis result as a human-readable report.
 *
 * @param result - Analysis result to format
 * @returns The report text
 */
export declare function formatReport(result: AnalysisResult): string;
/**
 * Format benchmark result as a comparison table.
 *
 * @param result - Benchmark result to format
 * @returns The comparison table text
 */
export declare function formatBenchmark(result: BenchmarkResult): string;
/**
 * An assertion about expected agent behavior.
 *
 * Used by both `ValidationTask.assertions` and `ABTask.assertions`.
 */
export interface ValidationAssertion {
    /** Kind of check to perform */
    type: 'must-contain' | 'must-not-contain' | 'must-match-pattern' | 'must-mention-tool';
    /** The value to check (string literal or regex pattern for must-match-pattern) */
    value: string;
    /** How severe a failure of this assertion is */
    severity: 'critical' | 'major' | 'minor';
}
/**
 * A compliance task that tests whether the agent adheres to a specific
 * dimension's expected behavior.
 *
 * Custom tasks can be supplied through the `tasks` option of
 * `validateEffect()`.
 */
export interface ValidationTask {
    /** Unique task identifier */
    id: string;
    /** Which scoring dimension this task validates */
    dimension: string;
    /** The prompt to send to the agent */
    prompt: string;
    /** Assertions about the agent's output */
    assertions: ValidationAssertion[];
    /** Importance weight within its dimension (0-1) */
    weight: number;
}
/**
 * Result of running a single validation task.
 *
 * Collected in `ValidationRun.taskResults`.
 */
export interface ValidationTaskResult {
    /** Identifier of the task that was run */
    taskId: string;
    /** Scoring dimension the task belongs to */
    dimension: string;
    /** Whether the task passed */
    passed: boolean;
    /** One entry per evaluated assertion */
    assertionResults: {
        /** The assertion that was evaluated */
        assertion: ValidationAssertion;
        /** Whether this assertion passed */
        passed: boolean;
        /** Detail text for this assertion's outcome */
        detail: string;
    }[];
    /** Output the assertions were evaluated against */
    output: string;
    /** Duration of the run in milliseconds (per the field name) */
    durationMs: number;
}
/**
 * A single validation run against one CLAUDE.md version.
 *
 * `ValidationReport` holds two of these: one for the original content
 * and one for the optimized content.
 */
export interface ValidationRun {
    /** Analysis of the CLAUDE.md used */
    analysis: AnalysisResult;
    /** Per-task results */
    taskResults: ValidationTaskResult[];
    /** Overall adherence rate (0-1) — weighted by severity */
    adherenceRate: number;
    /** Per-dimension adherence rates, keyed by dimension */
    dimensionAdherence: Record<string, number>;
    /**
     * Timestamp of the run.
     * NOTE(review): unit is not stated in this declaration — presumably
     * epoch milliseconds; confirm against the implementation.
     */
    timestamp: number;
}
/**
 * Statistical correlation between score changes and behavioral changes.
 *
 * Reported on `ValidationReport.correlation`.
 */
export interface CorrelationResult {
    /** Per-dimension score vs adherence comparison */
    dimensionCorrelations: {
        /** Dimension being compared */
        dimension: string;
        /** Dimension score before optimization */
        scoreBefore: number;
        /** Dimension score after optimization */
        scoreAfter: number;
        /** Change in dimension score */
        scoreDelta: number;
        /** Adherence rate before optimization */
        adherenceBefore: number;
        /** Adherence rate after optimization */
        adherenceAfter: number;
        /** Change in adherence rate */
        adherenceDelta: number;
        /** Did score and adherence move in the same direction? */
        concordant: boolean;
    }[];
    /** Pearson correlation coefficient (-1 to 1) */
    pearsonR: number;
    /** Spearman rank correlation coefficient (-1 to 1) — more robust for small samples */
    spearmanRho: number;
    /** Cohen's d effect size (null if insufficient data) */
    cohensD: number | null;
    /** Human-readable effect size label */
    effectSizeLabel: string;
    /** Number of data points */
    n: number;
    /** Is the correlation statistically significant? (|r| > threshold for n) */
    significant: boolean;
    /** Overall verdict */
    verdict: 'positive-effect' | 'negative-effect' | 'no-effect' | 'inconclusive';
}
/**
 * Complete validation report proving (or disproving) that score improvements
 * lead to behavioral improvements.
 *
 * Returned (wrapped in a promise) by `validateEffect()`.
 */
export interface ValidationReport {
    /** Run against original CLAUDE.md */
    before: ValidationRun;
    /** Run against optimized CLAUDE.md */
    after: ValidationRun;
    /** Statistical correlation analysis */
    correlation: CorrelationResult;
    /** Cryptographic proof chain */
    proofChain: ProofEnvelope[];
    /** Formatted human-readable report */
    report: string;
}
/**
 * Empirically validate that score improvements produce behavioral improvements.
 *
 * Runs a suite of compliance tasks against both the original and optimized
 * CLAUDE.md, then computes statistical correlations between per-dimension
 * score deltas and per-dimension adherence rate deltas.
 *
 * **Content-aware executors**: If the executor implements `IContentAwareExecutor`,
 * `setContext()` is called before each phase with the corresponding CLAUDE.md
 * content. This is the key mechanism that allows the executor to vary its
 * behavior based on the quality of the loaded guidance — without it, the same
 * executor produces identical adherence for both phases.
 *
 * The result includes:
 * - Per-dimension concordance (did score and adherence move together?)
 * - Pearson r and Spearman rho correlation coefficients
 * - Cohen's d effect size with interpretation
 * - A verdict: positive-effect, negative-effect, no-effect, or inconclusive
 * - A formatted report with full task breakdown
 * - Optional proof chain for tamper-evident audit trail
 *
 * @param originalContent - Original CLAUDE.md content
 * @param optimizedContent - Optimized CLAUDE.md content
 * @param options - Executor, tasks, proof key, work directory, trials
 * @returns ValidationReport with statistical evidence
 */
export declare function validateEffect(originalContent: string, optimizedContent: string, options?: {
    /** Executor used to run the tasks; may also implement `IContentAwareExecutor` */
    executor?: IHeadlessExecutor;
    /** Compliance tasks to run */
    tasks?: ValidationTask[];
    /** Key used for the proof chain */
    proofKey?: string;
    /** Working directory handed to the executor */
    workDir?: string;
    /** Number of trials per phase (default 1). Higher values average out noise. */
    trials?: number;
}): Promise<ValidationReport>;
/**
 * Task class categories for the A/B benchmark.
 *
 * Used to group results (`ABTask.taskClass`) and as the key type of the
 * per-class records in `ABMetrics` and `ABReport`.
 */
export type ABTaskClass = 'bug-fix' | 'feature' | 'refactor' | 'security' | 'deployment' | 'test' | 'performance';
/**
 * A single benchmark task representing a real Claude Flow scenario.
 *
 * Custom tasks can be supplied through the `tasks` option of
 * `abBenchmark()`; the defaults come from `getDefaultABTasks()`.
 */
export interface ABTask {
    /** Unique task identifier */
    id: string;
    /** Human-readable description */
    description: string;
    /** Task class for grouping results */
    taskClass: ABTaskClass;
    /** Prompt sent to the executor */
    prompt: string;
    /** Assertions to evaluate pass/fail */
    assertions: ValidationAssertion[];
    /** Violation patterns to detect via gate simulation */
    gatePatterns: ABGatePattern[];
}
/**
 * A pattern the gate simulator checks for in executor output.
 *
 * Attached to tasks through `ABTask.gatePatterns`.
 */
export interface ABGatePattern {
    /** What kind of violation this detects */
    category: 'destructive-command' | 'hardcoded-secret' | 'force-push' | 'unsafe-type' | 'skipped-hook' | 'missing-test' | 'policy-violation';
    /** Regex pattern to match in output (given as a string) */
    pattern: string;
    /** Severity of the violation */
    severity: 'critical' | 'major' | 'minor';
}
/**
 * Result for a single task in either config A or config B.
 *
 * Collected in `ABReport.configA.taskResults` and
 * `ABReport.configB.taskResults`.
 */
export interface ABTaskResult {
    /** Task ID */
    taskId: string;
    /** Task class */
    taskClass: ABTaskClass;
    /** Did all assertions pass? */
    passed: boolean;
    /** Assertion evaluation details, one entry per assertion */
    assertionResults: {
        /** The assertion that was evaluated */
        assertion: ValidationAssertion;
        /** Whether this assertion passed */
        passed: boolean;
        /** Detail text for this assertion's outcome */
        detail: string;
    }[];
    /** Gate violations detected */
    violations: {
        /** Violation category */
        category: string;
        /** Pattern associated with the violation */
        pattern: string;
        /** Violation severity */
        severity: string;
    }[];
    /** Would a human need to intervene? (any critical violation) */
    humanIntervention: boolean;
    /** Simulated tool call count (extracted from output) */
    toolCalls: number;
    /** Simulated token spend (estimated from output length) */
    tokenSpend: number;
    /** Raw executor output */
    output: string;
    /** Execution duration in ms */
    durationMs: number;
}
/**
 * Aggregated KPIs for one config (A or B).
 *
 * Reported on `ABReport.configA.metrics` and `ABReport.configB.metrics`.
 */
export interface ABMetrics {
    /** Fraction of tasks that passed (0-1) */
    successRate: number;
    /** Total wall clock time in ms */
    wallClockMs: number;
    /** Average tool calls per task */
    avgToolCalls: number;
    /** Average token spend per task */
    avgTokenSpend: number;
    /** Total gate violations */
    totalViolations: number;
    /** Tasks requiring human intervention */
    humanInterventions: number;
    /** Per-task-class success rates, keyed by task class */
    classSuccessRates: Record<ABTaskClass, number>;
    /** Composite score: success_rate - 0.1*norm_cost - 0.2*violations - 0.1*interventions */
    compositeScore: number;
}
/**
 * Complete A/B benchmark report.
 *
 * Returned (wrapped in a promise) by `abBenchmark()`. `configA` and
 * `configB` have the same shape so the two configs can be compared
 * field by field.
 */
export interface ABReport {
    /** Config A results (no control plane) */
    configA: {
        /** Display label for this config */
        label: string;
        /** Per-task results */
        taskResults: ABTaskResult[];
        /** Aggregated KPIs */
        metrics: ABMetrics;
    };
    /** Config B results (with Phase 1 control plane) */
    configB: {
        /** Display label for this config */
        label: string;
        /** Per-task results */
        taskResults: ABTaskResult[];
        /** Aggregated KPIs */
        metrics: ABMetrics;
    };
    /** Composite score delta (B - A) */
    compositeDelta: number;
    /** Per-task-class deltas, keyed by task class */
    classDeltas: Record<ABTaskClass, number>;
    /** Does B beat A by ≥0.2 on composite across ≥3 task classes? */
    categoryShift: boolean;
    /** Proof chain envelopes */
    proofChain: ProofEnvelope[];
    /** Formatted human-readable report */
    report: string;
}
/**
 * Run an A/B benchmark comparing agent performance with and without
 * the Guidance Control Plane.
 *
 * **Config A** (baseline): No guidance — executor runs without setContext()
 * **Config B** (treatment): With guidance — executor gets setContext(claudeMd) +
 *   gate simulation on every output
 *
 * The 20 tasks span 7 task classes drawn from real Claude Flow repo history:
 * bug-fix (3), feature (5), refactor (3), security (3), deployment (2),
 * test (2), performance (2).
 *
 * KPIs tracked per task:
 * - success rate, tool calls, token spend, violations, human interventions
 *
 * Composite score: `success_rate - 0.1*norm_cost - 0.2*violations - 0.1*interventions`
 *
 * **Success criterion**: B beats A by ≥0.2 on composite across ≥3 task classes
 * = "category shift"
 *
 * @param claudeMdContent - The CLAUDE.md content used for Config B
 * @param options - Executor, tasks, proof key, work directory
 * @returns ABReport with full per-task and per-class breakdown
 */
export declare function abBenchmark(claudeMdContent: string, options?: {
    /** Executor used to run the tasks */
    executor?: IHeadlessExecutor;
    /** Tasks to run */
    tasks?: ABTask[];
    /** Key used for the proof chain */
    proofKey?: string;
    /** Working directory handed to the executor */
    workDir?: string;
}): Promise<ABReport>;
/**
 * Get the default 20 A/B benchmark tasks.
 * Exported for test customization and documentation.
 *
 * @returns The default task list
 */
export declare function getDefaultABTasks(): ABTask[];
export {};
//# sourceMappingURL=analyzer.d.ts.map