"""
Eval Loop Runtime — generated by aiwg nlp new
Pattern: eval-loop
Dependencies: anthropic

The generator and evaluator are ISOLATED calls — no shared context.
"""

from __future__ import annotations

import json
import re
import time
from pathlib import Path
from typing import Any

import anthropic

# Directory holding the prompt templates: "prompts" next to this file's parent directory.
PROMPTS_DIR = Path(__file__).parent.parent / "prompts"
# Model identifiers passed to the generator and evaluator calls respectively.
GENERATOR_MODEL = "claude-haiku-4-5"
EVALUATOR_MODEL = "claude-haiku-4-5"
# max_tokens values passed to the generator and evaluator calls respectively.
MAX_TOKENS_GEN = 1024
MAX_TOKENS_EVAL = 256
# Default for run_eval_loop's pass_threshold parameter.
PASS_THRESHOLD = 0.85
# Default for run_eval_loop's max_attempts parameter (generate/evaluate rounds).
MAX_ATTEMPTS = 3
# Passed as the per-request `timeout` argument in call_llm.
TIMEOUT_SECONDS = 30
# Number of tries call_llm makes for a single request before giving up.
MAX_RETRIES = 3
# Base delay for call_llm's backoff; the delay doubles with each failed try.
BACKOFF_SECONDS = 1.0


def load_prompt(filename: str, variables: dict[str, str]) -> tuple[str, str]:
    """Load a prompt template and render its system and user sections.

    The template is read from ``PROMPTS_DIR / filename``. A leading
    front-matter block delimited by two ``---`` markers is discarded. Lines
    following a ``## System`` heading form the system prompt and lines
    following a ``## User`` heading form the user prompt; anything before the
    first heading is ignored.

    ``{{name}}`` placeholders are replaced in a single pass, so placeholder
    syntax that appears inside a substituted *value* (for example user input
    containing ``{{output}}``) is left untouched instead of being expanded by
    a later variable. Placeholders with no matching variable are kept as-is.

    Args:
        filename: Template file name relative to ``PROMPTS_DIR``.
        variables: Mapping of placeholder name to replacement text.

    Returns:
        A ``(system, user)`` tuple with surrounding whitespace stripped.
    """
    content = (PROMPTS_DIR / filename).read_text(encoding="utf-8")

    if content.startswith("---"):
        parts = content.split("---", 2)
        # Only strip front matter when it is properly closed; a lone leading
        # "---" would otherwise make the unpacking fail with ValueError.
        if len(parts) == 3:
            content = parts[2]

    sections: dict[str, list[str]] = {"system": [], "user": []}
    current = None
    for line in content.splitlines():
        heading = line.strip()
        if heading == "## System":
            current = "system"
        elif heading == "## User":
            current = "user"
        elif current is not None:
            sections[current].append(line)

    system = "\n".join(sections["system"])
    user = "\n".join(sections["user"])

    if variables:
        # One alternation over the escaped keys matches exactly the
        # "{{key}}" spellings the caller supplied and nothing else.
        keys = sorted(variables, key=len, reverse=True)
        placeholder = re.compile(
            r"\{\{(" + "|".join(re.escape(key) for key in keys) + r")\}\}"
        )

        def substitute(match: re.Match[str]) -> str:
            # A function replacement avoids backslash processing of values.
            return variables[match.group(1)]

        system = placeholder.sub(substitute, system)
        user = placeholder.sub(substitute, user)

    return system.strip(), user.strip()


def call_llm(
    client: anthropic.Anthropic, system: str, user: str, model: str, max_tokens: int
) -> str:
    """Send one system/user prompt pair to the model and return the reply text.

    The request is tried up to ``MAX_RETRIES`` times. After a failed try the
    function waits ``BACKOFF_SECONDS * 2 ** attempt`` seconds before the next
    one; no wait happens after the final failure.

    Args:
        client: Anthropic client used to issue the request.
        system: System prompt text.
        user: Content of the single user message.
        model: Model identifier.
        max_tokens: Upper bound on generated tokens.

    Returns:
        The ``text`` of the first content block in the response.

    Raises:
        RuntimeError: If every try failed with a rate-limit or API status
            error; the last such error is attached as ``__cause__``.
    """
    last_error = None
    for attempt in range(MAX_RETRIES):
        try:
            response = client.messages.create(
                model=model,
                max_tokens=max_tokens,
                system=system,
                messages=[{"role": "user", "content": user}],
                timeout=TIMEOUT_SECONDS,
            )
        # NOTE(review): only status-type errors are retried here; connection
        # and timeout errors appear to propagate to the caller — confirm
        # that is intended.
        except (anthropic.RateLimitError, anthropic.APIStatusError) as e:
            last_error = e
            # Back off only when another try follows; sleeping after the
            # last failure would just delay the error below.
            if attempt < MAX_RETRIES - 1:
                time.sleep(BACKOFF_SECONDS * (2 ** attempt))
        else:
            return response.content[0].text
    raise RuntimeError(f"LLM call failed after {MAX_RETRIES} attempts") from last_error


def generate(client: anthropic.Anthropic, input_text: str, feedback: str = "") -> str:
    """Produce a candidate output for ``input_text`` with the generator prompt.

    When ``feedback`` is non-empty, the request sent to the generator is the
    original input prefixed with a refinement instruction that quotes the
    feedback; otherwise the input is sent unchanged.
    """
    if feedback:
        request = f"Previous attempt was insufficient. Feedback: {feedback}\n\nOriginal request: {input_text}"
    else:
        request = input_text

    system_prompt, user_prompt = load_prompt(
        "generator.prompt.md",
        {"input": request, "output_format": "JSON"},
    )
    return call_llm(
        client, system_prompt, user_prompt, GENERATOR_MODEL, MAX_TOKENS_GEN
    )


def evaluate(client: anthropic.Anthropic, input_text: str, output: str) -> dict[str, Any]:
    """Evaluate ``output`` against ``input_text`` in an isolated call.

    Only the original input and the candidate output are rendered into the
    evaluator prompt — no generator context is passed. The reply is expected
    to be a JSON object, optionally wrapped in a Markdown code fence.

    Args:
        client: Anthropic client used to issue the request.
        input_text: The original request.
        output: The candidate output to judge.

    Returns:
        The evaluator's reply parsed into a dict.

    Raises:
        json.JSONDecodeError: If the reply is not valid JSON.
        ValueError: If the reply is valid JSON but not an object.
    """
    system, user = load_prompt("evaluator.prompt.md", {"input": input_text, "output": output})
    raw = call_llm(client, system, user, EVALUATOR_MODEL, MAX_TOKENS_EVAL).strip()

    if raw.startswith("```"):
        lines = raw.splitlines()
        # Drop the opening fence line, and the closing one when present
        # (tolerating indentation in front of the closing fence).
        has_closing_fence = lines[-1].strip() == "```"
        raw = "\n".join(lines[1:-1] if has_closing_fence else lines[1:])

    verdict = json.loads(raw)
    # Callers use dict methods on the result, so reject other JSON types
    # here with a clear message rather than failing later on `.get`.
    if not isinstance(verdict, dict):
        raise ValueError(
            f"Evaluator returned JSON {type(verdict).__name__}, expected an object"
        )
    return verdict


def run_eval_loop(
    input_text: str,
    pass_threshold: float = PASS_THRESHOLD,
    max_attempts: int = MAX_ATTEMPTS,
) -> dict[str, Any]:
    """Run the generate → evaluate → refine loop until pass or max attempts.

    Each round generates an output, evaluates it in a separate call, and —
    if it did not pass — feeds the evaluator's feedback into the next
    generation.

    An attempt passes when the evaluator's numeric ``score`` is at least
    ``pass_threshold``. If the evaluator gives no numeric score, its own
    ``pass`` flag decides instead.

    Args:
        input_text: The request to fulfil.
        pass_threshold: Minimum score that counts as a pass.
        max_attempts: Maximum number of generate/evaluate rounds (>= 1).

    Returns:
        A dict with ``output``, ``score``, ``pass``, ``attempts`` and
        ``history`` (one entry per round). When no round passed it also
        carries ``escalation_reason``.

    Raises:
        ValueError: If ``max_attempts`` is less than 1.
    """
    if max_attempts < 1:
        # Without at least one round there is no output or score to report.
        raise ValueError(f"max_attempts must be at least 1, got {max_attempts}")

    client = anthropic.Anthropic()
    feedback = ""
    history: list[dict[str, Any]] = []
    output = ""
    score: Any = 0.0

    for attempt in range(1, max_attempts + 1):
        output = generate(client, input_text, feedback)
        result = evaluate(client, input_text, output)  # ISOLATED — no generator context

        score = result.get("score", 0.0)
        # bool is an int subclass; a True/False "score" is not a real score.
        has_numeric_score = isinstance(score, (int, float)) and not isinstance(score, bool)
        if has_numeric_score:
            passed = score >= pass_threshold
        else:
            passed = bool(result.get("pass", False))
        feedback = result.get("feedback", "")

        history.append({
            "attempt": attempt,
            "output": output,
            "score": score,
            "pass": passed,
            "feedback": feedback,
        })

        if passed:
            return {
                "output": output,
                "score": score,
                "pass": True,
                "attempts": attempt,
                "history": history,
            }

    return {
        "output": output,
        "score": score,
        "pass": False,
        "attempts": max_attempts,
        "history": history,
        "escalation_reason": f"Failed to pass after {max_attempts} attempts",
    }


if __name__ == "__main__":
    import sys

    # Command-line entry point: the first argument is the input text; the
    # loop's result is printed as indented JSON.
    cli_args = sys.argv[1:]
    if not cli_args:
        print("Usage: python loop.py '<input>'")
        sys.exit(1)
    result = run_eval_loop(cli_args[0])
    print(json.dumps(result, indent=2))
