evaluator.ts

Statements: 100% (28/28)
Branches: 82.35% (14/17)
Functions: 100% (5/5)
Lines: 100% (28/28)

import OpenAI from "openai";
import { LLMEvaluator, AgentLabConfig } from "./types";
 
/**
 * OpenAI-based evaluator for agent responses
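 *
 * Example usage (an illustrative sketch, not from this file): the config
 * literal assumes AgentLabConfig exposes an `llm` object with the fields the
 * constructor reads; the real type in "./types" may differ.
 *
 * @example
 * const evaluator = new OpenAIEvaluator({
 *   llm: { provider: "openai", apiKey: process.env.OPENAI_API_KEY ?? "" },
 * } as AgentLabConfig);
 * const verdict = await evaluator.evaluate(
 *   "What is the capital of France?",
 *   "The capital of France is Paris.",
 *   "Answer factual questions concisely and accurately."
 * );
 * // verdict has the shape { passed: boolean, reasoning: string }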
 */
export class OpenAIEvaluator implements LLMEvaluator {
  private client: OpenAI;
  private model: string;
  private temperature: number;
 
  constructor(config: AgentLabConfig) {
    this.client = new OpenAI({
      apiKey: config.llm.apiKey,
    });
    this.model = config.llm.model || "gpt-4-turbo-preview";
    // Use ?? (not ||) so an explicit temperature of 0 is kept rather than
    // silently replaced by the 0.3 default.
    this.temperature = config.llm.temperature ?? 0.3;
  }
 
  async evaluate(
    input: string | Record<string, any>,
    actualResponse: string | Record<string, any>,
    expectedBehavior?: string,
    exampleResponses?: string[]
  ): Promise<{ passed: boolean; reasoning: string }> {
    const prompt = this.buildEvaluationPrompt(
      input,
      actualResponse,
      expectedBehavior,
      exampleResponses
    );
 
    try {
      const response = await this.client.chat.completions.create({
        model: this.model,
        temperature: this.temperature,
        messages: [
          {
            role: "system",
            content: `You are an expert evaluator for AI agent responses. Your job is to determine if an agent's response is appropriate given the input and expected behavior.
 
You should evaluate based on:
1. Correctness: Does the response address the input appropriately?
2. Relevance: Is the response relevant to what was asked?
3. Quality: Is the response well-formed and useful?
4. Alignment: Does it match the expected behavior and example responses (if provided)?
 
Respond in the following JSON format:
{
  "passed": true/false,
  "reasoning": "Brief explanation of your evaluation"
}`,
          },
          {
            role: "user",
            content: prompt,
          },
        ],
      });
 
      const content = response.choices[0]?.message?.content;
      if (!content) {
        throw new Error("No response from LLM evaluator");
      }
 
      // Parse JSON response
      const result = JSON.parse(content);
      return {
        passed: Boolean(result.passed),
        reasoning: result.reasoning || "No reasoning provided",
      };
    } catch (error: any) {
      throw new Error(`LLM evaluation failed: ${error.message}`);
    }
  }
 
  private buildEvaluationPrompt(
    input: string | Record<string, any>,
    actualResponse: string | Record<string, any>,
    expectedBehavior?: string,
    exampleResponses?: string[]
  ): string {
    const inputStr =
      typeof input === "string" ? input : JSON.stringify(input, null, 2);
    const responseStr =
      typeof actualResponse === "string"
        ? actualResponse
        : JSON.stringify(actualResponse, null, 2);
 
    let prompt = `**Input:**
\`\`\`
${inputStr}
\`\`\`
 
**Actual Response:**
\`\`\`
${responseStr}
\`\`\`
`;
 
    if (expectedBehavior) {
      prompt += `\n**Expected Behavior:**
${expectedBehavior}
`;
    }
 
    if (exampleResponses && exampleResponses.length > 0) {
      prompt += `\n**Example Appropriate Responses:**
${exampleResponses.map((ex, i) => `${i + 1}. ${ex}`).join("\n")}
`;
    }
 
    prompt += `\n\nPlease evaluate if the actual response is appropriate.`;
 
    return prompt;
  }
}
 
/**
 * Factory function to create evaluator based on config
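 *
 * Usage sketch (hypothetical config literal; only the "openai" provider is
 * supported here, any other value throws):
 *
 * @example
 * const evaluator = createEvaluator({
 *   llm: { provider: "openai", apiKey: process.env.OPENAI_API_KEY ?? "" },
 * } as AgentLabConfig);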
 */
export function createEvaluator(config: AgentLabConfig): LLMEvaluator {
  switch (config.llm.provider) {
    case "openai":
      return new OpenAIEvaluator(config);
    default:
      throw new Error(`Unsupported LLM provider: ${config.llm.provider}`);
  }
}