import OpenAI from "openai";
import { LLMEvaluator, AgentLabConfig } from "./types";

/**
 * OpenAI-based evaluator for agent responses
 */
export class OpenAIEvaluator implements LLMEvaluator {
  private client: OpenAI;
  private model: string;
  private temperature: number;

  constructor(config: AgentLabConfig) {
    this.client = new OpenAI({
      apiKey: config.llm.apiKey,
    });
    this.model = config.llm.model || "gpt-4-turbo-preview";
    // Use ?? rather than || so an explicit temperature of 0 is respected.
    this.temperature = config.llm.temperature ?? 0.3;
  }

  async evaluate(
    input: string | Record<string, any>,
    actualResponse: string | Record<string, any>,
    expectedBehavior?: string,
    exampleResponses?: string[]
  ): Promise<{ passed: boolean; reasoning: string }> {
    const prompt = this.buildEvaluationPrompt(
      input,
      actualResponse,
      expectedBehavior,
      exampleResponses
    );

    try {
      const response = await this.client.chat.completions.create({
        model: this.model,
        temperature: this.temperature,
        messages: [
          {
            role: "system",
            content: `You are an expert evaluator for AI agent responses. Your job is to determine if an agent's response is appropriate given the input and expected behavior.

You should evaluate based on:
1. Correctness: Does the response address the input appropriately?
2. Relevance: Is the response relevant to what was asked?
3. Quality: Is the response well-formed and useful?
4. Alignment: Does it match the expected behavior and example responses (if provided)?

Respond in the following JSON format:
{
  "passed": true/false,
  "reasoning": "Brief explanation of your evaluation"
}`,
          },
          {
            role: "user",
            content: prompt,
          },
        ],
      });

      const content = response.choices[0]?.message?.content;
      if (!content) {
        throw new Error("No response from LLM evaluator");
      }

      // Parse the JSON verdict; malformed output throws and is wrapped by the catch below.
      const result = JSON.parse(content);
      return {
        passed: Boolean(result.passed),
        reasoning: result.reasoning || "No reasoning provided",
      };
    } catch (error: any) {
      throw new Error(`LLM evaluation failed: ${error.message}`);
    }
  }

  private buildEvaluationPrompt(
    input: string | Record<string, any>,
    actualResponse: string | Record<string, any>,
    expectedBehavior?: string,
    exampleResponses?: string[]
  ): string {
    const inputStr =
      typeof input === "string" ? input : JSON.stringify(input, null, 2);
    const responseStr =
      typeof actualResponse === "string"
        ? actualResponse
        : JSON.stringify(actualResponse, null, 2);

    let prompt = `**Input:**
\`\`\`
${inputStr}
\`\`\`

**Actual Response:**
\`\`\`
${responseStr}
\`\`\`
`;

    if (expectedBehavior) {
      prompt += `\n**Expected Behavior:**
${expectedBehavior}
`;
    }

    if (exampleResponses && exampleResponses.length > 0) {
      prompt += `\n**Example Appropriate Responses:**
${exampleResponses.map((ex, i) => `${i + 1}. ${ex}`).join("\n")}
`;
    }

    prompt += `\n\nPlease evaluate if the actual response is appropriate.`;

    return prompt;
  }
}

/**
 * Factory function to create evaluator based on config
 */
export function createEvaluator(config: AgentLabConfig): LLMEvaluator {
  switch (config.llm.provider) {
    case "openai":
      return new OpenAIEvaluator(config);
    default:
      throw new Error(`Unsupported LLM provider: ${config.llm.provider}`);
  }
}
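
/**
 * Usage sketch (illustrative, not part of the original module): shows one way
 * the factory and evaluator could be wired together. The AgentLabConfig fields
 * used below (llm.provider, llm.apiKey, llm.model, llm.temperature) are assumed
 * from how the constructor reads them; see "./types" for the authoritative shape.
 */
export async function exampleEvaluation(): Promise<void> {
  // Hypothetical config; the cast acknowledges AgentLabConfig may carry more
  // fields than are shown here. Reading process.env assumes a Node runtime.
  const config = {
    llm: {
      provider: "openai",
      apiKey: process.env.OPENAI_API_KEY ?? "",
      model: "gpt-4-turbo-preview",
      temperature: 0.3,
    },
  } as AgentLabConfig;

  const evaluator = createEvaluator(config);
  const verdict = await evaluator.evaluate(
    "What is the capital of France?",
    "The capital of France is Paris.",
    "The agent should answer factual questions accurately and concisely."
  );
  console.log(verdict.passed, verdict.reasoning);
}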