import OpenAI from "openai";
import { LLMEvaluator, AgentLabConfig } from "./types";

/**
 * OpenAI-based evaluator for agent responses
 */
export class OpenAIEvaluator implements LLMEvaluator {
  private client: OpenAI;
  private model: string;
  private temperature: number;

  constructor(config: AgentLabConfig) {
    this.client = new OpenAI({
      apiKey: config.llm.apiKey,
    });
    this.model = config.llm.model || "gpt-4-turbo-preview";
    // Use ?? rather than || so an explicit temperature of 0 is respected.
    this.temperature = config.llm.temperature ?? 0.3;
  }

  async evaluate(
    input: string | Record<string, any>,
    actualResponse: string | Record<string, any>,
    expectedBehavior?: string,
    exampleResponses?: string[]
  ): Promise<{ passed: boolean; reasoning: string }> {
    const prompt = this.buildEvaluationPrompt(
      input,
      actualResponse,
      expectedBehavior,
      exampleResponses
    );

    try {
      const response = await this.client.chat.completions.create({
        model: this.model,
        temperature: this.temperature,
        messages: [
          {
            role: "system",
            content: `You are an expert evaluator for AI agent responses. Your job is to determine if an agent's response is appropriate given the input and expected behavior.

You should evaluate based on:
1. Correctness: Does the response address the input appropriately?
2. Relevance: Is the response relevant to what was asked?
3. Quality: Is the response well-formed and useful?
4. Alignment: Does it match the expected behavior and example responses (if provided)?

Respond in the following JSON format:
{
  "passed": true/false,
  "reasoning": "Brief explanation of your evaluation"
}`,
          },
          {
            role: "user",
            content: prompt,
          },
        ],
      });

      const content = response.choices[0]?.message?.content;
      if (!content) {
        throw new Error("No response from LLM evaluator");
      }

      // Parse the JSON verdict; malformed output throws and is wrapped by the catch below.
      const result = JSON.parse(content);
      return {
        passed: Boolean(result.passed),
        reasoning: result.reasoning || "No reasoning provided",
      };
    } catch (error: any) {
      throw new Error(`LLM evaluation failed: ${error.message}`);
    }
  }

  private buildEvaluationPrompt(
    input: string | Record<string, any>,
    actualResponse: string | Record<string, any>,
    expectedBehavior?: string,
    exampleResponses?: string[]
  ): string {
    const inputStr =
      typeof input === "string" ? input : JSON.stringify(input, null, 2);
    const responseStr =
      typeof actualResponse === "string"
        ? actualResponse
        : JSON.stringify(actualResponse, null, 2);

    let prompt = `**Input:**
\`\`\`
${inputStr}
\`\`\`

**Actual Response:**
\`\`\`
${responseStr}
\`\`\`
`;

    if (expectedBehavior) {
      prompt += `\n**Expected Behavior:**
${expectedBehavior}
`;
    }

    if (exampleResponses && exampleResponses.length > 0) {
      prompt += `\n**Example Appropriate Responses:**
${exampleResponses.map((ex, i) => `${i + 1}. ${ex}`).join("\n")}
`;
    }

    prompt += `\n\nPlease evaluate if the actual response is appropriate.`;

    return prompt;
  }
}

/**
 * Factory function to create evaluator based on config
 */
export function createEvaluator(config: AgentLabConfig): LLMEvaluator {
  switch (config.llm.provider) {
    case "openai":
      return new OpenAIEvaluator(config);
    default:
      throw new Error(`Unsupported LLM provider: ${config.llm.provider}`);
  }
}
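
/**
 * Usage sketch (illustrative, not part of the original module): shows one way
 * the factory and evaluator could be wired together. The AgentLabConfig fields
 * used below (llm.provider, llm.apiKey, llm.model, llm.temperature) are assumed
 * from how the constructor reads them; see "./types" for the authoritative shape.
 */
export async function exampleEvaluation(): Promise<void> {
  // Hypothetical config; the cast acknowledges AgentLabConfig may carry more
  // fields than are shown here. Reading process.env assumes a Node runtime.
  const config = {
    llm: {
      provider: "openai",
      apiKey: process.env.OPENAI_API_KEY ?? "",
      model: "gpt-4-turbo-preview",
      temperature: 0.3,
    },
  } as AgentLabConfig;

  const evaluator = createEvaluator(config);
  const verdict = await evaluator.evaluate(
    "What is the capital of France?",
    "The capital of France is Paris.",
    "The agent should answer factual questions accurately and concisely."
  );
  console.log(verdict.passed, verdict.reasoning);
}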