Skip to content

Commit

Permalink
feat: bench - max tokens for the judge, improving default prompt, sho…
Browse files Browse the repository at this point in the history
…rt 0shot eval prompt, fixing judge prompt display in tasks report, judge prompt selection
  • Loading branch information
av committed Oct 6, 2024
1 parent 3054459 commit 91bc947
Show file tree
Hide file tree
Showing 5 changed files with 43 additions and 10 deletions.
1 change: 1 addition & 0 deletions bench/src/config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ export const config = {
apiUrl: Deno.env.get('HARBOR_BENCH_JUDGE_API'),
apiKey: Deno.env.get('HARBOR_BENCH_JUDGE_API_KEY'),
prompt: Deno.env.get('HARBOR_BENCH_JUDGE_PROMPT') ?? 'default',
max_tokens: Deno.env.get('HARBOR_BENCH_JUDGE_MAX_TOKENS') ?? 2048,
temperature: 0,
seed: 42,
} as LLMConfig,
Expand Down
41 changes: 35 additions & 6 deletions bench/src/judge.ts
Original file line number Diff line number Diff line change
Expand Up @@ -25,17 +25,17 @@ Correct response: Yes
Question: Who wrote "Romeo and Juliet"?
Answer: Shakespeare.
Criterion: The answer names Shakespeare as the author
Criterion: Answer is Shakespeare
Correct response: Yes
Question: What is the capital of France?
Answer: Paris
Criterion: Answer mentions Paris being a capital of France
Correct response: Yes
Answer: London
Criterion: Answer is Paris
Correct response: No
Question:
Question: What is the EU capital?
Answer: Paris
Criterion: Answer mentions Paris
Criterion: Answer mentions Brussels
Correct response: No
</instructions>
Expand Down Expand Up @@ -102,7 +102,36 @@ ${criteria}
Please accurately evaluate the task. Strictly adhere to the evaluation criteria and rubric.
`.trim();


/**
 * Short zero-shot judge prompt: asks the evaluator for a bare Yes/No
 * verdict on a single criterion, with no few-shot examples.
 *
 * @param question - the task question shown to the judged model
 * @param answer - the judged model's answer
 * @param criteria - the single criterion to evaluate the answer against
 * @returns the rendered prompt, trimmed of surrounding whitespace
 *   (consistent with the sibling `default` prompt, which also trims)
 */
export const short = ({
  question,
  answer,
  criteria,
}: {
  question: string;
  answer: string;
  criteria: string;
}): string => `
<instructions>
You are an impartial evaluator.
You will be given a question, an answer, and a specific criteria to evaluate that answer.
Respond with "Yes" if and only if the criterion is met.
Respond with "No" if the criterion is not met or only partially met.
Your response must be either "Yes" or "No" only, everything else will be ignored.
</instructions>
<question>
${question}
</question>
<answer>
${answer}
</answer>
<criteria>
${criteria}
</criteria>
`.trim();


export const prompts = {
default: prompt,
flow,
short,
};
5 changes: 3 additions & 2 deletions bench/src/llm.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import { config } from "./config.ts";
import { omit, sleep } from './utils.ts';
import { log } from './log.ts';
import { prompts } from './judge.ts';

export type LLMOptions = {
max_tokens?: number;
Expand All @@ -11,12 +12,12 @@ export type LLMConfig = {
model: string;
apiUrl: string;
apiKey?: string;
prompt?: string;
prompt?: keyof typeof prompts;
options?: LLMOptions;
};

export class LLM {
private llm: LLMConfig;
llm: LLMConfig;

constructor(llm: LLMConfig) {
this.llm = llm;
Expand Down
3 changes: 2 additions & 1 deletion bench/src/report.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import { prompt } from './judge.ts';
import { prompts } from './judge.ts';

export const summaryTemplate = (data: unknown) => `
<!DOCTYPE html>
Expand Down Expand Up @@ -313,6 +313,7 @@ export const runsTemplate = (runs: unknown) => {
<ul>
${Object.entries(task.criteria).map(([key, value]) => {
const result = task.results[key];
const prompt = prompts[judge.llm.prompt ?? 'default'] ?? prompts.default;
const judgePrompt = prompt({ question: task.question, answer: task.answer, criteria: value })
return `
<li>
Expand Down
3 changes: 2 additions & 1 deletion bench/src/task.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import { prompt } from "./judge.ts";
import { prompts } from "./judge.ts";

import type { Task } from "./tasks.ts";
import type { LLM } from "./llm.ts";
Expand Down Expand Up @@ -30,6 +30,7 @@ export class BenchTask implements Task {

async eval(judge: LLM) {
for (const [key, value] of Object.entries(this.criteria)) {
const prompt = prompts[judge.llm.prompt ?? 'default'] ?? prompts.default;
const result = await judge.chat(
prompt({
question: this.question,
Expand Down

0 comments on commit 91bc947

Please sign in to comment.