Skip to content

Commit

Permalink
feat: bench - max tokens for the judge, improving default prompt, sho…
Browse files Browse the repository at this point in the history
…rt 0shot eval prompt, fixing judge prompt display in tasks report, judge prompt selection
  • Loading branch information
av committed Oct 6, 2024
1 parent 3054459 commit 91bc947
Show file tree
Hide file tree
Showing 5 changed files with 43 additions and 10 deletions.
1 change: 1 addition & 0 deletions bench/src/config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ export const config = {
apiUrl: Deno.env.get('HARBOR_BENCH_JUDGE_API'),
apiKey: Deno.env.get('HARBOR_BENCH_JUDGE_API_KEY'),
prompt: Deno.env.get('HARBOR_BENCH_JUDGE_PROMPT') ?? 'default',
max_tokens: Deno.env.get('HARBOR_BENCH_JUDGE_MAX_TOKENS') ?? 2048,
temperature: 0,
seed: 42,
} as LLMConfig,
Expand Down
41 changes: 35 additions & 6 deletions bench/src/judge.ts
Original file line number Diff line number Diff line change
Expand Up @@ -25,17 +25,17 @@ Correct response: Yes
Question: Who wrote "Romeo and Juliet"?
Answer: Shakespeare.
Criterion: The answer names Shakespeare as the author
Criterion: Answer is Shakespeare
Correct response: Yes
Question: What is the capital of France?
Answer: Paris
Criterion: Answer mentions Paris being a capital of France
Correct response: Yes
Answer: London
Criterion: Answer is Paris
Correct response: No
Question:
Question: What is the EU capital?
Answer: Paris
Criterion: Answer mentions Paris
Criterion: Answer mentions Brussels
Correct response: No
</instructions>
Expand Down Expand Up @@ -102,7 +102,36 @@ ${criteria}
Please accurately evaluate the task. Strictly adhere to the evaluation criteria and rubric.
`.trim();


/**
 * Short zero-shot judge prompt: asks the evaluator for a bare Yes/No
 * verdict on a single criterion, with no few-shot examples.
 *
 * @param question - the task question shown to the judged model
 * @param answer - the judged model's answer
 * @param criteria - the single criterion to evaluate the answer against
 * @returns the rendered prompt, trimmed of surrounding whitespace
 *   (consistent with the sibling `default` prompt, which also trims)
 */
export const short = ({
  question,
  answer,
  criteria,
}: {
  question: string;
  answer: string;
  criteria: string;
}): string => `
<instructions>
You are an impartial evaluator.
You will be given a question, an answer, and a specific criteria to evaluate that answer.
Respond with "Yes" if and only if the criterion is met.
Respond with "No" if the criterion is not met or only partially met.
Your response must be either "Yes" or "No" only, everything else will be ignored.
</instructions>
<question>
${question}
</question>
<answer>
${answer}
</answer>
<criteria>
${criteria}
</criteria>
`.trim();


export const prompts = {
default: prompt,
flow,
short,
};
5 changes: 3 additions & 2 deletions bench/src/llm.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import { config } from "./config.ts";
import { omit, sleep } from './utils.ts';
import { log } from './log.ts';
import { prompts } from './judge.ts';

export type LLMOptions = {
max_tokens?: number;
Expand All @@ -11,12 +12,12 @@ export type LLMConfig = {
model: string;
apiUrl: string;
apiKey?: string;
prompt?: string;
prompt?: keyof typeof prompts;
options?: LLMOptions;
};

export class LLM {
private llm: LLMConfig;
llm: LLMConfig;

constructor(llm: LLMConfig) {
this.llm = llm;
Expand Down
3 changes: 2 additions & 1 deletion bench/src/report.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import { prompt } from './judge.ts';
import { prompts } from './judge.ts';

export const summaryTemplate = (data: unknown) => `
<!DOCTYPE html>
Expand Down Expand Up @@ -313,6 +313,7 @@ export const runsTemplate = (runs: unknown) => {
<ul>
${Object.entries(task.criteria).map(([key, value]) => {
const result = task.results[key];
const prompt = prompts[judge.llm.prompt ?? 'default'] ?? prompts.default;
const judgePrompt = prompt({ question: task.question, answer: task.answer, criteria: value })
return `
<li>
Expand Down
3 changes: 2 additions & 1 deletion bench/src/task.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import { prompt } from "./judge.ts";
import { prompts } from "./judge.ts";

import type { Task } from "./tasks.ts";
import type { LLM } from "./llm.ts";
Expand Down Expand Up @@ -30,6 +30,7 @@ export class BenchTask implements Task {

async eval(judge: LLM) {
for (const [key, value] of Object.entries(this.criteria)) {
const prompt = prompts[judge.llm.prompt ?? 'default'] ?? prompts.default;
const result = await judge.chat(
prompt({
question: this.question,
Expand Down

0 comments on commit 91bc947

Please sign in to comment.