forked from TIGER-AI-Lab/MMLU-Pro
-
Notifications
You must be signed in to change notification settings - Fork 0
/
compute_accuracy.py
78 lines (67 loc) · 2.04 KB
/
compute_accuracy.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
import glob
import sys
import json
import re
import random
# The results directory is a required positional argument. It is validated
# explicitly instead of with `assert`, because asserts are stripped under
# `python -O` and a missing argument would then surface as a bare IndexError.
if len(sys.argv) < 2:
    sys.exit('You need to pass the directory')
# Directory whose files are scored by the loop at the bottom of this script.
path = sys.argv[1]
def extract_answer(text, level):
    """Pull the predicted option letter (A-J) out of a model response.

    Both levels first look for the phrase ``answer is X`` / ``answer is (X)``.

    Args:
        text: The model output to scan.
        level: ``'l1'`` to use only the phrase above, or ``'l2'`` to fall back
            to ``extract_again`` when the phrase is absent.

    Returns:
        The matched letter, or ``None`` when nothing is found at level
        ``'l1'``. Any other ``level`` value yields ``None`` without
        inspecting ``text``.
    """
    if level not in ('l1', 'l2'):
        return None

    found = re.search(r"answer is \(?([A-J])\)?", text)
    if found is not None:
        return found.group(1)

    # Only the looser level keeps trying with the fallback patterns.
    if level == 'l1':
        return None
    return extract_again(text)
def extract_again(text):
    """Second-chance extraction: look for ``Answer: X`` or ``answer: X``.

    Because of the leading greedy ``.*`` (which does not cross newlines),
    when one line holds several matches the last one on that line is
    returned. If the marker is not found at all, the decision is delegated
    to ``extract_final``.

    Args:
        text: The model output to scan.

    Returns:
        The letter following the marker, or whatever ``extract_final``
        returns when there is no marker.
    """
    found = re.search(r'.*[aA]nswer:\s*([A-J])', text)
    if found is None:
        return extract_final(text)
    return found.group(1)
def extract_final(text):
    """Return the last stand-alone capital letter A-J in *text*.

    "Stand-alone" means the letter has a word boundary on both sides, so
    ``"(D)"`` or ``"pick B."`` count while ``"ABC"`` does not. Ordinary
    one-letter words such as ``"I"`` or ``"A"`` also qualify.

    The text is scanned once and the final hit is kept. This gives the same
    result as a "letter not followed by any later letter" lookahead, without
    rescanning the remainder of the text for every candidate.

    Args:
        text: The model output to scan.

    Returns:
        The last matching letter, or ``None`` if there is none.
    """
    letters = re.findall(r"\b[A-J]\b", text)
    return letters[-1] if letters else None
def _score_entries(entries, level):
    """Count correct and wrong predictions for *entries* at a regex level.

    Each entry is read through its ``'model_outputs'`` and ``'answer'`` keys.

    Args:
        entries: Iterable of result records loaded from one JSON file.
        level: Extraction level passed through to ``extract_answer``.

    Returns:
        A ``(succ, fail)`` tuple of counts.
    """
    choices = ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J"]
    succ, fail = 0, 0
    for e in entries:
        pred = extract_answer(e['model_outputs'], level)
        if pred is None:
            # Unparseable outputs are scored as a guess rather than dropped.
            # NOTE(review): re-seeding before every draw makes the "random"
            # choice the same letter each time. Kept unchanged so scores stay
            # comparable with earlier runs; confirm whether this is intended.
            random.seed(12345)
            pred = random.choice(choices)
        if pred == e['answer']:
            succ += 1
        else:
            fail += 1
    return succ, fail


def _accuracy(succ, fail):
    """Return succ / (succ + fail), or NaN when there is nothing to score."""
    total = succ + fail
    return succ / total if total else float('nan')


# Score every file in the results directory with both extraction levels.
for name in glob.glob(path + '/*'):
    print('Level 1 regex' + '==' * 20)
    # Parse the file once and reuse the records for both passes.
    with open(name, 'r', encoding='utf-8') as f:
        entries = json.load(f)
    print(name, _accuracy(*_score_entries(entries, 'l1')))

    print('Level 2 regex' + '==' * 20)
    print(name, _accuracy(*_score_entries(entries, 'l2')))

    print()