comparison

ku-nlp · Aug 24, 2023 · e323fba · e323fba
1 parent 100c331
commit e323fba
Show file tree

Hide file tree

Showing 25 changed files with 95,019 additions and 46 deletions.
diff --git a/fastchat/llm_judge/common.py b/fastchat/llm_judge/common.py
@@ -24,7 +24,8 @@
 TIE_DELTA = 0.1
 
 # Categories that need reference answers
-NEED_REF_CATS = ["math", "reasoning", "coding"]
+#NEED_REF_CATS = ["math", "reasoning", "coding"]
+NEED_REF_CATS = []
 
 # Extract scores from judgments
 two_score_pattern = re.compile("\[\[(\d+\.?\d*),\s?(\d+\.?\d*)\]\]")
@@ -104,10 +105,17 @@ def load_model_answers(answer_dir: str):
     for filename in filenames:
         model_name = os.path.basename(filename)[:-6]
         answer = {}
-        with open(filename) as fin:
-            for line in fin:
-                line = json.loads(line)
-                answer[line["question_id"]] = line
+
+        #print(filename)
+        with open(filename, "r") as fin:
+            if "lora" not in filename:
+                for line in fin:
+                    line = json.loads(line)
+                    answer[line["question_id"]] = line
+            else:
+                data = json.load(fin)
+                for line in data:
+                    answer[line["question_id"]] = line
         model_answers[model_name] = answer
 
     return model_answers
@@ -641,13 +649,16 @@ def check_data(questions, model_answers, ref_answers, models, judges):
             ), f"Missing model {m}'s answer to Question {q['question_id']}"
     # check ref answers
     for jg in judges.values():
+        print(jg)
         if not jg.ref_based:
             continue
         for q in questions:
             if q["category"] not in NEED_REF_CATS:
                 continue
+            print(q["question_id"])
+            #print(ref_answers[jg.model_name])
             assert (
-                q["question_id"] in ref_answers[jg.model_name]
+                int(q["question_id"]) in ref_answers[jg.model_name]
             ), f"Missing reference answer to Question {q['question_id']} for judge {jg.model_name}"
 
 

diff --git a/fastchat/llm_judge/data/jp_bench/convert.py b/fastchat/llm_judge/data/jp_bench/convert.py
@@ -0,0 +1,22 @@
+"""Convert JSONL questions: int `question_id`, `text` -> one-item `turns` list."""
+import json
+import sys
+
+# Usage: python convert.py <infile.jsonl> <outfile.jsonl>
+infile = sys.argv[1]
+outfile = sys.argv[2]
+
+# UTF-8 is explicit because records are written with ensure_ascii=False.
+with open(infile, "r", encoding="utf-8") as fin:
+    # Skip blank lines, which would otherwise crash json.loads.
+    lines = [line for line in fin if line.strip()]
+
+# Open the output once (still append mode) instead of once per record.
+with open(outfile, "a", encoding="utf-8") as fout:
+    for line in lines:
+        record = json.loads(line)
+        record["question_id"] = int(record["question_id"])
+        # Replace the `text` field with a one-element `turns` list.
+        record["turns"] = [record.pop("text")]
+        json.dump(record, fout, ensure_ascii=False)
+        fout.write("\n")
diff --git a/fastchat/llm_judge/data/jp_bench/model_answer/Japanese-Alpaca-LoRA-7b.jsonl b/fastchat/llm_judge/data/jp_bench/model_answer/Japanese-Alpaca-LoRA-7b.jsonl