From be0b85711f942beb28169a623eb196d68f694489 Mon Sep 17 00:00:00 2001
From: void-b583x2-NULL
Date: Fri, 2 Feb 2024 18:05:54 +0000
Subject: [PATCH] fix qwen multi picture testing

---
 eval/model_tester.py   |  8 ++++----
 eval/models/qwen_hf.py | 20 +++++++++++++++-----
 eval/models/qwen_ms.py | 22 ++++++++++++++++------
 models/.gitkeep        |  0
 4 files changed, 35 insertions(+), 15 deletions(-)
 delete mode 100644 models/.gitkeep

diff --git a/eval/model_tester.py b/eval/model_tester.py
index 0bbb626..cb61083 100644
--- a/eval/model_tester.py
+++ b/eval/model_tester.py
@@ -106,7 +106,7 @@ def parse_args():
 }]

 testcases_for_chinese_puretext = testcases_for_chinese[:3]
-testcases_for_chinese_pic_descrip = testcases_for_chinese[-5:-1]
+testcases_for_chinese_pic_descrip = testcases_for_chinese[-5:]

 from eval import get_evaluator

@@ -126,9 +126,9 @@ def parse_args():
 # if test_args.input_type == 0:
 #     testcases = testcases_for_chinese
 # else:
-testcases = testcases_for_chinese
-# testcases = testcases_for_chinese_pic_descrip[-1:]
-testcases = testcases_for_chinese[-1:]
+# testcases = testcases_for_chinese
+testcases = testcases_for_chinese_pic_descrip
+# testcases = testcases_for_chinese[-1:]

 testcases = dict(zip(range(len(testcases)), testcases))
 for i in testcases:
diff --git a/eval/models/qwen_hf.py b/eval/models/qwen_hf.py
index 06bb814..6a2085d 100644
--- a/eval/models/qwen_hf.py
+++ b/eval/models/qwen_hf.py
@@ -35,19 +35,29 @@ def __init__(self, model_dir="Qwen/Qwen-VL-Chat-Int4", # "Qwen/Qwen-VL-Chat"
     def prepare_inputs(self, content, image_list=None, image_path=None):
         if image_list:
             match = re.findall("<img>.*?</img>", content)
+            content_prefix = ""
             if len(match) > 0:
-                for img_sub, image_path in zip(match, image_list):
-                    content = content.replace(img_sub, "")
-                    content = f"<img>{image_path}</img>\n" + content
+                if len(image_list) == 1:
+                    content = content.replace(match[0], "")
+                    content_prefix += f"<img>{image_list[0]}</img>\n"  # align with previous setting
+                else:
+                    for i, (img_sub, image_path) in enumerate(zip(match, image_list)):
+                        content = content.replace(img_sub, f"[IMAGE_{i+1}]")
+                        content_prefix += f"Picture {i+1}: <img>{image_path}</img>\n"
             elif len(image_list) > 0:
                 # This is the universal setting of parsing one-round dialogue questions.
                 # in `get_prompt` we cleared all img tokens in the question. However that's critically fatal in one-image question
                 # We need to add the image paths back!
-                for image_path in image_list:
-                    content = f"<img>{image_path}</img>\n" + content
+                if len(image_list) == 1:
+                    content_prefix += f"<img>{image_list[0]}</img>\n"  # align with our previous setting
+                else:
+                    for i, image_path in enumerate(image_list):
+                        content_prefix += f"Picture {i+1}: <img>{image_path}</img>\n"
+            content = content_prefix + content
         elif image_path:
             # The reason it literally works is that in multi-round dialogue questions we parse `image_path` and the information doesn't get lost!
             content = f"<img>{image_path}</img>\n" + content  # The surprising bug says that qwen read the images at the head of text inputs.
+        # print(content)  # debug only
         return content

     def generate_response(self, input):
diff --git a/eval/models/qwen_ms.py b/eval/models/qwen_ms.py
index 78d1873..849c2dc 100644
--- a/eval/models/qwen_ms.py
+++ b/eval/models/qwen_ms.py
@@ -38,21 +38,31 @@ def __init__(self, model_path="Qwen/Qwen-VL-Chat-Int4", # "Qwen/Qwen-VL-Chat"
     def prepare_inputs(self, content, image_list=None, image_path=None):
         if image_list:
             match = re.findall("<img>.*?</img>", content)
+            content_prefix = ""
             if len(match) > 0:
-                for img_sub, image_path in zip(match, image_list):
-                    content = content.replace(img_sub, "")
-                    content = f"<img>{image_path}</img>" + content
+                if len(image_list) == 1:
+                    content = content.replace(match[0], "")
+                    content_prefix += f"<img>{image_list[0]}</img>\n"  # align with previous setting
+                else:
+                    for i, (img_sub, image_path) in enumerate(zip(match, image_list)):
+                        content = content.replace(img_sub, f"[IMAGE_{i+1}]")
+                        content_prefix += f"Picture {i+1}: <img>{image_path}</img>\n"
             elif len(image_list) > 0:
                 # This is the universal setting of parsing one-round dialogue questions.
                 # in `get_prompt` we cleared all img tokens in the question. However that's critically fatal in one-image question
                 # We need to add the image paths back!
-                for image_path in image_list:
-                    content = f"<img>{image_path}</img>" + content
+                if len(image_list) == 1:
+                    content_prefix += f"<img>{image_list[0]}</img>\n"  # align with our previous setting
+                else:
+                    for i, image_path in enumerate(image_list):
+                        content_prefix += f"Picture {i+1}: <img>{image_path}</img>\n"
+            content = content_prefix + content
         elif image_path:
             # The reason it literally works is that in multi-round dialogue questions we parse `image_path` and the information doesn't get lost!
-            content = f"<img>{image_path}</img>" + content  # The surprising bug says that qwen read the images at the head of text inputs.
+            content = f"<img>{image_path}</img>\n" + content  # The surprising bug says that qwen read the images at the head of text inputs.
         return content
+
     def generate_response(self, input):
         if isinstance(input, dict):
             question = input
diff --git a/models/.gitkeep b/models/.gitkeep
deleted file mode 100644
index e69de29..0000000
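
Note (not part of the patch): a minimal standalone sketch of the prompt layout the new multi-image branch is expected to produce. The question text and image paths below are hypothetical, and the <img>...</img> wrapping is assumed from the usual Qwen-VL prompt convention.

    # sketch only -- mirrors the patched prepare_inputs branches for illustration
    def build_multi_image_prompt(question: str, image_list: list) -> str:
        """Prepend one image line per picture, as the patched code does."""
        prefix = ""
        if len(image_list) == 1:
            # single image: keep the original "<img>path</img>\n" + question layout
            prefix = f"<img>{image_list[0]}</img>\n"
        else:
            # multiple images: number them so the model can refer to "Picture N"
            for i, image_path in enumerate(image_list):
                prefix += f"Picture {i+1}: <img>{image_path}</img>\n"
        return prefix + question

    # hypothetical inputs, for illustration only
    print(build_multi_image_prompt(
        "Which of the two pictures shows a cat?",
        ["demo_images/cat.jpg", "demo_images/dog.jpg"],
    ))
    # Picture 1: <img>demo_images/cat.jpg</img>
    # Picture 2: <img>demo_images/dog.jpg</img>
    # Which of the two pictures shows a cat?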