diff --git a/eval/model_tester.py b/eval/model_tester.py index 0bbb626..cb61083 100644 --- a/eval/model_tester.py +++ b/eval/model_tester.py @@ -106,7 +106,7 @@ def parse_args(): }] testcases_for_chinese_puretext = testcases_for_chinese[:3] -testcases_for_chinese_pic_descrip = testcases_for_chinese[-5:-1] +testcases_for_chinese_pic_descrip = testcases_for_chinese[-5:] from eval import get_evaluator @@ -126,9 +126,9 @@ def parse_args(): # if test_args.input_type == 0: # testcases = testcases_for_chinese # else: - testcases = testcases_for_chinese - # testcases = testcases_for_chinese_pic_descrip[-1:] - testcases = testcases_for_chinese[-1:] + # testcases = testcases_for_chinese + testcases = testcases_for_chinese_pic_descrip + # testcases = testcases_for_chinese[-1:] testcases = dict(zip(range(len(testcases)), testcases)) for i in testcases: diff --git a/eval/models/qwen_hf.py b/eval/models/qwen_hf.py index 06bb814..6a2085d 100644 --- a/eval/models/qwen_hf.py +++ b/eval/models/qwen_hf.py @@ -35,19 +35,29 @@ def __init__(self, model_dir="Qwen/Qwen-VL-Chat-Int4", # "Qwen/Qwen-VL-Chat" def prepare_inputs(self, content, image_list=None, image_path=None): if image_list: match = re.findall("", content) + content_prefix = "" if len(match) > 0: - for img_sub, image_path in zip(match, image_list): - content = content.replace(img_sub, "") - content = f"{image_path}\n" + content + if len(image_list) == 1: + content = content.replace(match[0], "") + content_prefix += f"{image_list[0]}\n" # align with previous setting + else: + for i, (img_sub, image_path) in enumerate(zip(match, image_list)): + content = content.replace(img_sub, f"[IMAGE_{i+1}]") + content_prefix += f"Picture {i+1}: {image_path}\n" elif len(image_list) > 0: # This is the universal setting of parsing one-round dialogue questions. # in `get_prompt` we cleared all img tokens in the question. However that's critically fatal in one-image question # We need to add the image paths back! - for image_path in image_list: - content = f"{image_path}\n" + content + if len(image_list) == 1: + content_prefix += f"{image_list[0]}\n" # align with our previous setting + else: + for i, image_path in enumerate(image_list): + content_prefix += f"Picture {i+1}: {image_path}\n" + content = content_prefix + content elif image_path: # The reason it literally works is that in multi-round dialogue questions we parse `image_path` and the information doesn't get lost! content = f"{image_path}\n" + content # The surprising bug says that qwen read the images at the head of text inputs. + # print(content) # debug only return content def generate_response(self, input): diff --git a/eval/models/qwen_ms.py b/eval/models/qwen_ms.py index 78d1873..849c2dc 100644 --- a/eval/models/qwen_ms.py +++ b/eval/models/qwen_ms.py @@ -38,21 +38,31 @@ def __init__(self, model_path="Qwen/Qwen-VL-Chat-Int4", # "Qwen/Qwen-VL-Chat" def prepare_inputs(self, content, image_list=None, image_path=None): if image_list: match = re.findall("", content) + content_prefix = "" if len(match) > 0: - for img_sub, image_path in zip(match, image_list): - content = content.replace(img_sub, "") - content = f"{image_path}" + content + if len(image_list) == 1: + content = content.replace(match[0], "") + content_prefix += f"{image_list[0]}\n" # align with previous setting + else: + for i, (img_sub, image_path) in enumerate(zip(match, image_list)): + content = content.replace(img_sub, f"[IMAGE_{i+1}]") + content_prefix += f"Picture {i+1}: {image_path}\n" elif len(image_list) > 0: # This is the universal setting of parsing one-round dialogue questions. # in `get_prompt` we cleared all img tokens in the question. However that's critically fatal in one-image question # We need to add the image paths back! - for image_path in image_list: - content = f"{image_path}" + content + if len(image_list) == 1: + content_prefix += f"{image_list[0]}\n" # align with our previous setting + else: + for i, image_path in enumerate(image_list): + content_prefix += f"Picture {i+1}: {image_path}\n" + content = content_prefix + content elif image_path: # The reason it literally works is that in multi-round dialogue questions we parse `image_path` and the information doesn't get lost! - content = f"{image_path}" + content # The surprising bug says that qwen read the images at the head of text inputs. + content = f"{image_path}\n" + content # The surprising bug says that qwen read the images at the head of text inputs. return content + def generate_response(self, input): if isinstance(input, dict): question = input diff --git a/models/.gitkeep b/models/.gitkeep deleted file mode 100644 index e69de29..0000000