diff --git a/eval/args.py b/eval/args.py
index 575b780..6f38c2e 100644
--- a/eval/args.py
+++ b/eval/args.py
@@ -63,6 +63,12 @@
         "executor": "llama2",
         "evaluator": "Llama2Evaluator",
     },
+    "dfm": {
+        "model_type": "local",
+        "support_input": [0, 1],
+        "executor": "dfm",
+        "evaluator": "DFMEvaluator",
+    },
 }
diff --git a/eval/eval.sh b/eval/eval.sh
index ffa275e..145f9eb 100644
--- a/eval/eval.sh
+++ b/eval/eval.sh
@@ -1,23 +1,58 @@
 #!/bin/bash
+export CUDA_VISIBLE_DEVICES=3,4,5,6,7
 python eval.py \
     --problem_file ../data/problem_v1.2.0_20231217.json \
+    --knowledge_file ../data/knowledge_v1.2.0_20231217.json \
     --questions_type 0,1,2,3 \
     --input_type 0 \
-    --model llama2 \
-    --model_dir ../models/dfm-2.0-13b \
-    --cuda_device cuda:5 \
-    --exp_name dfm-2.0-13b
+    --model dfm \
+    --model_dir ../models/dfm-2.0-70b \
+    --cuda_device auto
 
 python eval.py \
     --problem_file ../data/problem_v1.2.0_20231217.json \
-    --knowledge_file ../data/knowledge_v1.2.0_20231217.json \
+    --caption_file ../data/captions_v1.2.0_20231217.csv \
+    --questions_type 0,1,2,3 \
+    --input_type 1 \
+    --model dfm \
+    --model_dir ../models/dfm-2.0-70b \
+    --cuda_device auto
+
+python eval.py \
+    --problem_file ../data/problem_v1.2.0_20231217.json \
+    --caption_file ../data/ocr_v1.2.0_20231217.csv \
+    --questions_type 0,1,2,3 \
+    --input_type 1 \
+    --model dfm \
+    --model_dir ../models/dfm-2.0-70b \
+    --cuda_device auto
+
+export CUDA_VISIBLE_DEVICES=0
+python eval.py \
+    --problem_file ../data/problem_v1.2.0_20231217.json \
     --questions_type 0,1,2,3 \
     --input_type 0 \
-    --model llama2 \
+    --model dfm \
     --model_dir ../models/dfm-2.0-13b \
-    --cuda_device cuda:6 \
-    --exp_name dfm-2.0-13b
+    --cuda_device auto
 
+export CUDA_VISIBLE_DEVICES=1
+python eval.py \
+    --problem_file ../data/problem_v1.2.0_20231217.json \
+    --caption_file ../data/captions_v1.2.0_20231217.csv \
+    --questions_type 0,1,2,3 \
+    --input_type 1 \
+    --model dfm \
+    --model_dir ../models/dfm-2.0-13b \
+    --cuda_device auto
 
-python eval.py --checkpoint_dir ../results/dfm-2.0-13b_llama2_input_0_shot_0_kn_20240125_191329
\ No newline at end of file
+export CUDA_VISIBLE_DEVICES=2
+python eval.py \
+    --problem_file ../data/problem_v1.2.0_20231217.json \
+    --caption_file ../data/ocr_v1.2.0_20231217.csv \
+    --questions_type 0,1,2,3 \
+    --input_type 1 \
+    --model dfm \
+    --model_dir ../models/dfm-2.0-13b \
+    --cuda_device auto
diff --git a/eval/extract_response_on_hard.sh b/eval/extract_response_on_hard.sh
index 183cad0..11d8ba5 100755
--- a/eval/extract_response_on_hard.sh
+++ b/eval/extract_response_on_hard.sh
@@ -2,7 +2,7 @@
 <<<<<<< Updated upstream
 #folder_list="gpt-4-vision-preview_input_2_shot_0_20231221_101231/add_no_image qwen-vl_input_2_shot_0_it_20231225_083611 gemini-pro-vision_input_2_shot_0_20231224_004723/add_no_image moss_input_0_shot_0_20231226_070654 gemini-pro_input_0_shot_0_20231223_141324 gpt-3.5-turbo-0613_input_0_shot_0_20231220_143731 viscpm_input_2_shot_0_it_bi_20231228_194002 gpt-4-1106-preview_input_0_shot_0_20231220_214000 visualglm_input_2_shot_0_it_20231226_070315"
-folder_list="gpt-3.5-turbo-1106_input_0_shot_0_20240124_162631"
+folder_list="dfm_input_0_shot_0_20240130_161509"
 
 for folder in $folder_list; do
     python extract_response_on_hard.py -j ../data/selected_hard_list_v1.2.0_20231217.json -i $folder
diff --git a/eval/models/dfm_hf.py b/eval/models/dfm_hf.py
new file mode 100644
index 0000000..98e2c74
--- /dev/null
+++ b/eval/models/dfm_hf.py
@@ -0,0 +1,40 @@
+"""dfm-2.0 evaluator with HuggingFace Transformers"""
+
+from transformers import AutoTokenizer, AutoModelForCausalLM
+import transformers
+import torch
+import pdb
+
+class DFMEvaluator:
+    def __init__(self, model_dir="dfm-2.0-13b", max_tokens=200, device_map="cuda:0"):
+        self.model_dir = model_dir
+        self.sample_params = {
+            "max_new_tokens": max_tokens,
+            "do_sample": False,
+        }
+        self.device_map = device_map
+
+        self.model = AutoModelForCausalLM.from_pretrained(self.model_dir, device_map=device_map, torch_dtype=torch.float16, trust_remote_code=True).half().eval()
+        self.tokenizer = AutoTokenizer.from_pretrained(self.model_dir, trust_remote_code=True)
+
+        self.model.generation_config.__dict__.update(self.sample_params)
+
+    def prepare_inputs(self, content_sys, content):
+        content = f"<|system|>:{content_sys.strip()}\n<|user|>:{content.strip()}<|assistant|>:"
+        return content
+
+    def generate_response(self, question):
+        message = self.prepare_inputs(question["prompted_system_content"],question["prompted_content"])
+        inputs = self.tokenizer([message],add_special_tokens=False, return_tensors="pt")
+        pred = self.model.generate(input_ids=inputs.input_ids[0, :4096].cuda().unsqueeze(0), eos_token_id=self.tokenizer.eos_token_id, pad_token_id=self.tokenizer.eos_token_id, **self.sample_params, )
+        input_length = inputs.input_ids.size(1)
+        response = self.tokenizer.decode(pred[0][input_length:], skip_special_tokens=True).strip()
+        return response, message
+
+    def generate_answer(self, question):
+        response, message = self.generate_response(question)
+        question["input_message"] = message
+        question["prediction"] = response
+        question.pop("prompted_content")
+        question.pop("prompted_system_content")
+        return question
diff --git a/eval/models/llama2_hf.py b/eval/models/llama2_hf.py
index fe1271e..90c75fe 100644
--- a/eval/models/llama2_hf.py
+++ b/eval/models/llama2_hf.py
@@ -6,7 +6,7 @@
 import pdb
 
 class Llama2Evaluator:
-    def __init__(self, model_dir="/home/ubuntu/tools/llama2/llama-2-13b-chat-hf", max_tokens=200, device_map="auto"):
+    def __init__(self, model_dir="Llama-2-13b-chat-hf", max_tokens=200, device_map="auto"):
         self.model_dir = model_dir
         self.sample_params = {
             "max_new_tokens": max_tokens,
@@ -15,18 +15,18 @@ def __init__(self, model_dir="/home/ubuntu/tools/llama2/llama-2-13b-chat-hf", ma
         self.device_map = device_map
 
         self.model = AutoModelForCausalLM.from_pretrained(self.model_dir, device_map=device_map, torch_dtype=torch.float16, trust_remote_code=True).half().eval()
-        self.tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/llama-tokenizer", trust_remote_code=True)
+        self.tokenizer = AutoTokenizer.from_pretrained(self.model_dir, trust_remote_code=True)
 
         self.model.generation_config.__dict__.update(self.sample_params)
 
-    def prepare_inputs(self, content_sys,content):
-        content = f"[INST] <<SYS>> {content_sys} <</SYS>> \n\n {content} [/INST]"
+    def prepare_inputs(self, content_sys, content):
+        content = f"[INST] <<SYS>> {content_sys} <</SYS>> \n\n {content} [/INST]"
         return content
 
     def generate_response(self, question):
         message = self.prepare_inputs(question["prompted_system_content"],question["prompted_content"])
         inputs = self.tokenizer([message],add_special_tokens=False, return_tensors="pt")
-        pred = self.model.generate(input_ids=inputs.input_ids[0, :2048].cuda().unsqueeze(0), eos_token_id=self.tokenizer.eos_token_id, pad_token_id=self.tokenizer.eos_token_id, **self.sample_params, )
+        pred = self.model.generate(input_ids=inputs.input_ids[0, :4096].cuda().unsqueeze(0).to(self.device_map), eos_token_id=self.tokenizer.eos_token_id, pad_token_id=self.tokenizer.eos_token_id, **self.sample_params, )
         input_length = inputs.input_ids.size(1)
         response = self.tokenizer.decode(pred[0][input_length:], skip_special_tokens=True).strip()
         return response, message
diff --git a/eval/prompts.py b/eval/prompts.py
index b7db0b4..5ed5186 100644
--- a/eval/prompts.py
+++ b/eval/prompts.py
@@ -182,7 +182,7 @@ def get_prompt(question, args):
             prompted += fs_shot_guide_example[args.lang][question_type]
         prompted += fs_end_example[args.lang]
 
-    if args.model in ['gpt','gpt4v','llama2','gemini','geminivision']:
+    if args.model in ['gpt','gpt4v','llama2','gemini','geminivision','dfm']:
         prompted_question["prompted_system_content"] = prompted
         prompted = ""
         # TODO: Identify GPT in this way seems not so reasonable.
diff --git a/website/README_en.md b/website/README_en.md
index bed2e4c..6f9f747 100644
--- a/website/README_en.md
+++ b/website/README_en.md
@@ -5,7 +5,7 @@ MULTI: Multimodal Understanding Leaderboard with Text and Images
 
 ## Authors
 
-Zichen Zhu, Yang Xu, Lu Chen, Jingkai Yang, Yichuan Ma, Yimin Sun, Hailin Wen, Jiaqi Liu, Jinyu Cai, Yingzi Ma, Liangtai Sun, Zihan Zhao, Kai Yu
+Zichen Zhu, Yang Xu, Lu Chen, Jingkai Yang, Yichuan Ma, Yiming Sun, Hailin Wen, Jiaqi Liu, Jinyu Cai, Yingzi Ma, Liangtai Sun, Zihan Zhao, Kai Yu
 
 X-LANCE Lab, Department of Computer Science and Engineering
diff --git a/website/static/images/case_study.png b/website/static/images/case_study.png
deleted file mode 100644
index 9340eaf..0000000
Binary files a/website/static/images/case_study.png and /dev/null differ
diff --git a/website/static/images/case_study_wide.png b/website/static/images/case_study_wide.png
deleted file mode 100644
index 93dfd68..0000000
Binary files a/website/static/images/case_study_wide.png and /dev/null differ
diff --git a/website/static/images/data_anno.png b/website/static/images/data_anno.png
index 1d71402..ddac291 100644
Binary files a/website/static/images/data_anno.png and b/website/static/images/data_anno.png differ
diff --git a/website/static/images/data_aug.png b/website/static/images/data_aug.png
index 1a73c63..8833c68 100644
Binary files a/website/static/images/data_aug.png and b/website/static/images/data_aug.png differ
diff --git a/website/static/images/example.png b/website/static/images/example.png
new file mode 100644
index 0000000..bdf3c26
Binary files /dev/null and b/website/static/images/example.png differ
diff --git a/website/static/images/examples.png b/website/static/images/examples.png
new file mode 100644
index 0000000..4ba17f5
Binary files /dev/null and b/website/static/images/examples.png differ
diff --git a/website/static/images/overview.png b/website/static/images/overview.png
index 076216c..bd865b2 100644
Binary files a/website/static/images/overview.png and b/website/static/images/overview.png differ
diff --git a/website/static/images/platform.png b/website/static/images/platform.png
index 904779c..1a1410c 100644
Binary files a/website/static/images/platform.png and b/website/static/images/platform.png differ
diff --git a/website/static/images/prompt.png b/website/static/images/prompt.png
deleted file mode 100644
index 633a867..0000000
Binary files a/website/static/images/prompt.png and /dev/null differ
diff --git a/website/static/images/prompts_all.png b/website/static/images/prompts_all.png
new file mode 100644
index 0000000..aae1d81
Binary files /dev/null and b/website/static/images/prompts_all.png differ
diff --git a/website/static/pdfs/MULTI_Benchmark_v1.0.pdf b/website/static/pdfs/MULTI_Benchmark_v1.0.pdf
new file mode 100644
index 0000000..f1f3b33
Binary files /dev/null and b/website/static/pdfs/MULTI_Benchmark_v1.0.pdf differ
diff --git a/website/static/pdfs/license_agreement.txt b/website/static/pdfs/license_agreement.txt
deleted file mode 100644
index e316efc..0000000
--- a/website/static/pdfs/license_agreement.txt
+++ /dev/null
@@ -1,9 +0,0 @@
-**VisIT-Bench Dataset License Agreement**
-
-The VisIT-Bench Dataset is licensed under [CC BY 4.0](https://creativecommons.org/licenses/by/4.0/). Alongside this license, the following conditions apply:
-
-1. **Purpose:** The dataset was primarily designed for use as a test set.
-
-2. **Commercial Use:** Commercially, the dataset may be used as a test set, but it's prohibited to use it as a training set.
-
-By accessing or using this dataset, you acknowledge and agree to abide by these terms in conjunction with the CC BY 4.0 license.
diff --git a/website/static/pdfs/visit_bench_paper.pdf b/website/static/pdfs/visit_bench_paper.pdf
deleted file mode 100644
index 82d33f9..0000000
Binary files a/website/static/pdfs/visit_bench_paper.pdf and /dev/null differ
diff --git a/website/static/videos/banner_video.mp4 b/website/static/videos/banner_video.mp4
deleted file mode 100644
index 68eb69b..0000000
Binary files a/website/static/videos/banner_video.mp4 and /dev/null differ
diff --git a/website/static/videos/carousel1.mp4 b/website/static/videos/carousel1.mp4
deleted file mode 100644
index 3dc429a..0000000
Binary files a/website/static/videos/carousel1.mp4 and /dev/null differ
diff --git a/website/static/videos/carousel2.mp4 b/website/static/videos/carousel2.mp4
deleted file mode 100644
index f47c06c..0000000
Binary files a/website/static/videos/carousel2.mp4 and /dev/null differ
diff --git a/website/static/videos/carousel3.mp4 b/website/static/videos/carousel3.mp4
deleted file mode 100644
index 698e9f9..0000000
Binary files a/website/static/videos/carousel3.mp4 and /dev/null differ