From f3ab9b9a85f8895f9a26abd24237ccca7ca49b17 Mon Sep 17 00:00:00 2001
From: DoraDong-2023
Date: Sun, 21 Jan 2024 22:40:06 -0500
Subject: [PATCH] enhance gpt response compatibility, fix gpu index selection, fix bert loading,

---
 docs/PyPI2APP.md                              |  8 ++----
 src/dataloader/check_valid_API_annotate.py    | 15 +++++++++++
 src/dataloader/preprocess_retriever_data.py   | 22 ++++++++++------
 src/deploy/inference_dialog_server.py         | 14 +++++++---
 src/inference/retriever_finetune_inference.py | 26 ++++++++++++++++---
 src/models/train_retriever.py                 | 12 ++++-----
 6 files changed, 70 insertions(+), 27 deletions(-)

diff --git a/docs/PyPI2APP.md b/docs/PyPI2APP.md
index 5dcca38..b600269 100644
--- a/docs/PyPI2APP.md
+++ b/docs/PyPI2APP.md
@@ -114,7 +114,6 @@ python inference/retriever_bm25_inference.py --LIB ${LIB} --top_k 3
 
 7. Fine-tune the retriever. You can finetune the retriever based on the [bert-base-uncased](https://huggingface.co/bert-base-uncased) model
 ```bash
-CUDA_VISIBLE_DEVICES=0 # if you use gpu
 mkdir ./hugging_models/retriever_model_finetuned/${LIB}
 python models/train_retriever.py \
     --data_path ./data/standard_process/${LIB}/retriever_train_data/ \
@@ -126,7 +125,8 @@ python models/train_retriever.py \
     --warmup_steps 500 \
     --max_seq_length 256 \
     --optimize_top_k 3 \
-    --plot_dir ./plot/${LIB}/retriever/
+    --plot_dir ./plot/${LIB}/retriever/ \
+    --gpu "1"
 ```
 
 You can check the training performance curve under `./src/plot/${LIB}/` to determine whether you need more number of epochs.
@@ -134,7 +134,6 @@ You can check the training performance curve under `./src/plot/${LIB}/` to deter
 8. Test the inference performance using:
 ```bash
 export HUGGINGPATH=./hugging_models
-CUDA_VISIBLE_DEVICES=0 # if you use gpu
 python inference/retriever_finetune_inference.py \
     --retrieval_model_path ./hugging_models/retriever_model_finetuned/${LIB}/assigned \
     --max_seq_length 256 \
@@ -161,7 +160,6 @@ Please refer to [lit-llama](https://github.com/Lightning-AI/lit-llama) for getti
 
 process data:
 ```bash
-CUDA_VISIBLE_DEVICES=0
 export TOKENIZERS_PARALLELISM=true
 python models/data_classification.py \
     --pretrained_path ./hugging_models/llama-2-finetuned/checkpoints/lite-llama2/lit-llama.pth \
@@ -184,7 +182,6 @@ python models/data_classification.py \
 
 Then, finetune model:
 ```bash
-CUDA_VISIBLE_DEVICES=0 \
 python models/train_classification.py \
     --data_dir ./data/standard_process/${LIB}/classification_train/ \
     --out_dir ./hugging_models/llama-2-finetuned/${LIB}/finetuned/ \
@@ -195,7 +192,6 @@ python models/train_classification.py \
 
 Finally, check the performance:
 ```bash
-CUDA_VISIBLE_DEVICES=0 \
 python models/inference_classification.py \
     --data_dir ./data/standard_process/${LIB}/classification_train/ \
     --checkpoint_dir ./hugging_models/llama-2-finetuned/${LIB}/finetuned/combined_model_checkpoint.pth \
diff --git a/src/dataloader/check_valid_API_annotate.py b/src/dataloader/check_valid_API_annotate.py
index 47cc01c..08e5c72 100644
--- a/src/dataloader/check_valid_API_annotate.py
+++ b/src/dataloader/check_valid_API_annotate.py
@@ -54,6 +54,19 @@ def check_all_queries_unique(annotated_data):
     else:
         print("All queries are unique.")
 
+def check_api_presence_in_inquiry(composite_data, inquiry_data):
+    """
+    Check if all APIs in the composite dataset are present in the inquiry dataset.
+    """
+    composite_apis = set(item for item in composite_data)
+    inquiry_apis = set(item['api_calling'][0].split('(')[0] for item in inquiry_data)
+    print(f'length of composite/inquiry is {len(composite_apis)}, {len(inquiry_apis)}')
+    missing_apis = composite_apis - inquiry_apis
+    if missing_apis:
+        print(f"Missing APIs in inquiry dataset: {missing_apis}")
+    else:
+        print("All APIs in composite dataset are present in inquiry dataset.")
+
 def main():
     parser = argparse.ArgumentParser(description="Check data integrity for training and testing datasets.")
     parser.add_argument("lib", type=str, help="Library name for the JSON data.")
@@ -61,6 +74,7 @@ def main():
 
     inquiry_data = load_data(f'./data/standard_process/{args.lib}/API_inquiry.json')
     annotated_data = load_data(f'./data/standard_process/{args.lib}/API_inquiry_annotate.json')
+    composite_data = load_data(f'./data/standard_process/{args.lib}/API_composite.json')
 
     train_data, test_data = get_training_and_test_sets(inquiry_data, annotated_data)
 
@@ -69,6 +83,7 @@ def main():
     check_for_query_text_overlap(train_data, test_data)
     print("All checks passed successfully.")
     check_all_queries_unique(annotated_data)
+    check_api_presence_in_inquiry(composite_data, inquiry_data)
 
 if __name__ == "__main__":
     main()
diff --git a/src/dataloader/preprocess_retriever_data.py b/src/dataloader/preprocess_retriever_data.py
index a5f9f2f..b5fde28 100644
--- a/src/dataloader/preprocess_retriever_data.py
+++ b/src/dataloader/preprocess_retriever_data.py
@@ -23,13 +23,19 @@ prompt_oneapi_whole = f"{Task_Description_of_Singletool_oneapi_Instructions_whole}\n{Other_Requirements_singletool_oneapi_whole}"
 
 def unify_response_format(response):
-    list_pattern = re.compile(r'\[\{.*?\}\]', re.DOTALL)
-    matched_lists = list_pattern.findall(response)
-    unified_response = []
-    for single_response in matched_lists:
-        response_list = ast.literal_eval(single_response)
-        unified_response.extend(response_list)
-    return unified_response
+    try:
+        return json.loads(response)
+    except json.JSONDecodeError:
+        list_pattern = re.compile(r'\[\{.*?\}\]', re.DOTALL)
+        matched_lists = list_pattern.findall(response)
+        unified_response = []
+        for single_response in matched_lists:
+            try:
+                response_list = ast.literal_eval(single_response)
+                unified_response.extend(response_list)
+            except (ValueError, SyntaxError):
+                pass
+        return unified_response
 
 async def async_LLM_response(llm, tokenizer, prompt, history=[], kwargs={}):
     loop = asyncio.get_event_loop()
@@ -53,7 +59,7 @@ async def process_prompt_async(api_name, api, llm, tokenizer, prompt_template, p
         except:
             pass
         retry_count += 1
-    #print('GPT response:', response)
+    print('GPT response:', response)
     if not valid_response:
         return []
     results = []
diff --git a/src/deploy/inference_dialog_server.py b/src/deploy/inference_dialog_server.py
index b4f1108..a67027f 100644
--- a/src/deploy/inference_dialog_server.py
+++ b/src/deploy/inference_dialog_server.py
@@ -909,9 +909,17 @@ def run_pipeline_after_doublechecking_API_selection(self, user_input):
                 response, _ = LLM_response(self.llm, self.tokenizer, parameters_prompt, history=[], kwargs={})
                 logging.info(f'==>Asking GPT: %s, ==>GPT response: %s', parameters_prompt, response)
                 returned_content_str_new = response.replace('null', 'None').replace('None', '"None"')
-                returned_content = ast.literal_eval(returned_content_str_new)
-                success = True
-                break
+                try:
+                    returned_content = ast.literal_eval(returned_content_str_new)
+                    success = True
+                    break
+                except:
+                    try:
+                        returned_content = json.loads(returned_content_str_new)
+                        success = True
+                        break
+                    except:
+                        pass
             except Exception as e:
                 pass
                 #return # 231130 fix
diff --git a/src/inference/retriever_finetune_inference.py b/src/inference/retriever_finetune_inference.py
index e920d8c..a6c3f23 100644
--- a/src/inference/retriever_finetune_inference.py
+++ b/src/inference/retriever_finetune_inference.py
@@ -3,7 +3,7 @@
 from tqdm import tqdm
 import pandas as pd
 from configs.model_config import HUGGINGPATH
-from sentence_transformers import SentenceTransformer, util
+from sentence_transformers import SentenceTransformer, util, models, InputExample, losses, LoggingHandler
 from inference.utils import process_retrieval_document_query_version, compress_api_str_from_list_query_version
 import torch
 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
@@ -47,7 +47,16 @@ def build_retrieval_corpus(self, corpus_tsv_path):
         corpus_ids = list(corpus.keys())
         corpus = [corpus[cid] for cid in corpus_ids]
         self.corpus = corpus
-        self.embedder = SentenceTransformer(self.model_path, device=device)
+        print(f'modelpath: {self.model_path}')
+        if self.model_path=='bert-base-uncased':
+            print('using unpretrained model!!!')
+            word_embedding_model = models.Transformer(self.model_path, max_seq_length=args.max_seq_length)
+            pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
+            self.embedder = SentenceTransformer(modules=[word_embedding_model, pooling_model])
+        elif 'hugging_models' in self.model_path:
+            self.embedder = SentenceTransformer(self.model_path, device=device)
+        else:
+            raise ValueError
         self.corpus_embeddings = self.embedder.encode(self.corpus, convert_to_tensor=True)
     def build_and_merge_corpus(self, add_base=True): # based on build_retrieval_corpus, add API_base.json, fix 231227
@@ -62,7 +71,17 @@ def build_and_merge_corpus(self, add_base=True):
         corpus_ids = list(corpus.keys())
         corpus = [corpus[cid] for cid in corpus_ids]
         self.corpus = corpus
-        self.embedder = SentenceTransformer(self.model_path, device=device)
+        if self.model_path=='bert-base-uncased':
+            print('using unpretrained model!!!')
+            word_embedding_model = models.Transformer(self.model_path, max_seq_length=args.max_seq_length)
+            pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
+            self.embedder = SentenceTransformer(modules=[word_embedding_model, pooling_model])
+        elif 'hugging_models/' in self.model_path:
+            print('using pretrained model!!!')
+            self.embedder = SentenceTransformer(self.model_path, device=device)
+        else:
+            raise ValueError
+        #self.embedder = SentenceTransformer(self.model_path, device=device)
         self.corpus_embeddings = self.embedder.encode(self.corpus, convert_to_tensor=True)
     def retrieving(self, query, top_k):
         query_embedding = self.embedder.encode(query, convert_to_tensor=True)
@@ -129,6 +148,7 @@ def compute_accuracy(retriever, data, args,name='train'):
     parser.add_argument('--input_query_file', type=str, required=True, help='input path')
     parser.add_argument('--idx_file', type=str, required=True, help='idx path')
     parser.add_argument('--LIB', type=str, required=True, help='lib')
+    parser.add_argument("--max_seq_length", default=256, type=int, required=True,help="Max sequence length.")
     args = parser.parse_args()
 
     # Step 1: Load API data from the JSON file
diff --git a/src/models/train_retriever.py b/src/models/train_retriever.py
index 9477a56..218601a 100644
--- a/src/models/train_retriever.py
+++ b/src/models/train_retriever.py
@@ -1,15 +1,11 @@
-
-import logging
-import os
-import json
+import logging, os, json
 import pandas as pd
 from datetime import datetime
 import torch
-#torch.cuda.set_per_process_memory_fraction(0.5)
 import torch.nn as nn
-from sentence_transformers import SentenceTransformer, models, InputExample, losses, LoggingHandler
 from torch.utils.data import DataLoader
-from torch.utils.tensorboard import SummaryWriter
+#torch.cuda.set_per_process_memory_fraction(0.5)
+from sentence_transformers import SentenceTransformer, models, InputExample, losses, LoggingHandler
 from models.api_evaluator import APIEvaluator
 import argparse
 import os
@@ -57,8 +53,10 @@ def main():
     parser.add_argument("--max_seq_length", default=256, type=int, required=True,help="Max sequence length.")
     parser.add_argument("--optimize_top_k", default=3, type=int, required=True,help="The metric which to save best model")
     parser.add_argument("--plot_dir", default="./plot/retriever/", type=str, required=True,help="plot dir for saving")
+    parser.add_argument("--gpu", type=str, default="0", help="GPU to use")
     args = parser.parse_args()
 
+    torch.cuda.set_device(int(args.gpu))
     torch.manual_seed(42)
     torch.cuda.manual_seed(42)
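
The `unify_response_format` change in `src/dataloader/preprocess_retriever_data.py` turns GPT-output parsing into a two-stage process: strict `json.loads` first, then a regex plus `ast.literal_eval` fallback that skips fragments which still fail to parse. Below is a minimal standalone sketch of that fallback behaviour; the function name and the sample string are illustrative only, not part of the repository.

```python
import ast
import json
import re

def parse_llm_list_response(response: str):
    """Sketch of the two-stage parsing this patch introduces (hypothetical name)."""
    try:
        # Stage 1: well-formed JSON parses directly.
        return json.loads(response)
    except json.JSONDecodeError:
        # Stage 2: pull out every bracketed list of dicts and parse each one
        # as a Python literal, ignoring fragments that cannot be parsed.
        unified = []
        for fragment in re.findall(r'\[\{.*?\}\]', response, re.DOTALL):
            try:
                unified.extend(ast.literal_eval(fragment))
            except (ValueError, SyntaxError):
                pass
        return unified

# Single quotes break json.loads, but the literal_eval fallback still recovers the list.
print(parse_llm_list_response("Sure! [{'instruction': 'Load the data', 'api': 'sc.read'}]"))
```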
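In `retriever_finetune_inference.py`, the new branch loads a raw `bert-base-uncased` checkpoint differently from a fine-tuned one: plain BERT has no sentence-transformers configuration, so it is wrapped in `models.Transformer` plus `models.Pooling` before use, while checkpoints under `hugging_models/` are loaded directly. The sketch below restates that branch as a standalone helper; the `load_retriever` name and its defaults are assumptions for illustration, not repository code.

```python
import torch
from sentence_transformers import SentenceTransformer, models

device = 'cuda' if torch.cuda.is_available() else 'cpu'

def load_retriever(model_path: str, max_seq_length: int = 256) -> SentenceTransformer:
    """Hypothetical helper mirroring the branch added to build_retrieval_corpus."""
    if model_path == 'bert-base-uncased':
        # Raw BERT: assemble the sentence encoder manually (Transformer backbone
        # + mean pooling), since there is no sentence-transformers config to load.
        word_embedding_model = models.Transformer(model_path, max_seq_length=max_seq_length)
        pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
        return SentenceTransformer(modules=[word_embedding_model, pooling_model], device=device)
    elif 'hugging_models' in model_path:
        # Fine-tuned checkpoint saved by train_retriever.py: load it as-is.
        return SentenceTransformer(model_path, device=device)
    raise ValueError(f'Unrecognized retriever model path: {model_path}')

# embedder = load_retriever('bert-base-uncased', max_seq_length=256)
# corpus_embeddings = embedder.encode(['example API description'], convert_to_tensor=True)
```

On the training side, the new `--gpu` argument feeds `torch.cuda.set_device(int(args.gpu))`, which is why the `CUDA_VISIBLE_DEVICES=0` lines were dropped from the documented commands.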