enhance gpt response compatibility, fix gpu index selection, fix bert loading

DoraDong-2023 committed Jan 22, 2024
1 parent 116a223 commit f3ab9b9
Showing 6 changed files with 70 additions and 27 deletions.
8 changes: 2 additions & 6 deletions docs/PyPI2APP.md
@@ -114,7 +114,6 @@ python inference/retriever_bm25_inference.py --LIB ${LIB} --top_k 3
7. Fine-tune the retriever.
You can finetune the retriever based on the [bert-base-uncased](https://huggingface.co/bert-base-uncased) model
```bash
- CUDA_VISIBLE_DEVICES=0 # if you use gpu
mkdir ./hugging_models/retriever_model_finetuned/${LIB}
python models/train_retriever.py \
--data_path ./data/standard_process/${LIB}/retriever_train_data/ \
@@ -126,15 +125,15 @@ python models/train_retriever.py \
--warmup_steps 500 \
--max_seq_length 256 \
--optimize_top_k 3 \
- --plot_dir ./plot/${LIB}/retriever/
+ --plot_dir ./plot/${LIB}/retriever/ \
+ --gpu "1"
```

You can check the training performance curve under `./src/plot/${LIB}/` to determine whether you need more epochs.

8. Test the inference performance using:
```bash
export HUGGINGPATH=./hugging_models
- CUDA_VISIBLE_DEVICES=0 # if you use gpu
python inference/retriever_finetune_inference.py \
--retrieval_model_path ./hugging_models/retriever_model_finetuned/${LIB}/assigned \
--max_seq_length 256 \
@@ -161,7 +160,6 @@ Please refer to [lit-llama](https://github.com/Lightning-AI/lit-llama) for getti

process data:
```bash
- CUDA_VISIBLE_DEVICES=0
export TOKENIZERS_PARALLELISM=true
python models/data_classification.py \
--pretrained_path ./hugging_models/llama-2-finetuned/checkpoints/lite-llama2/lit-llama.pth \
@@ -184,7 +182,6 @@ python models/data_classification.py \

Then, finetune model:
```bash
- CUDA_VISIBLE_DEVICES=0 \
python models/train_classification.py \
--data_dir ./data/standard_process/${LIB}/classification_train/ \
--out_dir ./hugging_models/llama-2-finetuned/${LIB}/finetuned/ \
@@ -195,7 +192,6 @@ python models/train_classification.py \

Finally, check the performance:
```bash
- CUDA_VISIBLE_DEVICES=0 \
python models/inference_classification.py \
--data_dir ./data/standard_process/${LIB}/classification_train/ \
--checkpoint_dir ./hugging_models/llama-2-finetuned/${LIB}/finetuned/combined_model_checkpoint.pth \
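Throughout these doc snippets, the hard-coded `CUDA_VISIBLE_DEVICES=0` lines are dropped in favor of an explicit `--gpu` flag (wired up in `train_retriever.py` below). The two mechanisms are not interchangeable; a minimal sketch of the distinction, with example values only:

```python
import os

# CUDA_VISIBLE_DEVICES filters and *renumbers* GPUs before CUDA initializes:
# with "1", physical GPU 1 is the only visible device and appears as cuda:0.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"  # must be set before torch touches CUDA

import torch

if torch.cuda.is_available():
    # torch.cuda.set_device indexes the *visible* devices, so 0 here means
    # "the first GPU this process can see", i.e. physical GPU 1.
    torch.cuda.set_device(0)
    print(torch.cuda.current_device())  # -> 0
```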
15 changes: 15 additions & 0 deletions src/dataloader/check_valid_API_annotate.py
@@ -54,13 +54,27 @@ def check_all_queries_unique(annotated_data):
else:
print("All queries are unique.")

+ def check_api_presence_in_inquiry(composite_data, inquiry_data):
+     """
+     Check if all APIs in the composite dataset are present in the inquiry dataset.
+     """
+     composite_apis = set(item for item in composite_data)
+     inquiry_apis = set(item['api_calling'][0].split('(')[0] for item in inquiry_data)
+     print(f'length of composite/inquiry is {len(composite_apis)}, {len(inquiry_apis)}')
+     missing_apis = composite_apis - inquiry_apis
+     if missing_apis:
+         print(f"Missing APIs in inquiry dataset: {missing_apis}")
+     else:
+         print("All APIs in composite dataset are present in inquiry dataset.")

def main():
parser = argparse.ArgumentParser(description="Check data integrity for training and testing datasets.")
parser.add_argument("lib", type=str, help="Library name for the JSON data.")
args = parser.parse_args()

inquiry_data = load_data(f'./data/standard_process/{args.lib}/API_inquiry.json')
annotated_data = load_data(f'./data/standard_process/{args.lib}/API_inquiry_annotate.json')
+     composite_data = load_data(f'./data/standard_process/{args.lib}/API_composite.json')

train_data, test_data = get_training_and_test_sets(inquiry_data, annotated_data)

Expand All @@ -69,6 +83,7 @@ def main():
check_for_query_text_overlap(train_data, test_data)
print("All checks passed successfully.")
check_all_queries_unique(annotated_data)
+     check_api_presence_in_inquiry(composite_data, inquiry_data)

if __name__ == "__main__":
main()
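The new `check_api_presence_in_inquiry` guard catches APIs that exist in `API_composite.json` but never appear in any inquiry. A toy illustration of the set logic; the API names and record shapes below are made up for the example:

```python
# Hypothetical records mimicking API_composite.json (dict keyed by API name)
# and API_inquiry.json (list of entries carrying an "api_calling" list).
composite_data = {"scanpy.pp.filter_cells": {}, "scanpy.pp.log1p": {}}
inquiry_data = [{"api_calling": ["scanpy.pp.filter_cells(data, min_genes=200)"]}]

composite_apis = set(composite_data)  # iterating a dict yields its keys
inquiry_apis = {item["api_calling"][0].split("(")[0] for item in inquiry_data}

print(composite_apis - inquiry_apis)  # -> {'scanpy.pp.log1p'}: in composite, never asked about
```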
22 changes: 14 additions & 8 deletions src/dataloader/preprocess_retriever_data.py
@@ -23,13 +23,19 @@
prompt_oneapi_whole = f"{Task_Description_of_Singletool_oneapi_Instructions_whole}\n{Other_Requirements_singletool_oneapi_whole}"

def unify_response_format(response):
-     list_pattern = re.compile(r'\[\{.*?\}\]', re.DOTALL)
-     matched_lists = list_pattern.findall(response)
-     unified_response = []
-     for single_response in matched_lists:
-         response_list = ast.literal_eval(single_response)
-         unified_response.extend(response_list)
-     return unified_response
+     try:
+         return json.loads(response)
+     except json.JSONDecodeError:
+         list_pattern = re.compile(r'\[\{.*?\}\]', re.DOTALL)
+         matched_lists = list_pattern.findall(response)
+         unified_response = []
+         for single_response in matched_lists:
+             try:
+                 response_list = ast.literal_eval(single_response)
+                 unified_response.extend(response_list)
+             except (ValueError, SyntaxError):
+                 pass
+         return unified_response

async def async_LLM_response(llm, tokenizer, prompt, history=[], kwargs={}):
loop = asyncio.get_event_loop()
@@ -53,7 +59,7 @@ async def process_prompt_async(api_name, api, llm, tokenizer, prompt_template, p
except:
pass
retry_count += 1
- #print('GPT response:', response)
+ print('GPT response:', response)
if not valid_response:
return []
results = []
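`unify_response_format` now prefers strict JSON and only falls back to regex-plus-`ast.literal_eval` scavenging, skipping fragments that fail to parse instead of crashing. A self-contained sketch of that control flow (a minimal re-statement under the same assumptions, not the module's exact code):

```python
import ast
import json
import re

def parse_llm_list(response):
    """JSON first; otherwise scan for embedded [{...}] blocks and merge them."""
    try:
        return json.loads(response)
    except json.JSONDecodeError:
        out = []
        for chunk in re.findall(r'\[\{.*?\}\]', response, re.DOTALL):
            try:
                out.extend(ast.literal_eval(chunk))
            except (ValueError, SyntaxError):
                pass  # ignore fragments that are neither JSON nor Python literals
        return out

print(parse_llm_list('[{"Query": "Filter cells"}]'))             # valid-JSON path
print(parse_llm_list("Sure! [{'Query': 'Log-normalize data'}]"))  # regex + literal_eval path
```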
14 changes: 11 additions & 3 deletions src/deploy/inference_dialog_server.py
@@ -909,9 +909,17 @@ def run_pipeline_after_doublechecking_API_selection(self, user_input):
response, _ = LLM_response(self.llm, self.tokenizer, parameters_prompt, history=[], kwargs={})
logging.info(f'==>Asking GPT: %s, ==>GPT response: %s', parameters_prompt, response)
returned_content_str_new = response.replace('null', 'None').replace('None', '"None"')
- returned_content = ast.literal_eval(returned_content_str_new)
- success = True
- break
+ try:
+     returned_content = ast.literal_eval(returned_content_str_new)
+     success = True
+     break
+ except:
+     try:
+         returned_content = json.loads(returned_content_str_new)
+         success = True
+         break
+     except:
+         pass
except Exception as e:
pass
#return # 231130 fix
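The server now tolerates GPT replies in either Python-literal or JSON form when extracting the predicted parameter dict. A hedged sketch of the two-stage parse; the function name and inputs are illustrative, not the server's code:

```python
import ast
import json

def parse_gpt_dict(text):
    """Normalize null/None, then try Python-literal parsing before JSON."""
    text = text.replace('null', 'None').replace('None', '"None"')
    try:
        return ast.literal_eval(text)
    except (ValueError, SyntaxError):
        return json.loads(text)  # last resort; raises if neither format fits

print(parse_gpt_dict("{'min_genes': 200, 'copy': None}"))  # parsed by ast.literal_eval
print(parse_gpt_dict('{"inplace": false, "copy": true}'))  # ast rejects true/false; json.loads succeeds
```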
26 changes: 23 additions & 3 deletions src/inference/retriever_finetune_inference.py
@@ -3,7 +3,7 @@
from tqdm import tqdm
import pandas as pd
from configs.model_config import HUGGINGPATH
- from sentence_transformers import SentenceTransformer, util
+ from sentence_transformers import SentenceTransformer, util, models, InputExample, losses, LoggingHandler
from inference.utils import process_retrieval_document_query_version, compress_api_str_from_list_query_version
import torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
@@ -47,7 +47,16 @@ def build_retrieval_corpus(self, corpus_tsv_path):
corpus_ids = list(corpus.keys())
corpus = [corpus[cid] for cid in corpus_ids]
self.corpus = corpus
- self.embedder = SentenceTransformer(self.model_path, device=device)
+ print(f'modelpath: {self.model_path}')
+ if self.model_path=='bert-base-uncased':
+     print('using unpretrained model!!!')
+     word_embedding_model = models.Transformer(self.model_path, max_seq_length=args.max_seq_length)
+     pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
+     self.embedder = SentenceTransformer(modules=[word_embedding_model, pooling_model])
+ elif 'hugging_models' in self.model_path:
+     self.embedder = SentenceTransformer(self.model_path, device=device)
+ else:
+     raise ValueError
self.corpus_embeddings = self.embedder.encode(self.corpus, convert_to_tensor=True)
def build_and_merge_corpus(self, add_base=True):
# based on build_retrieval_corpus, add API_base.json, fix 231227
@@ -62,7 +71,17 @@ def build_and_merge_corpus(self, add_base=True):
corpus_ids = list(corpus.keys())
corpus = [corpus[cid] for cid in corpus_ids]
self.corpus = corpus
- self.embedder = SentenceTransformer(self.model_path, device=device)
+ if self.model_path=='bert-base-uncased':
+     print('using unpretrained model!!!')
+     word_embedding_model = models.Transformer(self.model_path, max_seq_length=args.max_seq_length)
+     pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
+     self.embedder = SentenceTransformer(modules=[word_embedding_model, pooling_model])
+ elif 'hugging_models/' in self.model_path:
+     print('using pretrained model!!!')
+     self.embedder = SentenceTransformer(self.model_path, device=device)
+ else:
+     raise ValueError
+ #self.embedder = SentenceTransformer(self.model_path, device=device)
self.corpus_embeddings = self.embedder.encode(self.corpus, convert_to_tensor=True)
def retrieving(self, query, top_k):
query_embedding = self.embedder.encode(query, convert_to_tensor=True)
@@ -129,6 +148,7 @@ def compute_accuracy(retriever, data, args,name='train'):
parser.add_argument('--input_query_file', type=str, required=True, help='input path')
parser.add_argument('--idx_file', type=str, required=True, help='idx path')
parser.add_argument('--LIB', type=str, required=True, help='lib')
parser.add_argument("--max_seq_length", default=256, type=int, required=True,help="Max sequence length.")
args = parser.parse_args()

# Step 1: Load API data from the JSON file
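This is the "fix bert loading" part of the commit: a raw `bert-base-uncased` checkpoint carries no sentence-embedding head, so the code now assembles one from a Transformer module plus mean pooling rather than relying on `SentenceTransformer`'s implicit fallback. A standalone sketch of that construction (the `max_seq_length` value and input sentence are examples):

```python
from sentence_transformers import SentenceTransformer, models

# Wrap a plain BERT encoder and stack a mean-pooling layer on top, which is
# what turns per-token outputs into one fixed-size sentence embedding.
word_embedding_model = models.Transformer("bert-base-uncased", max_seq_length=256)
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
embedder = SentenceTransformer(modules=[word_embedding_model, pooling_model])

emb = embedder.encode(["filter cells by minimum gene count"], convert_to_tensor=True)
print(emb.shape)  # torch.Size([1, 768]) for bert-base-uncased
```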
12 changes: 5 additions & 7 deletions src/models/train_retriever.py
@@ -1,15 +1,11 @@

- import logging
- import os
- import json
+ import logging, os, json
import pandas as pd
from datetime import datetime
import torch
- #torch.cuda.set_per_process_memory_fraction(0.5)
import torch.nn as nn
- from sentence_transformers import SentenceTransformer, models, InputExample, losses, LoggingHandler
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
+ #torch.cuda.set_per_process_memory_fraction(0.5)
+ from sentence_transformers import SentenceTransformer, models, InputExample, losses, LoggingHandler
from models.api_evaluator import APIEvaluator
import argparse
- import os
@@ -57,8 +53,10 @@ def main():
parser.add_argument("--max_seq_length", default=256, type=int, required=True,help="Max sequence length.")
parser.add_argument("--optimize_top_k", default=3, type=int, required=True,help="The metric which to save best model")
parser.add_argument("--plot_dir", default="./plot/retriever/", type=str, required=True,help="plot dir for saving")
parser.add_argument("--gpu", type=str, default="0", help="GPU to use")
args = parser.parse_args()

+ torch.cuda.set_device(int(args.gpu))
torch.manual_seed(42)
torch.cuda.manual_seed(42)

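The `--gpu` flag plus `torch.cuda.set_device(int(args.gpu))` is the "fix gpu index selection" piece. Note that `set_device` indexes the devices the process can *see*, so combining the flag with `CUDA_VISIBLE_DEVICES` can surprise. A hedged defensive variant (not the repository's code) might validate the index first:

```python
import torch

def select_gpu(index: int) -> torch.device:
    """Pin the process to one GPU, validating the index and falling back to CPU."""
    if not torch.cuda.is_available():
        return torch.device("cpu")
    if index >= torch.cuda.device_count():
        # e.g. --gpu 1 while CUDA_VISIBLE_DEVICES=1 leaves only one visible device
        raise ValueError(f"GPU {index} requested, but only {torch.cuda.device_count()} visible")
    torch.cuda.set_device(index)
    return torch.device(f"cuda:{index}")
```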
