enhance gpt response compatibility, fix gpu index selection, fix bert loading

DoraDong-2023 committed Jan 22, 2024
1 parent 116a223 commit f3ab9b9
Showing 6 changed files with 70 additions and 27 deletions.
8 changes: 2 additions & 6 deletions docs/PyPI2APP.md
@@ -114,7 +114,6 @@ python inference/retriever_bm25_inference.py --LIB ${LIB} --top_k 3
7. Fine-tune the retriever.
You can finetune the retriever based on the [bert-base-uncased](https://huggingface.co/bert-base-uncased) model
```bash
- CUDA_VISIBLE_DEVICES=0 # if you use gpu
mkdir ./hugging_models/retriever_model_finetuned/${LIB}
python models/train_retriever.py \
--data_path ./data/standard_process/${LIB}/retriever_train_data/ \
@@ -126,15 +125,15 @@ python models/train_retriever.py \
--warmup_steps 500 \
--max_seq_length 256 \
--optimize_top_k 3 \
- --plot_dir ./plot/${LIB}/retriever/
+ --plot_dir ./plot/${LIB}/retriever/ \
+ --gpu "1"
```

You can check the training performance curve under `./src/plot/${LIB}/` to determine whether you need more epochs.

8. Test the inference performance using:
```bash
export HUGGINGPATH=./hugging_models
- CUDA_VISIBLE_DEVICES=0 # if you use gpu
python inference/retriever_finetune_inference.py \
--retrieval_model_path ./hugging_models/retriever_model_finetuned/${LIB}/assigned \
--max_seq_length 256 \
@@ -161,7 +160,6 @@ Please refer to [lit-llama](https://github.com/Lightning-AI/lit-llama) for getti

process data:
```bash
- CUDA_VISIBLE_DEVICES=0
export TOKENIZERS_PARALLELISM=true
python models/data_classification.py \
--pretrained_path ./hugging_models/llama-2-finetuned/checkpoints/lite-llama2/lit-llama.pth \
@@ -184,7 +182,6 @@ python models/data_classification.py \

Then, finetune model:
```bash
- CUDA_VISIBLE_DEVICES=0 \
python models/train_classification.py \
--data_dir ./data/standard_process/${LIB}/classification_train/ \
--out_dir ./hugging_models/llama-2-finetuned/${LIB}/finetuned/ \
@@ -195,7 +192,6 @@ python models/train_classification.py \

Finally, check the performance:
```bash
- CUDA_VISIBLE_DEVICES=0 \
python models/inference_classification.py \
--data_dir ./data/standard_process/${LIB}/classification_train/ \
--checkpoint_dir ./hugging_models/llama-2-finetuned/${LIB}/finetuned/combined_model_checkpoint.pth \
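Throughout these doc snippets, the hard-coded `CUDA_VISIBLE_DEVICES=0` lines are dropped in favor of an explicit `--gpu` flag (wired up in `train_retriever.py` below). The two mechanisms are not interchangeable; a minimal sketch of the distinction, with example values only:

```python
import os

# CUDA_VISIBLE_DEVICES filters and *renumbers* GPUs before CUDA initializes:
# with "1", physical GPU 1 is the only visible device and appears as cuda:0.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"  # must be set before torch touches CUDA

import torch

if torch.cuda.is_available():
    # torch.cuda.set_device indexes the *visible* devices, so 0 here means
    # "the first GPU this process can see", i.e. physical GPU 1.
    torch.cuda.set_device(0)
    print(torch.cuda.current_device())  # -> 0
```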
15 changes: 15 additions & 0 deletions src/dataloader/check_valid_API_annotate.py
@@ -54,13 +54,27 @@ def check_all_queries_unique(annotated_data):
else:
print("All queries are unique.")

+ def check_api_presence_in_inquiry(composite_data, inquiry_data):
+     """
+     Check if all APIs in the composite dataset are present in the inquiry dataset.
+     """
+     composite_apis = set(item for item in composite_data)
+     inquiry_apis = set(item['api_calling'][0].split('(')[0] for item in inquiry_data)
+     print(f'length of composite/inquiry is {len(composite_apis)}, {len(inquiry_apis)}')
+     missing_apis = composite_apis - inquiry_apis
+     if missing_apis:
+         print(f"Missing APIs in inquiry dataset: {missing_apis}")
+     else:
+         print("All APIs in composite dataset are present in inquiry dataset.")

def main():
parser = argparse.ArgumentParser(description="Check data integrity for training and testing datasets.")
parser.add_argument("lib", type=str, help="Library name for the JSON data.")
args = parser.parse_args()

inquiry_data = load_data(f'./data/standard_process/{args.lib}/API_inquiry.json')
annotated_data = load_data(f'./data/standard_process/{args.lib}/API_inquiry_annotate.json')
+     composite_data = load_data(f'./data/standard_process/{args.lib}/API_composite.json')

train_data, test_data = get_training_and_test_sets(inquiry_data, annotated_data)

Expand All @@ -69,6 +83,7 @@ def main():
check_for_query_text_overlap(train_data, test_data)
print("All checks passed successfully.")
check_all_queries_unique(annotated_data)
+     check_api_presence_in_inquiry(composite_data, inquiry_data)

if __name__ == "__main__":
main()
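The new `check_api_presence_in_inquiry` guard catches APIs that exist in `API_composite.json` but never appear in any inquiry. A toy illustration of the set logic; the API names and record shapes below are made up for the example:

```python
# Hypothetical records mimicking API_composite.json (dict keyed by API name)
# and API_inquiry.json (list of entries carrying an "api_calling" list).
composite_data = {"scanpy.pp.filter_cells": {}, "scanpy.pp.log1p": {}}
inquiry_data = [{"api_calling": ["scanpy.pp.filter_cells(data, min_genes=200)"]}]

composite_apis = set(composite_data)  # iterating a dict yields its keys
inquiry_apis = {item["api_calling"][0].split("(")[0] for item in inquiry_data}

print(composite_apis - inquiry_apis)  # -> {'scanpy.pp.log1p'}: in composite, never asked about
```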
22 changes: 14 additions & 8 deletions src/dataloader/preprocess_retriever_data.py
@@ -23,13 +23,19 @@
prompt_oneapi_whole = f"{Task_Description_of_Singletool_oneapi_Instructions_whole}\n{Other_Requirements_singletool_oneapi_whole}"

def unify_response_format(response):
-     list_pattern = re.compile(r'\[\{.*?\}\]', re.DOTALL)
-     matched_lists = list_pattern.findall(response)
-     unified_response = []
-     for single_response in matched_lists:
-         response_list = ast.literal_eval(single_response)
-         unified_response.extend(response_list)
-     return unified_response
+     try:
+         return json.loads(response)
+     except json.JSONDecodeError:
+         list_pattern = re.compile(r'\[\{.*?\}\]', re.DOTALL)
+         matched_lists = list_pattern.findall(response)
+         unified_response = []
+         for single_response in matched_lists:
+             try:
+                 response_list = ast.literal_eval(single_response)
+                 unified_response.extend(response_list)
+             except (ValueError, SyntaxError):
+                 pass
+         return unified_response

async def async_LLM_response(llm, tokenizer, prompt, history=[], kwargs={}):
loop = asyncio.get_event_loop()
@@ -53,7 +59,7 @@ async def process_prompt_async(api_name, api, llm, tokenizer, prompt_template, p
except:
pass
retry_count += 1
- #print('GPT response:', response)
+ print('GPT response:', response)
if not valid_response:
return []
results = []
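`unify_response_format` now prefers strict JSON and only falls back to regex-plus-`ast.literal_eval` scavenging, skipping fragments that fail to parse instead of crashing. A self-contained sketch of that control flow (a minimal re-statement under the same assumptions, not the module's exact code):

```python
import ast
import json
import re

def parse_llm_list(response):
    """JSON first; otherwise scan for embedded [{...}] blocks and merge them."""
    try:
        return json.loads(response)
    except json.JSONDecodeError:
        out = []
        for chunk in re.findall(r'\[\{.*?\}\]', response, re.DOTALL):
            try:
                out.extend(ast.literal_eval(chunk))
            except (ValueError, SyntaxError):
                pass  # ignore fragments that are neither JSON nor Python literals
        return out

print(parse_llm_list('[{"Query": "Filter cells"}]'))             # valid-JSON path
print(parse_llm_list("Sure! [{'Query': 'Log-normalize data'}]"))  # regex + literal_eval path
```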
14 changes: 11 additions & 3 deletions src/deploy/inference_dialog_server.py
@@ -909,9 +909,17 @@ def run_pipeline_after_doublechecking_API_selection(self, user_input):
response, _ = LLM_response(self.llm, self.tokenizer, parameters_prompt, history=[], kwargs={})
logging.info(f'==>Asking GPT: %s, ==>GPT response: %s', parameters_prompt, response)
returned_content_str_new = response.replace('null', 'None').replace('None', '"None"')
- returned_content = ast.literal_eval(returned_content_str_new)
- success = True
- break
+ try:
+     returned_content = ast.literal_eval(returned_content_str_new)
+     success = True
+     break
+ except:
+     try:
+         returned_content = json.loads(returned_content_str_new)
+         success = True
+         break
+     except:
+         pass
except Exception as e:
pass
#return # 231130 fix
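The server now tolerates GPT replies in either Python-literal or JSON form when extracting the predicted parameter dict. A hedged sketch of the two-stage parse; the function name and inputs are illustrative, not the server's code:

```python
import ast
import json

def parse_gpt_dict(text):
    """Normalize null/None, then try Python-literal parsing before JSON."""
    text = text.replace('null', 'None').replace('None', '"None"')
    try:
        return ast.literal_eval(text)
    except (ValueError, SyntaxError):
        return json.loads(text)  # last resort; raises if neither format fits

print(parse_gpt_dict("{'min_genes': 200, 'copy': None}"))  # parsed by ast.literal_eval
print(parse_gpt_dict('{"inplace": false, "copy": true}'))  # ast rejects true/false; json.loads succeeds
```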
26 changes: 23 additions & 3 deletions src/inference/retriever_finetune_inference.py
@@ -3,7 +3,7 @@
from tqdm import tqdm
import pandas as pd
from configs.model_config import HUGGINGPATH
- from sentence_transformers import SentenceTransformer, util
+ from sentence_transformers import SentenceTransformer, util, models, InputExample, losses, LoggingHandler
from inference.utils import process_retrieval_document_query_version, compress_api_str_from_list_query_version
import torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
@@ -47,7 +47,16 @@ def build_retrieval_corpus(self, corpus_tsv_path):
corpus_ids = list(corpus.keys())
corpus = [corpus[cid] for cid in corpus_ids]
self.corpus = corpus
- self.embedder = SentenceTransformer(self.model_path, device=device)
+ print(f'modelpath: {self.model_path}')
+ if self.model_path=='bert-base-uncased':
+     print('using unpretrained model!!!')
+     word_embedding_model = models.Transformer(self.model_path, max_seq_length=args.max_seq_length)
+     pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
+     self.embedder = SentenceTransformer(modules=[word_embedding_model, pooling_model])
+ elif 'hugging_models' in self.model_path:
+     self.embedder = SentenceTransformer(self.model_path, device=device)
+ else:
+     raise ValueError
self.corpus_embeddings = self.embedder.encode(self.corpus, convert_to_tensor=True)
def build_and_merge_corpus(self, add_base=True):
# based on build_retrieval_corpus, add API_base.json, fix 231227
@@ -62,7 +71,17 @@ def build_and_merge_corpus(self, add_base=True):
corpus_ids = list(corpus.keys())
corpus = [corpus[cid] for cid in corpus_ids]
self.corpus = corpus
- self.embedder = SentenceTransformer(self.model_path, device=device)
+ if self.model_path=='bert-base-uncased':
+     print('using unpretrained model!!!')
+     word_embedding_model = models.Transformer(self.model_path, max_seq_length=args.max_seq_length)
+     pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
+     self.embedder = SentenceTransformer(modules=[word_embedding_model, pooling_model])
+ elif 'hugging_models/' in self.model_path:
+     print('using pretrained model!!!')
+     self.embedder = SentenceTransformer(self.model_path, device=device)
+ else:
+     raise ValueError
+ #self.embedder = SentenceTransformer(self.model_path, device=device)
self.corpus_embeddings = self.embedder.encode(self.corpus, convert_to_tensor=True)
def retrieving(self, query, top_k):
query_embedding = self.embedder.encode(query, convert_to_tensor=True)
@@ -129,6 +148,7 @@ def compute_accuracy(retriever, data, args,name='train'):
parser.add_argument('--input_query_file', type=str, required=True, help='input path')
parser.add_argument('--idx_file', type=str, required=True, help='idx path')
parser.add_argument('--LIB', type=str, required=True, help='lib')
parser.add_argument("--max_seq_length", default=256, type=int, required=True,help="Max sequence length.")
args = parser.parse_args()

# Step 1: Load API data from the JSON file
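This is the "fix bert loading" part of the commit: a raw `bert-base-uncased` checkpoint carries no sentence-embedding head, so the code now assembles one from a Transformer module plus mean pooling rather than relying on `SentenceTransformer`'s implicit fallback. A standalone sketch of that construction (the `max_seq_length` value and input sentence are examples):

```python
from sentence_transformers import SentenceTransformer, models

# Wrap a plain BERT encoder and stack a mean-pooling layer on top, which is
# what turns per-token outputs into one fixed-size sentence embedding.
word_embedding_model = models.Transformer("bert-base-uncased", max_seq_length=256)
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
embedder = SentenceTransformer(modules=[word_embedding_model, pooling_model])

emb = embedder.encode(["filter cells by minimum gene count"], convert_to_tensor=True)
print(emb.shape)  # torch.Size([1, 768]) for bert-base-uncased
```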
12 changes: 5 additions & 7 deletions src/models/train_retriever.py
@@ -1,15 +1,11 @@

- import logging
- import os
- import json
+ import logging, os, json
import pandas as pd
from datetime import datetime
import torch
- #torch.cuda.set_per_process_memory_fraction(0.5)
import torch.nn as nn
- from sentence_transformers import SentenceTransformer, models, InputExample, losses, LoggingHandler
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
+ #torch.cuda.set_per_process_memory_fraction(0.5)
+ from sentence_transformers import SentenceTransformer, models, InputExample, losses, LoggingHandler
from models.api_evaluator import APIEvaluator
import argparse
- import os
@@ -57,8 +53,10 @@ def main():
parser.add_argument("--max_seq_length", default=256, type=int, required=True,help="Max sequence length.")
parser.add_argument("--optimize_top_k", default=3, type=int, required=True,help="The metric which to save best model")
parser.add_argument("--plot_dir", default="./plot/retriever/", type=str, required=True,help="plot dir for saving")
parser.add_argument("--gpu", type=str, default="0", help="GPU to use")
args = parser.parse_args()

+ torch.cuda.set_device(int(args.gpu))
torch.manual_seed(42)
torch.cuda.manual_seed(42)

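The `--gpu` flag plus `torch.cuda.set_device(int(args.gpu))` is the "fix gpu index selection" piece. Note that `set_device` indexes the devices the process can *see*, so combining the flag with `CUDA_VISIBLE_DEVICES` can surprise. A hedged defensive variant (not the repository's code) might validate the index first:

```python
import torch

def select_gpu(index: int) -> torch.device:
    """Pin the process to one GPU, validating the index and falling back to CPU."""
    if not torch.cuda.is_available():
        return torch.device("cpu")
    if index >= torch.cuda.device_count():
        # e.g. --gpu 1 while CUDA_VISIBLE_DEVICES=1 leaves only one visible device
        raise ValueError(f"GPU {index} requested, but only {torch.cuda.device_count()} visible")
    torch.cuda.set_device(index)
    return torch.device(f"cuda:{index}")
```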
