Skip to content

Commit

Permalink
update utils function
Browse files Browse the repository at this point in the history
  • Loading branch information
DoraDong-2023 committed Mar 15, 2024
1 parent 69d1bb9 commit 302d78f
Show file tree
Hide file tree
Showing 4 changed files with 17 additions and 32 deletions.
34 changes: 9 additions & 25 deletions src/deploy/inference_dialog_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,8 @@
from typing import Any
import multiprocessing
from sentence_transformers import SentenceTransformer, models

from inference.utils import predict_by_similarity
from tqdm import tqdm
from deploy.utils import change_format

import logging
Expand Down Expand Up @@ -125,11 +126,6 @@ def generate_api_calling(api_name, api_details, returned_content_str):
}
return api_name, api_calling, output

def predict_by_similarity(user_query_vector, centroids, labels):
    """Pick the label whose centroid scores highest against the query.

    Each centroid is reshaped to a single-row 2D array and compared to
    ``user_query_vector`` with ``cosine_similarity``; the label at the
    position of the largest score is returned.

    Args:
        user_query_vector: Query embedding passed as the first argument of
            ``cosine_similarity``.
            NOTE(review): presumably a 2D array with one row -- confirm
            against callers.
        centroids: Iterable of arrays, each supporting ``reshape(1, -1)``.
        labels: Sequence indexed by the position of the winning centroid.

    Returns:
        The element of ``labels`` at the argmax position.
    """
    scores = []
    for center in centroids:
        # cosine_similarity is given a 2D array, hence the (1, -1) reshape.
        scores.append(cosine_similarity(user_query_vector, center.reshape(1, -1)))
    # np.argmax works on the flattened scores; the index lines up with
    # `labels` only if each score holds a single value (one query row).
    best_index = np.argmax(scores)
    return labels[best_index]

from tqdm import tqdm
def infer(query, model, centroids, labels):
# 240125 modified chitchat model
user_query_vector = np.array([sentence_transformer_embed(model, query)])
Expand Down Expand Up @@ -362,7 +358,6 @@ def install_lib_simple(self,lib_name, lib_alias, api_html=None, github_url=None,
subprocess.run(['pip', 'install', f'{lib_alias}'])
[callback.on_agent_action(block_id="installation-" + str(self.indexxxx), task="Lib downloaded...",task_title="0") for callback in self.callbacks]
self.indexxxx+=1

if doc_url and api_html:
download_readthedoc(doc_url, api_html)
[callback.on_agent_action(block_id="installation-" + str(self.indexxxx), task="Preparing API_init.json ...",task_title="26") for callback in self.callbacks]
Expand All @@ -377,7 +372,6 @@ def install_lib_simple(self,lib_name, lib_alias, api_html=None, github_url=None,
shutil.copy(f'./data/standard_process/{self.LIB}/API_init.json', f'./data/standard_process/{self.LIB}/API_composite.json')
[callback.on_agent_action(block_id="installation-" + str(self.indexxxx), task="Finished API_composite.json ...",task_title="39") for callback in self.callbacks]
self.indexxxx+=1

###########
[callback.on_agent_action(block_id="installation-" + str(self.indexxxx), task="Preparing instruction generation API_inquiry.json ...",task_title="52") for callback in self.callbacks]
self.indexxxx+=1
Expand Down Expand Up @@ -423,7 +417,6 @@ def install_lib_full(self,lib_name, lib_alias, api_html=None, github_url=None, d
#from configs.model_config import get_all_variable_from_cheatsheet
#info_json = get_all_variable_from_cheatsheet(lib_name)
#API_HTML, TUTORIAL_GITHUB = [info_json[key] for key in ['API_HTML', 'TUTORIAL_GITHUB']]

self.LIB = lib_name
self.args_retrieval_model_path = f'./hugging_models/retriever_model_finetuned/{lib_name}/assigned'
from configs.model_config import GITHUB_PATH, ANALYSIS_PATH, READTHEDOC_PATH
Expand Down Expand Up @@ -492,15 +485,16 @@ def install_lib_full(self,lib_name, lib_alias, api_html=None, github_url=None, d
"python",
"models/train_retriever.py",
"--data_path", f"./data/standard_process/{self.LIB}/retriever_train_data/",
"--model_name", "bert-base-uncased",
"--model_name", "all-MiniLM-L6-v2",
"--output_path", f"./hugging_models/retriever_model_finetuned/{self.LIB}",
"--num_epochs", "25",
"--num_epochs", "20",
"--train_batch_size", "32",
"--learning_rate", "1e-5",
"--warmup_steps", "500",
"--max_seq_length", "256",
"--optimize_top_k", "3",
"--plot_dir", f"./plot/{self.LIB}/retriever/"
"--gpu '0'"
]
subprocess.run(command)
base64_image = convert_image_to_base64(f"./plot/{self.LIB}/retriever/ndcg_plot.png")
Expand Down Expand Up @@ -681,13 +675,10 @@ def run_pipeline(self, user_input, lib, top_k=3, files=[],conversation_started=T
else:
sampled_shuffled = random.sample(self.retriever.shuffled_data, 5)
instruction_shot_example = "".join(["\nInstruction: " + ex['query'] + "\nFunction: " + ex['gold'] for ex in sampled_shuffled])
api_predict_prompt = f"""
Task: choose one of the following APIs to use for the instruction.
{json.dumps(description_jsons)}
{instruction_shot_example}
Instruction: {user_input}
API:
"""
# 240315: substitute prompt
from gpt.utils import get_retrieved_prompt, get_nonretrieved_prompt
api_predict_init_prompt = get_retrieved_prompt()
api_predict_prompt = api_predict_init_prompt.format(query=user_input, retrieved_apis=json.dumps(description_jsons), similar_queries=instruction_shot_example)
success = False
for attempt in range(3):
try:
Expand Down Expand Up @@ -1420,7 +1411,6 @@ def run_pipeline_execution_code_list(self, execution_code_list, output_file):
output_list = []
for code in execution_code_list:
ori_code = code
print(f'start executing code: {code}')
if 'import' in code:
add_tmp = None
pass
Expand All @@ -1429,17 +1419,13 @@ def run_pipeline_execution_code_list(self, execution_code_list, output_file):
ans = self.executor.execute_api_call(code, "code", output_file=output_file)
# process tmp variable, if not None, add it to the
if add_tmp:
print('add tmp!')
print('tmp' in self.executor.variables)
if ('tmp' in self.executor.variables):
self.executor.counter+=1
self.executor.variables['result_'+str(self.executor.counter+1)] = {
"type": self.executor.variables['tmp']['type'],
"value": self.executor.variables['tmp']['value']
}
print('added tmp to ', 'result_'+str(self.executor.counter+1))
code, _ = self.modify_code_add_tmp(ori_code, 'result_'+str(self.executor.counter+1)) # add `tmp =`
print('add normal variable :', code)
ans = self.executor.execute_api_call(code, "code", output_file=output_file)
print('%s, %s', str(code), str(ans))
if ans:
Expand All @@ -1451,8 +1437,6 @@ def run_pipeline_execution_code_list(self, execution_code_list, output_file):
else:
pass
#sys.stdout.close()
print('variables keys: ', self.executor.variables.keys())
print(self.executor.variables)
result = json.dumps({'code': code, 'output_list': output_list})
self.executor.save_environment("./tmp/tmp_output_run_pipeline_execution_code_variables.pkl")
with open("./tmp/tmp_output_run_pipeline_execution_code_list.txt", 'w') as file:
Expand Down
2 changes: 1 addition & 1 deletion src/inference/retriever_finetune_inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ def retrieve_similar_queries(self, query, shot_k=5):
query_embedding = self.embedder.encode(query, convert_to_tensor=True)
hits = util.semantic_search(query_embedding, self.shuffled_query_embeddings, top_k=shot_k, score_function=util.cos_sim)
#similar_queries = [shuffled_data[hit['corpus_id']] for hit in hits[0]]
similar_queries = ["\nExample Instruction: " + self.shuffled_data[hit['corpus_id']]['query'] + "\nExample Function: " + self.shuffled_data[hit['corpus_id']]['gold'] for hit in hits[0]]
similar_queries = ["\nInstruction: " + self.shuffled_data[hit['corpus_id']]['query'] + "\nFunction: " + self.shuffled_data[hit['corpus_id']]['gold'] for hit in hits[0]]
return ''.join(similar_queries)

def compute_accuracy(retriever, data, args,name='train'):
Expand Down
6 changes: 6 additions & 0 deletions src/inference/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,12 @@
from gpt.utils import get_all_api_json, find_similar_api_pairs, is_pair_in_merged_pairs
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def predict_by_similarity(user_query_vector, centroids, labels):
    """Return the label of the centroid most cosine-similar to the query.

    Args:
        user_query_vector: Query embedding handed to ``cosine_similarity``.
            NOTE(review): presumably shaped (1, dim) -- confirm with callers.
        centroids: Iterable of arrays; each one is reshaped to ``(1, -1)``
            before being compared with the query.
        labels: Sequence from which the result is selected by index.

    Returns:
        ``labels[i]`` where ``i`` is the argmax over the collected
        similarity scores.
    """
    per_centroid_scores = []
    for centroid in centroids:
        row = centroid.reshape(1, -1)  # single-row 2D view of the centroid
        per_centroid_scores.append(cosine_similarity(user_query_vector, row))
    # argmax is taken over the flattened list of score arrays; this equals
    # the centroid position when every score array has exactly one element.
    winner = np.argmax(per_centroid_scores)
    return labels[winner]

def find_similar_two_pairs(lib_name):
from collections import defaultdict
Expand Down
7 changes: 1 addition & 6 deletions src/models/chitchat_classification.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,10 @@
import argparse, os, json, torch, glob, time, pickle
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from tqdm import tqdm
from sentence_transformers import SentenceTransformer, models
from inference.utils import sentence_transformer_embed, bert_embed
from inference.utils import sentence_transformer_embed, bert_embed, predict_by_similarity
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

Expand Down Expand Up @@ -80,10 +79,6 @@ def calculate_centroid(data, model_chosen):
#print('ans', ans.shape)
return ans

def predict_by_similarity(user_query_vector, centroids, labels):
    """Classify a query vector by its nearest centroid under cosine similarity.

    Args:
        user_query_vector: Query embedding; used as the first argument of
            ``cosine_similarity``.
            NOTE(review): looks like a one-row 2D array is expected -- confirm.
        centroids: Iterable of arrays, one per entry in ``labels``; each is
            reshaped to ``(1, -1)`` for the comparison.
        labels: Sequence of labels, indexed by the winning position.

    Returns:
        The entry of ``labels`` at the index of the maximum similarity.
    """
    similarity_scores = [
        cosine_similarity(user_query_vector, np.reshape(c, (1, -1)))
        for c in centroids
    ]
    # np.argmax flattens the list of score arrays before locating the maximum.
    top = np.argmax(similarity_scores)
    return labels[top]

def plot_tsne_distribution_modified(lib_name, train_data, test_data, model, labels, c2_accuracy,embed_method):
# Combine train and test data for t-SNE
combined_data = pd.concat([train_data['Question'], test_data['Question']], ignore_index=True)
Expand Down

0 comments on commit 302d78f

Please sign in to comment.