From ece10d1a4879a7473f7e2ef30d542cc110fc2bd0 Mon Sep 17 00:00:00 2001
From: DoraDong-2023
Date: Sat, 27 Jan 2024 18:01:43 -0500
Subject: [PATCH] update openai version, fix details in chitchat

---
 src/deploy/inference_dialog_server.py |  2 +-
 src/gpt/gpt_interface.py              | 14 +++++-----
 src/models/chitchat_classification.py | 40 ++++++++++++++++++++++-----
 src/models/model.py                   | 15 ++++------
 src/requirements.txt                  |  2 +-
 5 files changed, 47 insertions(+), 26 deletions(-)

diff --git a/src/deploy/inference_dialog_server.py b/src/deploy/inference_dialog_server.py
index 5089572..cf6289b 100644
--- a/src/deploy/inference_dialog_server.py
+++ b/src/deploy/inference_dialog_server.py
@@ -265,7 +265,7 @@ def __init__(self):
         logging.info('==>chitchat vectorizer loaded!')
         self.retrieve_query_mode = "similar"
         logging.info("Server ready")
-    def load_bert_model(self, load_mode='finetuned_bert'):
+    def load_bert_model(self, load_mode='unfinetuned_bert'):
         if load_mode=='unfinetuned_bert':
             word_embedding_model = models.Transformer('bert-base-uncased', max_seq_length=256)
             pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
diff --git a/src/gpt/gpt_interface.py b/src/gpt/gpt_interface.py
index 51183a1..3328675 100644
--- a/src/gpt/gpt_interface.py
+++ b/src/gpt/gpt_interface.py
@@ -26,16 +26,16 @@ def setup_openai(fname, mode='azure'):
 
 @T.retry(stop=T.stop_after_attempt(5), wait=T.wait_fixed(60), after=lambda s: logging.error(repr(s)))
 def query_openai(prompt, mode='azure', model='gpt-35-turbo', **kwargs):
+    # 240127: update openai version
     if mode == 'openai':
-        response = openai.ChatCompletion.create(
-            model=model,
-            messages=[{'role': 'user', 'content': prompt}],
-            **kwargs
-        )
+        response = openai.chat.completions.create(model=model,
+                                                messages=[{'role': 'user', 'content': prompt}],
+                                                **kwargs
+                                                )
     else:
-        response = openai.ChatCompletion.create(
+        response = openai.chat.completions.create(
             deployment_id=model,
             messages=[{'role': 'user', 'content': prompt}],
             **kwargs,
         )
-    return response['choices'][0]['message']['content']
\ No newline at end of file
+    return response.choices[0].message.content
\ No newline at end of file
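Note on the hunk above: the `mode='openai'` branch is a straight port to the 1.x SDK, but the Azure branch still passes `deployment_id=`, a 0.28-era keyword that `chat.completions.create` no longer accepts in openai>=1.0; the 1.x SDK routes Azure traffic through an `AzureOpenAI` client and takes the deployment name as `model`. A minimal sketch of what that branch could look like, assuming credentials come from environment variables (the variable names and API version below are placeholders, not values from this repo):

    import os
    from openai import AzureOpenAI

    # Placeholder configuration; this repo loads its keys via setup_openai() instead.
    client = AzureOpenAI(
        azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
        api_key=os.environ["AZURE_OPENAI_API_KEY"],
        api_version="2023-12-01-preview",  # assumption: any API version the 1.x SDK supports
    )

    def query_azure(prompt, deployment='gpt-35-turbo', **kwargs):
        # In the 1.x SDK the Azure deployment name is passed as `model`.
        response = client.chat.completions.create(
            model=deployment,
            messages=[{'role': 'user', 'content': prompt}],
            **kwargs,
        )
        return response.choices[0].message.content
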
diff --git a/src/models/chitchat_classification.py b/src/models/chitchat_classification.py
index 781961d..47aba33 100644
--- a/src/models/chitchat_classification.py
+++ b/src/models/chitchat_classification.py
@@ -85,7 +85,7 @@ def sampledata_combine(data1, data2, data3, test_data3, train_count_data1=1000,
 def calculate_centroid(data, embed_method):
     if embed_method == "original":
         embeddings = np.array([bert_embed(bert_trans_model, tokenizer,text,
-        ) for text in tqdm(data, desc="Processing with original BERT")])
+        ).cpu() for text in tqdm(data, desc="Processing with original BERT")])
     elif embed_method == "st_untrained":
         print('Using pretrained model!!!')
         embeddings = np.array([sentence_transformer_embed(unpretrained_model, text).cpu() for text in tqdm(data, desc="Processing with unpretrained sentencetransformer BERT")])
@@ -101,13 +101,14 @@ def predict_by_similarity(user_query_vector, centroids, labels):
     similarities = [cosine_similarity(user_query_vector, centroid.reshape(1, -1)) for centroid in centroids]
     return labels[np.argmax(similarities)]
 
-def plot_tsne_distribution_modified(lib_name, train_data, test_data, vectorizer, labels, c2_accuracy):
+def plot_tsne_distribution_modified(lib_name, train_data, test_data, model, labels, c2_accuracy,embed_method):
     import matplotlib.pyplot as plt
     from sklearn.manifold import TSNE
     # Combine train and test data for t-SNE
     combined_data = pd.concat([train_data['Question'], test_data['Question']], ignore_index=True)
-    tfidf_matrix_combined = vectorizer.transform(combined_data)
+    #tfidf_matrix_combined = vectorizer.transform(combined_data)
+    tfidf_matrix_combined = sentence_transformer_embed(model, combined_data).cpu()
     tsne = TSNE(n_components=2, random_state=40, init='random')
     reduced_data_combined = tsne.fit_transform(tfidf_matrix_combined)  # Fit and transform combined data
@@ -132,7 +133,7 @@ def plot_tsne_distribution_modified(lib_name, train_data, test_data, vectorizer,
     formatted_accuracy = "{:.2f}".format(c2_accuracy)
     plt.title(f't-SNE visualization of train data with test accuracy for api/non-api {formatted_accuracy}%')
     plt.legend()
-    plt.savefig(f'./plot/{lib_name}/chitchat_train_tsne_modified.png')
+    plt.savefig(f'./plot/{lib_name}/chitchat_train_tsne_modified_{embed_method}.pdf')
     plt.clf()  # Clear the current figure
 
     # Create a figure for test data
@@ -146,7 +147,7 @@ def plot_tsne_distribution_modified(lib_name, train_data, test_data, vectorizer,
     plt.title('t-SNE visualization of test data')
     plt.legend()
-    plt.savefig(f'./plot/{lib_name}/chitchat_test_tsne_modified.png')
+    plt.savefig(f'./plot/{lib_name}/chitchat_test_tsne_modified_{embed_method}.pdf')
 
 def main():
     process_topicalchat()
@@ -192,7 +193,17 @@ def calculate_accuracy(test_data, centroids, labels):
     correct_predictions = 0
     for index, row in test_data.iterrows():
         #user_query_vector = vectorizer.transform([row['Question']])
-        user_query_vector = bert_embed(bert_trans_model, tokenizer,row['Question'],device=device).reshape(1, -1)
+        if args.embed_method == "original":
+            print('using original')
+            user_query_vector = np.array([bert_embed(bert_trans_model, tokenizer,[row['Question']], )])
+        elif args.embed_method == "st_untrained":
+            print('Using pretrained model!!!')
+            user_query_vector = np.array([sentence_transformer_embed(unpretrained_model, [row['Question']]).cpu()])
+        elif args.embed_method == "st_trained":
+            print('using finetuned model')
+            user_query_vector = np.array([sentence_transformer_embed(pretrained_model, [row['Question']]).cpu()])
+
+        user_query_vector = user_query_vector.flatten().reshape(1,-1)
         predicted_label = predict_by_similarity(user_query_vector, centroids, labels)
         actual_label = row['Source']
         if predicted_label == actual_label:
@@ -205,7 +216,16 @@ def calculate_accuracy(test_data, centroids, labels):
     correct_predictions = 0
     for index, row in test_data.iterrows():
         #user_query_vector = vectorizer.transform([row['Question']])
-        user_query_vector = bert_embed(bert_trans_model, tokenizer,row['Question'],device=device).reshape(1, -1)
+        #user_query_vector = bert_embed(bert_trans_model, tokenizer,row['Question'],device=device).reshape(1, -1)
+        if args.embed_method == "original":
+            user_query_vector = np.array(bert_embed(bert_trans_model, tokenizer,[row['Question']], ))
+        elif args.embed_method == "st_untrained":
+            print('Using pretrained model!!!')
+            user_query_vector = np.array(sentence_transformer_embed(unpretrained_model, [row['Question']]).cpu())
+        elif args.embed_method == "st_trained":
+            user_query_vector = np.array(sentence_transformer_embed(pretrained_model, [row['Question']]).cpu())
+        user_query_vector = user_query_vector.flatten().reshape(1,-1)
+
         predicted_label = predict_by_similarity(user_query_vector, centroids, labels)
         actual_label = row['Source']
         if (actual_label=='api-query' and predicted_label=='api-query') or (actual_label!='api-query' and predicted_label!='api-query'):
@@ -223,6 +243,12 @@ def calculate_accuracy(test_data, centroids, labels):
     os.makedirs(f"./plot/{args.LIB}", exist_ok=True)
     print(f"Centroids saved. Time taken: {time.time() - start_time:.2f} seconds")
     start_time = time.time()
+    if args.embed_method == "original":
+        plot_tsne_distribution_modified(args.LIB, train_data, test_data, bert_trans_model, labels, c2_accuracy,args.embed_method)
+    elif args.embed_method == "st_untrained":
+        plot_tsne_distribution_modified(args.LIB, train_data, test_data, unpretrained_model, labels, c2_accuracy,args.embed_method)
+    elif args.embed_method == "st_trained":
+        plot_tsne_distribution_modified(args.LIB, train_data, test_data, pretrained_model, labels, c2_accuracy,args.embed_method)
 
 if __name__=='__main__':
     main()
\ No newline at end of file
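Note on the chitchat hunks above: together they make `calculate_centroid`, `calculate_accuracy`, and the t-SNE plots honor the same three-way `embed_method` switch (original BERT, untrained sentence-transformer, finetuned sentence-transformer), moving every embedding to CPU before NumPy touches it. The underlying routing is centroid-based: average each class's embeddings, then label a query by its most cosine-similar centroid. A self-contained sketch of that scheme, using an illustrative public model and toy data rather than the repo's own:

    import numpy as np
    from sentence_transformers import SentenceTransformer
    from sklearn.metrics.pairwise import cosine_similarity

    model = SentenceTransformer('all-MiniLM-L6-v2')  # illustrative model choice

    # Toy training data; only the 'api-query' label is taken from this patch.
    train = {
        'api-query': ["load the h5ad file", "run PCA on the matrix"],
        'topical-chat': ["how was your weekend", "tell me about movies"],
    }
    labels = list(train)
    # One centroid per class: the mean of that class's sentence embeddings.
    centroids = [model.encode(qs).mean(axis=0) for qs in train.values()]

    def route(query):
        v = model.encode([query])  # shape (1, dim), as predict_by_similarity expects
        sims = [cosine_similarity(v, c.reshape(1, -1)) for c in centroids]
        return labels[int(np.argmax(sims))]

    print(route("plot a UMAP of the clusters"))  # expected: 'api-query'
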
diff --git a/src/models/model.py b/src/models/model.py
index 2a972ed..cd57873 100644
--- a/src/models/model.py
+++ b/src/models/model.py
@@ -21,11 +21,6 @@ def create_peft_config(model):
     return model, peft_config
 
 def LLM_model(local=True):
-    """
-    https://python.langchain.com/docs/modules/model_io/models/llms/integrations/openai
-    https://python.langchain.com/docs/modules/model_io/models/llms/integrations/huggingface_hub
-    https://huggingface.co/models?pipeline_tag=text-generation&sort=downloads
-    """
     if llm_model_dict[LLM_MODEL]['platform']=='OPENAI':
         from gpt import gpt_interface
         gpt_interface.setup_openai('', mode='openai')
@@ -121,8 +116,8 @@ def embedding_model():
     return embeddings
 
 if __name__=='__main__':
-    llm, tokenizer =LLM_model()
-    prompt = "hello"
-    response, history = LLM_response(llm,tokenizer,prompt)
-    print(f'User: {prompt}')
-    print(f'LLM: {response}')
\ No newline at end of file
+    llm, tokenizer =LLM_model()
+    prompt = "hello"
+    response, history = LLM_response(llm,tokenizer,prompt)
+    print(f'User: {prompt}')
+    print(f'LLM: {response}')
\ No newline at end of file
diff --git a/src/requirements.txt b/src/requirements.txt
index cd2a022..21ecb8e 100644
--- a/src/requirements.txt
+++ b/src/requirements.txt
@@ -24,7 +24,7 @@ mmtf_python==1.1.3
 nbformat==5.9.2
 networkx==3.2.1
 numpy==1.26.1
-openai==0.28.1
+openai==1.10.0
 pandas
 peft==0.6.0
 protobuf==3.20.0
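Note on the requirements bump: moving openai from 0.28.1 to 1.10.0 crosses the SDK's breaking 1.0 rewrite, which is what forces the `ChatCompletion.create` -> `chat.completions.create` changes above; any call site still on the 0.28 surface will fail at runtime rather than at install time (the 1.x release notes also describe an `openai migrate` CLI that automates most of these rewrites). A small guard like the sketch below, illustrative and not part of this patch, makes a stale install fail fast:

    import openai
    from packaging.version import Version  # `packaging` is a common, widely available dependency

    # Fail early at startup instead of deep inside a request path.
    if Version(openai.__version__) < Version("1.0.0"):
        raise RuntimeError("openai>=1.0 is required; 0.28-style ChatCompletion calls were removed")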