update openai version, fix details in chitchat
DoraDong-2023 committed Jan 27, 2024
1 parent d0fd5a7 commit ece10d1
Showing 5 changed files with 47 additions and 26 deletions.
2 changes: 1 addition & 1 deletion src/deploy/inference_dialog_server.py
@@ -265,7 +265,7 @@ def __init__(self):
logging.info('==>chitchat vectorizer loaded!')
self.retrieve_query_mode = "similar"
logging.info("Server ready")
- def load_bert_model(self, load_mode='finetuned_bert'):
+ def load_bert_model(self, load_mode='unfinetuned_bert'):
if load_mode=='unfinetuned_bert':
word_embedding_model = models.Transformer('bert-base-uncased', max_seq_length=256)
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
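Note: the new default 'unfinetuned_bert' matches the branch shown in the hunk, which builds a plain bert-base-uncased encoder with a pooling head. Below is a minimal sketch of how such a model is typically assembled and used with sentence-transformers; the final SentenceTransformer wrapper and the example query are assumptions, since that part of the file is not shown here.

# Sketch: plain BERT + mean pooling via sentence-transformers (assumed assembly).
from sentence_transformers import SentenceTransformer, models

word_embedding_model = models.Transformer('bert-base-uncased', max_seq_length=256)
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
bert_model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

embeddings = bert_model.encode(["How do I normalize my data?"])  # illustrative query
print(embeddings.shape)  # (1, 768) for bert-base-uncased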
14 changes: 7 additions & 7 deletions src/gpt/gpt_interface.py
@@ -26,16 +26,16 @@ def setup_openai(fname, mode='azure'):

@T.retry(stop=T.stop_after_attempt(5), wait=T.wait_fixed(60), after=lambda s: logging.error(repr(s)))
def query_openai(prompt, mode='azure', model='gpt-35-turbo', **kwargs):
+ # 240127: update openai version
if mode == 'openai':
- response = openai.ChatCompletion.create(
- model=model,
- messages=[{'role': 'user', 'content': prompt}],
- **kwargs
- )
+ response = openai.chat.completions.create(model=model,
+ messages=[{'role': 'user', 'content': prompt}],
+ **kwargs
+ )
else:
- response = openai.ChatCompletion.create(
+ response = openai.chat.completions.create(
deployment_id=model,
messages=[{'role': 'user', 'content': prompt}],
**kwargs,
)
- return response['choices'][0]['message']['content']
+ return response.choices[0].message.content
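The hunk above moves query_openai from the pre-1.0 openai.ChatCompletion.create call with dict-style responses to the openai>=1.0 interface, where completions are created via chat.completions.create and fields are read as attributes. Below is a minimal sketch of the non-Azure path under that interface, assuming OPENAI_API_KEY is set in the environment; the model name is illustrative.

# Chat completion against openai>=1.0, mirroring the 'openai' branch above.
from openai import OpenAI

client = OpenAI()  # reads OPENAI_API_KEY from the environment
response = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "hello"}],
)
print(response.choices[0].message.content)  # attribute access replaces dict indexing

The module-level openai.chat.completions.create used in the diff behaves the same way once an API key is configured on the module.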
40 changes: 33 additions & 7 deletions src/models/chitchat_classification.py
@@ -85,7 +85,7 @@ def sampledata_combine(data1, data2, data3, test_data3, train_count_data1=1000,
def calculate_centroid(data, embed_method):
if embed_method == "original":
embeddings = np.array([bert_embed(bert_trans_model, tokenizer,text,
- ) for text in tqdm(data, desc="Processing with original BERT")])
+ ).cpu() for text in tqdm(data, desc="Processing with original BERT")])
elif embed_method == "st_untrained":
print('Using pretrained model!!!')
embeddings = np.array([sentence_transformer_embed(unpretrained_model, text).cpu() for text in tqdm(data, desc="Processing with unpretrained sentencetransformer BERT")])
@@ -101,13 +101,14 @@ def predict_by_similarity(user_query_vector, centroids, labels):
similarities = [cosine_similarity(user_query_vector, centroid.reshape(1, -1)) for centroid in centroids]
return labels[np.argmax(similarities)]

- def plot_tsne_distribution_modified(lib_name, train_data, test_data, vectorizer, labels, c2_accuracy):
+ def plot_tsne_distribution_modified(lib_name, train_data, test_data, model, labels, c2_accuracy,embed_method):
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

# Combine train and test data for t-SNE
combined_data = pd.concat([train_data['Question'], test_data['Question']], ignore_index=True)
- tfidf_matrix_combined = vectorizer.transform(combined_data)
+ #tfidf_matrix_combined = vectorizer.transform(combined_data)
+ tfidf_matrix_combined = sentence_transformer_embed(model, combined_data).cpu()

tsne = TSNE(n_components=2, random_state=40, init='random')
reduced_data_combined = tsne.fit_transform(tfidf_matrix_combined) # Fit and transform combined data
@@ -132,7 +133,7 @@ def plot_tsne_distribution_modified(lib_name, train_data, test_data, vectorizer,
formatted_accuracy = "{:.2f}".format(c2_accuracy)
plt.title(f't-SNE visualization of train data with test accuracy for api/non-api {formatted_accuracy}%')
plt.legend()
- plt.savefig(f'./plot/{lib_name}/chitchat_train_tsne_modified.png')
+ plt.savefig(f'./plot/{lib_name}/chitchat_train_tsne_modified_{embed_method}.pdf')
plt.clf() # Clear the current figure

# Create a figure for test data
@@ -146,7 +147,7 @@ def plot_tsne_distribution_modified(lib_name, train_data, test_data, vectorizer,

plt.title('t-SNE visualization of test data')
plt.legend()
- plt.savefig(f'./plot/{lib_name}/chitchat_test_tsne_modified.png')
+ plt.savefig(f'./plot/{lib_name}/chitchat_test_tsne_modified_{embed_method}.pdf')

def main():
process_topicalchat()
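The hunks above switch plot_tsne_distribution_modified from TF-IDF vectors to sentence-transformer embeddings and save one plot per embed_method as a PDF. Below is a rough sketch of that embed-then-project pipeline; the model name and sample texts are illustrative.

# Encode questions with a sentence-transformer, then project to 2-D with t-SNE.
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.manifold import TSNE

model = SentenceTransformer('all-MiniLM-L6-v2')
texts = [
    "normalize the expression matrix",
    "run PCA on the data",
    "how are you today",
    "tell me a joke",
]
embeddings = np.asarray(model.encode(texts))

tsne = TSNE(n_components=2, random_state=40, init='random', perplexity=3)
points = tsne.fit_transform(embeddings)  # one 2-D point per question
print(points.shape)  # (4, 2), ready for a scatter plot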
@@ -192,7 +193,17 @@ def calculate_accuracy(test_data, centroids, labels):
correct_predictions = 0
for index, row in test_data.iterrows():
#user_query_vector = vectorizer.transform([row['Question']])
- user_query_vector = bert_embed(bert_trans_model, tokenizer,row['Question'],device=device).reshape(1, -1)
+ if args.embed_method == "original":
+ print('using original')
+ user_query_vector = np.array([bert_embed(bert_trans_model, tokenizer,[row['Question']], )])
+ elif args.embed_method == "st_untrained":
+ print('Using pretrained model!!!')
+ user_query_vector = np.array([sentence_transformer_embed(unpretrained_model, [row['Question']]).cpu()])
+ elif args.embed_method == "st_trained":
+ print('using finetuned model')
+ user_query_vector = np.array([sentence_transformer_embed(pretrained_model, [row['Question']]).cpu()])
+
+ user_query_vector = user_query_vector.flatten().reshape(1,-1)
predicted_label = predict_by_similarity(user_query_vector, centroids, labels)
actual_label = row['Source']
if predicted_label == actual_label:
@@ -205,7 +216,16 @@ def calculate_accuracy(test_data, centroids, labels):
correct_predictions = 0
for index, row in test_data.iterrows():
#user_query_vector = vectorizer.transform([row['Question']])
- user_query_vector = bert_embed(bert_trans_model, tokenizer,row['Question'],device=device).reshape(1, -1)
+ #user_query_vector = bert_embed(bert_trans_model, tokenizer,row['Question'],device=device).reshape(1, -1)
+ if args.embed_method == "original":
+ user_query_vector = np.array(bert_embed(bert_trans_model, tokenizer,[row['Question']], ))
+ elif args.embed_method == "st_untrained":
+ print('Using pretrained model!!!')
+ user_query_vector = np.array(sentence_transformer_embed(unpretrained_model, [row['Question']]).cpu())
+ elif args.embed_method == "st_trained":
+ user_query_vector = np.array(sentence_transformer_embed(pretrained_model, [row['Question']]).cpu())
+ user_query_vector = user_query_vector.flatten().reshape(1,-1)
+
predicted_label = predict_by_similarity(user_query_vector, centroids, labels)
actual_label = row['Source']
if (actual_label=='api-query' and predicted_label=='api-query') or (actual_label!='api-query' and predicted_label!='api-query'):
@@ -223,6 +243,12 @@ def calculate_accuracy(test_data, centroids, labels):
os.makedirs(f"./plot/{args.LIB}", exist_ok=True)
print(f"Centroids saved. Time taken: {time.time() - start_time:.2f} seconds")
start_time = time.time()
+ if args.embed_method == "original":
+ plot_tsne_distribution_modified(args.LIB, train_data, test_data, bert_trans_model, labels, c2_accuracy,args.embed_method)
+ elif args.embed_method == "st_untrained":
+ plot_tsne_distribution_modified(args.LIB, train_data, test_data, unpretrained_model, labels, c2_accuracy,args.embed_method)
+ elif args.embed_method == "st_trained":
+ plot_tsne_distribution_modified(args.LIB, train_data, test_data, pretrained_model, labels, c2_accuracy,args.embed_method)

if __name__=='__main__':
main()
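Taken together, the changes in this file embed each test query with the selected embed_method (original BERT, an untrained sentence-transformer, or the finetuned one), flatten the vector, and label it by its closest class centroid. Below is a self-contained sketch of that nearest-centroid routine; the model name, class labels, and toy data are illustrative.

# Nearest-centroid classification by cosine similarity, as in predict_by_similarity.
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

model = SentenceTransformer('all-MiniLM-L6-v2')
train = {
    "api-query": ["normalize the expression matrix", "run PCA on the data"],
    "chitchat": ["how was your weekend", "tell me about your favorite movie"],
}
labels = list(train)
centroids = [np.asarray(model.encode(questions)).mean(axis=0) for questions in train.values()]

query_vector = np.asarray(model.encode(["please cluster my cells"])).flatten().reshape(1, -1)
similarities = [cosine_similarity(query_vector, centroid.reshape(1, -1)) for centroid in centroids]
print(labels[int(np.argmax(similarities))])  # expected: api-query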
15 changes: 5 additions & 10 deletions src/models/model.py
@@ -21,11 +21,6 @@ def create_peft_config(model):
return model, peft_config

def LLM_model(local=True):
"""
https://python.langchain.com/docs/modules/model_io/models/llms/integrations/openai
https://python.langchain.com/docs/modules/model_io/models/llms/integrations/huggingface_hub
https://huggingface.co/models?pipeline_tag=text-generation&sort=downloads
"""
if llm_model_dict[LLM_MODEL]['platform']=='OPENAI':
from gpt import gpt_interface
gpt_interface.setup_openai('', mode='openai')
@@ -121,8 +116,8 @@ def embedding_model():
return embeddings

if __name__=='__main__':
- llm, tokenizer =LLM_model()
- prompt = "hello"
- response, history = LLM_response(llm,tokenizer,prompt)
- print(f'User: {prompt}')
- print(f'LLM: {response}')
+ llm, tokenizer =LLM_model()
+ prompt = "hello"
+ response, history = LLM_response(llm,tokenizer,prompt)
+ print(f'User: {prompt}')
+ print(f'LLM: {response}')
2 changes: 1 addition & 1 deletion src/requirements.txt
@@ -24,7 +24,7 @@ mmtf_python==1.1.3
nbformat==5.9.2
networkx==3.2.1
numpy==1.26.1
- openai==0.28.1
+ openai==1.10.0
pandas
peft==0.6.0
protobuf==3.20.0
