fix details before v1.1.9 update
DoraDong-2023 committed Dec 14, 2023
1 parent 2c39602 commit 4c7428a
Showing 4 changed files with 72 additions and 70 deletions.
33 changes: 12 additions & 21 deletions src/configs/Lib_cheatsheet.json
@@ -33,7 +33,7 @@
"GITHUB_LINK": "https://github.com/aertslab/scenicplus",
"READTHEDOC_LINK": "https://scenicplus.readthedocs.io/en/latest/",
"TUTORIAL_HTML_PATH": "scenicplus.readthedocs.io/en/latest/examples/",
"TUTORIAL_GITHUB": null
"TUTORIAL_GITHUB": "https://github.com/aertslab/scenicplus_analyses"
},
"biopython": {
"GITHUB_LINK": "https://github.com/biopython/biopython",
@@ -42,24 +42,24 @@
"API_HTML_PATH": "biopython.org/docs/latest/api",
"LIB_ALIAS": "Bio",
"TUTORIAL_GITHUB":"https://github.com/biopython/biopython/tree/master/Doc/Tutorial",
"TUTORIAL_HTML_PATH":"scenicplus.readthedocs.io/en/latest/examples/"
"TUTORIAL_HTML_PATH":"https://biopython.org/DIST/docs/tutorial/Tutorial.html"
},
"qiime2": {
"LIB": "qiime2",
"LIB_ALIAS": "qiime2",
"API_HTML_PATH": "docs.qiime2.org/2023.9/interfaces/artifact-api/index.html",
"GITHUB_LINK": "https://github.com/qiime2/qiime2",
"READTHEDOC_LINK": "https://docs.qiime2.org/2023.9/",
"TUTORIAL_HTML_PATH":"docs.qiime2.org/2023.9/tutorials",
"TUTORIAL_GITHUB": null
"TUTORIAL_GITHUB": "https://github.com/biocore/validation-notebooks/tree/main",
"TUTORIAL_HTML_PATH":"docs.qiime2.org/2023.9/tutorials"
},
"eletoolkit": {
"LIB": "eletoolkit",
"LIB_ALIAS": "ete4",
"API_HTML_PATH": "etetoolkit.github.io/ete/reference/index.html",
"GITHUB_LINK": "https://github.com/etetoolkit/ete",
"READTHEDOC_LINK": "https://etetoolkit.github.io/ete/",
"TUTORIAL_GITHUB":null,
"TUTORIAL_GITHUB": "https://github.com/etetoolkit/cookbook",
"TUTORIAL_HTML_PATH":"etetoolkit.org/docs/latest/tutorial"
},
"pyopenms": {
@@ -69,7 +69,7 @@
"GITHUB_LINK": "https://github.com/OpenMS/OpenMS",
"READTHEDOC_LINK": "https://pyopenms.readthedocs.io",
"TUTORIAL_HTML_PATH":"pyopenms.readthedocs.io/en/latest/user_guide",
"TUTORIAL_GITHUB": null
"TUTORIAL_GITHUB": "https://github.com/biosustain/pyOpenMS_UmetaFlow"
},
"pyteomics": {
"LIB": "pyteomics",
@@ -86,8 +86,8 @@
"API_HTML_PATH": "scikit.bio/docs/latest/index.html",
"GITHUB_LINK": "https://github.com/biocore/scikit-bio",
"READTHEDOC_LINK": "https://scikit.bio/docs/latest/",
"TUTORIAL_HTML_PATH": null,
"TUTORIAL_GITHUB": null
"TUTORIAL_HTML_PATH": "https://nbviewer.org/github/scikit-bio/scikit-bio-cookbook/tree/master/",
"TUTORIAL_GITHUB": "https://github.com/scikit-bio/scikit-bio-cookbook"
},
"emperor": {
"LIB": "emperor",
@@ -96,7 +96,7 @@
"GITHUB_LINK": "https://github.com/biocore/emperor",
"READTHEDOC_LINK": "http://biocore.github.io/emperor/",
"TUTORIAL_HTML_PATH": "biocore.github.io/emperor/tutorials/",
"TUTORIAL_GITHUB": null
"TUTORIAL_GITHUB": "https://nbviewer.org/github/biocore/emperor/tree/new-api/examples/"
},
"gneiss": {
"LIB": "gneiss",
@@ -107,15 +107,6 @@
"TUTORIAL_HTML_PATH": "biocore.github.io/gneiss/docs/v0.4.0/tutorials/python/",
"TUTORIAL_GITHUB": null
},
"biostars": {
"LIB": "biostars",
"LIB_ALIAS": "biostars",
"API_HTML_PATH": null,
"GITHUB_LINK": null,
"READTHEDOC_LINK": "https://www.biostars.org/",
"TUTORIAL_HTML_PATH": null,
"TUTORIAL_GITHUB": null
},
"deap": {
"LIB": "deap",
"LIB_ALIAS": "deap",
@@ -132,7 +123,7 @@
"GITHUB_LINK": "https://github.com/tskit-dev/tskit",
"READTHEDOC_LINK": "https://tskit.dev/tskit/docs/latest/introduction.html",
"TUTORIAL_HTML_PATH": "tskit.dev/tutorials/",
"TUTORIAL_GITHUB": null
"TUTORIAL_GITHUB": "https://github.com/tskit-dev/tutorials"
},
"biotite": {
"LIB": "biotite",
@@ -141,7 +132,7 @@
"GITHUB_LINK": "https://github.com/biotite-dev/biotite",
"READTHEDOC_LINK": "https://www.biotite-python.org/",
"TUTORIAL_HTML_PATH": "www.biotite-python.org/tutorial/target/index.html",
"TUTORIAL_GITHUB": null
"TUTORIAL_GITHUB": "https://github.com/biotite-dev/article-notebooks"
},
"sonata":{
"LIB": "sonata",
@@ -179,4 +170,4 @@
"TUTORIAL_HTML_PATH": null,
"TUTORIAL_GITHUB": null
}
}
}
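The cheatsheet update fills in previously null TUTORIAL_GITHUB / TUTORIAL_HTML_PATH fields and drops the biostars entry. A minimal sketch of how an entry could be loaded and sanity-checked (the loader below is illustrative only; the file path and key names come from this diff, everything else is an assumption):

import json

# Keys a library entry is expected to carry; TUTORIAL_* fields may still be null
# for libraries without published tutorials (e.g. gneiss above).
EXPECTED_KEYS = {
    "GITHUB_LINK", "READTHEDOC_LINK", "API_HTML_PATH",
    "LIB_ALIAS", "TUTORIAL_HTML_PATH", "TUTORIAL_GITHUB",
}

def load_cheatsheet(path="src/configs/Lib_cheatsheet.json"):
    with open(path) as f:
        cheatsheet = json.load(f)
    for lib, info in cheatsheet.items():
        missing = EXPECTED_KEYS - set(info)
        if missing:
            print(f"{lib}: missing keys {sorted(missing)}")
    return cheatsheet

# Example: the scenicplus tutorial repository added in this commit.
# cheatsheet = load_cheatsheet()
# print(cheatsheet["scenicplus"]["TUTORIAL_GITHUB"])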
5 changes: 4 additions & 1 deletion src/dataloader/preprocess_retriever_data.py
@@ -1,6 +1,9 @@
# instruction_generation
# prepare for retriever data
import json, os, re, copy, ast, random, time, cProfile, pstats, argparse, asyncio
from dotenv import load_dotenv
load_dotenv()
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY', 'sk-test')

from tqdm import tqdm as tqdm_normal
from tqdm.asyncio import tqdm_asyncio
@@ -240,4 +243,4 @@ def preprocess_fake_test_data(QUERY_FILE, QUERY_ANNOTATE_FILE):
print('step2 cost:', time.time()-t1)
t1 = time.time()
preprocess_retriever_data(OUTPUT_DIR, QUERY_FILE, QUERY_ANNOTATE_FILE, INDEX_FILE)
print('step3 cost:', time.time()-t1)
print('step3 cost:', time.time()-t1)
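The added lines read the OpenAI key from the environment via python-dotenv, with a placeholder fallback. A minimal sketch of the expected setup (the .env contents and the warning are illustrative assumptions, not part of this commit):

# .env (kept out of version control)
# OPENAI_API_KEY=sk-...

import os
from dotenv import load_dotenv

load_dotenv()  # copies key=value pairs from .env into the process environment
api_key = os.getenv('OPENAI_API_KEY', 'sk-test')  # 'sk-test' is only a dummy fallback
if api_key == 'sk-test':
    print('Warning: OPENAI_API_KEY not set; GPT-backed steps will fail.')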
15 changes: 10 additions & 5 deletions src/deploy/inference_dialog_server.py
@@ -529,8 +529,10 @@ def run_pipeline(self, user_input, lib, top_k=3, files=[],conversation_started=T
break
except Exception as e:
print(f'Time {attempt}. GPT predict error: {e}')
return
#return
if not success:
[callback.on_tool_start() for callback in self.callbacks]
[callback.on_tool_end() for callback in self.callbacks]
[callback.on_agent_action(block_id="log-" + str(self.indexxxx),task="GPT can not return valid API name prediction, please redesign your prompt.",task_title="GPT predict Error",) for callback in self.callbacks]
self.indexxxx += 1
return
@@ -897,12 +899,14 @@ def run_pipeline_after_entering_params(self, user_input):
# split parameters according to multiple API, or class/method API
parameters_list = self.extract_parameters(self.api_name_json, self.API_composite)
extracted_params = self.split_params(self.selected_params, parameters_list)
print(f'extracted_params: {extracted_params}')
print(f'==>self.api_name_json: {self.api_name_json}', f'parameters_list: {parameters_list}')
print(f'==>extracted_params: {extracted_params}')
extracted_params_dict = {api_name: extracted_param for api_name, extracted_param in zip(self.api_name_json, extracted_params)}
print('extracted_params_dict: ', extracted_params_dict)
api_params_list = []
for idx, api_name in enumerate(self.api_name_json):
if self.api_name_json[api_name]['type']!='class':
if True:
#if self.api_name_json[api_name]['type']=='class': # !
#print('==>assume not start with class API:', api_name)
class_selected_params = {}
fake_class_api = '.'.join(api_name.split('.')[:-1])
@@ -928,6 +932,7 @@ def run_pipeline_after_entering_params(self, user_input):
"parameters":extracted_params[idx],
"return_type":self.API_composite[api_name]['Returns']['type'],
"class_selected_params":class_selected_params})
print('==>api_params_list:', json.dumps(api_params_list))
execution_code = self.executor.generate_execution_code(api_params_list)
print('==>execution_code:',execution_code)
[callback.on_tool_start() for callback in self.callbacks]
@@ -1036,7 +1041,7 @@ def run_pipeline_after_entering_params(self, user_input):
self.indexxxx+=1
else:
print(f'Execution Error: {content}')
[callback.on_agent_action(block_id="log-"+str(self.indexxxx),task=""+"".join(error_list),task_title="Executed results [Fail]",) for callback in self.callbacks] # Execution failed!
[callback.on_agent_action(block_id="log-"+str(self.indexxxx),task="".join(list(set(error_list))),task_title="Executed results [Fail]",) for callback in self.callbacks] # Execution failed!
self.indexxxx+=1
file_name=f"./tmp/sessions/{str(self.session_id)}_environment.pkl"
self.executor.save_environment(file_name)
@@ -1182,4 +1187,4 @@ def handle_keyboard_interrupt(signal, frame):
#thread = Thread(target=handle_requests)
#thread.daemon = True
#thread.start()
app.run(use_reloader=False, host="0.0.0.0", debug=True, port=5000)
app.run(use_reloader=False, host="0.0.0.0", debug=True, port=5000)
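The changes above fan every status update out to all registered callbacks and deduplicate the collected error messages before reporting a failed execution. A simplified sketch of that pattern (the callback class and messages are illustrative stand-ins, not the project's actual UI callbacks):

class PrintCallback:
    # Minimal stand-in for the dialog server's UI callbacks.
    def on_tool_start(self): print('[tool start]')
    def on_tool_end(self): print('[tool end]')
    def on_agent_action(self, block_id, task, task_title):
        print(f'[{task_title}] ({block_id}) {task}')

callbacks = [PrintCallback()]
error_list = ['TypeError: x\n', 'TypeError: x\n', 'ValueError: y\n']

# Notify every callback, mirroring the list-comprehension fan-out in run_pipeline.
[cb.on_tool_start() for cb in callbacks]
[cb.on_tool_end() for cb in callbacks]
# set() drops duplicate error messages before they reach the user.
[cb.on_agent_action(block_id='log-1',
                    task=''.join(set(error_list)),
                    task_title='Executed results [Fail]') for cb in callbacks]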
89 changes: 46 additions & 43 deletions src/models/train_retriever.py
@@ -14,33 +14,6 @@
import os
from inference.utils import process_retrieval_document_query_version, compress_api_str_from_list_query_version

parser = argparse.ArgumentParser()
parser.add_argument("--data_path", default=None, type=str, required=True,help="The input data dir. Should contain the .tsv files for the task.")
parser.add_argument("--model_name", default=None, type=str, required=True,help="The base model name.")
parser.add_argument("--output_path", default=None, type=str, required=True,help="The base path where the model output will be saved.")
parser.add_argument("--num_epochs", default=10, type=int, required=True,help="Train epochs.")
parser.add_argument("--train_batch_size", default=32, type=int, required=True,help="Train batch size.")
parser.add_argument("--learning_rate", default=2e-5, type=float, required=True,help="Learning rate.")
parser.add_argument("--warmup_steps", default=500, type=float, required=True,help="Warmup steps.")
parser.add_argument("--max_seq_length", default=256, type=int, required=True,help="Max sequence length.")
parser.add_argument("--optimize_top_k", default=3, type=int, required=True,help="The metric which to save best model")
parser.add_argument("--plot_dir", default="./plot/retriever/", type=str, required=True,help="plot dir for saving")
args = parser.parse_args()

torch.manual_seed(42)
torch.cuda.manual_seed(42)

os.makedirs(args.output_path, exist_ok=True)
os.makedirs(args.plot_dir, exist_ok=True)

model_save_path = os.path.join(args.output_path,'assigned')
os.makedirs(model_save_path, exist_ok=True)

# Model definition
word_embedding_model = models.Transformer(args.model_name, max_seq_length=args.max_seq_length)
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

def load_query(dataset_type, data_path):
queries_df = pd.read_csv(os.path.join(data_path, f'{dataset_type}.query.txt'), sep='\t', names=['qid', 'query'])
labels_df = pd.read_csv(os.path.join(data_path, f'qrels.{dataset_type}.tsv'), sep='\t', names=['qid', 'useless', 'docid', 'label'])
@@ -52,9 +25,9 @@ def load_relevant_docs(labels_df):
relevant_docs.setdefault(row.qid, set()).add(row.docid)
return relevant_docs

def get_data(data_path, ):
def get_data(data_path, process_corpus_df):
documents_df = pd.read_csv(os.path.join(data_path, 'corpus.tsv'), sep='\t')
ir_corpus, _ = process_retrieval_document_query_version(documents_df)
ir_corpus, _ = process_corpus_df(documents_df)
labels_df_train, ir_train_queries = load_query("train", data_path)
labels_df_test, ir_test_queries = load_query("test", data_path)
labels_df_val, ir_val_queries = load_query("val", data_path)
@@ -70,20 +43,50 @@ def get_data(data_path, ):
'test': {'queries': ir_test_queries, 'relevant_docs': test_relevant_docs},
}
return ir_corpus, train_samples, corpus_config
ir_corpus, train_samples, corpus_config = get_data(args.data_path)

train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=args.train_batch_size, pin_memory=True)
train_loss = losses.MultipleNegativesRankingLoss(model)
evaluator = APIEvaluator(corpus_config, ir_corpus, fig_path=args.plot_dir,optimize_top_k=args.optimize_top_k)
# You may need to modify the .fit() method to ensure all data is moved to the correct device during parallel computations
#from tensorflow.keras.callbacks import EarlyStopping
#early_stopping_callback = EarlyStopping(monitor='val_loss', patience=5)
model.fit(train_objectives=[(train_dataloader, train_loss)],
evaluator=evaluator,
epochs=args.num_epochs,
warmup_steps=args.warmup_steps,
optimizer_params={'lr': args.learning_rate},
output_path=model_save_path
)
def main():
parser = argparse.ArgumentParser()
parser.add_argument("--data_path", default=None, type=str, required=True,help="The input data dir. Should contain the .tsv files for the task.")
parser.add_argument("--model_name", default=None, type=str, required=True,help="The base model name.")
parser.add_argument("--output_path", default=None, type=str, required=True,help="The base path where the model output will be saved.")
parser.add_argument("--num_epochs", default=10, type=int, required=True,help="Train epochs.")
parser.add_argument("--train_batch_size", default=32, type=int, required=True,help="Train batch size.")
parser.add_argument("--learning_rate", default=2e-5, type=float, required=True,help="Learning rate.")
parser.add_argument("--warmup_steps", default=500, type=float, required=True,help="Warmup steps.")
parser.add_argument("--max_seq_length", default=256, type=int, required=True,help="Max sequence length.")
parser.add_argument("--optimize_top_k", default=3, type=int, required=True,help="The metric which to save best model")
parser.add_argument("--plot_dir", default="./plot/retriever/", type=str, required=True,help="plot dir for saving")
args = parser.parse_args()

torch.manual_seed(42)
torch.cuda.manual_seed(42)

os.makedirs(args.output_path, exist_ok=True)
os.makedirs(args.plot_dir, exist_ok=True)

model_save_path = os.path.join(args.output_path,'assigned')
os.makedirs(model_save_path, exist_ok=True)

# Model definition
word_embedding_model = models.Transformer(args.model_name, max_seq_length=args.max_seq_length)
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

ir_corpus, train_samples, corpus_config = get_data(args.data_path,process_retrieval_document_query_version)
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=args.train_batch_size, pin_memory=True)
train_loss = losses.MultipleNegativesRankingLoss(model)
evaluator = APIEvaluator(corpus_config, ir_corpus, fig_path=args.plot_dir,optimize_top_k=args.optimize_top_k)
# You may need to modify the .fit() method to ensure all data is moved to the correct device during parallel computations
#from tensorflow.keras.callbacks import EarlyStopping
#early_stopping_callback = EarlyStopping(monitor='val_loss', patience=5)
model.fit(train_objectives=[(train_dataloader, train_loss)],
evaluator=evaluator,
epochs=args.num_epochs,
warmup_steps=args.warmup_steps,
optimizer_params={'lr': args.learning_rate},
output_path=model_save_path
)
if __name__=='__main__':
main()
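Wrapping the argument parsing and model construction in main() means importing this module no longer parses CLI flags or builds a model, and get_data() now takes its corpus-processing function as an argument instead of hard-coding process_retrieval_document_query_version. A small sketch of what that enables (the module path, column names, and stub function are assumptions for illustration):

# The helpers can now be imported without side effects (no argparse, no model build):
# from models.train_retriever import get_data

def stub_process_corpus(documents_df):
    # Hypothetical stand-in for process_retrieval_document_query_version, e.g. in a unit test.
    # Assumed corpus.tsv columns: 'docid' and 'document'; the real layout may differ.
    ir_corpus = {row.docid: row.document for row in documents_df.itertuples()}
    return ir_corpus, None

# ir_corpus, train_samples, corpus_config = get_data('data/retriever', stub_process_corpus)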

