From 6276539a2984a907b6c79f47e82e87e63abb951c Mon Sep 17 00:00:00 2001 From: DoraDong-2023 Date: Tue, 16 Jan 2024 18:33:07 -0500 Subject: [PATCH] minor update for prompt and new libs --- docs/PyPI2APP.md | 2 +- src/configs/Lib_cheatsheet.json | 12 +- .../get_API_composite_from_tutorial.py | 38 ++---- src/dataloader/utils/code_analyzer.py | 2 - src/deploy/inference_dialog_server.py | 10 +- src/gpt/gpt_baseline.ipynb | 118 ++++++++++++++---- src/inference/execution_UI.py | 21 +++- src/models/model.py | 6 +- src/prompt/composite.py | 25 ++++ src/prompt/instruction.py | 62 ++++----- src/requirements.txt | 6 + 11 files changed, 198 insertions(+), 104 deletions(-) create mode 100644 src/prompt/composite.py diff --git a/docs/PyPI2APP.md b/docs/PyPI2APP.md index bcb712a..97213bd 100644 --- a/docs/PyPI2APP.md +++ b/docs/PyPI2APP.md @@ -48,7 +48,7 @@ export LIB=scanpy # download materials according to your provided url links python dataloader/utils/other_download.py --LIB ${LIB} # generate codes for your downloaded tutorial files, support for either html, ipynb. -python dataloader/utils/tutorial_loader_strategy.py --LIB ${LIB} --file_type 'html' +python dataloader/utils/tutorial_loader_strategy.py --LIB ${LIB} --file_type 'ipynb' ``` NOTE it requires API_HTML_PATH, READTHEDOC_PATH and TUTORIAL_GITHUB to run the above script! diff --git a/src/configs/Lib_cheatsheet.json b/src/configs/Lib_cheatsheet.json index c0a1292..7d0a632 100644 --- a/src/configs/Lib_cheatsheet.json +++ b/src/configs/Lib_cheatsheet.json @@ -174,18 +174,18 @@ "LIB": "ehrapy", "LIB_ALIAS": "ehrapy", "API_HTML_PATH": "ehrapy.readthedocs.io/en/latest/usage/usage.html", - "GITHUB_LINK": null, - "READTHEDOC_LINK": null, + "GITHUB_LINK": "https://github.com/theislab/ehrapy", + "READTHEDOC_LINK": "https://ehrapy.readthedocs.io/", "TUTORIAL_HTML_PATH": "https://ehrapy.readthedocs.io/en/latest/tutorials/index.html", - "TUTORIAL_GITHUB": null + "TUTORIAL_GITHUB": "https://github.com/theislab/ehrapy-tutorials" }, "snapatac2": { "LIB": "snapatac2", "LIB_ALIAS": "snapatac2", "API_HTML_PATH": "kzhang.org/SnapATAC2/api/index.html", - "GITHUB_LINK": null, - "READTHEDOC_LINK": null, + "GITHUB_LINK": "https://github.com/kaizhang/SnapATAC2", + "READTHEDOC_LINK": "https://kzhang.org/SnapATAC2", "TUTORIAL_HTML_PATH": "https://kzhang.org/SnapATAC2/tutorials/index.html", - "TUTORIAL_GITHUB": null + "TUTORIAL_GITHUB": "https://github.com/kaizhang/SnapATAC2" } } diff --git a/src/dataloader/get_API_composite_from_tutorial.py b/src/dataloader/get_API_composite_from_tutorial.py index 00903ed..b19a848 100644 --- a/src/dataloader/get_API_composite_from_tutorial.py +++ b/src/dataloader/get_API_composite_from_tutorial.py @@ -11,6 +11,8 @@ from configs.model_config import ANALYSIS_PATH, get_all_variable_from_cheatsheet #tut, html_dict, code from dataloader.utils.tutorial_loader_strategy import main_convert_tutorial_to_py from dataloader.utils.code_analyzer import extract_io_variables +from models.model import LLM_model, LLM_response +from prompt.composite import build_prompt_for_composite_docstring, build_prompt_for_composite_name parser = argparse.ArgumentParser() parser.add_argument('--LIB', type=str, required=True, help='PyPI tool') @@ -342,33 +344,19 @@ def extract_api_calls(code_block, imports, lib_alias): except SyntaxError: return [] -def process_docstring_with_LLM(llm, API_description, func_inputs,func_outputs, description_text=""): +def process_docstring_with_LLM(llm, tokenizer, API_description, func_inputs,func_outputs, description_text=""): # LLM for modifying docstring - prompt = f"""You are an expert in Python programming. Your task is to write the docstring for the given information of an invisible function. Interpret the assigned inputs and return variables in the docstring. -The description of used APIs inside this code is: {API_description} -The input and output parameter information is as below: -- Parameters: {func_inputs} -- Returns: {func_outputs} -- The other description associated with the code is: {description_text} -- Please extract the core information in 1-2 sentences and polish it. Docstring description should only use 1-2 sentences. -Your Response format is detailed docstring. Please do not include other information except for response information, in reStructuredText format. Never include specific API information in description. -""" - response = llm.predict(prompt) + prompt = build_prompt_for_composite_docstring(API_description, func_inputs, func_outputs, description_text) + response, history = LLM_response(llm,tokenizer,prompt,history=[],kwargs={}) print(f'==>GPT docstring response: {response}') if 'def' in response.split('\n')[0]: return '\n'.join(response.split('\n')[1:]) else: return response -def process_name_with_LLM(llm,sub_API_names,llm_docstring): - prompt=f"""Your task is to suggest an appropriate name for the given invisible function: -- Here are the sub API used together with function's docstring, please consider the API name to generate function name. sub API names: {sub_API_names}, -function docstring: ```{llm_docstring}``` -- Your name should consist of 4-5 keywords that combined with `_`, name should be recognizable and contain as much information as you can in keywords. -Your Response format: {{'func_name': (your designed function name)}} -Please do not include other information except for response format. -""" - response = llm.predict(prompt) +def process_name_with_LLM(llm,tokenizer,sub_API_names,llm_docstring): + prompt = build_prompt_for_composite_name(sub_API_names, llm_docstring) + response, history = LLM_response(llm,tokenizer,prompt,history=[],kwargs={}) print(f'==>GPT name response: {response}') MAX_trial = 5 count=0 @@ -381,7 +369,7 @@ def process_name_with_LLM(llm,sub_API_names,llm_docstring): ans = ast.literal_eval(response) return list(ans.keys())[0] except: - response = llm.predict(prompt) + response, history = LLM_response(llm,tokenizer,prompt,history=[],kwargs={}) print(f'==>retry GPT {count}: {response}') count+=1 return "function" @@ -412,7 +400,6 @@ def main_get_API_composite(LIB_ANALYSIS_PATH, output_folder_json): return unique_code_blocks def main_get_LLM_docstring(unique_code_blocks): - from models.model import LLM_model # LLM model llm, tokenizer = LLM_model() # load API_init.json @@ -439,8 +426,9 @@ def main_get_LLM_docstring(unique_code_blocks): # drop duplicate func_inputs = list(set(func_inputs)) # prompt - llm_docstring = process_docstring_with_LLM(llm, '\n'.join(API_description), json.dumps(func_inputs),json.dumps(func_outputs), description_text=code_blocks['text']) - new_name = process_name_with_LLM(llm,','.join(sub_API_names),llm_docstring) + print('llm: ', llm) + llm_docstring = process_docstring_with_LLM(llm, tokenizer, '\n'.join(API_description), json.dumps(func_inputs),json.dumps(func_outputs), description_text=code_blocks['text']) + new_name = process_name_with_LLM(llm, tokenizer, ','.join(sub_API_names),llm_docstring) if new_name=='function': new_name = f'function_{idxxxxx}' idxxxxx+=1 @@ -471,7 +459,7 @@ def main_get_LLM_docstring(unique_code_blocks): def generate_api_callings(results, basic_types=['str', 'int', 'float', 'bool', 'list', 'dict', 'tuple', 'set', 'any', 'List', 'Dict']): updated_results = {} for api_name, api_info in results.items(): - if api_info["api_type"] in ['function', 'method', 'class', 'functools.partial']: + if api_info["api_type"]: # in ['function', 'method', 'class', 'functools.partial'] # Update the optional_value key for each parameter for param_name, param_details in api_info["Parameters"].items(): param_type = param_details.get('type') diff --git a/src/dataloader/utils/code_analyzer.py b/src/dataloader/utils/code_analyzer.py index 03476f1..3929763 100644 --- a/src/dataloader/utils/code_analyzer.py +++ b/src/dataloader/utils/code_analyzer.py @@ -6,11 +6,9 @@ import pandas as pd import numpy as np import seaborn as sns -import cv2 import math import sklearn.preprocessing import sklearn -import scipy def is_variable_in_parentheses(var: str, code: str) -> bool: """ diff --git a/src/deploy/inference_dialog_server.py b/src/deploy/inference_dialog_server.py index c902771..b4f1108 100644 --- a/src/deploy/inference_dialog_server.py +++ b/src/deploy/inference_dialog_server.py @@ -949,7 +949,7 @@ def run_pipeline_after_doublechecking_API_selection(self, user_input): print(self.user_states) #[callback.on_tool_start() for callback in self.callbacks] #[callback.on_tool_end() for callback in self.callbacks] - [callback.on_agent_action(block_id="log-"+str(self.indexxxx), task="However, there are still some parameters with special type undefined. Please start from uploading data, or input your query from preprocessing dataset.",task_title="Missing Parameters: special type") for callback in self.callbacks] + [callback.on_agent_action(block_id="log-"+str(self.indexxxx), task="However, there are still some parameters with special type undefined. Please start from uploading data, or check your parameter type in json files.",task_title="Missing Parameters: special type") for callback in self.callbacks] self.indexxxx+=1 self.last_user_states = self.user_states self.user_states = "initial" @@ -1270,7 +1270,7 @@ def run_pipeline_after_doublechecking_execution_code(self, user_input): code = result['code'] output_list = result['output_list'] self.executor.load_environment("./tmp/tmp_output_run_pipeline_execution_code_variables.pkl") - #print('check:', code, output_list, self.executor.execute_code, self.executor.variables) + print('check:', code, output_list, self.executor.execute_code, self.executor.variables) if len(execution_code_list)>0: self.last_execute_code = self.get_last_execute_code(code) @@ -1372,12 +1372,12 @@ def run_pipeline_after_doublechecking_execution_code(self, user_input): pass else: pass - logging.info("Show current variables in namespace:") - logging.info(json.dumps(list(self.executor.variables.keys()))) + print("Show current variables in namespace:") + print(json.dumps(list(self.executor.variables.keys()))) new_str = [] for i in self.executor.execute_code: new_str.append({"code":i['code'],"execution_results":i['success']}) - logging.info("Currently all executed code: %s", json.dumps(new_str)) + print("Currently all executed code: %s", json.dumps(new_str)) filename = f"./tmp/sessions/{str(self.session_id)}_environment.pkl" self.last_user_states = self.user_states self.user_states = "initial" diff --git a/src/gpt/gpt_baseline.ipynb b/src/gpt/gpt_baseline.ipynb index 74aea8c..9d0c3f3 100644 --- a/src/gpt/gpt_baseline.ipynb +++ b/src/gpt/gpt_baseline.ipynb @@ -21,19 +21,19 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "b7ae6210", "metadata": {}, "outputs": [], "source": [ "#### remember modify the LIB before start running\n", - "LIB = \"scanpy\"\n", + "LIB = \"squidpy\"\n", "#### Also notice that the prompt also contains `scanpy` which needs to be modified manually" ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 5, "id": "f1c7497b", "metadata": {}, "outputs": [ @@ -41,11 +41,11 @@ "name": "stdout", "output_type": "stream", "text": [ - "182\n", - "361\n", - "1443\n", - "1986\n", - "182 182\n" + "53\n", + "105\n", + "420\n", + "578\n", + "53 53\n" ] } ], @@ -83,7 +83,15 @@ "import re, os\n", "from string import punctuation\n", "end_of_docstring_summary = re.compile(r'[{}\\n]+'.format(re.escape(punctuation)))\n", - "all_apis = {x['api_name']: end_of_docstring_summary.split(x['Docstring'])[0].strip() for x in data}\n", + "all_apis = {}\n", + "for x in data:\n", + " api_name = x['api_name']\n", + " if x['description']:\n", + " description = x['description']\n", + " else:\n", + " description = end_of_docstring_summary.split(x['Docstring'])[0].strip()\n", + " all_apis[api_name] = description\n", + "\n", "all_apis = list(all_apis.items())\n", "all_apis_json = {i[0]:i[1] for i in all_apis}\n", "print(len(all_apis), len(all_apis_json))" @@ -91,10 +99,32 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 6, "id": "9ac6a71a", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Building corpus...\n" + ] + }, + { + "ename": "ValueError", + "evalue": "Path ./hugging_models/retriever_model_finetuned/squidpy/assigned/ not found", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m/var/folders/sw/gx57sf_s48530_wzcytrf5jc0000gn/T/ipykernel_35068/111563411.py\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 59\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mpandas\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 60\u001b[0m \u001b[0mdevice\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m'cuda:0'\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 61\u001b[0;31m \u001b[0mretriever\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mToolRetriever\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcorpus_tsv_path\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34mf\"../data/standard_process/{LIB}/retriever_train_data/corpus.tsv\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmodel_path\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34mf\"./hugging_models/retriever_model_finetuned/{LIB}/assigned/\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mshuffled_data\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mshuffled\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;32m/var/folders/sw/gx57sf_s48530_wzcytrf5jc0000gn/T/ipykernel_35068/111563411.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, corpus_tsv_path, model_path, shuffled_data)\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;32mclass\u001b[0m \u001b[0mToolRetriever\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__init__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcorpus_tsv_path\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmodel_path\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mshuffled_data\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbuild_retrieval_corpus\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcorpus_tsv_path\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmodel_path\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mshuffled_data\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 5\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mbuild_retrieval_corpus\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcorpus_tsv_path\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmodel_path\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mshuffled_data\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Building corpus...\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/var/folders/sw/gx57sf_s48530_wzcytrf5jc0000gn/T/ipykernel_35068/111563411.py\u001b[0m in \u001b[0;36mbuild_retrieval_corpus\u001b[0;34m(self, corpus_tsv_path, model_path, shuffled_data)\u001b[0m\n\u001b[1;32m 12\u001b[0m \u001b[0mcorpus\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mcorpus\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mcid\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mcid\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mcorpus_ids\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 13\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcorpus\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcorpus\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 14\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0membedder\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mSentenceTransformer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmodel_path\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdevice\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdevice\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 15\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcorpus_embeddings\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0membedder\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mencode\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcorpus\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mconvert_to_tensor\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 16\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshuffled_data\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mshuffled_data\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/opt/anaconda3/lib/python3.9/site-packages/sentence_transformers/SentenceTransformer.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, model_name_or_path, modules, device, cache_folder, use_auth_token)\u001b[0m\n\u001b[1;32m 75\u001b[0m \u001b[0;31m#Not a path, load from hub\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 76\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;34m'\\\\'\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mmodel_name_or_path\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0mmodel_name_or_path\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcount\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'/'\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 77\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Path {} not found\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmodel_name_or_path\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 78\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 79\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;34m'/'\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mmodel_name_or_path\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mmodel_name_or_path\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlower\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mbasic_transformer_models\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mValueError\u001b[0m: Path ./hugging_models/retriever_model_finetuned/squidpy/assigned/ not found" + ] + } + ], "source": [ "from sentence_transformers import SentenceTransformer, util\n", "class ToolRetriever:\n", @@ -156,7 +186,7 @@ "\n", "import pandas as pd\n", "device = 'cuda:0'\n", - "retriever = ToolRetriever(corpus_tsv_path=f\"../data/standard_process/{LIB}/retriever_train_data/corpus.tsv\", model_path=f\"./hugging_models/retriever_model_finetuned/{LIB}/assigned/\",shuffled_data=shuffled)" + "retriever = ToolRetriever(corpus_tsv_path=f\"../data/standard_process/{LIB}/retriever_train_data/corpus.tsv\", model_path=f\"../hugging_models/retriever_model_finetuned/{LIB}/assigned/\",shuffled_data=shuffled)" ] }, { @@ -166,7 +196,7 @@ "metadata": {}, "outputs": [], "source": [ - "mode_index = 'similarseed' # 'similarseed' or randomseed\n", + "mode_index = 'randomseed' # 'similarseed' or randomseed\n", "# whether use similar shot example retriving mode, the similar shot example retriving is \n", "# to retrieve the similar queries that similar to the input query\n", "# noted that this mode always retrieve 5 shot queries for the same API, as the query for same API is always similar\n", @@ -206,7 +236,7 @@ "outputs": [], "source": [ "prompt = \"\"\"\n", - "Task: name the function from the ScanPy library that should be used for the instruction. Only use function whose names start with scanpy. Do not give arguments.\n", + "Task: name the function from the lib: {lib_name} library that should be used for the instruction. Only use function whose names start with the {lib_name}. Do not give arguments.\n", "\n", "{similar_queries}\n", "\n", @@ -226,8 +256,8 @@ " similar_queries = \"\".join([\"\\nInstruction: \" + ex['query'] + \"\\nFunction: \" + ex['gold'] for ex in sampled_shuffled])\n", " else:\n", " raise NotImplementedError\n", - " print(prompt.format(query=ex['query'],similar_queries=similar_queries))\n", - " p = gpt_interface.query_openai(prompt.format(query=ex['query'],similar_queries=similar_queries), mode=mode, model=gpt_model, max_tokens=max_tokens)\n", + " print(prompt.format(lib_name=LIB, query=ex['query'],similar_queries=similar_queries))\n", + " p = gpt_interface.query_openai(prompt.format(lib_name=LIB, query=ex['query'],similar_queries=similar_queries), mode=mode, model=gpt_model, max_tokens=max_tokens)\n", " p = p.split(',')[0] # hack for if GPT answers this or that\n", " p = p.split('(')[0]\n", " p = p.split(' or ')[0]\n", @@ -270,6 +300,7 @@ "metadata": {}, "outputs": [], "source": [ + "# correct: 0.2571428571428571: 100%|█| 105/105 [00:45<00:00, 2.33it/s\n", "title = f'gpt-3.5-turbo-16k-trainsample'\n", "run_gpt(val, 'gpt-3.5-turbo-16k', prompt, '{}-shot-generate'.format(k_shot), mode,title=title)" ] @@ -281,8 +312,9 @@ "metadata": {}, "outputs": [], "source": [ - "#title = f'gpt-4-trainsample'\n", - "#run_gpt(val, 'gpt-4', prompt, '{}-shot-generate'.format(k_shot), mode,title=title)" + "# correct: 0.3142857142857143: 100%|█| 105/105 [02:20<00:00, 1.34s/it\n", + "title = f'gpt-4-trainsample'\n", + "run_gpt(val, 'gpt-4', prompt, '{}-shot-generate'.format(k_shot), mode,title=title)" ] }, { @@ -401,11 +433,38 @@ } ], "source": [ + "# correct: 0.7904761904761904: 100%|█| 105/105 [00:56<00:00, 1.86it/s\n", "top_k = 3\n", "title = f'gpt-3.5-turbo-16k-topk-{top_k}-trainsample'\n", "run_gpt_new(val, 'gpt-3.5-turbo-16k', prompt, '{}-shot-classify'.format(k_shot), mode,top_k=top_k,title=title)\n" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "0a32dd61", + "metadata": {}, + "outputs": [], + "source": [ + "# correct: 0.8113207547169812: 100%|██| 53/53 [00:23<00:00, 2.22it/s]\n", + "top_k = 3\n", + "title = f'gpt-3.5-turbo-16k-topk-{top_k}-test'\n", + "run_gpt_new(test, 'gpt-3.5-turbo-16k', prompt, '{}-shot-classify'.format(k_shot), mode,top_k=top_k,title=title)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f6b35adc", + "metadata": {}, + "outputs": [], + "source": [ + "# correct: 0.9238095238095239: 100%|█| 105/105 [02:29<00:00, 1.43s/it\n", + "top_k = 3\n", + "title = f'gpt-4-topk-{top_k}-trainsample'\n", + "run_gpt_new(val, 'gpt-4', prompt, '{}-shot-classify'.format(k_shot), mode,top_k=top_k,title=title)\n" + ] + }, { "cell_type": "code", "execution_count": 116, @@ -424,11 +483,28 @@ } ], "source": [ - "#top_k = 3\n", - "#title = f'gpt-4-topk-{top_k}-trainsample'\n", - "#run_gpt_new(val, 'gpt-4', prompt, '{}-shot-classify'.format(k_shot), mode, top_k=top_k,title=title)" + "# correct: 0.8490566037735849: 100%|██| 53/53 [01:03<00:00, 1.20s/it]\n", + "top_k = 3\n", + "title = f'gpt-4-topk-{top_k}-test'\n", + "run_gpt_new(test, 'gpt-4', prompt, '{}-shot-classify'.format(k_shot), mode, top_k=top_k,title=title)" + ] + }, + { + "cell_type": "markdown", + "id": "31d5f5b5", + "metadata": {}, + "source": [ + "### ambiguous pair" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "40aa72a8", + "metadata": {}, + "outputs": [], + "source": [] + }, { "cell_type": "code", "execution_count": 64, diff --git a/src/inference/execution_UI.py b/src/inference/execution_UI.py index 926078d..05a8cb0 100644 --- a/src/inference/execution_UI.py +++ b/src/inference/execution_UI.py @@ -170,13 +170,32 @@ def get_import_code(self, api_name): continue print(f"==?# Error: Could not generate import code for {api_name}") return "", "" + def is_str_at_first_level(self, type_str): + print('change a stype for ensuring str type') + import re + # remove "Union[" and "]", to solve the internal type + def remove_outer_union(s): + if s.startswith("Union[") and s.endswith("]"): + return s[6:-1] + return s + + # split top level types + def split_top_level_types(s): + return re.split(r',\s*(?![^[\]]*\])', s) + + # check whether top level contains 'str' + s = remove_outer_union(type_str) + top_level_types = split_top_level_types(s) + return 'str' in top_level_types + def format_value(self, value, value_type): try: if str(value).strip().startswith('result_'): return str(value) except: pass - if "str" in value_type: + #if "str" in value_type: + if self.is_str_at_first_level(value_type): value = str(value).strip() if value.startswith("("): # if user input tuple parameters, return directly return value # (('tuple' in value) or ('Tuple' in value)) and diff --git a/src/models/model.py b/src/models/model.py index 0a9b9f0..664fc00 100644 --- a/src/models/model.py +++ b/src/models/model.py @@ -31,9 +31,9 @@ def LLM_model(local=True): gpt_interface.setup_openai('', mode='openai') llm = None tokenizer = None - elif llm_model_dict[LLM_MODEL]['platform']=='OPENAI':# 231201 deprecate - llm = OpenAI(temperature=TEMPERATURE,model_name='gpt-3.5-turbo-16k') - tokenizer = None + #elif llm_model_dict[LLM_MODEL]['platform']=='OPENAI':# 231201 deprecate + # llm = OpenAI(temperature=TEMPERATURE,model_name='gpt-3.5-turbo-16k') + # tokenizer = None elif llm_model_dict[LLM_MODEL]['platform']=='GORILLA': from langchain.chat_models import ChatOpenAI openai.api_key = "EMPTY" # Key is ignored and does not matter diff --git a/src/prompt/composite.py b/src/prompt/composite.py new file mode 100644 index 0000000..a858793 --- /dev/null +++ b/src/prompt/composite.py @@ -0,0 +1,25 @@ +""" +Author: Zhengyuan Dong +Date Created: January 16, 2024 +Last Modified: January 16, 2024 +Description: prompts for composite docstring/name generation +""" + +def build_prompt_for_composite_docstring(API_description, func_inputs, func_outputs, description_text): + return f""" +Write a concise docstring for an invisible function in Python, focusing solely on its core functionality derived from the sequential composition of sub APIs. +- API Description: {API_description} +- Parameters: {func_inputs} +- Returns: {func_outputs} +- Additional Description: {description_text} +Craft a 1-2 sentence docstring that extracts and polishes the core information. The response should be in reStructuredText format, excluding specific API names and unprofessional terms. Remember to use parameter details only to refine the core functionality explanation, not for plain input/output information. +""" + +def build_prompt_for_composite_name(sub_API_names, llm_docstring): + return f"""Your task is to suggest an appropriate name for the given invisible function: +- Here are the sub API used together with function's docstring, please consider the API name to generate function name. sub API names: {sub_API_names}, +function docstring: ```{llm_docstring}``` +- Your name should consist of 4 to 5 keywords that combined with `_`, name should be recognizable and contain as much information as you can in keywords, and should display API information in a sequential order. +Your Response format: {{'func_name': (your designed function name)}} +Please do not include other information except for response format. +""" diff --git a/src/prompt/instruction.py b/src/prompt/instruction.py index b6eb446..82bb084 100644 --- a/src/prompt/instruction.py +++ b/src/prompt/instruction.py @@ -1,44 +1,23 @@ -Task_Description_of_Singletool_oneapi_Instructions = """ +def Task_Description_of_Singletool_oneapi_Instructions_template(detailed_summarized, could_must, specific_concised): + return f""" You are provided with one API function, its descriptions, the parameters and returns information required for each API function. -Your task involves creating 5 varied, innovative, and detailed user queries that employ the given API function. +Your task involves creating 5 varied, innovative, and {detailed_summarized} user queries that employ the given API function. The queries exemplify how to utilize the API call. A query should only use the given API. -Additionally, you must incorporate the input parameters required for each API call. +Additionally, you {could_must} incorporate the input parameters required for each API call. To achieve this, generate random information for required parameters according to its type. -The 5 queries should be very specific. +The 5 queries should be very {specific_concised}. Note that you shouldn't ask 'which API to use', rather, simply state your needs that can be addressed by these APIs. Never explicitly mentioning the specific calling of API functions in your response. You should also avoid asking for the input parameters required by the API call, but instead directly provide the parameter in your query. """ -Other_Requirements_singletool_oneapi = """ -Please produce 5 queries in line with the given requirements and inputs. -These 5 queries should display a diverse range of sentence structures: -some queries should be in the form of imperative sentences, others declarative, and yet others interrogative. -Equally, they should encompass a variety of tones, with some being polite, some being straightforward, some like layman. -Ensure they vary in length. -Aim to include a number of engaging queries as long as they relate to API calls. -Keep in mind that -- Each query should consist of a minimum of twenty words. -- Never explicitly mentioning the specific calling of API functions in your response. For example, never include API as 'xx.yy.zz(parameters)'. -- The response must be a list of effective json. -- For quotation, use ` . -- Never include the reference paper in your response. -- Never including library keyword or specific type in sentence. Instead, use their description, you can find type description in return type description or parameters type description. -- Restricted to the response format: [{"Query": "(query content)"},{"Query": "(query content)"},{"Query": "(query content)"},{"Query": "(query content)"},{"Query": "(query content)"}] -""" -# simplified -Task_Description_of_Singletool_oneapi_Instructions_simple = """ -You are provided with one API function, its descriptions, the parameters and returns information required for each API function. -Your task involves creating 5 varied, innovative, and summarized user queries that employ the given API function. -The queries exemplify how to utilize the API call. A query should only use the given API. -Additionally, you must incorporate the input parameters required for each API call. -To achieve this, generate random information for required parameters according to its type. -The 5 queries should be concised. -Note that you shouldn't ask 'which API to use', rather, simply state your needs that can be addressed by these APIs. -Never explicitly mentioning the specific calling of API functions in your response. -You should also avoid asking for the input parameters required by the API call, but instead directly provide the parameter in your query. -""" -Other_Requirements_singletool_oneapi_simple = """ +Task_Description_of_Singletool_oneapi_Instructions = Task_Description_of_Singletool_oneapi_Instructions_template("detailed", "could", "specific") + +Task_Description_of_Singletool_oneapi_Instructions_simple = Task_Description_of_Singletool_oneapi_Instructions_template("summarized", "could", "concised") + + +def Other_Requirements_singletool_oneapi_template(word_minimum_number): + return f""" Please produce 5 queries in line with the given requirements and inputs. These 5 queries should display a diverse range of sentence structures: some queries should be in the form of imperative sentences, others declarative, and yet others interrogative. @@ -46,11 +25,14 @@ Ensure they vary in length. Aim to include a number of engaging queries as long as they relate to API calls. Keep in mind that -- Each query should consist of a minimum of fifteen words. -- Never explicitly mentioning the specific calling of API functions in your response. For example, never include API as 'xx.yy.zz(parameters)'. -- The response must be a list of effective json. -- For quotation, use ` . -- Never include the reference paper in your response. -- Never including library keyword or specific type in sentence. Instead, use their description, you can find type description in return type description or parameters type description. -- Restricted to the response format: [{"Query": "(query content)"},{"Query": "(query content)"},{"Query": "(query content)"},{"Query": "(query content)"},{"Query": "(query content)"}] +- Queries should be around {word_minimum_number} words, and avoid explicit mentions of API calls like 'xx.yy.zz(parameters)', or PyPI lib name as 'use API in xx', or reference paper like 'Zhang21 et al.', or API function keywords, or specific parameters type. +- For quotation, use ` instead of '. +- Avoid technical terms like 'the given API' or unmeaningful terms like 'data with all observations' in your inquiry; keep it natural and focus on the user's intention to accomplish a real task. +- Avoiding repeated the same information within each inquiry, like "based on the data object for the given data". +- Queries should be unique in structure and phrasing, and vocabulary should be varied precise, accurate, and diverse. +- Restricted to the response format as a list of effective json: [{"Query": "(query content)"},{"Query": "(query content)"},{"Query": "(query content)"},{"Query": "(query content)"},{"Query": "(query content)"}] """ + +Other_Requirements_singletool_oneapi_simple = Other_Requirements_singletool_oneapi_template("fifteen") + +Other_Requirements_singletool_oneapi = Other_Requirements_singletool_oneapi_template("twenty") \ No newline at end of file diff --git a/src/requirements.txt b/src/requirements.txt index 21b2427..cd2a022 100644 --- a/src/requirements.txt +++ b/src/requirements.txt @@ -1,8 +1,10 @@ astunparse==1.6.3 beautifulsoup4==4.12.2 +bionty==0.36.0 biotite==0.38.0 biopython==1.81 bitsandbytes==0.41.1 +cellrank==2.0.2 deap==1.4.1 deepspeed==0.12.0 docstring_parser==0.15 @@ -17,6 +19,7 @@ langchain==0.0.330 lightning==2.0.0 lxml==4.9.3 matplotlib==3.8.1 +medcat==1.10.0 mmtf_python==1.1.3 nbformat==5.9.2 networkx==3.2.1 @@ -44,10 +47,13 @@ sentence_transformers==2.2.2 sentencepiece==0.1.99 scanpy==1.9.5 scikit-learn-intelex +scvelo==0.3.1 squidpy==1.3.1 tenacity==8.2.3 tensorboard==2.15.1 +timm==0.9.12 torch==2.1.0 +torchsummary==1.5.1 tqdm==4.66.1 transformers==4.35.0 typing_extensions==4.8.0