minor update for prompt and new libs

batmen-lab · Jan 16, 2024 · 6276539
1 parent 1baeb0a
commit 6276539
Show file tree

Hide file tree

Showing 11 changed files with 198 additions and 104 deletions.
diff --git a/docs/PyPI2APP.md b/docs/PyPI2APP.md
@@ -48,7 +48,7 @@ export LIB=scanpy
 # download materials from the URL links you provided
 python dataloader/utils/other_download.py --LIB ${LIB}
 # generate code from your downloaded tutorial files; supports either html or ipynb.
-python dataloader/utils/tutorial_loader_strategy.py --LIB ${LIB} --file_type 'html'
+python dataloader/utils/tutorial_loader_strategy.py --LIB ${LIB} --file_type 'ipynb'
 ```
 
 NOTE: the above script requires API_HTML_PATH, READTHEDOC_PATH and TUTORIAL_GITHUB to be set!

diff --git a/src/configs/Lib_cheatsheet.json b/src/configs/Lib_cheatsheet.json
@@ -174,18 +174,18 @@
         "LIB": "ehrapy",
         "LIB_ALIAS": "ehrapy",
         "API_HTML_PATH": "ehrapy.readthedocs.io/en/latest/usage/usage.html",
-        "GITHUB_LINK": null,
-        "READTHEDOC_LINK": null,
+        "GITHUB_LINK": "https://github.com/theislab/ehrapy",
+        "READTHEDOC_LINK": "https://ehrapy.readthedocs.io/",
         "TUTORIAL_HTML_PATH": "https://ehrapy.readthedocs.io/en/latest/tutorials/index.html",
-        "TUTORIAL_GITHUB": null
+        "TUTORIAL_GITHUB": "https://github.com/theislab/ehrapy-tutorials"
     },
     "snapatac2": {
         "LIB": "snapatac2",
         "LIB_ALIAS": "snapatac2",
         "API_HTML_PATH": "kzhang.org/SnapATAC2/api/index.html",
-        "GITHUB_LINK": null,
-        "READTHEDOC_LINK": null,
+        "GITHUB_LINK": "https://github.com/kaizhang/SnapATAC2",
+        "READTHEDOC_LINK": "https://kzhang.org/SnapATAC2",
         "TUTORIAL_HTML_PATH": "https://kzhang.org/SnapATAC2/tutorials/index.html",
-        "TUTORIAL_GITHUB": null
+        "TUTORIAL_GITHUB": "https://github.com/kaizhang/SnapATAC2"
     }
 }
diff --git a/src/dataloader/get_API_composite_from_tutorial.py b/src/dataloader/get_API_composite_from_tutorial.py
@@ -11,6 +11,8 @@
 from configs.model_config import ANALYSIS_PATH, get_all_variable_from_cheatsheet #tut, html_dict, code
 from dataloader.utils.tutorial_loader_strategy import main_convert_tutorial_to_py
 from dataloader.utils.code_analyzer import extract_io_variables
+from models.model import LLM_model, LLM_response
+from prompt.composite import build_prompt_for_composite_docstring, build_prompt_for_composite_name
 
 parser = argparse.ArgumentParser()
 parser.add_argument('--LIB', type=str, required=True, help='PyPI tool')
@@ -342,33 +344,19 @@ def extract_api_calls(code_block, imports, lib_alias):
     except SyntaxError:
         return []
 
-def process_docstring_with_LLM(llm, API_description, func_inputs,func_outputs, description_text=""):
+def process_docstring_with_LLM(llm, tokenizer, API_description, func_inputs,func_outputs, description_text=""):
     # LLM for modifying docstring
-    prompt = f"""You are an expert in Python programming. Your task is to write the docstring for the given information of an invisible function. Interpret the assigned inputs and return variables in the docstring.
-The description of used APIs inside this code is: {API_description}
-The input and output parameter information is as below:
-- Parameters: {func_inputs}
-- Returns: {func_outputs}
-- The other description associated with the code is: {description_text}
-- Please extract the core information in 1-2 sentences and polish it. Docstring description should only use 1-2 sentences.
-Your Response format is detailed docstring. Please do not include other information except for response information, in reStructuredText format. Never include specific API information in description.
-"""
-    response = llm.predict(prompt)
+    prompt = build_prompt_for_composite_docstring(API_description, func_inputs, func_outputs, description_text)
+    response, history = LLM_response(llm,tokenizer,prompt,history=[],kwargs={})
     print(f'==>GPT docstring response: {response}')
     if 'def' in response.split('\n')[0]:
         return '\n'.join(response.split('\n')[1:])
     else:
         return response
 
-def process_name_with_LLM(llm,sub_API_names,llm_docstring):
-    prompt=f"""Your task is to suggest an appropriate name for the given invisible function:
-- Here are the sub API used together with function's docstring, please consider the API name to generate function name. sub API names: {sub_API_names}, 
-function docstring: ```{llm_docstring}```
-- Your name should consist of 4-5 keywords that combined with `_`, name should be recognizable and contain as much information as you can in keywords.
-Your Response format: {{'func_name': (your designed function name)}}
-Please do not include other information except for response format.
-"""
-    response = llm.predict(prompt)
+def process_name_with_LLM(llm,tokenizer,sub_API_names,llm_docstring):
+    prompt = build_prompt_for_composite_name(sub_API_names, llm_docstring)
+    response, history = LLM_response(llm,tokenizer,prompt,history=[],kwargs={})
     print(f'==>GPT name response: {response}')
     MAX_trial = 5
     count=0
@@ -381,7 +369,7 @@ def process_name_with_LLM(llm,sub_API_names,llm_docstring):
                 ans = ast.literal_eval(response)
                 return list(ans.keys())[0]
             except:
-                response = llm.predict(prompt)
+                response, history = LLM_response(llm,tokenizer,prompt,history=[],kwargs={})
                 print(f'==>retry GPT {count}: {response}')
         count+=1
     return "function"
@@ -412,7 +400,6 @@ def main_get_API_composite(LIB_ANALYSIS_PATH, output_folder_json):
     return unique_code_blocks
 
 def main_get_LLM_docstring(unique_code_blocks):
-    from models.model import LLM_model
     # LLM model
     llm, tokenizer = LLM_model()
     # load API_init.json
@@ -439,8 +426,9 @@ def main_get_LLM_docstring(unique_code_blocks):
         # drop duplicate
         func_inputs = list(set(func_inputs))
         # prompt
-        llm_docstring = process_docstring_with_LLM(llm, '\n'.join(API_description), json.dumps(func_inputs),json.dumps(func_outputs), description_text=code_blocks['text'])
-        new_name = process_name_with_LLM(llm,','.join(sub_API_names),llm_docstring)
+        print('llm: ', llm)
+        llm_docstring = process_docstring_with_LLM(llm, tokenizer, '\n'.join(API_description), json.dumps(func_inputs),json.dumps(func_outputs), description_text=code_blocks['text'])
+        new_name = process_name_with_LLM(llm, tokenizer, ','.join(sub_API_names),llm_docstring)
         if new_name=='function':
             new_name = f'function_{idxxxxx}'
             idxxxxx+=1
@@ -471,7 +459,7 @@ def main_get_LLM_docstring(unique_code_blocks):
 def generate_api_callings(results, basic_types=['str', 'int', 'float', 'bool', 'list', 'dict', 'tuple', 'set', 'any', 'List', 'Dict']):
     updated_results = {}
     for api_name, api_info in results.items():
-        if api_info["api_type"] in ['function', 'method', 'class', 'functools.partial']:
+        if api_info["api_type"]: # in ['function', 'method', 'class', 'functools.partial']
             # Update the optional_value key for each parameter
             for param_name, param_details in api_info["Parameters"].items():
                 param_type = param_details.get('type')

diff --git a/src/dataloader/utils/code_analyzer.py b/src/dataloader/utils/code_analyzer.py
@@ -6,11 +6,9 @@
 import pandas as pd
 import numpy as np
 import seaborn as sns
-import cv2
 import math
 import sklearn.preprocessing
 import sklearn
-import scipy
 
 def is_variable_in_parentheses(var: str, code: str) -> bool:
     """

diff --git a/src/deploy/inference_dialog_server.py b/src/deploy/inference_dialog_server.py
@@ -949,7 +949,7 @@ def run_pipeline_after_doublechecking_API_selection(self, user_input):
             print(self.user_states)
             #[callback.on_tool_start() for callback in self.callbacks]
             #[callback.on_tool_end() for callback in self.callbacks]
-            [callback.on_agent_action(block_id="log-"+str(self.indexxxx), task="However, there are still some parameters with special type undefined. Please start from uploading data, or input your query from preprocessing dataset.",task_title="Missing Parameters: special type") for callback in self.callbacks]
+            [callback.on_agent_action(block_id="log-"+str(self.indexxxx), task="However, there are still some parameters with special type undefined. Please start from uploading data, or check your parameter type in json files.",task_title="Missing Parameters: special type") for callback in self.callbacks]
             self.indexxxx+=1
             self.last_user_states = self.user_states
             self.user_states = "initial"
@@ -1270,7 +1270,7 @@ def run_pipeline_after_doublechecking_execution_code(self, user_input):
         code = result['code']
         output_list = result['output_list']
         self.executor.load_environment("./tmp/tmp_output_run_pipeline_execution_code_variables.pkl")
-        #print('check:', code, output_list, self.executor.execute_code, self.executor.variables)
+        print('check:', code, output_list, self.executor.execute_code, self.executor.variables)
 
         if len(execution_code_list)>0:
             self.last_execute_code = self.get_last_execute_code(code)
@@ -1372,12 +1372,12 @@ def run_pipeline_after_doublechecking_execution_code(self, user_input):
                 pass
         else:
             pass
-        logging.info("Show current variables in namespace:")
-        logging.info(json.dumps(list(self.executor.variables.keys())))
+        print("Show current variables in namespace:")
+        print(json.dumps(list(self.executor.variables.keys())))
         new_str = []
         for i in self.executor.execute_code:
             new_str.append({"code":i['code'],"execution_results":i['success']})
-        logging.info("Currently all executed code: %s", json.dumps(new_str))
+        print("Currently all executed code: %s", json.dumps(new_str))
         filename = f"./tmp/sessions/{str(self.session_id)}_environment.pkl"
         self.last_user_states = self.user_states
         self.user_states = "initial"