update instruction generation setting

batmen-lab · Jan 18, 2024 · 08bbd8f · 08bbd8f
1 parent 6276539
commit 08bbd8f
Show file tree

Hide file tree

Showing 5 changed files with 27 additions and 42 deletions.
diff --git a/src/dataloader/preprocess_retriever_data.py b/src/dataloader/preprocess_retriever_data.py
@@ -11,9 +11,7 @@
 from sklearn.utils import shuffle
 from models.model import LLM_response, LLM_model
 #from configs.model_config import LIB
-from prompt.instruction import Task_Description_of_Singletool_oneapi_Instructions,  \
-Other_Requirements_singletool_oneapi, Task_Description_of_Singletool_oneapi_Instructions_simple, \
-Other_Requirements_singletool_oneapi_simple
+from prompt.instruction import Task_Description_of_Singletool_oneapi_Instructions_whole, Other_Requirements_singletool_oneapi_whole
 from inference.utils import process_retrieval_document, compress_api_str_from_list
 parser = argparse.ArgumentParser()
 parser.add_argument('--LIB', type=str, help='PyPI tool')
@@ -22,8 +20,7 @@
 
 semaphore = asyncio.Semaphore(args.concurrency)
 
-prompt_oneapi = f"{Task_Description_of_Singletool_oneapi_Instructions}\n{Other_Requirements_singletool_oneapi}"
-prompt_oneapi_simple = f"{Task_Description_of_Singletool_oneapi_Instructions_simple}\n{Other_Requirements_singletool_oneapi_simple}"
+prompt_oneapi_whole = f"{Task_Description_of_Singletool_oneapi_Instructions_whole}\n{Other_Requirements_singletool_oneapi_whole}"
 
 def unify_response_format(response):
     list_pattern = re.compile(r'\[\{.*?\}\]', re.DOTALL)
@@ -85,12 +82,11 @@ async def preprocess_instruction_generation(API_composite, QUERY_FILE):
     #tasks = [process_api_async(api_name, ori_data[api_name], llm, tokenizer) for api_name in tqdm(ori_data)]
     all_tasks = []
     print('Start instruction generation ...')
-    print('Num. of Tasks is twice of the num. of APIs ...')
-    progress = tqdm_asyncio(total=len(ori_data) * 2)
+    print('Num. of Tasks is one times of the num. of APIs ...')
+    progress = tqdm_asyncio(total=len(ori_data))
     for api_name in tqdm_asyncio(ori_data):
         async with semaphore:
-            all_tasks.append(process_prompt_async(api_name, ori_data[api_name], llm, tokenizer, prompt_oneapi, progress))
-            all_tasks.append(process_prompt_async(api_name, ori_data[api_name], llm, tokenizer, prompt_oneapi_simple, progress))
+            all_tasks.append(process_prompt_async(api_name, ori_data[api_name], llm, tokenizer, prompt_oneapi_whole, progress))
     # Run the tasks and collect results
     results_from_tasks = await asyncio.gather(*(all_tasks))
     # close progress bar

diff --git a/src/gpt/gpt_interface.py b/src/gpt/gpt_interface.py
@@ -24,7 +24,6 @@ def setup_openai(fname, mode='azure'):
         openai.api_key = secrets['MS_KEY']
     return secrets
 
-
 @T.retry(stop=T.stop_after_attempt(5), wait=T.wait_fixed(60), after=lambda s: logging.error(repr(s)))
 def query_openai(prompt, mode='azure', model='gpt-35-turbo', **kwargs):
     if mode == 'openai':

diff --git a/src/inference/utils.py b/src/inference/utils.py
@@ -123,7 +123,8 @@ def compress_api_str_from_list_query_version(api):
     req_params = json.dumps(api['required_parameters'])
     opt_params = json.dumps(api['optional_parameters'])
     return_schema = json.dumps(api['Returns'])
-    compressed_str = f"{api_name}, {api_desc_truncated}, required_params: {req_params}, optional_params: {opt_params}, return_schema: {return_schema}"
+    #compressed_str = f"{api_name}, {api_desc_truncated}, required_params: {req_params}, optional_params: {opt_params}, return_schema: {return_schema}"
+    compressed_str = f"{api_name}, {api_desc_truncated}, required_params: {req_params}, optional_params: {opt_params}"
     return compressed_str
 
 def process_retrieval_document_query_version(documents_df):
@@ -160,7 +161,8 @@ def compress_api_str_from_list(api):
         req_params = json.dumps({})
         opt_params = json.dumps({})
     return_schema = json.dumps(api['Returns'])
-    compressed_str = f"{api_name}, {api_desc_truncated}, required_params: {req_params}, return_schema: {return_schema}" # optional_params: {opt_params},
+    #compressed_str = f"{api_name}, {api_desc_truncated}, required_params: {req_params}, return_schema: {return_schema}" # optional_params: {opt_params},
+    compressed_str = f"API description: {api_desc_truncated}, required_params: {req_params}" # optional_params: {opt_params},
     return compressed_str 
 
 def process_retrieval_document(documents_df):
@@ -211,3 +213,4 @@ def fast_get_environment(pre_code):
     print(executor.variables)
     executor.execute_api_call('print(a)')
 
+
diff --git a/src/models/model.py b/src/models/model.py
@@ -95,7 +95,8 @@ def LLM_response(llm,tokenizer,chat_prompt,history=[],kwargs={}):
     if llm_model_dict[LLM_MODEL]['platform'] in ['OPENAI']:
         from gpt import gpt_interface
         gpt_interface.setup_openai('', mode='openai')
-        response = gpt_interface.query_openai(chat_prompt, mode="openai", model="gpt-3.5-turbo-16k", max_tokens=MAX_NEW_TOKENS)
+        response = gpt_interface.query_openai(chat_prompt, mode="openai", model="gpt-3.5-turbo-1106", max_tokens=MAX_NEW_TOKENS)
+        # Modified 2024-01-17: switched from gpt-3.5-turbo-16k to gpt-3.5-turbo-1106
         #response = gpt_interface.query_openai(chat_prompt, mode="openai", model="gpt-4", max_tokens=MAX_NEW_TOKENS)
         history.append([chat_prompt, response])
     elif llm_model_dict[LLM_MODEL]['platform'] in ['OPENAI']: # 231201_deprecate

diff --git a/src/prompt/instruction.py b/src/prompt/instruction.py
@@ -1,38 +1,24 @@
-def Task_Description_of_Singletool_oneapi_Instructions_template(detailed_summarized, could_must, specific_concised):
-    return f"""
-You are provided with one API function, its descriptions, the parameters and returns information required for each API function. 
-Your task involves creating 5 varied, innovative, and {detailed_summarized} user queries that employ the given API function. 
-The queries exemplify how to utilize the API call. A query should only use the given API. 
-Additionally, you {could_must} incorporate the input parameters required for each API call. 
-To achieve this, generate random information for required parameters according to its type.
-The 5 queries should be very {specific_concised}. 
-Note that you shouldn't ask 'which API to use', rather, simply state your needs that can be addressed by these APIs. 
-Never explicitly mentioning the specific calling of API functions in your response.
+Task_Description_of_Singletool_oneapi_Instructions_whole = f"""
+You are provided with the API function, its descriptions, the parameters required for each API function. 
+Your task involves creating a total of 10 totally differentiate user queries for a given API function, 5 should be detailed and specific and other 5 brief and concise. Each query is innovative.
+These queries illustrate only on how to accomplish the exact task that the API is designed for, and the user never intend to use API/function/tool to solve the task.  
+Incorporate randomly generated values for required parameters, ensuring variation among queries based on their types.
+Never explicitly mentioning any keywords of API function names in your response.
 You should also avoid asking for the input parameters required by the API call, but instead directly provide the parameter in your query.
 """
 
-Task_Description_of_Singletool_oneapi_Instructions = Task_Description_of_Singletool_oneapi_Instructions_template("detailed", "could", "specific")
-
-Task_Description_of_Singletool_oneapi_Instructions_simple = Task_Description_of_Singletool_oneapi_Instructions_template("summarized", "could", "concised")
-
-
-def Other_Requirements_singletool_oneapi_template(word_minimum_number):
-    return f"""
-Please produce 5 queries in line with the given requirements and inputs. 
-These 5 queries should display a diverse range of sentence structures: 
+Other_Requirements_singletool_oneapi_whole = f"""
+Create queries in line with the given requirements and inputs. 
+These queries should display a diverse range of sentence structures: 
 some queries should be in the form of imperative sentences, others declarative, and yet others interrogative. 
 Equally, they should encompass a variety of tones, with some being polite, some being straightforward, some like layman.
 Ensure they vary in length. 
 Aim to include a number of engaging queries as long as they relate to API calls. 
 Keep in mind that 
-- Queries should be around {word_minimum_number} words, and avoid explicit mentions of API calls like 'xx.yy.zz(parameters)', or PyPI lib name as 'use API in xx', or reference paper like 'Zhang21 et al.', or API function keywords, or specific parameters type.
-- For quotation, use ` instead of '.
-- Avoid technical terms like 'the given API' or unmeaningful terms like 'data with all observations' in your inquiry; keep it natural and focus on the user's intention to accomplish a real task.
-- Avoiding repeated the same information within each inquiry, like "based on the data object for the given data".
-- Queries should be unique in structure and phrasing, and vocabulary should be varied precise, accurate, and diverse. 
-- Restricted to the response format as a list of effective json: [{"Query": "(query content)"},{"Query": "(query content)"},{"Query": "(query content)"},{"Query": "(query content)"},{"Query": "(query content)"}]
+- Queries should contain about 10-20 words, avoiding direct references to API calls (e.g., 'xx.yy.zz(parameters)'), library names (e.g., 'use API in xx'), function names (e.g., 'zz'), academic references (e.g., 'Zhang21 et al.'), or specific parameter types (e.g. 'AnnData data matrix object').
+- Use backticks (`) for quotes, not single quotes (').
+- Keep language natural and task-focused, avoiding overly technical or vague terms (e.g. 'data with all observations', 'using the given API').
+- Ensure each query is distinct without repeating the same information (e.g. `based on the data object for the given data`).
+- Queries should be structurally unique, with precise and diverse vocabulary.
+- Format responses as a list in JSON: [{{"Query": "(query content)"}}, {{"Query": "(query content)"}}, {{"Query": "(query content)"}},{{"Query": "(query content)"}}, {{"Query": "(query content)"}}, {{"Query": "(query content)"}},{{"Query": "(query content)"}}, {{"Query": "(query content)"}}, {{"Query": "(query content)"}},{{"Query": "(query content)"}}].
 """
-
-Other_Requirements_singletool_oneapi_simple = Other_Requirements_singletool_oneapi_template("fifteen")
-
-Other_Requirements_singletool_oneapi = Other_Requirements_singletool_oneapi_template("twenty")