Skip to content

Commit

Permalink
update task planning & execution correction prompt & pipeline
Browse files Browse the repository at this point in the history
- letting it distinguish between the built-in dataset API and the local data-loading API, depending on whether the user uploaded files
- letting it polish the next subtask based on the variables information after execution, to conduct better parameters prediction
- letting the user re-enter the subtask inquiry if execution correction fails 3 times, instead of exiting.
- update requirements.txt
- visualize UI in a user friendly way
  • Loading branch information
DoraDong-2023 committed Jun 10, 2024
1 parent a921f16 commit 4cbe301
Show file tree
Hide file tree
Showing 9 changed files with 279 additions and 86 deletions.
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,8 @@ We provide several ways to run the service: terminal CLI, Docker, railway, pytho
pip install git+https://github.com/batmen-lab/BioMANIA.git --index-url https://pypi.org/simple
# setup OPENAI_API_KEY
echo 'OPENAI_API_KEY="sk-proj-xxxx"' >> .env
# (optional) setup github token
echo "GITHUB_TOKEN=your_github_token" >> .env
# download data, retriever, and resources from drive, and put them to the
# - data/standard_process/{LIB} and
# - hugging_models/retriever_model_finetuned/{LIB} and
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,11 +30,20 @@ const LoggingCard = ({ title, logString, tableData, logColor = 'black', imageDat

const successMatch = /\[Success\]/;
const failMatch = /\[Fail\]/;
const confirmationMatch = /Enter Parameters|Can you confirm|User Confirmation|Could you confirm whether this API should be called\? Please enter y\/n\./;
const planMatch = /Multi step Task Planning|SubTask Execution|Continue to the next subtask|Step \d+: .*/;
const erroranalysisMatch = /Error Analysis/;

if (successMatch.test(title)) {
titleColor = 'green';
} else if (failMatch.test(title)) {
titleColor = 'red';
} else if (confirmationMatch.test(title)) {
titleColor = 'orange';
} else if (planMatch.test(title)) {
titleColor = 'blue';
} else if (erroranalysisMatch.test(title)) {
titleColor = 'blue';
}

const theme = useTheme();
Expand Down
7 changes: 6 additions & 1 deletion docs/PyPI2APP.md
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,12 @@ python -m src.dataloader.download_issues --LIB ${LIB} --token {GITHUB_TOKEN}
# TODO: download prepared corpus `data/github_issues/{LIB}/*` from google drive
python -m src.dataloader.prepare_issue_corpus --LIB ${LIB}
# query the corpus with command:
python -m src.models.query_issue_corpus --LIB scanpy --example_query "KeyError: 'No "neighbors" in .uns'" --method sentencebert --field issue_description --top_k 3
python -m src.models.query_issue_corpus --LIB scanpy --example_query "KeyError: 'No \"neighbors\" in .uns'" --top_k 3 --query_source local
```

(Optional) Alternatively, you can query issues online
```bash
python -m src.models.query_issue_corpus --query_source online --LIB scanpy --example_query "KeyError: 'No \"neighbors\" in .uns'" --top_k 10
```

NOTE it requires API_HTML_PATH, READTHEDOC_PATH and TUTORIAL_GITHUB to run the above script!
Expand Down
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ Requests==2.31.0
#scikit_learn
scipy==1.9.2
#scvi_tools==1.0.4
seaborn==0.13.0
seaborn
sentence_transformers==2.2.2
sentencepiece==0.1.99
scanpy==1.9.6
Expand All @@ -62,6 +62,7 @@ tensorboard==2.15.1
timm==0.9.12
torch==2.1.0
torchsummary==1.5.1
trimap==1.1.4
tqdm==4.66.1
transformers==4.35.0
typing_extensions==4.8.0
Expand Down
10 changes: 7 additions & 3 deletions src/dataloader/download_issues.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,13 +34,17 @@ def fetch_issues(repo, threshold):
solutions.append((comment.body, reactions))
if solutions:
solutions = sorted(solutions, key=lambda x: x[1], reverse=True)
best_solution = solutions[0][0]
#best_solution = solutions[0][0]
top_k = 3
best_solutions = [sol[0] for sol in solutions[:top_k]]
best_solutions = '\n'.join([f'Rank {i+1}: {solution}' for i, solution in enumerate(best_solutions)])
else:
best_solution = None
#best_solutions = [None, None, None]
best_solutions = "No solutions"
pair = {
'issue_title': issue_title,
'issue_body': issue_body,
'solution': best_solution,
'solution': best_solutions,
'count': count
}
issue_solution_pairs.append(pair)
Expand Down
176 changes: 139 additions & 37 deletions src/deploy/model.py

Large diffs are not rendered by default.

9 changes: 6 additions & 3 deletions src/inference/execution_UI.py
Original file line number Diff line number Diff line change
Expand Up @@ -382,12 +382,15 @@ def generate_execution_code_for_one_api(self, api_name, selected_params, return_
maybe_instance_name = maybe_class_name.lower() + "_instance"
pass
if single_class_API:
self.logger.info('single_class_API: {}', single_class_API)
if api_type in ['property', 'constant']:
api_call = f"{maybe_instance_name} = {maybe_class_name}"
else:
api_call = f"{maybe_instance_name} = {maybe_class_name}({class_params_formatted})"
else:
self.logger.info('no single_class_API')
if maybe_instance_name not in self.variables: # not initialized
self.logger.info('==> maybe_instance_name not in self.variables')
if api_type in ['property', 'constant']:
api_call = f"{maybe_instance_name} = {maybe_class_name}"
else:
Expand All @@ -412,10 +415,10 @@ def generate_execution_code_for_one_api(self, api_name, selected_params, return_
index_parenthesis = tmp_api_call.find("(")
comparison_result = index_equal < index_parenthesis
if index_equal!=-1 and comparison_result:
self.logger.info('debugging1 for return class API:', api_name, return_type, api_call, '--end')
self.logger.info('debugging1 for return class API: {}, {}, {} --end', api_name, return_type, api_call)
return import_code+'\n'+f"{api_call}"
else:
self.logger.info('debugging2 for return class API:', api_name, return_type, api_call, '--end')
self.logger.info('debugging2 for return class API: {}, {}, {} --end', api_name, return_type, api_call)
self.counter = max(self.counter, self.get_newest_counter_from_namespace())
self.counter += 1
return_var = f"result_{self.counter} = "
Expand All @@ -426,7 +429,7 @@ def generate_execution_code_for_one_api(self, api_name, selected_params, return_
self.generate_code.append(new_code)
return import_code+'\n'+new_code
else:
self.logger.info('debugging3 for return class API:', api_name, return_type, api_call, '--end')
self.logger.info('debugging3 for return class API: {}, {}, {} --end', api_name, return_type, api_call)
self.generate_code.append(f"{api_call}")
return import_code+'\n'+f"{api_call}"
def split_tuple_variable(self, last_code_status):
Expand Down
84 changes: 72 additions & 12 deletions src/models/query_issue_corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,19 +4,19 @@
Last Modified: May 29, 2024
Description: Query the issue corpus for the specified library
Usage:
python -m src.models.query_issue_corpus --LIB scanpy --example_query "Traceback (most recent call last): \n File "/home/z6dong/BioChat/refer/src/2024_biomania_phase2/./examples/case2.1/output/3.sh.execute.py", line 13, in <module>\n sc.tl.louvain(adata)\nFile "/home/z6dong/anaconda3/envs/biomania2/lib/python3.10/site-packages/scanpy/tools/_louvain.py", line 115, in louvain\n adjacency = _choose_graph(adata, obsp, neighbors_key)\n File "/home/z6dong/anaconda3/envs/biomania2/lib/python3.10/site-packages/scanpy/_utils/__init__.py", line 767, in _choose_graph\n neighbors = NeighborsView(adata, neighbors_key)\n File "/home/z6dong/anaconda3/envs/biomania2/lib/python3.10/site-packages/scanpy/_utils/__init__.py", line 711, in __init__\n raise KeyError('No "neighbors" in .uns')\nKeyError: 'No "neighbors" in .uns'" --method sentencebert --field issue_description --top_k 3
python -m src.models.query_issue_corpus --LIB scanpy --example_query "ValueError: cannot specify integer bins when input data contains infinity" --method sentencebert --field issue_description --top_k 1
Notice: if we input wrong example_query, the output will be empty.
"""

import os
import json
import argparse
import os, json, requests, argparse, ast
from typing import Tuple, List, Dict, Any
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer, util
from dotenv import load_dotenv
from ..retrievers import BM25Retriever
from ..gpt.utils import load_json
import ast
from sentence_transformers import SentenceTransformer, util
from ..dataloader.prepare_issue_corpus import ERROR_KEYWORDS, get_error_type
from ..configs.model_config import get_all_variable_from_cheatsheet

def prepare_corpus(queries: List[Dict[str, Any]], field: str) -> Dict[str, Tuple[List[Dict[str, Any]], List[str]]]:
"""
Expand All @@ -38,7 +38,7 @@ def prepare_corpus(queries: List[Dict[str, Any]], field: str) -> Dict[str, Tuple
"""
corpus_dict = {}
for query in queries:
if query['solution'] is None:
if query['solution'] in [None, 'No solutions']:
continue
error_types = query.get('error_type', {'Other'})
for error_type in error_types:
Expand Down Expand Up @@ -101,6 +101,57 @@ def sentencebert_retriever(corpus_texts: List[str], query: str, top_k: int) -> L
hits = util.semantic_search(query_embedding, corpus_embeddings, top_k=top_k)[0]
return [hit['corpus_id'] for hit in hits]

def search_github_issues(lib, topk, question):
    """Search the GitHub repository of *lib* for issues matching *question*.

    Parameters
    ----------
    lib : str
        Library name; resolved to a repository via the cheatsheet config
        (its ``GITHUB_LINK`` entry).
    topk : int
        Maximum number of issues to retrieve.
    question : str
        Free-text query (e.g. an error message) to search for.

    Returns
    -------
    str
        One line per issue, ``issue: <title>, solutions: ...``, where the
        solutions are the issue's comments ranked by reaction count, joined
        with newlines; ``""`` when nothing could be retrieved.
    """
    info_json = get_all_variable_from_cheatsheet(lib)
    GITHUB_LINK = info_json['GITHUB_LINK']
    repo_name = GITHUB_LINK.replace('https://github.com/', '').replace('.git', '')
    repo_name = repo_name.rstrip('/')
    load_dotenv()
    github_token = os.getenv('GITHUB_TOKEN', None)
    # BUGFIX: previously the header "Authorization: token None" was sent when
    # no token was configured, which makes GitHub reject every request with
    # 401 and the function silently return "". Unauthenticated search works
    # (with a lower rate limit), so only attach the header when a token exists.
    if github_token:
        headers = {'Authorization': f'token {github_token}'}
    else:
        print("No GitHub token provided. Falling back to unauthenticated GitHub search (rate-limited).")
        headers = {}
    search_url = 'https://api.github.com/search/issues'
    params = {
        'q': f'repo:{repo_name} "{question}" is:issue',
        'sort': 'comments',
        'order': 'desc',
        'per_page': topk
    }
    def fetch_issues():
        # Returns the list of matching issues, or "" (falsy) on any failure.
        response = requests.get(search_url, headers=headers, params=params)
        if response.status_code == 200:
            issues = response.json()['items']
            return issues if issues else ""
        else:
            return ""
    def fetch_comments(comments_url):
        # Returns the issue's comments, or [] on any failure.
        comments_response = requests.get(comments_url, headers=headers)
        if comments_response.status_code == 200:
            return comments_response.json()
        else:
            return []
    issues = fetch_issues()
    if not issues:
        return ""
    results = []
    for issue in issues:
        issue_title = issue['title']
        comments_url = issue['comments_url']
        comments = fetch_comments(comments_url)
        if comments:
            # Rank comments by total reaction count as a proxy for usefulness.
            sorted_comments = sorted(comments, key=lambda x: x['reactions']['total_count'], reverse=True)
            solutions = [f"Solution {idx + 1}: {comment['body']} (Reactions: {comment['reactions']['total_count']})" for idx, comment in enumerate(sorted_comments)]
            result = f"issue: {issue_title}, solutions: {'; '.join(solutions)}"
        else:
            result = f"issue: {issue_title}, solutions: No comments found"
        results.append(result)
    return "\n".join(results)

def retrieved_issue_solution(LIB: str, top_k: int, example_query: str, method: str, field: str) -> None:
"""
Main function to prepare data, create a retriever, and evaluate its performance.
Expand Down Expand Up @@ -149,17 +200,26 @@ def retrieved_issue_solution(LIB: str, top_k: int, example_query: str, method: s
print(f"Retrieved titles: {retrieved_titles}")
print(f"Retrieved issue descriptions: {retrieved_issue_description}")
print(f"Retrieved solutions: {retrieved_solution}")

return retrieved_solution

def main():
    """CLI entry point: query the issue corpus for a library.

    Depending on ``--query_source``, either searches the locally prepared
    issue corpus (``local``) or queries GitHub's issue search API live
    (``online``).
    """
    parser = argparse.ArgumentParser(description='Query the issue corpus for a library')
    parser.add_argument('--LIB', type=str, required=True, help='Library name')
    parser.add_argument('--example_query', type=str, required=True, help='Example query to test')
    parser.add_argument('--method', type=str, default="sentencebert", choices=['bm25', 'sentencebert'], help='Retrieval method to use')
    parser.add_argument('--field', type=str, default="issue_title", choices=['issue_title', 'issue_description'], help='Field to compare')
    parser.add_argument('--top_k', type=int, default=10, help='Number of top documents to retrieve')
    # Restrict values up front so typos fail fast with a clear argparse error
    # instead of only being caught by the late NotImplementedError below.
    parser.add_argument('--query_source', type=str, default="online", choices=['local', 'online'],
                        help="Where to query issues: 'local' prepared corpus or 'online' via the GitHub API")
    args = parser.parse_args()

    if args.query_source == 'local':
        retrieved_issue_solution(args.LIB, args.top_k, args.example_query, args.method, args.field)
    elif args.query_source == 'online':
        solutions = search_github_issues(args.LIB, args.top_k, args.example_query)
        print('solutions: ', solutions)
    else:  # unreachable thanks to `choices`; kept as a defensive guard
        raise NotImplementedError("Unsupported query source. Use 'local' or 'online'.")

# Run the CLI only when executed as a script, not on import.
if __name__ == "__main__":
    main()
Loading

0 comments on commit 4cbe301

Please sign in to comment.