Skip to content

Commit

Permalink
update task planning & execution correction prompt & pipeline
Browse files Browse the repository at this point in the history
- letting it distinguish between the built-in dataset API and the local data-loading API, depending on whether the user uploaded files
- letting it polish the next subtask based on the variables information after execution, to conduct better parameters prediction
- letting the user re-enter the subtask inquiry if execution correction fails 3 times, instead of exiting.
- update requirements.txt
- visualize UI in a user friendly way
  • Loading branch information
DoraDong-2023 committed Jun 10, 2024
1 parent a921f16 commit 4cbe301
Show file tree
Hide file tree
Showing 9 changed files with 279 additions and 86 deletions.
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,8 @@ We provide several ways to run the service: terminal CLI, Docker, railway, pytho
pip install git+https://github.com/batmen-lab/BioMANIA.git --index-url https://pypi.org/simple
# setup OPENAI_API_KEY
echo 'OPENAI_API_KEY="sk-proj-xxxx"' >> .env
# (optional) setup github token
echo "GITHUB_TOKEN=your_github_token" >> .env
# download data, retriever, and resources from drive, and put them to the
# - data/standard_process/{LIB} and
# - hugging_models/retriever_model_finetuned/{LIB} and
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,11 +30,20 @@ const LoggingCard = ({ title, logString, tableData, logColor = 'black', imageDat

const successMatch = /\[Success\]/;
const failMatch = /\[Fail\]/;
const confirmationMatch = /Enter Parameters|Can you confirm|User Confirmation|Could you confirm whether this API should be called\? Please enter y\/n\./;
const planMatch = /Multi step Task Planning|SubTask Execution|Continue to the next subtask|Step \d+: .*/;
const erroranalysisMatch = /Error Analysis/;

if (successMatch.test(title)) {
titleColor = 'green';
} else if (failMatch.test(title)) {
titleColor = 'red';
} else if (confirmationMatch.test(title)) {
titleColor = 'orange';
} else if (planMatch.test(title)) {
titleColor = 'blue';
} else if (erroranalysisMatch.test(title)) {
titleColor = 'blue';
}

const theme = useTheme();
Expand Down
7 changes: 6 additions & 1 deletion docs/PyPI2APP.md
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,12 @@ python -m src.dataloader.download_issues --LIB ${LIB} --token {GITHUB_TOKEN}
# TODO: download prepared corpus `data/github_issues/{LIB}/*` from google drive
python -m src.dataloader.prepare_issue_corpus --LIB ${LIB}
# query the corpus with command:
python -m src.models.query_issue_corpus --LIB scanpy --example_query "KeyError: 'No "neighbors" in .uns'" --method sentencebert --field issue_description --top_k 3
python -m src.models.query_issue_corpus --LIB scanpy --example_query "KeyError: 'No \"neighbors\" in .uns'" --top_k 3 --query_source local
```

(Optional) Alternatively, you can query issues online
```bash
python -m src.models.query_issue_corpus --query_source online --LIB scanpy --example_query "KeyError: 'No \"neighbors\" in .uns'" --top_k 10
```

NOTE it requires API_HTML_PATH, READTHEDOC_PATH and TUTORIAL_GITHUB to run the above script!
Expand Down
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ Requests==2.31.0
#scikit_learn
scipy==1.9.2
#scvi_tools==1.0.4
seaborn==0.13.0
seaborn
sentence_transformers==2.2.2
sentencepiece==0.1.99
scanpy==1.9.6
Expand All @@ -62,6 +62,7 @@ tensorboard==2.15.1
timm==0.9.12
torch==2.1.0
torchsummary==1.5.1
trimap==1.1.4
tqdm==4.66.1
transformers==4.35.0
typing_extensions==4.8.0
Expand Down
10 changes: 7 additions & 3 deletions src/dataloader/download_issues.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,13 +34,17 @@ def fetch_issues(repo, threshold):
solutions.append((comment.body, reactions))
if solutions:
solutions = sorted(solutions, key=lambda x: x[1], reverse=True)
best_solution = solutions[0][0]
#best_solution = solutions[0][0]
top_k = 3
best_solutions = [sol[0] for sol in solutions[:top_k]]
best_solutions = '\n'.join([f'Rank {i+1}: {solution}' for i, solution in enumerate(best_solutions)])
else:
best_solution = None
#best_solutions = [None, None, None]
best_solutions = "No solutions"
pair = {
'issue_title': issue_title,
'issue_body': issue_body,
'solution': best_solution,
'solution': best_solutions,
'count': count
}
issue_solution_pairs.append(pair)
Expand Down
176 changes: 139 additions & 37 deletions src/deploy/model.py

Large diffs are not rendered by default.

9 changes: 6 additions & 3 deletions src/inference/execution_UI.py
Original file line number Diff line number Diff line change
Expand Up @@ -382,12 +382,15 @@ def generate_execution_code_for_one_api(self, api_name, selected_params, return_
maybe_instance_name = maybe_class_name.lower() + "_instance"
pass
if single_class_API:
self.logger.info('single_class_API: {}', single_class_API)
if api_type in ['property', 'constant']:
api_call = f"{maybe_instance_name} = {maybe_class_name}"
else:
api_call = f"{maybe_instance_name} = {maybe_class_name}({class_params_formatted})"
else:
self.logger.info('no single_class_API')
if maybe_instance_name not in self.variables: # not initialized
self.logger.info('==> maybe_instance_name not in self.variables')
if api_type in ['property', 'constant']:
api_call = f"{maybe_instance_name} = {maybe_class_name}"
else:
Expand All @@ -412,10 +415,10 @@ def generate_execution_code_for_one_api(self, api_name, selected_params, return_
index_parenthesis = tmp_api_call.find("(")
comparison_result = index_equal < index_parenthesis
if index_equal!=-1 and comparison_result:
self.logger.info('debugging1 for return class API:', api_name, return_type, api_call, '--end')
self.logger.info('debugging1 for return class API: {}, {}, {} --end', api_name, return_type, api_call)
return import_code+'\n'+f"{api_call}"
else:
self.logger.info('debugging2 for return class API:', api_name, return_type, api_call, '--end')
self.logger.info('debugging2 for return class API: {}, {}, {} --end', api_name, return_type, api_call)
self.counter = max(self.counter, self.get_newest_counter_from_namespace())
self.counter += 1
return_var = f"result_{self.counter} = "
Expand All @@ -426,7 +429,7 @@ def generate_execution_code_for_one_api(self, api_name, selected_params, return_
self.generate_code.append(new_code)
return import_code+'\n'+new_code
else:
self.logger.info('debugging3 for return class API:', api_name, return_type, api_call, '--end')
self.logger.info('debugging3 for return class API: {}, {}, {} --end', api_name, return_type, api_call)
self.generate_code.append(f"{api_call}")
return import_code+'\n'+f"{api_call}"
def split_tuple_variable(self, last_code_status):
Expand Down
84 changes: 72 additions & 12 deletions src/models/query_issue_corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,19 +4,19 @@
Last Modified: May 29, 2024
Description: Query the issue corpus for the specified library
Usage:
python -m src.models.query_issue_corpus --LIB scanpy --example_query "Traceback (most recent call last): \n File "/home/z6dong/BioChat/refer/src/2024_biomania_phase2/./examples/case2.1/output/3.sh.execute.py", line 13, in <module>\n sc.tl.louvain(adata)\nFile "/home/z6dong/anaconda3/envs/biomania2/lib/python3.10/site-packages/scanpy/tools/_louvain.py", line 115, in louvain\n adjacency = _choose_graph(adata, obsp, neighbors_key)\n File "/home/z6dong/anaconda3/envs/biomania2/lib/python3.10/site-packages/scanpy/_utils/__init__.py", line 767, in _choose_graph\n neighbors = NeighborsView(adata, neighbors_key)\n File "/home/z6dong/anaconda3/envs/biomania2/lib/python3.10/site-packages/scanpy/_utils/__init__.py", line 711, in __init__\n raise KeyError('No "neighbors" in .uns')\nKeyError: 'No "neighbors" in .uns'" --method sentencebert --field issue_description --top_k 3
python -m src.models.query_issue_corpus --LIB scanpy --example_query "ValueError: cannot specify integer bins when input data contains infinity" --method sentencebert --field issue_description --top_k 1
Notice: if we input wrong example_query, the output will be empty.
"""

import os
import json
import argparse
import os, json, requests, argparse, ast
from typing import Tuple, List, Dict, Any
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer, util
from dotenv import load_dotenv
from ..retrievers import BM25Retriever
from ..gpt.utils import load_json
import ast
from sentence_transformers import SentenceTransformer, util
from ..dataloader.prepare_issue_corpus import ERROR_KEYWORDS, get_error_type
from ..configs.model_config import get_all_variable_from_cheatsheet

def prepare_corpus(queries: List[Dict[str, Any]], field: str) -> Dict[str, Tuple[List[Dict[str, Any]], List[str]]]:
"""
Expand All @@ -38,7 +38,7 @@ def prepare_corpus(queries: List[Dict[str, Any]], field: str) -> Dict[str, Tuple
"""
corpus_dict = {}
for query in queries:
if query['solution'] is None:
if query['solution'] in [None, 'No solutions']:
continue
error_types = query.get('error_type', {'Other'})
for error_type in error_types:
Expand Down Expand Up @@ -101,6 +101,57 @@ def sentencebert_retriever(corpus_texts: List[str], query: str, top_k: int) -> L
hits = util.semantic_search(query_embedding, corpus_embeddings, top_k=top_k)[0]
return [hit['corpus_id'] for hit in hits]

def search_github_issues(lib, topk, question):
    """Search the GitHub repository of *lib* for issues matching *question*.

    Parameters
    ----------
    lib : str
        Library name; resolved to a repository via the cheatsheet config
        (its ``GITHUB_LINK`` entry).
    topk : int
        Maximum number of issues to retrieve.
    question : str
        Free-text query (e.g. an error message) to search for.

    Returns
    -------
    str
        One line per issue, ``issue: <title>, solutions: ...``, where the
        solutions are the issue's comments ranked by reaction count, joined
        with newlines; ``""`` when nothing could be retrieved.
    """
    info_json = get_all_variable_from_cheatsheet(lib)
    GITHUB_LINK = info_json['GITHUB_LINK']
    repo_name = GITHUB_LINK.replace('https://github.com/', '').replace('.git', '')
    repo_name = repo_name.rstrip('/')
    load_dotenv()
    github_token = os.getenv('GITHUB_TOKEN', None)
    # BUGFIX: previously the header "Authorization: token None" was sent when
    # no token was configured, which makes GitHub reject every request with
    # 401 and the function silently return "". Unauthenticated search works
    # (with a lower rate limit), so only attach the header when a token exists.
    if github_token:
        headers = {'Authorization': f'token {github_token}'}
    else:
        print("No GitHub token provided. Falling back to unauthenticated GitHub search (rate-limited).")
        headers = {}
    search_url = 'https://api.github.com/search/issues'
    params = {
        'q': f'repo:{repo_name} "{question}" is:issue',
        'sort': 'comments',
        'order': 'desc',
        'per_page': topk
    }
    def fetch_issues():
        # Returns the list of matching issues, or "" (falsy) on any failure.
        response = requests.get(search_url, headers=headers, params=params)
        if response.status_code == 200:
            issues = response.json()['items']
            return issues if issues else ""
        else:
            return ""
    def fetch_comments(comments_url):
        # Returns the issue's comments, or [] on any failure.
        comments_response = requests.get(comments_url, headers=headers)
        if comments_response.status_code == 200:
            return comments_response.json()
        else:
            return []
    issues = fetch_issues()
    if not issues:
        return ""
    results = []
    for issue in issues:
        issue_title = issue['title']
        comments_url = issue['comments_url']
        comments = fetch_comments(comments_url)
        if comments:
            # Rank comments by total reaction count as a proxy for usefulness.
            sorted_comments = sorted(comments, key=lambda x: x['reactions']['total_count'], reverse=True)
            solutions = [f"Solution {idx + 1}: {comment['body']} (Reactions: {comment['reactions']['total_count']})" for idx, comment in enumerate(sorted_comments)]
            result = f"issue: {issue_title}, solutions: {'; '.join(solutions)}"
        else:
            result = f"issue: {issue_title}, solutions: No comments found"
        results.append(result)
    return "\n".join(results)

def retrieved_issue_solution(LIB: str, top_k: int, example_query: str, method: str, field: str) -> None:
"""
Main function to prepare data, create a retriever, and evaluate its performance.
Expand Down Expand Up @@ -149,17 +200,26 @@ def retrieved_issue_solution(LIB: str, top_k: int, example_query: str, method: s
print(f"Retrieved titles: {retrieved_titles}")
print(f"Retrieved issue descriptions: {retrieved_issue_description}")
print(f"Retrieved solutions: {retrieved_solution}")

return retrieved_solution

def main():
    """CLI entry point: query the issue corpus for a library.

    Depending on ``--query_source``, either searches the locally prepared
    issue corpus (``local``) or queries GitHub's issue search API live
    (``online``).
    """
    parser = argparse.ArgumentParser(description='Query the issue corpus for a library')
    parser.add_argument('--LIB', type=str, required=True, help='Library name')
    parser.add_argument('--example_query', type=str, required=True, help='Example query to test')
    parser.add_argument('--method', type=str, default="sentencebert", choices=['bm25', 'sentencebert'], help='Retrieval method to use')
    parser.add_argument('--field', type=str, default="issue_title", choices=['issue_title', 'issue_description'], help='Field to compare')
    parser.add_argument('--top_k', type=int, default=10, help='Number of top documents to retrieve')
    # Restrict values up front so typos fail fast with a clear argparse error
    # instead of only being caught by the late NotImplementedError below.
    parser.add_argument('--query_source', type=str, default="online", choices=['local', 'online'],
                        help="Where to query issues: 'local' prepared corpus or 'online' via the GitHub API")
    args = parser.parse_args()

    if args.query_source == 'local':
        retrieved_issue_solution(args.LIB, args.top_k, args.example_query, args.method, args.field)
    elif args.query_source == 'online':
        solutions = search_github_issues(args.LIB, args.top_k, args.example_query)
        print('solutions: ', solutions)
    else:  # unreachable thanks to `choices`; kept as a defensive guard
        raise NotImplementedError("Unsupported query source. Use 'local' or 'online'.")

# Run the CLI only when executed as a script, not on import.
if __name__ == "__main__":
    main()
Loading

0 comments on commit 4cbe301

Please sign in to comment.