Update documentation for R2APP and Git2APP and report generation
1. update report generation document and code
2. update R2APP and Git2APP documentation
DoraDong-2023 committed Oct 16, 2024
1 parent 4fa1d89 commit 1823535
Showing 17 changed files with 10,917 additions and 3,987 deletions.
1 change: 1 addition & 0 deletions Dockerfile
@@ -54,6 +54,7 @@ COPY src/retrievers /app/src/retrievers
COPY images /app/images
#COPY src/tmp /app/src/tmp
COPY data/standard_process/${LIB}/ /app/data/standard_process/${LIB}/
COPY data/standard_process/base/ /app/data/standard_process/base/
COPY data/autocoop/${LIB}/ /app/data/autocoop/${LIB}/
COPY data/conversations/ /app/data/conversations/
COPY data/others-data/ /app/data/others-data/
4 changes: 2 additions & 2 deletions chatbot_ui_biomania/components/Chat/ChatMessage.tsx
@@ -221,13 +221,13 @@ export const ChatMessage: FC<Props> = memo(({ message, messageIndex, onEdit}) =>
return !inline ? (
<CodeBlock
//key={Math.random()}
key={messageIndex}
//key={messageIndex}
language={(match && match[1]) || ''}
value={String(children).replace(/\n$/, '')}
{...props}
/>
) : (
<code key={messageIndex} className={className} {...props}>
<code className={className} {...props}>
{children}
</code>
);
95 changes: 10 additions & 85 deletions docs/Git2APP.md
@@ -116,41 +116,17 @@ Install your GitHub repository to your local machine using the `pip install` com

```bash
# install remotely
pip install git+https://github.com/your_github_page/your_repository.git.
pip install git+https://github.com/your_github_page/your_repository.git
# Or build locally
pip install -e .
```

## Step 2: Use BioMANIA

Now that your GitHub repository is properly installed, you can create a BioMANIA app to use it. Feel free to skip any Optional steps to make it easy. Here are the steps:
Now that your GitHub repository is properly installed, you can create a BioMANIA app to use it. Just treat your library as a PyPI package and follow [the steps in the readme file](../docs/PyPI2APP.md), starting from adding the information to `Lib_cheatsheet.py`.

### 2.1. Create BioMANIA app:

#### 2.1.1 Generate Initial API Configuration:
To obtain `API_init_prepare.json`, use the following script:
```shell
export LIB=scanpy
python Git2APP/get_API_init_from_sourcecode.py --LIB ${LIB}
```

Note: You have the option to alter the filtering rules by modifying filter_specific_apis.

#### 2.1.2. Refine Function Definitions and Docstrings:
(Optional) Now, you need to add docstrings in your source code and refine the function definitions. You can either choose running our provided script and get `API_init.json` with GPT:

```shell
python Git2APP/get_API_docstring_from_sourcecode.py --LIB ${LIB}
```

Tips
- This script is designed to execute only on APIs lacking docstrings or well-documented parameters. It automatically skips APIs that meet the established documentation standards. Please note that running this script requires a paid OpenAI account.

- This script is based on LLM responses for modification, and the quality of the results may not be entirely satisfactory. Users need to ensure that the necessary parameters type are provided for inference, as `None` type may lead to execution failures in the API due to missing essential parameters.

- To accommodate the fact that `args`, `kwargs`, and similar parameters in general APIs are optional, we currently filter them out during prediction. Therefore, it's advisable to avoid using args as parameters in the code.

It is better that if you can design the docstrings by yourself as it is more accurate. `NumPy` format is preferred than `reStructuredText` and `Google` format. Here's a basic example of an effective docstring :
### Tips: Refine Function Definitions and Docstrings
(Optional) If you need to add docstrings to your source code and refine the function definitions, you can either use GPT or write them yourself. The latter is better as it is more accurate. The `NumPy` format is preferred over the `reStructuredText` and `Google` formats. Here's a basic example of an effective docstring:

```python
from typing import Union
@@ -176,76 +152,25 @@ def add(a:int, b:int) -> int:
return a + b
```
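As a quick sanity check, type hints plus a NumPy-style docstring together expose everything the extraction step needs at runtime. The stdlib-only sketch below (mirroring the `add` example above) is an illustration of why this format works well, not part of the Git2APP pipeline itself:

```python
import inspect
from typing import get_type_hints

def add(a: int, b: int) -> int:
    """
    The function is used to add two numbers.

    Parameters
    ----------
    a : int
        The first number to add.
    b : int
        The second number to add.

    Returns
    -------
    int
        The sum of the two numbers.
    """
    return a + b

# Type hints supply the parameter types needed for inference
# (avoiding the `None`-type execution failures mentioned above)...
hints = get_type_hints(add)
print(hints)  # {'a': <class 'int'>, 'b': <class 'int'>, 'return': <class 'int'>}

# ...and the docstring carries the human-readable descriptions.
doc = inspect.getdoc(add)
print("Parameters" in doc and "Returns" in doc)  # True
```

If either print shows missing types or sections, refine the signature or docstring before generating `API_init.json`.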

You can refer to the prompts available in [BioMANIA](https://www.biorxiv.org/content/10.1101/2023.10.29.564479v1) to add the function body, or either using the [prompt](./src/Git2APP/get_API_docstring_from_sourcecode.py) that modified based on that.

If you already have a well-documented code, generate API_init.json with:
```shell
cp -r data/standard_process/${LIB}/API_init_prepare.json data/standard_process/${LIB}/API_init.json
```

#### 2.1.3. Run Training Scripts:
For subsequent steps, start from obtaining `API_composite.json` of [`Run with script/Training`](./PyPI2APP.md#training) section. Following these instructions will result in the generation of data files in data/standard_process/your_project_name and model files in hugging_models.
You can refer to the prompts available in [BioMANIA](https://www.biorxiv.org/content/10.1101/2023.10.29.564479) to add the function body, or use the [prompt](./src/Git2APP/get_API_docstring_from_sourcecode.py) modified based on them.

```python
data/standard_process/your_project_name
├── API_composite.json
├── API_init.json
├── API_init_prepare.json
├── API_inquiry.json
├── API_inquiry_annotate.json
├── API_instruction_testval_query_ids.json
├── api_data.csv
├── centroids.pkl
├── retriever_train_data
│   ├── corpus.tsv
│   ├── qrels.test.tsv
│   ├── qrels.train.tsv
│   ├── qrels.val.tsv
│   ├── test.json
│   ├── test.query.txt
│   ├── train.json
│   ├── train.query.txt
│   ├── val.json
│   └── val.query.txt
└── vectorizer.pkl

hugging_models
└── retriever_model_finetuned
├── your_project_name
│   ├── assigned
│   │   ├── 1_Pooling
│   │   ├── README.md
│   │   ├── config.json
│   │   ├── config_sentence_transformers.json
│   │   ├── eval
│   │   ├── model.safetensors
│   │   ├── modules.json
│   │   ├── sentence_bert_config.json
│   │   ├── special_tokens_map.json
│   │   ├── tokenizer.json
│   │   ├── tokenizer_config.json
│   │   └── vocab.txt
│   └── tensorboard
│   └── name_desc
```
If you already have well-documented code, just skip this step and follow the steps in the readme file.

### 2.2 Add logo
### Tips: Add logo to UI

Add a logo image to `BioMANIA/chatbot_ui_biomania/public/apps/` and modify the link in `BioMANIA/chatbot_ui_biomania/components/Chat/LibCardSelect.tsx`.

Be mindful of the capitalization in library names, as it affects the recognition of the related model data loading paths.

### 2.3 Use UI service.
### Tips: Use UI service

Follow the steps in [`Run with script/Inference`](../README.md#inference) section in `README` to start UI service. Don’t forget to set an OpenAI key in `.env` file as recommended in `README`.

Remember to update the app in line with your repository improvements.

**Tips: Currently, we do not support real-time communication. Therefore, if some content requires a long time to run, such as training a model, it is best to train only one epoch per inquiry. We plan to support real-time display of results in the near future.**

### 2.4 Share your APP!
### Tips: Share your APP!

Follow the steps in [`Share your APP`](../README.md#share-your-app) section in `README` to introduce your tool to others!
If you want your app to be used by others, follow the steps in the [`Share your APP`](../README.md#share-your-app) section of the `README` to introduce your tool to others!


I hope this tutorial helps you create your BioMANIA app with your GitHub-hosted package. If you have any further questions or need assistance with specific steps, feel free to ask!
72 changes: 12 additions & 60 deletions docs/R2APP.md
@@ -10,78 +10,30 @@ Before starting, ensure that R and `rpy2` are installed. `rpy2` allows for running R within Python.
```shell
pip install rpy2
```
- **Install your R library**: In a terminal, enter `R` to start the R environment, then install the package:
```shell
R
> install.packages('Your Package Name')
> q()
```

#### Step 2: Generating API Initialization File
Use a Python script to generate the `./data/R/{Your_R_Library_Name}/API_init.json` file. This file contains initial information about the APIs in the R package.
Use a Python script to generate the `./data/standard_process/{Your_R_Library_Name}/API_init.json` file. This file contains initial information about the APIs in the R package.

- Run the following command:
```bash
python dataloader/get_API_init_from_sourcecode_R.py --LIB [Your_R_Library_Name]
cp -r ./data/R/{Your_R_Library_Name}/API_init.json /data/R/{Your_R_Library_Name}/API_composite.json
```

#### Step 3: Preparing Retriever Data
Prepare the data required for the retriever model using another Python script. This process generates the instruction under `./data/R/{your_r_lib_name}/API_inquiry.json` file and `./data/R/{your_r_lib_name}/retriever_train_data/` folder.

- Execute the command:
```bash
python dataloader/prepare_retriever_data.py --LIB [Your_R_Library_Name]
```

#### Step 4: Training Models
Train the chat classification and retriever models. These models are crucial for the app's functionality.

1. **Chitchat Classification Model**:
- Train the model using the following command:
```bash
python models/chitchat_classification.py --LIB ${LIB}
```

2. **Retriever Model**:
- Train the model with this command (adjust the command with necessary parameters):
```bash
export LIB=scanpy
CUDA_VISIBLE_DEVICES=0
mkdir ./hugging_models/retriever_model_finetuned/${LIB}
python models/train_retriever.py \
--data_path ./data/standard_process/${LIB}/retriever_train_data/ \
--model_name bert-base-uncased \
--output_path ./hugging_models/retriever_model_finetuned/${LIB} \
--num_epochs 25 \
--train_batch_size 32 \
--learning_rate 1e-5 \
--warmup_steps 500 \
--max_seq_length 256 \
--optimize_top_k 3 \
--plot_dir ./plot/${LIB}/retriever/
```

#### Step 5: Use model
Start back-end UI service with:
```bash
export LIB=ggplot2
CUDA_VISIBLE_DEVICES=0 \
python deploy/inference_dialog_server_R.py \
--retrieval_model_path ./hugging_models/retriever_model_finetuned/${LIB}/assigned \
--top_k 3
```

Install and start the front-end service in a new terminal with:
```bash
cd src/chatbot_ui_biomania
npm i # install
export BACKEND_URL="https://[ngrok_id].ngrok-free.app" # "http://localhost:5000";
npm run dev # run
python -m src.R2APP.get_API_init_from_sourcecode_R --LIB [Your_R_Library_Name] # e.g. Seurat, ggplot2
cp -r ./data/standard_process/{Your_R_Library_Name}/API_init.json ./data/standard_process/{Your_R_Library_Name}/API_composite.json
```

Your chatbot server is now operational at `http://localhost:3000/en`, primed to process user queries.
#### Follow the [remaining steps](PyPI2APP.md), starting from the data synthesis preparation step. Remember to add the information to `Lib_cheatsheet.py`.

#### Key Differences Between R and Python Integration

- **Library Loading**: In R, use `library(LIB)` to load packages directly. There's no need to modify `Lib_cheatsheet.py` as in Python.
- **Library Loading**: In R, use `library(LIB)` to load packages directly.
- **Documentation Access**: R documentation can be accessed through `help()`, `??`, or the `.__doc__` attribute after converting R functions to Python via `rpy2`.
- **Arguments Information**: R documentation does not always provide `type` information for parameters.
- **Simplified Process**: The process for R integration is more straightforward, focusing primarily on data preparation and model training, without the need to adjust library settings extensively.
- **Simplified Process**: The process for R integration is more straightforward, focusing primarily on data preparation and model training, without the need to search for more documentation resources.

#### Final Notes
This framework outlines the key steps and differences for integrating an R package into BioMANIA. Adjust the Python commands and paths according to your package's specifics. If you have any questions or need assistance with specific steps, feel free to reach out!
12 changes: 6 additions & 6 deletions docs/Report_Generation.md
@@ -9,7 +9,7 @@ Firstly, press `export chat` button on UI to get the chat json data. Convert the

```bash
# cd src
python report/Chat2Py.py report/demo_Preprocessing_and_clustering_3k_PBMCs.json
python -m report.Chat2Py report/chatbot_ui_history_10-16.json
```
![](https://github.com/batmen-lab/BioMANIA/tree/main/images/pyfile.jpg)
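Conceptually, this conversion just harvests the executed code from the exported chat. The sketch below illustrates the idea with a made-up message schema — the `messages`/`code` keys are assumptions for illustration, not the real BioMANIA export format or the actual `Chat2Py` implementation:

```python
# Illustrative chat export; the "messages"/"code" keys are hypothetical,
# not the real chatbot_ui_history JSON schema.
chat = {
    "messages": [
        {"role": "user", "content": "Load the 3k PBMC data"},
        {"role": "assistant", "content": "Done.",
         "code": "import scanpy as sc\nadata = sc.datasets.pbmc3k()"},
    ]
}

def chat_to_py(chat: dict) -> str:
    """Concatenate the code attached to assistant messages into one script."""
    blocks = [m["code"] for m in chat["messages"] if m.get("code")]
    return "\n\n".join(blocks) + "\n"

print(chat_to_py(chat))
```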

@@ -20,32 +20,32 @@ Convert the chat JSON into an [ipynb report](https://github.com/batmen-lab/BioMA

```bash
# cd src
python report/Chat2jupyter.py report/demo_Preprocessing_and_clustering_3k_PBMCs.json
python -m report.Chat2jupyter report/chatbot_ui_history_10-16.json
```
![](https://github.com/batmen-lab/BioMANIA/tree/main/images/jupyter.jpg)
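The notebook variant works the same way, except each harvested snippet becomes a code cell. A minimal nbformat-v4 skeleton can be built with the stdlib alone — again a hedged sketch of the idea, not the actual `Chat2jupyter` code:

```python
import json

def snippets_to_ipynb(snippets):
    """Wrap code snippets into a minimal nbformat-v4 notebook structure."""
    return {
        "nbformat": 4,
        "nbformat_minor": 5,
        "metadata": {},
        "cells": [
            {"cell_type": "code", "metadata": {}, "execution_count": None,
             "outputs": [], "source": s}
            for s in snippets
        ],
    }

nb = snippets_to_ipynb(["import scanpy as sc", "adata = sc.datasets.pbmc3k()"])
s = json.dumps(nb, indent=1)  # write `s` to a .ipynb file to open it in Jupyter
print(len(nb["cells"]))  # 2
```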


### For performance report
### For performance report (under development)

Combine and sort the performance figures into a short report.

```bash
# cd src
python report/PNG2report.py scanpy
python -m report.PNG2report scanpy
```

Please note that the generation of this report must be based on the premise that the retriever models have already been trained, and the gpt baseline has already been tested. You need to first obtain the results of each model before running this script. Here is a reference for a [demo report](https://github.com/batmen-lab/BioMANIA/tree/main/src/report/performance_report.pdf).

![](https://github.com/batmen-lab/BioMANIA/tree/main/images/performance_report.jpg)


### For common issue report
### For common issue report (under development)

Displays common issues encountered in the process of converting Python tools into libraries.

```bash
# cd src
python report/Py2report.py scanpy
python -m report.Py2report scanpy
```

The output files are located in the ./report folder.
Expand Down
11 changes: 5 additions & 6 deletions src/R2APP/get_API_init_from_sourcecode_R.py
@@ -6,8 +6,8 @@
import pydoc, argparse, json, re, os, collections, inspect, importlib, typing, functools
from docstring_parser import parse
from langchain.document_loaders import BSHTMLLoader
from configs.model_config import ANALYSIS_PATH, get_all_variable_from_cheatsheet, get_all_basic_func_from_cheatsheet
from dataloader.extract_function_from_sourcecode import get_returnparam_docstring
from ..configs.model_config import ANALYSIS_PATH, get_all_variable_from_cheatsheet, get_all_basic_func_from_cheatsheet
from ..dataloader.extract_function_from_sourcecode import get_returnparam_docstring

import rpy2.robjects as robjects
from rpy2.robjects.packages import importr
@@ -196,20 +196,19 @@ def get_r_function_doc_rd2txt(package_name, function_name):

def main_get_API_init():
# import R
ggplot2 = importr('ggplot2')
library = importr(args.LIB)
# get functions name
package_functions = [func for func in dir(ggplot2) if not func.startswith("_")]
package_functions = [func for func in dir(library) if not func.startswith("_")]
api_info = {}
for func in package_functions:
try:
r_func = getattr(ggplot2, func, None)
r_func = getattr(library, func, None)
if r_func:
doc_string = r_func.__doc__
# doc_string = robjects.r('capture.output(??("{function_name}", package="{package_name}"))'.format(function_name=func, package_name=args.LIB))
if doc_string:
# Parsing the provided docstring with the new function
parsed_json_structure = parse_r_docstring_to_json_structure(doc_string, func)
print(parsed_json_structure)
api_info[func] = parsed_json_structure
except:
pass
21 changes: 20 additions & 1 deletion src/report/Chat2PDF.py
@@ -10,7 +10,26 @@
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas
from reportlab.lib.utils import simpleSplit
from ..gpt.utils import load_json
import json

def load_json(filename: str) -> dict:
"""
Load JSON data from a specified file.
Parameters
----------
filename : str
The path to the JSON file to be loaded.
Returns
-------
dict
The data loaded from the JSON file.
"""
with open(filename, 'r') as file:
data = json.load(file)
return data


# draw text with markdown style
def draw_markdown_text(c, text, line_num, font_name="Helvetica", font_size=10, leading=15, max_width=440):
