Merge pull request #754 from kbase/dev_build_busco
add busco tool
Tianhao-Gu authored Aug 27, 2024
2 parents 758e036 + 28fd747 commit c896ed3
Showing 6 changed files with 168 additions and 5 deletions.
31 changes: 31 additions & 0 deletions .github/workflows/build-push-busco-image.yml
@@ -0,0 +1,31 @@
name: Build & Push BUSCO Image to GHCR

on:
pull_request:
types:
- opened
- reopened
- synchronize
- ready_for_review
paths:
- 'src/loaders/compute_tools/busco/versions.yaml'
- '.github/workflows/build-push-busco-image.yml'
- '.github/workflows/build-push-tool-images.yml'

push:
branches:
- main
- master
- develop
paths:
- 'src/loaders/compute_tools/busco/versions.yaml'
- '.github/workflows/build-push-busco-image.yml'
- '.github/workflows/build-push-tool-images.yml'

jobs:
trigger-build-push:
uses: ./.github/workflows/build-push-tool-images.yml
with:
tool_name: busco
version_file: 'src/loaders/compute_tools/busco/versions.yaml'
secrets: inherit
2 changes: 1 addition & 1 deletion RELEASE_NOTES.md
@@ -2,7 +2,7 @@

## 0.1.3

* Added BBMap tool to the CDM pipeline.
* Added BBMap and BUSCO tools to the CDM pipeline.
* Included metadata file generation after each tool's execution.
* Updated Python library dependencies to the latest versions.
* Standardized thread management logic across all tools.
35 changes: 35 additions & 0 deletions src/loaders/compute_tools/busco/Dockerfile
@@ -0,0 +1,35 @@
FROM continuumio/miniconda3:24.5.0-0

# NOTE: If the tool version changes, update the metadata that the _run_busco_single method saves after each run accordingly
ARG BUSCO_VER=5.7.1
ENV CONDA_ENV=busco-$BUSCO_VER

# Add Bioconda and Conda-Forge channels
RUN conda config --add channels bioconda
RUN conda config --add channels conda-forge

# Install BUSCO
# Certain dependencies (e.g., dendropy, sepp) are only compatible with Python versions up to 3.9.
ARG PYTHON_VER=3.9
RUN conda create -n $CONDA_ENV python=$PYTHON_VER
RUN conda install -n $CONDA_ENV pandas=2.2.2 jsonlines=4.0.0 mamba=1.5.8 pyyaml=6.0.1
# Suggestions from BUSCO team to use mamba for speeding up the installation process:
# https://busco.ezlab.org/busco_userguide.html#installation-with-conda
RUN conda run -n $CONDA_ENV mamba install -c bioconda -c conda-forge -y busco=$BUSCO_VER

# Activate the environment
RUN echo "source activate $CONDA_ENV" >> ~/.bashrc

# Set up directories
RUN mkdir -p /app
COPY ./ /app/collections
RUN rm -r /app/collections/.git

ENV PYTHONPATH=/app/collections
WORKDIR /app

ENV PY_SCRIPT=/app/collections/src/loaders/compute_tools/busco/busco.py

RUN chmod -R 777 /app/collections

ENTRYPOINT ["/app/collections/src/loaders/compute_tools/entrypoint.sh"]
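The NOTE above couples BUSCO_VER to the version string hardcoded in busco.py's _run_busco_single (see the next file). One way to remove that duplication would be to ask the installed tool for its version at runtime. The snippet below is an illustrative sketch only, not part of this commit; it assumes BUSCO's --version output contains a plain semantic version such as "BUSCO 5.7.1".

import re
import subprocess

def get_busco_version() -> str:
    """Return the installed BUSCO version, e.g. '5.7.1' (sketch, not in this commit)."""
    # BUSCO prints something like "BUSCO 5.7.1"; pull out the version number.
    out = subprocess.run(
        ['busco', '--version'], capture_output=True, text=True, check=True,
    ).stdout
    match = re.search(r'\d+\.\d+\.\d+', out)
    if match is None:
        raise ValueError(f'Could not parse BUSCO version from {out!r}')
    return match.group(0)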
83 changes: 83 additions & 0 deletions src/loaders/compute_tools/busco/busco.py
@@ -0,0 +1,83 @@
"""
Run the BUSCO tool on a set of fna files.

This tool serves a purpose distinct from the collection tools; it is instead intended for CDM work.
As a result, the parser program is not compatible with data generated by this tool.
"""
import os
import time
from pathlib import Path

from src.loaders.common.loader_common_names import TOOL_METADATA
from src.loaders.compute_tools.tool_common import ToolRunner, run_command, create_tool_metadata
from src.loaders.compute_tools.tool_version import extract_latest_reference_db_version


def _run_busco_single(
        tool_safe_data_id: str,
        data_id: str,
        source_file: Path,
        output_dir: Path,
        threads_per_tool_run: int,
        debug: bool) -> None:
    start = time.time()
    print(f'Start executing BUSCO for {data_id}')

    metadata_file = output_dir / TOOL_METADATA
    if metadata_file.exists():
        print(f"Skipping {source_file} as it has already been processed.")
        return

    current_dir = os.path.dirname(os.path.abspath(__file__))
    version_file = os.path.join(current_dir, 'versions.yaml')
    ref_db_version = extract_latest_reference_db_version(version_file)

    # Please refer to https://docs.google.com/document/d/15yV-S41Iqe20F-I2MRLWdzJwVdr8QKUfZPw7oq8WvB0/edit#heading=h.elgudks5mtxu
    # for more information on the BUSCO command options we are using here.
    command = [
        'busco',
        '-i', str(source_file),                # input assembly (fna file)
        '-o', data_id,                         # name of this BUSCO run
        '--out_path', str(output_dir),         # parent directory for the run output
        '--datasets_version', ref_db_version,  # lineage dataset version, e.g. odb10
        '--download_path', '/reference_data',  # where lineage datasets are downloaded/cached
        '-c', str(threads_per_tool_run),       # number of CPU threads to use
        '--auto-lineage-prok',                 # auto-select a lineage, restricted to prokaryotes
        '-m', 'genome',                        # run in genome (assembly) mode
        '-f',                                  # force overwrite of existing results
        '--augustus',                          # use Augustus rather than the default Metaeuk for gene prediction
    ]

    run_command(command, output_dir if debug else None)

    end_time = time.time()
    run_time = end_time - start
    print(
        f'Used {round(run_time / 60, 2)} minutes to execute BUSCO for {data_id}')

    # Save run info to a metadata file in the output directory for parsing later
    additional_metadata = {
        'source_file': str(source_file),
        'data_id': data_id,
        "reference_db": {
            "version": ref_db_version,
        },
    }
    create_tool_metadata(
        output_dir,
        tool_name="busco",
        version="5.7.1",
        command=command,
        run_time=round(run_time, 2),
        batch_size=1,
        additional_metadata=additional_metadata)


def main():
    runner = ToolRunner("busco")
    runner.parallel_single_execution(_run_busco_single, unzip=True)


if __name__ == "__main__":
    main()
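create_tool_metadata is imported from tool_common and is not shown in this diff. As a rough guide to what the call above produces, here is a minimal sketch of an equivalent writer; it assumes the helper simply serializes its arguments into the TOOL_METADATA JSON file in the output directory (the filename constant and exact schema are assumptions, and the real helper may record more):

import json
from pathlib import Path

TOOL_METADATA = 'tool_metadata.json'  # assumed value of the shared constant

def create_tool_metadata_sketch(
        output_dir: Path,
        tool_name: str,
        version: str,
        command: list,
        run_time: float,
        batch_size: int,
        additional_metadata: dict) -> None:
    # Gather everything the downstream parser needs into one JSON document.
    metadata = {
        'tool_name': tool_name,
        'version': version,
        'command': command,
        'run_time': run_time,
        'batch_size': batch_size,
        **additional_metadata,
    }
    with open(output_dir / TOOL_METADATA, 'w') as outfile:
        json.dump(metadata, outfile, indent=2)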
9 changes: 9 additions & 0 deletions src/loaders/compute_tools/busco/versions.yaml
@@ -0,0 +1,9 @@
# This tool serves a purpose distinct from the collection tools; it is instead intended for CDM work.
# As a result, the parser program is not compatible with data generated by this tool.

versions:
- version: 0.1.0
date: 2024-08-22
notes: |
- initial BUSCO implementation
reference_db_version: odb10
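The extract_latest_reference_db_version helper imported in busco.py above is not part of this diff. Below is a minimal sketch of how such a helper could read this file, assuming it returns the reference_db_version attached to the highest-versioned entry in the versions list:

import yaml  # PyYAML, pinned as pyyaml=6.0.1 in the Dockerfile above

def extract_latest_reference_db_version_sketch(version_file: str) -> str:
    # Pick the entry with the highest semantic version and return its
    # reference DB version (e.g. 'odb10').
    with open(version_file) as infile:
        doc = yaml.safe_load(infile)
    latest = max(
        doc['versions'],
        key=lambda entry: tuple(int(p) for p in str(entry['version']).split('.')),
    )
    return latest['reference_db_version']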
13 changes: 9 additions & 4 deletions src/loaders/jobs/taskfarmer/task_generator.py
@@ -38,10 +38,15 @@
--force Force overwrite of existing job directory
--source_file_ext SOURCE_FILE_EXT
Select files from source data directory that match the given extension.
TODO: NERSC's recommended approach for running I/O-intensive tools (most of our tools) is to use the scratch
directory. Before executing a task, copy the source data and reference libraries to the scratch directory,
creating soft links (such as for collection sources) as needed. Once the task is complete, copy the results
back to the user's directory. For more information, refer to the NERSC documentation:
https://docs.nersc.gov/filesystems/perlmutter-scratch/
'''

TOOLS_AVAILABLE = ['gtdb_tk', 'checkm2', 'microtrait', 'mash', 'eggnog', 'bbmap']
TOOLS_AVAILABLE = ['gtdb_tk', 'checkm2', 'microtrait', 'mash', 'eggnog', 'bbmap', 'busco']

NODE_TIME_LIMIT_DEFAULT = 5 # hours
# Used as THREADS variable in the batch script which controls the number of parallel tasks per node
@@ -56,16 +61,16 @@
# for single genome tools, such as microtrait and mash, the chunk_size is the number of genomes to process in a
# serial manner
# exe_time is the estimated execution time for a single task (default is 60 minutes)
# threads_per_tool_run is the number of threads to use for each tool execution (default is 32)
# threads_per_tool_run is the number of threads to use for each tool execution (default is SYSTEM_CPU_CORES (256) / number of parallel tasks per node)
# tasks_per_node is the number of parallel tasks to run on a node (default is 1)
# node_time_limit is the time limit for the node we reserved for the task (default is 5 hours)
# if no specific metadata is provided for a tool, the default values are used.
TASK_META = {'gtdb_tk': {'chunk_size': 1000, 'exe_time': 65, 'tasks_per_node': 4, 'threads_per_tool_run': 32},
             'eggnog': {'chunk_size': 100, 'exe_time': 15, 'node_time_limit': 0.5},  # Memory intensive tool - reserve more nodes with less node reservation time
             'busco': {'chunk_size': 50, 'exe_time': 90, 'node_time_limit': 1.5},  # 1.5 minutes per genome with a single task per node on the user's drive. TODO: Aim to test multi-threading per node along with scratch execution, and adjust `tasks_per_node` accordingly.
             'default': {'chunk_size': 5000, 'exe_time': 60}}
MAX_NODE_NUM = 100 # maximum number of nodes to use


REGISTRY = 'ghcr.io/kbase/collections'
VERSION_FILE = 'versions.yaml'
COMPUTE_TOOLS_DIR = '../../compute_tools' # relative to task_generator.py
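To make the comment block above concrete, here is a hedged sketch of how effective per-tool settings could be resolved from TASK_META using the documented defaults (chunk_size 5000, exe_time 60 minutes, one task per node, the 5-hour node limit, and threads derived from the 256 system cores); it assumes the TASK_META and NODE_TIME_LIMIT_DEFAULT definitions above, the constant name SYSTEM_CPU_CORES is invented for illustration, and the actual resolution logic in task_generator.py may differ:

SYSTEM_CPU_CORES = 256  # cores per node, per the comment above (assumed constant name)

def resolve_task_meta_sketch(tool: str) -> dict:
    # Start from the documented defaults, then overlay any tool-specific values.
    meta = {'chunk_size': 5000, 'exe_time': 60, 'tasks_per_node': 1,
            'node_time_limit': NODE_TIME_LIMIT_DEFAULT}
    meta.update(TASK_META.get(tool, TASK_META['default']))
    # Default thread count splits the node's cores across its parallel tasks.
    meta.setdefault('threads_per_tool_run',
                    SYSTEM_CPU_CORES // meta['tasks_per_node'])
    return meta

# For example, resolve_task_meta_sketch('busco') would yield:
# {'chunk_size': 50, 'exe_time': 90, 'tasks_per_node': 1,
#  'node_time_limit': 1.5, 'threads_per_tool_run': 256}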
