Merge pull request #754 from kbase/dev_build_busco
add busco tool
Tianhao-Gu authored Aug 27, 2024
2 parents 758e036 + 28fd747 commit c896ed3
Showing 6 changed files with 168 additions and 5 deletions.
31 changes: 31 additions & 0 deletions .github/workflows/build-push-busco-image.yml
@@ -0,0 +1,31 @@
name: Build & Push BUSCO Image to GHCR

on:
pull_request:
types:
- opened
- reopened
- synchronize
- ready_for_review
paths:
- 'src/loaders/compute_tools/busco/versions.yaml'
- '.github/workflows/build-push-busco-image.yml'
- '.github/workflows/build-push-tool-images.yml'

push:
branches:
- main
- master
- develop
paths:
- 'src/loaders/compute_tools/busco/versions.yaml'
- '.github/workflows/build-push-busco-image.yml'
- '.github/workflows/build-push-tool-images.yml'

jobs:
trigger-build-push:
uses: ./.github/workflows/build-push-tool-images.yml
with:
tool_name: busco
version_file: 'src/loaders/compute_tools/busco/versions.yaml'
secrets: inherit
2 changes: 1 addition & 1 deletion RELEASE_NOTES.md
@@ -2,7 +2,7 @@

## 0.1.3

* Added BBMap tool to the CDM pipeline.
* Added BBMap and BUSCO tools to the CDM pipeline.
* Included metadata file generation after each tool's execution.
* Updated Python library dependencies to the latest versions.
* Standardized thread management logic across all tools.
35 changes: 35 additions & 0 deletions src/loaders/compute_tools/busco/Dockerfile
@@ -0,0 +1,35 @@
FROM continuumio/miniconda3:24.5.0-0

# NOTE: If the tool version changes, update the metadata that the _run_busco_single method saves after each run accordingly
ARG BUSCO_VER=5.7.1
ENV CONDA_ENV=busco-$BUSCO_VER

# Add Bioconda and Conda-Forge channels
RUN conda config --add channels bioconda
RUN conda config --add channels conda-forge

# Install BUSCO
# Certain dependencies (e.g., dendropy, sepp) are only compatible with Python versions up to 3.9.
ARG PYTHON_VER=3.9
RUN conda create -n $CONDA_ENV python=$PYTHON_VER
RUN conda install -n $CONDA_ENV pandas=2.2.2 jsonlines=4.0.0 mamba=1.5.8 pyyaml=6.0.1
# Suggestions from BUSCO team to use mamba for speeding up the installation process:
# https://busco.ezlab.org/busco_userguide.html#installation-with-conda
RUN conda run -n $CONDA_ENV mamba install -c bioconda -c conda-forge -y busco=$BUSCO_VER

# Activate the environment
RUN echo "source activate $CONDA_ENV" >> ~/.bashrc

# Set up directories
RUN mkdir -p /app
COPY ./ /app/collections
RUN rm -r /app/collections/.git

ENV PYTHONPATH=/app/collections
WORKDIR /app

ENV PY_SCRIPT=/app/collections/src/loaders/compute_tools/busco/busco.py

RUN chmod -R 777 /app/collections

ENTRYPOINT ["/app/collections/src/loaders/compute_tools/entrypoint.sh"]
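The NOTE above couples BUSCO_VER to the version string hardcoded in busco.py's _run_busco_single (see the next file). One way to remove that duplication would be to ask the installed tool for its version at runtime. The snippet below is an illustrative sketch only, not part of this commit; it assumes BUSCO's --version output contains a plain semantic version such as "BUSCO 5.7.1".

import re
import subprocess

def get_busco_version() -> str:
    """Return the installed BUSCO version, e.g. '5.7.1' (sketch, not in this commit)."""
    # BUSCO prints something like "BUSCO 5.7.1"; pull out the version number.
    out = subprocess.run(
        ['busco', '--version'], capture_output=True, text=True, check=True,
    ).stdout
    match = re.search(r'\d+\.\d+\.\d+', out)
    if match is None:
        raise ValueError(f'Could not parse BUSCO version from {out!r}')
    return match.group(0)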
83 changes: 83 additions & 0 deletions src/loaders/compute_tools/busco/busco.py
@@ -0,0 +1,83 @@
"""
Run the BUSCO tool on a set of fna files.

This tool serves a purpose distinct from the collection tools; it is instead intended for CDM work.
As a result, the parser program is not compatible with data generated by this tool.
"""
import os
import time
from pathlib import Path

from src.loaders.common.loader_common_names import TOOL_METADATA
from src.loaders.compute_tools.tool_common import ToolRunner, run_command, create_tool_metadata
from src.loaders.compute_tools.tool_version import extract_latest_reference_db_version


def _run_busco_single(
        tool_safe_data_id: str,
        data_id: str,
        source_file: Path,
        output_dir: Path,
        threads_per_tool_run: int,
        debug: bool) -> None:
    start = time.time()
    print(f'Start executing BUSCO for {data_id}')

    metadata_file = output_dir / TOOL_METADATA
    if metadata_file.exists():
        print(f"Skipping {source_file} as it has already been processed.")
        return

    current_dir = os.path.dirname(os.path.abspath(__file__))
    version_file = os.path.join(current_dir, 'versions.yaml')
    ref_db_version = extract_latest_reference_db_version(version_file)

    # Please refer to https://docs.google.com/document/d/15yV-S41Iqe20F-I2MRLWdzJwVdr8QKUfZPw7oq8WvB0/edit#heading=h.elgudks5mtxu
    # for more information on the BUSCO command options we are using here.
    command = [
        'busco',
        '-i', str(source_file),                # input assembly (fna file)
        '-o', data_id,                         # name of this BUSCO run
        '--out_path', str(output_dir),         # parent directory for the run output
        '--datasets_version', ref_db_version,  # lineage dataset version, e.g. odb10
        '--download_path', '/reference_data',  # where lineage datasets are downloaded/cached
        '-c', str(threads_per_tool_run),       # number of CPU threads to use
        '--auto-lineage-prok',                 # auto-select a lineage, restricted to prokaryotes
        '-m', 'genome',                        # run in genome (assembly) mode
        '-f',                                  # force overwrite of existing results
        '--augustus',                          # use Augustus rather than the default Metaeuk for gene prediction
    ]

    run_command(command, output_dir if debug else None)

    end_time = time.time()
    run_time = end_time - start
    print(
        f'Used {round(run_time / 60, 2)} minutes to execute BUSCO for {data_id}')

    # Save run info to a metadata file in the output directory for parsing later
    additional_metadata = {
        'source_file': str(source_file),
        'data_id': data_id,
        "reference_db": {
            "version": ref_db_version,
        },
    }
    create_tool_metadata(
        output_dir,
        tool_name="busco",
        version="5.7.1",
        command=command,
        run_time=round(run_time, 2),
        batch_size=1,
        additional_metadata=additional_metadata)


def main():
    runner = ToolRunner("busco")
    runner.parallel_single_execution(_run_busco_single, unzip=True)


if __name__ == "__main__":
    main()
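create_tool_metadata is imported from tool_common and is not shown in this diff. As a rough guide to what the call above produces, here is a minimal sketch of an equivalent writer; it assumes the helper simply serializes its arguments into the TOOL_METADATA JSON file in the output directory (the filename constant and exact schema are assumptions, and the real helper may record more):

import json
from pathlib import Path

TOOL_METADATA = 'tool_metadata.json'  # assumed value of the shared constant

def create_tool_metadata_sketch(
        output_dir: Path,
        tool_name: str,
        version: str,
        command: list,
        run_time: float,
        batch_size: int,
        additional_metadata: dict) -> None:
    # Gather everything the downstream parser needs into one JSON document.
    metadata = {
        'tool_name': tool_name,
        'version': version,
        'command': command,
        'run_time': run_time,
        'batch_size': batch_size,
        **additional_metadata,
    }
    with open(output_dir / TOOL_METADATA, 'w') as outfile:
        json.dump(metadata, outfile, indent=2)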
9 changes: 9 additions & 0 deletions src/loaders/compute_tools/busco/versions.yaml
@@ -0,0 +1,9 @@
# This tool serves a purpose distinct from the collection tools; it is instead intended for CDM work.
# As a result, the parser program is not compatible with data generated by this tool.

versions:
- version: 0.1.0
date: 2024-08-22
notes: |
- initial BUSCO implementation
reference_db_version: odb10
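The extract_latest_reference_db_version helper imported in busco.py above is not part of this diff. Below is a minimal sketch of how such a helper could read this file, assuming it returns the reference_db_version attached to the highest-versioned entry in the versions list:

import yaml  # PyYAML, pinned as pyyaml=6.0.1 in the Dockerfile above

def extract_latest_reference_db_version_sketch(version_file: str) -> str:
    # Pick the entry with the highest semantic version and return its
    # reference DB version (e.g. 'odb10').
    with open(version_file) as infile:
        doc = yaml.safe_load(infile)
    latest = max(
        doc['versions'],
        key=lambda entry: tuple(int(p) for p in str(entry['version']).split('.')),
    )
    return latest['reference_db_version']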
13 changes: 9 additions & 4 deletions src/loaders/jobs/taskfarmer/task_generator.py
@@ -38,10 +38,15 @@
--force Force overwrite of existing job directory
--source_file_ext SOURCE_FILE_EXT
Select files from source data directory that match the given extension.
TODO: NERSC's recommended approach for running I/O-intensive tools (most of our tools) is to use the scratch
directory. Before executing a task, copy the source data and reference libraries to the scratch directory,
creating soft links (such as for collection sources) as needed. Once the task is complete, copy the results
back to the user's directory. For more information, refer to the NERSC documentation:
https://docs.nersc.gov/filesystems/perlmutter-scratch/
'''

TOOLS_AVAILABLE = ['gtdb_tk', 'checkm2', 'microtrait', 'mash', 'eggnog', 'bbmap']
TOOLS_AVAILABLE = ['gtdb_tk', 'checkm2', 'microtrait', 'mash', 'eggnog', 'bbmap', 'busco']

NODE_TIME_LIMIT_DEFAULT = 5 # hours
# Used as THREADS variable in the batch script which controls the number of parallel tasks per node
@@ -56,16 +61,16 @@
# for single genome tools, such as microtrait and mash, the chunk_size is the number of genomes to process in a
# serial manner
# exe_time is the estimated execution time for a single task (default is 60 minutes)
# threads_per_tool_run is the number of threads to use for each tool execution (default is 32)
# threads_per_tool_run is the number of threads to use for each tool execution (default is SYSTEM_CPU_CORES (256) / number of parallel tasks per node)
# tasks_per_node is the number of parallel tasks to run on a node (default is 1)
# node_time_limit is the time limit for the node we reserved for the task (default is 5 hours)
# if no specific metadata is provided for a tool, the default values are used.
TASK_META = {'gtdb_tk': {'chunk_size': 1000, 'exe_time': 65, 'tasks_per_node': 4, 'threads_per_tool_run': 32},
             'eggnog': {'chunk_size': 100, 'exe_time': 15, 'node_time_limit': 0.5},  # Memory intensive tool - reserve more nodes with less node reservation time
             'busco': {'chunk_size': 50, 'exe_time': 90, 'node_time_limit': 1.5},  # 1.5 minutes per genome with a single task per node on the user's drive. TODO: Aim to test multi-threading per node along with scratch execution, and adjust `tasks_per_node` accordingly.
             'default': {'chunk_size': 5000, 'exe_time': 60}}
MAX_NODE_NUM = 100 # maximum number of nodes to use


REGISTRY = 'ghcr.io/kbase/collections'
VERSION_FILE = 'versions.yaml'
COMPUTE_TOOLS_DIR = '../../compute_tools' # relative to task_generator.py
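To make the comment block above concrete, here is a hedged sketch of how effective per-tool settings could be resolved from TASK_META using the documented defaults (chunk_size 5000, exe_time 60 minutes, one task per node, the 5-hour node limit, and threads derived from the 256 system cores); it assumes the TASK_META and NODE_TIME_LIMIT_DEFAULT definitions above, the constant name SYSTEM_CPU_CORES is invented for illustration, and the actual resolution logic in task_generator.py may differ:

SYSTEM_CPU_CORES = 256  # cores per node, per the comment above (assumed constant name)

def resolve_task_meta_sketch(tool: str) -> dict:
    # Start from the documented defaults, then overlay any tool-specific values.
    meta = {'chunk_size': 5000, 'exe_time': 60, 'tasks_per_node': 1,
            'node_time_limit': NODE_TIME_LIMIT_DEFAULT}
    meta.update(TASK_META.get(tool, TASK_META['default']))
    # Default thread count splits the node's cores across its parallel tasks.
    meta.setdefault('threads_per_tool_run',
                    SYSTEM_CPU_CORES // meta['tasks_per_node'])
    return meta

# For example, resolve_task_meta_sketch('busco') would yield:
# {'chunk_size': 50, 'exe_time': 90, 'tasks_per_node': 1,
#  'node_time_limit': 1.5, 'threads_per_tool_run': 256}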
