Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: Add fetch-ncbi-taxonomy action #118

Merged
merged 26 commits into from
Jan 29, 2024
Merged
Show file tree
Hide file tree
Changes from 16 commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
194a545
Add colorify to utils
Sann5 Dec 22, 2023
ea08a40
Register action
Sann5 Dec 22, 2023
471aaad
colorify only with green
Sann5 Jan 11, 2024
4fd93d8
Update q2_moshpit/plugin_setup.py
Sann5 Jan 11, 2024
bc9f7c6
Updated the output name and description
Sann5 Jan 12, 2024
452824f
Implement build_eggnog_diamond_db action
Sann5 Jan 12, 2024
f4e711f
Add test
Sann5 Jan 12, 2024
9b0a4c0
Further refactor EggnogSequenceTaxa to EggnogProteinSequences
Sann5 Jan 15, 2024
db0a0c8
Add validation for taxon IDs and corresponding test
Sann5 Jan 15, 2024
527bfec
Implement fetch-ncbi-taxonomy
Sann5 Jan 18, 2024
92cb2f5
_write_version_tsv functionality to separate function.
Sann5 Jan 18, 2024
2cdae75
implement tests
Sann5 Jan 18, 2024
205913a
fix bug in tests
Sann5 Jan 18, 2024
8233505
Add ellipsis to green prompts
Sann5 Jan 18, 2024
650160d
remove duplicate action
Sann5 Jan 19, 2024
5cfb55b
Update q2_moshpit/plugin_setup.py
Sann5 Jan 19, 2024
35386b6
Merge branch 'main' into fetch_ncbi_iss_107
Sann5 Jan 22, 2024
1633654
correct indentation
Sann5 Jan 22, 2024
ca2dedb
Merge branch 'main' into fetch_ncbi_iss_107
Sann5 Jan 22, 2024
1ef872d
Remove version file + adjust tests
Sann5 Jan 23, 2024
a0bf457
Adjust file size in prompt
Sann5 Jan 23, 2024
f680567
Reorganize fetch_ncbi_taxonomy
Sann5 Jan 23, 2024
af0f652
Add tests
Sann5 Jan 23, 2024
c4e28f0
Update q2_moshpit/eggnog/_dbs.py
Sann5 Jan 24, 2024
301ae00
Eliminate duplicated action
Sann5 Jan 24, 2024
9a2de57
Review comments Michal
Sann5 Jan 24, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions q2_moshpit/citations.bib
Original file line number Diff line number Diff line change
Expand Up @@ -123,3 +123,9 @@ @article{buchfink_sensitive_2021
keywords = {Computational biology and bioinformatics, Genome informatics, Genomic analysis, Sequencing, Software},
pages = {366--368},
}

@misc{NCBI,
title = {National Center for Biotechnology Information (NCBI)},
url = {https://www.ncbi.nlm.nih.gov/},
note = {Bethesda (MD): National Library of Medicine (US), National Center for Biotechnology Information;},
}
5 changes: 3 additions & 2 deletions q2_moshpit/eggnog/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,12 @@
from ._method import eggnog_diamond_search, eggnog_annotate
from ._dbs import (
fetch_eggnog_db, fetch_diamond_db, build_custom_diamond_db,
fetch_eggnog_proteins
fetch_eggnog_proteins, build_eggnog_diamond_db, fetch_ncbi_taxonomy
)


__all__ = [
'eggnog_diamond_search', 'eggnog_annotate', 'fetch_eggnog_db',
'fetch_diamond_db', 'build_custom_diamond_db', 'fetch_eggnog_proteins'
'fetch_diamond_db', 'build_custom_diamond_db', 'fetch_eggnog_proteins',
'build_eggnog_diamond_db', 'fetch_ncbi_taxonomy'
]
141 changes: 141 additions & 0 deletions q2_moshpit/eggnog/_dbs.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,10 @@
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------
import os
import datetime
import pandas as pd
from q2_types.feature_data import ProteinSequencesDirectoryFormat
import shutil
from q2_types_genomics.reference_db import (
EggnogRefDirFmt, DiamondDatabaseDirFmt, NCBITaxonomyDirFmt,
EggnogProteinSequencesDirFmt
Expand Down Expand Up @@ -174,3 +177,141 @@ def fetch_eggnog_proteins() -> EggnogProteinSequencesDirFmt:
))

return eggnog_fa


def build_eggnog_diamond_db(
    eggnog_proteins: EggnogProteinSequencesDirFmt,
    taxon: int
) -> DiamondDatabaseDirFmt:
    """
    Build a DIAMOND database restricted to the protein sequences that
    belong to the given taxon.

    The taxon ID is validated first, then the external ``create_dbs.py``
    script is invoked on the ``eggnog_proteins`` directory and the
    resulting ``ref_db.dmnd`` file is moved into a fresh
    ``DiamondDatabaseDirFmt`` directory, which is returned.
    """
    # Fail early if the requested taxon is unknown to the input data.
    _validate_taxon_id(eggnog_proteins, taxon)

    # Output directory format that will receive the database file.
    diamond_db = DiamondDatabaseDirFmt()

    proteins_dir = str(eggnog_proteins)
    db_file = "ref_db.dmnd"

    run_command([
        "create_dbs.py",
        "--data_dir", proteins_dir,
        "--taxids", str(taxon),
        "--dbname", "ref_db"
    ])

    # The script writes the database inside the eggnog_proteins directory,
    # so relocate it into the output directory.
    shutil.move(
        os.path.join(proteins_dir, db_file),
        os.path.join(str(diamond_db), db_file)
    )

    return diamond_db


def _validate_taxon_id(eggnog_proteins, taxon):
    """
    Ensure ``taxon`` is one of the taxon IDs listed in the
    ``e5.taxid_info.tsv`` file of the ``eggnog_proteins`` directory.

    Parameters
    ----------
    eggnog_proteins
        Object whose ``str()`` is the path of a directory containing
        ``e5.taxid_info.tsv`` (tab-separated, with a "Taxid Lineage"
        column holding comma-separated taxon IDs).
    taxon
        Taxon ID to look up; it is compared by its ``str()`` form.

    Raises
    ------
    ValueError
        If ``taxon`` does not appear in any "Taxid Lineage" entry.
    """
    # Read every column as text so that a lineage made of a single ID is
    # not parsed as a number (which would break the split below).
    taxid_info = pd.read_csv(
        os.path.join(str(eggnog_proteins), "e5.taxid_info.tsv"),
        sep="\t",
        dtype=str
    )

    # Collect every ID mentioned in any lineage; missing entries are skipped.
    tax_ids = set()
    for lineage in taxid_info["Taxid Lineage"].dropna():
        tax_ids.update(
            tax_id.strip() for tax_id in lineage.strip().split(",")
        )

    # Compare the whole ID. Building a set from ``str(taxon)`` would split
    # it into single characters and accept any ID sharing a digit with a
    # one-digit taxon ID.
    if str(taxon) not in tax_ids:
        raise ValueError(
            f"'{taxon}' is not valid taxon ID. "
            "To view all valid taxon IDs inspect e5.taxid_info.tsv "
            "file in the input eggnog_proteins input."
        )
misialq marked this conversation as resolved.
Show resolved Hide resolved


def fetch_ncbi_taxonomy() -> NCBITaxonomyDirFmt:
    """
    Download the NCBI taxonomy files into a new NCBITaxonomyDirFmt
    directory and return it.

    ``names.dmp`` and ``nodes.dmp`` are extracted from ``taxdmp.zip``
    (the archive is deleted afterwards), ``prot.accession2taxid.gz`` is
    downloaded as is, and a ``version.tsv`` file describing the three
    files is written next to them.
    """
    # Output object plus the paths of every file handled below.
    ncbi_data = NCBITaxonomyDirFmt()
    target_dir = str(ncbi_data)
    zip_path, nodes_path, names_path, proteins_path, version_path = (
        os.path.join(target_dir, file_name)
        for file_name in (
            "taxdmp.zip",
            "nodes.dmp",
            "names.dmp",
            "prot.accession2taxid.gz",
            "version.tsv",
        )
    )

    # Taxonomy dump: fetch the archive, keep only the two needed files
    # (-j drops the archive's directory structure), then discard the zip.
    print(colorify("Downloading *.dmp files..."))
    download_dump = [
        "wget", "-O", zip_path,
        "ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdmp.zip"
    ]
    run_command(cmd=download_dump)

    extract_dump = [
        "unzip", "-j", zip_path, "names.dmp", "nodes.dmp",
        "-d", target_dir
    ]
    run_command(cmd=extract_dump)

    run_command(cmd=["rm", zip_path])

    # Protein accession -> taxon ID table.
    print(colorify("Downloading proteins file (~15 GB)..."))
    download_proteins = [
        "wget", "-O", proteins_path,
        "ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/"
        "prot.accession2taxid.gz"
    ]
    run_command(cmd=download_proteins)

    # Record the modification times of the downloaded files.
    print(colorify("Constructing version file..."))
    _write_version_tsv(nodes_path, names_path, proteins_path, version_path)

    print(colorify(
        "Done! Moving data from temporary directory to final location..."
    ))
    return ncbi_data


def _write_version_tsv(nodes, names, proteins, version):
    """
    Write a tab-separated file listing the last modification date and time
    of the three NCBI taxonomy files.

    Parameters
    ----------
    nodes
        Path to ``nodes.dmp``.
    names
        Path to ``names.dmp``.
    proteins
        Path to ``prot.accession2taxid.gz``.
    version
        Path of the TSV file to create. It has the columns ``file_name``,
        ``date`` (``%d/%m/%Y``) and ``time`` (``%H:%M:%S``), with one row
        per input file in the order names, nodes, proteins.

    Raises
    ------
    OSError
        If any of the three input files does not exist or is inaccessible.
    """
    # Pair each reported file name with the path it actually describes so
    # the names/nodes rows cannot be mixed up.
    sources = (
        ('names.dmp', names),
        ('nodes.dmp', nodes),
        ('prot.accession2taxid.gz', proteins),
    )

    # datetime.datetime (not datetime.date) keeps the time of day, which
    # the 'time' column needs; the conversion uses the local time zone.
    modified = [
        datetime.datetime.fromtimestamp(os.path.getmtime(path))
        for _, path in sources
    ]

    data = {
        'file_name': [file_name for file_name, _ in sources],
        'date': [stamp.strftime('%d/%m/%Y') for stamp in modified],
        'time': [stamp.strftime('%H:%M:%S') for stamp in modified],
    }
    pd.DataFrame(data).to_csv(version, sep='\t', index=False)
Loading
Loading