Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Incorporate use of datasets-cli into genome prepare pipeline #375

Closed
wants to merge 15 commits into from
Closed
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
66 changes: 66 additions & 0 deletions containers/ncbi_datasets_v16.17.1.def
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
# See the NOTICE file distributed with this work for additional information
# regarding copyright ownership.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Container definition for the NCBI `datasets` / `dataformat` command line tools (v16.17.1).
# The image is bootstrapped from the docker `ubuntu:24.04` base image; `jq` 1.7.1 is installed
# alongside the NCBI tools (see %post).
Bootstrap: docker
From: ubuntu:24.04

%environment
# Variables exported in the container environment.
export SINGULARITY_SHELL=/bin/bash
export DEBIAN_FRONTEND=noninteractive
export LC_ALL=C

%labels
Author [email protected]
Software "NCBI's datasets and dataformat"
Software.version v16.17.1
Software.website "https://github.com/ncbi/datasets/releases/tag/v16.17.1"
ens-LCampbell marked this conversation as resolved.
Show resolved Hide resolved
Description "NCBI Datasets is a new resource that lets you easily gather data from across NCBI databases."

%post
# Refresh the package index, upgrade the base system and install the helper tools
# needed below (wget/unzip for the download, plus procps and ca-certificates).
# NOTE(review): the package list ends with a trailing backslash followed by an empty line;
# the empty line is what terminates the install command, so do not remove it without also
# removing that last backslash.
apt-get update && apt-get -y upgrade
apt-get -y install \
wget \
unzip \
procps \
ca-certificates \

# Drop the apt package lists and cache once the packages are installed.
rm -rf /var/lib/apt/lists/*
apt-get clean

# Install the NCBI datasets & dataformat binaries (release v16.17.1) and jq (1.7.1)
# into /usr/local/bin and make them executable.
cd /usr/local/bin/ && \
wget https://github.com/ncbi/datasets/releases/download/v16.17.1/linux-amd64.cli.package.zip && \
ens-LCampbell marked this conversation as resolved.
Show resolved Hide resolved
unzip linux-amd64.cli.package.zip && \
rm linux-amd64.cli.package.zip && \
chmod +x datasets dataformat && \
wget https://github.com/jqlang/jq/releases/download/jq-1.7.1/jq-linux-amd64 -O jq && \
chmod +x jq

%test
#!/usr/bin/bash
# Sanity checks: the base OS must be Ubuntu, and the `datasets` binary must run.
echo "Testing OS is Ubuntu...."
# Sourcing os-release defines VERSION_ID, which is used in the version check below.
source /etc/os-release
grep -q -e "PRETTY_NAME=\"Ubuntu" /etc/os-release
if [ $? -eq 0 ]; then
# NOTE(review): an Ubuntu release other than 24.04 is not treated as an error here;
# the confirmation message is simply not printed and the test carries on.
if [ $VERSION_ID == "24.04" ]; then
echo "Container base is Ubuntu version ${VERSION_ID} as expected."
fi
else
# Not an Ubuntu base image: fail the test.
echo "Container base is not Ubuntu."
exit 1
fi

# Print the installed datasets version.
echo -e -n "\n** Checking we have datasets installed **\n"
datasets --version
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Assembly Accession Assembly Unplaced Count Assembly-unit accession Chromosome name GC Count GC Percent GenBank seq accession Molecule type Ordering RefSeq seq accession Role Seq length Sequence name UCSC style name Unlocalized Count
GCA_017607445.1 Primary Assembly Un JAEVHH010000011.1 Chromosome unplaced-scaffold 60710
GCA_017607445.1 non-nuclear MT 9872 CM029948.1 Mitochondrion assembled-molecule 41220
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
63 changes: 63 additions & 0 deletions pipelines/nextflow/modules/download/download_asm_with_datasets.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
// See the NOTICE file distributed with this work for additional information
// regarding copyright ownership.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.


// Download the data files of one genome assembly with the NCBI `datasets` command line tool.
//
// The script downloads a dehydrated package for `meta.accession`, unzips it, rehydrates it with
// gzip compression, copies the files of that accession into the task directory and converts
// `sequence_report.jsonl` into a TSV named `<accession>_assembly_report.txt` with `dataformat`.
//
// NOTE(review): "DATSETS" in the process name looks like a typo for "DATASETS"; the genome_prepare
// subworkflow includes and calls the process under this exact name, so both must change together.
process DOWNLOAD_ASM_NCBI_DATSETS {
ens-LCampbell marked this conversation as resolved.
Show resolved Hide resolved
tag "${meta.accession}"
label 'adaptive'
label 'cached'
// Label matched by the `withLabel: 'datasets_cli'` container setting in the workflow config.
label 'datasets_cli'

input:
// meta: the script reads `meta.accession`; json_file is staged but not referenced by the script.
tuple val(meta), path(json_file)

output:
// min_set: assembly report (TSV), genomic FASTA and GenBank flat file.
tuple val(meta),
path("*_assembly_report.txt"),
path("*_genomic.fna.gz"),
path("*genomic.gbff.gz"),
emit: min_set
// opt_set: GFF3, protein FASTA and GenBank flat file; declared optional.
tuple val(meta),
path("*genomic.gff.gz"),
path("*protein.faa.gz"),
path("*genomic.gbff.gz"),
emit: opt_set, optional: true

// `shell:` block: Nextflow variables are interpolated with the !{...} syntax.
shell:
'''
datasets download genome accession !{meta.accession} \
--include seq-report,genome,gbff,gff3,protein \
--assembly-source all \
--dehydrated \
--filename !{meta.accession}_ncbi.zip \
--no-progressbar
ens-LCampbell marked this conversation as resolved.
Show resolved Hide resolved

unzip !{meta.accession}_ncbi.zip -d !{meta.accession}_ncbi

datasets rehydrate --directory !{meta.accession}_ncbi \
--gzip \
--no-progressbar
ens-LCampbell marked this conversation as resolved.
Show resolved Hide resolved

cp -f !{meta.accession}_ncbi/ncbi_dataset/data/!{meta.accession}/* ./

dataformat tsv genome-seq --inputfile sequence_report.jsonl > !{meta.accession}_assembly_report.txt
'''

// Stub: calls `datasets --help` and copies pre-made output files from the test data directory
// instead of downloading anything.
stub:
"""
datasets --help
cp $workflow.projectDir/../../../../data/test/modules/download_asm_with_datasets/output/* .
"""
}
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@ process DATASETS_METADATA {
tag "$accession"
label 'local'
label 'cached'

label 'datasets_cli'

input:
val(accession)

Expand Down
8 changes: 4 additions & 4 deletions pipelines/nextflow/subworkflows/genome_prepare/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@

// Import modules/subworkflows
include { CHECK_JSON_SCHEMA as CHECK_JSON_SCHEMA_GENOME } from '../../modules/schema/check_json_schema.nf'
include { DOWNLOAD_ASM_DATA } from '../../modules/download/download_asm_data.nf'
include { DOWNLOAD_ASM_NCBI_DATSETS } from '../../modules/download/download_asm_with_datasets.nf'
ens-LCampbell marked this conversation as resolved.
Show resolved Hide resolved
include { UNPACK_GFF3 } from '../../modules/gff3/unpack_gff3.nf'
include { PROCESS_GFF3 } from '../../modules/gff3/process_gff3.nf'
include { GFF3_VALIDATION } from '../../modules/gff3/gff3_validation.nf'
Expand All @@ -45,9 +45,9 @@ workflow GENOME_PREPARE {
checked_genome = CHECK_JSON_SCHEMA_GENOME(genomic_dataset).verified_json

// Download genome data files. Files may or may not include gene models (GFF3) and/or peptides.
DOWNLOAD_ASM_DATA(checked_genome)
download_min = DOWNLOAD_ASM_DATA.out.min_set
download_opt = DOWNLOAD_ASM_DATA.out.opt_set
DOWNLOAD_ASM_NCBI_DATSETS(checked_genome)
download_min = DOWNLOAD_ASM_NCBI_DATSETS.out.min_set
download_opt = DOWNLOAD_ASM_NCBI_DATSETS.out.opt_set
ens-LCampbell marked this conversation as resolved.
Show resolved Hide resolved

// Decompress GFF3 file, output with accession in tuple
unpacked_gff = UNPACK_GFF3(download_opt, 'gff')
Expand Down
2 changes: 1 addition & 1 deletion pipelines/nextflow/tests/workflows/test_genome_prepare.yml
Original file line number Diff line number Diff line change
Expand Up @@ -47,5 +47,5 @@
# Manifest depends on the genome checksum, so also date dependent
- path: ./tmp/genome_prepare/GCA_017607445.1/manifest.json
- path: ./tmp/genome_prepare/GCA_017607445.1/seq_region.json
md5sum: 28518b0c7cbc19a2890a6b347367a82f
md5sum: 6a45dc461c53e33dde33807c6def7b63
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The content of the file has significantly changed. Here's a diff:

        "synonyms": [                                                   "synonyms": [
            {                                                               {
                "name": "CM029948.1",                                           "name": "CM029948.1",
                "source": "GenBank"                                             "source": "GenBank"
            },                                                              },
            {                                                               {
                "name": "Mitochondrion",                      |                 "name": "MT",
                "source": "INSDC"                                               "source": "INSDC"
            },                                                              },
            {                                                               {
                "name": "",                                   |                 "name": "HcG217B07",
                "source": "INSDC_submitted_name"                                "source": "INSDC_submitted_name"
            },                                                <
            {                                                 <
                "name": "",                                   <
                "source": "RefSeq"                            <
            }                                                               }

- path: ./tmp/genome_prepare/GCA_017607445.1/stats.txt
Original file line number Diff line number Diff line change
Expand Up @@ -26,4 +26,4 @@ params {

singularity {
enabled = true
}
}
3 changes: 3 additions & 0 deletions pipelines/nextflow/workflows/nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -114,4 +114,7 @@ process {
storeDir = { "${params.cache_dir}/${task.process.tokenize(':')[-1].toLowerCase()}/${task.tag}" }
afterScript = { "sleep ${params.storeDir_latency}" }
}
withLabel: 'datasets_cli' {
container = 'library://lcampbell/ensembl-genomio/ncbi-datasets-v16.17.1:latest'
JAlvarezJarreta marked this conversation as resolved.
Show resolved Hide resolved
}
}
4 changes: 2 additions & 2 deletions src/python/ensembl/io/genomio/genome_metadata/extend.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,8 +113,8 @@ def get_report_regions_names(report_path: PathLike) -> List[Tuple[str, str]]:
# Create the seq_regions
seq_regions = []
for row in reader:
refseq_name = row["RefSeq-Accn"]
genbank_name = row["GenBank-Accn"]
refseq_name = row["RefSeq seq accession"]
genbank_name = row["GenBank seq accession"]
if refseq_name == "na":
refseq_name = ""
if genbank_name == "na":
Expand Down
22 changes: 10 additions & 12 deletions src/python/ensembl/io/genomio/seq_region/prepare.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,10 +61,10 @@

SYNONYM_RESOURCES = ["GenBank", "RefSeq", "INSDC"]
SYNONYM_MAP = {
"Assigned-Molecule": "INSDC",
"GenBank-Accn": "GenBank",
"RefSeq-Accn": "RefSeq",
"Sequence-Name": "INSDC_submitted_name",
"Molecule type": "INSDC",
"GenBank seq accession": "GenBank",
"RefSeq seq accession": "RefSeq",
"Sequence name": "INSDC_submitted_name",
}
MOLECULE_LOCATION = {
"apicoplast": "apicoplast_chromosome",
Expand Down Expand Up @@ -143,7 +143,7 @@ def add_insdc_seq_region_name(
brc_name = names[source_name]
break
else:
raise UnknownMetadata(f'Cannot set BRC4 sequence region name for {seqr["name"]}')
raise UnknownMetadata(f'Cannot set BRC4 sequence region name for {seqr["Sequence name"]}')
brc_name = brc_name.partition(".")[0]
seqr["BRC4_seq_region_name"] = brc_name
seqr["EBI_seq_region_name"] = seqr["name"]
Expand All @@ -168,7 +168,6 @@ def add_mitochondrial_codon_table(seq_regions: List[SeqRegion], taxon_id: int) -
if response.text.startswith("<"):
raise ValueError(f"Response from {url} is not JSON")
decoded = response.json()

if "mitochondrialGeneticCode" not in decoded:
logging.warning("No mitochondria genetic code found for taxon {taxon_id}")
else:
Expand Down Expand Up @@ -217,7 +216,6 @@ def merge_seq_regions(
seq_regions.sort(key=lambda x: x["name"])
return seq_regions


def get_gbff_seq_regions(gbff_path: PathLike) -> SeqRegionDict:
"""Returns the sequence regions found in the GBFF file (if any).

Expand Down Expand Up @@ -372,11 +370,11 @@ def make_seq_region(
seq_region = {}
# Set accession as the sequence region name
src = "RefSeq" if is_refseq else "GenBank"
accession_id = data.get(f"{src}-Accn", "")
accession_id = data.get(f"{src} seq accession", "")
if accession_id and (accession_id != "na"):
seq_region["name"] = accession_id
else:
logging.warning(f'No {src} accession ID found for {data["Sequence-Name"]}')
logging.warning(f'No {src} accession ID found for {data["Assembly Accession"]}')
return {}
# Add synonyms
synonyms = []
Expand All @@ -388,18 +386,18 @@ def make_seq_region(
synonyms.sort(key=lambda x: x["source"])
seq_region["synonyms"] = synonyms
# Add sequence length
field = "Sequence-Length"
field = "Seq length"
if (field in data) and (data[field].casefold() != "na"):
seq_region["length"] = int(data[field])
# Add coordinate system and location
seq_role = data["Sequence-Role"]
seq_role = data["Role"]
# Scaffold?
if seq_role in ("unplaced-scaffold", "unlocalized-scaffold"):
seq_region["coord_system_level"] = "scaffold"
# Chromosome? Check location
elif seq_role == "assembled-molecule":
seq_region["coord_system_level"] = "chromosome"
location = data["Assigned-Molecule-Location/Type"].lower()
location = data["Molecule type"].lower()
# Get location metadata
try:
seq_region["location"] = molecule_location[location]
Expand Down