Ensembl · ens-LCampbell · May 22, 2024 · May 22, 2024 · May 22, 2024 · May 22, 2024
diff --git a/containers/ncbi_datasets_v16.17.1.def b/containers/ncbi_datasets_v16.17.1.def
@@ -0,0 +1,66 @@
+# See the NOTICE file distributed with this work for additional information
+# regarding copyright ownership.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+Bootstrap: docker
+From: ubuntu:24.04
+
+%environment
+    export SINGULARITY_SHELL=/bin/bash  # presumably the shell for `singularity shell` - confirm
+    export DEBIAN_FRONTEND=noninteractive  # runtime only: this section is not applied during %post
+    export LC_ALL=C
+
+%labels
+    Author [email protected]
+    Software "NCBI's datasets and dataformat"
+    Software.version v16.17.1
+    Software.website "https://github.com/ncbi/datasets/releases/tag/v16.17.1"
+    Description "NCBI Datasets is a new resource that lets you easily gather data from across NCBI databases."
+
+%post
+    # %environment is only applied at runtime, so set this here for the build
+    export DEBIAN_FRONTEND=noninteractive
+    apt-get update && apt-get -y upgrade
+    apt-get -y install \
+        wget unzip procps ca-certificates
+
+    # Trim the apt caches to keep the image small
+    apt-get clean
+    rm -rf /var/lib/apt/lists/*
+
+    # Install the NCBI datasets & dataformat CLIs, plus jq, into /usr/local/bin
+    cd /usr/local/bin/ && \
+    wget https://github.com/ncbi/datasets/releases/download/v16.17.1/linux-amd64.cli.package.zip && \
+    unzip linux-amd64.cli.package.zip && \
+    rm linux-amd64.cli.package.zip && \
+    chmod +x datasets dataformat && \
+    wget https://github.com/jqlang/jq/releases/download/jq-1.7.1/jq-linux-amd64 -O jq && \
+    chmod +x jq
+
+%test
+    # Written in POSIX sh so it behaves the same whichever shell runs this section.
+    echo "Testing OS is Ubuntu...."
+    . /etc/os-release
+    if ! grep -q -e "PRETTY_NAME=\"Ubuntu" /etc/os-release; then
+        echo "Container base is not Ubuntu."
+        exit 1
+    elif [ "${VERSION_ID}" != "24.04" ]; then
+        echo "Container base is Ubuntu ${VERSION_ID}, expected 24.04."
+        exit 1
+    fi
+    echo "Container base is Ubuntu version ${VERSION_ID} as expected."
+
+    # A missing or broken binary makes this the failing (last) command of the test
+    printf '\n** Checking we have datasets installed **\n'
+    datasets --version
diff --git a/...odules/download_asm_with_datasets/output/GCA_017607445.1_ASM1760744v1_assembly_report.txt b/...odules/download_asm_with_datasets/output/GCA_017607445.1_ASM1760744v1_assembly_report.txt
@@ -0,0 +1,3 @@
+Assembly Accession	Assembly Unplaced Count	Assembly-unit accession	Chromosome name	GC Count	GC Percent	GenBank seq accession	Molecule type	Ordering	RefSeq seq accession	Role	Seq length	Sequence name	UCSC style name	Unlocalized Count
+GCA_017607445.1		Primary Assembly	Un			JAEVHH010000011.1	Chromosome			unplaced-scaffold	60710			
+GCA_017607445.1		non-nuclear	MT	9872		CM029948.1	Mitochondrion			assembled-molecule	41220			
diff --git a/...est/modules/download_asm_with_datasets/output/GCA_017607445.1_ASM1760744v1_genomic.fna.gz b/...est/modules/download_asm_with_datasets/output/GCA_017607445.1_ASM1760744v1_genomic.fna.gz
diff --git a/...st/modules/download_asm_with_datasets/output/GCA_017607445.1_ASM1760744v1_genomic.gbff.gz b/...st/modules/download_asm_with_datasets/output/GCA_017607445.1_ASM1760744v1_genomic.gbff.gz
diff --git a/...est/modules/download_asm_with_datasets/output/GCA_017607445.1_ASM1760744v1_genomic.gff.gz b/...est/modules/download_asm_with_datasets/output/GCA_017607445.1_ASM1760744v1_genomic.gff.gz
diff --git a/...est/modules/download_asm_with_datasets/output/GCA_017607445.1_ASM1760744v1_protein.faa.gz b/...est/modules/download_asm_with_datasets/output/GCA_017607445.1_ASM1760744v1_protein.faa.gz
diff --git a/pipelines/nextflow/modules/download/download_asm_with_datasets.nf b/pipelines/nextflow/modules/download/download_asm_with_datasets.nf
@@ -0,0 +1,63 @@
+// See the NOTICE file distributed with this work for additional information
+// regarding copyright ownership.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+
+process DOWNLOAD_ASM_NCBI_DATSETS {  // NOTE(review): "DATSETS" looks like a typo; renaming must also update the include in genome_prepare/main.nf
+    tag "${meta.accession}"
+    label 'adaptive'
+    label 'cached'
+    label 'datasets_cli'  // the container for this label is set in workflows/nextflow.config
+    // Downloads one assembly with the NCBI `datasets` CLI, keyed by meta.accession.
+    input:
+        tuple val(meta), path(json_file)  // json_file is staged but not read by the script below
+    // min_set: seq report + FASTA + GBFF; opt_set (optional): GFF3 + proteins + GBFF.
+    output:
+        tuple val(meta),
+            path("*_assembly_report.txt"),
+            path("*_genomic.fna.gz"),
+            path("*genomic.gbff.gz"),
+            emit: min_set
+        tuple val(meta),
+            path("*genomic.gff.gz"),
+            path("*protein.faa.gz"),
+            path("*genomic.gbff.gz"),
+            emit: opt_set, optional: true
+    // Steps: fetch a dehydrated package, unzip it, rehydrate (gzipped), copy the files here, write the TSV report.
+    shell:
+        '''
+        datasets download genome accession !{meta.accession} \
+        --include seq-report,genome,gbff,gff3,protein \
+        --assembly-source all \
+        --dehydrated \
+        --filename !{meta.accession}_ncbi.zip \
+        --no-progressbar
+
+        unzip !{meta.accession}_ncbi.zip -d !{meta.accession}_ncbi
+
+        datasets rehydrate --directory !{meta.accession}_ncbi \
+        --gzip \
+        --no-progressbar
+
+        cp -f !{meta.accession}_ncbi/ncbi_dataset/data/!{meta.accession}/* ./
+
+        dataformat tsv genome-seq --inputfile sequence_report.jsonl > !{meta.accession}_assembly_report.txt
+        '''
+    // Stub: checks the CLI is callable, then copies the pre-generated test outputs into the work dir.
+    stub:
+        """
+        datasets --help
+        cp $workflow.projectDir/../../../../data/test/modules/download_asm_with_datasets/output/* .
+        """
+}
diff --git a/pipelines/nextflow/modules/genome_metadata/datasets_metadata.nf b/pipelines/nextflow/modules/genome_metadata/datasets_metadata.nf
@@ -17,7 +17,8 @@ process DATASETS_METADATA {
     tag "$accession"
     label 'local'
     label 'cached'
-
+    label 'datasets_cli'
+
     input:
         val(accession)
 

diff --git a/pipelines/nextflow/subworkflows/genome_prepare/main.nf b/pipelines/nextflow/subworkflows/genome_prepare/main.nf
@@ -18,7 +18,7 @@
 
 // Import modules/subworkflows
 include { CHECK_JSON_SCHEMA as CHECK_JSON_SCHEMA_GENOME } from '../../modules/schema/check_json_schema.nf'
-include { DOWNLOAD_ASM_DATA } from '../../modules/download/download_asm_data.nf'
+include { DOWNLOAD_ASM_NCBI_DATSETS } from '../../modules/download/download_asm_with_datasets.nf'
 include { UNPACK_GFF3 } from '../../modules/gff3/unpack_gff3.nf'
 include { PROCESS_GFF3 } from '../../modules/gff3/process_gff3.nf'
 include { GFF3_VALIDATION } from '../../modules/gff3/gff3_validation.nf'
@@ -45,9 +45,9 @@ workflow GENOME_PREPARE {
         checked_genome = CHECK_JSON_SCHEMA_GENOME(genomic_dataset).verified_json
 
         // Download genome data files. Files may or may not include gene models (GFF3) and/or peptides.
-        DOWNLOAD_ASM_DATA(checked_genome)
-        download_min = DOWNLOAD_ASM_DATA.out.min_set
-        download_opt = DOWNLOAD_ASM_DATA.out.opt_set
+        DOWNLOAD_ASM_NCBI_DATSETS(checked_genome)
+        download_min = DOWNLOAD_ASM_NCBI_DATSETS.out.min_set
+        download_opt = DOWNLOAD_ASM_NCBI_DATSETS.out.opt_set
 
         // Decompress GFF3 file, output with accession in tuple
         unpacked_gff = UNPACK_GFF3(download_opt, 'gff')

diff --git a/pipelines/nextflow/tests/workflows/test_genome_prepare.yml b/pipelines/nextflow/tests/workflows/test_genome_prepare.yml
@@ -47,5 +47,5 @@
     # Manifest depends on the genome checksum, so also date dependent
     - path: ./tmp/genome_prepare/GCA_017607445.1/manifest.json
     - path: ./tmp/genome_prepare/GCA_017607445.1/seq_region.json
-      md5sum: 28518b0c7cbc19a2890a6b347367a82f
+      md5sum: 6a45dc461c53e33dde33807c6def7b63
     - path: ./tmp/genome_prepare/GCA_017607445.1/stats.txt
diff --git a/pipelines/nextflow/workflows/genome_prepare/nextflow.config b/pipelines/nextflow/workflows/genome_prepare/nextflow.config
@@ -26,4 +26,4 @@ params {
 
 singularity {
 	enabled = true
-}
+}
diff --git a/pipelines/nextflow/workflows/nextflow.config b/pipelines/nextflow/workflows/nextflow.config
@@ -114,4 +114,7 @@ process {
         storeDir = { "${params.cache_dir}/${task.process.tokenize(':')[-1].toLowerCase()}/${task.tag}" }
         afterScript = { "sleep ${params.storeDir_latency}" }
     }
+    withLabel: 'datasets_cli' {
+        container = 'library://lcampbell/ensembl-genomio/ncbi-datasets-v16.17.1:latest'
+    }
 }
diff --git a/src/python/ensembl/io/genomio/genome_metadata/extend.py b/src/python/ensembl/io/genomio/genome_metadata/extend.py
@@ -113,8 +113,8 @@ def get_report_regions_names(report_path: PathLike) -> List[Tuple[str, str]]:
     # Create the seq_regions
     seq_regions = []
     for row in reader:
-        refseq_name = row["RefSeq-Accn"]
-        genbank_name = row["GenBank-Accn"]
+        refseq_name = row["RefSeq seq accession"]
+        genbank_name = row["GenBank seq accession"]
         if refseq_name == "na":
             refseq_name = ""
         if genbank_name == "na":

diff --git a/src/python/ensembl/io/genomio/seq_region/prepare.py b/src/python/ensembl/io/genomio/seq_region/prepare.py
@@ -61,10 +61,10 @@
 
 SYNONYM_RESOURCES = ["GenBank", "RefSeq", "INSDC"]
 SYNONYM_MAP = {
-    "Assigned-Molecule": "INSDC",
-    "GenBank-Accn": "GenBank",
-    "RefSeq-Accn": "RefSeq",
-    "Sequence-Name": "INSDC_submitted_name",
+    "Molecule type": "INSDC",  # NOTE(review): also read as location; was "Chromosome name" meant? confirm
+    "GenBank seq accession": "GenBank",
+    "RefSeq seq accession": "RefSeq",
+    "Sequence name": "INSDC_submitted_name",
 }
 MOLECULE_LOCATION = {
     "apicoplast": "apicoplast_chromosome",
@@ -143,7 +143,7 @@ def add_insdc_seq_region_name(
                 brc_name = names[source_name]
                 break
         else:
-            raise UnknownMetadata(f'Cannot set BRC4 sequence region name for {seqr["name"]}')
+            raise UnknownMetadata(f'Cannot set BRC4 sequence region name for {seqr["Sequence name"]}')
         brc_name = brc_name.partition(".")[0]
         seqr["BRC4_seq_region_name"] = brc_name
         seqr["EBI_seq_region_name"] = seqr["name"]
@@ -168,7 +168,6 @@ def add_mitochondrial_codon_table(seq_regions: List[SeqRegion], taxon_id: int) -
     if response.text.startswith("<"):
         raise ValueError(f"Response from {url} is not JSON")
     decoded = response.json()
-
     if "mitochondrialGeneticCode" not in decoded:
         logging.warning("No mitochondria genetic code found for taxon {taxon_id}")
     else:
@@ -217,7 +216,6 @@ def merge_seq_regions(
     seq_regions.sort(key=lambda x: x["name"])
     return seq_regions
 
-
 def get_gbff_seq_regions(gbff_path: PathLike) -> SeqRegionDict:
     """Returns the sequence regions found in the GBFF file (if any).
 
@@ -372,11 +370,11 @@ def make_seq_region(
     seq_region = {}
     # Set accession as the sequence region name
     src = "RefSeq" if is_refseq else "GenBank"
-    accession_id = data.get(f"{src}-Accn", "")
+    accession_id = data.get(f"{src} seq accession", "")
     if accession_id and (accession_id != "na"):
         seq_region["name"] = accession_id
     else:
-        logging.warning(f'No {src} accession ID found for {data["Sequence-Name"]}')
+        logging.warning(f'No {src} accession ID found for {data["Assembly Accession"]}')
         return {}
     # Add synonyms
     synonyms = []
@@ -388,18 +386,18 @@ def make_seq_region(
         synonyms.sort(key=lambda x: x["source"])
         seq_region["synonyms"] = synonyms
     # Add sequence length
-    field = "Sequence-Length"
+    field = "Seq length"
     if (field in data) and (data[field].casefold() != "na"):
         seq_region["length"] = int(data[field])
     # Add coordinate system and location
-    seq_role = data["Sequence-Role"]
+    seq_role = data["Role"]
     # Scaffold?
     if seq_role in ("unplaced-scaffold", "unlocalized-scaffold"):
         seq_region["coord_system_level"] = "scaffold"
     # Chromosome? Check location
     elif seq_role == "assembled-molecule":
         seq_region["coord_system_level"] = "chromosome"
-        location = data["Assigned-Molecule-Location/Type"].lower()
+        location = data["Molecule type"].lower()
         # Get location metadata
         try:
             seq_region["location"] = molecule_location[location]