Add missing test files

bokulich-lab · Jun 11, 2024 · cb3b08a · cb3b08a
1 parent d54e1da
commit cb3b08a
Show file tree

Hide file tree

Showing 8 changed files with 38,500 additions and 1 deletion.
diff --git a/rescript/tests/data/ncbi-dataset/README.md b/rescript/tests/data/ncbi-dataset/README.md
@@ -0,0 +1,41 @@
+# NCBI Datasets
+
+https://www.ncbi.nlm.nih.gov/datasets
+
+This zip archive contains an NCBI Datasets Data Package.
+
+NCBI Datasets Data Packages can include sequence, annotation and other data files, and metadata in one or more data report files.
+Data report files are in JSON Lines format.
+
+---
+## FAQs
+### Where is the data I requested?
+
+Your data is in the subdirectory `ncbi_dataset/data/` contained within this zip archive.
+
+### I still can't find my data, can you help?
+
+We have identified a bug affecting Mac Safari users. When downloading data from the NCBI Datasets web interface, you may see only this README file after the download has completed (while other files appear to be missing).
+As a workaround to prevent this issue from recurring, we recommend disabling automatic zip archive extraction in Safari until Apple releases a bug fix.
+For more information, visit:
+https://www.ncbi.nlm.nih.gov/datasets/docs/reference-docs/mac-zip-bug/
+
+### How do I work with JSON Lines data reports?
+
+Visit our JSON Lines data report documentation page:
+https://www.ncbi.nlm.nih.gov/datasets/docs/v2/tutorials/working-with-jsonl-data-reports/
+
+### What is NCBI Datasets?
+
+NCBI Datasets is a resource that lets you easily gather data from across NCBI databases. Find and download gene, transcript, protein and genome sequences, annotation and metadata.
+
+### Where can I find NCBI Datasets documentation?
+
+Visit the NCBI Datasets documentation pages:
+https://www.ncbi.nlm.nih.gov/datasets/docs/
+
+---
+
+National Center for Biotechnology Information
+National Library of Medicine
+info@ncbi.nlm.nih.gov
diff --git a/.../data/ncbi-dataset/ncbi_dataset/data/GCA_000008865.2/GCA_000008865.2_ASM886v2_genomic.fna b/.../data/ncbi-dataset/ncbi_dataset/data/GCA_000008865.2/GCA_000008865.2_ASM886v2_genomic.fna
@@ -0,0 +1,11 @@
+>BA000007.3 Escherichia coli O157:H7 str. Sakai DNA, complete genome
+AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTCTCTGACAGCAGCTTCTGAACTG
+AAGAAAGCTTCGTAGAAGCT
+--
+>AB011548.2 Escherichia coli O157:H7 str. Sakai plasmid pOSAK1 DNA, complete sequence
+TTCTTCTGCGAGTTCGTGCAGCTTCTCACACATGGTGGCCTGCTCGTCAGCATCGAGTGCGTCCAGTTTTTCGAGCAGCG
+TCAGGCTCTGGCTTTTTATGAATCCCGCCATGTTGAGTGCAGTTTGCTGCTGCTTGTTCATCTTTCTGTTTTCTCCGTTC
+--
+>AB011549.2 Escherichia coli O157:H7 str. Sakai plasmid pO157 DNA, complete sequence
+AGCCAGATTTTACCCGCCCATCCTAAAGAAGGGGATAGTCAACCACATCTGACCAGCCTGCGGAAAAGTCTGCTGCTTGT
+CCGTCCGGTGAAAGCTGATGATAAAACACCTGTTCAGGTGGAAGCCCGCGATGATAATAATAAAATTCTCGGTACGTTAA
diff --git a/rescript/tests/data/ncbi-dataset/ncbi_dataset/data/GCA_000008865.2/genomic.gff b/rescript/tests/data/ncbi-dataset/ncbi_dataset/data/GCA_000008865.2/genomic.gff
diff --git a/rescript/tests/data/ncbi-dataset/ncbi_dataset/data/GCA_000008865.2/protein.faa b/rescript/tests/data/ncbi-dataset/ncbi_dataset/data/GCA_000008865.2/protein.faa
diff --git a/rescript/tests/data/ncbi-dataset/ncbi_dataset/data/GCA_000008865.2/sequence_report.jsonl b/rescript/tests/data/ncbi-dataset/ncbi_dataset/data/GCA_000008865.2/sequence_report.jsonl
@@ -0,0 +1,3 @@
+{"assemblyAccession":"GCA_000008865.2","assemblyUnit":"Primary Assembly","assignedMoleculeLocationType":"Chromosome","chrName":"chromosome","gcCount":"2778819","genbankAccession":"BA000007.3","length":5498578,"refseqAccession":"NC_002695.2","role":"assembled-molecule"}
+{"assemblyAccession":"GCA_000008865.2","assemblyUnit":"Primary Assembly","assignedMoleculeLocationType":"Plasmid","chrName":"pOSAK1","gcCount":"1435","genbankAccession":"AB011548.2","length":3306,"refseqAccession":"NC_002127.1","role":"assembled-molecule"}
+{"assemblyAccession":"GCA_000008865.2","assemblyUnit":"Primary Assembly","assignedMoleculeLocationType":"Plasmid","chrName":"pO157","gcCount":"44135","genbankAccession":"AB011549.2","length":92721,"refseqAccession":"NC_002128.1","role":"assembled-molecule"}
diff --git a/rescript/tests/data/ncbi-dataset/ncbi_dataset/data/assembly_data_report.jsonl b/rescript/tests/data/ncbi-dataset/ncbi_dataset/data/assembly_data_report.jsonl
@@ -0,0 +1 @@
+{"annotationInfo":{"name":"Annotation submitted by GIRC","releaseDate":"2018-06-08","source":"GIRC","stats":{"geneCounts":{"nonCoding":126,"proteinCoding":5113,"pseudogene":136,"total":5375}}},"assemblyInfo":{"assemblyAccession":"GCA_000008865.2","assemblyLevel":"Complete Genome","assemblyName":"ASM886v2","assemblyStatus":"current","assemblyType":"haploid","bioprojectLineage":[{"bioprojects":[{"accession":"PRJNA226","title":"Enterohemorrhagic Escherichia coli"}]}],"biosample":{"accession":"SAMN01911278","attributes":[{"name":"collection_date","value":"1996"},{"name":"geo_loc_name","value":"Japan: Sakai City, Osaka prefecture"},{"name":"lat_lon","value":"Missing"},{"name":"isolation_source","value":"Human intestinal microflora"},{"name":"host","value":"Homo sapiens"},{"name":"host_disease","value":"Hemorrhagic colitis, HUS"},{"name":"collected_by","value":"M Yoh, T Honda"},{"name":"host_tissue_sampled","value":"Human feces"},{"name":"host_description","value":"typical patient during the Sakai outbreak"},{"name":"culture_collection","value":"ATCC:BAA-460"},{"name":"serotype","value":"E. coli O157:H7"},{"name":"pathotype","value":"EHEC, STEC"},{"name":"Sequencing Center","value":"RIKEN"},{"name":"GOLD","value":"Gc00046"},{"name":"Genbank","value":"BA000007"},{"name":"Refseq","value":"NC_002695"},{"name":"strain","value":"SAKAI (EHEC)"}],"bioprojects":[{"accession":"PRJNA226"}],"description":{"comment":"Full genome sequenced type strain Escherichia coli O157:H7 str. SAKAI.  This strain is associated with Hamburger disease, which is caused by the contamination of meat products by enterohemorrhagic E. coli (EHEC). The identifier O157:H7 refers to the serotype of EHEC, and reflects the specific antigenic markers found on the surface of the cell. 
EHEC attaches and effaces to cells in the large intestine.There are numerous differences that distinguish O157:H7 from K-12, hundreds of them associated with genomic islands in either strain, including at least 9 large PAIs in O157:H7 that encode virulence factors. A type III secretion system, the locus of enterocyte effacement, numerous toxins and adhesins, as well as fimbrial gene clusters and iron uptake systems are found in these PAIs. Pathogenicity genes are also found on the plasmid pO157. Escherichia coli O157:H7 strain Sakai. This strain of O157:H7 was isolated in a 1996 outbreak in Sakai, Japan.","organism":{"organismName":"Escherichia coli O157:H7 str. Sakai","taxId":386585},"title":"Bacterial, clinical or host-associated sample for Escherichia coli O157:H7 str. SAKAI (EHEC)"},"lastUpdated":"2019-05-23T15:25:40.989","models":["Pathogen.ba-cl"],"owner":{"contacts":[{}],"name":"ATCC"},"package":"Pathogen.cl.1.0","publicationDate":"2013-02-05T00:00:00.000","sampleIds":[{"label":"Sample name","value":"Escherichia coli O157:H7 str. Sakai"},{"db":"SRA","value":"SRS391048"}],"status":{"status":"live","when":"2014-11-20T09:44:57"},"submissionDate":"2013-02-05T09:09:06.203"},"biosampleAccession":"SAMN01911278","comments":"On May 10, 2018 this sequence version replased gi: 47118301\nThis update is obtained as follows: the Sakai genome was resequenced by illumina MiSeq and PacBio RS II sequencers and 78 sites of single base error, 11 sites of insertion (10 one-base insertions and a 141-base insertion) and 16 sites of deletion (14 one-base deletions, a three-base deletion and a six-base deletion) were corrected. 
Annotation was performed using DFAST auto-annotation system with manual curation.\nThis work was done in collaboration with Tetsuya Hayashi, Makoto Ohnishi, Keisuke Nakayama (Miyazaki Medical College), Kozo Makino, Ken Kurokawa, Katsushi Yokoyama, Masashi Tanaka, Takeshi Honda, Teruo Yasunaga, Hideo Shinagawa (Osaka University), Takahiro Murata (Shinshu University), Chang-Gyun Han, Eiichi Ohtsubo, Toru Tobe, Chihiro Sasakawa (University of Tokyo), Hideto Takami (Japan Marine Science and Technology Center), Naotake Ogasawara (Nara Institute of Science and Technology), Satoru Kuhara (Kuyshu University), and supported by the Research for the Future Program of the Japan Society for the Promotion of Science.","currentAssemblyAccession":"GCA_000008865.2","genbankAssmAccession":"GCA_000008865.2","pairedAssembly":{"accession":"GCF_000008865.2","annotationName":"Annotation submitted by NCBI RefSeq","status":"current"},"pairedAssemblyAccession":"GCF_000008865.2","refseqAssmAccession":"GCF_000008865.2","submissionDate":"2018-06-08","submitter":"GIRC"},"assemblyStats":{"contigL50":1,"contigN50":5498578,"gcCount":"2824389","numberOfComponentSequences":3,"numberOfContigs":3,"numberOfScaffolds":3,"scaffoldL50":1,"scaffoldN50":5498578,"totalNumberOfChromosomes":3,"totalSequenceLength":"5594605","totalUngappedLength":"5594605"},"organismName":"Escherichia coli O157:H7 str. Sakai","strain":"Sakai substr. RIMD 0509952","taxId":386585}
diff --git a/rescript/tests/data/ncbi-dataset/ncbi_dataset/data/dataset_catalog.json b/rescript/tests/data/ncbi-dataset/ncbi_dataset/data/dataset_catalog.json
@@ -0,0 +1,31 @@
+{
+"apiVersion": "V1",
+"assemblies": [
+{
+  "files": [
+    {
+      "filePath": "assembly_data_report.jsonl",
+      "fileType": "DATA_REPORT"
+    }
+  ]
+},{
+  "accession": "GCA_000008865.2",
+  "files": [
+    {
+      "filePath": "GCA_000008865.2/GCA_000008865.2_ASM886v2_genomic.fna",
+      "fileType": "GENOMIC_NUCLEOTIDE_FASTA"
+    },
+    {
+      "filePath": "GCA_000008865.2/protein.faa",
+      "fileType": "PROTEIN_FASTA"
+    },
+    {
+      "filePath": "GCA_000008865.2/genomic.gff",
+      "fileType": "GFF3"
+    },
+    {
+      "filePath": "GCA_000008865.2/sequence_report.jsonl",
+      "fileType": "SEQUENCE_REPORT"
+    }
+  ]
+}]}
diff --git a/setup.py b/setup.py
@@ -23,7 +23,11 @@
     url="https://github.com/nbokulich/RESCRIPt",
     entry_points={'qiime2.plugins': ['rescript=rescript.plugin_setup:plugin']},
     package_data={
-        'rescript.tests': ['data/*'],
+        'rescript.tests': [
+            'data/*', 'data/ncbi-dataset/*',
+            'data/ncbi-dataset/ncbi_dataset/data/*',
+            'data/ncbi-dataset/ncbi_dataset/data/GCA_000008865.2/*',
+        ],
         'rescript.types.tests': ['data/*'],
         'rescript': ['citations.bib', 'assets/*'],
     },
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		{"annotationInfo":{"name":"Annotation submitted by GIRC","releaseDate":"2018-06-08","source":"GIRC","stats":{"geneCounts":{"nonCoding":126,"proteinCoding":5113,"pseudogene":136,"total":5375}}},"assemblyInfo":{"assemblyAccession":"GCA_000008865.2","assemblyLevel":"Complete Genome","assemblyName":"ASM886v2","assemblyStatus":"current","assemblyType":"haploid","bioprojectLineage":[{"bioprojects":[{"accession":"PRJNA226","title":"Enterohemorrhagic Escherichia coli"}]}],"biosample":{"accession":"SAMN01911278","attributes":[{"name":"collection_date","value":"1996"},{"name":"geo_loc_name","value":"Japan: Sakai City, Osaka prefecture"},{"name":"lat_lon","value":"Missing"},{"name":"isolation_source","value":"Human intestinal microflora"},{"name":"host","value":"Homo sapiens"},{"name":"host_disease","value":"Hemorrhagic colitis, HUS"},{"name":"collected_by","value":"M Yoh, T Honda"},{"name":"host_tissue_sampled","value":"Human feces"},{"name":"host_description","value":"typical patient during the Sakai outbreak"},{"name":"culture_collection","value":"ATCC:BAA-460"},{"name":"serotype","value":"E. coli O157:H7"},{"name":"pathotype","value":"EHEC, STEC"},{"name":"Sequencing Center","value":"RIKEN"},{"name":"GOLD","value":"Gc00046"},{"name":"Genbank","value":"BA000007"},{"name":"Refseq","value":"NC_002695"},{"name":"strain","value":"SAKAI (EHEC)"}],"bioprojects":[{"accession":"PRJNA226"}],"description":{"comment":"Full genome sequenced type strain Escherichia coli O157:H7 str. SAKAI. This strain is associated with Hamburger disease, which is caused by the contamination of meat products by enterohemorrhagic E. coli (EHEC). The identifier O157:H7 refers to the serotype of EHEC, and reflects the specific antigenic markers found on the surface of the cell. 
EHEC attaches and effaces to cells in the large intestine.There are numerous differences that distinguish O157:H7 from K-12, hundreds of them associated with genomic islands in either strain, including at least 9 large PAIs in O157:H7 that encode virulence factors. A type III secretion system, the locus of enterocyte effacement, numerous toxins and adhesins, as well as fimbrial gene clusters and iron uptake systems are found in these PAIs. Pathogenicity genes are also found on the plasmid pO157. Escherichia coli O157:H7 strain Sakai. This strain of O157:H7 was isolated in a 1996 outbreak in Sakai, Japan.","organism":{"organismName":"Escherichia coli O157:H7 str. Sakai","taxId":386585},"title":"Bacterial, clinical or host-associated sample for Escherichia coli O157:H7 str. SAKAI (EHEC)"},"lastUpdated":"2019-05-23T15:25:40.989","models":["Pathogen.ba-cl"],"owner":{"contacts":[{}],"name":"ATCC"},"package":"Pathogen.cl.1.0","publicationDate":"2013-02-05T00:00:00.000","sampleIds":[{"label":"Sample name","value":"Escherichia coli O157:H7 str. Sakai"},{"db":"SRA","value":"SRS391048"}],"status":{"status":"live","when":"2014-11-20T09:44:57"},"submissionDate":"2013-02-05T09:09:06.203"},"biosampleAccession":"SAMN01911278","comments":"On May 10, 2018 this sequence version replased gi: 47118301\nThis update is obtained as follows: the Sakai genome was resequenced by illumina MiSeq and PacBio RS II sequencers and 78 sites of single base error, 11 sites of insertion (10 one-base insertions and a 141-base insertion) and 16 sites of deletion (14 one-base deletions, a three-base deletion and a six-base deletion) were corrected. 
Annotation was performed using DFAST auto-annotation system with manual curation.\nThis work was done in collaboration with Tetsuya Hayashi, Makoto Ohnishi, Keisuke Nakayama (Miyazaki Medical College), Kozo Makino, Ken Kurokawa, Katsushi Yokoyama, Masashi Tanaka, Takeshi Honda, Teruo Yasunaga, Hideo Shinagawa (Osaka University), Takahiro Murata (Shinshu University), Chang-Gyun Han, Eiichi Ohtsubo, Toru Tobe, Chihiro Sasakawa (University of Tokyo), Hideto Takami (Japan Marine Science and Technology Center), Naotake Ogasawara (Nara Institute of Science and Technology), Satoru Kuhara (Kuyshu University), and supported by the Research for the Future Program of the Japan Society for the Promotion of Science.","currentAssemblyAccession":"GCA_000008865.2","genbankAssmAccession":"GCA_000008865.2","pairedAssembly":{"accession":"GCF_000008865.2","annotationName":"Annotation submitted by NCBI RefSeq","status":"current"},"pairedAssemblyAccession":"GCF_000008865.2","refseqAssmAccession":"GCF_000008865.2","submissionDate":"2018-06-08","submitter":"GIRC"},"assemblyStats":{"contigL50":1,"contigN50":5498578,"gcCount":"2824389","numberOfComponentSequences":3,"numberOfContigs":3,"numberOfScaffolds":3,"scaffoldL50":1,"scaffoldN50":5498578,"totalNumberOfChromosomes":3,"totalSequenceLength":"5594605","totalUngappedLength":"5594605"},"organismName":"Escherichia coli O157:H7 str. Sakai","strain":"Sakai substr. RIMD 0509952","taxId":386585}