From b284a886e546b0e5cb0c50d99becdcde075fd3d8 Mon Sep 17 00:00:00 2001 From: Sai Sneha Date: Wed, 1 May 2024 23:16:13 +0530 Subject: [PATCH] docs: added introduction to docs and motivation, inspiration, and approach --- docs/_static/custom.css | 9 ++++-- docs/csvs/sample.csv | 46 +++++++++++++-------------- docs/csvs/user.csv | 2 +- docs/data_standards/samples.rst | 2 +- docs/data_standards/terminologies.rst | 15 ++++++++- docs/index.rst | 45 ++++++++++++++++++++++++-- docs/jsons/eg1_accession.json | 10 ++++++ pyproject.toml | 2 +- 8 files changed, 98 insertions(+), 33 deletions(-) create mode 100644 docs/jsons/eg1_accession.json diff --git a/docs/_static/custom.css b/docs/_static/custom.css index 1c37369..81da045 100644 --- a/docs/_static/custom.css +++ b/docs/_static/custom.css @@ -1,6 +1,9 @@ -.wy-nav-content { - max-width: 60% !important; -} +@media (min-width: 768px) { + .wy-nav-content { + max-width: 60% !important; + } + } + .wy-table-responsive table td, .wy-table-responsive table th { white-space: inherit; diff --git a/docs/csvs/sample.csv b/docs/csvs/sample.csv index ad9b842..a2abe44 100644 --- a/docs/csvs/sample.csv +++ b/docs/csvs/sample.csv @@ -1,37 +1,37 @@ -attribute category,attribute,flags,"description, example, and binding",type -meta,uuid,"-i, -m",UUID based on UUID4 from RFC4122. Example: urn:uuid:f71dab9c-d12a-400d-b49a-5f4972bb4c23,PrimitiveType.uri.uuid -,sampleCollectorUUID,"-i, -m",User/meta.uuid, -,barcode,"-i, -m, -a","Number scanned from the barcode, assigned to the database entry automatically", +attribute category,attribute,flags,"description, example, and binding",type +meta,uuid,"-i, -m",UUID based on UUID4 from RFC4122. Example: urn:uuid:f71dab9c-d12a-400d-b49a-5f4972bb4c23,PrimitiveType.uri.uuid +,sampleCollectorUUID,"-i, -m",User/meta.uuid, +,barcode,"-i, -m, -a","Number scanned from the barcode, assigned to the database entry automatically", ,accessionIDs,,"Accession to various other standard databases. 
Stored in JSON, XML, or YAML format. -{ - { system: ""https://gisaid.org/"" - code: ""MyGISAID_AccessionID"", - display: ""GISAID Accession ID"" - }, - { system: ""https://www.ncbi.nlm.nih.gov/genbank/"" - code: ""MyGenBank_AccessionID"", - display: ""Genbank Accession"" + { + { ""system"": ""https://gisaid.org/"", + ""code"": ""MyGISAID_AccessionID"", + ""display"": ""GISAID Accession ID"" + }, + { ""system"": ""https://www.ncbi.nlm.nih.gov/genbank/"", + ""code"": ""MyGenBank_AccessionID"", + ""display"": ""Genbank Accession"" + } } -} -extensible to support any database ID.", +extensible to support any database ID.", ,diseaseOfInterest,-m,"Code based on existing standards, e.g. ICD11 and SNOMED-CT. Like with accession IDs, multiple codes are supported { ""system"": ""https://icd.who.int/browse/2024-01/mms/en#/http://id.who.int/icd/entity/1959883044"", ""value"": ""1F05.3"", ""display"": ""Foot and Mouth Disease"" -}", -,CollectionDate,-a,Date of Sample Collection. Autofilled when barcode is scanned., +}", +,CollectionDate,-a,Date of Sample Collection. Autofilled when barcode is scanned., location,country,"-m, -v","Two letter Code based on ISO3166-1 A-2 (https://www.iso.org/iso-3166-country-codes.html). { ""system"": ""https://www.iso.org/standard/72482.html"", ""code"": ""IN"", ""display"": ""INDIA"" -}", -,geoLatLong,-m,"(longitude, latitude). It's recommended to retain 6 decimal points, but at least two are required.", +}", +,geoLatLong,-m,"(longitude, latitude). It's recommended to retain 6 decimal points, but at least two are required.", ,geoadmin,-v,"Store the highest Resolution ID associated with the sample, along with the hierarchy system used to collect it. (Wards vs Villages for Urban vs Rural, for e.g.) { @@ -40,9 +40,9 @@ location,country,"-m, -v","Two letter Code based on ISO3166-1 A-2 (https://www.i ""code"": ""ward_276600-12"", ""display"": ""DODDA BOMMASANDRA"", ""parents"": [""zone_276600-10"", ""ulb_277600"",""state_29""] -}", -,pinCode,-v,e.g. 
560012, -collectionInfo,siteName,,string, -,siteType,,, -,sampleType,,"Terminology Binding (Milk/Soil/Feed/Water Runoff/Air, Slurry)", +}", +,pinCode,-v,e.g. 560012, +collectionInfo,siteName,,string, +,siteType,,, +,sampleType,,"Terminology Binding (Milk/Soil/Feed/Water Runoff/Air, Slurry)", ,storage,-ao,Terminology Binding (Room Temperature vs Cold Chain), \ No newline at end of file diff --git a/docs/csvs/user.csv b/docs/csvs/user.csv index ce32504..d9ea931 100644 --- a/docs/csvs/user.csv +++ b/docs/csvs/user.csv @@ -1,5 +1,5 @@ attribute category,attribute,flags,"description, example, and binding",type -meta,uuid,"-i, -m",UUID based on UUID4 from RFC4122. Example: urn:uuid:f71dab9c-d12a-400d-b49a-5f4972bb4c23,PrimitiveType.uri.uuid +meta,uuid,"-i, -m","UUID based on UUID4 from `RFC4122 `_. Example: urn:uuid:f71dab9c-d12a-400d-b49a-5f4972bb4c23",PrimitiveType.uri.uuid ,id,,Alphanumerical ID assigned after database reconciliation., info,name,,, ,email,,, diff --git a/docs/data_standards/samples.rst b/docs/data_standards/samples.rst index 0d0caf5..ed8565f 100644 --- a/docs/data_standards/samples.rst +++ b/docs/data_standards/samples.rst @@ -5,4 +5,4 @@ Resource: Sample .. csv-table:: Sample Attributes :file: ../csvs/sample.csv :widths: 20,20,10,40,10 - :header-rows: 1 \ No newline at end of file + :header-rows: 1 diff --git a/docs/data_standards/terminologies.rst b/docs/data_standards/terminologies.rst index b55925c..4c919e3 100644 --- a/docs/data_standards/terminologies.rst +++ b/docs/data_standards/terminologies.rst @@ -1,4 +1,4 @@ -Terminologies, Concepts and Conventions +Terminologies, Concepts, and Conventions ======================================== .. warning:: @@ -40,3 +40,16 @@ Terminologies, Concepts and Conventions :file: ../csvs/flag.csv :widths: 20,30,50 :header-rows: 1 + +.. 
code-block:: JSON + + [ + { "system": "https://gisaid.org/", + "code": "MyGISAID_AccessionID", + "display": "GISAID Accession ID" + }, + { "system": "https://www.ncbi.nlm.nih.gov/genbank/", + "code": "MyGenBank_AccessionID", + "display": "Genbank Accession" + } + ] \ No newline at end of file diff --git a/docs/index.rst b/docs/index.rst index da438d1..d575890 100755 --- a/docs/index.rst +++ b/docs/index.rst @@ -3,13 +3,52 @@ Data Standards for Environmental Surveillance (dses) #################################################### .. warning:: - This documentation is under active development and might be changing. Check out our Github to see the latest changes. + These standards are currently in alpha (under active development) and might change. Check out our GitHub to see the latest changes. This is the documentation for the data standards for environmental surveillance, developed by the Data Science Innovation Hub at `ARTPARK `_ at the `Indian Institute of Science `_. -The aim is to create a set of data standards, along the lines of `FHIR HL7 `_ that are fast to adopt, interoperable, and can be scaled to any kind of disease while allowing extensibility for any user's specific use-case. -If you are currently on Github - you can find the docs hosted at `dses.dsih.artpark.in `_. +Introduction: Why do we need Standards? +------------------------------------------ + +Environmental Surveillance has the capability to complement public health surveillance at the national and sub-national levels. At the moment, environmental surveillance research is largely fragmented across different groups around the world. Specifically, in India, +environmental surveillance is performed primarily by various research institutions that then deliver the results of their analyses to the relevant public health authority within the government. This involves the collection of various types of samples across various sites and locations. 
There are also various use-cases for the utilisation of this data. +For instance, environmental surveillance for SARS-CoV-2 was crucial in identifying Variants of Concern in waste-water before there was an outbreak. +For animal diseases, it might be targeted to identify epidemics and guide vaccination strategies in endemic areas. + +Environmental Surveillance already exists in public health systems - in India and in various tropical countries - where there is a significant incidence of dengue. In India, for instance, there is a systematic government effort under the *National Vector Borne Disease Control Program* (NVBDCP) to identify breeding spots for *Aedes aegypti* and *Aedes albopictus*. +These breeding spots are systematically identified and eliminated, in programs commonly referred to together as *Source Reduction Activities*. + +There are various diseases of interest - animal diseases such as Foot-and-mouth Disease, Lumpy Skin Disease and Avian Influenza, and human diseases like COVID19, Influenza, and dengue. +The goals for each environmental surveillance program might differ - but various policymakers turn to epidemiologists to engage in disease modelling using the results of environmental surveillance, the larger goal remaining an integrated environmental surveillance program within health ministries worldwide. +There is therefore a need for a consistent framework and data definitions that will allow researchers to engage in disease modelling using data from various sites and groups. +This will also allow tools and models to be extensible from one disease to another, which will prove crucial for similar diseases (e.g. dengue, malaria, and chikungunya). + +Principles +---------- + +Existing groups, such as the `Public Health Alliance for Genomic Epidemiology (pha4ge) `_, are working to establish Data Structures for Genomic Epidemiology. 
+The goal of this set of standards is to complement such efforts and create an extensible framework for the standards that can be used by any research group or public entity that intends to start working on genomic epidemiology and environmental surveillance. +Crucially, the efforts listed here are built to complement clinical surveillance, and eventually integrate with existing health surveillance systems at national and subnational levels, at scale. +The framework and standards we develop would need to comply with the following principles so we can solve the problems mentioned above: + +#. **Fast and Easy to Adopt**: The standard would need to be constructed keeping different users in mind. For instance, the standard should be easy to adopt for a team of researchers collecting data on Excel sheets, but should also allow fast development of applications for use at scale by public health authorities. +#. **Modular, Extensible, and Adaptable across Contexts**: The standard would need to be constructed to ensure that only the absolutely crucial aspects are mandatory, allowing for specific users to add modules of their own. This is also to ensure that a standard that might work in one context is not enforced across contexts. For instance, rather than enforcing an ontology for disease codes, we allow different users to use the ontology that makes the most sense in their context. For human diseases, the logical choice might be the `ICD `_ or `SNOMED CT `_. For animal diseases, this might reference the `Terrestrial Animal Health Code `_, or rely on the standards being developed by the `Global Burden of Diseases and the OIE `_. +#. **Works well with existing standards**: The framework and standard would also attempt to ensure inter-operability with existing standards from the ground up. This will involve ensuring that the data can be converted to formats specified by the pha4ge, and also ensuring that existing ontologies and standards are used. 
This could mean using ontologies like ICD/SNOMED/GBAD/OIE, as mentioned above. This could also mean creating country-agnostic standards for administrative regions, and relying on country-specific standards for administrative units within each country, while providing a base framework to aid uniform implementation. +#. **Public Domain and Open Source**: Crucially, this will also involve constructing these standards, from the ground up, in an open source manner, allowing members of the genomics, epidemiology, public health, and developer communities to contribute and provide feedback, and allowing researchers using the standards on the ground to provide real feedback and extend the standards as they see fit for their own use case. This also involves constructing the standards in a manner that is friendly for both developers and non-developers to understand, implement, and provide feedback. + + +Approach +-------- + +When we started building our application to aid environmental surveillance efforts on the ground with a group of researchers, we turned to the `pha4ge `_ standards. They define a set of columns that are to be maintained, some optional, some mandatory, and some recommended. +Our approach to constructing our framework and standards starts with the following steps: + +#. Take the broader columns defined by pha4ge, and modularise these columns based on two primary criteria - who will collect/compute this information, and who will finally use this information. A sample might be collected in the field by a technician, who will log the sample collection site, and the barcode associated with the sample. They would also log the environmental conditions during sample collection. 
On the other end of the *environmental surveillance supply chain*, a bioinformatician might take all the relevant information provided by the sample collection technician, the molecular biologist, and the sequencing team, and use that information to conduct an analysis, or perhaps engage in disease modelling. This step also includes defining the accepted datatypes for all data points that someone who uses our framework might use. +#. We then focus on making these columns friendly for developers and computer systems, and, crucially, understanding how these columns can be standardised. This includes tying them to any existing ontologies. For instance, as mentioned above, we would rely on the `Census of India `_ and the `Ministry of Panchayati Raj's LGD `_ for the names and codes of administrative units in India. We can rely on ISO standards for date, time, latitude and longitude, and country codes, and the WHO and OIE/GBAD for disease-specific ontologies and codes. + +The first step can be likened to creating the larger framework and standards that can be used across contexts and diseases, and the second step to creating our very own implementation guide for this framework, similar to the approach taken by `FHIR HL7 `_. 
+ Table of Contents diff --git a/docs/jsons/eg1_accession.json b/docs/jsons/eg1_accession.json new file mode 100644 index 0000000..494547e --- /dev/null +++ b/docs/jsons/eg1_accession.json @@ -0,0 +1,10 @@ +[ + { "system": "https://gisaid.org/", + "code": "MyGISAID_AccessionID", + "display": "GISAID Accession ID" + }, + { "system": "https://www.ncbi.nlm.nih.gov/genbank/", + "code": "MyGenBank_AccessionID", + "display": "Genbank Accession" + } + ] \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index bca49ac..212207c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "dses" -version = "0.1.0" +version = "0.1.1" description = "Inter-operable Data Standards for Environmental Surveillance" authors = ["Sneha S"] license = "None"