diff --git a/.github/workflows/check_dataset.yml b/.github/workflows/check_dataset.yml index daa7d138e..5a2cc9671 100644 --- a/.github/workflows/check_dataset.yml +++ b/.github/workflows/check_dataset.yml @@ -16,7 +16,7 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - pip install amdirt==1.6.1 + pip install amdirt==1.6.2 pip install jsonschema==4.17.0 - name: Make validation results directory run: mkdir validation/ @@ -55,6 +55,12 @@ jobs: run: | echo "## Ancient Metagenome Environmental" >> validation/validation_results_raw.txt AMDirT validate -s -d -c -m ancientmetagenome-environmental/libraries/ancientmetagenome-environmental_libraries.tsv ancientmetagenome-environmental/libraries/ancientmetagenome-environmental_libraries_schema.json &>> validation/validation_results_raw.txt + - name: RADIOCARBON DATES test ancient single genomes (e.g. pathogens) + if: always() + run: | + echo "# Radiocarbon Dates" >> validation/validation_results_raw.txt + echo "## Ancient Single Genome Host Associated" >> validation/validation_results_raw.txt + AMDirT validate -s -d -c -m ancientsinglegenome-hostassociated/radiocarbondates/ancientsinglegenome-hostassociated_radiocarbondates.tsv ancientsinglegenome-hostassociated/radiocarbondates/ancientsinglegenome-hostassociated_radiocarbondates_schema.json &>> validation/validation_results_raw.txt - name: cleanup validation results from streamlit warnings if: always() run: | diff --git a/ancientsinglegenome-hostassociated/radiocarbondates/ancientsinglegenome-hostassociated_radiocarbondates.tsv b/ancientsinglegenome-hostassociated/radiocarbondates/ancientsinglegenome-hostassociated_radiocarbondates.tsv new file mode 100644 index 000000000..052c22c39 --- /dev/null +++ b/ancientsinglegenome-hostassociated/radiocarbondates/ancientsinglegenome-hostassociated_radiocarbondates.tsv @@ -0,0 +1,5 @@ +project_name publication_year data_publication_doi sample_name archive_project archive_sample_accession 
date_information_present date_is_radiocarbon multiple_dates reference_location reference_citation_depth primary_secondary_reference_citation_doi direct_dating radiocarbon_lab_code spectrometry_type sample_material delta_13c uncalibrated_date uncalibrated_uncertainty_plus_minus calibration_reported calibration_curve calibration_software calibration_software_version calibrated_range_lower calibrated_range_upper calibrated_range_median calibrated_range_suffix reservoir_offset_mentioned reservoir_offset_applied reservoir_offset_reported +AndradesValtuena2017 2017 10.1016/j.cub.2017.10.025 1343UnTal85 PRJEB19335 ERS1892067 true true false main text 2 10.1371/journal.pone.0139705 true MAMS-18949 AMS tooth -20.5 3819 24 true IntCal13 OxCal v4.2.24 4346 4098 NA cal AD true false NA +AndradesValtuena2017 2017 10.1016/j.cub.2017.10.025 6Post PRJEB19335 ERS1892066 true true false main text 2 10.1371/journal.pone.0139705 true MAMS-18955 AMS tooth -20.7 3574 19 true IntCal13 OxCal v4.2.24 3957 3832 NA cal AD true false NA +Spyrou2018 2018 10.1038/s41467-018-04550-9 RT5 PRJEB24296 ERS2106903 true true false main text 1 10.1038/s41467-018-04550-9 true MAMS-29430 NA tooth -99999 3517 27 true NR NR NR 3868 3704 NA cal BP false NA NA +Spyrou2018 2018 10.1038/s41467-018-04550-9 RT6 PRJEB24296 ERS2106904 true true false main text 1 10.1038/s41467-018-04550-9 true MAMS-29431 NA tooth -99999 3499 25 true NR NR NR 3842 3696 NA cal BP false NA NA \ No newline at end of file diff --git a/ancientsinglegenome-hostassociated/radiocarbondates/ancientsinglegenome-hostassociated_radiocarbondates_schema.json b/ancientsinglegenome-hostassociated/radiocarbondates/ancientsinglegenome-hostassociated_radiocarbondates_schema.json new file mode 100644 index 000000000..e87c2b86d --- /dev/null +++ b/ancientsinglegenome-hostassociated/radiocarbondates/ancientsinglegenome-hostassociated_radiocarbondates_schema.json @@ -0,0 +1,259 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema", + "$id": 
"https://spaam-community.github.io/AncientMetagenomeDir/ancientsinglegenome-hostassociated/ancientsinglegenome-hostassociated_radiocarbondates_schema.json", + "type": "array", + "title": "JSON schema for AncientMetagenomeDir ancient host-associated single-genome C14 data", + "description": "The JSON schema for AncientMetagenomeDir ancient host-associated radiocarbon date (C14) information", + "additionalItems": false, + "items": { + "$id": "#/items", + "type": "object", + "title": "The items schema", + "description": "An explanation about the purpose of this instance.", + "default": {}, + "required": [ + "project_name", + "publication_year", + "data_publication_doi", + "sample_name", + "archive_project", + "archive_sample_accession" + ], + "properties": { + "project_name": { + "$id": "#/items/properties/project_name", + "type": "string", + "title": "AncientMetagenomeDir key of the publication", + "description": "Format: surnameYYYY (if duplicate key but different publication, add b,c,d etc. as necessary). Must match a AncientMetagenomeDir samples table entry", + "pattern": "^[a-zA-Z]+\\d{4}[b-z]?$", + "examples": ["Warinner2014", "Muhlemann2018", "Muhlemann2018a"] + }, + "publication_year": { + "$id": "#/items/properties/publication_year", + "type": "integer", + "minimum": 1950, + "maximum": 2100, + "title": "Year of publication", + "description": "Format: YYYY", + "examples": [2014] + }, + "data_publication_doi": { + "$id": "#/items/properties/data_publication_doi", + "type": "string", + "pattern": "^10.\\d{4,9}\\/[^,]+$", + "title": "Digital Object Identifier (DOI) of the publication.", + "description": "A valid DOI code (not as an URL). Must match a AncientMetagenomeDir samples table entry", + "examples": ["10.1038/ng.2906"] + }, + "sample_name": { + "$id": "#/items/properties/sample_name", + "type": "string", + "title": "Name of the sample", + "description": "In most cases this should be the name of the host individual. 
Must match an AncientMetagenomeDir samples table entry", + "examples": ["B61"] + }, + "archive_project": { + "$id": "#/items/properties/archive_project", + "type": "string", + "title": "Archive project accession", + "description": "Project accession code in the nucleotide data archive. Must match an AncientMetagenomeDir samples table entry", + "examples": ["PRJNA438985", "mgp13354"] + }, + "archive_sample_accession": { + "$id": "#/items/properties/archive_sample_accession", + "type": "string", + "pattern": "^[\\S]+$", + "title": "Archive accession number", + "description": "Sample archive accession numbers; multiple records can be separated with commas. No spaces allowed. Must match an AncientMetagenomeDir samples table entry", + "examples": ["SRS473742,SRS473743,SRS473744,SRS473745"] + }, + "date_information_present": { + "$id": "#/items/properties/date_information_present", + "type": "boolean", + "title": "Is Date Information Present?", + "description": "Whether a sample has a specific year-date reported (e.g. 1245, not 15th Century or similar). If false, all other downstream fields should be set to NA", + "examples": ["true", "false"] + }, + "date_is_radiocarbon": { + "$id": "#/items/properties/date_is_radiocarbon", + "type": "string", + "title": "Is Date Radiocarbon?", + "enum": ["true", "false", "NA", "NR"], + "description": "Whether the reported date is a radiocarbon (C14) date.
If false, all other downstream fields should be set to NA", + "examples": ["true", "false"] + }, + "multiple_dates": { + "$id": "#/items/properties/multiple_dates", + "type": "string", + "title": "Multiple Direct Dates Present?", + "enum": ["true", "false", "NA"], + "description": "Whether multiple (direct) dates are present for this sample; if so, make multiple rows for the sample with one date per row.", + "examples": ["true", "false", "NA"] + }, + "reference_location": { + "$id": "#/items/properties/reference_location", + "type": "string", + "title": "Location of Reference to Date", + "enum": ["main text", "supplement text", "supplement table"], + "description": "First place where the precise radiocarbon date was recorded in the primary citation publication (i.e., the publication in AncientMetagenomeDir). Order of precedence: main text > supplement text > supplement table.", + "examples": ["main text", "supplement text", "supplement table", "NA"] + }, + "reference_citation_depth": { + "$id": "#/items/properties/reference_citation_depth", + "type": "string", + "title": "Reference Citation Depth?", + "enum": ["main text", "supplement text", "supplement table"], + "description": "First place where the precise radiocarbon date was recorded in the primary citation publication (i.e., the publication in AncientMetagenomeDir). Order of precedence: main text > supplement text > supplement table.", + "examples": ["1", "2", "3", "9", "NA"] + }, + "primary_secondary_reference_citation_doi": { + "$id": "#/items/properties/primary_secondary_reference_citation_doi", + "type": "string", + "pattern": "^10.\\d{4,9}\\/[^,]+$", + "title": "Digital Object Identifier (DOI) of the publication in which the date was originally reported.", + "description": "DOI of the primary or secondary reference (i.e.
the DOI of the publication in which the date was originally reported)", + "examples": ["10.1038/ng.2906"] + }, + "direct_dating": { + "$id": "#/items/properties/direct_dating", + "type": "string", + "title": "Date Directly from Sample?", + "enum": ["true", "false", "NA"], + "description": "Whether the date of the ancient metagenomic sample was obtained directly from the same skeleton (or similar), or whether it was inferred from other samples in the same context", + "examples": ["true", "false", "NA"] + }, + "radiocarbon_lab_code": { + "$id": "#/items/properties/radiocarbon_lab_sample_id", + "type": "string", + "title": "Radiocarbon Lab Code", + "$ref": "https://spaam-community.github.io/AncientMetagenomeDir/assets/enums/c14_lab_code.json", + "description": "Lab code of the date, from https://radiocarbon.webhost.uits.arizona.edu/laboratories from Labs-2023_02_17.pdf. NA is no date available, NR is date available but no lab code", + "examples": ["OxA", "ANAS", "Beta", "NR", "NA"] + }, + "radiocarbon_lab_sample_id": { + "$id": "#/items/properties/radiocarbon_lab_sample_id", + "type": "integer", + "title": "Radiocarbon Lab Sample ID", + "description": "C14 sample code of the radiocarbon date from the lab", + "examples": ["12355", "44034"] + }, + "spectrometry_type": { + "$id": "#/items/properties/spectrometry_type", + "type": "string", + "title": "Spectrometry Type", + "$ref": "https://spaam-community.github.io/AncientMetagenomeDir/assets/enums/c14_lab_code.json", + "description": "Type of spectrometry used to generate the radiocarbon date. NA is no date available, NR is date available but spectrometry type not reported", + "examples": ["AMS", "IMRS", "NR", "NA"] + }, + "material": { + "$id": "#/items/properties/material", + "type": "string", + "title": "Sample Material Used For Dating", + "$ref": "https://spaam-community.github.io/AncientMetagenomeDir/assets/enums/material.json", + "description": "Sample material used for extraction of e.g.
collagen for generating the radiocarbon date", + "examples": ["both", "enamel"] + }, + "delta_13c": { + "$id": "#/items/properties/delta_13c", + "type": "number", + "title": "δ13C value", + "description": "The δ13C value of the dating in per mil (‰). Values that are not reported should be represented as -99999", + "examples": [-20.5, -17.6, -99999] + }, + "uncalibrated_date ": { + "$id": "#/items/properties/delta_13c", + "type": "integer", + "minimum": 0, + "maximum": 50000, + "title": "Uncalibrated Date Year", + "description": "The uncalibrated date in calendar year date Before Present notation", + "examples": [934, 3960, 13000] + }, + "uncalibrated_uncertainty_plus_minus": { + "$id": "#/items/properties/uncalibrated_uncertainty_plus_minus", + "type": "integer", + "title": "Uncalibrated Date Year Uncertainty", + "description": "Uncertainty value around the uncalibrated date in calendar year date Before Present notation, typically indicated by ±", + "examples": [32, 5, 150] + }, + "calibration_reported": { + "$id": "#/items/properties/calibration_reported", + "type": "boolean", + "title": "Is Calibration Reported?", + "description": "Whether the date has additionally been calibrated.", + "examples": ["true", "false"] + }, + "calibration_curve": { + "$id": "#/items/properties/calibration_curve", + "type": "string", + "enum": ["IntCal20", "CalPal2007_HULU", "SHCal20", "Marine20", "NR"], + "title": "Calibration Curve", + "description": "The tree-ring calibration curve used for calibration." + }, + "calibration_software": { + "$id": "#/items/properties/calibration_software", + "type": "string", + "enum": ["OxCal", "CalPal", "NR"], + "title": "Calibration Software", + "description": "Software used for radiocarbon calibration."
+ }, + "calibration_software_version": { + "$id": "#/items/properties/calibration_software_version", + "type": "string", + "title": "Calibration Software Version", + "description": "Version of the calibration software used (set NR if not reported)", + "examples": ["v1.20", "0.35", "NR"] + }, + "calibrated_range_lower": { + "$id": "#/items/properties/calibrated_range_lower", + "type": "integer", + "title": "Lower Date of Calibrated Date Range", + "description": "The lower bound of the calibrated date range", + "examples": ["1650"] + }, + "calibrated_range_upper": { + "$id": "#/items/properties/calibrated_range_upper", + "type": "integer", + "title": "Upper Date of Calibrated Date Range", + "description": "The upper bound of the calibrated date range", + "examples": ["1450"] + }, + "calibrated_range_median": { + "$id": "#/items/properties/calibrated_range_median", + "type": "integer", + "title": "Median Date of Calibrated Date Range", + "description": "The median date of the calibrated date range", + "examples": ["1550"] + }, + "calibrated_range_suffix": { + "$id": "#/items/properties/calibrated_range_suffix", + "type": "string", + "enum": ["cal AD", "cal BC", "cal CE", "cal BCE", "cal BP"], + "title": "Suffix of the calibrated date range", + "description": "The suffix of the calibrated date range", + "examples": ["cal BP"] + }, + "reservoir_offset_mentioned": { + "$id": "#/items/properties/reservoir_offset_mentioned", + "type": "boolean", + "title": "Is Reservoir Offset Mentioned?", + "description": "Whether a radiocarbon (C14) reservoir offset is mentioned in any form.
False here corresponds to not recorded (NR)", + "examples": ["true", "false"] + }, + "reservoir_offset_applied": { + "$id": "#/items/properties/reservoir_offset_applied", + "type": "boolean", + "title": "Is Reservoir Offset Applied?", + "description": "If an offset correction or recalibration has been reported to have been applied ", + "examples": ["true", "false"] + }, + "reservoir_offset_reported": { + "$id": "#/items/properties/reservoir_offset_reported", + "type": "integer", + "title": "Reservoir Offset Reported", + "description": "If the actual value of the offset has been reported (set NR if applied but actual value of offset not reported)", + "examples": [250, 400] + } + } + } +} diff --git a/assets/enums/c14_lab_code.json b/assets/enums/c14_lab_code.json new file mode 100644 index 000000000..970ef6310 --- /dev/null +++ b/assets/enums/c14_lab_code.json @@ -0,0 +1,291 @@ +{ + "enum": [ + "NA", + "NR", + "A", + "AA", + "AAR", + "AC", + "AECV", + "AERIK", + "ALG", + "ANAS", + "ANL", + "ANTW", + "ANU", + "ANUA", + "AU", + "AURIS", + "B", + "Ba", + "BC", + "BE", + "Beta", + "BGS", + "BIOCAMS", + "Birm", + "Bln", + "BM", + "BONN", + "BRAMS", + "BS", + "C", + "CAMS", + "CAR", + "CENA", + "CG", + "CH", + "CIRAM", + "CN-XX", + "CNA", + "COL", + "CRCA", + "CRL", + "CSIC", + "CSM", + "CT", + "CU", + "D-AMS", + "D", + "Dak", + "DAL", + "DE", + "Deb", + "DebA", + "DEM", + "DGC", + "DIC", + "DK", + "DRI", + "DSA", + "ENEA", + "Erl", + "ETH", + "F", + "Fi", + "Fr", + "Fra", + "FSU", + "FTMC", + "FZ", + "G", + "GAK", + "Gd", + "GD", + "GdA,", + "Gif", + "GifA", + "GIN", + "GL", + "GrA", + "GrM", + "GrN", + "GrO", + "GSC", + "GU", + "GV", + "GX", + "GXNUAMS", + "H", + "HAM", + "HAR", + "Hd", + "Hel", + "Hela", + "HIG", + "HL", + "HNS", + "Hv", + "I", + "IAA", + "IAAA", + "IAEA-MEL", + "IAEA", + "ICA", + "ICEN", + "IEMAE", + "IFAO", + "IGAN", + "IGS", + "IGSB", + "IHME", + "II", + "IMTA", + "IOAN", + "IORAN", + "IRPA", + "ISGS", + "IUACD", + "IVAN", + "IVIC", + "IWP", + 
"JAT", + "K", + "KATRI", + "KEEA", + "KGM", + "Ki", + "KI", + "KIA", + "KIK", + "KN", + "KR", + "KRIL", + "KSU", + "L", + "LACUFF", + "LAEC", + "LAR", + "LE", + "LEMA", + "LIH", + "LJ", + "LTL", + "LU", + "Lu", + "LuA", + "LuS", + "Lv", + "Ly", + "LZ", + "LZU", + "M", + "Ma", + "MAG", + "MAMS", + "MC", + "METU", + "MKL", + "MTC", + "N", + "NIST", + "NPL", + "NS", + "NSRL", + "NSTF", + "NSW", + "NTU", + "NU", + "NUTA", + "Ny", + "NZ", + "NZA", + "O", + "OBDY", + "OR", + "ORINS", + "OS", + "OWU", + "OX", + "OxA", + "OZ", + "P", + "P", + "PAL", + "Pi", + "PI", + "PIC", + "PITT", + "PKU", + "PKUAMS", + "PL", + "PLD", + "Poz", + "Pr", + "PRI", + "PRL", + "PRLCH", + "PSU", + "PSUAMS", + "Pta", + "Q", + "QC", + "QL", + "QU", + "R", + "RCD", + "RCMib", + "RI", + "RICH", + "RIDDL", + "Riga", + "RL", + "RoAMS", + "RT", + "RTK", + "RU", + "S", + "Sa", + "Sac", + "SacA", + "SANU", + "SFU", + "Sh", + "SI", + "SL", + "SM", + "SMU", + "SNU", + "SPb", + "T", + "TB", + "TBNC", + "TEM", + "TF", + "TK", + "TKA", + "TKa", + "TKU", + "Tln", + "TO", + "TRa", + "TUa", + "TUBITAK", + "TUNC", + "Tx", + "U", + "Ua", + "UB", + "UBA", + "UBAR", + "UCD", + "UCI", + "UCLA", + "UCR", + "UD", + "UGa", + "UGAMS", + "UGRA", + "UL", + "ULA", + "UM", + "UNAM", + "und", + "UNSW", + "UOC", + "UQ", + "URCRM", + "URU", + "USGS", + "UtC", + "UTCAG", + "UW", + "UZH", + "V", + "VERA", + "VRI", + "Vs", + "W", + "WAT", + "WIS", + "Wk", + "WRD", + "WSU", + "X", + "XLLQ", + "Y", + "Ya", + "YU", + "Z" + ] +}