diff --git a/.github/workflows/npm-test.yml b/.github/workflows/npm-test.yml index 708f36c2..78b4b76d 100644 --- a/.github/workflows/npm-test.yml +++ b/.github/workflows/npm-test.yml @@ -9,7 +9,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - node: ['12', '14'] + node: ['12', '14', '16'] name: node-${{ matrix.node }} steps: - uses: actions/checkout@v2 @@ -25,7 +25,7 @@ jobs: with: github_token: ${{ secrets.GITHUB_TOKEN }} files: coverage/junit.xml - if: matrix.node == 10 + if: matrix.node == 14 docker: runs-on: ubuntu-latest name: docker build diff --git a/Snakefile b/Snakefile index 47a1f866..725f9508 100644 --- a/Snakefile +++ b/Snakefile @@ -1,7 +1,6 @@ import os from textwrap import dedent - DATA_DIR = 'snakemake_data' LOGS_DIR = 'snakemake_logs' @@ -18,6 +17,7 @@ COSMIC_EMAIL = config.get('cosmic_email') COSMIC_PASSWORD = config.get('cosmic_password') USE_COSMIC = COSMIC_EMAIL or COSMIC_PASSWORD BACKFILL_TRIALS = config.get('trials') +USE_FDA_UNII = config.get('fda') # due to the non-scriptable download, making FDA optional GITHUB_DATA = 'https://raw.githubusercontent.com/bcgsc/pori_graphkb_loader/develop/data' CONTAINER = 'docker://bcgsc/pori-graphkb-loader:latest' @@ -28,10 +28,10 @@ rule all: f'{DATA_DIR}/PMC4468049.COMPLETE', f'{DATA_DIR}/PMC4232638.COMPLETE', f'{DATA_DIR}/uberon.COMPLETE', - f'{DATA_DIR}/ncitFdaXref.COMPLETE', f'{DATA_DIR}/fdaApprovals.COMPLETE', f'{DATA_DIR}/cancerhotspots.COMPLETE', f'{DATA_DIR}/moa.COMPLETE', + *([f'{DATA_DIR}/ncitFdaXref.COMPLETE'] if USE_FDA_UNII else []), *([f'{DATA_DIR}/clinicaltrialsgov.COMPLETE'] if BACKFILL_TRIALS else []), *([f'{DATA_DIR}/cosmic_resistance.COMPLETE', f'{DATA_DIR}/cosmic_fusions.COMPLETE'] if USE_COSMIC else []) @@ -46,11 +46,12 @@ rule download_ncit: rm -rf __MACOSX''') -rule download_ncit_fda: - output: f'{DATA_DIR}/ncit/FDA-UNII_NCIt_Subsets.txt' - shell: dedent(f'''\ - cd {DATA_DIR}/ncit - wget https://evs.nci.nih.gov/ftp1/FDA/UNII/FDA-UNII_NCIt_Subsets.txt''') +if USE_FDA_UNII: + rule download_ncit_fda: + output: f'{DATA_DIR}/ncit/FDA-UNII_NCIt_Subsets.txt' + shell: dedent(f'''\ + cd {DATA_DIR}/ncit + wget https://evs.nci.nih.gov/ftp1/FDA/UNII/FDA-UNII_NCIt_Subsets.txt''') rule download_ensembl: @@ -62,16 +63,17 @@ rule download_ensembl: ''') -rule download_fda_srs: - output: f'{DATA_DIR}/fda/UNII_Records.txt' - shell: dedent(f'''\ - cd {DATA_DIR}/fda - wget https://fdasis.nlm.nih.gov/srs/download/srs/UNII_Data.zip - unzip UNII_Data.zip - rm UNII_Data.zip +if USE_FDA_UNII: + rule download_fda_srs: + output: f'{DATA_DIR}/fda/UNII_Records.txt' + shell: dedent(f'''\ + cd {DATA_DIR}/fda + wget https://fdasis.nlm.nih.gov/srs/download/srs/UNII_Data.zip + unzip UNII_Data.zip + rm UNII_Data.zip - mv UNII*.txt UNII_Records.txt - ''') + mv UNII*.txt UNII_Records.txt + ''') rule download_refseq: @@ -135,8 +137,8 @@ rule download_cgi: output: f'{DATA_DIR}/cgi/cgi_biomarkers_per_variant.tsv' shell: dedent(f'''\ cd {DATA_DIR}/cgi - wget https://www.cancergenomeinterpreter.org/data/cgi_biomarkers_latest.zip - unzip cgi_biomarkers_latest.zip + wget https://www.cancergenomeinterpreter.org/data/cgi_biomarkers_20180117.zip + unzip cgi_biomarkers_20180117.zip ''') @@ -217,23 +219,24 @@ rule load_ncit: shell: 'node bin/load.js file ncit {input.data} &> {log}; cp {log} {output}' -rule load_fda_srs: - input: expand(rules.load_local.output, local=['vocab']), - data=rules.download_fda_srs.output - container: CONTAINER - log: f'{LOGS_DIR}/fdaSrs.logs.txt' - output: f'{DATA_DIR}/fdaSrs.COMPLETE' - shell: 'node bin/load.js file fdaSrs {input.data} &> {log}; cp {log} {output}' +if USE_FDA_UNII: + rule load_fda_srs: + input: expand(rules.load_local.output, local=['vocab']), + data=f'{DATA_DIR}/fda/UNII_Records.txt' + container: CONTAINER + log: f'{LOGS_DIR}/fdaSrs.logs.txt' + output: f'{DATA_DIR}/fdaSrs.COMPLETE' + shell: 'node bin/load.js file fdaSrs {input.data} &> {log}; cp {log} {output}' -rule load_ncit_fda: - input: rules.load_ncit.output, - rules.load_fda_srs.output, - data=rules.download_ncit_fda.output - container: CONTAINER - log: f'{LOGS_DIR}/ncitFdaXref.logs.txt' - output: f'{DATA_DIR}/ncitFdaXref.COMPLETE' - shell: 'node bin/load.js file ncitFdaXref {input.data} &> {log}; cp {log} {output}' + rule load_ncit_fda: + input: rules.load_ncit.output, + rules.load_fda_srs.output, + data=rules.download_ncit_fda.output + container: CONTAINER + log: f'{LOGS_DIR}/ncitFdaXref.logs.txt' + output: f'{DATA_DIR}/ncitFdaXref.COMPLETE' + shell: 'node bin/load.js file ncitFdaXref {input.data} &> {log}; cp {log} {output}' rule load_refseq: @@ -273,7 +276,7 @@ rule load_uberon: rule load_drugbank: - input: rules.load_fda_srs.output, + input: rules.load_fda_srs.output if USE_FDA_UNII else [], data=rules.download_drugbank.output container: CONTAINER log: f'{LOGS_DIR}/drugbank.logs.txt' @@ -298,7 +301,9 @@ rule load_dgidb: def get_drug_inputs(wildcards): - inputs = [*rules.load_fda_srs.output, *rules.load_ncit.output] + inputs = [*rules.load_ncit.output] + if USE_FDA_UNII: + inputs.extend(rules.load_fda_srs.output) container: CONTAINER if USE_DRUGBANK: inputs.append(*rules.load_drugbank.output) @@ -423,7 +428,8 @@ rule load_cosmic_fusions: rule load_moa: - input: rules.load_oncotree.output + input: rules.load_oncotree.output, + expand(rules.load_local.output, local=['vocab', 'signatures', 'chromosomes', 'evidenceLevels', 'aacr', 'asco']) container: CONTAINER log: f'{LOGS_DIR}/load_moa.logs.txt' output: f'{DATA_DIR}/moa.COMPLETE' diff --git a/bin/load.js b/bin/load.js index 4cad3768..e7a5b160 100644 --- a/bin/load.js +++ b/bin/load.js @@ -141,7 +141,7 @@ if (input) { loaderOptions.filename = input; if (options.module === 'cosmic') { - loaderOptions.mappingFilename = options.mappingFilename; + loaderOptions.classification = options.classification; } } } diff --git a/src/asco/index.js b/src/asco/index.js index e04f9376..1cdf0492 100644 --- a/src/asco/index.js +++ b/src/asco/index.js @@ -9,34 +9,14 @@ const { requestWithRetry, checkSpec } = require('../util'); const { asco: SOURCE_DEFN } = require('../sources'); const { logger } = require('../logging'); const { rid } = require('../graphkb'); +const { main: mainSpec, details: detailsSpec } = require('./specs.json'); const CACHE = {}; const ajv = new Ajv(); - -const validateMainSpec = ajv.compile({ - properties: { - AbstID: { type: 'string' }, - AuthorString: { type: 'string' }, - Meeting: { type: 'string' }, - Title: { type: 'string' }, - Year: { pattern: '\\d+', type: 'string' }, - id: { type: 'string' }, - url: { format: 'url', type: 'string' }, - }, - required: ['AbstID', 'Title', 'Meeting', 'url', 'id'], - type: 'object', -}); - - -const validateDetailsSpec = ajv.compile({ - properties: { - DOI: { type: 'string' }, - SiteCitation: { type: 'string' }, - }, - type: 'object', -}); +const validateMainSpec = ajv.compile(mainSpec); +const validateDetailsSpec = ajv.compile(detailsSpec); /** diff --git a/src/asco/specs.json b/src/asco/specs.json new file mode 100644 index 00000000..23c25e08 --- /dev/null +++ b/src/asco/specs.json @@ -0,0 +1,48 @@ +{ + "details": { + "properties": { + "DOI": { + "type": "string" + }, + "SiteCitation": { + "type": "string" + } + }, + "type": "object" + }, + "main": { + "properties": { + "AbstID": { + "type": "string" + }, + "AuthorString": { + "type": "string" + }, + "Meeting": { + "type": "string" + }, + "Title": { + "type": "string" + }, + "Year": { + "pattern": "\\d+", + "type": "string" + }, + "id": { + "type": "string" + }, + "url": { + "format": "url", + "type": "string" + } + }, + "required": [ + "AbstID", + "Title", + "Meeting", + "url", + "id" + ], + "type": "object" + } +} diff --git a/src/cancergenomeinterpreter/index.js b/src/cancergenomeinterpreter/index.js index da477b1f..6542ce4f 100644 --- a/src/cancergenomeinterpreter/index.js +++ b/src/cancergenomeinterpreter/index.js @@ -480,7 +480,9 @@ const processRow = async ({ row, source, conn }) => { }; -const uploadFile = async ({ conn, filename, errorLogPrefix }) => { +const uploadFile = async ({ + conn, filename, errorLogPrefix, maxRecords, +}) => { const rows = await loadDelimToJson(filename); logger.info('creating the source record'); const source = rid(await conn.addSource(SOURCE_DEFN)); @@ -496,6 +498,10 @@ const uploadFile = async ({ conn, filename, errorLogPrefix }) => { logger.info(`loading ${rows.length} rows`); for (let index = 0; index < rows.length; index++) { + if (maxRecords && index > maxRecords) { + logger.warn(`not loading all content due to max records limit (${maxRecords})`); + break; + } const rawRow = rows[index]; const sourceId = hashRecordToId(rawRow); logger.info(`processing: ${sourceId} (${index} / ${rows.length})`); diff --git a/src/chembl/index.js b/src/chembl/index.js index 2a7ae555..8f9f091c 100644 --- a/src/chembl/index.js +++ b/src/chembl/index.js @@ -11,26 +11,11 @@ const { } = require('../graphkb'); const { logger } = require('../logging'); const { chembl: SOURCE_DEFN } = require('../sources'); +const spec = require('./spec.json'); const ajv = new Ajv(); -const recordSpec = ajv.compile({ - properties: { - molecule_chembl_id: { pattern: '^CHEMBL\\d+$', type: 'string' }, - molecule_properties: { - oneOf: [{ - properties: { - full_molformula: { type: 'string' }, - }, - type: 'object', - }, { type: 'null' }], - }, - pref_name: { type: ['string', 'null'] }, - usan_stem_definition: { type: ['string', 'null'] }, - }, - required: ['molecule_chembl_id'], - type: 'object', -}); +const recordSpec = ajv.compile(spec); const API = 'https://www.ebi.ac.uk/chembl/api/data/molecule'; diff --git a/src/chembl/spec.json b/src/chembl/spec.json new file mode 100644 index 00000000..eba982e5 --- /dev/null +++ b/src/chembl/spec.json @@ -0,0 +1,39 @@ +{ + "properties": { + "molecule_chembl_id": { + "pattern": "^CHEMBL\\d+$", + "type": "string" + }, + "molecule_properties": { + "oneOf": [ + { + "properties": { + "full_molformula": { + "type": "string" + } + }, + "type": "object" + }, + { + "type": "null" + } + ] + }, + "pref_name": { + "type": [ + "string", + "null" + ] + }, + "usan_stem_definition": { + "type": [ + "string", + "null" + ] + } + }, + "required": [ + "molecule_chembl_id" + ], + "type": "object" +} diff --git a/src/civic/index.js b/src/civic/index.js index a72b07dd..8a3ed454 100644 --- a/src/civic/index.js +++ b/src/civic/index.js @@ -19,7 +19,7 @@ const _entrezGene = require('../entrez/gene'); const { civic: SOURCE_DEFN, ncit: NCIT_SOURCE_DEFN } = require('../sources'); const { downloadVariantRecords, processVariantRecord } = require('./variant'); const { getPublication } = require('./publication'); - +const { evidence: evidenceSpec } = require('./specs.json'); class NotImplementedError extends ErrorMixin {} @@ -48,71 +48,7 @@ const EVIDENCE_LEVEL_CACHE = {}; // avoid unecessary requests by caching the evi const RELEVANCE_CACHE = {}; -const validateEvidenceSpec = ajv.compile({ - properties: { - clinical_significance: { - enum: [ - 'Sensitivity', - 'Adverse Response', - 'Resistance', - 'Sensitivity/Response', - 'Reduced Sensitivity', - 'Positive', - 'Negative', - 'Poor Outcome', - 'Better Outcome', - 'Uncertain Significance', - 'Pathogenic', - 'Likely Pathogenic', - 'N/A', - 'Gain of Function', - 'Loss of Function', - 'Neomorphic', - 'Dominant Negative', - null, - ], - }, - description: { type: 'string' }, - disease: { - oneOf: [{ - doid: { type: 'string' }, - name: { type: 'string' }, - type: 'object', - }, { type: 'null' }], - }, - drug_interaction_type: { enum: ['Combination', 'Substitutes', 'Sequential', null] }, - drugs: { - items: { - properties: { - id: { type: 'number' }, - name: { type: 'string' }, - ncit_id: { type: ['string', 'null'] }, - pubchem_id: { type: ['string', 'null'] }, - }, - required: ['name', 'ncit_id'], - type: 'object', - }, - type: 'array', - }, - evidence_direction: { enum: ['Supports', 'N/A', 'Does Not Support', null] }, - evidence_level: { type: 'string' }, - evidence_type: { - enum: ['Predictive', 'Diagnostic', 'Prognostic', 'Predisposing', 'Functional'], - }, - id: { type: 'number' }, - rating: { type: ['number', 'null'] }, - source: { - properties: { - citation_id: { type: 'string' }, - name: { type: ['string', 'null'] }, - source_type: { enum: ['ASCO', 'PubMed'] }, - }, - }, - status: { type: 'string' }, - variant_id: { type: 'number' }, - }, - type: 'object', -}); +const validateEvidenceSpec = ajv.compile(evidenceSpec); /** @@ -652,10 +588,9 @@ const downloadEvidenceRecords = async (baseUrl, trustedCurators) => { * @param {string} [opt.url] url to use as the base for accessing the civic ApiConnection * @param {string[]} opt.trustedCurators a list of curator IDs to also fetch submitted only evidence items for */ -const upload = async (opt) => { - const { - conn, errorLogPrefix, trustedCurators, ignoreCache = false, - } = opt; +const upload = async ({ + conn, errorLogPrefix, trustedCurators, ignoreCache = false, maxRecords, url = BASE_URL, +}) => { // add the source node const source = await conn.addSource(SOURCE_DEFN); @@ -670,8 +605,8 @@ const upload = async (opt) => { _pubmed.preLoadCache(conn); const varById = await downloadVariantRecords(); - const { records, errorList, counts } = await downloadEvidenceRecords(opt.url || BASE_URL, trustedCurators); - const purgeableEvidenceItems = new Set(await fetchDeletedEvidenceItems(opt.url || BASE_URL)); + const { records, errorList, counts } = await downloadEvidenceRecords(url, trustedCurators); + const purgeableEvidenceItems = new Set(await fetchDeletedEvidenceItems(url)); logger.info(`fetched ${purgeableEvidenceItems.size} deleted entries from CIViC`); logger.info(`Processing ${records.length} records`); @@ -689,6 +624,11 @@ const upload = async (opt) => { recordsById[record.id] = []; } recordsById[record.id].push(record); + + if (maxRecords && Object.values(recordsById).length >= maxRecords) { + logger.warn(`not loading all content due to max records limit (${maxRecords})`); + break; + } } for (const [sourceId, recordList] of Object.entries(recordsById)) { diff --git a/src/civic/specs.json b/src/civic/specs.json new file mode 100644 index 00000000..08d8a71c --- /dev/null +++ b/src/civic/specs.json @@ -0,0 +1,252 @@ +{ + "evidence": { + "properties": { + "clinical_significance": { + "enum": [ + "Sensitivity", + "Adverse Response", + "Resistance", + "Sensitivity/Response", + "Reduced Sensitivity", + "Positive", + "Negative", + "Poor Outcome", + "Better Outcome", + "Uncertain Significance", + "Pathogenic", + "Likely Pathogenic", + "N/A", + "Gain of Function", + "Loss of Function", + "Neomorphic", + "Dominant Negative", + null + ] + }, + "description": { + "type": "string" + }, + "disease": { + "oneOf": [ + { + "doid": { + "type": "string" + }, + "name": { + "type": "string" + }, + "type": "object" + }, + { + "type": "null" + } + ] + }, + "drug_interaction_type": { + "enum": [ + "Combination", + "Substitutes", + "Sequential", + null + ] + }, + "drugs": { + "items": { + "properties": { + "id": { + "type": "number" + }, + "name": { + "type": "string" + }, + "ncit_id": { + "type": [ + "string", + "null" + ] + }, + "pubchem_id": { + "type": [ + "string", + "null" + ] + } + }, + "required": [ + "name", + "ncit_id" + ], + "type": "object" + }, + "type": "array" + }, + "evidence_direction": { + "enum": [ + "Supports", + "N/A", + "Does Not Support", + null + ] + }, + "evidence_level": { + "type": "string" + }, + "evidence_type": { + "enum": [ + "Predictive", + "Diagnostic", + "Prognostic", + "Predisposing", + "Functional" + ] + }, + "id": { + "type": "number" + }, + "rating": { + "type": [ + "number", + "null" + ] + }, + "source": { + "properties": { + "citation_id": { + "type": "string" + }, + "name": { + "type": [ + "string", + "null" + ] + }, + "source_type": { + "enum": [ + "ASCO", + "PubMed" + ] + } + } + }, + "status": { + "type": "string" + }, + "variant_id": { + "type": "number" + } + }, + "type": "object" + }, + "variant": { + "properties": { + "civic_actionability_score": { + "type": "number" + }, + "coordinates": { + "properties": { + "chromosome": { + "type": [ + "string", + "null" + ] + }, + "chromosome2": { + "type": [ + "string", + "null" + ] + }, + "ensembl_version": { + "type": [ + "number", + "null" + ] + }, + "reference_bases": { + "type": [ + "string", + "null" + ] + }, + "reference_build": { + "type": [ + "string", + "null" + ] + }, + "representative_transcript": { + "type": [ + "string", + "null" + ] + }, + "representative_transcript2": { + "type": [ + "string", + "null" + ] + }, + "start": { + "type": [ + "number", + "null" + ] + }, + "start2": { + "type": [ + "number", + "null" + ] + }, + "stop": { + "type": [ + "number", + "null" + ] + }, + "stop2": { + "type": [ + "number", + "null" + ] + }, + "variant_bases": { + "type": [ + "string", + "null" + ] + } + }, + "type": "object" + }, + "description": { + "type": "string" + }, + "entrez_id": { + "type": "number" + }, + "entrez_name": { + "type": "string" + }, + "id": { + "type": "number" + }, + "name": { + "type": "string" + }, + "variant_types": { + "items": { + "name": { + "type": "string" + }, + "so_id": { + "type": "string" + }, + "type": "object" + }, + "type": "array" + } + }, + "type": "object" + } +} diff --git a/src/civic/variant.js b/src/civic/variant.js index 1584c933..27d556d1 100644 --- a/src/civic/variant.js +++ b/src/civic/variant.js @@ -10,6 +10,7 @@ const { const { logger } = require('../logging'); const _entrezGene = require('../entrez/gene'); const _snp = require('../entrez/snp'); +const { variant: variantSpec } = require('./specs.json'); const ajv = new Ajv(); @@ -23,42 +24,7 @@ const BASE_URL = 'https://civicdb.org/api'; /** * This is the expected format of the JSON body of a response to a variant request to the CIVIC API */ -const validateVariantSpec = ajv.compile({ - properties: { - civic_actionability_score: { type: 'number' }, - coordinates: { - properties: { - chromosome: { type: ['string', 'null'] }, - chromosome2: { type: ['string', 'null'] }, - ensembl_version: { type: ['number', 'null'] }, - reference_bases: { type: ['string', 'null'] }, - reference_build: { type: ['string', 'null'] }, - representative_transcript: { type: ['string', 'null'] }, - representative_transcript2: { type: ['string', 'null'] }, - start: { type: ['number', 'null'] }, - start2: { type: ['number', 'null'] }, - stop: { type: ['number', 'null'] }, - stop2: { type: ['number', 'null'] }, - variant_bases: { type: ['string', 'null'] }, - }, - type: 'object', - }, - description: { type: 'string' }, - entrez_id: { type: 'number' }, - entrez_name: { type: 'string' }, - id: { type: 'number' }, - name: { type: 'string' }, - variant_types: { - items: { - name: { type: 'string' }, - so_id: { type: 'string' }, - type: 'object', - }, - type: 'array', - }, - }, - type: 'object', -}); +const validateVariantSpec = ajv.compile(variantSpec); // based on discussion with cam here: https://www.bcgsc.ca/jira/browse/KBDEV-844 diff --git a/src/cli.js b/src/cli.js index 1c7b825a..cd233f8b 100644 --- a/src/cli.js +++ b/src/cli.js @@ -43,6 +43,11 @@ const createOptionsMenu = (opt = {}) => { default: `${process.cwd()}/errorLog-${new Date().valueOf()}`, help: 'prefix to use for any module specific log files that are written', }); + parser.add_argument('--maxRecords', { + default: process.env.GKB_MAX_RECORDS || null, + help: 'maximum number of records to load per loader, this is generally only used for testing the snakemake workflow and should not be used in production', + type: Number, + }); return parser; }; diff --git a/src/clinicaltrialsgov/index.js b/src/clinicaltrialsgov/index.js index b0f82487..7fa093ce 100644 --- a/src/clinicaltrialsgov/index.js +++ b/src/clinicaltrialsgov/index.js @@ -25,159 +25,15 @@ const { } = require('../graphkb'); const { logger } = require('../logging'); const { clinicalTrialsGov: SOURCE_DEFN } = require('../sources'); +const { api: apiSpec, rss: rssSpec } = require('./specs.json'); const BASE_URL = 'https://clinicaltrials.gov/ct2/show'; const RSS_URL = 'https://clinicaltrials.gov/ct2/results/rss.xml'; const CACHE = {}; const ajv = new Ajv(); - - -const singleItemArray = (spec = { type: 'string' }) => ({ - items: { ...spec }, maxItems: 1, minItems: 1, type: 'array', -}); - -const validateAPITrialRecord = ajv.compile({ - properties: { - clinical_study: { - properties: { - brief_title: singleItemArray({ type: 'string' }), - completion_date: singleItemArray({ - oneOf: [{ - properties: { - _: { type: 'string' }, - }, - required: ['_'], - type: 'object', - }, { type: 'string' }], - }), - condition: { - items: { type: 'string' }, - type: 'array', - }, - id_info: singleItemArray({ - properties: { nct_id: singleItemArray({ pattern: '^NCT\\d+$' }) }, - required: ['nct_id'], - type: 'object', - }), - intervention: { - items: { - properties: { - intervention_name: singleItemArray(), - intervention_type: singleItemArray(), - }, - required: [ - 'intervention_type', - 'intervention_name', - ], - type: 'object', - }, - type: 'array', - }, - last_update_posted: singleItemArray({ - properties: { _: { type: 'string' } }, - required: ['_'], - type: 'object', - }), - location: { - items: { - properties: { - facility: singleItemArray({ - properties: { - address: singleItemArray({ - properties: { - city: singleItemArray(), - country: singleItemArray(), - }, - required: ['city', 'country'], - type: 'object', - }), - }, - type: 'object', - }), - }, - required: ['facility'], - type: 'object', - }, - minItems: 1, - type: 'array', - }, - official_title: singleItemArray({ type: 'string' }), - overall_status: singleItemArray({ type: 'string' }), - phase: singleItemArray(), - required_header: singleItemArray({ - properties: { url: singleItemArray() }, - required: ['url'], - type: 'object', - }), - start_date: singleItemArray({ - oneOf: [ - { - properties: { - _: { type: 'string' }, - }, - required: ['_'], - type: 'object', - }, - { type: 'string' }, - ], - }), - }, - required: [ - 'id_info', - 'brief_title', - 'phase', - 'condition', - 'intervention', - 'last_update_posted', - 'required_header', - 'overall_status', - ], - type: 'object', - }, - }, - required: ['clinical_study'], - type: 'object', -}); - -// console.log(xml.rss.channel[0].item[0]); -const validateRssFeed = ajv.compile({ - properties: { - rss: { - properties: { - channel: singleItemArray({ - properties: { - item: { - items: { - properties: { - guid: singleItemArray({ - properties: { - _: { - pattern: '^NCT\\d+$', - type: 'string', - }, - }, - required: ['_'], - type: 'object', - }), - }, - required: ['guid'], - type: 'object', - }, - type: 'array', - }, - }, - required: ['item'], - type: 'object', - }), - }, - required: ['channel'], - type: 'object', - }, - }, - required: ['rss'], - type: 'object', -}); +const validateAPITrialRecord = ajv.compile(apiSpec); +const validateRssFeed = ajv.compile(rssSpec); const standardizeDate = (dateString) => { diff --git a/src/clinicaltrialsgov/specs.json b/src/clinicaltrialsgov/specs.json new file mode 100644 index 00000000..1b3e85cd --- /dev/null +++ b/src/clinicaltrialsgov/specs.json @@ -0,0 +1,304 @@ +{ + "api": { + "properties": { + "clinical_study": { + "properties": { + "brief_title": { + "items": { + "type": "string" + }, + "maxItems": 1, + "minItems": 1, + "type": "array" + }, + "completion_date": { + "items": { + "oneOf": [ + { + "properties": { + "_": { + "type": "string" + } + }, + "required": [ + "_" + ], + "type": "object" + }, + { + "type": "string" + } + ] + }, + "maxItems": 1, + "minItems": 1, + "type": "array" + }, + "condition": { + "items": { + "type": "string" + }, + "type": "array" + }, + "id_info": { + "items": { + "properties": { + "nct_id": { + "items": { + "pattern": "^NCT\\d+$" + }, + "maxItems": 1, + "minItems": 1, + "type": "array" + } + }, + "required": [ + "nct_id" + ], + "type": "object" + }, + "maxItems": 1, + "minItems": 1, + "type": "array" + }, + "intervention": { + "items": { + "properties": { + "intervention_name": { + "items": { + "type": "string" + }, + "maxItems": 1, + "minItems": 1, + "type": "array" + }, + "intervention_type": { + "items": { + "type": "string" + }, + "maxItems": 1, + "minItems": 1, + "type": "array" + } + }, + "required": [ + "intervention_type", + "intervention_name" + ], + "type": "object" + }, + "type": "array" + }, + "last_update_posted": { + "items": { + "properties": { + "_": { + "type": "string" + } + }, + "required": [ + "_" + ], + "type": "object" + }, + "maxItems": 1, + "minItems": 1, + "type": "array" + }, + "location": { + "items": { + "properties": { + "facility": { + "items": { + "properties": { + "address": { + "items": { + "properties": { + "city": { + "items": { + "type": "string" + }, + "maxItems": 1, + "minItems": 1, + "type": "array" + }, + "country": { + "items": { + "type": "string" + }, + "maxItems": 1, + "minItems": 1, + "type": "array" + } + }, + "required": [ + "city", + "country" + ], + "type": "object" + }, + "maxItems": 1, + "minItems": 1, + "type": "array" + } + }, + "type": "object" + }, + "maxItems": 1, + "minItems": 1, + "type": "array" + } + }, + "required": [ + "facility" + ], + "type": "object" + }, + "minItems": 1, + "type": "array" + }, + "official_title": { + "items": { + "type": "string" + }, + "maxItems": 1, + "minItems": 1, + "type": "array" + }, + "overall_status": { + "items": { + "type": "string" + }, + "maxItems": 1, + "minItems": 1, + "type": "array" + }, + "phase": { + "items": { + "type": "string" + }, + "maxItems": 1, + "minItems": 1, + "type": "array" + }, + "required_header": { + "items": { + "properties": { + "url": { + "items": { + "type": "string" + }, + "maxItems": 1, + "minItems": 1, + "type": "array" + } + }, + "required": [ + "url" + ], + "type": "object" + }, + "maxItems": 1, + "minItems": 1, + "type": "array" + }, + "start_date": { + "items": { + "oneOf": [ + { + "properties": { + "_": { + "type": "string" + } + }, + "required": [ + "_" + ], + "type": "object" + }, + { + "type": "string" + } + ] + }, + "maxItems": 1, + "minItems": 1, + "type": "array" + } + }, + "required": [ + "id_info", + "brief_title", + "phase", + "condition", + "intervention", + "last_update_posted", + "required_header", + "overall_status" + ], + "type": "object" + } + }, + "required": [ + "clinical_study" + ], + "type": "object" + }, + "rss": { + "properties": { + "rss": { + "properties": { + "channel": { + "items": { + "properties": { + "item": { + "items": { + "properties": { + "guid": { + "items": { + "properties": { + "_": { + "pattern": "^NCT\\d+$", + "type": "string" + } + }, + "required": [ + "_" + ], + "type": "object" + }, + "maxItems": 1, + "minItems": 1, + "type": "array" + } + }, + "required": [ + "guid" + ], + "type": "object" + }, + "type": "array" + } + }, + "required": [ + "item" + ], + "type": "object" + }, + "maxItems": 1, + "minItems": 1, + "type": "array" + } + }, + "required": [ + "channel" + ], + "type": "object" + } + }, + "required": [ + "rss" + ], + "type": "object" + } +} diff --git a/src/cosmic/fusions.js b/src/cosmic/fusions.js index 8ae2e071..2206482e 100644 --- a/src/cosmic/fusions.js +++ b/src/cosmic/fusions.js @@ -124,10 +124,10 @@ const processRecordGroup = async ({ * @param {ApiConnection} opt.conn the API connection object */ const uploadFile = async ({ - filename, conn, mappingFilename, + filename, conn, classification, }) => { const jsonList = await loadDelimToJson(filename); - const mapping = await loadClassifications(mappingFilename); + const mapping = await loadClassifications(classification); // get the dbID for the source const source = rid(await conn.addSource(SOURCE_DEFN)); const counts = { error: 0, skip: 0, success: 0 }; diff --git a/src/cosmic/resistance.js b/src/cosmic/resistance.js index 0054c4c4..5ff8e29e 100644 --- a/src/cosmic/resistance.js +++ b/src/cosmic/resistance.js @@ -77,9 +77,7 @@ const processVariants = async ({ conn, record, source }) => { try { // add the protein variant with its protein translation - const { - noFeatures, multiFeature, prefix, ...variant - } = variantParser(record.protein, false); + const variant = variantParser(record.protein, false).toJSON(); variant.type = rid(await conn.getVocabularyTerm(variant.type)); const reference1 = rid(await _ensembl.fetchAndLoadById( @@ -115,9 +113,7 @@ const processVariants = async ({ conn, record, source }) => { // create the cds variant if (record.cds && record.cds.trim()) { try { - const { - noFeatures, multiFeature, prefix, ...variant - } = variantParser(record.cds, false); + const variant = variantParser(record.cds, false).toJSON(); // get the ensembl transcript const reference1 = rid(await _ensembl.fetchAndLoadById( conn, @@ -147,9 +143,7 @@ const processVariants = async ({ conn, record, source }) => { // add the genomic representation if (record.genomic) { try { - const { - noFeatures, multiFeature, prefix, ...variant - } = variantParser(record.genomic, false); + const variant = variantParser(record.genomic, false).toJSON(); // get the chromosome const reference1 = rid(await conn.getUniqueRecordBy({ filters: { @@ -328,11 +322,11 @@ const loadClassifications = async (filename) => { * @param {ApiConnection} opt.conn the API connection object */ const uploadFile = async ({ - filename, mappingFilename, conn, errorLogPrefix, + filename, classification, conn, errorLogPrefix, maxRecords, }) => { const jsonList = await loadDelimToJson(filename); - const mapping = await loadClassifications(mappingFilename); + const mapping = await loadClassifications(classification); logger.info(`loaded ${jsonList.length} records`); // get the dbID for the source const source = rid(await conn.addSource(SOURCE_DEFN)); @@ -359,6 +353,10 @@ const uploadFile = async ({ await _pubmed.fetchAndLoadByIds(conn, jsonList.map(rec => rec[HEADER.pubmed]), { upsert: true }); for (let index = 0; index < jsonList.length; index++) { + if (maxRecords && index > maxRecords) { + logger.warn(`not loading all content due to max records limit (${maxRecords})`); + break; + } const sourceId = hashRecordToId(jsonList[index]); const record = { sourceId, ...convertRowFields(HEADER, jsonList[index]) }; logger.info(`processing (${index} / ${jsonList.length}) ${sourceId}`); diff --git a/src/dgidb/index.js b/src/dgidb/index.js index c22932cf..4ba114b6 100644 --- a/src/dgidb/index.js +++ b/src/dgidb/index.js @@ -8,22 +8,11 @@ const { checkSpec, request } = require('../util'); const { rid } = require('../graphkb'); const { dgidb: SOURCE_DEFN } = require('../sources'); +const spec = require('./spec.json'); const ajv = new Ajv(); -const recordSpec = ajv.compile({ - properties: { - concept_id: { pattern: '^chembl:CHEMBL\\d+$', type: 'string' }, - entrez_id: { min: 1, type: 'number' }, - id: { format: 'uuid', type: 'string' }, - interaction_direction: { items: { type: ['string', 'null'] }, type: 'array' }, - interaction_types: { items: { type: 'string' }, type: 'array' }, - score: { type: 'number' }, - sources: { items: { type: 'string' }, type: 'array' }, - }, - required: ['entrez_id', 'concept_id', 'interaction_types', 'id'], - type: 'object', -}); +const recordSpec = ajv.compile(spec); const BASE_URL = 'https://dgidb.org/api/v2'; diff --git a/src/dgidb/spec.json b/src/dgidb/spec.json new file mode 100644 index 00000000..aad7d801 --- /dev/null +++ b/src/dgidb/spec.json @@ -0,0 +1,47 @@ +{ + "properties": { + "concept_id": { + "pattern": "^chembl:CHEMBL\\d+$", + "type": "string" + }, + "entrez_id": { + "min": 1, + "type": "number" + }, + "id": { + "format": "uuid", + "type": "string" + }, + "interaction_direction": { + "items": { + "type": [ + "string", + "null" + ] + }, + "type": "array" + }, + "interaction_types": { + "items": { + "type": "string" + }, + "type": "array" + }, + "score": { + "type": "number" + }, + "sources": { + "items": { + "type": "string" + }, + "type": "array" + } + }, + "required": [ + "entrez_id", + "concept_id", + "interaction_types", + "id" + ], + "type": "object" +} diff --git a/src/diseaseOntology/index.js b/src/diseaseOntology/index.js index 872a195b..772bdddf 100644 --- a/src/diseaseOntology/index.js +++ b/src/diseaseOntology/index.js @@ -5,80 +5,18 @@ * @module importer/disease_ontology */ const Ajv = require('ajv'); +const fs = require('fs'); const { checkSpec } = require('../util'); const { rid, orderPreferredOntologyTerms, edgeExists } = require('../graphkb'); const { logger } = require('../logging'); const { diseaseOntology: SOURCE_DEFN, ncit: { name: ncitName } } = require('../sources'); - -const ajv = new Ajv(); +const { node: nodeSpecDefn, edge: edgeSpecDefn } = require('./specs.json'); const PREFIX_TO_STRIP = 'http://purl.obolibrary.org/obo/'; -const DOID_PATTERN = `^${PREFIX_TO_STRIP}DOID_\\d+$`; - -const nodeSpec = ajv.compile({ - properties: { - id: { pattern: DOID_PATTERN, type: 'string' }, - lbl: { type: 'string' }, - meta: { - properties: { - basicPropertyValues: { - items: { - properties: { - pred: { type: 'string' }, - val: { type: 'string' }, - }, - required: ['val', 'pred'], - type: 'object', - }, - type: 'array', - }, - definition: { - properties: { val: { type: 'string' } }, - required: ['val'], - type: 'object', - }, - deprecated: { type: 'boolean' }, - subsets: { - items: { - type: 'string', - }, - type: 'array', - }, - synonyms: { - items: { - properties: { val: { type: 'string' } }, - required: ['val'], - type: 'object', - }, - type: 'array', - }, - xrefs: { - items: { - properties: { val: { type: 'string' } }, - required: ['val'], - type: 'object', - }, - type: 'array', - }, - }, - type: 'object', - }, - }, - required: ['id', 'lbl'], - type: 'object', -}); - - -const edgeSpec = ajv.compile({ - properties: { - obj: { pattern: DOID_PATTERN, type: 'string' }, - pred: { type: 'string' }, - sub: { pattern: DOID_PATTERN, type: 'string' }, - }, - required: ['sub', 'pred', 'obj'], - type: 'object', -}); +const ajv = new Ajv(); +const nodeSpec = ajv.compile(nodeSpecDefn); +const edgeSpec = ajv.compile(edgeSpecDefn); const parseDoid = (ident) => { @@ -213,10 +151,12 @@ const loadEdges = async ({ * @param {string} opt.filename the path to the input JSON file * @param {ApiConnection} opt.conn the api connection object */ -const uploadFile = async ({ filename, conn, ignoreCache = false }) => { +const uploadFile = async ({ + filename, conn, ignoreCache = false, maxRecords, +}) => { // load the DOID JSON logger.info('loading external disease ontology data'); - const DOID = require(filename); // eslint-disable-line import/no-dynamic-require,global-require + const DOID = JSON.parse(fs.readFileSync(filename)); // build the disease ontology first const nodesByName = {}; // store by name @@ -275,6 +215,10 @@ const uploadFile = async ({ filename, conn, ignoreCache = false }) => { } for (let i = 0; i < DOID.graphs[0].nodes.length; i++) { + if (maxRecords && i > maxRecords) { + logger.warn(`not loading all content due to max records limit (${maxRecords})`); + break; + } const node = DOID.graphs[0].nodes[i]; logger.info(`processing ${node.id} (${i} / ${DOID.graphs[0].nodes.length})`); let row; diff --git a/src/diseaseOntology/specs.json b/src/diseaseOntology/specs.json new file mode 100644 index 00000000..f29f8ef6 --- /dev/null +++ b/src/diseaseOntology/specs.json @@ -0,0 +1,110 @@ +{ + "edge": { + "properties": { + "obj": { + "pattern": "^http://purl.obolibrary.org/obo/DOID_\\d+$", + "type": "string" + }, + "pred": { + "type": "string" + }, + "sub": { + "pattern": "^http://purl.obolibrary.org/obo/DOID_\\d+$", + "type": "string" + } + }, + "required": [ + "sub", + "pred", + "obj" + ], + "type": "object" + }, + "node": { + "properties": { + "id": { + "pattern": "^http://purl.obolibrary.org/obo/DOID_\\d+$", + "type": "string" + }, + "lbl": { + "type": "string" + }, + "meta": { + "properties": { + "basicPropertyValues": { + "items": { + "properties": { + "pred": { + "type": "string" + }, + "val": { + "type": "string" + } + }, + "required": [ + "val", + "pred" + ], + "type": "object" + }, + "type": "array" + }, + "definition": { + "properties": { + "val": { + "type": "string" + } + }, + "required": [ + "val" + ], + "type": "object" + }, + "deprecated": { + "type": "boolean" + }, + "subsets": { + "items": { + "type": "string" + }, + "type": "array" + }, + "synonyms": { + "items": { + "properties": { + "val": { + "type": "string" + } + }, + "required": [ + "val" + ], + "type": "object" + }, + "type": "array" + }, + "xrefs": { + "items": { + "properties": { + "val": { + "type": "string" + } + }, + "required": [ + "val" + ], + "type": "object" + }, + "type": "array" + } + }, + "type": "object" + } + }, + "required": [ + "id", + "lbl" + ], + "type": "object" + } +} diff --git a/src/docm/index.js b/src/docm/index.js index 0f32eff8..6c047646 100644 --- a/src/docm/index.js +++ b/src/docm/index.js @@ -16,72 +16,15 @@ const _pubmed = require('../entrez/pubmed'); const { logger } = require('../logging'); const _gene = require('../entrez/gene'); const { docm: SOURCE_DEFN } = require('../sources'); +const { variant: variantSpec, record: recordSpecDefn } = require('./specs.json'); -const ajv = new Ajv(); const BASE_URL = 'http://docm.info/api/v1/variants'; - -const variantSummarySpec = ajv.compile({ - properties: { - hgvs: { type: 'string' }, - }, - required: ['hgvs'], - type: 'object', -}); - - -const recordSpec = ajv.compile({ - properties: { - amino_acid: { pattern: '^p\\..*', type: 'string' }, - chromosome: { type: 'string' }, - gene: { type: 'string' }, - hgvs: { type: 'string' }, - // TODO: process drug interactions (large amount of free text currently) - meta: { - items: { - properties: { - 'Drug Interaction Data': { - properties: { - fields: { - items: [ - { enum: ['Therapeutic Context'], type: 'string' }, - { enum: ['Pathway'], type: 'string' }, - { enum: ['Effect'], type: 'string' }, - { enum: ['Association'], type: 'string' }, - { enum: ['Status'], type: 'string' }, - { enum: ['Evidence'], type: 'string' }, - { enum: ['Source'], type: 'string' }, - ], - type: 'array', - }, - rows: { - items: { - items: { type: ['string', 'null'] }, maxItems: 7, minItems: 7, type: 'array', - }, - type: 'array', - }, - }, - required: ['fields', 'rows'], - type: 'object', - }, - }, - required: ['Drug Interaction Data'], - type: 'object', - }, - type: 'array', - }, - reference: { pattern: '^([ATGC]*|-)$', type: 'string' }, - reference_version: { type: 'string' }, - start: { min: 1, type: 'number' }, - stop: { min: 1, type: 'number' }, - variant: { pattern: '^([ATGC]*|-)$', type: 'string' }, - variant_type: { enum: ['SNV', 'DEL', 'INS', 'DNV'], type: 'string' }, - }, - required: ['reference_version', 'hgvs', 'gene', 'reference', 'variant', 'start', 'stop', 'variant_type'], - type: 'object', -}); +const ajv = new Ajv(); +const variantSummarySpec = ajv.compile(variantSpec); +const recordSpec = ajv.compile(recordSpecDefn); /** @@ -301,16 +244,22 @@ const processRecord = async (opt) => { * @param {ApiConnection} opt.conn the api connection object for GraphKB * @param {string} [opt.url] the base url for the DOCM api */ -const upload = async (opt) => { - const { conn, errorLogPrefix } = opt; +const upload = async ({ + conn, errorLogPrefix, url = BASE_URL, maxRecords, +}) => { // load directly from their api: - logger.info(`loading: ${opt.url || BASE_URL}.json`); - const recordsList = await request({ + logger.info(`loading: ${url}.json`); + let recordsList = await request({ json: true, method: 'GET', - uri: `${BASE_URL}.json`, + uri: `${url}.json`, }); logger.info(`loaded ${recordsList.length} records`); + + if (maxRecords) { + logger.warn(`truncating records input, maxRecords=${maxRecords}`); + recordsList = recordsList.slice(0, maxRecords); + } // add the source node const source = rid(await conn.addSource(SOURCE_DEFN)); diff --git a/src/docm/specs.json b/src/docm/specs.json new file mode 100644 index 00000000..2fef4b86 --- /dev/null +++ b/src/docm/specs.json @@ -0,0 +1,150 @@ +{ + "record": { + "properties": { + "amino_acid": { + "pattern": "^p\\..*", + "type": "string" + }, + "chromosome": { + "type": "string" + }, + "gene": { + "type": "string" + }, + "hgvs": { + "type": "string" + }, + "meta": { + "items": { + "properties": { + "Drug Interaction Data": { + "properties": { + "fields": { + "items": [ + { + "enum": [ + "Therapeutic Context" + ], + "type": "string" + }, + { + "enum": [ + "Pathway" + ], + "type": "string" + }, + { + "enum": [ + "Effect" + ], + "type": "string" + }, + { + "enum": [ + "Association" + ], + "type": "string" + }, + { + "enum": [ + "Status" + ], + "type": "string" + }, + { + "enum": [ + "Evidence" + ], + "type": "string" + }, + { + "enum": [ + "Source" + ], + "type": "string" + } + ], + "type": "array" + }, + "rows": { + "items": { + "items": { + "type": [ + "string", + "null" + ] + }, + "maxItems": 7, + "minItems": 7, + "type": "array" + }, + "type": "array" + } + }, + "required": [ + "fields", + "rows" + ], + "type": "object" + } + }, + "required": [ + "Drug Interaction Data" + ], + "type": "object" + }, + "type": "array" + }, + "reference": { + "pattern": "^([ATGC]*|-)$", + "type": "string" + }, + "reference_version": { + "type": "string" + }, + "start": { + "min": 1, + "type": "number" + }, + "stop": { + "min": 1, + "type": "number" + }, + "variant": { + "pattern": "^([ATGC]*|-)$", + "type": "string" + }, + "variant_type": { + "enum": [ + "SNV", + "DEL", + "INS", + "DNV" + ], + "type": "string" + } + }, + "required": [ + "reference_version", + "hgvs", + "gene", + "reference", + "variant", + "start", + "stop", + "variant_type" + ], + "type": "object" + }, + "variant": { + "properties": { + "hgvs": { + "type": "string" + } + }, + "required": [ + "hgvs" + ], + "type": "object" + } +} diff --git a/src/drugbank/index.js b/src/drugbank/index.js index cf5fb773..d3a37904 100644 --- a/src/drugbank/index.js +++ b/src/drugbank/index.js @@ -13,9 +13,7 @@ const _hgnc = require('../hgnc'); const { logger } = require('../logging'); const _chembl = require('../chembl'); const { drugbank: SOURCE_DEFN, fdaSrs: { name: fdaName } } = require('../sources'); - - -const ajv = new Ajv(); +const spec = require('./spec.json'); // Lists most of the commonly required 'Tags' and Attributes @@ -27,127 +25,11 @@ const HEADER = { unii: 'unii', }; -const singleReqProp = (name, spec = {}) => ({ - oneOf: [{ maxLength: 0, type: 'string' }, { properties: { [name]: spec }, required: [name], type: ['object', 'null'] }], -}); - /** * This defines the expected format of the JSON post transform from xml */ -const validateDrugbankSpec = ajv.compile({ - properties: { - $: { - properties: { - updated: { type: 'string' }, - }, - required: ['updated'], - type: 'object', - }, - 'atc-codes': singleReqProp( - 'atc-code', singleReqProp( - 'level', { - items: { - properties: { - $: { - properties: { code: { type: 'string' } }, - required: ['code'], - type: 'object', - }, - $text: { type: 'string' }, - }, - required: ['$text', '$'], - type: 'object', - }, - type: 'array', - }, - ), - ), - 'calculated-properties': singleReqProp('property', { - items: { - properties: { - kind: { type: 'string' }, - type: { type: 'string' }, - }, - required: ['kind', 'value'], - type: 'object', - }, - type: 'array', - }), - categories: singleReqProp( - 'category', { - items: { properties: { category: { type: 'string' } }, required: ['category'], type: 'object' }, - type: 'array', - }, - ), - description: { type: ['string', 'null'] }, - 'drugbank-id': { - items: [{ - properties: { - $text: { pattern: '^DB\\d+$', type: 'string' }, - }, - type: 'object', - }], - minItems: 1, - type: 'array', - }, - 'external-identifiers': singleReqProp( - 'external-identifier', { - items: { - properties: { - identifier: { type: 'string' }, - resource: { type: 'string' }, - }, - required: ['resource', 'identifier'], - type: 'object', - }, - type: 'array', - }, - ), - 'mechanism-of-action': { type: ['string', 'null'] }, - name: { type: 'string' }, - products: singleReqProp('product', { - items: { - properties: { name: { type: 'string' } }, - required: ['name'], - type: 'object', - }, - type: 'array', - }), - targets: singleReqProp( - 'target', { - properties: { - actions: singleReqProp('action', { items: { type: 'string' }, type: 'array' }), - polypeptide: { - items: { - properties: { - 'external-identifiers': singleReqProp( - 'external-identifier', { - items: { - properties: { - identifier: { type: 'string' }, - resource: { type: 'string' }, - }, - required: ['resource', 'identifier'], - type: 'object', - }, - type: 'array', - }, - ), - }, - type: 'object', - }, - type: 'array', - }, - }, - required: ['actions'], - type: 'object', - }, - ), - unii: { type: ['string', 'null'] }, - }, - required: ['drugbank-id', 'name', '$'], - type: 'object', -}); +const ajv = new Ajv(); +const validateDrugbankSpec = ajv.compile(spec); const getDrugBankId = record => record['drugbank-id'][0].$text; @@ -374,7 +256,7 @@ const processRecord = async ({ * @param {string} opt.filename the path to the input XML file * @param {ApiConnection} opt.conn the api connection object */ -const uploadFile = async ({ filename, conn }) => { +const uploadFile = async ({ filename, conn, maxRecords }) => { logger.info('Loading the external drugbank data'); const source = await conn.addSource(SOURCE_DEFN); @@ -416,6 +298,14 @@ const uploadFile = async ({ filename, conn }) => { ATC, conn, drug: item, sources: { current: source, fda: fdaSource }, }).then(() => { counts.success++; + + if (maxRecords && (counts.success + counts.error + counts.skipped) >= maxRecords) { + logger.warn(`not loading all content due to max records limit (${maxRecords})`); + logger.info('Parsing stream complete'); + stream.close(); + resolve(); + } + xml.resume(); }).catch((err) => { let label; diff --git a/src/drugbank/spec.json b/src/drugbank/spec.json new file mode 100644 index 00000000..eae389f7 --- /dev/null +++ b/src/drugbank/spec.json @@ -0,0 +1,358 @@ +{ + "properties": { + "$": { + "properties": { + "updated": { + "type": "string" + } + }, + "required": [ + "updated" + ], + "type": "object" + }, + "atc-codes": { + "oneOf": [ + { + "maxLength": 0, + "type": "string" + }, + { + "properties": { + "atc-code": { + "oneOf": [ + { + "maxLength": 0, + "type": "string" + }, + { + "properties": { + "level": { + "items": { + "properties": { + "$": { + "properties": { + "code": { + "type": "string" + } + }, + "required": [ + "code" + ], + "type": "object" + }, + "$text": { + "type": "string" + } + }, + "required": [ + "$text", + "$" + ], + "type": "object" + }, + "type": "array" + } + }, + "required": [ + "level" + ], + "type": [ + "object", + "null" + ] + } + ] + } + }, + "required": [ + "atc-code" + ], + "type": [ + "object", + "null" + ] + } + ] + }, + "calculated-properties": { + "oneOf": [ + { + "maxLength": 0, + "type": "string" + }, + { + "properties": { + "property": { + "items": { + "properties": { + "kind": { + "type": "string" + }, + "type": { + "type": "string" + } + }, + "required": [ + "kind", + "value" + ], + "type": "object" + }, + "type": "array" + } + }, + "required": [ + "property" + ], + "type": [ + "object", + "null" + ] + } + ] + }, + "categories": { + "oneOf": [ + { + "maxLength": 0, + "type": "string" + }, + { + "properties": { + "category": { + "items": { + "properties": { + "category": { + "type": "string" + } + }, + "required": [ + "category" + ], + "type": "object" + }, + "type": "array" + } + }, + "required": [ + "category" + ], + "type": [ + "object", + "null" + ] + } + ] + }, + "description": { + "type": [ + "string", + "null" + ] + }, + "drugbank-id": { + "items": [ + { + "properties": { + "$text": { + "pattern": "^DB\\d+$", + "type": "string" + } + }, + "type": "object" + } + ], + "minItems": 1, + "type": "array" + }, + "external-identifiers": { + "oneOf": [ + { + "maxLength": 0, + "type": "string" + }, + { + "properties": { + "external-identifier": { + "items": { + "properties": { + "identifier": { + "type": "string" + }, + "resource": { + "type": "string" + } + }, + "required": [ + "resource", + "identifier" + ], + "type": "object" + }, + "type": "array" + } + }, + "required": [ + "external-identifier" + ], + "type": [ + "object", + "null" + ] + } + ] + }, + "mechanism-of-action": { + "type": [ + "string", + "null" + ] + }, + "name": { + "type": "string" + }, + "products": { + "oneOf": [ + { + "maxLength": 0, + "type": "string" + }, + { + "properties": { + "product": { + "items": { + "properties": { + "name": { + "type": "string" + } + }, + "required": [ + "name" + ], + "type": "object" + }, + "type": "array" + } + }, + "required": [ + "product" + ], + "type": [ + "object", + "null" + ] + } + ] + }, + "targets": { + "oneOf": [ + { + "maxLength": 0, + "type": "string" + }, + { + "properties": { + "target": { + "properties": { + "actions": { + "oneOf": [ + { + "maxLength": 0, + "type": "string" + }, + { + "properties": { + "action": { + "items": { + "type": "string" + }, + "type": "array" + } + }, + "required": [ + "action" + ], + "type": [ + "object", + "null" + ] + } + ] + }, + "polypeptide": { + "items": { + "properties": { + "external-identifiers": { + "oneOf": [ + { + "maxLength": 0, + "type": "string" + }, + { + "properties": { + "external-identifier": { + "items": { + "properties": { + "identifier": { + "type": "string" + }, + "resource": { + "type": "string" + } + }, + "required": [ + "resource", + "identifier" + ], + "type": "object" + }, + "type": "array" + } + }, + "required": [ + "external-identifier" + ], + "type": [ + "object", + "null" + ] + } + ] + } + }, + "type": "object" + }, + "type": "array" + } + }, + "required": [ + "actions" + ], + "type": "object" + } + }, + "required": [ + "target" + ], + "type": [ + "object", + "null" + ] + } + ] + }, + "unii": { + "type": [ + "string", + "null" + ] + } + }, + "required": [ + "drugbank-id", + "name", + "$" + ], + "type": "object" +} diff --git a/src/fdaSrs/index.js b/src/fdaSrs/index.js index d2f17f01..64f41d77 100644 --- a/src/fdaSrs/index.js +++ b/src/fdaSrs/index.js @@ -21,8 +21,7 @@ const HEADER = { * @param {string} opt.filename the path to the input file * @param {ApiConnection} opt.conn the api connection object */ -const uploadFile = async (opt) => { - const { filename, conn: graphkbConn } = opt; +const uploadFile = async ({ filename, conn: graphkbConn, maxRecords }) => { const jsonList = await loadDelimToJson(filename); const source = await graphkbConn.addSource(SOURCE_DEFN); @@ -42,6 +41,10 @@ const uploadFile = async (opt) => { const intervalSize = 1000; for (let i = 0; i < jsonList.length; i++) { + if (maxRecords && i > maxRecords) { + logger.warn(`not loading all content due to max records limit (${maxRecords})`); + break; + } const { id, ncit, name, } = convertRowFields(HEADER, jsonList[i]); diff --git a/src/hgnc/index.js b/src/hgnc/index.js index 2d30dee5..71ccd42d 100644 --- a/src/hgnc/index.js +++ b/src/hgnc/index.js @@ -4,6 +4,7 @@ const Ajv = require('ajv'); const _ = require('lodash'); +const fs = require('fs'); const { checkSpec, request } = require('../util'); const { rid, orderPreferredOntologyTerms, convertRecordToQueryFilters, @@ -260,7 +261,7 @@ const uploadFile = async (opt) => { logger.info('loading the external HGNC data'); const { filename, conn } = opt; logger.info(`loading: ${filename}`); - const hgncContent = require(filename); // eslint-disable-line import/no-dynamic-require,global-require + const hgncContent = JSON.parse(fs.readFileSync(filename)); const genes = hgncContent.response.docs; const hgnc = await conn.addSource(SOURCE_DEFN); let ensembl; diff --git a/src/moa/README.md b/src/moa/README.md index 686d1d80..5d82fe94 100644 --- a/src/moa/README.md +++ b/src/moa/README.md @@ -9,7 +9,7 @@ node bin/load.js api moa ## Loading Assumptions - A specific drug ontology is not used/given and therefore drugs are matched by name -- A specific gene ontology is not specified and therefore we default to using Entrez genes as they are popular amongst other knowlege bases +- A specific gene ontology is not specified and therefore we default to using Entrez genes as they are popular amongst other knowledge bases - When given, diseases are preferentially matched to their OncoTree term by both name and code. This falls back to matching by name when the oncotree term/code is not given ### Relevance Mapping diff --git a/src/ncit/index.js b/src/ncit/index.js index 1452be06..eff0c175 100644 --- a/src/ncit/index.js +++ b/src/ncit/index.js @@ -195,7 +195,9 @@ const cleanRawRow = (rawRow) => { * @param {string} opt.filename the path to the input OWL file * @param {ApiRequst} opt.conn the API connection object */ -const uploadFile = async ({ filename, conn, ignoreCache = false }) => { +const uploadFile = async ({ + filename, conn, ignoreCache = false, maxRecords, +}) => { logger.info('Loading external NCIT data'); logger.info(`loading: ${filename}`); const rawRows = await loadDelimToJson(filename, { @@ -234,6 +236,11 @@ const uploadFile = async ({ filename, conn, ignoreCache = false }) => { const erroredSourceIds = new Set(); for (const raw of rawRows) { + if (maxRecords && rows.length > maxRecords) { + logger.warn(`not loading all content due to max records limit (${maxRecords})`); + break; + } + try { const row = cleanRawRow(raw); diff --git a/src/refseq/index.js b/src/refseq/index.js index 4669c9fe..89ce26ef 100644 --- a/src/refseq/index.js +++ b/src/refseq/index.js @@ -19,8 +19,7 @@ const { refseq: SOURCE_DEFN } = require('../sources'); * @param {string} opt.filename path to the tab delimited file * @param {ApiConnection} opt.conn the api connection object */ -const uploadFile = async (opt) => { - const { filename, conn } = opt; +const uploadFile = async ({ filename, conn, maxRecords }) => { const json = await loadDelimToJson(filename); const source = await conn.addRecord({ @@ -36,6 +35,11 @@ const uploadFile = async (opt) => { await _entrez.fetchAndLoadByIds(conn, json.map(rec => rec.GeneID)); for (let i = 0; i < json.length; i++) { + if (maxRecords && i > maxRecords) { + logger.warn(`not loading all content due to max records limit (${maxRecords})`); + break; + } + try { const { RNA, GeneID, Protein } = json[i]; logger.info(`processing (${i} / ${json.length}) ${RNA}`); diff --git a/src/uberon/index.js b/src/uberon/index.js index 2e6767a1..fd1342d6 100644 --- a/src/uberon/index.js +++ b/src/uberon/index.js @@ -64,7 +64,7 @@ const parseSubsetName = (url) => { * @param {string} opt.filename the path to the input OWL file * @param {ApiConnection} opt.conn the API connection object */ -const uploadFile = async ({ filename, conn }) => { +const uploadFile = async ({ filename, conn, maxRecords }) => { logger.info('Loading the external uberon data'); logger.info(`reading: ${filename}`); const content = fs.readFileSync(filename).toString(); @@ -92,10 +92,16 @@ const uploadFile = async ({ filename, conn }) => { const ncitMissingRecords = new Set(); logger.info(`Adding the uberon ${Object.keys(nodesByCode).length} entity nodes`); + let count = 0; + for (const node of Object.values(nodesByCode)) { if (!node[PREDICATES.LABEL] || !node.code) { continue; } + if (maxRecords && count > maxRecords) { + logger.warn(`not loading all content due to max records limit (${maxRecords})`); + break; + } const body = { name: node[PREDICATES.LABEL][0], source: rid(source), @@ -136,6 +142,7 @@ const uploadFile = async ({ filename, conn }) => { target: 'AnatomicalEntity', }); records[dbEntry.sourceId] = dbEntry; + count++; } logger.info(`Adding the ${subclassEdges.length} subclassof relationships`);