Merge pull request #63 from bcgsc/feature/node-16
Feature/node 16
creisle authored Jan 25, 2022
2 parents 2c0339a + b2bbb40 commit 38bf3ef
Showing 30 changed files with 1,470 additions and 626 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/npm-test.yml
@@ -9,7 +9,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
- node: ['12', '14']
+ node: ['12', '14', '16']
name: node-${{ matrix.node }}
steps:
- uses: actions/checkout@v2
@@ -25,7 +25,7 @@ jobs:
with:
github_token: ${{ secrets.GITHUB_TOKEN }}
files: coverage/junit.xml
- if: matrix.node == 10
+ if: matrix.node == 14
docker:
runs-on: ubuntu-latest
name: docker build
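
The matrix now tests Node 12, 14 and 16, and the coverage upload is gated on a version that actually exists in the matrix (the old matrix.node == 10 guard could never match, so that step was silently skipped). A rough way to mirror the matrix locally, assuming nvm is installed and the tests run via npm test:

    # run the suite under each Node version in the CI matrix
    for v in 12 14 16; do
        nvm install "$v" && nvm use "$v"
        npm ci       # clean install against the lockfile
        npm test
    done
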
78 changes: 42 additions & 36 deletions Snakefile
@@ -1,7 +1,6 @@
import os
from textwrap import dedent


DATA_DIR = 'snakemake_data'
LOGS_DIR = 'snakemake_logs'

@@ -18,6 +17,7 @@ COSMIC_EMAIL = config.get('cosmic_email')
COSMIC_PASSWORD = config.get('cosmic_password')
USE_COSMIC = COSMIC_EMAIL or COSMIC_PASSWORD
BACKFILL_TRIALS = config.get('trials')
+ USE_FDA_UNII = config.get('fda') # due to the non-scriptable download, making FDA optional
GITHUB_DATA = 'https://raw.githubusercontent.com/bcgsc/pori_graphkb_loader/develop/data'
CONTAINER = 'docker://bcgsc/pori-graphkb-loader:latest'
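
These flags arrive through Snakemake's --config mechanism, so the optional loaders can be toggled from the command line. A sketch of plausible invocations (the key names follow the config.get() calls above; the values are illustrative):

    # include the optional FDA-UNII download and load rules
    snakemake --config fda=1

    # backfill trials and enable the COSMIC loaders (credentials illustrative)
    snakemake --config trials=1 cosmic_email=user@example.com cosmic_password=secret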

@@ -28,10 +28,10 @@ rule all:
f'{DATA_DIR}/PMC4468049.COMPLETE',
f'{DATA_DIR}/PMC4232638.COMPLETE',
f'{DATA_DIR}/uberon.COMPLETE',
- f'{DATA_DIR}/ncitFdaXref.COMPLETE',
f'{DATA_DIR}/fdaApprovals.COMPLETE',
f'{DATA_DIR}/cancerhotspots.COMPLETE',
f'{DATA_DIR}/moa.COMPLETE',
+ *([f'{DATA_DIR}/ncitFdaXref.COMPLETE'] if USE_FDA_UNII else []),
*([f'{DATA_DIR}/clinicaltrialsgov.COMPLETE'] if BACKFILL_TRIALS else []),
*([f'{DATA_DIR}/cosmic_resistance.COMPLETE', f'{DATA_DIR}/cosmic_fusions.COMPLETE'] if USE_COSMIC else [])

@@ -46,11 +46,12 @@ rule download_ncit:
rm -rf __MACOSX''')


- rule download_ncit_fda:
-     output: f'{DATA_DIR}/ncit/FDA-UNII_NCIt_Subsets.txt'
-     shell: dedent(f'''\
-         cd {DATA_DIR}/ncit
-         wget https://evs.nci.nih.gov/ftp1/FDA/UNII/FDA-UNII_NCIt_Subsets.txt''')
+ if USE_FDA_UNII:
+     rule download_ncit_fda:
+         output: f'{DATA_DIR}/ncit/FDA-UNII_NCIt_Subsets.txt'
+         shell: dedent(f'''\
+             cd {DATA_DIR}/ncit
+             wget https://evs.nci.nih.gov/ftp1/FDA/UNII/FDA-UNII_NCIt_Subsets.txt''')
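
A Snakefile is evaluated as Python, so a module-level if statement decides whether a rule is registered at all; any rules.<name> reference elsewhere then needs the same guard (as done for load_drugbank further down). A minimal sketch of the idiom, with hypothetical rule names:

    ENABLE_EXTRA = config.get('extra')

    if ENABLE_EXTRA:
        rule extra_step:
            output: 'extra.txt'
            shell: 'touch {output}'

    rule final:
        # rules.extra_step only exists when the rule above was defined
        input: rules.extra_step.output if ENABLE_EXTRA else []
        output: 'final.txt'
        shell: 'touch {output}'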


rule download_ensembl:
@@ -62,16 +63,17 @@ rule download_ensembl:
''')


- rule download_fda_srs:
-     output: f'{DATA_DIR}/fda/UNII_Records.txt'
-     shell: dedent(f'''\
-         cd {DATA_DIR}/fda
-         wget https://fdasis.nlm.nih.gov/srs/download/srs/UNII_Data.zip
-         unzip UNII_Data.zip
-         rm UNII_Data.zip
-         mv UNII*.txt UNII_Records.txt
-         ''')
+ if USE_FDA_UNII:
+     rule download_fda_srs:
+         output: f'{DATA_DIR}/fda/UNII_Records.txt'
+         shell: dedent(f'''\
+             cd {DATA_DIR}/fda
+             wget https://fdasis.nlm.nih.gov/srs/download/srs/UNII_Data.zip
+             unzip UNII_Data.zip
+             rm UNII_Data.zip
+             mv UNII*.txt UNII_Records.txt
+             ''')


rule download_refseq:
@@ -135,8 +137,8 @@ rule download_cgi:
output: f'{DATA_DIR}/cgi/cgi_biomarkers_per_variant.tsv'
shell: dedent(f'''\
cd {DATA_DIR}/cgi
- wget https://www.cancergenomeinterpreter.org/data/cgi_biomarkers_latest.zip
- unzip cgi_biomarkers_latest.zip
+ wget https://www.cancergenomeinterpreter.org/data/cgi_biomarkers_20180117.zip
+ unzip cgi_biomarkers_20180117.zip
''')


@@ -217,23 +219,24 @@ rule load_ncit:
shell: 'node bin/load.js file ncit {input.data} &> {log}; cp {log} {output}'


- rule load_fda_srs:
-     input: expand(rules.load_local.output, local=['vocab']),
-         data=rules.download_fda_srs.output
-     container: CONTAINER
-     log: f'{LOGS_DIR}/fdaSrs.logs.txt'
-     output: f'{DATA_DIR}/fdaSrs.COMPLETE'
-     shell: 'node bin/load.js file fdaSrs {input.data} &> {log}; cp {log} {output}'
+ if USE_FDA_UNII:
+     rule load_fda_srs:
+         input: expand(rules.load_local.output, local=['vocab']),
+             data=f'{DATA_DIR}/fda/UNII_Records.txt'
+         container: CONTAINER
+         log: f'{LOGS_DIR}/fdaSrs.logs.txt'
+         output: f'{DATA_DIR}/fdaSrs.COMPLETE'
+         shell: 'node bin/load.js file fdaSrs {input.data} &> {log}; cp {log} {output}'


- rule load_ncit_fda:
-     input: rules.load_ncit.output,
-         rules.load_fda_srs.output,
-         data=rules.download_ncit_fda.output
-     container: CONTAINER
-     log: f'{LOGS_DIR}/ncitFdaXref.logs.txt'
-     output: f'{DATA_DIR}/ncitFdaXref.COMPLETE'
-     shell: 'node bin/load.js file ncitFdaXref {input.data} &> {log}; cp {log} {output}'
+     rule load_ncit_fda:
+         input: rules.load_ncit.output,
+             rules.load_fda_srs.output,
+             data=rules.download_ncit_fda.output
+         container: CONTAINER
+         log: f'{LOGS_DIR}/ncitFdaXref.logs.txt'
+         output: f'{DATA_DIR}/ncitFdaXref.COMPLETE'
+         shell: 'node bin/load.js file ncitFdaXref {input.data} &> {log}; cp {log} {output}'


rule load_refseq:
@@ -273,7 +276,7 @@ rule load_uberon:


rule load_drugbank:
- input: rules.load_fda_srs.output,
+ input: rules.load_fda_srs.output if USE_FDA_UNII else [],
data=rules.download_drugbank.output
container: CONTAINER
log: f'{LOGS_DIR}/drugbank.logs.txt'
@@ -298,7 +301,9 @@ rule load_dgidb:


def get_drug_inputs(wildcards):
- inputs = [*rules.load_fda_srs.output, *rules.load_ncit.output]
+ inputs = [*rules.load_ncit.output]
+ if USE_FDA_UNII:
+     inputs.extend(rules.load_fda_srs.output)
if USE_DRUGBANK:
inputs.append(*rules.load_drugbank.output)
@@ -423,7 +428,8 @@ rule load_cosmic_fusions:


rule load_moa:
- input: rules.load_oncotree.output
+ input: rules.load_oncotree.output,
+     expand(rules.load_local.output, local=['vocab', 'signatures', 'chromosomes', 'evidenceLevels', 'aacr', 'asco'])
container: CONTAINER
log: f'{LOGS_DIR}/load_moa.logs.txt'
output: f'{DATA_DIR}/moa.COMPLETE'
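
Most loader rules in this Snakefile share the same sentinel-file convention: the load command writes a log, then copies it to a .COMPLETE marker that rule all (and downstream rules) depend on. A minimal sketch of the pattern, with a hypothetical loader name:

    rule load_example:  # hypothetical, not part of the diff
        input: f'{DATA_DIR}/example/data.txt'
        container: CONTAINER
        log: f'{LOGS_DIR}/example.logs.txt'
        output: f'{DATA_DIR}/example.COMPLETE'
        shell: 'node bin/load.js file example {input} &> {log}; cp {log} {output}'
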
2 changes: 1 addition & 1 deletion bin/load.js
@@ -141,7 +141,7 @@ if (input) {
loaderOptions.filename = input;

if (options.module === 'cosmic') {
- loaderOptions.mappingFilename = options.mappingFilename;
+ loaderOptions.classification = options.classification;
}
}
}
26 changes: 3 additions & 23 deletions src/asco/index.js
@@ -9,34 +9,14 @@ const { requestWithRetry, checkSpec } = require('../util');
const { asco: SOURCE_DEFN } = require('../sources');
const { logger } = require('../logging');
const { rid } = require('../graphkb');
+ const { main: mainSpec, details: detailsSpec } = require('./specs.json');

const CACHE = {};


const ajv = new Ajv();

- const validateMainSpec = ajv.compile({
-     properties: {
-         AbstID: { type: 'string' },
-         AuthorString: { type: 'string' },
-         Meeting: { type: 'string' },
-         Title: { type: 'string' },
-         Year: { pattern: '\\d+', type: 'string' },
-         id: { type: 'string' },
-         url: { format: 'url', type: 'string' },
-     },
-     required: ['AbstID', 'Title', 'Meeting', 'url', 'id'],
-     type: 'object',
- });
-
-
- const validateDetailsSpec = ajv.compile({
-     properties: {
-         DOI: { type: 'string' },
-         SiteCitation: { type: 'string' },
-     },
-     type: 'object',
- });
+ const validateMainSpec = ajv.compile(mainSpec);
+ const validateDetailsSpec = ajv.compile(detailsSpec);


/**
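
The change above moves the inline schemas into src/asco/specs.json (next file) without changing validation behaviour: Ajv compiles a plain schema object the same way whether it is written inline or required from a JSON file. A minimal sketch of the resulting pattern, assuming the Ajv 6-style API already used in this module; the sample record is hypothetical:

    const Ajv = require('ajv');

    const { main: mainSpec } = require('./specs.json');

    const ajv = new Ajv();
    const validateMainSpec = ajv.compile(mainSpec);

    // hypothetical abstract record, shaped to satisfy the schema's required fields
    const record = {
        AbstID: '12345',
        Meeting: 'ASCO Annual Meeting',
        Title: 'An example abstract title',
        Year: '2021',
        id: 'abc123',
        url: 'https://example.com/abstracts/12345',
    };

    if (!validateMainSpec(record)) {
        console.error(validateMainSpec.errors); // Ajv attaches failures to the validator
    }
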
48 changes: 48 additions & 0 deletions src/asco/specs.json
@@ -0,0 +1,48 @@
{
    "details": {
        "properties": {
            "DOI": {
                "type": "string"
            },
            "SiteCitation": {
                "type": "string"
            }
        },
        "type": "object"
    },
    "main": {
        "properties": {
            "AbstID": {
                "type": "string"
            },
            "AuthorString": {
                "type": "string"
            },
            "Meeting": {
                "type": "string"
            },
            "Title": {
                "type": "string"
            },
            "Year": {
                "pattern": "\\d+",
                "type": "string"
            },
            "id": {
                "type": "string"
            },
            "url": {
                "format": "url",
                "type": "string"
            }
        },
        "required": [
            "AbstID",
            "Title",
            "Meeting",
            "url",
            "id"
        ],
        "type": "object"
    }
}
8 changes: 7 additions & 1 deletion src/cancergenomeinterpreter/index.js
@@ -480,7 +480,9 @@ const processRow = async ({ row, source, conn }) => {
};


- const uploadFile = async ({ conn, filename, errorLogPrefix }) => {
+ const uploadFile = async ({
+     conn, filename, errorLogPrefix, maxRecords,
+ }) => {
const rows = await loadDelimToJson(filename);
logger.info('creating the source record');
const source = rid(await conn.addSource(SOURCE_DEFN));
@@ -496,6 +498,10 @@ const uploadFile = async ({ conn, filename, errorLogPrefix }) => {
logger.info(`loading ${rows.length} rows`);

for (let index = 0; index < rows.length; index++) {
+ if (maxRecords && index > maxRecords) {
+     logger.warn(`not loading all content due to max records limit (${maxRecords})`);
+     break;
+ }
const rawRow = rows[index];
const sourceId = hashRecordToId(rawRow);
logger.info(`processing: ${sourceId} (${index} / ${rows.length})`);
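
The new maxRecords option lets callers cap a load for quick smoke tests. As written, the guard breaks once index exceeds maxRecords, so up to maxRecords + 1 rows are processed before the warning fires. A sketch of a capped invocation; the argument values are hypothetical:

    // conn is assumed to be an authenticated GraphKB API connection
    await uploadFile({
        conn,
        errorLogPrefix: 'errorLog',
        filename: 'cgi_biomarkers_per_variant.tsv',
        maxRecords: 100, // stop early instead of loading every row
    });
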
19 changes: 2 additions & 17 deletions src/chembl/index.js
@@ -11,26 +11,11 @@ const {
} = require('../graphkb');
const { logger } = require('../logging');
const { chembl: SOURCE_DEFN } = require('../sources');
+ const spec = require('./spec.json');

const ajv = new Ajv();

- const recordSpec = ajv.compile({
-     properties: {
-         molecule_chembl_id: { pattern: '^CHEMBL\\d+$', type: 'string' },
-         molecule_properties: {
-             oneOf: [{
-                 properties: {
-                     full_molformula: { type: 'string' },
-                 },
-                 type: 'object',
-             }, { type: 'null' }],
-         },
-         pref_name: { type: ['string', 'null'] },
-         usan_stem_definition: { type: ['string', 'null'] },
-     },
-     required: ['molecule_chembl_id'],
-     type: 'object',
- });
+ const recordSpec = ajv.compile(spec);


const API = 'https://www.ebi.ac.uk/chembl/api/data/molecule';
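
As with the ASCO loader, the ChEMBL schema now lives in a standalone JSON file (spec.json, below) and is compiled once at module load. A quick sketch of what the compiled validator accepts; the record is illustrative, using the real ChEMBL identifier for aspirin:

    // minimal record satisfying spec.json: only molecule_chembl_id is required
    const record = {
        molecule_chembl_id: 'CHEMBL25', // aspirin
        molecule_properties: { full_molformula: 'C9H8O4' },
        pref_name: 'ASPIRIN',
        usan_stem_definition: null,
    };
    recordSpec(record); // true

    recordSpec({ molecule_chembl_id: 'aspirin' }); // false: fails the ^CHEMBL\d+$ pattern
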
39 changes: 39 additions & 0 deletions src/chembl/spec.json
@@ -0,0 +1,39 @@
{
    "properties": {
        "molecule_chembl_id": {
            "pattern": "^CHEMBL\\d+$",
            "type": "string"
        },
        "molecule_properties": {
            "oneOf": [
                {
                    "properties": {
                        "full_molformula": {
                            "type": "string"
                        }
                    },
                    "type": "object"
                },
                {
                    "type": "null"
                }
            ]
        },
        "pref_name": {
            "type": [
                "string",
                "null"
            ]
        },
        "usan_stem_definition": {
            "type": [
                "string",
                "null"
            ]
        }
    },
    "required": [
        "molecule_chembl_id"
    ],
    "type": "object"
}
