From f4acbcc74fd239aca5776de2c41174558e4e9e31 Mon Sep 17 00:00:00 2001 From: sshugsc Date: Mon, 4 Mar 2024 09:35:54 -0800 Subject: [PATCH 1/3] remove refseq from ensembl loader --- src/ensembl/README.md | 4 +++- src/ensembl/index.js | 54 +++---------------------------------------- 2 files changed, 6 insertions(+), 52 deletions(-) diff --git a/src/ensembl/README.md b/src/ensembl/README.md index f3e2cc47..d2302984 100644 --- a/src/ensembl/README.md +++ b/src/ensembl/README.md @@ -4,10 +4,12 @@ This loader loads both a BioMart export TSV file or individual records by ID. It to batch load Ensembl data but you can do so if you would like it to appear for users who will use the auto-complete adding variants through GraphKB client +Link for archived ensembl versions: https://useast.ensembl.org/info/website/archives/index.html + First download the batch export from BioMart ```bash -query_string='' +query_string='' wget -O biomart_export.tsv "http://www.ensembl.org/biomart/martservice?query=$query_string" ``` diff --git a/src/ensembl/index.js b/src/ensembl/index.js index 87e0b59c..5009dc32 100644 --- a/src/ensembl/index.js +++ b/src/ensembl/index.js @@ -6,13 +6,12 @@ const { loadDelimToJson, requestWithRetry, convertRowFields } = require('../util'); const { - rid, orderPreferredOntologyTerms, generateCacheKey, + rid, generateCacheKey, } = require('../graphkb'); const { logger } = require('../logging'); const _hgnc = require('../hgnc'); const _entrez = require('../entrez/gene'); -const _refseq = require('../entrez/refseq'); -const { ensembl: SOURCE_DEFN, refseq: refseqSourceDefn } = require('../sources'); +const { ensembl: SOURCE_DEFN } = require('../sources'); const BASE_URL = 'http://rest.ensembl.org'; @@ -182,7 +181,6 @@ const uploadFile = async (opt) => { geneIdVersion: 'Gene stable ID version', hgncId: 'HGNC ID', proteinIdVersion: 'Protein stable ID version', - refseqId: 'RefSeq mRNA ID', transcriptIdVersion: 'Transcript stable ID version', }; const { filename, conn } = opt; @@ -198,12 +196,9 @@ const uploadFile = async (opt) => { const source = await conn.addSource(SOURCE_DEFN); - const refseqSource = await conn.addSource(refseqSourceDefn); - const visited = {}; // cache genes to speed up adding records const hgncMissingRecords = new Set(); - const refseqMissingRecords = new Set(); logger.info('pre-load the entrez cache to avoid unecessary requests'); await _entrez.preLoadCache(conn); @@ -263,18 +258,6 @@ const uploadFile = async (opt) => { } - logger.info('pre-fetching refseq entries'); - await _refseq.preLoadCache(conn); - const missingRefSeqIds = new Set(); - rows.map(r => r.refseqId || '').forEach((id) => { - if (!_refseq.cacheHas(id) && id) { - missingRefSeqIds.add(id); - } - }); - - logger.info(`fetching ${missingRefSeqIds.size} missing refseq entries`); - await _refseq.fetchAndLoadByIds(conn, Array.from(missingRefSeqIds)); - logger.info(`processing ${rows.length} records`); for (let index = 0; index < rows.length; index++) { @@ -481,35 +464,6 @@ const uploadFile = async (opt) => { - // transcript -> crossreferenceof -> refseq - if (record.refseqId) { - skip--; - - try { - const refseq = await conn.getUniqueRecordBy({ - filters: { - AND: [ - { source: rid(refseqSource) }, - { sourceId: record.refseqId }, - { sourceIdVersion: null }, - ], - }, - sort: orderPreferredOntologyTerms, - target: 'Feature', - }); - await conn.addRecord({ - content: { - in: rid(refseq), out: rid(transcript), source: rid(source), - }, - existsOk: true, - fetchExisting: false, - target: 'crossreferenceof', - }); - } catch (err) { - logger.log('error', `failed cross-linking from ${record.transcriptId} to ${record.refseqId}`); - refseqMissingRecords.add(record.refseqId); - } - } // gene -> crossreferenceof -> hgnc if (record.hgncId && newGene) { skip--; @@ -539,9 +493,7 @@ const uploadFile = async (opt) => { if (hgncMissingRecords.size) { logger.warn(`Unable to retrieve ${hgncMissingRecords.size} hgnc records for linking`); } - if (refseqMissingRecords.size) { - logger.warn(`Unable to retrieve ${refseqMissingRecords.size} refseq records for linking`); - } + logger.info(JSON.stringify(counts)); }; From 2e18fe767f81388daf6bfde29e743e6686e0800d Mon Sep 17 00:00:00 2001 From: sshugsc Date: Mon, 4 Mar 2024 16:36:59 -0800 Subject: [PATCH 2/3] remove refseq in ensembl test files --- ...ensembl_biomart_export_ENSG00000139618.tsv | 10 +- test/data/ensembl_uploadFile_requests.json | 325 ------------------ test/ensembl/ensembl.uploadFile.test.js | 9 - 3 files changed, 5 insertions(+), 339 deletions(-) diff --git a/test/data/ensembl_biomart_export_ENSG00000139618.tsv b/test/data/ensembl_biomart_export_ENSG00000139618.tsv index 7d143ea0..f0aa0d0c 100644 --- a/test/data/ensembl_biomart_export_ENSG00000139618.tsv +++ b/test/data/ensembl_biomart_export_ENSG00000139618.tsv @@ -1,5 +1,5 @@ -Gene stable ID Gene stable ID version Transcript stable ID Transcript stable ID version Protein stable ID Protein stable ID version HGNC ID RefSeq mRNA ID Gene description Gene name Source of gene name -ENSG00000139618 ENSG00000139618.17 ENST00000544455 ENST00000544455.6 ENSP00000439902 ENSP00000439902.1 HGNC:1101 BRCA2 DNA repair associated [Source:HGNC Symbol;Acc:HGNC:1101] BRCA2 HGNC Symbol -ENSG00000139618 ENSG00000139618.17 ENST00000530893 ENST00000530893.6 ENSP00000499438 ENSP00000499438.2 HGNC:1101 BRCA2 DNA repair associated [Source:HGNC Symbol;Acc:HGNC:1101] BRCA2 HGNC Symbol -ENSG00000139618 ENSG00000139618.17 ENST00000380152 ENST00000380152.8 ENSP00000369497 ENSP00000369497.3 HGNC:1101 NM_000059 BRCA2 DNA repair associated [Source:HGNC Symbol;Acc:HGNC:1101] BRCA2 HGNC Symbol -ENSG00000139618 ENSG00000139618.17 ENST00000680887 ENST00000680887.1 ENSP00000505508 ENSP00000505508.1 HGNC:1101 BRCA2 DNA repair associated [Source:HGNC Symbol;Acc:HGNC:1101] BRCA2 HGNC Symbol +Gene stable ID Gene stable ID version Transcript stable ID Transcript stable ID version Protein stable ID Protein stable ID version HGNC ID Gene description Gene name Source of gene name +ENSG00000139618 ENSG00000139618.17 ENST00000544455 ENST00000544455.6 ENSP00000439902 ENSP00000439902.1 HGNC:1101 BRCA2 DNA repair associated [Source:HGNC Symbol;Acc:HGNC:1101] BRCA2 HGNC Symbol +ENSG00000139618 ENSG00000139618.17 ENST00000530893 ENST00000530893.6 ENSP00000499438 ENSP00000499438.2 HGNC:1101 BRCA2 DNA repair associated [Source:HGNC Symbol;Acc:HGNC:1101] BRCA2 HGNC Symbol +ENSG00000139618 ENSG00000139618.17 ENST00000380152 ENST00000380152.8 ENSP00000369497 ENSP00000369497.3 HGNC:1101 BRCA2 DNA repair associated [Source:HGNC Symbol;Acc:HGNC:1101] BRCA2 HGNC Symbol +ENSG00000139618 ENSG00000139618.17 ENST00000680887 ENST00000680887.1 ENSP00000505508 ENSP00000505508.1 HGNC:1101 BRCA2 DNA repair associated [Source:HGNC Symbol;Acc:HGNC:1101] BRCA2 HGNC Symbol diff --git a/test/data/ensembl_uploadFile_requests.json b/test/data/ensembl_uploadFile_requests.json index 5bdefb4a..9c0d2ccf 100644 --- a/test/data/ensembl_uploadFile_requests.json +++ b/test/data/ensembl_uploadFile_requests.json @@ -50,58 +50,6 @@ } } }, - { - "opt": { - "body": { - "filters": { - "name": "refseq" - }, - "neighbors": 1, - "target": "Source" - }, - "method": "POST", - "uri": "/query" - }, - "result": { - "metadata": { - "records": 0 - }, - "result": [] - } - }, - { - "opt": { - "body": { - "description": "A comprehensive, integrated, non-redundant, well-annotated set of reference sequences including genomic, transcript, and protein.", - "displayName": "RefSeq", - "longName": "RefSeq: NCBI Reference Sequence Database", - "name": "refseq", - "url": "https://www.ncbi.nlm.nih.gov/refseq", - "usage": "https://www.ncbi.nlm.nih.gov/home/about/policies" - }, - "method": "POST", - "uri": "/sources" - }, - "result": { - "result": { - "@class": "Source", - "sort": 99999, - "description": "a comprehensive, integrated, non-redundant, well-annotated set of reference sequences including genomic, transcript, and protein.", - "displayName": "RefSeq", - "longName": "refseq: ncbi reference sequence database", - "name": "refseq", - "url": "https://www.ncbi.nlm.nih.gov/refseq", - "usage": "https://www.ncbi.nlm.nih.gov/home/about/policies", - "createdBy": "#32:0", - "updatedBy": "#32:0", - "uuid": "f319e234-13b1-4841-b049-e48aacf2644a", - "createdAt": 1653589823081, - "updatedAt": 1653589823081, - "@rid": "#42:0", - "@version": 1 - } - } - }, { "opt": { "body": { @@ -235,167 +183,6 @@ "result": [] } }, - { - "opt": { - "body": { - "filters": { - "AND": [ - { - "source": { - "filters": { - "name": "refseq" - }, - "target": "Source" - } - }, - { - "dependency": null - }, - { - "deprecated": false - } - ] - }, - "limit": 1000, - "neighbors": 0, - "returnProperties": null, - "skip": 0, - "target": "Feature" - }, - "method": "POST", - "uri": "/query" - }, - "result": { - "metadata": { - "records": 0 - }, - "result": [] - } - }, - { - "opt": { - "body": { - "filters": { - "name": "refseq" - }, - "neighbors": 1, - "target": "Source" - }, - "method": "POST", - "uri": "/query" - }, - "result": { - "metadata": { - "records": 1 - }, - "result": [ - { - "createdAt": 1653589823081, - "updatedBy": { - "createdAt": 1653589817005, - "lastLoginAt": 1653589823040, - "name": "mlemieux", - "groups": [ - "#25:0" - ], - "signedLicenseAt": 1653589817002, - "firstLoginAt": 1653589823040, - "uuid": "d3ccc0e1-a51d-4086-abd4-a35ee8f8b6fa", - "loginCount": 1, - "@rid": "#32:0", - "@class": "User" - }, - "createdBy": { - "createdAt": 1653589817005, - "lastLoginAt": 1653589823040, - "name": "mlemieux", - "groups": [ - "#25:0" - ], - "signedLicenseAt": 1653589817002, - "firstLoginAt": 1653589823040, - "uuid": "d3ccc0e1-a51d-4086-abd4-a35ee8f8b6fa", - "loginCount": 1, - "@rid": "#32:0", - "@class": "User" - }, - "displayName": "RefSeq", - "usage": "https://www.ncbi.nlm.nih.gov/home/about/policies", - "name": "refseq", - "description": "a comprehensive, integrated, non-redundant, well-annotated set of reference sequences including genomic, transcript, and protein.", - "sort": 99999, - "uuid": "f319e234-13b1-4841-b049-e48aacf2644a", - "url": "https://www.ncbi.nlm.nih.gov/refseq", - "longName": "refseq: ncbi reference sequence database", - "updatedAt": 1653589823081, - "@rid": "#42:0", - "@class": "Source" - } - ] - } - }, - { - "opt": { - "body": { - "filters": { - "AND": [ - { - "sourceId": "NM_000059" - }, - { - "source": "#42:0" - }, - { - "sourceIdVersion": null - } - ] - }, - "neighbors": 1, - "target": "Feature" - }, - "method": "POST", - "uri": "/query" - }, - "result": { - "metadata": { - "records": 0 - }, - "result": [] - } - }, - { - "opt": { - "body": { - "biotype": "transcript", - "displayName": "NM_000059", - "sourceId": "nm_000059", - "sourceIdVersion": null, - "source": "#42:0" - }, - "method": "POST", - "uri": "/features" - }, - "result": { - "result": { - "@class": "Feature", - "deprecated": false, - "alias": false, - "biotype": "transcript", - "displayName": "NM_000059", - "sourceId": "nm_000059", - "sourceIdVersion": null, - "source": "#42:0", - "createdBy": "#32:0", - "updatedBy": "#32:0", - "uuid": "1703537d-b15a-4a95-a3f8-575939e5c89d", - "createdAt": 1653589823632, - "updatedAt": 1653589823632, - "name": "nm_000059", - "@rid": "#137:0", - "@version": 1 - } - } - }, { "opt": { "body": { @@ -2311,118 +2098,6 @@ } } }, - { - "opt": { - "body": { - "filters": { - "AND": [ - { - "source": "#42:0" - }, - { - "sourceId": "NM_000059" - }, - { - "sourceIdVersion": null - } - ] - }, - "neighbors": 1, - "target": "Feature" - }, - "method": "POST", - "uri": "/query" - }, - "result": { - "metadata": { - "records": 1 - }, - "result": [ - { - "biotype": "transcript", - "sourceId": "nm_000059", - "updatedBy": { - "createdAt": 1653589817005, - "lastLoginAt": 1653589823040, - "name": "mlemieux", - "groups": [ - "#25:0" - ], - "signedLicenseAt": 1653589817002, - "firstLoginAt": 1653589823040, - "uuid": "d3ccc0e1-a51d-4086-abd4-a35ee8f8b6fa", - "loginCount": 1, - "@rid": "#32:0", - "@class": "User" - }, - "displayName": "NM_000059", - "deprecated": false, - "sourceIdVersion": null, - "source": { - "createdAt": 1653589823081, - "updatedBy": "#32:0", - "createdBy": "#32:0", - "displayName": "RefSeq", - "usage": "https://www.ncbi.nlm.nih.gov/home/about/policies", - "name": "refseq", - "description": "a comprehensive, integrated, non-redundant, well-annotated set of reference sequences including genomic, transcript, and protein.", - "sort": 99999, - "uuid": "f319e234-13b1-4841-b049-e48aacf2644a", - "url": "https://www.ncbi.nlm.nih.gov/refseq", - "longName": "refseq: ncbi reference sequence database", - "updatedAt": 1653589823081, - "@rid": "#42:0", - "@class": "Source" - }, - "uuid": "1703537d-b15a-4a95-a3f8-575939e5c89d", - "createdAt": 1653589823632, - "createdBy": { - "createdAt": 1653589817005, - "lastLoginAt": 1653589823040, - "name": "mlemieux", - "groups": [ - "#25:0" - ], - "signedLicenseAt": 1653589817002, - "firstLoginAt": 1653589823040, - "uuid": "d3ccc0e1-a51d-4086-abd4-a35ee8f8b6fa", - "loginCount": 1, - "@rid": "#32:0", - "@class": "User" - }, - "name": "nm_000059", - "alias": false, - "updatedAt": 1653589823632, - "@rid": "#137:0", - "@class": "Feature" - } - ] - } - }, - { - "opt": { - "body": { - "in": "#137:0", - "out": "#138:4", - "source": "#41:0" - }, - "method": "POST", - "uri": "/crossreferenceof" - }, - "result": { - "result": { - "@class": "CrossReferenceOf", - "out": "#138:4", - "in": "#137:0", - "source": "#41:0", - "createdBy": "#32:0", - "uuid": "5b3d1a81-1453-4464-8cba-4ef3d660c163", - "createdAt": 1653589830261, - "@rid": "#83:0", - "@version": 1 - } - } - }, { "opt": { "body": { diff --git a/test/ensembl/ensembl.uploadFile.test.js b/test/ensembl/ensembl.uploadFile.test.js index 7165a253..15d43aee 100644 --- a/test/ensembl/ensembl.uploadFile.test.js +++ b/test/ensembl/ensembl.uploadFile.test.js @@ -107,7 +107,6 @@ describe('uploadFile in Ensembl loader', () => { { '@rid': '#138:4', edges: [ - { class: 'crossreferenceof', in: '#137:0' }, { class: 'elementof', in: '#139:0' }, { class: 'generalizationof', in: '#137:4' }, ], @@ -273,14 +272,6 @@ describe('uploadFile in Ensembl loader', () => { ], feature: { biotype: 'gene', name: 'XRCC11', source: 'hgnc' }, }, - { - '@rid': '#137:0', - edges: [], - feature: { - biotype: 'transcript', source: 'refseq', sourceId: 'NM_000059', sourceIdVersion: null, - }, - - }, ].map(el => Object.assign(el, { // Custom method - Returns the source's RID from mockDataset From a6dd81b61f2896250e5ef7a1194bc76b336335e9 Mon Sep 17 00:00:00 2001 From: sshugsc Date: Thu, 14 Mar 2024 15:52:02 -0700 Subject: [PATCH 3/3] bump to version v7.0.1 --- package-lock.json | 4 ++-- package.json | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/package-lock.json b/package-lock.json index 28f68c62..b68cc3fa 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,12 +1,12 @@ { "name": "@bcgsc-pori/graphkb-loader", - "version": "7.0.0", + "version": "7.0.1", "lockfileVersion": 2, "requires": true, "packages": { "": { "name": "@bcgsc-pori/graphkb-loader", - "version": "7.0.0", + "version": "7.0.1", "license": "GPL-3", "dependencies": { "@bcgsc-pori/graphkb-parser": "^1.1.1", diff --git a/package.json b/package.json index 72866258..818ba165 100644 --- a/package.json +++ b/package.json @@ -1,7 +1,7 @@ { "name": "@bcgsc-pori/graphkb-loader", "main": "src/index.js", - "version": "7.0.0", + "version": "7.0.1", "repository": { "type": "git", "url": "https://github.com/bcgsc/pori_graphkb_loader.git"