From 5a3cebc22d75a121c7208a5264777114d3ce04e1 Mon Sep 17 00:00:00 2001 From: mikerobeson Date: Wed, 4 Sep 2024 08:40:17 -0500 Subject: [PATCH 1/4] added ncbi api-key option --- rescript/ncbi.py | 29 +++++++++++++++++++++-------- rescript/plugin_setup.py | 8 ++++++-- 2 files changed, 27 insertions(+), 10 deletions(-) diff --git a/rescript/ncbi.py b/rescript/ncbi.py index a89fe776..f850bc36 100644 --- a/rescript/ncbi.py +++ b/rescript/ncbi.py @@ -73,12 +73,15 @@ def get_ncbi_data( query: str = None, accession_ids: Metadata = None, ranks: list = None, rank_propagation: bool = True, - logging_level: str = None, n_jobs: int = 1 + logging_level: str = None, n_jobs: int = 1, + api_key: str = None ) -> (DNAIterator, DataFrame): if ranks is None: ranks = _default_ranks if query is None and accession_ids is None: raise ValueError('Query or accession_ids must be supplied') + if api_key: + _entrez_params['api_key'] = api_key seqs, taxa = _get_ncbi_data(query, accession_ids, ranks, rank_propagation, logging_level, n_jobs, 'nuccore') @@ -93,12 +96,15 @@ def get_ncbi_data( def get_ncbi_data_protein( query: str = None, accession_ids: Metadata = None, ranks: list = None, rank_propagation: bool = True, - logging_level: str = None, n_jobs: int = 1 + logging_level: str = None, n_jobs: int = 1, + api_key: str = None ) -> (ProteinIterator, DataFrame): if ranks is None: ranks = _default_ranks if query is None and accession_ids is None: raise ValueError('Query or accession_ids must be supplied') + if api_key: + _entrez_params['api_key'] = api_key seqs, taxa = _get_ncbi_data(query, accession_ids, ranks, rank_propagation, logging_level, n_jobs, 'protein') @@ -120,7 +126,8 @@ def _get_ncbi_data(query: str = None, accession_ids: Metadata = None, if query: seqs, taxids = get_data_for_query( - query, logging_level, n_jobs, request_lock, _entrez_delay, db) + query, logging_level, n_jobs, request_lock, + _entrez_delay, db) if accession_ids: accs = accession_ids.get_ids() @@ -134,11 +141,12 @@ def 
_get_ncbi_data(query: str = None, accession_ids: Metadata = None, taxids.update(acc_taxids) else: seqs, taxids = get_data_for_accs( - accs, logging_level, n_jobs, request_lock, _entrez_delay, db) + accs, logging_level, n_jobs, request_lock, + _entrez_delay, db) taxa, bad_accs = get_taxonomies(taxids, ranks, rank_propagation, - logging_level, n_jobs, request_lock, - _entrez_delay) + logging_level, n_jobs, + request_lock, _entrez_delay) for acc in bad_accs: del seqs[acc] @@ -212,6 +220,7 @@ def request(params): r = requests.post( epost, data=data, params=_entrez_params, timeout=10, stream=True) + print('\nRequesting the following epost url: ', r.url) finally: request_lock.release() logger.debug('request lock released') @@ -238,6 +247,7 @@ def _esearch(params, logging_level, entrez_delay=0.334): def request(params): time.sleep(entrez_delay) r = requests.get(esearch, params=params, timeout=10) + print('\nRequesting the following esearch url: ', r.url) r.raise_for_status() webenv = parse(r.content)['eSearchResult'] if 'WebEnv' not in webenv: @@ -263,6 +273,7 @@ def request(): time.sleep(entrez_delay) try: r = requests.get(efetch, params=params, timeout=10, stream=True) + print('\nRequesting the following efetch url: ', r.url) finally: request_lock.release() logger.debug('request lock released') @@ -353,6 +364,7 @@ def get_data_for_accs(accs, logging_level, n_jobs, request_lock, params = dict( db=db, rettype='fasta', retmode='xml', **_entrez_params ) + records = _get_for_ids(params, accs, logging_level, n_jobs, request_lock, True, entrez_delay) seqs = {} @@ -385,6 +397,7 @@ def get_data_for_query(query, logging_level, n_jobs, request_lock, params = dict( db=db, term=query, usehistory='y', retmax=0, **_entrez_params ) + params, expected_num_records = _esearch(params, logging_level, entrez_delay) if expected_num_records > 166666: @@ -419,8 +432,8 @@ def get_data_for_query(query, logging_level, n_jobs, request_lock, def get_taxonomies( - taxids, ranks, rank_propagation, 
logging_level, n_jobs, request_lock, - entrez_delay=0.334): + taxids, ranks, rank_propagation, logging_level, n_jobs, + request_lock, entrez_delay=0.334): # download the taxonomies params = dict(db='taxonomy', **_entrez_params) ids = set(map(str, taxids.values())) diff --git a/rescript/plugin_setup.py b/rescript/plugin_setup.py index 5a5c6baf..c336edc2 100644 --- a/rescript/plugin_setup.py +++ b/rescript/plugin_setup.py @@ -869,7 +869,8 @@ 'rank_propagation': Bool, 'logging_level': Str % Choices([ 'DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']), - 'n_jobs': Int % Range(1, None) + 'n_jobs': Int % Range(1, None), + 'api_key': Str } GET_NCBI_DATA_PARAM_DESCRIPTIONS_COMMON = { 'ranks': 'List of taxonomic ranks for building a taxonomy from the ' @@ -879,7 +880,10 @@ 'logging_level': 'Logging level, set to INFO for download progress or ' 'DEBUG for copious verbosity', 'n_jobs': 'Number of concurrent download connections. More is faster ' - 'until you run out of bandwidth.' + 'until you run out of bandwidth.', + 'api_key': 'NCBI API Key that increases requests/second from 3 to 10. ' + 'See: ' + 'https://support.nlm.nih.gov/knowledgebase/article/KA-05317/.' 
} GET_NCBI_DATA_PARAM_DESCRIPTIONS_DNA = { 'query': 'Query on the NCBI Nucleotide database', From 6e43bceeda004fe0dc3238213942d7c0abcaba6d Mon Sep 17 00:00:00 2001 From: mikerobeson Date: Mon, 9 Sep 2024 11:58:38 -0500 Subject: [PATCH 2/4] set _entrez_delay based on api_key --- rescript/ncbi.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/rescript/ncbi.py b/rescript/ncbi.py index f850bc36..4815ec6e 100644 --- a/rescript/ncbi.py +++ b/rescript/ncbi.py @@ -82,6 +82,8 @@ def get_ncbi_data( raise ValueError('Query or accession_ids must be supplied') if api_key: _entrez_params['api_key'] = api_key + global _entrez_delay + _entrez_delay = 0.1 seqs, taxa = _get_ncbi_data(query, accession_ids, ranks, rank_propagation, logging_level, n_jobs, 'nuccore') @@ -105,6 +107,8 @@ def get_ncbi_data_protein( raise ValueError('Query or accession_ids must be supplied') if api_key: _entrez_params['api_key'] = api_key + global _entrez_delay + _entrez_delay = 0.1 seqs, taxa = _get_ncbi_data(query, accession_ids, ranks, rank_propagation, logging_level, n_jobs, 'protein') @@ -221,6 +225,7 @@ def request(params): epost, data=data, params=_entrez_params, timeout=10, stream=True) print('\nRequesting the following epost url: ', r.url) + print('With a delay of ', entrez_delay, ' seconds.') finally: request_lock.release() logger.debug('request lock released') @@ -248,6 +253,7 @@ def request(params): time.sleep(entrez_delay) r = requests.get(esearch, params=params, timeout=10) print('\nRequesting the following esearch url: ', r.url) + print('With a delay of ', entrez_delay, ' seconds.') r.raise_for_status() webenv = parse(r.content)['eSearchResult'] if 'WebEnv' not in webenv: @@ -274,6 +280,7 @@ def request(): try: r = requests.get(efetch, params=params, timeout=10, stream=True) print('\nRequesting the following efetch url: ', r.url) + print('With a delay of ', entrez_delay, ' seconds.') finally: request_lock.release() logger.debug('request lock released') From 
53821fa8485ec260b8675414b2cb24d7e545163c Mon Sep 17 00:00:00 2001 From: mikerobeson Date: Mon, 9 Sep 2024 15:06:59 -0500 Subject: [PATCH 3/4] api key added via metadata file --- rescript/ncbi.py | 24 ++++++++++++++++-------- rescript/plugin_setup.py | 5 +++-- 2 files changed, 19 insertions(+), 10 deletions(-) diff --git a/rescript/ncbi.py b/rescript/ncbi.py index 4815ec6e..fd9ae667 100644 --- a/rescript/ncbi.py +++ b/rescript/ncbi.py @@ -74,16 +74,20 @@ def get_ncbi_data( query: str = None, accession_ids: Metadata = None, ranks: list = None, rank_propagation: bool = True, logging_level: str = None, n_jobs: int = 1, - api_key: str = None + api_key: Metadata = None ) -> (DNAIterator, DataFrame): if ranks is None: ranks = _default_ranks if query is None and accession_ids is None: raise ValueError('Query or accession_ids must be supplied') if api_key: - _entrez_params['api_key'] = api_key - global _entrez_delay - _entrez_delay = 0.1 + try: + (api_key,) = api_key.get_ids() + _entrez_params['api_key'] = api_key + global _entrez_delay + _entrez_delay = 0.1 + except ValueError: + raise ValueError("API KEY file should contain only one value!") seqs, taxa = _get_ncbi_data(query, accession_ids, ranks, rank_propagation, logging_level, n_jobs, 'nuccore') @@ -99,16 +103,20 @@ def get_ncbi_data_protein( query: str = None, accession_ids: Metadata = None, ranks: list = None, rank_propagation: bool = True, logging_level: str = None, n_jobs: int = 1, - api_key: str = None + api_key: Metadata = None ) -> (ProteinIterator, DataFrame): if ranks is None: ranks = _default_ranks if query is None and accession_ids is None: raise ValueError('Query or accession_ids must be supplied') if api_key: - _entrez_params['api_key'] = api_key - global _entrez_delay - _entrez_delay = 0.1 + try: + (api_key,) = api_key.get_ids() + _entrez_params['api_key'] = api_key + global _entrez_delay + _entrez_delay = 0.1 + except ValueError: + raise ValueError("API KEY file should contain only one value!") seqs, 
taxa = _get_ncbi_data(query, accession_ids, ranks, rank_propagation, logging_level, n_jobs, 'protein') diff --git a/rescript/plugin_setup.py b/rescript/plugin_setup.py index c336edc2..7bc926ed 100644 --- a/rescript/plugin_setup.py +++ b/rescript/plugin_setup.py @@ -870,7 +870,7 @@ 'logging_level': Str % Choices([ 'DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']), 'n_jobs': Int % Range(1, None), - 'api_key': Str + 'api_key': Metadata } GET_NCBI_DATA_PARAM_DESCRIPTIONS_COMMON = { 'ranks': 'List of taxonomic ranks for building a taxonomy from the ' @@ -881,7 +881,8 @@ 'DEBUG for copious verbosity', 'n_jobs': 'Number of concurrent download connections. More is faster ' 'until you run out of bandwidth.', - 'api_key': 'NCBI API Key that increases requests/second from 3 to 10. ' + 'api_key': 'Metadata file that contains the NCBI API Key. This will ' + 'increase requests/second from 3 to 10. ' 'See: ' 'https://support.nlm.nih.gov/knowledgebase/article/KA-05317/.' } From 38dd82e0e015ea0c970e42f51e3afced79e8c316 Mon Sep 17 00:00:00 2001 From: mikerobeson Date: Mon, 9 Sep 2024 16:39:20 -0500 Subject: [PATCH 4/4] remove tuple parenthesis from get_ids lines --- rescript/ncbi.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/rescript/ncbi.py b/rescript/ncbi.py index fd9ae667..258aeba9 100644 --- a/rescript/ncbi.py +++ b/rescript/ncbi.py @@ -82,7 +82,7 @@ def get_ncbi_data( raise ValueError('Query or accession_ids must be supplied') if api_key: try: - (api_key,) = api_key.get_ids() + api_key, = api_key.get_ids() _entrez_params['api_key'] = api_key global _entrez_delay _entrez_delay = 0.1 @@ -111,7 +111,7 @@ def get_ncbi_data_protein( raise ValueError('Query or accession_ids must be supplied') if api_key: try: - (api_key,) = api_key.get_ids() + api_key, = api_key.get_ids() _entrez_params['api_key'] = api_key global _entrez_delay _entrez_delay = 0.1