From 5a3cebc22d75a121c7208a5264777114d3ce04e1 Mon Sep 17 00:00:00 2001
From: mikerobeson
Date: Wed, 4 Sep 2024 08:40:17 -0500
Subject: [PATCH] Add NCBI api-key option

---
 rescript/ncbi.py         | 29 +++++++++++++++++++++--------
 rescript/plugin_setup.py |  8 ++++++--
 2 files changed, 27 insertions(+), 10 deletions(-)

diff --git a/rescript/ncbi.py b/rescript/ncbi.py
index a89fe776..f850bc36 100644
--- a/rescript/ncbi.py
+++ b/rescript/ncbi.py
@@ -73,12 +73,15 @@ def get_ncbi_data(
         query: str = None, accession_ids: Metadata = None,
         ranks: list = None, rank_propagation: bool = True,
-        logging_level: str = None, n_jobs: int = 1
+        logging_level: str = None, n_jobs: int = 1,
+        api_key: str = None
 ) -> (DNAIterator, DataFrame):
     if ranks is None:
         ranks = _default_ranks
     if query is None and accession_ids is None:
         raise ValueError('Query or accession_ids must be supplied')
+    if api_key:
+        _entrez_params['api_key'] = api_key
 
     seqs, taxa = _get_ncbi_data(query, accession_ids, ranks,
                                 rank_propagation, logging_level, n_jobs,
                                 'nuccore')
@@ -93,12 +96,15 @@ def get_ncbi_data_protein(
         query: str = None, accession_ids: Metadata = None,
         ranks: list = None, rank_propagation: bool = True,
-        logging_level: str = None, n_jobs: int = 1
+        logging_level: str = None, n_jobs: int = 1,
+        api_key: str = None
 ) -> (ProteinIterator, DataFrame):
     if ranks is None:
         ranks = _default_ranks
     if query is None and accession_ids is None:
         raise ValueError('Query or accession_ids must be supplied')
+    if api_key:
+        _entrez_params['api_key'] = api_key
 
     seqs, taxa = _get_ncbi_data(query, accession_ids, ranks,
                                 rank_propagation, logging_level, n_jobs,
                                 'protein')
@@ -120,7 +126,8 @@ def _get_ncbi_data(query: str = None, accession_ids: Metadata = None,
 
     if query:
         seqs, taxids = get_data_for_query(
-            query, logging_level, n_jobs, request_lock, _entrez_delay, db)
+            query, logging_level, n_jobs, request_lock,
+            _entrez_delay, db)
 
     if accession_ids:
         accs = accession_ids.get_ids()
@@ -134,11 +141,12 @@ def _get_ncbi_data(query: str = None, accession_ids: Metadata = None,
             taxids.update(acc_taxids)
         else:
             seqs, taxids = get_data_for_accs(
-                accs, logging_level, n_jobs, request_lock, _entrez_delay, db)
+                accs, logging_level, n_jobs, request_lock,
+                _entrez_delay, db)
 
     taxa, bad_accs = get_taxonomies(taxids, ranks, rank_propagation,
-                                    logging_level, n_jobs, request_lock,
-                                    _entrez_delay)
+                                    logging_level, n_jobs,
+                                    request_lock, _entrez_delay)
 
     for acc in bad_accs:
         del seqs[acc]
@@ -212,6 +220,7 @@ def request(params):
             r = requests.post(
                 epost, data=data, params=_entrez_params, timeout=10,
                 stream=True)
+            logger.debug('Requesting the following epost url: %s', r.url)
         finally:
             request_lock.release()
             logger.debug('request lock released')
@@ -238,6 +247,7 @@ def _esearch(params, logging_level, entrez_delay=0.334):
     def request(params):
         time.sleep(entrez_delay)
         r = requests.get(esearch, params=params, timeout=10)
+        logger.debug('Requesting the following esearch url: %s', r.url)
         r.raise_for_status()
         webenv = parse(r.content)['eSearchResult']
         if 'WebEnv' not in webenv:
@@ -263,6 +273,7 @@ def request():
         time.sleep(entrez_delay)
         try:
             r = requests.get(efetch, params=params, timeout=10, stream=True)
+            logger.debug('Requesting the following efetch url: %s', r.url)
         finally:
             request_lock.release()
             logger.debug('request lock released')
@@ -353,6 +364,7 @@ def get_data_for_accs(accs, logging_level, n_jobs, request_lock,
     params = dict(
         db=db, rettype='fasta', retmode='xml', **_entrez_params
     )
+
     records = _get_for_ids(params, accs, logging_level, n_jobs, request_lock, True,
                            entrez_delay)
     seqs = {}
@@ -385,6 +397,7 @@ def get_data_for_query(query, logging_level, n_jobs, request_lock,
     params = dict(
         db=db, term=query, usehistory='y', retmax=0, **_entrez_params
     )
+
     params, expected_num_records = _esearch(params, logging_level,
                                             entrez_delay)
     if expected_num_records > 166666:
@@ -419,8 +432,8 @@ def get_taxonomies(
 
 
 def get_taxonomies(
-        taxids, ranks, rank_propagation, logging_level, n_jobs, request_lock,
-        entrez_delay=0.334):
+        taxids, ranks, rank_propagation, logging_level, n_jobs,
+        request_lock, entrez_delay=0.334):
     # download the taxonomies
     params = dict(db='taxonomy', **_entrez_params)
     ids = set(map(str, taxids.values()))
diff --git a/rescript/plugin_setup.py b/rescript/plugin_setup.py
index 5a5c6baf..c336edc2 100644
--- a/rescript/plugin_setup.py
+++ b/rescript/plugin_setup.py
@@ -869,7 +869,8 @@
     'rank_propagation': Bool,
     'logging_level': Str % Choices([
         'DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']),
-    'n_jobs': Int % Range(1, None)
+    'n_jobs': Int % Range(1, None),
+    'api_key': Str
 }
 GET_NCBI_DATA_PARAM_DESCRIPTIONS_COMMON = {
     'ranks': 'List of taxonomic ranks for building a taxonomy from the '
@@ -879,7 +880,10 @@
     'logging_level': 'Logging level, set to INFO for download progress or '
                      'DEBUG for copious verbosity',
     'n_jobs': 'Number of concurrent download connections. More is faster '
-              'until you run out of bandwidth.'
+              'until you run out of bandwidth.',
+    'api_key': 'NCBI API Key that increases requests/second from 3 to 10. '
+               'See: '
+               'https://support.nlm.nih.gov/knowledgebase/article/KA-05317/.'
 }
 GET_NCBI_DATA_PARAM_DESCRIPTIONS_DNA = {
     'query': 'Query on the NCBI Nucleotide database',
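
Below is a minimal smoke-test sketch (not part of the patch) showing how the new
parameter is meant to be exercised. The query value is a placeholder, and the
snippet assumes the key is exported as the environment variable NCBI_API_KEY;
with api_key=None the old behaviour is kept (no key sent, 3 requests/second).

    import os

    from rescript.ncbi import get_ncbi_data

    # Placeholder Entrez query; any valid nuccore query works. When api_key
    # is set, the patch copies it into _entrez_params, so every
    # esearch/epost/efetch request carries it and NCBI permits 10
    # requests/second instead of 3.
    seqs, taxa = get_ncbi_data(
        query='txid2697049[Organism:exp] AND refseq[filter]',
        api_key=os.environ.get('NCBI_API_KEY'),  # assumed env var; may be None
    )

On the QIIME 2 command line the parameter should surface automatically as
--p-api-key on both `qiime rescript get-ncbi-data` and
`qiime rescript get-ncbi-data-protein`, since both actions draw their
parameters from the shared dict updated above.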