diff --git a/rescript/ncbi.py b/rescript/ncbi.py
index a89fe77..258aeba 100644
--- a/rescript/ncbi.py
+++ b/rescript/ncbi.py
@@ -73,12 +73,21 @@ def get_ncbi_data(
         query: str = None, accession_ids: Metadata = None,
         ranks: list = None, rank_propagation: bool = True,
-        logging_level: str = None, n_jobs: int = 1
+        logging_level: str = None, n_jobs: int = 1,
+        api_key: Metadata = None
 ) -> (DNAIterator, DataFrame):
     if ranks is None:
         ranks = _default_ranks
     if query is None and accession_ids is None:
         raise ValueError('Query or accession_ids must be supplied')
+    if api_key:
+        try:
+            api_key, = api_key.get_ids()
+            _entrez_params['api_key'] = api_key
+            global _entrez_delay
+            _entrez_delay = 0.1
+        except ValueError:
+            raise ValueError("API key file must contain exactly one value!")
 
     seqs, taxa = _get_ncbi_data(query, accession_ids, ranks,
                                 rank_propagation, logging_level, n_jobs,
                                 'nuccore')
@@ -93,12 +102,21 @@ def get_ncbi_data_protein(
         query: str = None, accession_ids: Metadata = None,
         ranks: list = None, rank_propagation: bool = True,
-        logging_level: str = None, n_jobs: int = 1
+        logging_level: str = None, n_jobs: int = 1,
+        api_key: Metadata = None
 ) -> (ProteinIterator, DataFrame):
     if ranks is None:
         ranks = _default_ranks
     if query is None and accession_ids is None:
         raise ValueError('Query or accession_ids must be supplied')
+    if api_key:
+        try:
+            api_key, = api_key.get_ids()
+            _entrez_params['api_key'] = api_key
+            global _entrez_delay
+            _entrez_delay = 0.1
+        except ValueError:
+            raise ValueError("API key file must contain exactly one value!")
 
     seqs, taxa = _get_ncbi_data(query, accession_ids, ranks,
                                 rank_propagation, logging_level, n_jobs,
                                 'protein')
@@ -120,7 +138,8 @@ def _get_ncbi_data(query: str = None, accession_ids: Metadata = None,
 
     if query:
         seqs, taxids = get_data_for_query(
-            query, logging_level, n_jobs, request_lock, _entrez_delay, db)
+            query, logging_level, n_jobs, request_lock,
+            _entrez_delay, db)
 
     if accession_ids:
         accs = accession_ids.get_ids()
@@ -134,11 +153,12 @@
             taxids.update(acc_taxids)
         else:
             seqs, taxids = get_data_for_accs(
-                accs, logging_level, n_jobs, request_lock, _entrez_delay, db)
+                accs, logging_level, n_jobs, request_lock,
+                _entrez_delay, db)
 
     taxa, bad_accs = get_taxonomies(taxids, ranks, rank_propagation,
-                                    logging_level, n_jobs, request_lock,
-                                    _entrez_delay)
+                                    logging_level, n_jobs,
+                                    request_lock, _entrez_delay)
 
     for acc in bad_accs:
         del seqs[acc]
@@ -212,6 +232,8 @@ def request(params):
             r = requests.post(
                 epost, data=data, params=_entrez_params, timeout=10,
                 stream=True)
+            logger.debug('Requesting the following epost url: %s', r.url)
+            logger.debug('With a delay of %s seconds.', entrez_delay)
         finally:
             request_lock.release()
             logger.debug('request lock released')
@@ -238,6 +260,8 @@ def _esearch(params, logging_level, entrez_delay=0.334):
     def request(params):
         time.sleep(entrez_delay)
         r = requests.get(esearch, params=params, timeout=10)
+        logger.debug('Requesting the following esearch url: %s', r.url)
+        logger.debug('With a delay of %s seconds.', entrez_delay)
         r.raise_for_status()
         webenv = parse(r.content)['eSearchResult']
         if 'WebEnv' not in webenv:
@@ -263,6 +287,8 @@ def request():
         time.sleep(entrez_delay)
         try:
             r = requests.get(efetch, params=params, timeout=10, stream=True)
+            logger.debug('Requesting the following efetch url: %s', r.url)
+            logger.debug('With a delay of %s seconds.', entrez_delay)
         finally:
             request_lock.release()
             logger.debug('request lock released')
@@ -353,6 +379,7 @@ def get_data_for_accs(accs, logging_level, n_jobs, request_lock,
     params = dict(
         db=db, rettype='fasta', retmode='xml', **_entrez_params
     )
+
     records = _get_for_ids(params, accs, logging_level, n_jobs, request_lock,
                            True, entrez_delay)
     seqs = {}
@@ -385,6 +412,7 @@ def get_data_for_query(query, logging_level, n_jobs, request_lock,
     params = dict(
         db=db, term=query, usehistory='y', retmax=0, **_entrez_params
     )
+
     params, expected_num_records = _esearch(params, logging_level,
                                             entrez_delay)
     if expected_num_records > 166666:
@@ -419,8 +447,8 @@ def get_data_for_query(query, logging_level, n_jobs, request_lock,
 
 
 def get_taxonomies(
-        taxids, ranks, rank_propagation, logging_level, n_jobs, request_lock,
-        entrez_delay=0.334):
+        taxids, ranks, rank_propagation, logging_level, n_jobs,
+        request_lock, entrez_delay=0.334):
     # download the taxonomies
     params = dict(db='taxonomy', **_entrez_params)
     ids = set(map(str, taxids.values()))
diff --git a/rescript/plugin_setup.py b/rescript/plugin_setup.py
index 5a5c6ba..7bc926e 100644
--- a/rescript/plugin_setup.py
+++ b/rescript/plugin_setup.py
@@ -869,7 +869,8 @@
     'rank_propagation': Bool,
     'logging_level': Str % Choices([
         'DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']),
-    'n_jobs': Int % Range(1, None)
+    'n_jobs': Int % Range(1, None),
+    'api_key': Metadata
 }
 GET_NCBI_DATA_PARAM_DESCRIPTIONS_COMMON = {
     'ranks': 'List of taxonomic ranks for building a taxonomy from the '
@@ -879,7 +880,11 @@
     'logging_level': 'Logging level, set to INFO for download progress or '
                      'DEBUG for copious verbosity',
     'n_jobs': 'Number of concurrent download connections. More is faster '
-              'until you run out of bandwidth.'
+              'until you run out of bandwidth.',
+    'api_key': 'Metadata file that contains the NCBI API key. This will '
+               'increase requests per second from 3 to 10. '
+               'See: '
+               'https://support.nlm.nih.gov/knowledgebase/article/KA-05317/.'
 }
 GET_NCBI_DATA_PARAM_DESCRIPTIONS_DNA = {
     'query': 'Query on the NCBI Nucleotide database',
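
Note (outside the diff): the api_key argument is a QIIME 2 metadata file whose single ID is the NCBI API key itself, read with the same get_ids() mechanism already used for accession_ids. A registered key raises NCBI's limit from 3 to 10 requests per second, which is why _entrez_delay drops from 0.334 s (about 3 requests/s) to 0.1 s (10 requests/s). The sketch below shows what such a file looks like and how the single-value unpacking above behaves; it assumes qiime2 is installed, and the file name and key value are hypothetical placeholders.

    # Sketch only: build a one-ID metadata file and unpack it the way
    # get_ncbi_data does. The key below is a fake placeholder.
    import qiime2

    with open('ncbi-api-key.tsv', 'w') as fh:
        fh.write('id\n')                                    # metadata header
        fh.write('0123456789abcdef0123456789abcdef0123\n')  # the key itself

    api_key = qiime2.Metadata.load('ncbi-api-key.tsv')
    key, = api_key.get_ids()  # ValueError unless the file has exactly one ID
    assert key == '0123456789abcdef0123456789abcdef0123'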