added ncbi api-key option
mikerobeson committed Sep 4, 2024
1 parent 4f7ab4b commit 5a3cebc
Showing 2 changed files with 27 additions and 10 deletions.
29 changes: 21 additions & 8 deletions rescript/ncbi.py
@@ -73,12 +73,15 @@
 def get_ncbi_data(
         query: str = None, accession_ids: Metadata = None,
         ranks: list = None, rank_propagation: bool = True,
-        logging_level: str = None, n_jobs: int = 1
+        logging_level: str = None, n_jobs: int = 1,
+        api_key: str = None
 ) -> (DNAIterator, DataFrame):
     if ranks is None:
         ranks = _default_ranks
     if query is None and accession_ids is None:
         raise ValueError('Query or accession_ids must be supplied')
+    if api_key:
+        _entrez_params['api_key'] = api_key
 
     seqs, taxa = _get_ncbi_data(query, accession_ids, ranks, rank_propagation,
                                 logging_level, n_jobs, 'nuccore')
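
For readers skimming the diff: _entrez_params is a module-level dict that the downstream esearch/epost/efetch helpers spread into their request parameters (visible as **_entrez_params in the hunks below), which is why a single mutation here is enough to attach the key to every request. A minimal standalone sketch of that pattern; the dict contents and key are placeholders, not taken from this commit:

import requests

# Placeholder contents; RESCRIPt's real _entrez_params is defined at
# module level and identifies the tool and a contact email to NCBI.
_entrez_params = {'tool': 'example-tool', 'email': 'user@example.org'}

def set_api_key(api_key=None):
    # Mirrors the hunk above: store the key once in the shared dict so
    # every later E-utilities request sends it automatically.
    if api_key:
        _entrez_params['api_key'] = api_key

set_api_key('MY_NCBI_KEY')  # hypothetical key
r = requests.get(
    'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi',
    params=dict(db='nuccore', term='txid562[ORGN]', retmax=0,
                **_entrez_params),
    timeout=10)
r.raise_for_status()  # the request now carries &api_key=MY_NCBI_KEY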
@@ -93,12 +96,15 @@ def get_ncbi_data(
 def get_ncbi_data_protein(
         query: str = None, accession_ids: Metadata = None,
         ranks: list = None, rank_propagation: bool = True,
-        logging_level: str = None, n_jobs: int = 1
+        logging_level: str = None, n_jobs: int = 1,
+        api_key: str = None
 ) -> (ProteinIterator, DataFrame):
     if ranks is None:
         ranks = _default_ranks
     if query is None and accession_ids is None:
         raise ValueError('Query or accession_ids must be supplied')
+    if api_key:
+        _entrez_params['api_key'] = api_key
 
     seqs, taxa = _get_ncbi_data(query, accession_ids, ranks, rank_propagation,
                                 logging_level, n_jobs, 'protein')
@@ -120,7 +126,8 @@ def _get_ncbi_data(query: str = None, accession_ids: Metadata = None,
 
     if query:
         seqs, taxids = get_data_for_query(
-            query, logging_level, n_jobs, request_lock, _entrez_delay, db)
+            query, logging_level, n_jobs, request_lock,
+            _entrez_delay, db)
 
     if accession_ids:
         accs = accession_ids.get_ids()
@@ -134,11 +141,12 @@ def _get_ncbi_data(query: str = None, accession_ids: Metadata = None,
             taxids.update(acc_taxids)
         else:
             seqs, taxids = get_data_for_accs(
-                accs, logging_level, n_jobs, request_lock, _entrez_delay, db)
+                accs, logging_level, n_jobs, request_lock,
+                _entrez_delay, db)
 
     taxa, bad_accs = get_taxonomies(taxids, ranks, rank_propagation,
-                                    logging_level, n_jobs, request_lock,
-                                    _entrez_delay)
+                                    logging_level, n_jobs,
+                                    request_lock, _entrez_delay)
     for acc in bad_accs:
         del seqs[acc]
 
@@ -212,6 +220,7 @@ def request(params):
             r = requests.post(
                 epost, data=data, params=_entrez_params, timeout=10,
                 stream=True)
+            print('\nRequesting the following epost url: ', r.url)
         finally:
             request_lock.release()
             logger.debug('request lock released')
@@ -238,6 +247,7 @@ def _esearch(params, logging_level, entrez_delay=0.334):
     def request(params):
         time.sleep(entrez_delay)
         r = requests.get(esearch, params=params, timeout=10)
+        print('\nRequesting the following esearch url: ', r.url)
         r.raise_for_status()
         webenv = parse(r.content)['eSearchResult']
         if 'WebEnv' not in webenv:
@@ -263,6 +273,7 @@ def request():
         time.sleep(entrez_delay)
         try:
             r = requests.get(efetch, params=params, timeout=10, stream=True)
+            print('\nRequesting the following efetch url: ', r.url)
         finally:
             request_lock.release()
             logger.debug('request lock released')
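
Each of the three print() lines added above reports r.url, the fully prepared request URL. Since requests encodes everything passed via params into that URL, the printed string will include the api_key once one is set. A self-contained illustration with hypothetical values:

import requests

# Hypothetical parameters; shows the shape of URL the new print()
# statements will emit.
req = requests.Request(
    'GET', 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi',
    params={'db': 'nuccore', 'term': 'txid562[ORGN]', 'api_key': 'MY_KEY'})
print(req.prepare().url)
# https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=nuccore&term=txid562%5BORGN%5D&api_key=MY_KEY

One design consequence worth noting: the key appears in clear text in these diagnostics.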
@@ -353,6 +364,7 @@ def get_data_for_accs(accs, logging_level, n_jobs, request_lock,
     params = dict(
         db=db, rettype='fasta', retmode='xml', **_entrez_params
     )
+
     records = _get_for_ids(params, accs, logging_level, n_jobs, request_lock,
                            True, entrez_delay)
     seqs = {}
@@ -385,6 +397,7 @@ def get_data_for_query(query, logging_level, n_jobs, request_lock,
     params = dict(
         db=db, term=query, usehistory='y', retmax=0, **_entrez_params
     )
+
     params, expected_num_records = _esearch(params, logging_level,
                                             entrez_delay)
     if expected_num_records > 166666:
@@ -419,8 +432,8 @@ def get_data_for_query(query, logging_level, n_jobs, request_lock,
 
 
 def get_taxonomies(
-        taxids, ranks, rank_propagation, logging_level, n_jobs, request_lock,
-        entrez_delay=0.334):
+        taxids, ranks, rank_propagation, logging_level, n_jobs,
+        request_lock, entrez_delay=0.334):
     # download the taxonomies
     params = dict(db='taxonomy', **_entrez_params)
     ids = set(map(str, taxids.values()))
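
The entrez_delay=0.334 default visible in the signatures above is the counterpart of the new option: 1/0.334 is roughly the 3 requests/second NCBI allows without a key, while a key raises the ceiling to 10/second (per the parameter description in the next file). This commit leaves the delay unchanged; the helper below is purely a hypothetical sketch of that arithmetic:

def min_entrez_delay(api_key=None):
    # NCBI E-utilities permit ~3 requests/s without a key and 10 with
    # one, so the smallest polite inter-request delay is the reciprocal.
    return 1 / (10 if api_key else 3)

assert round(min_entrez_delay(), 3) == 0.333  # ~ the 0.334 s default above
assert min_entrez_delay('MY_KEY') == 0.1      # hypothetical key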
8 changes: 6 additions & 2 deletions rescript/plugin_setup.py
@@ -869,7 +869,8 @@
     'rank_propagation': Bool,
     'logging_level': Str % Choices([
         'DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']),
-    'n_jobs': Int % Range(1, None)
+    'n_jobs': Int % Range(1, None),
+    'api_key': Str
 }
 GET_NCBI_DATA_PARAM_DESCRIPTIONS_COMMON = {
     'ranks': 'List of taxonomic ranks for building a taxonomy from the '
@@ -879,7 +880,10 @@
     'logging_level': 'Logging level, set to INFO for download progress or '
                      'DEBUG for copious verbosity',
     'n_jobs': 'Number of concurrent download connections. More is faster '
-              'until you run out of bandwidth.'
+              'until you run out of bandwidth.',
+    'api_key': 'NCBI API Key that increases requests/second from 3 to 10. '
+               'See: '
+               'https://support.nlm.nih.gov/knowledgebase/article/KA-05317/.'
 }
 GET_NCBI_DATA_PARAM_DESCRIPTIONS_DNA = {
     'query': 'Query on the NCBI Nucleotide database',
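
Finally, a hedged usage sketch of the new parameter, calling the function annotated in the first file directly; the query and key below are placeholders:

from rescript.ncbi import get_ncbi_data

# Returns (DNAIterator, DataFrame) per the signature shown above.
seqs, taxa = get_ncbi_data(
    query='txid562[ORGN] AND 16S[TITL]',  # hypothetical Entrez query
    api_key='MY_NCBI_KEY',                # omit to stay at 3 requests/s
    n_jobs=2)

Through the QIIME 2 CLI the same parameter should surface as --p-api-key on the corresponding rescript action, following the framework's usual parameter-to-flag mapping.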
