Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: Adds NCBI api-key option #203

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 36 additions & 8 deletions rescript/ncbi.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,12 +73,21 @@
def get_ncbi_data(
query: str = None, accession_ids: Metadata = None,
ranks: list = None, rank_propagation: bool = True,
logging_level: str = None, n_jobs: int = 1
logging_level: str = None, n_jobs: int = 1,
api_key: Metadata = None
) -> (DNAIterator, DataFrame):
if ranks is None:
ranks = _default_ranks
if query is None and accession_ids is None:
raise ValueError('Query or accession_ids must be supplied')
if api_key:
try:
api_key, = api_key.get_ids()
_entrez_params['api_key'] = api_key
global _entrez_delay
_entrez_delay = 0.1
except ValueError:
raise ValueError("API KEY file should contain only one value!")

seqs, taxa = _get_ncbi_data(query, accession_ids, ranks, rank_propagation,
logging_level, n_jobs, 'nuccore')
Expand All @@ -93,12 +102,21 @@ def get_ncbi_data(
def get_ncbi_data_protein(
query: str = None, accession_ids: Metadata = None,
ranks: list = None, rank_propagation: bool = True,
logging_level: str = None, n_jobs: int = 1
logging_level: str = None, n_jobs: int = 1,
api_key: Metadata = None
) -> (ProteinIterator, DataFrame):
if ranks is None:
ranks = _default_ranks
if query is None and accession_ids is None:
raise ValueError('Query or accession_ids must be supplied')
if api_key:
try:
api_key, = api_key.get_ids()
_entrez_params['api_key'] = api_key
global _entrez_delay
_entrez_delay = 0.1
except ValueError:
raise ValueError("API KEY file should contain only one value!")

seqs, taxa = _get_ncbi_data(query, accession_ids, ranks, rank_propagation,
logging_level, n_jobs, 'protein')
Expand All @@ -120,7 +138,8 @@ def _get_ncbi_data(query: str = None, accession_ids: Metadata = None,

if query:
seqs, taxids = get_data_for_query(
query, logging_level, n_jobs, request_lock, _entrez_delay, db)
query, logging_level, n_jobs, request_lock,
_entrez_delay, db)

if accession_ids:
accs = accession_ids.get_ids()
Expand All @@ -134,11 +153,12 @@ def _get_ncbi_data(query: str = None, accession_ids: Metadata = None,
taxids.update(acc_taxids)
else:
seqs, taxids = get_data_for_accs(
accs, logging_level, n_jobs, request_lock, _entrez_delay, db)
accs, logging_level, n_jobs, request_lock,
_entrez_delay, db)

taxa, bad_accs = get_taxonomies(taxids, ranks, rank_propagation,
logging_level, n_jobs, request_lock,
_entrez_delay)
logging_level, n_jobs,
request_lock, _entrez_delay)
for acc in bad_accs:
del seqs[acc]

Expand Down Expand Up @@ -212,6 +232,8 @@ def request(params):
r = requests.post(
epost, data=data, params=_entrez_params, timeout=10,
stream=True)
print('\nRequesting the following epost url: ', r.url)
print('With a delay of ', entrez_delay, ' seconds.')
finally:
request_lock.release()
logger.debug('request lock released')
Expand All @@ -238,6 +260,8 @@ def _esearch(params, logging_level, entrez_delay=0.334):
def request(params):
time.sleep(entrez_delay)
r = requests.get(esearch, params=params, timeout=10)
print('\nRequesting the following esearch url: ', r.url)
print('With a delay of ', entrez_delay, ' seconds.')
r.raise_for_status()
webenv = parse(r.content)['eSearchResult']
if 'WebEnv' not in webenv:
Expand All @@ -263,6 +287,8 @@ def request():
time.sleep(entrez_delay)
try:
r = requests.get(efetch, params=params, timeout=10, stream=True)
print('\nRequesting the following efetch url: ', r.url)
print('With a delay of ', entrez_delay, ' seconds.')
finally:
request_lock.release()
logger.debug('request lock released')
Expand Down Expand Up @@ -353,6 +379,7 @@ def get_data_for_accs(accs, logging_level, n_jobs, request_lock,
params = dict(
db=db, rettype='fasta', retmode='xml', **_entrez_params
)

records = _get_for_ids(params, accs, logging_level, n_jobs, request_lock,
True, entrez_delay)
seqs = {}
Expand Down Expand Up @@ -385,6 +412,7 @@ def get_data_for_query(query, logging_level, n_jobs, request_lock,
params = dict(
db=db, term=query, usehistory='y', retmax=0, **_entrez_params
)

params, expected_num_records = _esearch(params, logging_level,
entrez_delay)
if expected_num_records > 166666:
Expand Down Expand Up @@ -419,8 +447,8 @@ def get_data_for_query(query, logging_level, n_jobs, request_lock,


def get_taxonomies(
taxids, ranks, rank_propagation, logging_level, n_jobs, request_lock,
entrez_delay=0.334):
taxids, ranks, rank_propagation, logging_level, n_jobs,
request_lock, entrez_delay=0.334):
# download the taxonomies
params = dict(db='taxonomy', **_entrez_params)
ids = set(map(str, taxids.values()))
Expand Down
9 changes: 7 additions & 2 deletions rescript/plugin_setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -869,7 +869,8 @@
'rank_propagation': Bool,
'logging_level': Str % Choices([
'DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']),
'n_jobs': Int % Range(1, None)
'n_jobs': Int % Range(1, None),
'api_key': Metadata
}
GET_NCBI_DATA_PARAM_DESCRIPTIONS_COMMON = {
'ranks': 'List of taxonomic ranks for building a taxonomy from the '
Expand All @@ -879,7 +880,11 @@
'logging_level': 'Logging level, set to INFO for download progress or '
'DEBUG for copious verbosity',
'n_jobs': 'Number of concurrent download connections. More is faster '
'until you run out of bandwidth.'
'until you run out of bandwidth.',
'api_key': 'Metadata file that contains the NCBI API Key. This will '
'increase requests/second from 3 to 10. '
'See: '
'https://support.nlm.nih.gov/knowledgebase/article/KA-05317/.'
}
GET_NCBI_DATA_PARAM_DESCRIPTIONS_DNA = {
'query': 'Query on the NCBI Nucleotide database',
Expand Down
Loading