Skip to content

Commit

Permalink
Updated the legacy UniProt "uploadlists" API calls to the new UniProt ID Mapping API
Browse files Browse the repository at this point in the history
  • Loading branch information
deeenes committed Jun 10, 2023
1 parent 1ebbc27 commit f636c1e
Show file tree
Hide file tree
Showing 4 changed files with 179 additions and 36 deletions.
4 changes: 3 additions & 1 deletion pypath/data/settings.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,9 @@ server_annotations_full_download: false
mapping_use_cache: true
mapping_uniprot_static: false
use_intermediate_cache: true
uniprot_uploadlists_chunk_size: 10000
uniprot_uploadlists_chunk_size: 100000
uniprot_idmapping_timeout: 180
uniprot_idmapping_poll_interval: 3
timestamp_dirs: true
uniprot_info_maxlen: 500
uniprot_datasheet_connect_timeout: 10
Expand Down
50 changes: 45 additions & 5 deletions pypath/internals/input_formats.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@
'ensgt': 'Ensembl_Genomes_Transcript',
'hgnc': 'HGNC',
'ensp_string': 'STRING',
'genesymbol': 'Gene_Name',
}

BIOMART_MAPPING = {
Expand Down Expand Up @@ -227,6 +228,16 @@ def __contains__(self, other: str) -> bool:
)


def swap_sides(self):
    """
    Exchange the A and B sides of this mapping definition in place.

    Both the public ID types and the resource specific ID types are
    swapped, so after calling, side "a" refers to what used to be
    side "b" and vice versa.
    """

    self.id_type_a, self.id_type_b = (
        self.id_type_b,
        self.id_type_a,
    )
    self.resource_id_type_a, self.resource_id_type_b = (
        self.resource_id_type_b,
        self.resource_id_type_a,
    )



class FileMapping(MappingInput):

def __init__(
Expand Down Expand Up @@ -405,12 +416,9 @@ def __init__(
resource_id_type_b = uniprot_id_type_b,
)

self.swissprot = swissprot
self._set_swissprot(swissprot)
self.ac_mapping = AC_MAPPING

self.uniprot_id_type_a = self._resource_id_type_a
self.uniprot_id_type_b = self._resource_id_type_b

self._update_uniprot_types()
self.entity_type = 'protein'


Expand All @@ -421,6 +429,18 @@ def set_organism(self, ncbi_tax_id):
return other_organism


def swap_sides(self):
    """
    Exchange the A and B sides of the mapping definition.

    Performs the generic side swap, then refreshes the UniProt
    specific ID type attributes so they follow the new assignment.
    """

    MappingInput.swap_sides(self)
    self._update_uniprot_types()


def _update_uniprot_types(self):

self.uniprot_id_type_a = self._resource_id_type_a
self.uniprot_id_type_b = self._resource_id_type_b


def _resource_id_type(self, side: str) -> str:

uniprot_id_types = {
Expand All @@ -436,6 +456,26 @@ def _resource_id_type(self, side: str) -> str:
)


def _set_swissprot(self, swissprot: bool | None) -> None:

values = {'swissprot': True, 'trembl': False, 'uniprot': True}

if swissprot is None:

swissprot = values.get(
self.id_type_a,
values.get(self.id_type_b, swissprot)
)

self.swissprot = swissprot


@classmethod
def _uniprotkb_id_type(cls, id_type: str) -> bool:
    """
    Tell whether `id_type` is one of the UniProtKB side ID types.
    """

    uniprotkb_types = cls._from_uniprot

    return id_type in uniprotkb_types


class ProMapping(MappingInput):
"""
Provides parameters for mapping table from the Protein Ontology
Expand Down
16 changes: 16 additions & 0 deletions pypath/share/curl.py
Original file line number Diff line number Diff line change
Expand Up @@ -935,6 +935,7 @@ def __init__(
if CACHEDEL:

self.delete_cache_file()
self.init_cache()

if not self.use_cache and not DRYRUN:

Expand Down Expand Up @@ -1646,6 +1647,21 @@ def get_cache_file_name(self):
)


@classmethod
def cache_path(cls, **kwargs) -> str:
    """
    Returns the cache path without performing download or creating file.

    Args:
        kwargs:
            Arguments to `Curl`.

    Returns:
        Path of the cache file that a `Curl` call with the same
        arguments would read or write.
    """

    # Disable setup, the actual HTTP call and result processing:
    # we only need the cache file name these arguments map to.
    kwargs.update({'setup': False, 'call': False, 'process': False})

    # `cls` instead of a hard-coded `Curl` so subclasses resolve
    # their own cache file name; for `Curl` itself this is identical.
    return cls(**kwargs).cache_file_name


@classmethod
def replace_forbidden(cls, name: str, repl: str = '_') -> str:
"""
Expand Down
145 changes: 115 additions & 30 deletions pypath/utils/mapping.py
Original file line number Diff line number Diff line change
Expand Up @@ -600,6 +600,14 @@ def read_mapping_file(self):
self.b_to_a = b_to_a if self.load_b_to_a else None


@staticmethod
def _uniprotkb_id_type(id_type: str) -> bool:
    """
    Proxy to `UniprotListMapping._uniprotkb_id_type`.
    """

    checker = input_formats.UniprotListMapping._uniprotkb_id_type

    return checker(id_type)


def read_mapping_uniprot_list(self):
"""
Builds a mapping table by downloading data from UniProt's
Expand All @@ -608,6 +616,7 @@ def read_mapping_uniprot_list(self):

a_to_b = collections.defaultdict(set)
b_to_a = collections.defaultdict(set)
swap = False

if not self.uniprots:

Expand All @@ -617,14 +626,26 @@ def read_mapping_uniprot_list(self):
# getting a proteome wide list of UniProt IDs. If the translated
# ID type is not UniProt, then first we need to translate the
# proteome wide reference list from UniProt to the target ID type.
if self.param.id_type_a != 'uniprot':
if not self._uniprotkb_id_type(self.param.id_type_a):

u_target = self._read_mapping_uniprot_list(
uniprot_id_type_a = 'ACC',
uniprot_id_type_b = self.param.uniprot_id_type_a,
)
if self._uniprotkb_id_type(self.param.id_type_b):

swap = True
self.param.swap_sides()
self.load_a_to_b, self.load_b_to_a = (
self.load_b_to_a,
self.load_a_to_b,
)
upload_ac_list = self.uniprots

else:

u_target = self._read_mapping_uniprot_list(
uniprot_id_type_a = 'UniProtKB_AC-ID',
uniprot_id_type_b = self.param.uniprot_id_type_a,
)

upload_ac_list = [l.split('\t')[1].strip() for l in u_target]
upload_ac_list = [l.split('\t')[1].strip() for l in u_target]

else:

Expand All @@ -650,6 +671,15 @@ def read_mapping_uniprot_list(self):

b_to_a[l[1]].add(l[0])

if swap:

a_to_b, b_to_a = b_to_a, a_to_b
self.load_a_to_b, self.load_b_to_a = (
self.load_b_to_a,
self.load_a_to_b,
)
self.param.swap_sides()

self.a_to_b = a_to_b if self.load_a_to_b else None
self.b_to_a = b_to_a if self.load_b_to_a else None

Expand Down Expand Up @@ -707,8 +737,8 @@ def _read_mapping_uniprot_list(
'data. Querying a list of %u IDs.' % len(upload_ac_list)
)

url = urls.urls['uniprot_basic']['lists']

run_url = urls.urls['uniprot_idmapping']['run']
poll_result = {}
result = []

# loading data in chunks of 10,000 by default
Expand All @@ -725,45 +755,100 @@ def _read_mapping_uniprot_list(

post = {
'from': uniprot_id_type_a,
'format': 'tab',
'to': uniprot_id_type_b,
'uploadQuery': ' '.join(sorted(this_chunk)),
'ids': ' '.join(sorted(this_chunk)),
}
accept_json = {'req_headers': ['Accept: application/json']}

c = curl.Curl(url, post = post, large = True, silent = False)
run_args = {'url': run_url, 'post': post}
nocache = {'cache': False, 'large': False}

# 3 extra attempts
if c.result is None:
cache_path = curl.Curl.cache_path(**run_args)

for i in xrange(3):
if not os.path.exists(cache_path):

c = curl.Curl(
url,
post = post,
large = True,
silent = False,
cache = False,
slow = True,
run_c = curl.Curl( **run_args, **nocache, **accept_json)

if run_c.status != 200:

raise RuntimeError(
'Failed to submit job to UniProt ID Mapping. '
'See details in the log.'
)

if c.result is not None:
jobid = json.loads(run_c.result)['jobId']

self._log(
f'Submitted job to UniProt ID Mapping, job ID: `{jobid}`.'
)

timeout = settings.get('uniprot_idmapping_timeout')
interval = settings.get('uniprot_idmapping_poll_interval')
max_polls = math.ceil(timeout / interval)
poll_url = urls.urls['uniprot_idmapping']['poll'] % jobid
poll_args = {'url': poll_url} | nocache | accept_json

for i in range(max_polls):

self._log(
f'Polling job UniProt ID Mapping job `{jobid}`, '
f'poll {i + 1} of {max_polls}.'
)

poll_c = curl.Curl(**poll_args)

if poll_c.status != 200:

self._log(f'Poll failed with HTTP {poll_c.status}.')
continue

poll_result = json.loads(poll_c.result)

if 'status' in poll_result or 'failedIds' in poll_result:

break

if c.result is None or c.fileobj.read(5) == '<!DOC':
elif 'messages' in poll_result:

msg = (
'UniProt ID Mapping job failed: ' +
' '.join(common.to_list(poll_result['messages']))
)

self._console(
'Error at downloading ID mapping data from UniProt.'
self._log(msg)

raise RuntimeError(msg)

time.sleep(interval)

det_url = urls.urls['uniprot_idmapping']['details'] % jobid
det_c = curl.Curl(url = det_url, **nocache, **accept_json)
result_url = (
json.loads(det_c.result)['redirectURL'].
replace('/idmapping/results/', '/idmapping/stream/').
replace('/results/', '/results/stream/').
__add__('?format=tsv')
)

self._log(
'Retrieving UniProt ID Mapping results '
f'from `{result_url}`'
)

c.result = ''
with curl.cache_delete_on():

c.fileobj.seek(0)
res_c = curl.Curl(
url = result_url,
cache = cache_path,
large = True,
silent = False,
)

else:

# removing the header row
_ = next(c.result)
res_c = curl.Curl(**run_args)

result.extend(list(c.fileobj)[1:])
result.extend(list(res_c.fileobj)[1:])

return result

Expand Down

0 comments on commit f636c1e

Please sign in to comment.