Skip to content

Commit

Permalink
Updated the legacy UniProt "uploadlists" API calls to the new UniProt ID Mapping API
Browse files Browse the repository at this point in the history
  • Loading branch information
deeenes committed Jun 10, 2023
1 parent 1ebbc27 commit f636c1e
Show file tree
Hide file tree
Showing 4 changed files with 179 additions and 36 deletions.
4 changes: 3 additions & 1 deletion pypath/data/settings.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,9 @@ server_annotations_full_download: false
mapping_use_cache: true
mapping_uniprot_static: false
use_intermediate_cache: true
uniprot_uploadlists_chunk_size: 10000
uniprot_uploadlists_chunk_size: 100000
uniprot_idmapping_timeout: 180
uniprot_idmapping_poll_interval: 3
timestamp_dirs: true
uniprot_info_maxlen: 500
uniprot_datasheet_connect_timeout: 10
Expand Down
50 changes: 45 additions & 5 deletions pypath/internals/input_formats.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@
'ensgt': 'Ensembl_Genomes_Transcript',
'hgnc': 'HGNC',
'ensp_string': 'STRING',
'genesymbol': 'Gene_Name',
}

BIOMART_MAPPING = {
Expand Down Expand Up @@ -227,6 +228,16 @@ def __contains__(self, other: str) -> bool:
)


def swap_sides(self):
    """
    Exchange the A and B sides of this mapping definition in place.

    Both the public ID types and the resource specific ID types are
    swapped, so after calling, side "a" refers to what used to be
    side "b" and vice versa.
    """

    self.id_type_a, self.id_type_b = (
        self.id_type_b,
        self.id_type_a,
    )
    self.resource_id_type_a, self.resource_id_type_b = (
        self.resource_id_type_b,
        self.resource_id_type_a,
    )



class FileMapping(MappingInput):

def __init__(
Expand Down Expand Up @@ -405,12 +416,9 @@ def __init__(
resource_id_type_b = uniprot_id_type_b,
)

self.swissprot = swissprot
self._set_swissprot(swissprot)
self.ac_mapping = AC_MAPPING

self.uniprot_id_type_a = self._resource_id_type_a
self.uniprot_id_type_b = self._resource_id_type_b

self._update_uniprot_types()
self.entity_type = 'protein'


Expand All @@ -421,6 +429,18 @@ def set_organism(self, ncbi_tax_id):
return other_organism


def swap_sides(self):
    """
    Exchange the A and B sides of the mapping definition.

    Performs the generic side swap, then refreshes the UniProt
    specific ID type attributes so they follow the new assignment.
    """

    MappingInput.swap_sides(self)
    self._update_uniprot_types()


def _update_uniprot_types(self):

self.uniprot_id_type_a = self._resource_id_type_a
self.uniprot_id_type_b = self._resource_id_type_b


def _resource_id_type(self, side: str) -> str:

uniprot_id_types = {
Expand All @@ -436,6 +456,26 @@ def _resource_id_type(self, side: str) -> str:
)


def _set_swissprot(self, swissprot: bool | None) -> None:

values = {'swissprot': True, 'trembl': False, 'uniprot': True}

if swissprot is None:

swissprot = values.get(
self.id_type_a,
values.get(self.id_type_b, swissprot)
)

self.swissprot = swissprot


@classmethod
def _uniprotkb_id_type(cls, id_type: str) -> bool:
    """
    Tell whether `id_type` is one of the UniProtKB side ID types.
    """

    uniprotkb_types = cls._from_uniprot

    return id_type in uniprotkb_types


class ProMapping(MappingInput):
"""
Provides parameters for mapping table from the Protein Ontology
Expand Down
16 changes: 16 additions & 0 deletions pypath/share/curl.py
Original file line number Diff line number Diff line change
Expand Up @@ -935,6 +935,7 @@ def __init__(
if CACHEDEL:

self.delete_cache_file()
self.init_cache()

if not self.use_cache and not DRYRUN:

Expand Down Expand Up @@ -1646,6 +1647,21 @@ def get_cache_file_name(self):
)


@classmethod
def cache_path(cls, **kwargs) -> str:
    """
    Returns the cache path without performing download or creating file.

    Args:
        kwargs:
            Arguments to `Curl`.

    Returns:
        Path of the cache file that a `Curl` call with the same
        arguments would read or write.
    """

    # Disable setup, the actual HTTP call and result processing:
    # we only need the cache file name these arguments map to.
    kwargs.update({'setup': False, 'call': False, 'process': False})

    # `cls` instead of a hard-coded `Curl` so subclasses resolve
    # their own cache file name; for `Curl` itself this is identical.
    return cls(**kwargs).cache_file_name


@classmethod
def replace_forbidden(cls, name: str, repl: str = '_') -> str:
"""
Expand Down
145 changes: 115 additions & 30 deletions pypath/utils/mapping.py
Original file line number Diff line number Diff line change
Expand Up @@ -600,6 +600,14 @@ def read_mapping_file(self):
self.b_to_a = b_to_a if self.load_b_to_a else None


@staticmethod
def _uniprotkb_id_type(id_type: str) -> bool:
    """
    Proxy to `UniprotListMapping._uniprotkb_id_type`.
    """

    checker = input_formats.UniprotListMapping._uniprotkb_id_type

    return checker(id_type)


def read_mapping_uniprot_list(self):
"""
Builds a mapping table by downloading data from UniProt's
Expand All @@ -608,6 +616,7 @@ def read_mapping_uniprot_list(self):

a_to_b = collections.defaultdict(set)
b_to_a = collections.defaultdict(set)
swap = False

if not self.uniprots:

Expand All @@ -617,14 +626,26 @@ def read_mapping_uniprot_list(self):
# getting a proteome wide list of UniProt IDs. If the translated
# ID type is not UniProt, then first we need to translate the
# proteome wide reference list from UniProt to the target ID type.
if self.param.id_type_a != 'uniprot':
if not self._uniprotkb_id_type(self.param.id_type_a):

u_target = self._read_mapping_uniprot_list(
uniprot_id_type_a = 'ACC',
uniprot_id_type_b = self.param.uniprot_id_type_a,
)
if self._uniprotkb_id_type(self.param.id_type_b):

swap = True
self.param.swap_sides()
self.load_a_to_b, self.load_b_to_a = (
self.load_b_to_a,
self.load_a_to_b,
)
upload_ac_list = self.uniprots

else:

u_target = self._read_mapping_uniprot_list(
uniprot_id_type_a = 'UniProtKB_AC-ID',
uniprot_id_type_b = self.param.uniprot_id_type_a,
)

upload_ac_list = [l.split('\t')[1].strip() for l in u_target]
upload_ac_list = [l.split('\t')[1].strip() for l in u_target]

else:

Expand All @@ -650,6 +671,15 @@ def read_mapping_uniprot_list(self):

b_to_a[l[1]].add(l[0])

if swap:

a_to_b, b_to_a = b_to_a, a_to_b
self.load_a_to_b, self.load_b_to_a = (
self.load_b_to_a,
self.load_a_to_b,
)
self.param.swap_sides()

self.a_to_b = a_to_b if self.load_a_to_b else None
self.b_to_a = b_to_a if self.load_b_to_a else None

Expand Down Expand Up @@ -707,8 +737,8 @@ def _read_mapping_uniprot_list(
'data. Querying a list of %u IDs.' % len(upload_ac_list)
)

url = urls.urls['uniprot_basic']['lists']

run_url = urls.urls['uniprot_idmapping']['run']
poll_result = {}
result = []

# loading data in chunks of 10,000 by default
Expand All @@ -725,45 +755,100 @@ def _read_mapping_uniprot_list(

post = {
'from': uniprot_id_type_a,
'format': 'tab',
'to': uniprot_id_type_b,
'uploadQuery': ' '.join(sorted(this_chunk)),
'ids': ' '.join(sorted(this_chunk)),
}
accept_json = {'req_headers': ['Accept: application/json']}

c = curl.Curl(url, post = post, large = True, silent = False)
run_args = {'url': run_url, 'post': post}
nocache = {'cache': False, 'large': False}

# 3 extra attempts
if c.result is None:
cache_path = curl.Curl.cache_path(**run_args)

for i in xrange(3):
if not os.path.exists(cache_path):

c = curl.Curl(
url,
post = post,
large = True,
silent = False,
cache = False,
slow = True,
run_c = curl.Curl( **run_args, **nocache, **accept_json)

if run_c.status != 200:

raise RuntimeError(
'Failed to submit job to UniProt ID Mapping. '
'See details in the log.'
)

if c.result is not None:
jobid = json.loads(run_c.result)['jobId']

self._log(
f'Submitted job to UniProt ID Mapping, job ID: `{jobid}`.'
)

timeout = settings.get('uniprot_idmapping_timeout')
interval = settings.get('uniprot_idmapping_poll_interval')
max_polls = math.ceil(timeout / interval)
poll_url = urls.urls['uniprot_idmapping']['poll'] % jobid
poll_args = {'url': poll_url} | nocache | accept_json

for i in range(max_polls):

self._log(
f'Polling job UniProt ID Mapping job `{jobid}`, '
f'poll {i + 1} of {max_polls}.'
)

poll_c = curl.Curl(**poll_args)

if poll_c.status != 200:

self._log(f'Poll failed with HTTP {poll_c.status}.')
continue

poll_result = json.loads(poll_c.result)

if 'status' in poll_result or 'failedIds' in poll_result:

break

if c.result is None or c.fileobj.read(5) == '<!DOC':
elif 'messages' in poll_result:

msg = (
'UniProt ID Mapping job failed: ' +
' '.join(common.to_list(poll_result['messages']))
)

self._console(
'Error at downloading ID mapping data from UniProt.'
self._log(msg)

raise RuntimeError(msg)

time.sleep(interval)

det_url = urls.urls['uniprot_idmapping']['details'] % jobid
det_c = curl.Curl(url = det_url, **nocache, **accept_json)
result_url = (
json.loads(det_c.result)['redirectURL'].
replace('/idmapping/results/', '/idmapping/stream/').
replace('/results/', '/results/stream/').
__add__('?format=tsv')
)

self._log(
'Retrieving UniProt ID Mapping results '
f'from `{result_url}`'
)

c.result = ''
with curl.cache_delete_on():

c.fileobj.seek(0)
res_c = curl.Curl(
url = result_url,
cache = cache_path,
large = True,
silent = False,
)

else:

# removing the header row
_ = next(c.result)
res_c = curl.Curl(**run_args)

result.extend(list(c.fileobj)[1:])
result.extend(list(res_c.fileobj)[1:])

return result

Expand Down

0 comments on commit f636c1e

Please sign in to comment.