add normalisation into pipeline, add tests

EBIvariation · Apr 5, 2024 · 3a35810 · 3a35810
1 parent 9064e24
commit 3a35810
Show file tree

Hide file tree

Showing 5 changed files with 53 additions and 24 deletions.
diff --git a/opentargets_pharmgkb/evidence_generation.py b/opentargets_pharmgkb/evidence_generation.py
@@ -145,7 +145,6 @@ def get_genotype_ids(df, fasta_path, counts=None):
     for i, row in df_with_ids.drop_duplicates(['Variant/Haplotypes']).iterrows():
         chrom, pos, ref, alleles_dict = fasta.get_chr_pos_ref(row['Variant/Haplotypes'], row['Location'],
                                                               row['all_genotypes'])
-        # TODO - here fallback on NCBI SPDI method
         rs_to_coords[row['Variant/Haplotypes']] = (chrom, pos, ref, alleles_dict)
         # Generate per-variant counts, if applicable
         if not counts:

diff --git a/opentargets_pharmgkb/ncbi_utils.py b/opentargets_pharmgkb/ncbi_utils.py
@@ -31,38 +31,38 @@ def get_rsid_summaries(rsids, api_key=None):
     return {}
 
 
-def parse_spdi_from_ncbi_result(ncbi_result):
+def parse_spdi_from_ncbi_result(spdi_result):
     """
     Parse SPDI expressions from NCBI result.
 
-    :param ncbi_result: dict response from NCBI summary endpoint for a single rsID
+    :param spdi_result: SPDI string returned from NCBI
     :return: SPDI coords (chrom, pos, ref, list of alts)
     """
-    spdis = ncbi_result['spdi'].split(',')
+    spdis = spdi_result.split(',')
     # Parse each SPDI into coordinates
     chrom = pos = ref = None
     alts = []
     for spdi in spdis:
         # TODO warn or something if chrom/pos/ref differs among these
         chrom, pos, ref, alt = spdi.split(':')
         alts.append(alt)
-    return chrom, pos, ref, alts
+    return chrom, int(pos), ref, alts
 
 
 def get_spdi_coords_for_rsid(rsid):
     """
     Return SPDI coordinates for a single rsIDs by querying NCBI.
 
-    :param rsid:
-    :return: dict mapping rsid to SPDI coords as tuple (chrom, pos, ref, list of alts)
+    :param rsid: rsID to query
+    :return: SPDI coords as tuple (chrom, pos, ref, list of alts)
     """
     if rsid.startswith('rs'):
         rsid = rsid[2:]
     summary_result = get_rsid_summaries([rsid])
-    if rsid not in summary_result:
+    if rsid not in summary_result or 'spdi' not in summary_result[rsid]:
         logger.warning(f'Could not get SPDI from NCBI for rs{rsid}')
         return None, None, None, []
-    return parse_spdi_from_ncbi_result(summary_result[rsid])
+    return parse_spdi_from_ncbi_result(summary_result[rsid]['spdi'])
 
 
 def get_spdi_coords_for_rsids(rsids):
@@ -77,7 +77,8 @@ def get_spdi_coords_for_rsids(rsids):
     rsid_to_spdis = {}
     for k in summary_results:
         rsid = f'rs{k}'
-        rsid_to_spdis[rsid] = parse_spdi_from_ncbi_result(summary_results[k])
+        if 'spdi' in summary_results[k]:
+            rsid_to_spdis[rsid] = parse_spdi_from_ncbi_result(summary_results[k]['spdi'])
     if set(rsids) != set(rsid_to_spdis.keys()):
         logger.warning('Did not get SPDI from NCBI for all rsids')
     return rsid_to_spdis
diff --git a/opentargets_pharmgkb/variant_coordinates.py b/opentargets_pharmgkb/variant_coordinates.py
@@ -5,6 +5,8 @@
 import pandas as pd
 from Bio import SeqIO
 
+from opentargets_pharmgkb.ncbi_utils import get_spdi_coords_for_rsid
+
 logger = logging.getLogger(__package__)
 
 
@@ -69,9 +71,8 @@ def get_chr_pos_ref(self, rsid, location, all_parsed_genotypes):
         else:
             start = end = int(pos)
 
-        # TODO replace this with normalisation
         alleles_dict = {alt: alt for genotype in all_parsed_genotypes for alt in genotype}
-        # Correct for deletion alleles
+        # Add context base for deletion alleles
         if 'DEL' in alleles_dict:
             if end == start:
                 end -= 1  # keep end == start if they began that way
@@ -81,14 +82,38 @@ def get_chr_pos_ref(self, rsid, location, all_parsed_genotypes):
         if not alleles_dict:
             logger.warning(f'Could not parse any genotypes for {rsid}')
             return chrom_num, start, None, None
+        pgkb_alleles = alleles_dict.values()
 
+        # First check for ref based on PGKB's coordinates
         ref = self.get_ref_from_fasta(chrom, start, end)
-        # Report & skip if ref is not among the alleles
-        if ref not in alleles_dict.values():
-            logger.warning(f'Ref not in alleles: {rsid}\t{ref}\t{",".join(alleles_dict)}')
-            return chrom_num, start, None, None  # TODO return normalised alleles in this case
+        if ref in pgkb_alleles:
+            return chrom_num, start, ref, alleles_dict
+
+        logger.info(f'Ref from FASTA not in alleles: {rsid}\t{ref}\t{",".join(pgkb_alleles)}')
+        # If could not determine ref from FASTA, use ref determined from NCBI & normalised
+        chrom, pos, ref, alleles_dict = self.get_norm_coords_from_ncbi(rsid, alleles_dict)
+        logger.info(f'Will use ref from NCBI: {ref}')
+        # Use NCBI's chromosome, normalised position, and normalised reference
+        # (Note this could be inconsistent for DEL alleles if PGKB's position doesn't match NCBI's)
+        return self.get_chrom_num_from_refseq(chrom), pos, ref, alleles_dict
+
+    def get_norm_coords_from_ncbi(self, rsid, alleles_dict):
+        """
+        Get normalised coordinates from NCBI for an rsID, using alleles passed in.
 
-        return chrom_num, start, ref, alleles_dict
+        :param rsid: rsID to query
+        :param alleles_dict: dict mapping PGKB's original alleles to alleles with context added
+        :return: normalised coordinates (chrom, pos, ref, alleles) where alleles is a dict with the same keys as passed
+            in but with normalised values
+        """
+        # Don't actually need the alts from SPDI, we only care about PGKB's alleles
+        chrom, pos, ref, _ = get_spdi_coords_for_rsid(rsid)
+        # Need to normalise values of alleles_dict while keeping track of their associated keys,
+        # for simplicity we do this by preserving order.
+        ordered_keys = list(alleles_dict.keys())
+        alts = [alleles_dict[k] for k in ordered_keys]
+        chrom, norm_pos, norm_ref, norm_alts = self.normalise_with_ref(chrom, pos, ref, alts)
+        return chrom, norm_pos, norm_ref, dict(zip(ordered_keys, norm_alts))
 
     @lru_cache
     def get_ref_from_fasta(self, chrom, start, end=None):
@@ -120,7 +145,6 @@ def get_chrom_num_from_refseq(self, chrom_refseq):
         logger.warning(f'Could not get chromosome number for {chrom_refseq}')
         return None
 
-    @lru_cache
     def normalise(self, chrom, pos, alleles):
         """
         Normalise alleles to be parsimonious and left-aligned.
@@ -147,8 +171,7 @@ def normalise(self, chrom, pos, alleles):
             pos += 1
         return chrom, pos, alleles
 
-    @lru_cache
     def normalise_with_ref(self, chrom, pos, ref, alts):
         """Normalisation where reference allele is known."""
-        chrom, pos, alleles = self.normalise(chrom, pos, [ref] + [a for a in alts])
+        chrom, pos, alleles = self.normalise(chrom, pos, [ref] + list(alts))
         return chrom, pos, alleles[0], alleles[1:]
diff --git a/tests/test_ncbi_utils.py b/tests/test_ncbi_utils.py
@@ -0,0 +1,6 @@
+from opentargets_pharmgkb.ncbi_utils import get_spdi_coords_for_rsid
+
+
+def test_get_spdi_coords_for_rsid():
+    assert get_spdi_coords_for_rsid('rs35068180') == ('NC_000011.10', 102845216, 'AAAAA', ['AAAA', 'AAAAAA', 'AAAAAAA'])
+    assert get_spdi_coords_for_rsid('rs3000000000') == (None, None, None, [])
diff --git a/tests/test_variant_coordinates.py b/tests/test_variant_coordinates.py
@@ -30,9 +30,9 @@ def test_get_coordinates_range_location(fasta: Fasta):
     ) == ('21', 33341700, 'AGAC', {'DEL': 'A', 'GAC': 'AGAC'})
 
 
-def test_get_coordinates_not_match_reference(fasta: Fasta):
+def test_get_coordinates_from_ncbi(fasta: Fasta):
     assert fasta.get_chr_pos_ref(
         'rs1051266',
-        'NC_000021.9:33341701_33341703',
-        [['TTT', 'TTT'], ['TTT', 'DEL'], ['DEL', 'DEL']]
-    ) == ('21', 33341700, None, None)
+        'NC_000021.9:45537879',
+        [['TGA', 'TGA'], ['TGA', 'TGAGA'], ['TGAGA', 'TGAGA']]
+    ) == ('21', 45537879, 'T', {'TGA': 'TGA', 'TGAGA': 'TGAGA'})