Skip to content

Commit

Permalink
add normalisation into pipeline, add tests
Browse files Browse the repository at this point in the history
  • Loading branch information
apriltuesday committed Apr 5, 2024
1 parent 9064e24 commit 3a35810
Show file tree
Hide file tree
Showing 5 changed files with 53 additions and 24 deletions.
1 change: 0 additions & 1 deletion opentargets_pharmgkb/evidence_generation.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,6 @@ def get_genotype_ids(df, fasta_path, counts=None):
for i, row in df_with_ids.drop_duplicates(['Variant/Haplotypes']).iterrows():
chrom, pos, ref, alleles_dict = fasta.get_chr_pos_ref(row['Variant/Haplotypes'], row['Location'],
row['all_genotypes'])
# TODO - here fallback on NCBI SPDI method
rs_to_coords[row['Variant/Haplotypes']] = (chrom, pos, ref, alleles_dict)
# Generate per-variant counts, if applicable
if not counts:
Expand Down
19 changes: 10 additions & 9 deletions opentargets_pharmgkb/ncbi_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,38 +31,38 @@ def get_rsid_summaries(rsids, api_key=None):
return {}


def parse_spdi_from_ncbi_result(ncbi_result):
def parse_spdi_from_ncbi_result(spdi_result):
"""
Parse SPDI expressions from NCBI result.
:param ncbi_result: dict response from NCBI summary endpoint for a single rsID
:param spdi_result: SPDI string returned from NCBI
:return: SPDI coords (chrom, pos, ref, list of alts)
"""
spdis = ncbi_result['spdi'].split(',')
spdis = spdi_result.split(',')
# Parse each SPDI into coordinates
chrom = pos = ref = None
alts = []
for spdi in spdis:
# TODO warn or something if chrom/pos/ref differs among these
chrom, pos, ref, alt = spdi.split(':')
alts.append(alt)
return chrom, pos, ref, alts
return chrom, int(pos), ref, alts


def get_spdi_coords_for_rsid(rsid):
"""
Return SPDI coordinates for a single rsIDs by querying NCBI.
:param rsid:
:return: dict mapping rsid to SPDI coords as tuple (chrom, pos, ref, list of alts)
:param rsid: rsID to query
:return: SPDI coords as tuple (chrom, pos, ref, list of alts)
"""
if rsid.startswith('rs'):
rsid = rsid[2:]
summary_result = get_rsid_summaries([rsid])
if rsid not in summary_result:
if rsid not in summary_result or 'spdi' not in summary_result[rsid]:
logger.warning(f'Could not get SPDI from NCBI for rs{rsid}')
return None, None, None, []
return parse_spdi_from_ncbi_result(summary_result[rsid])
return parse_spdi_from_ncbi_result(summary_result[rsid]['spdi'])


def get_spdi_coords_for_rsids(rsids):
Expand All @@ -77,7 +77,8 @@ def get_spdi_coords_for_rsids(rsids):
rsid_to_spdis = {}
for k in summary_results:
rsid = f'rs{k}'
rsid_to_spdis[rsid] = parse_spdi_from_ncbi_result(summary_results[k])
if 'spdi' in summary_results[k]:
rsid_to_spdis[rsid] = parse_spdi_from_ncbi_result(summary_results[k]['spdi'])
if set(rsids) != set(rsid_to_spdis.keys()):
logger.warning('Did not get SPDI from NCBI for all rsids')
return rsid_to_spdis
43 changes: 33 additions & 10 deletions opentargets_pharmgkb/variant_coordinates.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
import pandas as pd
from Bio import SeqIO

from opentargets_pharmgkb.ncbi_utils import get_spdi_coords_for_rsid

logger = logging.getLogger(__package__)


Expand Down Expand Up @@ -69,9 +71,8 @@ def get_chr_pos_ref(self, rsid, location, all_parsed_genotypes):
else:
start = end = int(pos)

# TODO replace this with normalisation
alleles_dict = {alt: alt for genotype in all_parsed_genotypes for alt in genotype}
# Correct for deletion alleles
# Add context base for deletion alleles
if 'DEL' in alleles_dict:
if end == start:
end -= 1 # keep end == start if they began that way
Expand All @@ -81,14 +82,38 @@ def get_chr_pos_ref(self, rsid, location, all_parsed_genotypes):
if not alleles_dict:
logger.warning(f'Could not parse any genotypes for {rsid}')
return chrom_num, start, None, None
pgkb_alleles = alleles_dict.values()

# First check for ref based on PGKB's coordinates
ref = self.get_ref_from_fasta(chrom, start, end)
# Report & skip if ref is not among the alleles
if ref not in alleles_dict.values():
logger.warning(f'Ref not in alleles: {rsid}\t{ref}\t{",".join(alleles_dict)}')
return chrom_num, start, None, None # TODO return normalised alleles in this case
if ref in pgkb_alleles:
return chrom_num, start, ref, alleles_dict

logger.info(f'Ref from FASTA not in alleles: {rsid}\t{ref}\t{",".join(pgkb_alleles)}')
# If could not determine ref from FASTA, use ref determined from NCBI & normalised
chrom, pos, ref, alleles_dict = self.get_norm_coords_from_ncbi(rsid, alleles_dict)
logger.info(f'Will use ref from NCBI: {ref}')
# Use NCBI's chromosome, normalised position, and normalised reference
# (Note this could be inconsistent for DEL alleles if PGKB's position doesn't match NCBI's)
return self.get_chrom_num_from_refseq(chrom), pos, ref, alleles_dict

def get_norm_coords_from_ncbi(self, rsid, alleles_dict):
"""
Get normalised coordinates from NCBI for an rsID, using alleles passed in.
return chrom_num, start, ref, alleles_dict
:param rsid: rsID to query
:param alleles_dict: dict mapping PGKB's original alleles to alleles with context added
:return: normalised coordinates (chrom, pos, ref, alleles) where alleles is a dict with the same keys as passed
in but with normalised values
"""
# Don't actually need the alts from SPDI, we only care about PGKB's alleles
chrom, pos, ref, _ = get_spdi_coords_for_rsid(rsid)
# Need to normalise values of alleles_dict while keeping track of their associated keys,
# for simplicity we do this by preserving order.
ordered_keys = list(alleles_dict.keys())
alts = [alleles_dict[k] for k in ordered_keys]
chrom, norm_pos, norm_ref, norm_alts = self.normalise_with_ref(chrom, pos, ref, alts)
return chrom, norm_pos, norm_ref, dict(zip(ordered_keys, norm_alts))

@lru_cache
def get_ref_from_fasta(self, chrom, start, end=None):
Expand Down Expand Up @@ -120,7 +145,6 @@ def get_chrom_num_from_refseq(self, chrom_refseq):
logger.warning(f'Could not get chromosome number for {chrom_refseq}')
return None

@lru_cache
def normalise(self, chrom, pos, alleles):
"""
Normalise alleles to be parsimonious and left-aligned.
Expand All @@ -147,8 +171,7 @@ def normalise(self, chrom, pos, alleles):
pos += 1
return chrom, pos, alleles

@lru_cache
def normalise_with_ref(self, chrom, pos, ref, alts):
"""Normalisation where reference allele is known."""
chrom, pos, alleles = self.normalise(chrom, pos, [ref] + [a for a in alts])
chrom, pos, alleles = self.normalise(chrom, pos, [ref] + list(alts))
return chrom, pos, alleles[0], alleles[1:]
6 changes: 6 additions & 0 deletions tests/test_ncbi_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
from opentargets_pharmgkb.ncbi_utils import get_spdi_coords_for_rsid


def test_get_spdi_coords_for_rsid():
assert get_spdi_coords_for_rsid('rs35068180') == ('NC_000011.10', 102845216, 'AAAAA', ['AAAA', 'AAAAAA', 'AAAAAAA'])
assert get_spdi_coords_for_rsid('rs3000000000') == (None, None, None, [])
8 changes: 4 additions & 4 deletions tests/test_variant_coordinates.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,9 @@ def test_get_coordinates_range_location(fasta: Fasta):
) == ('21', 33341700, 'AGAC', {'DEL': 'A', 'GAC': 'AGAC'})


def test_get_coordinates_not_match_reference(fasta: Fasta):
def test_get_coordinates_from_ncbi(fasta: Fasta):
assert fasta.get_chr_pos_ref(
'rs1051266',
'NC_000021.9:33341701_33341703',
[['TTT', 'TTT'], ['TTT', 'DEL'], ['DEL', 'DEL']]
) == ('21', 33341700, None, None)
'NC_000021.9:45537879',
[['TGA', 'TGA'], ['TGA', 'TGAGA'], ['TGAGA', 'TGAGA']]
) == ('21', 45537879, 'T', {'TGA': 'TGA', 'TGAGA': 'TGAGA'})

0 comments on commit 3a35810

Please sign in to comment.