Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Issue 18: Implement genotype IDs to support variants with multiple alleles #24

Merged
merged 9 commits into from
Oct 20, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 10 additions & 10 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ Pipeline to provide evidence strings for Open Targets from PharmGKB
## How to run
```
# Download data
DATA_DIR=<directory for data>
export DATA_DIR=<directory for data>
wget https://api.pharmgkb.org/v1/download/file/data/clinicalAnnotations.zip
wget https://api.pharmgkb.org/v1/download/file/data/drugs.zip
wget https://api.pharmgkb.org/v1/download/file/data/variants.zip
Expand All @@ -25,21 +25,21 @@ Unless otherwise mentioned, data is taken directly from PharmGKB.

Field | Description | Example
--|--|--
datasourceId | Identifier for data source | `"pharmGKB"`
datasourceId | Identifier for data source | `"pharmgkb"`
datasourceVersion | Date when data dump was generated, formatted YYYY-MM-DD | `"2023-08-05"`
datatypeId | Type of data corresponding to this evidence string (currently only clinical annotation) | `"clinical_annotation"`
studyId | Clinical Annotation ID | `"1449309937"`
evidenceLevel | Level of evidence (see [here](https://www.pharmgkb.org/page/clinAnnLevels)) | `"1A"`
literature | List of PMIDs associated with this clinical annotation | `["11389482", "27857962"]`
variantId | VCF-style (`chr_pos_ref_alt`) identifier of variant; computed as described [below](#variant-coordinate-computation) | `"19_38499645_GGAG_G"`
genotypeId | VCF-style (`chr_pos_ref_allele1,allele2`) identifier of genotype; computed as described [below](#variant-coordinate-computation) | `"19_38499645_GGAG_G,GGAG"`
variantRsId | RS ID of variant | `"rs121918596"`
variantFunctionalConsequenceId | Sequence Ontology term, currently from VEP only | `"SO_0001822"`
targetFromSourceId | Ensembl stable gene ID, currently from VEP only | `"ENSG00000196218"`
variantFunctionalConsequenceId | Sequence Ontology term, from VEP | `"SO_0001822"`
targetFromSourceId | Ensembl stable gene ID, from VEP | `"ENSG00000196218"`
genotype | Genotype string | SNP `"TA"`, indel `"del/GAG"`, repeat `"(CA)16/(CA)17"`
genotypeAnnotationText | Full annotation string for genotype | `"Patients with the rs121918596 del/GAG genotype may develop malignant hyperthermia when treated with volatile anesthetics [...]"`
drugFromSource | Drug name | `"succinylcholine"`
drugId | CHEBI ID of drug, mapped through OLS | `"CHEBI_45652"`
pgxCategory | Pharmacogenomics phenotype category | `"Toxicity"`
pgxCategory | Pharmacogenomics phenotype category | `"toxicity"`
phenotypeText | Phenotype name | `"Malignant Hyperthermia"`
phenotypeFromSourceId | EFO ID of phenotype, mapped through ZOOMA / OXO | `"Orphanet_423"`

Expand All @@ -56,15 +56,15 @@ Below is an example of a complete clinical annotation evidence string:
"11389482",
"27857962"
],
"variantId": "19_38499645_GGAG_G",
"genotypeId": "19_38499645_GGAG_G,GGAG",
"variantRsId": "rs121918596",
"variantFunctionalConsequenceId": "SO_0001822",
"targetFromSourceId": "ENSG00000196218",
"genotype": "del/GAG",
"genotypeAnnotationText": "Patients with the rs121918596 del/GAG genotype may develop malignant hyperthermia when treated with volatile anesthetics (desflurane, enflurane, halothane, isoflurane, methoxyflurane, sevoflurane) and/or succinylcholine as compared to patients with the GAG/GAG genotype. Other genetic or clinical factors may also influence the risk for malignant hyperthermia.",
"drugFromSource": "succinylcholine",
"drugId": "CHEBI_45652",
"pgxCategory": "Toxicity",
"pgxCategory": "toxicity",
"phenotypeText": "Malignant Hyperthermia",
"phenotypeFromSourceId": "Orphanet_423"
}
Expand All @@ -75,7 +75,7 @@ Other examples can be found in the [tests](tests/resources/expected_output.json)

TODO: describe this in words

````mermaid
```mermaid
graph TD
J[PharmGKB]
H[FASTA files]
Expand All @@ -89,4 +89,4 @@ graph TD
A --> |locations 'Chr+Pos'| D
H --> |Reference + context| D
E --> |Alternate alleles| D
````
```
9 changes: 9 additions & 0 deletions opentargets_pharmgkb/counts.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,10 @@ def __init__(self):
self.annot_with_pgkb_genes = 0
self.annot_with_vep_genes = 0
self.pgkb_vep_gene_diff = 0
# Variant counts
self.total_rs = 0
self.rs_with_alleles = 0
self.rs_with_more_than_2_alleles = 0

def report(self):
report_str = f'\nTotal clinical annotations: {self.clinical_annotations}\n'
Expand All @@ -54,5 +58,10 @@ def report(self):
f'({format_percent(self.annot_with_vep_genes, self.clinical_annotations)})\n')
report_str += (f'\tPGKB genes != VEP genes: {self.pgkb_vep_gene_diff} '
f'({format_percent(self.pgkb_vep_gene_diff, self.clinical_annotations)})\n')
report_str += f'Total RS: {self.total_rs}\n'
report_str += (f'\tWith parsed alleles: {self.rs_with_alleles} '
f'({format_percent(self.rs_with_alleles, self.total_rs)})\n')
report_str += (f'\t\tWith >2 alleles: {self.rs_with_more_than_2_alleles} '
f'({format_percent(self.rs_with_more_than_2_alleles, self.rs_with_alleles)})\n')
print(report_str)
return report_str
110 changes: 80 additions & 30 deletions opentargets_pharmgkb/evidence_generation.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
import logging
import multiprocessing
import os
import sys
from collections import defaultdict
from itertools import zip_longest

import pandas as pd
Expand All @@ -12,7 +14,8 @@
from opentargets_pharmgkb.counts import ClinicalAnnotationCounts
from opentargets_pharmgkb.ontology_apis import get_chebi_iri, get_efo_iri
from opentargets_pharmgkb.pandas_utils import none_to_nan, explode_column
from opentargets_pharmgkb.variant_coordinates import Fasta
from opentargets_pharmgkb.validation import validate_evidence_string
from opentargets_pharmgkb.variant_coordinates import Fasta, parse_genotype

logging.basicConfig()
logger = logging.getLogger(__package__)
Expand Down Expand Up @@ -58,7 +61,8 @@ def pipeline(data_dir, fasta_path, created_date, output_path, debug_path=None):
mapped_phenotypes = explode_and_map_phenotypes(mapped_drugs)
counts.exploded_phenotypes = len(mapped_phenotypes)

coordinates_table = get_vcf_coordinates(mapped_phenotypes, fasta_path)
# Coordinates and consequences
coordinates_table = get_genotype_ids(mapped_phenotypes, fasta_path, counts)
consequences_table = get_functional_consequences(coordinates_table)

# Add clinical evidence with PMIDs
Expand All @@ -80,72 +84,118 @@ def pipeline(data_dir, fasta_path, created_date, output_path, debug_path=None):
generate_clinical_annotation_evidence(so_accession_dict, created_date, row)
for _, row in evidence_table.iterrows()
]
# Validate and write
invalid_evidence = False
with open(output_path, 'w+') as output:
output.write('\n'.join(json.dumps(ev) for ev in evidence))
for ev_string in evidence:
if validate_evidence_string(ev_string):
output.write(json.dumps(ev_string)+'\n')
else:
invalid_evidence = True

# Final count report
if not debug_path:
debug_path = f'{output_path.rsplit(".", 1)[0]}_genes.csv'
gene_comparison_counts(evidence_table, counts, debug_path=debug_path)
counts.report()

# Exit with an error code if any invalid evidence is produced
# Do this at the very end so we still output counts and any valid evidence strings.
if invalid_evidence:
logger.error('Invalid evidence strings occurred, please check the logs for the details')
sys.exit(1)


def read_tsv_to_df(path):
return pd.read_csv(path, sep='\t', dtype=str)


def get_vcf_coordinates(df, fasta_path):
def genotype_id(chrom, pos, ref, parsed_genotype):
return f'{chrom}_{pos}_{ref}_{",".join(parsed_genotype)}' if chrom and pos and ref else None


def get_genotype_ids(df, fasta_path, counts=None):
"""
Get VCF-style coordinates (chr_pos_ref_alt) for dataframe.
Get genotype IDs (chr_pos_ref_allele1,allele2) for dataframe.

:param df: dataframe to annotate (needs 'Genotype/Allele' and 'Variant/Haplotypes' columns)
:return: dataframe with 'vcf_coords' column added
:param df: dataframe to annotate (needs 'Genotype/Allele', 'Variant/Haplotypes', 'Location' columns)
:param fasta_path: path to fasta file to check reference
:param counts: ClinicalAnnotationCounts; if provided will count multi-allelic variants.
:return: dataframe with 'genotype_id' column added
"""
fasta = Fasta(fasta_path)
# First set a column with all genotypes for a given rs
df_with_coords = pd.merge(df, df.groupby(by=ID_COL_NAME).aggregate(
all_genotypes=('Genotype/Allele', list)), on=ID_COL_NAME)
# Then get coordinates for each row
# TODO Currently this does one call per genotype per RS
# Remove redundant calls once we figure out how to handle genotypes & multiple alts per RS
for i, row in df_with_coords.iterrows():
df_with_coords.at[i, 'vcf_coords'] = fasta.get_coordinates_for_clinical_annotation(
row['Variant/Haplotypes'], row['Location'], row['all_genotypes'])
return df_with_coords
# First set a column with all genotypes for a given RS
df_with_ids = df.assign(parsed_genotype=df['Genotype/Allele'].apply(parse_genotype))
df_with_ids = pd.merge(df_with_ids, df_with_ids.groupby(by='Variant/Haplotypes').aggregate(
all_genotypes=('parsed_genotype', list)), on='Variant/Haplotypes')
# Get coordinates (chromsome, position, reference, and all alternate alleles) for each RS
rs_to_coords = {}
for i, row in df_with_ids.drop_duplicates(['Variant/Haplotypes']).iterrows():
chrom, pos, ref, alleles_dict = fasta.get_chr_pos_ref(row['Variant/Haplotypes'], row['Location'],
row['all_genotypes'])
rs_to_coords[row['Variant/Haplotypes']] = (chrom, pos, ref, alleles_dict)
# Generate per-variant counts, if applicable
if not counts:
continue
counts.total_rs += 1
if not alleles_dict:
continue
counts.rs_with_alleles += 1
if len(alleles_dict) <= 2:
continue
counts.rs_with_more_than_2_alleles += 1
# Use rs_to_coords to generate genotypeId for each genotype
for i, row in df_with_ids.iterrows():
chrom, pos, ref, alleles_dict = rs_to_coords[row['Variant/Haplotypes']]
if chrom and pos and ref and alleles_dict:
df_with_ids.at[i, 'genotype_id'] = genotype_id(chrom, pos, ref, sorted([alleles_dict[a]
for a in row['parsed_genotype']]))
else:
df_with_ids.at[i, 'genotype_id'] = None
return df_with_ids


def get_functional_consequences(df):
"""
Get functional consequences from VEP.

:param df: dataframe to annotate (needs 'vcf_coords' column)
:param df: dataframe to annotate (needs 'genotype_id' column)
:return: dataframe with 'overlapping_gene' and 'consequence_term' columns added
"""
vep_id_to_coords = {
coord_id_to_vep_id(x): x for x in df['vcf_coords'].dropna().drop_duplicates().tolist()
}
vep_id_to_genotype_ids = defaultdict(list)
for genotype_id in df['genotype_id'].dropna().drop_duplicates().tolist():
for vep_id in genotype_id_to_vep_ids(genotype_id):
vep_id_to_genotype_ids[vep_id].append(genotype_id)
# Note that variants in a single genotype will have VEP logic applied independently, i.e. most severe consequence
# for each overlapping gene.
with multiprocessing.Pool(processes=24) as pool:
all_consequences = [
pool.apply(process_to_list, args=(batch,))
for batch in grouper(vep_id_to_coords.keys(), 200)
for batch in grouper(vep_id_to_genotype_ids.keys(), 200)
]
mapped_consequences = pd.DataFrame(data=[
{
'vcf_coords': vep_id_to_coords[variant_id],
'genotype_id': genotype_id,
'overlapping_gene': gene_id,
'consequence_term': consequence_term
}
for batch in all_consequences
for variant_id, gene_id, gene_symbol, consequence_term in batch
])
return pd.merge(df, mapped_consequences, on='vcf_coords', how='left')
for genotype_id in vep_id_to_genotype_ids[variant_id]
]).drop_duplicates()
return pd.merge(df, mapped_consequences, on='genotype_id', how='left')


def coord_id_to_vep_id(coord_id):
"""Converts an underscore-separated coordinate identifier (e.g. 15_7237571_C_T) to VEP compatible one."""
def genotype_id_to_vep_ids(coord_id):
"""Converts an underscore-separated genotype identifier (e.g. 15_7237571_C_T,C) to VEP compatible ones."""
id_fields = coord_id.split('_')
assert len(id_fields) == 4, 'Invalid identifier supplied (should contain exactly 4 fields)'
return '{} {} . {} {}'.format(*id_fields)
chrom, pos, ref, genotype = id_fields
genotype = genotype.split(',')
for alt in genotype:
# Skip non-variants
if alt != ref:
yield f'{chrom} {pos} . {ref} {alt}'


def grouper(iterable, n):
Expand Down Expand Up @@ -251,7 +301,7 @@ def generate_clinical_annotation_evidence(so_accession_dict, created_date, row):
'literature': [str(x) for x in row['publications']],

# VARIANT ATTRIBUTES
'variantId': row['vcf_coords'],
'genotypeId': row['genotype_id'],
'variantRsId': row['Variant/Haplotypes'],
# 'originalSourceGeneId': row['gene_from_pgkb'],
'variantFunctionalConsequenceId': so_accession_dict.get(row['consequence_term'], None),
Expand All @@ -264,7 +314,7 @@ def generate_clinical_annotation_evidence(so_accession_dict, created_date, row):
# PHENOTYPE ATTRIBUTES
'drugFromSource': row['split_drug'],
'drugId': iri_to_code(row['chebi']),
'pgxCategory': row['Phenotype Category'],
'pgxCategory': row['Phenotype Category'].lower(),
'phenotypeText': row['split_phenotype'],
'phenotypeFromSourceId': iri_to_code(row['efo'])
}
Expand Down
33 changes: 33 additions & 0 deletions opentargets_pharmgkb/validation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
import json
import logging

import jsonschema
import requests

logging.basicConfig()
logger = logging.getLogger(__package__)
logger.setLevel(level=logging.DEBUG)


def get_ot_json_schema():
# Temporary schema url for testing
schema_url = 'https://raw.githubusercontent.com/opentargets/json_schema/master/schemas/pharmacogenomics.json'
response = requests.get(schema_url)
if response.ok:
return response.json()
raise ValueError('Problem getting JSON schema')


def validate_evidence_string(ev_string):
try:
ot_schema_contents = get_ot_json_schema()
jsonschema.validate(ev_string, ot_schema_contents, format_checker=jsonschema.FormatChecker())
return True
except jsonschema.exceptions.ValidationError as err:
logger.error('Error: evidence string does not validate against schema.')
logger.error(f'Error message: {err}')
logger.error(f'Complete evidence string: {json.dumps(ev_string)}')
return False
except jsonschema.exceptions.SchemaError:
logger.error('Error: OpenTargets schema file is invalid')
return False
Loading