EBIvariation · apriltuesday · May 8, 2024 · Apr 23, 2024 · Apr 24, 2024 · Apr 24, 2024
diff --git a/README.md b/README.md
@@ -6,13 +6,11 @@ Pipeline to provide evidence strings for Open Targets from PharmGKB
 # Download data
 export DATA_DIR=<directory for data>
 wget https://api.pharmgkb.org/v1/download/file/data/clinicalAnnotations.zip
-wget https://api.pharmgkb.org/v1/download/file/data/drugs.zip
 wget https://api.pharmgkb.org/v1/download/file/data/variants.zip
 wget https://api.pharmgkb.org/v1/download/file/data/relationships.zip
 
 unzip -j clinicalAnnotations.zip "*.tsv" -d $DATA_DIR
 unzip -j clinicalAnnotations.zip "CREATED*.txt" -d $DATA_DIR
-unzip -j drugs.zip "*.tsv" -d $DATA_DIR
 unzip -j variants.zip "*.tsv" -d $DATA_DIR
 unzip -j relationships.zip "*.tsv" -d $DATA_DIR
-rm clinicalAnnotations.zip drugs.zip variants.zip relationships.zip
+rm clinicalAnnotations.zip variants.zip relationships.zip
@@ -42,8 +40,7 @@ genotypeAnnotationText | Full annotation string for genotype or allele | `"Patie
 directionality | Allele function annotation (see Table 2 [here](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5253119/)) | `"Decreased function"`
 haplotypeId | Name of haplotype; can be an allele or a genotype | `"CYP2B6*6"` or `"GSTT1 non-null/non-null"`
 haplotypeFromSourceId | Internal PGKB identifier for the haplotype | `"PA165818762"`
-drugFromSource | Drug name | `"succinylcholine"`
-drugFromSourceId | CHEBI ID of drug, mapped through OLS | `"CHEBI_45652"`
+drugsFromSource | List of drug names; contains more than one name only when the drugs are known to be annotated in combination | `["succinylcholine"]` or `["ivacaftor", "lumacaftor"]`
 pgxCategory | Pharmacogenomics phenotype category | `"toxicity"`
 phenotypeText | Phenotype name | `"Malignant Hyperthermia"`
 phenotypeFromSourceId | EFO ID of phenotype, mapped through ZOOMA / OXO | `"Orphanet_423"`
@@ -67,8 +64,9 @@ Below is an example of a complete clinical annotation evidence string:
   "targetFromSourceId": "ENSG00000196218",
   "genotype": "del/GAG",
   "genotypeAnnotationText": "Patients with the rs121918596 del/GAG genotype may develop malignant hyperthermia when treated with volatile anesthetics (desflurane, enflurane, halothane, isoflurane, methoxyflurane, sevoflurane) and/or succinylcholine as compared to patients with the GAG/GAG genotype. Other genetic or clinical factors may also influence the risk for malignant hyperthermia.",
-  "drugFromSource": "succinylcholine",
-  "drugId": "CHEBI_45652",
+  "drugsFromSource": [
+    "succinylcholine"
+  ],
   "pgxCategory": "toxicity",
   "phenotypeText": "Malignant Hyperthermia",
   "phenotypeFromSourceId": "Orphanet_423"

diff --git a/opentargets_pharmgkb/counts.py b/opentargets_pharmgkb/counts.py
@@ -20,16 +20,11 @@ def __init__(self):
         self.exploded_phenotypes = 0
         # Output counts (after annotation and exploding)
         self.evidence_strings = 0
-        self.with_chebi = 0
         self.with_efo = 0
         self.with_consequence = 0
         self.with_target_gene = 0
         self.with_haplotype = 0
         self.resolved_haplotype_id = 0  # indicates we were able to resolve the haplotype to a PGKB internal ID
-        # Evaluation counts - after annotation but without exploding
-        self.annot_with_pgkb_genes = 0
-        self.annot_with_vep_genes = 0
-        self.pgkb_vep_gene_diff = 0
         # Variant counts
         self.total_rs = 0
         self.rs_with_alleles = 0
@@ -47,21 +42,12 @@ def report(self):
         report_str += (f'\t\t4. Exploded by phenotype: {self.exploded_phenotypes}'
                        f' ({format_decimal(self.exploded_phenotypes, self.exploded_drugs)}x)\n')
         report_str += f'Total evidence strings: {self.evidence_strings}\n'
-        report_str += f'\tWith CHEBI: {self.with_chebi} ({format_percent(self.with_chebi, self.evidence_strings)})\n'
         report_str += (f'\tWith EFO phenotype: {self.with_efo}'
                        f' ({format_percent(self.with_efo, self.evidence_strings)})\n')
         report_str += (f'\tWith functional consequence: {self.with_consequence} '
                        f'({format_percent(self.with_consequence, self.evidence_strings)})\n')
         report_str += (f'\tWith target gene: {self.with_target_gene} '
                        f'({format_percent(self.with_target_gene, self.evidence_strings)})\n')
-        if self.annot_with_pgkb_genes or self.annot_with_vep_genes:
-            report_str += f'Gene comparisons per annotation\n'
-            report_str += (f'\tWith PGKB genes: {self.annot_with_pgkb_genes} '
-                           f'({format_percent(self.annot_with_pgkb_genes, self.clinical_annotations)})\n')
-            report_str += (f'\tWith VEP genes: {self.annot_with_vep_genes} '
-                           f'({format_percent(self.annot_with_vep_genes, self.clinical_annotations)})\n')
-            report_str += (f'\tPGKB genes != VEP genes: {self.pgkb_vep_gene_diff} '
-                           f'({format_percent(self.pgkb_vep_gene_diff, self.clinical_annotations)})\n')
         report_str += f'Total RS: {self.total_rs}\n'
         report_str += (f'\tWith parsed alleles: {self.rs_with_alleles} '
                        f'({format_percent(self.rs_with_alleles, self.total_rs)})\n')

diff --git a/opentargets_pharmgkb/evidence_generation.py b/opentargets_pharmgkb/evidence_generation.py
@@ -13,7 +13,7 @@
 from cmat.output_generation.consequence_type import get_so_accession_dict
 
 from opentargets_pharmgkb.counts import ClinicalAnnotationCounts
-from opentargets_pharmgkb.ontology_apis import get_chebi_iri, get_efo_iri
+from opentargets_pharmgkb.ontology_apis import get_efo_iri
 from opentargets_pharmgkb.pandas_utils import none_to_nan, explode_column, read_tsv_to_df
 from opentargets_pharmgkb.validation import validate_evidence_string
 from opentargets_pharmgkb.variant_coordinates import Fasta, parse_genotype
@@ -30,9 +30,8 @@ def pipeline(data_dir, fasta_path, created_date, output_path):
     clinical_alleles_path = os.path.join(data_dir, 'clinical_ann_alleles.tsv')
     clinical_evidence_path = os.path.join(data_dir, 'clinical_ann_evidence.tsv')
     variants_path = os.path.join(data_dir, 'variants.tsv')
-    drugs_path = os.path.join(data_dir, 'drugs.tsv')
     relationships_path = os.path.join(data_dir, 'relationships.tsv')
-    for p in (clinical_annot_path, clinical_alleles_path, clinical_evidence_path, variants_path, drugs_path):
+    for p in (clinical_annot_path, clinical_alleles_path, clinical_evidence_path, variants_path):
         if not os.path.exists(p):
             logger.error(f'Missing required data file: {p}')
             raise ValueError(f'Missing required data file: {p}')
@@ -41,7 +40,6 @@ def pipeline(data_dir, fasta_path, created_date, output_path):
     clinical_alleles_table = read_tsv_to_df(clinical_alleles_path)
     clinical_evidence_table = read_tsv_to_df(clinical_evidence_path)
     variants_table = read_tsv_to_df(variants_path)
-    drugs_table = read_tsv_to_df(drugs_path)
     relationships_table = read_tsv_to_df(relationships_path)
 
     # Gather input counts
@@ -58,7 +56,7 @@ def pipeline(data_dir, fasta_path, created_date, output_path):
     exploded_pgx_cat = explode_column(merged_with_alleles_table, 'Phenotype Category', 'split_pgx_category')
     counts.exploded_pgx_cat = len(exploded_pgx_cat)
 
-    mapped_drugs = explode_and_map_drugs(exploded_pgx_cat, drugs_table)
+    mapped_drugs = explode_drugs(exploded_pgx_cat)
     counts.exploded_drugs = len(mapped_drugs)
 
     mapped_phenotypes = explode_and_map_phenotypes(mapped_drugs)
@@ -84,7 +82,6 @@ def pipeline(data_dir, fasta_path, created_date, output_path):
 
     # Gather output counts
     counts.evidence_strings = len(evidence_table)
-    counts.with_chebi = evidence_table['chebi'].count()
     counts.with_efo = evidence_table['efo'].count()
     counts.with_consequence = evidence_table['consequence_term'].count()
     counts.with_target_gene = evidence_table['overlapping_gene'].count() + evidence_table['gene_from_pgkb'].count()
@@ -296,38 +293,18 @@ def explode_and_map_genes(df):
     return mapped_genes
 
 
-def explode_and_map_drugs(df, drugs_table):
+def explode_drugs(df):
     """
-    Maps drug names to CHEBI IRIs using OLS, falling back on primary drugs data from PharmGKB if needed.
-    Explodes multiple drugs in a single row.
+    Explodes ';'-separated drugs into one row each; '/'-separated drug combinations stay together on one row.
 
     :param df: dataframe to annotate (should have a 'Drug(s)' column)
-    :param drugs_table: drugs dataframe
-    :return: dataframe with 'chebi' column added
+    :return: dataframe with 'split_drug' column added (split on '/' but not exploded, so it holds lists of names)
     """
+    # Drugs listed on the same row (';'-separated) but not annotated as a combination: one row per drug
     split_drugs = explode_column(df, 'Drug(s)', 'split_drug')
-    # Query OLS in parallel as there are no batch queries currently.
-    with multiprocessing.Pool(processes=24) as pool:
-        str_to_iri = {
-            s: pool.apply(get_chebi_iri, args=(s,))
-            for s in split_drugs['split_drug'].drop_duplicates().tolist()
-        }
-    mapped_drugs = pd.concat(
-        split_drugs[split_drugs['split_drug'] == s].assign(chebi=none_to_nan(iri))
-        for s, iri in str_to_iri.items()
-    )
-    # Some drugs we can't unambiguously map using OLS, so we rely on primary data provided by PharmGKB.
-    # Using OLS first ensures we get up-to-date IDs wherever possible.
-    drugs_table['chebi_id'] = drugs_table['Cross-references'].str.extract(r'CHEBI:(?P<chebi_id>\d+)', expand=False)
-    mapped_drugs = pd.merge(mapped_drugs, drugs_table, left_on='split_drug', right_on='Name', how='left')
-    mapped_drugs['chebi'].fillna(mapped_drugs['chebi_id'].apply(chebi_id_to_iri), inplace=True)
-    return mapped_drugs
-
-
-def chebi_id_to_iri(id_):
-    if pd.notna(id_):
-        return f'http://purl.obolibrary.org/obo/CHEBI_{id_}'
-    return None
+    # Drugs explicitly annotated as a combination ('/'-separated): split into a list, kept on a single row
+    split_drugs = explode_column(split_drugs, 'split_drug', 'split_drug', sep='/', split_only=True)
+    return split_drugs
 
 
 def explode_and_map_phenotypes(df):
@@ -375,8 +352,7 @@ def generate_clinical_annotation_evidence(so_accession_dict, created_date, row):
         'directionality': row['Allele Function'],
 
         # PHENOTYPE ATTRIBUTES
-        'drugFromSource': row['split_drug'],
-        'drugFromSourceId': iri_to_code(row['chebi']),
+        'drugsFromSource': row['split_drug'],
         'pgxCategory': row['split_pgx_category'].lower(),
         'phenotypeText': row['split_phenotype'],
         'phenotypeFromSourceId': iri_to_code(row['efo'])

diff --git a/opentargets_pharmgkb/pandas_utils.py b/opentargets_pharmgkb/pandas_utils.py
@@ -10,7 +10,7 @@ def none_to_nan(x):
     return np.nan if x is None else x
 
 
-def explode_column(df, source_col, target_col, sep=';'):
+def explode_column(df, source_col, target_col, sep=';', split_only=False):
     """
     Splits a string-valued column in dataframe and explodes on the values, storing them in the specified target column.
     Any white space around the separator will be stripped.
@@ -19,8 +19,11 @@ def explode_column(df, source_col, target_col, sep=';'):
     :param source_col: name of column in df to split
     :param target_col: destination column name for exploded values
     :param sep: string separator to split source_col by (default ';')
+    :param split_only: if True will only split on separator, leaving target_col as a list (default False)
     :return: dataframe with target_col added
     """
-    split_cols = df.assign(**{target_col: df[source_col].str.split(sep)}).explode(target_col).reset_index(drop=True)
-    split_cols[target_col] = split_cols[target_col].map(lambda x: str(x).strip() if pd.notna(x) else np.nan)
-    return split_cols
+    # Raw f-string so \s is a regex class rather than an invalid string escape; the leading .str.strip() keeps the
+    # previous behaviour of also dropping whitespace at the very start/end of the field, not just around sep.
+    split_cols = df.assign(**{target_col: df[source_col].str.strip().str.split(pat=rf'\s*{sep}\s*')})
+    if not split_only:
+        split_cols = split_cols.explode(target_col)
+    return split_cols.reset_index(drop=True)
diff --git a/tests/resources/clinical_ann_alleles.tsv b/tests/resources/clinical_ann_alleles.tsv
@@ -37,3 +37,6 @@ Clinical Annotation ID	Genotype/Allele	Annotation Text	Allele Function
 1183621000	A- 202A_376G	Patients with one X-chromosome and the A- 202A_376G allele who are treated with rasburicase may have an increased risk of methemoglobinemia and/or hemolysis as compared to patients with the reference B allele (non-deficient, class IV). Patients with two X-chromosomes and the A- 202A_376G allele in combination with another deficient class I-III allele who are treated with rasburicase may have an increased risk of methemoglobinemia and/or hemolysis as compared to patients with two copies of the reference B allele (non-deficient, class IV). Patients with two X-chromosomes and the A- 202A_376G allele in combination with a non-deficient allele who are treated with rasburicase have an unknown risk of methemoglobinemia and/or hemolysis as compared to patients with two copies of the reference B allele (non-deficient, class IV). Other genetic and clinical factors may also influence risk of drug-induced hemolysis.	III/Deficient
 1183621000	B (reference)	Patients with one X-chromosome and the reference B (reference) allele (non-deficient, class IV) who are treated with rasburicase may have a decreased risk of methemoglobinemia and/or hemolysis as compared to patients with a deficient class I-III allele. Patients with two X-chromosomes and two copies of the reference B allele (non-deficient, class IV) who are treated with rasburicase may have a decreased risk of methemoglobinemia and/or hemolysis as compared to patients with a deficient class I-III allele. Patients with two X-chromosomes, one copy of the reference B allele (non-deficient, class IV) and one deficient class I-III allele who are treated with rasburicase have an unknown risk of methemoglobinemia and/or hemolysis as compared to patients with two copies of the reference B allele (non-deficient, class IV). Other genetic and clinical factors may also influence risk of drug-induced hemolysis.	IV/Normal
 1183621000	Mediterranean, Dallas, Panama, Sassari, Cagliari, Birmingham	Patients with one X-chromosome and the Mediterranean, Dallas, Panama, Sassari, Cagliari, Birmingham allele (rs5030868 allele A) who are treated with rasburicase may have an increased risk of methemoglobinemia and/or hemolysis as compared to patients with the reference B allele (non-deficient, class IV)(rs5030868 allele G). Patients with two X-chromosomes and the Mediterranean, Dallas, Panama' Sassari, Cagliari, Birmingham variant (rs5030868 allele A) in combination with another deficient class I-III allele who are treated with rasburicase may have an increased risk of methemoglobinemia and/or hemolysis as compared to patients with two copies of the reference B allele (non-deficient, class IV)(rs5030868 allele G). Patients with two X-chromosomes and the Mediterranean, Dallas, Panama' Sassari, Cagliari, Birmingham variant (rs5030868 allele A) in combination with a non-deficient allele who are treated with rasburicase have an unknown risk of methemoglobinemia and/or hemolysis as compared to patients with two copies of the reference B allele (non-deficient, class IV). Other genetic and clinical factors may also influence risk of drug-induced hemolysis.	II/Deficient
+1447979749	CTT/CTT	Patients with cystic fibrosis and the rs113993960 CTT/CTT genotype (no copies of the CFTR F508del variant) have an unknown response to the combination drug ivacaftor/lumacaftor as this genotype is not an indication for ivacaftor/lumacaftor. Other genetic and clinical factors may also influence response to ivacaftor/lumacaftor.
+1447979749	CTT/del	Patients with cystic fibrosis and the rs113993960 CTT/del genotype (one copy of the CFTR F508del variant) may experience a limited benefit from treatment with the combination drug of ivacaftor/lumacaftor, as shown by improvement in sweat chloride concentrations CFQ-R questionnaire scores when compared to treatment with placebo. However, ppFEV1, BMI or body weight did not show a significant improvement following ivacaftor/lumacaftor treatment. This genotype is not an indication for use of the combination drug of ivacaftor/lumacaftor according to the FDA-approved drug label for this drug combination. Other genetic and clinical factors may also influence response to ivacaftor/lumacaftor.
+1447979749	del/del	Patients with cystic fibrosis and the rs113993960 del/del genotype (two copies of the F508del variant) may benefit from treatment with the combination drug of ivacaftor/lumacaftor, as shown by improvement in sweat chloride concentrations and/or forced expiratory volume in 1 second (FEV1) when compared to treatment with placebo. This genotype is an indication for use of ivacaftor/lumacaftor according to the FDA-approved drug label for this drug combination. Other genetic and clinical factors may also influence response to ivacaftor/lumacaftor.