Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bgee annotator #24

Merged
merged 7 commits into from
Nov 1, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
142 changes: 142 additions & 0 deletions src/pyBiodatafuse/annotators/bgee.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Python file for querying the Bgee database (https://bgee.org) via its SPARQL endpoint (https://www.bgee.org/sparql/)."""

import datetime
import os
from string import Template

import pandas as pd
from SPARQLWrapper import JSON, SPARQLWrapper

from pyBiodatafuse.utils import collapse_data_sources, get_identifier_of_interest


def get_version_bgee() -> dict:
    """Get version of Bgee RDF data from its SPARQL endpoint.

    It is not clear that a version number as such can be retrieved, but the
    endpoint supports ``http://purl.org/dc/terms/modified``, so the last
    modified date returned by ``queries/bgee-get-last-modified.rq`` is
    reported instead.

    :returns: a dictionary containing the last modified date information
        under the key ``"bgee_version"``
    :raises IndexError: if the endpoint returns no last-modified binding
    """
    # Join path components instead of concatenating strings: with an empty
    # dirname the concatenated form would resolve to the absolute "/queries/...".
    query_path = os.path.join(
        os.path.dirname(__file__), "queries", "bgee-get-last-modified.rq"
    )
    with open(query_path, "r", encoding="utf-8") as fin:
        sparql_query = fin.read()

    sparql = SPARQLWrapper("https://www.bgee.org/sparql/")
    sparql.setReturnFormat(JSON)
    sparql.setQuery(sparql_query)
    res = sparql.queryAndConvert()

    bindings = res["results"]["bindings"]
    if not bindings:
        # Same exception type an empty result produced before, but with a
        # message that says what actually went wrong.
        raise IndexError("Bgee SPARQL endpoint returned no last-modified date")

    return {"bgee_version": bindings[0]["dateModified"]["value"]}


def _sparql_string_literal(value) -> str:
    """Render *value* as a double-quoted, escaped SPARQL string literal."""
    text = (
        str(value)
        .replace("\\", "\\\\")
        .replace('"', '\\"')
        .replace("\n", "\\n")
        .replace("\r", "\\r")
    )
    return f'"{text}"'


def _batched_values(items: list, batch_size: int = 25) -> list:
    """Split *items* into space-separated literal strings of at most *batch_size* entries."""
    batches = [
        " ".join(_sparql_string_literal(item) for item in items[start : start + batch_size])
        for start in range(0, len(items), batch_size)
    ]
    # An empty input still yields one (empty) batch so that exactly one query is issued.
    return batches or [""]


def get_gene_literature(bridgedb_df: pd.DataFrame, anatomical_entities: pd.DataFrame):
    """Query gene-tissue expression information from Bgee.

    Ensembl identifiers and anatomical entity names are de-duplicated, split
    into batches of 25 and every gene batch is queried against every
    anatomical-entity batch.

    :param bridgedb_df: BridgeDb output for creating the list of gene ids to query
    :param anatomical_entities: a dataframe containing the names of Anatomical entities
        of interest in its ``AnatomicalEntityNames`` column
    :returns: a DataFrame containing the Bgee output and dictionary of the Bgee metadata.
    """
    # Record the start time
    start_time = datetime.datetime.now()

    data_df = get_identifier_of_interest(bridgedb_df, "Ensembl")

    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # batches (and therefore the queries sent) are reproducible between runs.
    gene_list = list(dict.fromkeys(data_df["target"].tolist()))
    anat_entities_list = list(
        dict.fromkeys(anatomical_entities["AnatomicalEntityNames"].tolist())
    )

    query_gene_lists = _batched_values(gene_list)
    query_anat_entities_lists = _batched_values(anat_entities_list)

    query_path = os.path.join(
        os.path.dirname(__file__), "queries", "bgee-genes-tissues-expression.rq"
    )
    with open(query_path, "r", encoding="utf-8") as fin:
        sparql_query_template = Template(fin.read())

    sparql = SPARQLWrapper("https://www.bgee.org/sparql/")
    sparql.setReturnFormat(JSON)

    results_df_list = []

    for gene_list_str in query_gene_lists:
        for query_anat_entities_str in query_anat_entities_lists:
            sparql.setQuery(
                sparql_query_template.substitute(
                    gene_list=gene_list_str,
                    anat_entities_list=query_anat_entities_str,
                )
            )
            res = sparql.queryAndConvert()

            # Keep only the "value" of each binding. Building the rows directly
            # avoids the deprecated DataFrame.applymap and tolerates rows in
            # which a variable is unbound.
            rows = [
                {var: cell["value"] for var, cell in binding.items()}
                for binding in res["results"]["bindings"]
            ]
            results_df_list.append(pd.DataFrame(rows))

    # Organize the annotation results as a single DataFrame
    intermediate_df = pd.concat(results_df_list)

    intermediate_df.rename(columns={"ensemblId": "target"}, inplace=True)

    # Record the end time
    end_time = datetime.datetime.now()

    # Metadata details
    # Get the current date and time
    current_date = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    # Calculate the time elapsed
    time_elapsed = str(end_time - start_time)

    # Add version to metadata file
    bgee_version = get_version_bgee()

    # Add the datasource, query, query time, and the date to metadata
    bgee_metadata = {
        "datasource": "Bgee",
        "metadata": {"source_version": bgee_version},
        "query": {
            "size": len(gene_list),
            "time": time_elapsed,
            "date": current_date,
            "url": "https://www.bgee.org/sparql/",
        },
    }

    # Merge the two DataFrames on the target column
    merged_df = collapse_data_sources(
        data_df=data_df,
        source_namespace="Ensembl",
        target_df=intermediate_df,
        common_cols=["target"],
        target_specific_cols=[
            "anatomicalEntity",
            "anatomicalEntityName",
            "expressionLevel",
            "confidenceLevel",
        ],
        col_name="Bgee",
    )

    return merged_df, bgee_metadata
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# Retrieve Bgee expression records for a set of Ensembl gene identifiers,
# restricted to a set of anatomical entity labels.
# The two VALUES clauses contain template placeholders (gene_list and
# anat_entities_list) that must be replaced with space-separated quoted
# strings before the query is sent.
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX orth: <http://purl.org/net/orth#>
PREFIX lscr: <http://purl.org/lscr#>
PREFIX genex: <http://purl.org/genex#>
PREFIX obo: <http://purl.obolibrary.org/obo/>

SELECT ?gene ?ensemblId ?geneId ?anatomicalEntity ?anatomicalEntityName ?expressionLevel ?confidenceLevel
WHERE {
# Plain Ensembl identifiers to look up
VALUES ?ensemblId { $gene_list }
# Anatomical entity labels to keep; matched against rdfs:label below
VALUES ?anatomicalEntityName { $anat_entities_list }
# Turn each plain identifier into the Ensembl IRI used by lscr:xrefEnsemblGene
BIND(CONCAT("http://rdf.ebi.ac.uk/resource/ensembl/", STR(?ensemblId)) AS ?geneIdStr)
BIND(URI(?geneIdStr) AS ?geneId)
?gene a orth:Gene .
?gene lscr:xrefEnsemblGene ?geneId .
# Expression records whose sequence unit is the matched gene
?expr genex:hasSequenceUnit ?gene.
?expr a genex:Expression .
#?expr genex:hasConfidenceLevel obo:CIO_0000029 . # high confidence level
?expr genex:hasConfidenceLevel ?confidenceLevel .
?expr genex:hasExpressionLevel ?expressionLevel .
?expr genex:hasExpressionCondition ?cond .
?cond genex:hasAnatomicalEntity ?anatomicalEntity . # tissue
?anatomicalEntity rdfs:label ?anatomicalEntityName.
# also can filter by expression level: e.g. FILTER (?expressionLevel > 99) # highly expressed
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Return, as a plain string, the http://purl.org/dc/terms/modified value
# attached to <http://purl.org/query/bioquery#BGEE>.
# The pattern variable is named differently from the projected one because
# SPARQL 1.1 does not allow (expr AS ?var) to assign a variable that is
# already in scope; the projected name stays ?dateModified for consumers.
SELECT DISTINCT (STR(?modified) AS ?dateModified)
WHERE {
    <http://purl.org/query/bioquery#BGEE> <http://purl.org/dc/terms/modified> ?modified .
}
LIMIT 1