feat: add optional integration of GestaltMatcher/PEDIA (#399, #1125) (#…

…1249) Co-authored-by: Meghna Ahuja Bhasin <[email protected]>
varfish-org · Sep 27, 2024 · 6f695ec · 6f695ec
1 parent bfc692d
commit 6f695ec
Show file tree

Hide file tree

Showing 28 changed files with 1,104 additions and 60 deletions.
diff --git a/backend/cases/views.py b/backend/cases/views.py
@@ -59,6 +59,7 @@ def get_context_data(self, *args, **kwargs):
                 ),
                 "exomiser_enabled": settings.VARFISH_ENABLE_EXOMISER_PRIORITISER,
                 "cadd_enabled": settings.VARFISH_ENABLE_CADD,
+                "cada_enabled": settings.VARFISH_ENABLE_CADA,
                 "extra_anno_fields": extra_anno_fields,
                 "url_prefixes": {
                     "annonars": settings.VARFISH_BACKEND_URL_PREFIX_ANNONARS,

diff --git a/backend/config/settings/base.py b/backend/config/settings/base.py
@@ -133,6 +133,7 @@
     "varannos.apps.VarannosConfig",
     # Legacy apps - not used anymore!
     "hgmd.apps.HgmdConfig",
+    "ext_gestaltmatcher.apps.ExtGestaltmatcherConfig",
 ]
 
 # See: https://docs.djangoproject.com/en/dev/ref/settings/#installed-apps
@@ -541,6 +542,16 @@
     "VARFISH_CADA_REST_API_URL", "https://cada.gene-talk.de/api/process"
 )
 
+# Enable PEDIA prioritization.
+VARFISH_ENABLE_PEDIA = env.bool("VARFISH_ENABLE_PEDIA", default=False)
+VARFISH_PEDIA_REST_API_URL = env.str("VARFISH_PEDIA_REST_API_URL", "http://127.0.0.1:9000/pedia")
+VARFISH_MIDDLEWARE_URL = env.str("VARFISH_MIDDLEWARE_URL", "http://127.0.0.1:7000")
+
+# Enable Gestalt-based prioritization.
+VARFISH_ENABLE_GESTALT_MATCHER = env.bool("VARFISH_ENABLE_GESTALT_MATCHER", default=False)
+# Configure URL to GestaltMatcher REST API
+VARFISH_GM_SENDER_URL = env.str("VARFISH_GM_SENDER_URL", "http://127.0.0.1:7000/")
+
 # Enable submission of variants to CADD server.
 VARFISH_ENABLE_CADD_SUBMISSION = env.bool("VARFISH_ENABLE_CADD_SUBMISSION", default=False)
 # CADD version to use for for submission

diff --git a/backend/ext_gestaltmatcher/__init__.py b/backend/ext_gestaltmatcher/__init__.py
diff --git a/backend/ext_gestaltmatcher/admin.py b/backend/ext_gestaltmatcher/admin.py
@@ -0,0 +1,7 @@
+from django.contrib import admin
+
+from .models import SmallVariantQueryGestaltMatcherScores, SmallVariantQueryPediaScores
+
+# Make the GestaltMatcher and PEDIA score tables browsable in the Django admin;
+# ``register()`` accepts an iterable of model classes.
+admin.site.register([SmallVariantQueryGestaltMatcherScores, SmallVariantQueryPediaScores])
diff --git a/backend/ext_gestaltmatcher/apps.py b/backend/ext_gestaltmatcher/apps.py
@@ -0,0 +1,6 @@
+from django.apps import AppConfig
+
+
+class ExtGestaltmatcherConfig(AppConfig):
+    """Django application configuration for the ``ext_gestaltmatcher`` app."""
+
+    #: Python path of the application package.
+    name = "ext_gestaltmatcher"
+    #: Field type used for implicitly created primary keys of this app's models.
+    # NOTE(review): the app's migrations create ``id`` as ``models.AutoField`` while this
+    # selects ``BigAutoField`` -- confirm whether a follow-up migration is intended.
+    default_auto_field = "django.db.models.BigAutoField"
diff --git a/backend/ext_gestaltmatcher/migrations/0001_initial.py b/backend/ext_gestaltmatcher/migrations/0001_initial.py
@@ -0,0 +1,34 @@
+# -*- coding: utf-8 -*-
+# Generated by Django 1.11.20 on 2023-10-20 07:18
+from __future__ import unicode_literals
+
+from django.db import migrations, models
+import django.db.models.deletion
+
+
+class Migration(migrations.Migration):
+    """Create the ``SmallVariantQueryGestaltMatcherScores`` model."""
+
+    # NOTE(review): the model below has a foreign key to ``variants.SmallVariantQuery`` but no
+    # dependency on the ``variants`` app is declared -- confirm that migration ordering is
+    # guaranteed elsewhere, or add the matching ``variants`` migration here.
+    dependencies = []
+
+    operations = [
+        migrations.CreateModel(
+            name="SmallVariantQueryGestaltMatcherScores",
+            fields=[
+                (
+                    "id",
+                    models.AutoField(
+                        auto_created=True, primary_key=True, serialize=False, verbose_name="ID"
+                    ),
+                ),
+                ("gene_id", models.CharField(help_text="Entrez gene ID", max_length=64)),
+                ("gene_symbol", models.CharField(help_text="The gene symbol", max_length=128)),
+                ("priority_type", models.CharField(help_text="The priority type", max_length=64)),
+                ("score", models.FloatField(help_text="The gene score")),
+                (
+                    # Score rows are deleted together with the query they belong to.
+                    "query",
+                    models.ForeignKey(
+                        on_delete=django.db.models.deletion.CASCADE, to="variants.SmallVariantQuery"
+                    ),
+                ),
+            ],
+        )
+    ]
diff --git a/backend/ext_gestaltmatcher/migrations/0002_smallvariantquerypediascores.py b/backend/ext_gestaltmatcher/migrations/0002_smallvariantquerypediascores.py
@@ -0,0 +1,33 @@
+# -*- coding: utf-8 -*-
+# Generated by Django 1.11.20 on 2023-11-14 07:18
+from __future__ import unicode_literals
+
+from django.db import migrations, models
+import django.db.models.deletion
+
+
+class Migration(migrations.Migration):
+    """Create the ``SmallVariantQueryPediaScores`` model."""
+
+    # Runs after the initial migration of this app.
+    dependencies = [("ext_gestaltmatcher", "0001_initial")]
+
+    operations = [
+        migrations.CreateModel(
+            name="SmallVariantQueryPediaScores",
+            fields=[
+                (
+                    "id",
+                    models.AutoField(
+                        auto_created=True, primary_key=True, serialize=False, verbose_name="ID"
+                    ),
+                ),
+                ("gene_id", models.CharField(help_text="Entrez gene ID", max_length=64)),
+                ("gene_symbol", models.CharField(help_text="The gene symbol", max_length=128)),
+                ("score", models.FloatField(help_text="The gene score")),
+                (
+                    # Score rows are deleted together with the query they belong to.
+                    "query",
+                    models.ForeignKey(
+                        on_delete=django.db.models.deletion.CASCADE, to="variants.SmallVariantQuery"
+                    ),
+                ),
+            ],
+        )
+    ]
diff --git a/backend/ext_gestaltmatcher/migrations/__init__.py b/backend/ext_gestaltmatcher/migrations/__init__.py
diff --git a/backend/ext_gestaltmatcher/models.py b/backend/ext_gestaltmatcher/models.py
@@ -0,0 +1,43 @@
+from django.db import models
+
+
+# Create your models here.
+class SmallVariantQueryGestaltMatcherScores(models.Model):
+    """Per-gene Gestalt Matcher score attached to a ``SmallVariantQuery``."""
+
+    #: Query that the score belongs to; rows are deleted together with the query.
+    query = models.ForeignKey(to="variants.SmallVariantQuery", on_delete=models.CASCADE)
+
+    #: Entrez gene ID, stored as a string of at most 64 characters.
+    gene_id = models.CharField(help_text="Entrez gene ID", max_length=64)
+
+    #: Gene symbol, at most 128 characters.
+    gene_symbol = models.CharField(help_text="The gene symbol", max_length=128)
+
+    #: Priority type, at most 64 characters.
+    priority_type = models.CharField(help_text="The priority type", max_length=64)
+
+    #: Score of the gene.
+    score = models.FloatField(help_text="The gene score")
+
+
+class SmallVariantQueryPediaScores(models.Model):
+    """Per-gene PEDIA score attached to a ``SmallVariantQuery``."""
+
+    #: Query that the score belongs to; rows are deleted together with the query.
+    query = models.ForeignKey(to="variants.SmallVariantQuery", on_delete=models.CASCADE)
+
+    #: Entrez gene ID, stored as a string of at most 64 characters.
+    gene_id = models.CharField(help_text="Entrez gene ID", max_length=64)
+
+    #: Gene symbol, at most 128 characters.
+    gene_symbol = models.CharField(help_text="The gene symbol", max_length=128)
+
+    #: Score of the gene.
+    score = models.FloatField(help_text="The gene score")
diff --git a/backend/variants/file_export.py b/backend/variants/file_export.py
@@ -24,11 +24,15 @@
     ExportProjectCasesFileBgJobResult,
     SmallVariantComment,
     VariantScoresFactory,
+    annotate_with_gm_scores,
     annotate_with_joint_scores,
     annotate_with_pathogenicity_scores,
+    annotate_with_pedia_scores,
     annotate_with_phenotype_scores,
     annotate_with_transcripts,
+    get_pedia_scores,
     prioritize_genes,
+    prioritize_genes_gm,
     unroll_extra_annos_result,
 )
 from .queries import (
@@ -122,6 +126,16 @@ def to_str(val):
     ("phenotype_rank", "Phenotype Rank", int),
 )
 
+#: Names of the Gestalt Matcher scoring header columns, as ``(key, title, type)`` entries.
+HEADERS_GM_SCORES = (
+    ("gm_score", "Gestalt Score", float),
+    ("gm_rank", "Gestalt Rank", int),
+)
+
+#: Names of the PEDIA scoring header columns, as ``(key, title, type)`` entries.
+HEADERS_PEDIA_SCORES = (
+    ("pedia_score", "PEDIA Score", float),
+    ("pedia_rank", "PEDIA Rank", int),
+)
+
 #: Names of the pathogenicity scoring header columns.
 HEADERS_PATHO_SCORES = (
     ("pathogenicity_score", "Pathogenicity Score", float),
@@ -318,6 +332,14 @@ def _is_prioritization_enabled(self):
             )
         )
 
+    def _is_gm_enabled(self):
+        """Tell whether Gestalt Matcher prioritization applies to this query.
+
+        Truthy only if the feature is switched on in the settings and the query arguments
+        carry a truthy ``gm_enabled`` entry.
+        """
+        globally_enabled = settings.VARFISH_ENABLE_GESTALT_MATCHER
+        return globally_enabled and self.query_args.get("gm_enabled")
+
+    def _is_pedia_enabled(self):
+        """Tell whether PEDIA prioritization applies to this query.
+
+        Truthy only if the feature is switched on in the settings and the query arguments
+        carry a truthy ``pedia_enabled`` entry.
+        """
+        globally_enabled = settings.VARFISH_ENABLE_PEDIA
+        return globally_enabled and self.query_args.get("pedia_enabled")
+
     def _is_pathogenicity_enabled(self):
         """Return whether pathogenicity scoring is enabled in this query."""
         return settings.VARFISH_ENABLE_CADD and all(
@@ -352,6 +374,10 @@ def _yield_columns(self, members):
             header += HEADERS_TRANSCRIPTS
         if self._is_prioritization_enabled() and self._is_pathogenicity_enabled():
             header += HEADERS_JOINT_SCORES
+        if self._is_gm_enabled():
+            header += HEADERS_GM_SCORES
+        if self._is_pedia_enabled():
+            header += HEADERS_PEDIA_SCORES
         header += HEADER_FLAGS
         header += HEADER_COMMENTS
         header += self.get_extra_annos_headers()
@@ -391,13 +417,25 @@ def _yield_smallvars(self):
                 _result = annotate_with_pathogenicity_scores(_result, variant_scores)
             if self._is_prioritization_enabled() and self._is_pathogenicity_enabled():
                 _result = annotate_with_joint_scores(_result)
+            if self._is_gm_enabled():
+                gene_scores = self._fetch_gm_scores([entry.entrez_id for entry in _result])
+                _result = annotate_with_gm_scores(_result, gene_scores)
+            if self._is_pedia_enabled():
+                pedia_scores = self._fetch_pedia_scores(_result)
+                if pedia_scores:
+                    _result = annotate_with_pedia_scores(_result, pedia_scores)
             fields = {x[1].label: x[0] for x in enumerate(list(ExtraAnnoField.objects.all()))}
             _result = unroll_extra_annos_result(_result, fields)
             self.job.add_log_entry("Writing output file...")
             total = len(_result)
             steps = math.ceil(total / 10)
             for i, small_var in enumerate(_result):
-                if self._is_prioritization_enabled() or self._is_pathogenicity_enabled():
+                if (
+                    self._is_prioritization_enabled()
+                    or self._is_pathogenicity_enabled()
+                    # Must be called: the bare bound method is always truthy.
+                    or self._is_gm_enabled()
+                    or self._is_pedia_enabled()
+                ):
                     if i % steps == 0:
                         self.job.add_log_entry("{}%".format(int(100 * i / total)))
                 else:
@@ -433,6 +471,63 @@ def _fetch_gene_scores(self, entrez_ids):
         else:
             return {}
 
+    def _fetch_gm_scores(self, entrez_ids):
+        """Fetch per-gene Gestalt Matcher scores for the ``prio_gm`` query argument.
+
+        :param entrez_ids: not used by this method; kept so the signature stays unchanged.
+        :return: dict mapping ``str(gene_id)`` to score.  Empty if Gestalt Matcher is not
+            enabled, ``prio_gm`` is missing/empty, or the service cannot be reached (the
+            error is written to the job log).
+        """
+        prio_gm = self.query_args.get("prio_gm")
+        if not all((self._is_gm_enabled(), prio_gm)):
+            return {}
+        try:
+            return {
+                str(gene_id): score
+                for gene_id, _gene_symbol, score, _priority_type in prioritize_genes_gm(
+                    prio_gm, logging=self.job.add_log_entry
+                )
+            }
+        except ConnectionError as e:
+            self.job.add_log_entry(e)
+            # Always hand a dict to the caller instead of falling through to ``None``.
+            return {}
+
+    def _fetch_pedia_scores(self, result):
+        """Fetch per-gene PEDIA scores for the given result rows.
+
+        One payload entry is built per row in ``result`` and passed, together with the case
+        name, to ``get_pedia_scores``.
+
+        :param result: iterable of result rows; ``entrez_id``, ``symbol``, ``phenotype_score``,
+            ``pathogenicity_score`` and ``gm_score`` are read from each row.
+        :return: dict mapping ``str(gene_id)`` to score.  Empty if PEDIA is not enabled or the
+            service cannot be reached (the error is written to the job log).
+        """
+        if not self._is_pedia_enabled():
+            return {}
+
+        genes = []
+        for line in result:
+            # Rows without a gene ID or without all three input scores still contribute an
+            # entry, but it only carries the ``label`` key.
+            payload = {}
+            if all(
+                (
+                    line.entrez_id,
+                    hasattr(line, "phenotype_score"),
+                    hasattr(line, "pathogenicity_score"),
+                    hasattr(line, "gm_score"),
+                )
+            ):
+                payload["gene_name"] = line.symbol
+                payload["gene_id"] = line.entrez_id
+
+                payload["cada_score"] = line.phenotype_score
+                payload["cadd_score"] = line.pathogenicity_score
+                # An infinite Gestalt score is submitted as 0.
+                payload["gestalt_score"] = 0 if line.gm_score == float("inf") else line.gm_score
+
+            payload["label"] = False
+            genes.append(payload)
+
+        case_name = self.job.case.name
+        # Drop a leading "F_" from the case name before submitting it.
+        name = case_name[2:] if case_name.startswith("F_") else case_name
+        scores = {"case_name": name, "genes": genes}
+
+        try:
+            return {str(gene_id): score for gene_id, _, score in get_pedia_scores(scores)}
+        except ConnectionError as e:
+            self.job.add_log_entry(e)
+            # Always hand a dict to the caller instead of falling through to ``None``.
+            return {}
+
     def _fetch_variant_scores(self, variants):
         if self._is_pathogenicity_enabled():
             try:

diff --git a/backend/variants/models/jobs.py b/backend/variants/models/jobs.py
@@ -13,6 +13,10 @@
 from projectroles.plugins import get_backend_api
 from sqlalchemy import and_
 
+from ext_gestaltmatcher.models import (
+    SmallVariantQueryGestaltMatcherScores,
+    SmallVariantQueryPediaScores,
+)
 from variants.helpers import get_engine, get_meta
 from variants.models import SmallVariantQueryGeneScores, SmallVariantQueryVariantScores
 from variants.models.queries import (
@@ -98,7 +102,12 @@ def run_query_bg_job(pk):
         tl_event.add_object(obj=filter_job.case, label="case_name", name=filter_job.case.name)
 
     def _read_records(
-        inputf, smallvariantqueryresultset, pathogenicity_scores=None, phenotype_scores=None
+        inputf,
+        smallvariantqueryresultset,
+        pathogenicity_scores=None,
+        phenotype_scores=None,
+        gm_scores=None,
+        pedia_scores=None,
     ):
         """Read and yield ``SmallVariantQueryResultRow`` objects by reading ``inputf`` for the given ``SmallVariantQueryResultSet``."""
         for line in inputf:
@@ -113,6 +122,12 @@ def _read_records(
             if phenotype_scores and line.entrez_id:
                 payload["phenotype_score"] = phenotype_scores.get(line.entrez_id, -1)
 
+            if gm_scores and line.entrez_id:
+                payload["gm_score"] = gm_scores.get(line.entrez_id, 0)
+
+            if pedia_scores and line.entrez_id:
+                payload["pedia_score"] = pedia_scores.get(line.entrez_id, -1)
+
             if pathogenicity_scores and phenotype_scores and line.entrez_id:
                 if payload["pathogenicity_score"] == -1 or payload["phenotype_score"] == -1:
                     payload["patho_pheno_score"] = -1
@@ -164,6 +179,24 @@ def _inner():
                 )
                 if row.gene_id
             }
+        gm_scores = None
+        pedia_scores = None
+        if query_model.query_settings.get("gm_enabled"):
+            gm_scores = {
+                row.gene_id: row.score
+                for row in SmallVariantQueryGestaltMatcherScores.objects.filter(
+                    query__sodar_uuid=query_model.sodar_uuid
+                )
+                if row.gene_id
+            }
+        if query_model.query_settings.get("pedia_enabled"):
+            pedia_scores = {
+                row.gene_id: row.score
+                for row in SmallVariantQueryPediaScores.objects.filter(
+                    query__sodar_uuid=query_model.sodar_uuid
+                )
+                if row.gene_id
+            }
 
         with transaction.atomic():
             smallvariantqueryresultset = SmallVariantQueryResultSet.objects.create(
@@ -183,6 +216,8 @@ def _inner():
                     smallvariantqueryresultset,
                     pathogenicity_scores=pathogenicity_scores,
                     phenotype_scores=phenotype_scores,
+                    gm_scores=gm_scores,
+                    pedia_scores=pedia_scores,
                 ),
                 n=1000,
             ):