Merge branch 'master' into cleanup_discard_list

BRCAChallenge · Oct 24, 2024 · 8fff8bd · 8fff8bd
2 parents bf4e91c + 7b009b3
commit 8fff8bd
Show file tree

Hide file tree

Showing 32 changed files with 717 additions and 229 deletions.
diff --git a/.circleci/config.yml b/.circleci/config.yml
@@ -1,6 +1,7 @@
 version: 2
 jobs:
    buildweb:
+     circleci_ip_ranges: true
      docker:
        - image: circleci/node:16
        - image: circleci/postgres:9.6.2
@@ -38,6 +39,7 @@ jobs:
        - store_test_results:
            path: ~/test_reports
    deploy-dev:
+     circleci_ip_ranges: true
      docker:
        - image: circleci/node:16
      steps:
@@ -60,6 +62,7 @@ jobs:
            name: deploying to dev machine
            command: ~/project/deployment/deploy-dev
    deploy-beta:
+     circleci_ip_ranges: true
      docker:
        - image: circleci/node:16
      steps:
@@ -94,8 +97,6 @@ jobs:
            command: |
              apk add --no-cache \
                py-pip=9.0.0-r1
-             pip install \
-              docker-compose==1.29.2
        - restore_cache:
            keys:
              - v1-{{ .Branch }}

diff --git a/deployment/deploy-dev b/deployment/deploy-dev
@@ -13,8 +13,9 @@ cd ${WEBSITE}
 npm run build:prod
 
 # deploy (not preserving owner/group)
-rsync -rlptD --delete --rsync-path='rsync' build/ ${USER}@${HOST}:/var/www/html/beta
-rsync -rlptD --delete --exclude="/uploads" --exclude="/downloads/*" --rsync-path='rsync' django/ ${USER}@${HOST}:/var/www/backend/beta/django
+# Force rsync's ssh to use ipv4 to prevent "Cannot assign requested address" error
+rsync -rlptD -e 'ssh -4' --delete --rsync-path='rsync' build/ ${USER}@${HOST}:/var/www/html/beta
+rsync -rlptD -e 'ssh -4' --delete --exclude="/uploads" --exclude="/downloads/*" --rsync-path='rsync' django/ ${USER}@${HOST}:/var/www/backend/beta/django
 
 requirements=$(cat requirements.txt)
 requirements=$(echo ${requirements}) # drop carriage returns

diff --git a/deployment/site_settings/config.beta.js b/deployment/site_settings/config.beta.js
@@ -5,7 +5,7 @@
         baseurl: '/',
         captcha_key: '', /* reCAPTCHA API key */
         maps_key: '', /* Google maps javascript API key */
-        backend_url: 'http://brcaexchange-prod.gi.ucsc.edu/backend',
+        backend_url: 'https://brcaexchange-prod.gi.ucsc.edu/backend',
         analytics: null,
         environment: 'beta'
     };

diff --git a/pipeline/docker/Dockerfile b/pipeline/docker/Dockerfile
@@ -31,7 +31,7 @@ COPY pipeline/requirements.txt .
 COPY test-requirements.txt .
 
 # pip 20.3+ uses strict dependency resolver that causes biocommons/bioutils and hgvs/ipython errors
-RUN pip install pip==20.2
+RUN pip install pip==24.0
 
 # install numpy first to avoid issues with bio python and bx-python (see also https://github.com/LUMC/vep2lovd/issues/1)
 RUN pip install $(grep numpy requirements.txt)

diff --git a/pipeline/genomeBrowserTrack/bigBedFromBed.sh b/pipeline/genomeBrowserTrack/bigBedFromBed.sh
@@ -0,0 +1,38 @@
+#!/usr/bin/env bash
+#
+# Generates an output BigBed file from a single input BED file,
+# correcting for errors. bedToBigBed errors on variants > 255 characters.
+# This script removes those problem variants until bedToBigBed succeeds.
+#
+# Usage: bigBedFromBed.sh INPUT_BED AUTOSQL CHROM_SIZES OUTPUT_BIGBED
+#
+# NOTE: INPUT_BED is sorted and edited in place (offending lines are deleted).
+#
+set -o errexit
+
+if [ "$#" -lt 4 ]; then
+    echo "usage: $0 INPUT_BED AUTOSQL CHROM_SIZES OUTPUT_BIGBED" >&2
+    exit 1
+fi
+
+INPUT_BED=$1
+AS=$2
+CHROM_SIZES=$3
+OUTPUT_BIGBED=$4
+
+# runs bedToBigBed on the current contents of INPUT_BED
+run_bed_to_bigbed() {
+    bedToBigBed -type=bed9+ -as="${AS}" -tab "${INPUT_BED}" "${CHROM_SIZES}" "${OUTPUT_BIGBED}"
+}
+
+# prints the line number that follows the last "line " in the message given
+# as $1, or nothing when the message names no line number
+parse_error_line() {
+    # newlines are flattened so the whole message is parsed as one line;
+    # everything from the first non-digit on is dropped so that trailing
+    # punctuation (e.g. "line 5.") cannot end up in the sed address below
+    echo "$1" | tr '\n' ' ' | sed -n -e 's/^.*line //p' | sed 's/[^0-9].*$//'
+}
+
+# bedToBigBed needs the input sorted by chromosome, then numerically by start
+sort -k1,1 -k2,2n "${INPUT_BED}" -o "${INPUT_BED}"
+
+# bedToBigBed is expected to fail while problem variants remain
+set +e
+
+# captures the first error message if present
+ERROR=$(run_bed_to_bigbed 2>&1)
+
+# parses line number of the first problem variant from the error message
+ERRORLINE=$(parse_error_line "${ERROR}")
+
+until [ -z "${ERRORLINE}" ]
+do
+    # removes error variant
+    sed -i "${ERRORLINE}d" "${INPUT_BED}"
+    echo "${ERROR}"
+
+    # captures and parses the line number of the next problem variant
+    ERROR=$(run_bed_to_bigbed 2>&1)
+    ERRORLINE=$(parse_error_line "${ERROR}")
+    echo "${ERRORLINE}"
+done
+
+set -e
+
+# final run with errexit active: any remaining error now fails the script
+run_bed_to_bigbed
diff --git a/pipeline/genomeBrowserTrack/brcaPopfreqToBed.py b/pipeline/genomeBrowserTrack/brcaPopfreqToBed.py
@@ -0,0 +1,99 @@
+#!/usr/bin/env python
+from collections import namedtuple, OrderedDict
+import html
+import genomeBrowserUtils
+
+
+
+
+def _write_auto_sql_file(as_path):
+    """Write the AutoSQL (.as) field definitions for this track to as_path.
+
+    The definition lists the nine standard BED fields followed by eight
+    extra fields; write_track_item() in this file emits its columns in the
+    same order.
+
+    Args:
+        as_path: path of the file to create (an existing file is overwritten).
+    """
+    with open(as_path, "w") as asFh:
+
+        # NOTE(review): the field name 'provional_code_description' below
+        # looks like a typo for 'provisional_code_description' -- confirm that
+        # nothing references the current spelling before renaming it.
+        sql = """table brcaExchanges
+        " These data are in BigBed bed9 format, and include selected fields from https://brcaexchange.org"
+        (
+        string chrom;      "Chromosome (or contig, scaffold, etc.)"
+        uint   chromStart; "Start position in chromosome"
+        uint   chromEnd;   "End position in chromosome"
+        string name;       "Name of item"
+        uint   score;      "Score from 0-1000"
+        char[1] strand;    "+ or -"
+        uint thickStart;   "Start of where display should be thick (start codon)"
+        uint thickEnd;     "End of where display should be thick (stop codon)"
+        uint reserved;     "Used as itemRgb as of 2004-11-22"        
+        string outlink;    "Link to the variant in BRCA Exchange"
+        string symbol;     "Gene Symbol"
+        string cdna_hgvs;       "Variant ID in cDNA HGVS nomenclature"
+        string protein_hgvs;    "Variant ID in protein HGVS nomenclature"
+        string CA_ID;       "ClinGen Allele Registry ID"
+        string provisional_evidence_code;      "Provisional ACMG code"
+        string provional_code_description; "Accompanying description"
+        string _mouseOver; "mouse over field hidden"
+        )
+        """
+
+        asFh.write(sql)
+        print("wrote as file to {}".format(asFh.name))
+
+
+def write_track_item(rec, start, end, output_fp):
+    """Write one variant to output_fp as a single tab-separated BED line.
+
+    Args:
+        rec: record exposing the Chr, pyhgvs_cDNA, pyhgvs_Protein, CA_ID,
+            Gene_Symbol and Provisional_Evidence_Code_Popfreq attributes.
+        start: value written to both the chromStart and thickStart columns.
+        end: value written to both the chromEnd and thickEnd columns.
+        output_fp: writable text file object that receives the line.
+
+    Raises:
+        AssertionError: if the cDNA HGVS name of the record is "?".
+    """
+    chrom = "chr"+rec.Chr
+    score = 0
+    strand = "."
+    cdna = rec.pyhgvs_cDNA[0:254]
+    if cdna == "?":
+        # Raised explicitly instead of via ``assert`` so that the check is
+        # still performed when Python runs with -O.
+        raise AssertionError("variant has no cDNA HGVS name (got '?')")
+    acmgCode = rec.Provisional_Evidence_Code_Popfreq
+    color = genomeBrowserUtils.acmgCodeToColor(acmgCode)
+    out_url = "https://brcaexchange.org/variant/" + rec.CA_ID
+    #
+    # When generating the mouseOver, cap the formatted text at 245 characters
+    # before appending the closing <br>, to not overwhelm the browser's
+    # internal limit of 255 characters.
+    description = "Click on the track item for more details"
+    mouseOver = ("<b>Provisional ACMG Evidence Code:</b> %s<br>"
+                 "<b>Details:</b> %s" % (acmgCode, description))[:245] + "<br>"
+    # start/end are repeated as thickStart/thickEnd
+    outRow = [chrom, start, end, cdna, score, strand, start, end, color, out_url,
+              rec.Gene_Symbol,
+              genomeBrowserUtils.displayString(cdna),
+              genomeBrowserUtils.displayString(rec.pyhgvs_Protein[0:254]),
+              genomeBrowserUtils.displayString(rec.CA_ID),
+              acmgCode, description[:254],
+              mouseOver]
+    output_fp.write("\t".join(str(x) for x in outRow)+"\n")
+
+
+def main():
+    """Convert the input TSV into four BED files and write the AutoSQL file.
+
+    Each input row is written twice: once with its hg19 coordinates and once
+    with its hg38 coordinates. Rows whose hg38 span is below
+    args.length_threshold go to the "var" outputs, all others to the "sv"
+    outputs.
+    """
+    args = genomeBrowserUtils._get_args()
+
+    # All files are opened in one ``with`` so that every output is flushed
+    # and closed even when a row fails to convert.
+    with open(args.input, 'r') as ifh, \
+            open(args.output_hg19_var, 'w') as ofhg19v, \
+            open(args.output_hg38_var, 'w') as ofhg38v, \
+            open(args.output_hg19_sv, 'w') as ofhg19sv, \
+            open(args.output_hg38_sv, 'w') as ofhg38sv:
+        print("Reading %s..." % ifh.name)
+
+        headers = ifh.readline().rstrip("\n").rstrip("\r").strip().split("\t")
+        rowRec = namedtuple("rec", headers)
+
+        _write_auto_sql_file(args.auto_sql_file)
+
+        for line in ifh:
+            row = line.rstrip("\n").rstrip("\r").split("\t")
+            rec = rowRec(*row)
+            if int(rec.Hg38_End) - int(rec.Hg38_Start) < args.length_threshold:
+                out19, out38 = ofhg19v, ofhg38v
+            else:
+                out19, out38 = ofhg19sv, ofhg38sv
+            # Start positions are decremented by one for the BED output
+            # (BED starts are 0-based; presumably the input is 1-based).
+            write_track_item(rec, str(int(rec.pyhgvs_Hg37_Start)-1), rec.pyhgvs_Hg37_End, out19)
+            write_track_item(rec, str(int(rec.Hg38_Start)-1), rec.Hg38_End, out38)
+
+        print("wrote to %s, %s, %s and %s" % (ofhg19v.name, ofhg38v.name, ofhg19sv.name, ofhg38sv.name))
+
+
+if __name__ == '__main__':
+    main()
diff --git a/pipeline/genomeBrowserTrack/brcaToBed.py b/pipeline/genomeBrowserTrack/brcaToBed.py
@@ -1,45 +1,6 @@
 #!/usr/bin/env python
 from collections import namedtuple, OrderedDict
-import argparse
-
-
-def _add_urls(s, url=None):
-    """ transform a list of URLs to hrefs """
-    lines = []
-    for part in s.split(","):
-        part = part.strip()
-        if part == "":
-            continue
-        if part.startswith("http"):
-            label = part.split("/")[-1]
-            if "=" in label:
-                label = label.split("=")[-1]
-            part = "<a href='%s'>%s</a>" % (part, label)
-            lines.append(part)
-        else:
-            if url == None:
-                lines.append(part)
-            else:
-                part = "<a href='%s%s'>%s</a>" % (url, part, part)
-                lines.append(part)
-
-    return ", ".join(lines)
-
-
-def _get_parser():
-    parser = argparse.ArgumentParser()
-
-    parser.add_argument("-i", "--input", help="Path to built_with_change_types.tsv file",
-                        default="output/release/built_with_change_types.tsv")
-
-    parser.add_argument("-o19", "--output-hg19", help="Output BED file with hg19",
-                        default="brcaExchange.hg19.bed")
-    parser.add_argument("-o38", "--output-hg38", help="Output BED file with hg38",
-                        default="brcaExchange.hg38.bed")
-
-    parser.add_argument("-a", "--auto-sql-file", help="Field definitions in AutoSQL format",
-                        default="brcaExchange.as")
-    return parser
+import genomeBrowserUtils
 
 
 def _write_auto_sql_file(as_path):
@@ -60,6 +21,7 @@ def _write_auto_sql_file(as_path):
         string outlink;    "Link to the variant in BRCA Exchange"
         string symbol;     "Gene Symbol"
         string cdna_hgvs;       "Variant ID in cDNA HGVS nomenclature"
+        string protein_hgvs;    "Variant ID in protein HGVS nomenclature"
         string CA_ID;       "ClinGen Allele Registry ID"
         string Clinical_significance_ENIGMA;      "Clinical Significance as curated by the ENIGMA VCEP"
         string _mouseOver; "mouse over field hidden"
@@ -71,18 +33,52 @@ def _write_auto_sql_file(as_path):
 
         print("wrote as file to {}".format(asFh.name))
 
+def write_track_item(rec, start, end, output_fp):
+    """Write one variant to output_fp as a single tab-separated BED line.
+
+    Args:
+        rec: record exposing the Chr, pyhgvs_cDNA, pyhgvs_Protein, CA_ID,
+            Gene_Symbol and Clinical_significance_ENIGMA attributes.
+        start: value written to both the chromStart and thickStart columns.
+        end: value written to both the chromEnd and thickEnd columns.
+        output_fp: writable text file object that receives the line.
+
+    Raises:
+        AssertionError: if the cDNA HGVS name of the record is "?".
+    """
+    chrom = "chr"+rec.Chr
+    score = 0
+    strand = "."
+    name = rec.pyhgvs_cDNA[0:254]
+    if name == "?":
+        # Raised explicitly instead of via ``assert`` so that the check is
+        # still performed when Python runs with -O.
+        raise AssertionError("variant has no cDNA HGVS name (got '?')")
+    thickStart = start
+    thickEnd = end
+    color = genomeBrowserUtils.pathogenicityToColor(rec.Clinical_significance_ENIGMA)
+    out_url = "https://brcaexchange.org/variant/" + rec.CA_ID
+    #
+    # When generating the mouseOver, truncate the cDNA and protein HGVS strings to 25 characters each,
+    # to not overwhelm the browser's internal limit of 255 characters.
+    mouseOver = ("<b>Gene:</b> %s<br>" + \
+                 "<b>HGVS cDNA:</b> %s<br>" + \
+                 "<b>HGVS Protein:</b> %s<br>" + \
+                 "<b>VCEP Curation:</b> %s<br>" + \
+                 "<b>URL:</b> %s<br>") \
+                 % (rec.Gene_Symbol, rec.pyhgvs_cDNA[0:25], rec.pyhgvs_Protein[0:25],
+                    rec.Clinical_significance_ENIGMA,
+                    out_url)
+    outRow = [chrom, start, end, name, score, strand, thickStart, thickEnd, color, out_url,
+              rec.Gene_Symbol,
+              genomeBrowserUtils.displayString(rec.pyhgvs_cDNA[0:254]),
+              genomeBrowserUtils.displayString(rec.pyhgvs_Protein[0:254]),
+              genomeBrowserUtils.displayString(rec.CA_ID),
+              rec.Clinical_significance_ENIGMA, mouseOver]
+    outRow = [str(x) for x in outRow]
+    output_fp.write("\t".join(outRow)+"\n")
+
+
+
+
+
+
 
 def main():
-    parser = _get_parser()
+    args = genomeBrowserUtils._get_args()
 
-    args = parser.parse_args()
-
-    with open(args.input, 'r') as ifh, open(args.output_hg19, 'w') as ofh19, open(args.output_hg38, 'w') as ofh38:
+    with open(args.input, 'r') as ifh:
+        ofhg19v = open(args.output_hg19_var, 'w')
+        ofhg38v = open(args.output_hg38_var, 'w') 
+        ofhg19sv = open(args.output_hg19_sv, 'w')
+        ofhg38sv =  open(args.output_hg38_sv, 'w')
         print("Reading %s..." % ifh.name)
 
         headers = ifh.readline().rstrip("\n").rstrip("\r").strip().split("\t")
         rowRec = namedtuple("rec", headers)
-        include_cols = ["Chr", "Pos", "pyhgvs_Hg37_Start", "pyhgvs_Hg37_End"]
 
         _write_auto_sql_file(args.auto_sql_file)
 
@@ -91,58 +87,15 @@ def main():
             rec = rowRec(*row)
             rd = OrderedDict(zip(headers, row)) # row as dict
 
-            pat = rec.Clinical_significance_ENIGMA.lower()
-            if "pathogen" in pat:
-                color = "255,0,0"
-            elif "benign" in pat:
-                color = "0,255,0"
-            elif "uncertain" in pat:
-                color = "100,100,100"
+            if int(rec.Hg38_End) - int(rec.Hg38_Start) < args.length_threshold:
+                write_track_item(rec, str(int(rec.pyhgvs_Hg37_Start)-1), rec.pyhgvs_Hg37_End, ofhg19v)
+                write_track_item(rec, str(int(rec.Hg38_Start)-1), rec.Hg38_End, ofhg38v)
             else:
-                color = "0,0,0"
-            out_url = "https://brcaexchange.org/variant/" + rec.CA_ID
-
-            chrom = "chr"+rec.Chr
-            score = 0
-            strand = "."
-            name = rec.pyhgvs_cDNA[0:254]
-            if name == "?":
-                assert(False)
-            #
-            # When generating the mouseOver, truncate the HGVS string to 100 characters, to not overhwelm
-            # the browser's internal limit of 255 characters.
-            mouseOver = ("<b>Variant ID:</b> %s %s<br>" + \
-                         "<b>ENIGMA VCEP Clinical Significance:</b> %s<br>" + \
-                         "<b>Variant URL:</b> %s<br>") \
-                         % (rec.Gene_Symbol, rec.pyhgvs_cDNA[0:100], rec.Clinical_significance_ENIGMA,
-                            out_url)
-
-            #Start with the hg19 version
-            start = str(int(rec.pyhgvs_Hg37_Start)-1)
-            end = rec.pyhgvs_Hg37_End
-            thickStart = start
-            thickEnd = end
-            outRow = [chrom, start, end, name, score, strand, thickStart, thickEnd, color, out_url,
-                      rec.Gene_Symbol, rec.pyhgvs_cDNA[0:254], rec.CA_ID,
-                      rec.Clinical_significance_ENIGMA, mouseOver]
-
-            outRow = [str(x) for x in outRow]
-            ofh19.write("\t".join(outRow)+"\n")
-
-            # Repeat with the hg38 version
-            ftLen = int(end)-int(start)
-            start = str(int(rec.Hg38_Start)-1)
-            end = str(int(start)+ftLen)
-            thickStart = start
-            thickEnd = end
-            outRow = [chrom, start, end, name, score, strand, thickStart, thickEnd, color, out_url,
-                      rec.Gene_Symbol, rec.pyhgvs_cDNA[0:254], rec.CA_ID,
-                      rec.Clinical_significance_ENIGMA, mouseOver]
-
-            outRow = [str(x) for x in outRow]
-            ofh38.write("\t".join(outRow)+"\n")
-
-        print("wrote to %s and %s" % (ofh19.name, ofh38.name))
+                write_track_item(rec, str(int(rec.pyhgvs_Hg37_Start)-1), rec.pyhgvs_Hg37_End, ofhg19sv)
+                write_track_item(rec, str(int(rec.Hg38_Start)-1), rec.Hg38_End, ofhg38sv)
+
+
+        print("wrote to %s, %s, %s and %s" % (ofhg19v.name, ofhg38v.name, ofhg19sv.name, ofhg38sv.name))
 
 
 if __name__ == '__main__':