Merge branch 'master' into vcep_trackhub

BRCAChallenge · Aug 31, 2024 · f9e9712 · f9e9712
2 parents 36497d8 + 0cffa22
commit f9e9712
Show file tree

Hide file tree

Showing 32 changed files with 981 additions and 93 deletions.
diff --git a/.circleci/config.yml b/.circleci/config.yml
@@ -1,6 +1,7 @@
 version: 2
 jobs:
    buildweb:
+     circleci_ip_ranges: true
      docker:
        - image: circleci/node:16
        - image: circleci/postgres:9.6.2
@@ -38,6 +39,7 @@ jobs:
        - store_test_results:
            path: ~/test_reports
    deploy-dev:
+     circleci_ip_ranges: true
      docker:
        - image: circleci/node:16
      steps:
@@ -60,6 +62,7 @@ jobs:
            name: deploying to dev machine
            command: ~/project/deployment/deploy-dev
    deploy-beta:
+     circleci_ip_ranges: true
      docker:
        - image: circleci/node:16
      steps:
@@ -94,8 +97,6 @@ jobs:
            command: |
              apk add --no-cache \
                py-pip=9.0.0-r1
-             pip install \
-              docker-compose==1.12.0
        - restore_cache:
            keys:
              - v1-{{ .Branch }}

diff --git a/deployment/deploy-dev b/deployment/deploy-dev
@@ -13,8 +13,9 @@ cd ${WEBSITE}
 npm run build:prod
 
 # deploy (not preserving owner/group)
-rsync -rlptD --delete --rsync-path='rsync' build/ ${USER}@${HOST}:/var/www/html/beta
-rsync -rlptD --delete --exclude="/uploads" --exclude="/downloads/*" --rsync-path='rsync' django/ ${USER}@${HOST}:/var/www/backend/beta/django
+# Force rsync's ssh to use ipv4 to prevent "Cannot assign requested address" error
+rsync -rlptD -e 'ssh -4' --delete --rsync-path='rsync' build/ ${USER}@${HOST}:/var/www/html/beta
+rsync -rlptD -e 'ssh -4' --delete --exclude="/uploads" --exclude="/downloads/*" --rsync-path='rsync' django/ ${USER}@${HOST}:/var/www/backend/beta/django
 
 requirements=$(cat requirements.txt)
 requirements=$(echo ${requirements}) # drop carriage returns

diff --git a/deployment/site_settings/config.beta.js b/deployment/site_settings/config.beta.js
@@ -5,7 +5,7 @@
         baseurl: '/',
         captcha_key: '', /* reCAPTCHA API key */
         maps_key: '', /* Google maps javascript API key */
-        backend_url: 'http://brcaexchange-prod.gi.ucsc.edu/backend',
+        backend_url: 'https://brcaexchange-prod.gi.ucsc.edu/backend',
         analytics: null,
         environment: 'beta'
     };

diff --git a/pipeline/Makefile b/pipeline/Makefile
@@ -75,7 +75,6 @@ endif
 
 COMMON_DOCKER_ARGS = --rm -u `id -u ${USER}`:$(DOCKER_GRP) \
 	-e "DATA_DATE=$(DATA_DATE)" \
-        -e "UTA_DB_URL=postgresql://[email protected]:$(UTA_PORT)/uta/uta_$(UTA_RELEASE_DATE)" \
 	-e "HGVS_SEQREPO_DIR=$(SEQ_REPO_DIR_DOCKER)/latest" \
   -e "PYTHONPATH=/opt/brca-exchange/pipeline" \
 	--network host \
@@ -142,7 +141,7 @@ test-coverage: ## Running pipeline unit tests with coverage information
 	docker run $(COMMON_DOCKER_ARGS) $(PIPELINE_IMAGE) bash -c 'cd /opt/brca-exchange/pipeline/data && bash getdata && cd /opt/brca-exchange/pipeline && pytest --cov --ignore=splicing/ && coverage html --include="/opt/brca-exchange/pipeline/*" --omit="*/test_*"'
 
 
-build-release: start-local-uta checkout build-docker setup-files setup-lovd download-resources download-seqrepo start-seqrepo-rest-service run-pipeline variants-by-source ## create new data release
+build-release: checkout build-docker setup-files setup-lovd download-resources download-seqrepo start-seqrepo-rest-service run-pipeline variants-by-source ## create new data release
 
 variants-by-source: ## postprocessing: compute statistics for changes with respect to the last release
 	docker run $(COMMON_DOCKER_ARGS) $(PIPELINE_IMAGE) python /opt/brca-exchange/pipeline/utilities/variantsBySource.py  -i /files/data/output/release/built_with_change_types.tsv -c true
@@ -179,7 +178,7 @@ post-release-cmds: cleanup-failed include-release-notes push-docker tag-release
 setup-data-from-latest-release-tar: setup-files ## sets up brca output dir with data contained in release archive from last release (only data from variant merging onwards)
 	tar -C $(OUT_DIR) -zxf $(PREVIOUS_RELEASE_PATH)
 
-setup-dev-env: start-local-uta build-docker setup-files download-resources ## setup development environment
+setup-dev-env: build-docker setup-files download-resources ## setup development environment
 
 clean-pyc: ## remove Python file artifacts
 	find . -name '*.pyc' -exec rm -f {} +

diff --git a/pipeline/analysis/add_bioinfo_pred.py b/pipeline/analysis/add_bioinfo_pred.py
@@ -0,0 +1,204 @@
+#!/usr/bin/env python
+
+import argparse
+import csv
+import re
+import sys
+
+# Names of the two columns this script adds to the output file
+# (see initialize_output_file and main).
+BIOINFO_CODE_ID = "Provisional_Evidence_Code_Bioinfo"
+BIOINFO_CODE_DESCR = "Provisional_Evidence_Description_Bioinfo"
+
+
+# Evidence-code strings returned by estimate_bioinfo_code() and
+# apply_pvs1_code(); NO_CODE is the value used when no rule assigns a code.
+NO_CODE = "NO_CODE"
+PP3 = "PP3"
+BP4_BP7 = "BP4,BP7"
+BP4 = "BP4"
+BP1_STRONG = "BP1_STRONG"
+PVS1_CODE = "PVS1"
+
+def parse_args():
+    """
+    Parse the command-line options and return the argparse namespace.
+
+    Options: -i/--input (input TSV path), -o/--output (output TSV path)
+    and -d/--debug (boolean flag, off by default).
+    """
+    cli = argparse.ArgumentParser()
+    cli.add_argument("-i", "--input", default="build_final.tsv",
+                     help="built_final")
+    cli.add_argument("-o", "--output", default="built_with_bioinfo.tsv",
+                     help="version of input file with new columns added")
+    cli.add_argument("-d", "--debug", default=False, action="store_true",
+                     help="Print debugging info")
+    return cli.parse_args()
+
+
+def initialize_output_file(input_file, output_filename):
+    """
+    Create the output file, write its header row, and return the writer.
+
+    input_file is a csv.DictReader whose fieldnames supply the input
+    header.  The two new columns are inserted immediately before the
+    "change_type" column when the input has one, and appended at the end
+    otherwise.  Returns a tab-delimited csv.DictWriter on output_filename.
+
+    The underlying file handle is owned by the returned writer and is not
+    closed here.
+    """
+    new_columns = [BIOINFO_CODE_ID, BIOINFO_CODE_DESCR]
+    # fieldnames is None when the input file is empty; treat that as an
+    # empty header instead of failing on the membership test below.
+    input_header_row = list(input_file.fieldnames or [])
+    if "change_type" in input_header_row:
+        idx = input_header_row.index("change_type")
+        output_header_row = input_header_row[:idx] + new_columns \
+            + input_header_row[idx:]
+    else:
+        output_header_row = input_header_row + new_columns
+    # newline="" lets the csv module control line endings itself, as the
+    # csv documentation requires for file objects handed to a writer.
+    output_file = csv.DictWriter(open(output_filename, "w", newline=""),
+                                 fieldnames=output_header_row,
+                                 delimiter='\t')
+    output_file.writeheader()
+    return output_file
+
+
+def extract_protein_coordinate(variant):
+    """
+    Return the first run of decimal digits found in
+    variant["Protein_Change"] as an int, or None when the field contains
+    no digits.  Prints the derived value when one is found.
+    """
+    protein_change = variant["Protein_Change"]
+    digits = re.search("[0-9]+", protein_change)
+    if digits is None:
+        return None
+    pos = int(digits.group(0))
+    print("from", protein_change, "derived", pos)
+    return pos
+
+def inside_functional_domain(variant):
+    """
+    Return True when the protein position derived from the variant falls
+    inside one of the intervals listed below for its Gene_Symbol, and
+    False otherwise (including when no position can be derived or the
+    gene is neither BRCA1 nor BRCA2).
+    """
+    # Inclusive (first, last) protein-position intervals per gene symbol.
+    # NOTE(review): the BRCA2 intervals 3263-3269 and 3265-3330 overlap;
+    # kept as listed -- confirm against the intended domain table.
+    intervals_by_gene = {
+        "BRCA1": ((2, 99), (503, 508), (607, 614), (651, 656),
+                  (1391, 1424), (1650, 1863)),
+        "BRCA2": ((10, 40), (1002, 1036), (1212, 1246), (1422, 1453),
+                  (1518, 1549), (1665, 1696), (1837, 1871), (1971, 2005),
+                  (2051, 2085), (2481, 3186), (3263, 3269), (3265, 3330),
+                  (3381, 3385)),
+    }
+    pos = extract_protein_coordinate(variant)
+    if not pos:
+        return False
+    intervals = intervals_by_gene.get(variant["Gene_Symbol"], ())
+    return any(first <= pos <= last for first, last in intervals)
+
+
+
+def estimate_bioinfo_code(variant):
+    """
+    Derive an evidence code for one variant row and return it.
+
+    variant is a dict-like row; the fields read are pyhgvs_Protein,
+    Protein_Change, pyhgvs_cDNA, result_spliceai, Gene_Symbol and (for
+    BRCA1/BRCA2 only) BayesDel_nsfp33a_noAF.  A value of "-" in a score
+    field is treated as "no score available".
+
+    Returns one of PP3, BP4, BP4_BP7, BP1_STRONG or NO_CODE.  Raises
+    ValueError if a score field is neither "-" nor parseable as a float.
+    """
+    effect = "unknown"
+    bioinfo_code = NO_CODE
+    # Raw strings: "\)" and "\." are not valid string escapes.
+    if re.search(r"=\)$", variant["pyhgvs_Protein"]):
+        effect = "synonymous_variant"
+    elif re.search(r"[A-Z]+[0-9]+[A-Z]+", variant["Protein_Change"]):
+        effect = "missense_variant"
+    elif re.search(r"c\.[0-9]+[+-]", variant["pyhgvs_cDNA"]):
+        effect = "intron_variant"
+    print("variant", variant["pyhgvs_cDNA"], "protein change", variant["Protein_Change"], variant["pyhgvs_Protein"], "effect", effect)
+
+    # Splicing score: above 0.2 counts as an effect, below 0.1 (or no
+    # score) as no effect; anything in between is neither.
+    if variant["result_spliceai"] == "-":
+        splicing_effect = False
+        no_splicing_effect = True
+    else:
+        spliceai_score = float(variant["result_spliceai"])
+        splicing_effect = spliceai_score > 0.2
+        no_splicing_effect = spliceai_score < 0.1
+
+    # Per-gene BayesDel cut-offs: (no-effect-below, effect-above).
+    bayesdel_cutoffs = {
+        "BRCA1": (0.15, 0.28),
+        "BRCA2": (0.18, 0.30),
+    }
+    # Defaults cover gene symbols without cut-offs, so both flags are
+    # always bound before the decision tree below.
+    protein_effect = False
+    no_protein_effect = False
+    gene = variant["Gene_Symbol"]
+    if gene in bayesdel_cutoffs:
+        no_effect_below, effect_above = bayesdel_cutoffs[gene]
+        if variant["BayesDel_nsfp33a_noAF"] == "-":
+            no_protein_effect = True
+        else:
+            bayesdel_score = float(variant["BayesDel_nsfp33a_noAF"])
+            protein_effect = bayesdel_score > effect_above
+            no_protein_effect = bayesdel_score < no_effect_below
+
+    inside_domain = inside_functional_domain(variant)
+    print("effect", effect, "splicing effect", splicing_effect, "inside domain", inside_domain)
+    if effect == "synonymous_variant":
+        if splicing_effect:
+            bioinfo_code = PP3
+        elif inside_domain:
+            bioinfo_code = BP4_BP7
+        else:
+            bioinfo_code = BP1_STRONG
+    elif effect == "intron_variant":
+        if splicing_effect:
+            bioinfo_code = PP3
+        else:
+            bioinfo_code = BP4
+    elif effect == "missense_variant":
+        if splicing_effect:
+            bioinfo_code = PP3
+        elif no_splicing_effect:
+            if not inside_domain:
+                bioinfo_code = BP1_STRONG
+            elif protein_effect:
+                bioinfo_code = PP3
+            elif no_protein_effect:
+                bioinfo_code = BP4
+        else:
+            # Intermediate splicing score: only a protein-level effect
+            # inside a listed domain yields a code.
+            if inside_domain and protein_effect:
+                bioinfo_code = PP3
+    return bioinfo_code
+
+
+def apply_pvs1_code(variant):
+    """
+    Return PVS1_CODE when variant["pyhgvs_Protein"] contains the
+    substring "Ter", and NO_CODE otherwise.
+    """
+    if "Ter" in variant["pyhgvs_Protein"]:
+        return PVS1_CODE
+    return NO_CODE
+
+
+def main():
+    """
+    Copy the input TSV to the output TSV with the two bioinformatics
+    columns added.  Both new columns are currently written as empty
+    strings for every row.
+    """
+    csv.field_size_limit(sys.maxsize)
+    options = parse_args()
+    with open(options.input, 'r') as input_fp:
+        reader = csv.DictReader(input_fp, delimiter="\t")
+        writer = initialize_output_file(reader, options.output)
+        # estimate_bioinfo_code() / apply_pvs1_code() are not applied
+        # here; empty placeholders are written instead.
+        for variant in reader:
+            variant.update({BIOINFO_CODE_ID: "", BIOINFO_CODE_DESCR: ""})
+            writer.writerow(variant)
+
+if __name__ == "__main__":
+    main()