From cffbc8c7ffd3ad6dc5c95808b7bca8269fa0e9ac Mon Sep 17 00:00:00 2001
From: Miguel Brown <miguel.a.brown@gmail.com>
Date: Wed, 21 Feb 2024 10:20:42 -0500
Subject: [PATCH 01/13] :pencil: start v14 updates

---
 COLLABORATIONS/openTARGETS/README.md          |   9 +-
 .../openpedcan_v14_case_meta_config.json      | 169 ++++++++++++++++++
 2 files changed, 175 insertions(+), 3 deletions(-)
 create mode 100644 COLLABORATIONS/openTARGETS/openpedcan_v14_case_meta_config.json

diff --git a/COLLABORATIONS/openTARGETS/README.md b/COLLABORATIONS/openTARGETS/README.md
index 45da505..f959b46 100644
--- a/COLLABORATIONS/openTARGETS/README.md
+++ b/COLLABORATIONS/openTARGETS/README.md
@@ -13,14 +13,15 @@ Genomic data generally obtained as such:
  - Copy number: tsv file with copy number, ploidy, and GISTIC-style information in maf-like format (each call is a row)
  - RNA expression: tpm values from rsem stored an `.rds` object
  - RNA fusion: annoFuse output
-For example, for v12, bucket s3://d3b-openaccess-us-east-1-prd-pbta/open-targets/v12/:
+For example, for v14, bucket s3://d3b-openaccess-us-east-1-prd-pbta/open-targets/v14/:
 ```
-consensus_wgs_plus_cnvkit_wxs.tsv.gz
+consensus_wgs_plus_cnvkit_wxs_plus_freec_tumor_only.tsv.gz
 fusion-dgd.tsv.gz
 fusion-putative-oncogenic.tsv
 gene-expression-rsem-tpm-collapsed.rds
 tcga-gene-expression-rsem-tpm-collapsed.rds
 snv-consensus-plus-hotspots.maf.tsv.gz
+snv-mutect2-tumor-only-plus-hotspots.maf.tsv.gz
 ```
 
 ### Prep work
@@ -68,13 +69,15 @@ To create the histologies file, recommended method is to:
 	  union
     select participant_id, formatted_sample_id, specimen_id, analyte_types, normal_bs_id, normal_sample_id
     from prod_cbio.chdm_phs001643_2018_cbio_sample
+    select participant_id, formatted_sample_id, specimen_id, analyte_types, normal_bs_id, normal_sample_id
+    from prod_cbio.pbta_mioncoseq_cbio_sample
 
     ```
 1. Get a blacklist from D3b Warehouse, exporting table `bix_workflows.cbio_hide_reasons
 
 ### Run as standalone
 1. Download from https://github.com/PediatricOpenTargets/OpenPedCan-analysis the `analyses/pedcbio-sample-name/pedcbio_sample_name_col.R` or run from repo if you have it
-1. Run `Rscript --vanilla pedcbio_sample_name_col.R -i path-to-histolgies-file.tsv -n path-to-cbio-names.csv -b Methylation`
+1. Run `Rscript --vanilla pedcbio_sample_name_col.R -i path-to-histologies-file.tsv -n path-to-cbio-names.csv -b Methylation`
 OR
 ### Run in repo
 1. Either run an interactive docker or using your local R, and ensure to mount a volume that will have the repo and whatever input histologies file you end up using, i.e. `docker run -it --mount type=bind,source=/home/ubuntu,target=/WORK pgc-images.sbgenomics.com/d3b-bixu/open-pedcan:latest /bin/bash`
diff --git a/COLLABORATIONS/openTARGETS/openpedcan_v14_case_meta_config.json b/COLLABORATIONS/openTARGETS/openpedcan_v14_case_meta_config.json
new file mode 100644
index 0000000..c53384e
--- /dev/null
+++ b/COLLABORATIONS/openTARGETS/openpedcan_v14_case_meta_config.json
@@ -0,0 +1,169 @@
+{
+    "merged_mafs": {
+        "dir": "study_build",
+        "dtypes": {
+            "mutation": {
+                "ext": "maf",
+                "cbio_name": "data_mutations_extended.txt",
+                "meta_file_attr": {
+                    "stable_id": "mutations",
+                    "profile_name": "Mutations",
+                    "profile_description": "Consensus calls from strelka2, mutect2, lancet, and VarDict Java.  Two or more callers required to pass, < 0.001 frequency in gnomAD, and min read depth 8 in normal sample.",
+                    "genetic_alteration_type": "MUTATION_EXTENDED",
+                    "variant_classification_filter": "Silent,Intron,3'UTR,3'Flank,5'UTR,IGR,RNA",
+                    "datatype": "MAF",
+                    "show_profile_in_analysis_tab": "true"
+                }
+            }
+        }
+      },
+    "merged_cnvs": {
+        "dir": "study_build",
+        "dtypes": {
+            "linear": {
+                "ext": "predicted_cnv.txt",
+                "cbio_name": "data_linear_CNA.txt",
+                "meta_file_attr": {
+                    "stable_id": "linear_CNA",
+                    "profile_name": "copy-number values",
+                    "profile_description": "Predicted copy number values from WGS and WXS (Continuous). openPBTA consensus method used",
+                    "genetic_alteration_type": "COPY_NUMBER_ALTERATION",
+                    "datatype": "CONTINUOUS",
+                    "show_profile_in_analysis_tab": "false"
+                }
+
+            },
+            "discrete": {
+                "ext": "discrete_cnvs.txt",
+                "cbio_name": "data_CNA.txt",
+                "meta_file_attr": {
+                    "stable_id": "cna",
+                    "profile_name": "Binned copy-number values",
+                    "profile_description": "Predicted copy number values from WGS and WXS sequencing (Discrete). Values: -2 = homozygous deletion; -1 = hemizygous deletion; 0 = neutral / no change; 1 = gain; 2 = high level amplification. openPBTA consensus method used",
+                    "genetic_alteration_type": "COPY_NUMBER_ALTERATION",
+                    "datatype": "DISCRETE",
+                    "show_profile_in_analysis_tab": "true"
+                }
+            }
+        }
+      },
+    "merged_rsem": {
+        "dir": "study_build",
+        "dtypes": {
+            "counts": {
+                "ext": "rsem_merged.txt",
+                "cbio_name": "data_mrna_seq_v2_rsem.txt",
+                "meta_file_attr": {
+                    "stable_id": "rna_seq_v2_mrna",
+                    "profile_name": "RNA expression",
+                    "profile_description": "Expression levels from RNA-Seq (rsem TPM). Only common transcripts between OpenPedCan harmonized (GENCODE27) and TCGA (GENCODE36) were kept.",
+                    "genetic_alteration_type": "MRNA_EXPRESSION",
+                    "datatype": "CONTINUOUS",
+                    "show_profile_in_analysis_tab": "false"
+                }
+            },
+            "zscore": {
+                "ext": "rsem_merged_zscore.txt",
+                "cbio_name": "data_mrna_seq_v2_rsem_zscores_ref_all_samples.txt",
+                "meta_file_attr": {
+                    "stable_id": "rna_seq_v2_mrna_median_Zscores",
+                    "profile_name": "RNA expression z-scores",
+                    "profile_description": "Expression levels from RNA-Seq, Z scores of log2(TPM + 1) values",
+                    "genetic_alteration_type": "MRNA_EXPRESSION",
+                    "datatype": "Z-SCORE",
+                    "show_profile_in_analysis_tab": "true"
+                }
+            }
+        }
+    },
+    "merged_fusion": {
+        "dir": "study_build",
+        "dtypes": {
+            "fusion": {
+                "ext": "fusions.txt",
+                "cbio_name": "data_sv.txt",
+                "meta_file_attr": {
+                    "stable_id": "structural_variants",
+                    "profile_name": "Predicted RNA fusions",
+                    "profile_description": "Fusion data, from openPBTA using arriba and STAR Fusion, annotated and filtered using annoFuse. DGD sample subset consists of panel fusions",
+                    "genetic_alteration_type": "STRUCTURAL_VARIANT",
+                    "datatype": "SV",
+                    "show_profile_in_analysis_tab": "true"
+                }
+            }
+        }
+    },
+    "data_sheets": {
+        "dir": "datasheets",
+        "dtypes": {
+            "patient": {
+                "cbio_name": "data_clinical_patient.txt",
+                "meta_file_attr": {
+                    "genetic_alteration_type": "CLINICAL",
+                    "datatype": "PATIENT_ATTRIBUTES"
+                }
+            },
+            "sample": {
+                "cbio_name": "data_clinical_sample.txt",
+                "meta_file_attr": {
+                    "genetic_alteration_type": "CLINICAL",
+                    "datatype": "SAMPLE_ATTRIBUTES"
+                }
+            }
+
+        }
+    },
+    "study": {
+        "_comment": "see https://docs.cbioportal.org/5.1-data-loading/data-loading/file-formats#cancer-study for detailed specifics",
+        "description": "<a href=\"https://github.com/PediatricOpenTargets/OpenPedCan-analysis\">OpenPedCan</a> is a collaborative project between the National Cancer Institute and the Children's Hospital of Philadelphia as part of the NCI's Childhood Cancer Data Initiative. Here, we harmonize pan-cancer data using <a href=\"https://kidsfirstdrc.org/\">KidsFirst Data Resource Center</a> workflows and harness <a href=\"https://github.com/AlexsLemonade/OpenPBTA-analysis/\">OpenPBTA analytics</a> workflows to scale and add modules across pediatric cancer datasets. This data has been integrated into the pediatric open targets platform to assist in development and query of the FDA's Relevant Pediatric Molecular Targets List (PMTL) to identify new therapeutics for children with cancer. This is the v14 release of this effort, for v11, the latest accepted production release please see <a href=\"https://pedcbioportal.kidsfirstdrc.org/study/summary?id=openpedcan_v11\">OpenPedCan v11</a>. For study release details, please see <a href=\"https://tinyurl.com/55cxz9am\">Release Notes</a>",
+        "groups": "PUBLIC",
+        "cancer_study_identifier": "openpedcan_v14",
+        "reference_genome": "hg38",
+        "display_name": "Open Pediatric Cancer (OpenPedCan) Project v14",
+        "type_of_cancer": "mixed",
+        "short_name": "openpedcan_v14"
+
+    },
+    "cases_3way_complete": {
+        "stable_id": "3way_complete",
+        "case_list_name": "Tumor samples with mutation, CNA and mRNA data",
+        "case_list_description": "All tumor samples with mutation, CNA, and mRNA data",
+        "case_list_category": "all_cases_with_mutation_and_cna_and_mrna_data"
+    },
+    "cases_all": {
+        "stable_id": "all",
+        "case_list_name": "All Samples",
+        "case_list_description": "All samples in study",
+        "case_list_category": "all_cases_in_study"
+    },
+    "cases_cnaseq": {
+        "stable_id": "cnaseq",
+        "case_list_name": "Tumor samples with mutation and CNA data",
+        "case_list_description": "All tumor samples with mutation and CNA data",
+        "case_list_category": "all_cases_with_mutation_and_cna_data"
+    },
+    "cases_cna": {
+        "stable_id": "cna",
+        "case_list_name": "Tumor Samples with CNA data",
+        "case_list_description": "All tumors with CNA data",
+        "case_list_category": "all_cases_with_cna_data"
+    },
+    "cases_RNA_Seq_v2_mRNA": {
+        "stable_id": "rna_seq_v2_mrna",
+        "case_list_name": "All Samples with mRNA data (RNA Seq V2)",
+        "case_list_description": "All samples with mRNA expression data",
+        "case_list_category": "all_cases_with_mrna_rnaseq_data"
+    },
+    "cases_sequenced": {
+        "stable_id": "sequenced",
+        "case_list_name": "Tumor samples with mutations",
+        "case_list_description": "All tumor samples with mutation data",
+        "case_list_category": "all_cases_with_mutation_data"
+    },
+    "cases_sv": {
+        "stable_id": "sv",
+        "case_list_name": "Tumor samples with fusions",
+        "case_list_description": "All tumor samples with fusion data",
+        "case_list_category": "all_cases_with_sv_data"
+    }
+}
\ No newline at end of file

From 04ca3a8a58b6792a24fae61776baa4f35a7963a9 Mon Sep 17 00:00:00 2001
From: Miguel Brown <miguel.a.brown@gmail.com>
Date: Wed, 21 Feb 2024 19:43:34 +0000
Subject: [PATCH 02/13] :wrench: fixed drop list behavior

---
 COLLABORATIONS/openTARGETS/pedcbio_sample_name_col.R | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/COLLABORATIONS/openTARGETS/pedcbio_sample_name_col.R b/COLLABORATIONS/openTARGETS/pedcbio_sample_name_col.R
index 0668011..7b3f55a 100644
--- a/COLLABORATIONS/openTARGETS/pedcbio_sample_name_col.R
+++ b/COLLABORATIONS/openTARGETS/pedcbio_sample_name_col.R
@@ -59,11 +59,9 @@ cbio_names_list <- lapply(files_list, function(cbio_names){
 histology_df <- readr::read_tsv(hist_file, guess_max = 100000)
 message("Read histologies file")
 if (!is.null(opt$blacklist_strategy)){
-  drop_list <- strsplit(opt$blacklist_strategy, split = ",")
-  for (drop in drop_list){
-    histology_df <- dplyr::filter(histology_df, experimental_strategy != drop)
-    message(paste0("Dropping ", drop," as specified"))
-  }
+  drop_list <- as.list(strsplit(opt$blacklist_strategy, split = ",")[[1]])
+  message(paste0("Dropping ", opt$blacklist_strategy," as specified\n"))
+  histology_df <- histology_df %>% dplyr::filter(!experimental_strategy %in% drop_list)
 }
 # tmp update for broad histology bug, to be fixed in v11
 histology_df <- histology_df %>%

From 58bd8e897747335f7136a236c55451909704698b Mon Sep 17 00:00:00 2001
From: Miguel Brown <miguel.a.brown@gmail.com>
Date: Thu, 22 Feb 2024 18:10:50 +0000
Subject: [PATCH 03/13] :construction: WIP - improved DGD disambiguation, corrected
 PFS to EFS

---
 COLLABORATIONS/openTARGETS/clinical_to_datasheets.py | 2 +-
 COLLABORATIONS/openTARGETS/header_desc.tsv           | 6 ++++--
 COLLABORATIONS/openTARGETS/pedcbio_sample_name_col.R | 8 +++++---
 3 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/COLLABORATIONS/openTARGETS/clinical_to_datasheets.py b/COLLABORATIONS/openTARGETS/clinical_to_datasheets.py
index b8be856..5047111 100644
--- a/COLLABORATIONS/openTARGETS/clinical_to_datasheets.py
+++ b/COLLABORATIONS/openTARGETS/clinical_to_datasheets.py
@@ -214,7 +214,7 @@ def build_header(header_list, entry):
                         # age_at_last_known = float(value)
                         # age_at_last_known = str(math.floor(age_at_last_known/365.25))
                         value = str(math.floor(float(value)/365.25))
-                elif header[i] == "PFS_days" and value != "NA":
+                elif header[i] == "EFS_days" and value != "NA":
                     value = str(math.floor(float(value)/30.5))
                     # d_free_mos = value
             elif header[i] == "tumor_descriptor":
diff --git a/COLLABORATIONS/openTARGETS/header_desc.tsv b/COLLABORATIONS/openTARGETS/header_desc.tsv
index 9baaef5..d847927 100644
--- a/COLLABORATIONS/openTARGETS/header_desc.tsv
+++ b/COLLABORATIONS/openTARGETS/header_desc.tsv
@@ -3,7 +3,8 @@ germline_sex_estimate			0	1	STRING	1	Female
 cancer_predispositions			0	1	STRING	2	None documented	
 OS_days	OS_MONTHS	Overall survival in months since initial diagnosis	0	1	NUMBER	3	NA	just convert to months
 OS_status	OS_STATUS	Overall patient survival status	0	1	STRING	4	LIVING	
-PFS_days	PFS_MONTHS	Progression free (months) since initial treatment	0	1	NUMBER	5	Not Reported	just convert to months
+EFS_days	EFS_MONTHS	Event free (months) since initial treatment	0	1	NUMBER	5	Not Reported	just convert to months
+EFS_event_type	EFS_STATUS	Event free status since initial treatment	0	1	STRING	5	Not Reported	
 ethnicity	ETHNICITY		0	1	STRING	6	Not Hispanic or Latino	
 race	RACE		0	1	STRING	7	White	
 primary_site	TUMOR_SITE		0	1	STRING	8		
@@ -18,7 +19,8 @@ harmonized_diagnosis	CANCER_TYPE_DETAILED		1	0	STRING	10	Adenoma
 primary_site	TUMOR_TISSUE_SITE		1	0	STRING	9	Suprasellar/Hypothalamic/Pituitary	
 tumor_descriptor	TUMOR_TYPE		1	0	STRING	8	Initial CNS Tumor	
 composition	SAMPLE_TYPE		1	0	STRING	7	Solid Tissue	
-cohort	COHORT	Source study cohort name	1	0	STRING	6		
+cohort	COHORT	Source study cohort name	1	0	STRING	6	
+sub_cohort	SUB_COHORT	Source study sub-cohort name	1	0	STRING	6	
 CNS_region			1	0	STRING	5	Suprasellar	
 tumor_ploidy			1	0	NUMBER	4	3	
 tumor_fraction			1	0	NUMBER	3	0.476369391	
diff --git a/COLLABORATIONS/openTARGETS/pedcbio_sample_name_col.R b/COLLABORATIONS/openTARGETS/pedcbio_sample_name_col.R
index 7b3f55a..306681c 100644
--- a/COLLABORATIONS/openTARGETS/pedcbio_sample_name_col.R
+++ b/COLLABORATIONS/openTARGETS/pedcbio_sample_name_col.R
@@ -119,7 +119,7 @@ message("Collated samples missing a cBio ID")
 #### Handle each cohort at a time - start with PBTA
 # get all sample IDs in the PBTA cohort
 sample_ids_pbta <- histology_df_no_format_id %>% 
-  dplyr::filter(cohort == "PBTA") %>% 
+  dplyr::filter(cohort == "PBTA", sub_cohort != "DGD") %>% 
   pull(sample_id) %>% 
   unique()
 
@@ -267,8 +267,10 @@ message("FINALIZE NAMES")
 no_need_for_tiebreaks <- histology_df %>%
   dplyr::filter(!Kids_First_Biospecimen_ID %in% all_tiebreaks$Kids_First_Biospecimen_ID) %>%
   dplyr::mutate(formatted_sample_id = case_when(
-    cohort == "PBTA" ~ sample_id,
-    cohort == "DGD" ~ gsub("(^.*DGD)_\\w+_(\\d+$)", "\\1_\\2", aliquot_id),
+    (cohort == "PBTA" & sub_cohort != "DGD") ~ sample_id,
+    sub_cohort == "DGD" ~ gsub("(^.*DGD)_\\w+_(\\d+$)", "\\1_\\2", aliquot_id),
+    ((cohort == "Maris" | cohort != "PPTC") & composition == "Derived Cell Line") ~ paste0(Kids_First_Participant_ID,"-CL"),
+    ((cohort == "Maris" | cohort != "PPTC") & composition == "Patient Derived Xenograft") ~ paste0(Kids_First_Participant_ID,"-PDX"),
     TRUE ~ Kids_First_Participant_ID
 ))
 

From dde232c7bab2c9b103fcaf7d5f12995ab9046f13 Mon Sep 17 00:00:00 2001
From: Miguel Brown <miguel.a.brown@gmail.com>
Date: Thu, 29 Feb 2024 11:20:53 -0500
Subject: [PATCH 04/13] :wrench: fixed deprecated regex and how PPTC and Maris
 models are named

---
 COLLABORATIONS/openTARGETS/README.md                 | 5 +++--
 COLLABORATIONS/openTARGETS/clinical_to_datasheets.py | 2 +-
 COLLABORATIONS/openTARGETS/pedcbio_sample_name_col.R | 4 ++--
 3 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/COLLABORATIONS/openTARGETS/README.md b/COLLABORATIONS/openTARGETS/README.md
index f959b46..4649390 100644
--- a/COLLABORATIONS/openTARGETS/README.md
+++ b/COLLABORATIONS/openTARGETS/README.md
@@ -25,7 +25,7 @@ snv-mutect2-tumor-only-plus-hotspots.maf.tsv.gz
 ```
 
 ### Prep work
-The histologies file needs `formatted_sample_id` added and likely a blacklist from the D3b Warehouse or some other source to supress duplicate RNA libraries from different sequencing methods.
+The histologies file needs `formatted_sample_id` added and likely a blacklist from the D3b Warehouse or some other source to suppress duplicate RNA libraries from different sequencing methods.
 Since we are not handling `Methylation` yet, it is recommended those entries be removed ahead of time.
 To create the histologies file, recommended method is to:
 1. `docker pull pgc-images.sbgenomics.com/d3b-bixu/open-pedcan:latest` OR if you have R installed locally, ensure the following libraries are installed:
@@ -69,6 +69,7 @@ To create the histologies file, recommended method is to:
 	  union
     select participant_id, formatted_sample_id, specimen_id, analyte_types, normal_bs_id, normal_sample_id
     from prod_cbio.chdm_phs001643_2018_cbio_sample
+    union
     select participant_id, formatted_sample_id, specimen_id, analyte_types, normal_bs_id, normal_sample_id
     from prod_cbio.pbta_mioncoseq_cbio_sample
 
@@ -77,7 +78,7 @@ To create the histologies file, recommended method is to:
 
 ### Run as standalone
 1. Download from https://github.com/PediatricOpenTargets/OpenPedCan-analysis the `analyses/pedcbio-sample-name/pedcbio_sample_name_col.R` or run from repo if you have it
-1. Run `Rscript --vanilla pedcbio_sample_name_col.R -i path-to-histologies-file.tsv -n path-to-cbio-names.csv -b Methylation`
+1. Run `Rscript --vanilla pedcbio_sample_name_col.R -i path-to-histologies-file.tsv -n path-to-cbio-names.csv -b 'Methylation,Phospho-Proteomics,Whole Cell Proteomics,miRNA-Seq'`
 OR
 ### Run in repo
 1. Either run an interactive docker or using your local R, and ensure to mount a volume that will have the repo and whatever input histologies file you end up using, i.e. `docker run -it --mount type=bind,source=/home/ubuntu,target=/WORK pgc-images.sbgenomics.com/d3b-bixu/open-pedcan:latest /bin/bash`
diff --git a/COLLABORATIONS/openTARGETS/clinical_to_datasheets.py b/COLLABORATIONS/openTARGETS/clinical_to_datasheets.py
index 5047111..8f6615e 100644
--- a/COLLABORATIONS/openTARGETS/clinical_to_datasheets.py
+++ b/COLLABORATIONS/openTARGETS/clinical_to_datasheets.py
@@ -185,7 +185,7 @@ def build_header(header_list, entry):
         # if DGD DNA, add to gene matrix
         elif info[cohort] == 'DGD':
             # parse aliquot for panel type, i.e. ET_242MFKXW_DGD_STNGS_93
-            test = re.match('.*_DGD_(\w+)_\d+', info[a_idx])
+            test = re.match(r'.*_DGD_(\w+)_\d+', info[a_idx])
             data_gene.write(info[cbio_id] + '\tCHOP-' + test.group(1) + '\n')
 
     pt_id = info[header.index("Kids_First_Participant_ID")]
diff --git a/COLLABORATIONS/openTARGETS/pedcbio_sample_name_col.R b/COLLABORATIONS/openTARGETS/pedcbio_sample_name_col.R
index 306681c..3fcddc1 100644
--- a/COLLABORATIONS/openTARGETS/pedcbio_sample_name_col.R
+++ b/COLLABORATIONS/openTARGETS/pedcbio_sample_name_col.R
@@ -269,8 +269,8 @@ no_need_for_tiebreaks <- histology_df %>%
   dplyr::mutate(formatted_sample_id = case_when(
     (cohort == "PBTA" & sub_cohort != "DGD") ~ sample_id,
     sub_cohort == "DGD" ~ gsub("(^.*DGD)_\\w+_(\\d+$)", "\\1_\\2", aliquot_id),
-    ((cohort == "Maris" | cohort != "PPTC") & composition == "Derived Cell Line") ~ paste0(Kids_First_Participant_ID,"-CL"),
-    ((cohort == "Maris" | cohort != "PPTC") & composition == "Patient Derived Xenograft") ~ paste0(Kids_First_Participant_ID,"-PDX"),
+    ((cohort == "Maris" | cohort == "PPTC") & composition == "Derived Cell Line") ~ paste0(sample_id,"-CL"),
+    ((cohort == "Maris" | cohort == "PPTC") & composition == "Patient Derived Xenograft") ~ paste0(sample_id,"-PDX"),
     TRUE ~ Kids_First_Participant_ID
 ))
 

From 57fff1e8b27148834c016ea6e7e577f21c9b5f64 Mon Sep 17 00:00:00 2001
From: Miguel Brown <miguel.a.brown@gmail.com>
Date: Tue, 5 Mar 2024 14:55:27 +0000
Subject: [PATCH 05/13] :pencil: rename to v15

---
 COLLABORATIONS/openTARGETS/README.md                        | 4 ++--
 ...eta_config.json => openpedcan_v15_case_meta_config.json} | 6 +++---
 2 files changed, 5 insertions(+), 5 deletions(-)
 rename COLLABORATIONS/openTARGETS/{openpedcan_v14_case_meta_config.json => openpedcan_v15_case_meta_config.json} (98%)

diff --git a/COLLABORATIONS/openTARGETS/README.md b/COLLABORATIONS/openTARGETS/README.md
index 4649390..4867046 100644
--- a/COLLABORATIONS/openTARGETS/README.md
+++ b/COLLABORATIONS/openTARGETS/README.md
@@ -19,7 +19,7 @@ consensus_wgs_plus_cnvkit_wxs_plus_freec_tumor_only.tsv.gz
 fusion-dgd.tsv.gz
 fusion-putative-oncogenic.tsv
 gene-expression-rsem-tpm-collapsed.rds
-tcga-gene-expression-rsem-tpm-collapsed.rds
+tcga_gene-expression-rsem-tpm-collapsed.rds
 snv-consensus-plus-hotspots.maf.tsv.gz
 snv-mutect2-tumor-only-plus-hotspots.maf.tsv.gz
 ```
@@ -44,7 +44,7 @@ To create the histologies file, recommended method is to:
     from prod_cbio.aml_sd_pet7q6f2_2018_cbio_sample
     union
     select participant_id, formatted_sample_id, specimen_id, analyte_types, normal_bs_id, normal_sample_id
-    from prod_cbio.aml_sd_z6mwd3h0_2018_cbio_sample
+    from prod_cbio.bllnos_sd_z6mwd3h0_2018_cbio_sample
     union
     select participant_id, formatted_sample_id, specimen_id, analyte_types, normal_bs_id, normal_sample_id
     from prod_cbio.x01_fy16_nbl_maris_cbio_sample
diff --git a/COLLABORATIONS/openTARGETS/openpedcan_v14_case_meta_config.json b/COLLABORATIONS/openTARGETS/openpedcan_v15_case_meta_config.json
similarity index 98%
rename from COLLABORATIONS/openTARGETS/openpedcan_v14_case_meta_config.json
rename to COLLABORATIONS/openTARGETS/openpedcan_v15_case_meta_config.json
index c53384e..ada2b57 100644
--- a/COLLABORATIONS/openTARGETS/openpedcan_v14_case_meta_config.json
+++ b/COLLABORATIONS/openTARGETS/openpedcan_v15_case_meta_config.json
@@ -115,13 +115,13 @@
     },
     "study": {
         "_comment": "see https://docs.cbioportal.org/5.1-data-loading/data-loading/file-formats#cancer-study for detailed specifics",
-        "description": "<a href=\"https://github.com/PediatricOpenTargets/OpenPedCan-analysis\">OpenPedCan</a> is a collaborative project between the National Cancer Institute and the Children's Hospital of Philadelphia as part of the NCI's Childhood Cancer Data Initiative. Here, we harmonize pan-cancer data using <a href=\"https://kidsfirstdrc.org/\">KidsFirst Data Resource Center</a> workflows and harness <a href=\"https://github.com/AlexsLemonade/OpenPBTA-analysis/\">OpenPBTA analytics</a> workflows to scale and add modules across pediatric cancer datasets. This data has been integrated into the pediatric open targets platform to assist in development and query of the FDA's Relevant Pediatric Molecular Targets List (PMTL) to identify new therapeutics for children with cancer. This is the v14 release of this effort, for v11, the latest accepted production release please see <a href=\"https://pedcbioportal.kidsfirstdrc.org/study/summary?id=openpedcan_v11\">OpenPedCan v11</a>. For study release details, please see <a href=\"https://tinyurl.com/55cxz9am\">Release Notes</a>",
+        "description": "<a href=\"https://github.com/PediatricOpenTargets/OpenPedCan-analysis\">OpenPedCan</a> is a collaborative project between the National Cancer Institute and the Children's Hospital of Philadelphia as part of the NCI's Childhood Cancer Data Initiative. Here, we harmonize pan-cancer data using <a href=\"https://kidsfirstdrc.org/\">KidsFirst Data Resource Center</a> workflows and harness <a href=\"https://github.com/AlexsLemonade/OpenPBTA-analysis/\">OpenPBTA analytics</a> workflows to scale and add modules across pediatric cancer datasets. This data has been integrated into the pediatric open targets platform to assist in development and query of the FDA's Relevant Pediatric Molecular Targets List (PMTL) to identify new therapeutics for children with cancer. This is the v15 release of this effort, for v11, the latest accepted production release please see <a href=\"https://pedcbioportal.kidsfirstdrc.org/study/summary?id=openpedcan_v11\">OpenPedCan v11</a>. For study release details, please see <a href=\"https://tinyurl.com/55cxz9am\">Release Notes</a>",
         "groups": "PUBLIC",
-        "cancer_study_identifier": "openpedcan_v14",
+        "cancer_study_identifier": "openpedcan_v15",
         "reference_genome": "hg38",
         "display_name": "Open Pediatric Cancer (OpenPedCan) Project v14",
         "type_of_cancer": "mixed",
-        "short_name": "openpedcan_v14"
+        "short_name": "openpedcan_v15"
 
     },
     "cases_3way_complete": {

From 81073dbf67a375716c8539ef4ec1304cd15d50ce Mon Sep 17 00:00:00 2001
From: Miguel Brown <miguel.a.brown@gmail.com>
Date: Tue, 5 Mar 2024 13:31:03 -0500
Subject: [PATCH 06/13] :hammer: rename append maf script :sparkles: added v15
 maf header :hammer: updated data clinical to better handle added DGD inputs

---
 COLLABORATIONS/openTARGETS/README.md          |  4 +-
 ...penpedcan.py => append_maf_to_existing.py} |  4 +-
 .../openTARGETS/clinical_to_datasheets.py     | 55 +++++++++++--------
 .../openTARGETS/maf_openpedcan_v15_header.txt |  2 +
 4 files changed, 38 insertions(+), 27 deletions(-)
 rename COLLABORATIONS/openTARGETS/{add_dgd_maf_to_openpedcan.py => append_maf_to_existing.py} (97%)
 create mode 100644 COLLABORATIONS/openTARGETS/maf_openpedcan_v15_header.txt

diff --git a/COLLABORATIONS/openTARGETS/README.md b/COLLABORATIONS/openTARGETS/README.md
index 4867046..6aae989 100644
--- a/COLLABORATIONS/openTARGETS/README.md
+++ b/COLLABORATIONS/openTARGETS/README.md
@@ -74,7 +74,7 @@ To create the histologies file, recommended method is to:
     from prod_cbio.pbta_mioncoseq_cbio_sample
 
     ```
-1. Get a blacklist from D3b Warehouse, exporting table `bix_workflows.cbio_hide_reasons
+1. Get a blacklist from D3b Warehouse, exporting table `bix_workflows.cbio_hide_reasons`
 
 ### Run as standalone
 1. Download from https://github.com/PediatricOpenTargets/OpenPedCan-analysis the `analyses/pedcbio-sample-name/pedcbio_sample_name_col.R` or run from repo if you have it
@@ -144,7 +144,7 @@ optional arguments:
 ```
 _NOTE_ for v11 input, I ran the following command `zcat snv-dgd.maf.tsv.gz | perl -e '$skip = <>; $skip= <>; while(<>){print $_;}' | gzip -c >> snv-consensus-plus-hotspots.maf.tsv.gz` to add DGD data
 
-_NOTE_ for v12 input,I would have following command `python3 ~/tools/kf-cbioportal-etl/COLLABORATIONS/openTARGETS/add_dgd_maf_to_openpedcan.py -i /home/ubuntu/tools/kf-cbioportal-etl/COLLABORATIONS/openTARGETS/maf_openpedcan_v12_header.txt -c openpedcan_v12.maf -t ../bs_id_sample_map.txt -m ../GF_INPUTS/snv-dgd.maf.tsv.gz` to add DGD data, which is more robust - however, there are data issues with DGD, so it was left out
+_NOTE_ for v15 input, I would use the following command `python3 ~/tools/kf-cbioportal-etl/COLLABORATIONS/openTARGETS/append_maf_to_existing.py -i /home/ubuntu/tools/kf-cbioportal-etl/COLLABORATIONS/openTARGETS/maf_openpedcan_v15_header.txt -c openpedcan_v15.maf -t ../bs_id_sample_map.txt -m ...INPUT_PREP/snv-mutect2-tumor-only-plus-hotspots.maf.tsv.gz` to add tumor-only data, which is more robust
 
 Example run:
 `python3 COLLABORATIONS/openTARGETS/rename_filter_maf.py -m bs_id_sample_map.txt -v snv-consensus-plus-hotspots.maf.tsv.gz -s 1 -n openpedcan_v12`
diff --git a/COLLABORATIONS/openTARGETS/add_dgd_maf_to_openpedcan.py b/COLLABORATIONS/openTARGETS/append_maf_to_existing.py
similarity index 97%
rename from COLLABORATIONS/openTARGETS/add_dgd_maf_to_openpedcan.py
rename to COLLABORATIONS/openTARGETS/append_maf_to_existing.py
index b4999fb..67d35a2 100644
--- a/COLLABORATIONS/openTARGETS/add_dgd_maf_to_openpedcan.py
+++ b/COLLABORATIONS/openTARGETS/append_maf_to_existing.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python3
 """
-Helper script to append DGD data to an existing merged maf file.
+Helper script to append a maf to an existing maf file.
 Uses filter_entry to filter out undesired calls like in other mafs
 """
 import sys
@@ -100,6 +100,6 @@
         else:
             skipped += 1
     sys.stderr.write("Processed " + maf_fn + "\n")
-    sys.stderr.write("Skipped " + str(skipped) + " entries meeting exlusion criteria\n")
+    sys.stderr.write("Skipped " + str(skipped) + " entries meeting exclusion criteria\n")
 
 append_maf.close()
\ No newline at end of file
diff --git a/COLLABORATIONS/openTARGETS/clinical_to_datasheets.py b/COLLABORATIONS/openTARGETS/clinical_to_datasheets.py
index 8f6615e..3cc55c8 100644
--- a/COLLABORATIONS/openTARGETS/clinical_to_datasheets.py
+++ b/COLLABORATIONS/openTARGETS/clinical_to_datasheets.py
@@ -1,6 +1,5 @@
 import sys
 import argparse
-import json
 import math
 import re
 import pdb
@@ -153,17 +152,17 @@ def build_header(header_list, entry):
 
 # track some sample info so that somatic events can be collapsed
 samp_dict = {}
-s_type = header.index('sample_type')
-comp = header.index('composition')
-bs_id = header.index('Kids_First_Biospecimen_ID')
-exp = header.index('experimental_strategy')
+s_type_index = header.index('sample_type')
+comp_index = header.index('composition')
+bs_id_index = header.index('Kids_First_Biospecimen_ID')
+exp_index = header.index('experimental_strategy')
 
 # experimental strategy may be too vague, use to tell if RNA
 # related to recent addition of DGD samples, need gene matrix file
-rna_lib = header.index('RNA_library')
-cohort = header.index('cohort')
+rna_lib_index = header.index('RNA_library')
+cohort_index = header.index('cohort')
 a_idx = header.index('aliquot_id')
-cbio_id = header.index('formatted_sample_id')
+cbio_id_index = header.index('formatted_sample_id')
 data_gene = open('data_gene_matrix_CHOP.txt', 'w')
 data_gene.write('SAMPLE_ID\tmutations\n')
 
@@ -173,20 +172,20 @@ def build_header(header_list, entry):
 
 for data in clin_data:
     info = data.rstrip('\n').split('\t')
-    if info[s_type] == "Normal" and info[exp] != 'RNA-Seq':
+    if info[s_type_index] == "Normal" and info[exp_index] != 'RNA-Seq':
         continue
-    if info[bs_id] in blacklist_dict:
-        sys.stderr.write('Skipping output of ' + info[bs_id] + ' because in blacklist for reason ' + blacklist_dict[info[bs_id]] + '\n')
+    if info[bs_id_index] in blacklist_dict:
+        sys.stderr.write('Skipping output of ' + info[bs_id_index] + ' because in blacklist for reason ' + blacklist_dict[info[bs_id_index]] + '\n')
         continue
     # adjust exp value if targeted sequencing
-    if info[exp] == 'Targeted Sequencing':
-        if info[rna_lib] != 'NA':
-            info[exp] = 'RNA-Seq'
+    if info[exp_index] == 'Targeted Sequencing':
+        if info[rna_lib_index] != 'NA':
+            info[exp_index] = 'RNA-Seq'
         # if DGD DNA, add to gene matrix
-        elif info[cohort] == 'DGD':
+        elif info[cohort_index] == 'DGD':
             # parse aliquot for panel type, i.e. ET_242MFKXW_DGD_STNGS_93
             test = re.match(r'.*_DGD_(\w+)_\d+', info[a_idx])
-            data_gene.write(info[cbio_id] + '\tCHOP-' + test.group(1) + '\n')
+            data_gene.write(info[cbio_id_index] + '\tCHOP-' + test.group(1) + '\n')
 
     pt_id = info[header.index("Kids_First_Participant_ID")]
     if pt_id in pt_id_dict:
@@ -238,11 +237,11 @@ def build_header(header_list, entry):
     if samp_id not in samp_dict:
         samp_dict[samp_id] = sample_to_print
         id_mapping[samp_id] = []
-    id_mapping[samp_id].append(info[bs_id])
-    if info[exp] == "RNA-Seq":
-        bs_type[info[bs_id]] = "RNA"
+    id_mapping[samp_id].append(info[bs_id_index])
+    if info[exp_index] == "RNA-Seq":
+        bs_type[info[bs_id_index]] = "RNA"
     else:
-        bs_type[info[bs_id]] = "DNA"
+        bs_type[info[bs_id_index]] = "DNA"
 # cycle through sample IDs to see if there's matched DNA/RNA and if one can be made
 check = {}
 for samp_id in id_mapping:
@@ -256,8 +255,18 @@ def build_header(header_list, entry):
             spec = id_mapping[samp_id][1] + ";" + id_mapping[samp_id][0]
         samp_dict[samp_id][0] = spec
     elif len(id_mapping[samp_id]) > 2:
-        # QC check, only one or two biospec per sample ID
+        # QC check, only one or two biospec per sample ID, unless it's new DGD RNA + separate fusion biospecimen
         sys.stderr.write("Saw more than two biospecimens for " + samp_id + ": " + ",".join(id_mapping[samp_id]) + "\n")
+        if "DGD" in samp_id:
+            # If two RNA types and is DGD, throw a note to check
+            check_type = {"DNA": [], "RNA": []}
+            for bs_id in id_mapping[samp_id]:
+                check_type[bs_type[bs_id]].append(bs_id)
+            if len(check_type["DNA"]) == 1 and len(check_type["RNA"]) == 2:
+                spec = ";".join(check_type["DNA"] + check_type["RNA"])
+                samp_dict[samp_id][0] = spec
+                sys.stderr.write("Could be a DGD fusion + bulk RNA, may be ok\n")
+
         # exit(1)
     else:
         # skip cell line re-matching
@@ -272,9 +281,9 @@ def build_header(header_list, entry):
 mapping_file = open("bs_id_sample_map.txt", "w")
 mapping_file.write("BS_ID\tSample Type\tCbio ID\n")
 for samp_id in id_mapping:
-    for bs_id in id_mapping[samp_id]:
+    for bs_id_index in id_mapping[samp_id]:
         try:
-            mapping_file.write("\t".join([bs_id, bs_type[bs_id], samp_id]) + "\n")
+            mapping_file.write("\t".join([bs_id_index, bs_type[bs_id_index], samp_id]) + "\n")
         except Exception as e:
             sys.stderr.write(str(e) + "\n")
             pdb.set_trace()
diff --git a/COLLABORATIONS/openTARGETS/maf_openpedcan_v15_header.txt b/COLLABORATIONS/openTARGETS/maf_openpedcan_v15_header.txt
new file mode 100644
index 0000000..0c0a0bc
--- /dev/null
+++ b/COLLABORATIONS/openTARGETS/maf_openpedcan_v15_header.txt
@@ -0,0 +1,2 @@
+#version 2.4
+Hugo_Symbol	Entrez_Gene_Id	Center	NCBI_Build	Chromosome	Start_Position	End_Position	Strand	Variant_Classification	Variant_Type	Reference_Allele	Tumor_Seq_Allele1	Tumor_Seq_Allele2	dbSNP_RS	dbSNP_Val_Status	Tumor_Sample_Barcode	Matched_Norm_Sample_Barcode	Match_Norm_Seq_Allele1	Match_Norm_Seq_Allele2	Tumor_Validation_Allele1	Tumor_Validation_Allele2	Match_Norm_Validation_Allele1	Match_Norm_Validation_Allele2	Verification_Status	Validation_Status	Mutation_Status	Sequencing_Phase	Sequence_Source	Validation_Method	Score	BAM_File	Sequencer	Tumor_Sample_UUID	Matched_Norm_Sample_UUID	HGVSc	HGVSp	HGVSp_Short	Transcript_ID	Exon_Number	t_depth	t_ref_count	t_alt_count	n_depth	n_ref_count	n_alt_count	all_effects	Allele	Gene	Feature	Feature_type	Consequence	cDNA_position	CDS_position	Protein_position	Amino_acids	Codons	Existing_variation	ALLELE_NUM	DISTANCE	STRAND_VEP	SYMBOL	SYMBOL_SOURCE	HGNC_ID	BIOTYPE	CANONICAL	CCDS	ENSP	SWISSPROT	TREMBL	UNIPARC	RefSeq	SIFT	PolyPhen	EXON	INTRON	DOMAINS	AF	AFR_AF	AMR_AF	ASN_AF	EAS_AF	EUR_AF	SAS_AF	AA_AF	EA_AF	CLIN_SIG	SOMATIC	PUBMED	MOTIF_NAME	MOTIF_POS	HIGH_INF_POS	MOTIF_SCORE_CHANGE	IMPACT	PICKVARIANT_CLASS	TSL	HGVS_OFFSET	PHENO	MINIMISED	GENE_PHENO	FILTER	flanking_bps	vcf_id	vcf_qual	gnomAD_AF	gnomAD_AFR_AF	gnomAD_AMR_AF	gnomAD_ASJ_AF	gnomAD_EAS_AF	gnomAD_FIN_AF	gnomAD_NFE_AF	gnomAD_OTH_AF	gnomAD_SAS_AF	HGVSg	vcf_pos	gnomad_3_1_1_AC	gnomad_3_1_1_AN	gnomad_3_1_1_AF	gnomad_3_1_1_nhomalt	gnomad_3_1_1_AC_popmax	gnomad_3_1_1_AN_popmax	gnomad_3_1_1_AF_popmax	gnomad_3_1_1_nhomalt_popmax	gnomad_3_1_1_AC_controls_and_biobanks	gnomad_3_1_1_AN_controls_and_biobanks	gnomad_3_1_1_AF_controls_and_biobanks	gnomad_3_1_1_AF_non_cancer	gnomad_3_1_1_primate_ai_score	gnomad_3_1_1_splice_ai_consequence	MQ	MQ0	CAL	HotSpotAllele

From ecd0d889264c5de176a899c67e29f8e56067a068 Mon Sep 17 00:00:00 2001
From: Miguel Brown <miguel.a.brown@gmail.com>
Date: Wed, 6 Mar 2024 21:49:12 +0000
Subject: [PATCH 07/13] :pencil: update readme with latest changes :hammer:
 adjust cnv script to change in input :wrench: fix bug in handling header in
 maf script :bug: to-do fix DGD sample naming

---
 COLLABORATIONS/openTARGETS/README.md          |  6 ++--
 .../openTARGETS/append_maf_to_existing.py     | 29 +++++++++++++------
 COLLABORATIONS/openTARGETS/cnv_to_tables.py   |  9 +++---
 3 files changed, 29 insertions(+), 15 deletions(-)

diff --git a/COLLABORATIONS/openTARGETS/README.md b/COLLABORATIONS/openTARGETS/README.md
index 6aae989..b67930d 100644
--- a/COLLABORATIONS/openTARGETS/README.md
+++ b/COLLABORATIONS/openTARGETS/README.md
@@ -39,7 +39,7 @@ To create the histologies file, recommended method is to:
 1. Pull the OpenPedCan repo (warning, it's 12GB ): https://github.com/PediatricOpenTargets/OpenPedCan-analysis, or just download the script from `analyses/pedcbio-sample-name/pedcbio_sample_name_col.R`
 1. Export from D3b Warehouse the latest existing cBio IDs to use for population. Ensure that the output is csv double-quoted. Currently that can be obtained using the sql command:
     ```sql
-
+    with custom as (
     select participant_id, formatted_sample_id, specimen_id, analyte_types, normal_bs_id, normal_sample_id
     from prod_cbio.aml_sd_pet7q6f2_2018_cbio_sample
     union
@@ -72,6 +72,8 @@ To create the histologies file, recommended method is to:
     union
     select participant_id, formatted_sample_id, specimen_id, analyte_types, normal_bs_id, normal_sample_id
     from prod_cbio.pbta_mioncoseq_cbio_sample
+    )
+    select * from custom
 
     ```
 1. Get a blacklist from D3b Warehouse, exporting table `bix_workflows.cbio_hide_reasons`
@@ -144,7 +146,7 @@ optional arguments:
 ```
 _NOTE_ for v11 input, I ran the following command `zcat snv-dgd.maf.tsv.gz | perl -e '$skip = <>; $skip= <>; while(<>){print $_;}' | gzip -c >> snv-consensus-plus-hotspots.maf.tsv.gz` to add DGD data
 
-_NOTE_ for v15 input,I would have following command `python3 ~/tools/kf-cbioportal-etl/COLLABORATIONS/openTARGETS/append_maf_to_existing.py -i /home/ubuntu/tools/kf-cbioportal-etl/COLLABORATIONS/openTARGETS/maf_openpedcan_v15_header.txt -c openpedcan_v15.maf -t ../bs_id_sample_map.txt -m ...INPUT_PREP/snv-mutect2-tumor-only-plus-hotspots.maf.tsv.gz` to add tumor-only data, which is more robust
+_NOTE_ for v15 input, I would run the following command `python3 ~/tools/kf-cbioportal-etl/COLLABORATIONS/openTARGETS/append_maf_to_existing.py -i /home/ubuntu/tools/kf-cbioportal-etl/COLLABORATIONS/openTARGETS/maf_openpedcan_v15_header.txt -c openpedcan_v15.maf -t ../INPUT_PREP/bs_id_sample_map.txt -m ../INPUTS/snv-mutect2-tumor-only-plus-hotspots.maf.tsv.gz` to add tumor-only data, which is more robust
 
 Example run:
 `python3 COLLABORATIONS/openTARGETS/rename_filter_maf.py -m bs_id_sample_map.txt -v snv-consensus-plus-hotspots.maf.tsv.gz -s 1 -n openpedcan_v12`
diff --git a/COLLABORATIONS/openTARGETS/append_maf_to_existing.py b/COLLABORATIONS/openTARGETS/append_maf_to_existing.py
index 67d35a2..93c732f 100644
--- a/COLLABORATIONS/openTARGETS/append_maf_to_existing.py
+++ b/COLLABORATIONS/openTARGETS/append_maf_to_existing.py
@@ -82,6 +82,12 @@
     v_idx = m_header.index("Variant_Classification")
     h_idx = m_header.index("Hugo_Symbol")
 
+    # need to also pop entrez ID if exists, as process_maf_entry() will do that to data
+    try:
+        m_header.pop(m_header.index("Entrez_Gene_Id"))
+    except Exception as e:
+        print(e, file=sys.stderr)
+
     for i in range(len(m_header)):
         if m_header[i] in h_dict:
             h_dict[m_header[i]] = i
@@ -90,15 +96,20 @@
         to_print = []
         datum = process_maf_entry(data, maf_exc, v_idx, h_idx, tid_idx, eid_idx, bs_cbio_key)
         # Set tumor barcode to cBio ID
-        if datum:
-            for item in header:
-                if h_dict[item] != None:
-                    to_print.append(datum[h_dict[item]])
-                else:
-                    to_print.append("")
-            print("\t".join(to_print), file=append_maf)
-        else:
-            skipped += 1
+        try:
+            if datum:
+                for item in header:
+                    if h_dict[item] != None:
+                        to_print.append(datum[h_dict[item]])
+                    else:
+                        to_print.append("")
+                print("\t".join(to_print), file=append_maf)
+            else:
+                skipped += 1
+        except Exception as e:
+            print (e)
+            pdb.set_trace()
+            hold = 1
     sys.stderr.write("Processed " + maf_fn + "\n")
     sys.stderr.write("Skipped " + str(skipped) + " entries meeting exclusion criteria\n")
 
diff --git a/COLLABORATIONS/openTARGETS/cnv_to_tables.py b/COLLABORATIONS/openTARGETS/cnv_to_tables.py
index 588c7dd..c2af3fc 100644
--- a/COLLABORATIONS/openTARGETS/cnv_to_tables.py
+++ b/COLLABORATIONS/openTARGETS/cnv_to_tables.py
@@ -52,12 +52,13 @@ def collate_data(cnv_fn):
                 ploidy = data[p_idx]
                 cn = data[c_idx]
                 try:
-                    gistic = qual_to_gistic[data[s_idx]]
+                    qual = data[s_idx]
+                    if qual != "NA":
+                        qual = qual.lower()
+                    gistic = qual_to_gistic[qual]
                 except Exception as e:
-                    sys.stderr.write(str(e) + "\nInvalid value for gistic, skipping " + line.decode())
+                    sys.stderr.write(str(e) + "\nInvalid value for gistic, skipping " + line)
                     continue
-                    # pdb.set_trace()
-                    # hold=1
                 if samp_id not in ploidy_dict:
                     ploidy_dict[samp_id] = ploidy
                 if gene not in cn_dict:

From 993a5af9f9b9f98d9aa6f3847faaf8447d70bd86 Mon Sep 17 00:00:00 2001
From: Miguel Brown <miguel.a.brown@gmail.com>
Date: Thu, 7 Mar 2024 15:00:45 -0500
Subject: [PATCH 08/13] :wrench: fixed DGD naming issue

---
 COLLABORATIONS/openTARGETS/pedcbio_sample_name_col.R | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/COLLABORATIONS/openTARGETS/pedcbio_sample_name_col.R b/COLLABORATIONS/openTARGETS/pedcbio_sample_name_col.R
index 3fcddc1..130231a 100644
--- a/COLLABORATIONS/openTARGETS/pedcbio_sample_name_col.R
+++ b/COLLABORATIONS/openTARGETS/pedcbio_sample_name_col.R
@@ -119,7 +119,7 @@ message("Collated samples missing a cBio ID")
 #### Handle each cohort at a time - start with PBTA
 # get all sample IDs in the PBTA cohort
 sample_ids_pbta <- histology_df_no_format_id %>% 
-  dplyr::filter(cohort == "PBTA", sub_cohort != "DGD") %>% 
+  dplyr::filter(cohort == "PBTA" & sub_cohort != "DGD") %>% 
   pull(sample_id) %>% 
   unique()
 
@@ -132,7 +132,7 @@ for (i in 1:length(sample_ids_pbta)){
   
   # find the number of compositions
   each_specimen_need_tiebreak <- histology_df_no_format_id %>% 
-    dplyr::filter(sample_type == "Tumor") %>% 
+    dplyr::filter(sample_type == "Tumor" & sub_cohort != "DGD") %>% 
     dplyr::filter(sample_id == sample_id_of_interest) %>% 
     group_by(experimental_strategy) %>% 
     dplyr::mutate(n_sample_type = n()) %>%

From 63670e0b49a5f68807f8381461443ae01df59067 Mon Sep 17 00:00:00 2001
From: Miguel Brown <miguel.a.brown@gmail.com>
Date: Fri, 8 Mar 2024 14:58:56 +0000
Subject: [PATCH 09/13] :wrench: fix var instantiation and config value

---
 COLLABORATIONS/openTARGETS/openpedcan_v15_case_meta_config.json | 2 +-
 scripts/organize_upload_packages.py                             | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/COLLABORATIONS/openTARGETS/openpedcan_v15_case_meta_config.json b/COLLABORATIONS/openTARGETS/openpedcan_v15_case_meta_config.json
index ada2b57..6d47197 100644
--- a/COLLABORATIONS/openTARGETS/openpedcan_v15_case_meta_config.json
+++ b/COLLABORATIONS/openTARGETS/openpedcan_v15_case_meta_config.json
@@ -94,7 +94,7 @@
         }
     },
     "data_sheets": {
-        "dir": "datasheets",
+        "dir": "study_build",
         "dtypes": {
             "patient": {
                 "cbio_name": "data_clinical_patient.txt",
diff --git a/scripts/organize_upload_packages.py b/scripts/organize_upload_packages.py
index 000d35b..2d6ca0d 100755
--- a/scripts/organize_upload_packages.py
+++ b/scripts/organize_upload_packages.py
@@ -244,8 +244,8 @@ def create_case_lists(data_dict, output_dir):
     sys.stderr.write(out_dir + " already exists.\n")
 
 try:
+    study_id = config_data["study"]["cancer_study_identifier"]
     if os.path.isdir(config_data["data_sheets"]["dir"]):
-        study_id = config_data["study"]["cancer_study_identifier"]
         cur_dir = out_dir + config_data["study"]["cancer_study_identifier"] + "/"
         try:
             os.mkdir(cur_dir)

From 554ec4e4192cd246e27b7fa1773562a155c831b3 Mon Sep 17 00:00:00 2001
From: Miguel Brown <miguel.a.brown@gmail.com>
Date: Fri, 8 Mar 2024 10:45:29 -0500
Subject: [PATCH 10/13] :wrench: fixed bug in clinical data for pt id and efs
 status

---
 COLLABORATIONS/openTARGETS/clinical_to_datasheets.py | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/COLLABORATIONS/openTARGETS/clinical_to_datasheets.py b/COLLABORATIONS/openTARGETS/clinical_to_datasheets.py
index 3cc55c8..4751828 100644
--- a/COLLABORATIONS/openTARGETS/clinical_to_datasheets.py
+++ b/COLLABORATIONS/openTARGETS/clinical_to_datasheets.py
@@ -219,6 +219,13 @@ def build_header(header_list, entry):
             elif header[i] == "tumor_descriptor":
                 if value in tumor_descriptor_dict:
                     value = tumor_descriptor_dict[value]
+            elif header[i] == "EFS_event_type":
+                if value == "Not Applicable":
+                    value = "0:No Event"
+                elif value == "Not Reported":
+                    value = "NA"
+                else:
+                    value = "1:" + value
             # replace status with NA if value not acceptable
             elif header[i] == "OS_status":
                 if value not in ["LIVING", "DECEASED", "NA"]:
@@ -253,7 +260,7 @@ def build_header(header_list, entry):
         spec = id_mapping[samp_id][0] + ";" + id_mapping[samp_id][1]
         if bs_type[id_mapping[samp_id][0]] == "RNA":
             spec = id_mapping[samp_id][1] + ";" + id_mapping[samp_id][0]
-        samp_dict[samp_id][0] = spec
+        samp_dict[samp_id][1] = spec
     elif len(id_mapping[samp_id]) > 2:
         # QC check, only one or two biospec per sample ID, unless it's new DGD RNA + separate fusion biospecimen
         sys.stderr.write("Saw more than two biospecimens for " + samp_id + ": " + ",".join(id_mapping[samp_id]) + "\n")
@@ -264,7 +271,7 @@ def build_header(header_list, entry):
                 check_type[bs_type[bs_id]].append(bs_id)
             if len(check_type["DNA"]) == 1 and len(check_type["RNA"]) == 2:
                 spec = ";".join(check_type["DNA"] + check_type["RNA"])
-                samp_dict[samp_id][0] = spec
+                samp_dict[samp_id][1] = spec
                 sys.stderr.write("Could be a DGD fusion + bulk RNA, may be ok\n")
 
         # exit(1)

From ef3fd0f6886b823845538a9995eb2b8a8f64108f Mon Sep 17 00:00:00 2001
From: Miguel Brown <miguel.a.brown@gmail.com>
Date: Fri, 8 Mar 2024 22:01:48 +0000
Subject: [PATCH 11/13] :wrench: fixed header file :hammer: refactored and
 added bug fix for fusion input data

---
 .../openTARGETS/maf_openpedcan_v15_header.txt |   2 +-
 .../openpedcan_v15_case_meta_config.json      |   2 +-
 .../openTARGETS/rename_filter_maf.py          |   7 +
 scripts/convert_fusion_as_sv.py               | 277 ++++++++++--------
 4 files changed, 167 insertions(+), 121 deletions(-)

diff --git a/COLLABORATIONS/openTARGETS/maf_openpedcan_v15_header.txt b/COLLABORATIONS/openTARGETS/maf_openpedcan_v15_header.txt
index 0c0a0bc..615c0a2 100644
--- a/COLLABORATIONS/openTARGETS/maf_openpedcan_v15_header.txt
+++ b/COLLABORATIONS/openTARGETS/maf_openpedcan_v15_header.txt
@@ -1,2 +1,2 @@
 #version 2.4
-Hugo_Symbol	Entrez_Gene_Id	Center	NCBI_Build	Chromosome	Start_Position	End_Position	Strand	Variant_Classification	Variant_Type	Reference_Allele	Tumor_Seq_Allele1	Tumor_Seq_Allele2	dbSNP_RS	dbSNP_Val_Status	Tumor_Sample_Barcode	Matched_Norm_Sample_Barcode	Match_Norm_Seq_Allele1	Match_Norm_Seq_Allele2	Tumor_Validation_Allele1	Tumor_Validation_Allele2	Match_Norm_Validation_Allele1	Match_Norm_Validation_Allele2	Verification_Status	Validation_Status	Mutation_Status	Sequencing_Phase	Sequence_Source	Validation_Method	Score	BAM_File	Sequencer	Tumor_Sample_UUID	Matched_Norm_Sample_UUID	HGVSc	HGVSp	HGVSp_Short	Transcript_ID	Exon_Number	t_depth	t_ref_count	t_alt_count	n_depth	n_ref_count	n_alt_count	all_effects	Allele	Gene	Feature	Feature_type	Consequence	cDNA_position	CDS_position	Protein_position	Amino_acids	Codons	Existing_variation	ALLELE_NUM	DISTANCE	STRAND_VEP	SYMBOL	SYMBOL_SOURCE	HGNC_ID	BIOTYPE	CANONICAL	CCDS	ENSP	SWISSPROT	TREMBL	UNIPARC	RefSeq	SIFT	PolyPhen	EXON	INTRON	DOMAINS	AF	AFR_AF	AMR_AF	ASN_AF	EAS_AF	EUR_AF	SAS_AF	AA_AF	EA_AF	CLIN_SIG	SOMATIC	PUBMED	MOTIF_NAME	MOTIF_POS	HIGH_INF_POS	MOTIF_SCORE_CHANGE	IMPACT	PICKVARIANT_CLASS	TSL	HGVS_OFFSET	PHENO	MINIMISED	GENE_PHENO	FILTER	flanking_bps	vcf_id	vcf_qual	gnomAD_AF	gnomAD_AFR_AF	gnomAD_AMR_AF	gnomAD_ASJ_AF	gnomAD_EAS_AF	gnomAD_FIN_AF	gnomAD_NFE_AF	gnomAD_OTH_AF	gnomAD_SAS_AF	HGVSg	vcf_pos	gnomad_3_1_1_AC	gnomad_3_1_1_AN	gnomad_3_1_1_AF	gnomad_3_1_1_nhomalt	gnomad_3_1_1_AC_popmax	gnomad_3_1_1_AN_popmax	gnomad_3_1_1_AF_popmax	gnomad_3_1_1_nhomalt_popmax	gnomad_3_1_1_AC_controls_and_biobanks	gnomad_3_1_1_AN_controls_and_biobanks	gnomad_3_1_1_AF_controls_and_biobanks	gnomad_3_1_1_AF_non_cancer	gnomad_3_1_1_primate_ai_score	gnomad_3_1_1_splice_ai_consequence	MQ	MQ0	CAL	HotSpotAllele
+Hugo_Symbol	Entrez_Gene_Id	Center	NCBI_Build	Chromosome	Start_Position	End_Position	Strand	Variant_Classification	Variant_Type	Reference_Allele	Tumor_Seq_Allele1	Tumor_Seq_Allele2	dbSNP_RS	dbSNP_Val_Status	Tumor_Sample_Barcode	Matched_Norm_Sample_Barcode	Match_Norm_Seq_Allele1	Match_Norm_Seq_Allele2	Tumor_Validation_Allele1	Tumor_Validation_Allele2	Match_Norm_Validation_Allele1	Match_Norm_Validation_Allele2	Verification_Status	Validation_Status	Mutation_Status	Sequencing_Phase	Sequence_Source	Validation_Method	Score	BAM_File	Sequencer	Tumor_Sample_UUID	Matched_Norm_Sample_UUID	HGVSc	HGVSp	HGVSp_Short	Transcript_ID	Exon_Number	t_depth	t_ref_count	t_alt_count	n_depth	n_ref_count	n_alt_count	all_effects	Allele	Gene	Feature	Feature_type	Consequence	cDNA_position	CDS_position	Protein_position	Amino_acids	Codons	Existing_variation	ALLELE_NUM	DISTANCE	STRAND_VEP	SYMBOL	SYMBOL_SOURCE	HGNC_ID	BIOTYPE	CANONICAL	CCDS	ENSP	SWISSPROT	TREMBL	UNIPARC	RefSeq	SIFT	PolyPhen	EXON	INTRON	DOMAINS	AF	AFR_AF	AMR_AF	ASN_AF	EAS_AF	EUR_AF	SAS_AF	AA_AF	EA_AF	CLIN_SIG	SOMATIC	PUBMED	MOTIF_NAME	MOTIF_POS	HIGH_INF_POS	MOTIF_SCORE_CHANGE	IMPACT	PICK	VARIANT_CLASS	TSL	HGVS_OFFSET	PHENO	MINIMISED	GENE_PHENO	FILTER	flanking_bps	vcf_id	vcf_qual	gnomAD_AF	gnomAD_AFR_AF	gnomAD_AMR_AF	gnomAD_ASJ_AF	gnomAD_EAS_AF	gnomAD_FIN_AF	gnomAD_NFE_AF	gnomAD_OTH_AF	gnomAD_SAS_AF	HGVSg	vcf_pos	gnomad_3_1_1_AC	gnomad_3_1_1_AN	gnomad_3_1_1_AF	gnomad_3_1_1_nhomalt	gnomad_3_1_1_AC_popmax	gnomad_3_1_1_AN_popmax	gnomad_3_1_1_AF_popmax	gnomad_3_1_1_nhomalt_popmax	gnomad_3_1_1_AC_controls_and_biobanks	gnomad_3_1_1_AN_controls_and_biobanks	gnomad_3_1_1_AF_controls_and_biobanks	gnomad_3_1_1_AF_non_cancer	gnomad_3_1_1_primate_ai_score	gnomad_3_1_1_splice_ai_consequence	MQ	MQ0	CAL	HotSpotAllele
diff --git a/COLLABORATIONS/openTARGETS/openpedcan_v15_case_meta_config.json b/COLLABORATIONS/openTARGETS/openpedcan_v15_case_meta_config.json
index 6d47197..c5c01bc 100644
--- a/COLLABORATIONS/openTARGETS/openpedcan_v15_case_meta_config.json
+++ b/COLLABORATIONS/openTARGETS/openpedcan_v15_case_meta_config.json
@@ -115,7 +115,7 @@
     },
     "study": {
         "_comment": "see https://docs.cbioportal.org/5.1-data-loading/data-loading/file-formats#cancer-study for detailed specifics",
-        "description": "<a href=\"https://github.com/PediatricOpenTargets/OpenPedCan-analysis\">OpenPedCan</a> is a collaborative project between the National Cancer Institute and the Children's Hospital of Philadelphia as part of the NCI's Childhood Cancer Data Initiative. Here, we harmonize pan-cancer data using <a href=\"https://kidsfirstdrc.org/\">KidsFirst Data Resource Center</a> workflows and harness <a href=\"https://github.com/AlexsLemonade/OpenPBTA-analysis/\">OpenPBTA analytics</a> workflows to scale and add modules across pediatric cancer datasets. This data has been integrated into the pediatric open targets platform to assist in development and query of the FDA's Relevant Pediatric Molecular Targets List (PMTL) to identify new therapeutics for children with cancer. This is the v15 release of this effort, for v11, the latest accepted production release please see <a href=\"https://pedcbioportal.kidsfirstdrc.org/study/summary?id=openpedcan_v11\">OpenPedCan v11</a>. For study release details, please see <a href=\"https://tinyurl.com/55cxz9am\">Release Notes</a>",
+        "description": "<a href=\"https://github.com/PediatricOpenTargets/OpenPedCan-analysis\">OpenPedCan</a> is a collaborative project between the National Cancer Institute and the Children's Hospital of Philadelphia as part of the NCI's Childhood Cancer Data Initiative. Here, we harmonize pan-cancer data using <a href=\"https://kidsfirstdrc.org/\">KidsFirst Data Resource Center</a> workflows and harness <a href=\"https://github.com/AlexsLemonade/OpenPBTA-analysis/\">OpenPBTA analytics</a> workflows to scale and add modules across pediatric cancer datasets. This data has been integrated into the pediatric open targets platform to assist in development and query of the FDA's Relevant Pediatric Molecular Targets List (PMTL) to identify new therapeutics for children with cancer. This is the v15 release of this effort. <a href=\"https://pedcbioportal.kidsfirstdrc.org/study/summary?id=openpedcan_v11\">V11 previous prod release here</a>. For study release details, please see <a href=\"https://tinyurl.com/55cxz9am\">Release Notes</a>",
         "groups": "PUBLIC",
         "cancer_study_identifier": "openpedcan_v15",
         "reference_genome": "hg38",
diff --git a/COLLABORATIONS/openTARGETS/rename_filter_maf.py b/COLLABORATIONS/openTARGETS/rename_filter_maf.py
index 5636ba0..227c57b 100644
--- a/COLLABORATIONS/openTARGETS/rename_filter_maf.py
+++ b/COLLABORATIONS/openTARGETS/rename_filter_maf.py
@@ -67,6 +67,9 @@ def process_maf_entry(data, maf_exc, v_idx, h_idx, tid_idx, eid_idx, map_dict):
     h_idx = header.index('Hugo_Symbol')
     eid_idx = header.index('Entrez_Gene_Id')
     header.pop(eid_idx)
+    # bug fix for OpenPedCan, position will be one less after process_maf_entry
+    n_ref_ct_idx = header.index('n_ref_count')
+    n_ref_alt_idx = header.index('n_alt_count')
     print("\t".join(header), file=maf_out)
 
     sys.stderr.write("Filtering entries and renaming samples\n")
@@ -74,6 +77,10 @@ def process_maf_entry(data, maf_exc, v_idx, h_idx, tid_idx, eid_idx, map_dict):
     for line in maf_file:
         to_print = process_maf_entry(line.decode(), maf_exc, v_idx, h_idx, tid_idx, eid_idx, map_dict)
         if to_print:
+            # bug fix for maf format in OpenPedCan
+            for i in [n_ref_ct_idx, n_ref_alt_idx]:
+                if to_print[i] == "NA":
+                    to_print[i] = ""
             print("\t".join(to_print), file=maf_out)
 
     sys.stderr.write("Fin.\n")
diff --git a/scripts/convert_fusion_as_sv.py b/scripts/convert_fusion_as_sv.py
index 0b7c35c..ab99fe2 100755
--- a/scripts/convert_fusion_as_sv.py
+++ b/scripts/convert_fusion_as_sv.py
@@ -14,7 +14,159 @@
 import pdb
 
 
-if __name__ == "__main__":
+def collapse_and_format(fusion_data):
+    # Sort as a cheat to easily select preferred annot later
+    print("Sorting for collapse by caller and counts", file=sys.stderr)
+    fusion_data = fusion_data.sort_values(["Sample", "LeftBreakpoint", "RightBreakpoint", "Caller"])
+    # Merge rows that have the exact same fusion from different callers - thanks Natasha!
+    key_cols = ["Sample","Gene1A","LeftBreakpoint","Gene1B","RightBreakpoint","FusionName","Fusion_anno"]
+    print("Grouping for collapse and counts", file=sys.stderr)
+    fusion_data["groupby_key"] = fusion_data.apply(
+    lambda row: "\t".join([str(row[col]) for col in key_cols]), 
+        axis=1
+    )
+    # "JunctionReadCount","SpanningFragCount","annots"
+    collapsed_list = []
+    
+    for g in fusion_data.groupby(by="groupby_key"):
+        values, df_group = g
+        df_group["Caller"] = ",".join(set(df_group["Caller"].tolist()))
+        df_group["JunctionReadCount"] = df_group["JunctionReadCount"].mean()
+        df_group["SpanningFragCount"] = df_group["SpanningFragCount"].mean()
+        # Go with the ceiling of the mean
+        df_group["JunctionReadCount"] = df_group["JunctionReadCount"].apply(np.ceil)
+        df_group["SpanningFragCount"] = df_group["SpanningFragCount"].apply(np.ceil)
+        collapsed_list.append(df_group[key_cols + [ "JunctionReadCount","SpanningFragCount","annots", "Fusion_Type", "Caller"]].head(1))
+    del fusion_data
+    fusion_data_collapsed = pd.concat(collapsed_list)
+    del collapsed_list
+    # Should be int
+    fusion_data_collapsed["JunctionReadCount"] = fusion_data_collapsed["JunctionReadCount"].astype(int)
+    fusion_data_collapsed["SpanningFragCount"] = fusion_data_collapsed["SpanningFragCount"].astype(int)
+
+
+    fusion_data_collapsed["Caller"] = fusion_data_collapsed["Caller"].str.upper()
+    return fusion_data_collapsed
+
+
+def openx_annot_bug(fusion_data):
+    # Sort as a cheat to easily select preferred annot later
+    print("Sorting for collapse bug fix", file=sys.stderr)
+    fusion_data = fusion_data.sort_values(["Sample", "LeftBreakpoint", "RightBreakpoint", "Caller"])
+    # Merge rows that have the exact same fusion from different callers - thanks Natasha!
+    key_cols = ["Sample","Gene1A","LeftBreakpoint","Gene1B","RightBreakpoint","FusionName","Fusion_anno"]
+    remain_cols = list(set(fusion_data.columns.to_list()) - set(key_cols))
+    print("Grouping for collapse bug fix", file=sys.stderr)
+    fusion_data["groupby_key"] = fusion_data.apply(
+    lambda row: "\t".join([str(row[col]) for col in key_cols]), 
+        axis=1
+    )
+    # "JunctionReadCount","SpanningFragCount","annots"
+    collapsed_list = []
+    print("Collapsing annotations for bug fix", file=sys.stderr)
+    for g in fusion_data.groupby(by="groupby_key"):
+        values, df_group = g
+        df_group["Caller"] = ",".join(set(df_group["Caller"].tolist()))
+        df_group["Gene1A_anno"] = ", ".join(set(df_group["Gene1A_anno"].tolist()))
+        df_group["Gene1B_anno"] = ", ".join(set(df_group["Gene1B_anno"].tolist()))
+        collapsed_list.append(df_group[key_cols + remain_cols].head(1))
+    del fusion_data
+    fusion_data_collapsed = pd.concat(collapsed_list)
+    del collapsed_list
+    # Normalize caller names to upper case
+    fusion_data_collapsed["Caller"] = fusion_data_collapsed["Caller"].str.upper()
+    print("Bug fix completed", file=sys.stderr)
+    return fusion_data_collapsed
+
+    
+def filter_and_format_annots(sample_renamed_df, drop_low):
+    """
+    Applies a filter to remove entries with ARRIBA confidence "low" when set, then formats annots file to include Gene1A_anno and Gene1B_anno
+    """
+    if drop_low:
+        sample_renamed_df = sample_renamed_df[sample_renamed_df.Confidence != 'low']
+    else:
+        # not drop low is an OpenX feature, apply repeat annotation bug fix
+        print("Applying OpenX annot bug fix", file=sys.stderr)
+        sample_renamed_df = openx_annot_bug(sample_renamed_df)
+    sample_renamed_df["annots"] = sample_renamed_df.apply(
+        lambda row: "Gene1: " + ",".join(
+            set(list(row["Gene1A_anno"].split(", "))))
+                + "; Gene2: " + ",".join(set(list(row["Gene1B_anno"].split(", ")))
+            ),
+            axis=1
+        ) + ";" + sample_renamed_df["annots"]
+    return sample_renamed_df
+
+
+def init_cbio_master(fusion_results, mode, rna_metadata):
+    """
+    Use data frame subset on RNA fusion files to find and merge result files
+    """
+    desired = [
+            "Sample",
+            "Gene1A",
+            "LeftBreakpoint",
+            "Gene1B",
+            "RightBreakpoint",
+            "Fusion_Type",
+            "JunctionReadCount",
+            "SpanningFragCount",
+            "annots",
+            "FusionName",
+            "Fusion_anno",
+            "Caller"
+        ]
+    if mode == "openX" or mode == "dgd":
+        openx_data = pd.read_csv(fusion_results, sep="\t", keep_default_na=False, na_values=[""])
+        # Merge so that sample names can be cBio names - thanks Natasha!
+        merged = pd.merge(
+            openx_data, rna_metadata[["T_CL_BS_ID", "Cbio_Tumor_Name"]], left_on="Sample", right_on="T_CL_BS_ID", how="left"
+            )
+        merged["Sample"] = merged.apply(
+            lambda row: row["Cbio_Tumor_Name"], axis=1
+            )
+        # OpenX data may not have all annoFuse cols
+        present = []
+        # openPBTA...and maybe open pedcan uses this
+        if 'CalledBy' in merged.columns:
+            merged.rename(
+                columns={"CalledBy": "Caller"},
+                inplace=True
+                )
+        elif mode == "dgd":
+            merged["Caller"] = "Archer"
+        # Also merge existing annotations in Gene1A_anno, Gene1B_anno into annots
+        merged = filter_and_format_annots(merged, False)
+        for col in desired:
+            if col in merged.columns:
+                present.append(col)
+        openx_data = merged[present]
+        # only if read counts there, collapse
+        if "JunctionReadCount" in openx_data.columns:
+            openx_data = collapse_and_format(openx_data)
+        return openx_data, present
+    else:
+        flist = rna_metadata.File_Name
+        frame_list = []
+        for i in range(0, len(flist), 1):
+            try:
+                # concat annofuse file, rename Sample Column according to cBio name
+                ann_file = pd.read_csv(fusion_results + "/" + flist[i], sep="\t", keep_default_na=False, na_values=[""])
+                ann_file = ann_file.assign(Sample=rna_metadata.at[i, "Cbio_Tumor_Name"])
+                frame_list.append(ann_file)
+            except Exception as e:
+                sys.stderr.write(str(e) + '\n')
+                exit(1)
+        concat_frame = pd.concat(frame_list)
+        concat_frame = filter_and_format_annots(concat_frame, True)
+        del frame_list
+        fusion_data = concat_frame[desired]
+        del concat_frame
+        return collapse_and_format(fusion_data), desired
+
+
+def main():
     parser = argparse.ArgumentParser(
         description="Convert openPBTA fusion table OR list of annofuse files to cbio format."
     )
@@ -67,124 +219,6 @@
             + "\n"
         )
         exit(1)
-
-
-    def collapse_and_format(fusion_data):
-            # Sort as a cheat to easily select preferred annot later
-            fusion_data = fusion_data.sort_values(["Sample", "LeftBreakpoint", "RightBreakpoint", "Caller"])
-            # Merge rows that have the exact same fusion from different callers - thanks Natasha!
-            key_cols = ["Sample","Gene1A","LeftBreakpoint","Gene1B","RightBreakpoint","FusionName","Fusion_anno"]
-            fusion_data["groupby_key"] = fusion_data.apply(
-            lambda row: "\t".join([str(row[col]) for col in key_cols]), 
-                axis=1
-            )
-            # "JunctionReadCount","SpanningFragCount","annots"
-            collapsed_list = []
-            
-            for g in fusion_data.groupby(by="groupby_key"):
-                values, df_group = g
-                df_group["Caller"] = ",".join(set(df_group["Caller"].tolist()))
-                df_group["JunctionReadCount"] = df_group["JunctionReadCount"].mean()
-                df_group["SpanningFragCount"] = df_group["SpanningFragCount"].mean()
-                # Go with the ceiling of the mean
-                df_group["JunctionReadCount"] = df_group["JunctionReadCount"].apply(np.ceil)
-                df_group["SpanningFragCount"] = df_group["SpanningFragCount"].apply(np.ceil)
-                collapsed_list.append(df_group[key_cols + [ "JunctionReadCount","SpanningFragCount","annots", "Fusion_Type", "Caller"]].head(1))
-            del fusion_data
-            fusion_data_collapsed = pd.concat(collapsed_list)
-            del collapsed_list
-            # Should be int
-            fusion_data_collapsed["JunctionReadCount"] = fusion_data_collapsed["JunctionReadCount"].astype(int)
-            fusion_data_collapsed["SpanningFragCount"] = fusion_data_collapsed["SpanningFragCount"].astype(int)
-
-
-            fusion_data_collapsed["Caller"] = fusion_data_collapsed["Caller"].str.upper()
-            return fusion_data_collapsed
-
-
-
-    def filter_and_format_annots(sample_renamed_df, drop_low):
-        """
-        Applies a filter to remove entries with ARRIBA confidence "low" when set, then formats annots file to include Gene1A_anno and Gene1B_anno
-        """
-        if drop_low:
-            sample_renamed_df = sample_renamed_df[sample_renamed_df.Confidence != 'low']
-        sample_renamed_df["annots"] = sample_renamed_df.apply(
-            lambda row: "Gene1: " + ",".join(
-                set(list(row["Gene1A_anno"].split(", "))))
-                    + "; Gene2: " + ",".join(set(list(row["Gene1B_anno"].split(", ")))
-                ),
-                axis=1
-            ) + ";" + sample_renamed_df["annots"]
-        return sample_renamed_df
-
-
-    def init_cbio_master(fusion_results, mode, rna_metadata):
-        """
-        Use data frame subset on RNA fusion files to find and merge result files
-        """
-        desired = [
-                "Sample",
-                "Gene1A",
-                "LeftBreakpoint",
-                "Gene1B",
-                "RightBreakpoint",
-                "Fusion_Type",
-                "JunctionReadCount",
-                "SpanningFragCount",
-                "annots",
-                "FusionName",
-                "Fusion_anno",
-                "Caller"
-            ]
-        if mode == "openX" or mode == "dgd":
-            openx_data = pd.read_csv(fusion_results, sep="\t", keep_default_na=False, na_values=[""])
-            # Merge so that sample names can be cBio names - thanks Natasha!
-            merged = pd.merge(
-                openx_data, rna_metadata[["T_CL_BS_ID", "Cbio_Tumor_Name"]], left_on="Sample", right_on="T_CL_BS_ID", how="left"
-                )
-            merged["Sample"] = merged.apply(
-                lambda row: row["Cbio_Tumor_Name"], axis=1
-                )
-            # OpenX data may not have all annoFuse cols
-            present = []
-            # openPBTA...and maybe open pedcan uses this
-            if 'CalledBy' in merged.columns:
-                merged.rename(
-                    columns={"CalledBy": "Caller"},
-                    inplace=True
-                   )
-            elif mode == "dgd":
-                merged["Caller"] = "Archer"
-            # Also merge existing annotations in Gene1A_anno, Gene1B_anno into annots
-            merged = filter_and_format_annots(merged, False)
-            for col in desired:
-                if col in merged.columns:
-                    present.append(col)
-            openx_data = merged[present]
-            # only if read counts there, collapse
-            if "JunctionReadCount" in openx_data.columns:
-                openx_data = collapse_and_format(openx_data)
-            return openx_data, present
-        else:
-            flist = rna_metadata.File_Name
-            frame_list = []
-            for i in range(0, len(flist), 1):
-                try:
-                    # concat annofuse file, rename Sample Column according to cBio name
-                    ann_file = pd.read_csv(fusion_results + "/" + flist[i], sep="\t", keep_default_na=False, na_values=[""])
-                    ann_file = ann_file.assign(Sample=rna_metadata.at[i, "Cbio_Tumor_Name"])
-                    frame_list.append(ann_file)
-                except Exception as e:
-                    sys.stderr.write(str(e) + '\n')
-                    exit(1)
-            concat_frame = pd.concat(frame_list)
-            concat_frame = filter_and_format_annots(concat_frame, True)
-            del frame_list
-            fusion_data = concat_frame[desired]
-            del concat_frame
-            return collapse_and_format(fusion_data), desired
-
     out_dir = args.out_dir
     try:
         os.mkdir(out_dir)
@@ -317,3 +351,8 @@ def init_cbio_master(fusion_results, mode, rna_metadata):
             fus_tbl = fus_tbl[existing.columns]
             fus_tbl.set_index("Sample_Id", inplace=True)
             fus_tbl.to_csv(fus_fname, sep="\t", mode="a", index=True, quoting=csv.QUOTE_NONE, header=None)
+
+
+if __name__ == "__main__":
+    main()
+

From 4cecd9e0ec796bebb7346906a624b640d998c097 Mon Sep 17 00:00:00 2001
From: Miguel Brown <miguel.a.brown@gmail.com>
Date: Mon, 11 Mar 2024 15:39:29 +0000
Subject: [PATCH 12/13] :wrench: apply bug fix to the right script

---
 COLLABORATIONS/openTARGETS/append_maf_to_existing.py | 7 +++++++
 COLLABORATIONS/openTARGETS/rename_filter_maf.py      | 8 --------
 2 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/COLLABORATIONS/openTARGETS/append_maf_to_existing.py b/COLLABORATIONS/openTARGETS/append_maf_to_existing.py
index 93c732f..9461ca0 100644
--- a/COLLABORATIONS/openTARGETS/append_maf_to_existing.py
+++ b/COLLABORATIONS/openTARGETS/append_maf_to_existing.py
@@ -87,6 +87,9 @@
         m_header.pop(m_header.index("Entrez_Gene_Id"))
     except Exception as e:
         print(e, file=sys.stderr)
+    # bug fix for OpenPedCan, position will be one less after process_maf_entry
+    n_ref_ct_idx = m_header.index('n_ref_count')
+    n_alt_ct_idx = m_header.index('n_alt_count')    
 
     for i in range(len(m_header)):
         if m_header[i] in h_dict:
@@ -103,6 +106,10 @@
                         to_print.append(datum[h_dict[item]])
                     else:
                         to_print.append("")
+                # bug fix for maf format in OpenPedCan
+                for i in [n_ref_ct_idx, n_alt_ct_idx]:
+                    if to_print[i] == "NA":
+                        to_print[i] = ""
                 print("\t".join(to_print), file=append_maf)
             else:
                 skipped += 1
diff --git a/COLLABORATIONS/openTARGETS/rename_filter_maf.py b/COLLABORATIONS/openTARGETS/rename_filter_maf.py
index 227c57b..8d24c47 100644
--- a/COLLABORATIONS/openTARGETS/rename_filter_maf.py
+++ b/COLLABORATIONS/openTARGETS/rename_filter_maf.py
@@ -67,9 +67,6 @@ def process_maf_entry(data, maf_exc, v_idx, h_idx, tid_idx, eid_idx, map_dict):
     h_idx = header.index('Hugo_Symbol')
     eid_idx = header.index('Entrez_Gene_Id')
     header.pop(eid_idx)
-    # bug fix for OpenPedCan, position will be one less after process_maf_entry
-    n_ref_ct_idx = header.index('n_ref_count')
-    n_ref_alt_idx = header.index('n_alt_count')
     print("\t".join(header), file=maf_out)
 
     sys.stderr.write("Filtering entries and renaming samples\n")
@@ -77,11 +74,6 @@ def process_maf_entry(data, maf_exc, v_idx, h_idx, tid_idx, eid_idx, map_dict):
     for line in maf_file:
         to_print = process_maf_entry(line.decode(), maf_exc, v_idx, h_idx, tid_idx, eid_idx, map_dict)
         if to_print:
-            # bug fix for maf format in OpenPedCan
-            for i in [n_ref_ct_idx, n_ref_alt_idx]:
-                if to_print[i] == "NA":
-                    to_print[i] = ""
             print("\t".join(to_print), file=maf_out)
-
     sys.stderr.write("Fin.\n")
     maf_out.close()
\ No newline at end of file

From 3874f3dec78e3696031a39100521e743afc05a0f Mon Sep 17 00:00:00 2001
From: Miguel Brown <miguel.a.brown@gmail.com>
Date: Thu, 14 Mar 2024 14:07:44 +0000
Subject: [PATCH 13/13] :pencil: update docs and config

---
 COLLABORATIONS/openTARGETS/README.md                     | 9 +++++++--
 .../openTARGETS/openpedcan_v15_case_meta_config.json     | 2 +-
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/COLLABORATIONS/openTARGETS/README.md b/COLLABORATIONS/openTARGETS/README.md
index b67930d..9871ef9 100644
--- a/COLLABORATIONS/openTARGETS/README.md
+++ b/COLLABORATIONS/openTARGETS/README.md
@@ -20,6 +20,7 @@ fusion-dgd.tsv.gz
 fusion-putative-oncogenic.tsv
 gene-expression-rsem-tpm-collapsed.rds
 tcga_gene-expression-rsem-tpm-collapsed.rds
+gtex_gene-expression-rsem-tpm-collapsed.rds
 snv-consensus-plus-hotspots.maf.tsv.gz
 snv-mutect2-tumor-only-plus-hotspots.maf.tsv.gz
 ```
@@ -92,7 +93,11 @@ TCGA data are kept in a seprate matrix from everything else. We need to merge th
 ```sh
 Rscript COLLABORATIONS/openTARGETS/merge_rsem_rds.R --first_file gene-expression-rsem-tpm-collapsed.rds --second_file tcga-gene-expression-rsem-tpm-collapsed.rds --output_fn gene_tcga_expression_common_merge.rds
 ```
-
+UPDATE: GTEx is also in a separate matrix, so run the merge script again to make the "final" merge before conversion
+```sh
+Rscript COLLABORATIONS/openTARGETS/merge_rsem_rds.R --first_file gene_tcga_expression_common_merge.rds --second_file gtex_gene-expression-rsem-tpm-collapsed.rds --output_fn gene_tcga_gtex_expression_common_merge.rds
+```
+
 
 ### File Transformation
 It's recommended to put datasheets in a dir called `datasheets`, downloaded files in it's own dir (in v12 it's `GF_INPUTS`) and the rest of the processed outputs into it's own dir (`study_build` for v12) to keep things sane and also be able to leverage existing study build script in `scripts/organize_upload_packages.py`
@@ -195,7 +200,7 @@ Options:
 		Show this help message and exit
 ```
 Example run:
-`Rscript COLLABORATIONS/openTARGETS/rename_export_rsem.R --rna_rds gene_tcga_expression_common_merge.rds --map_id bs_id_sample_map.txt --type openpedcan_v11 --computeZscore R 2> rna_convert.errs`
+`Rscript COLLABORATIONS/openTARGETS/rename_export_rsem.R --rna_rds gene_tcga_gtex_expression_common_merge.rds --map_id bs_id_sample_map.txt --type openpedcan_v15 --computeZscore R 2> rna_convert.errs`
 
 #### 5. scripts/convert_fusion_as_sv.py
 
diff --git a/COLLABORATIONS/openTARGETS/openpedcan_v15_case_meta_config.json b/COLLABORATIONS/openTARGETS/openpedcan_v15_case_meta_config.json
index c5c01bc..805d5ca 100644
--- a/COLLABORATIONS/openTARGETS/openpedcan_v15_case_meta_config.json
+++ b/COLLABORATIONS/openTARGETS/openpedcan_v15_case_meta_config.json
@@ -119,7 +119,7 @@
         "groups": "PUBLIC",
         "cancer_study_identifier": "openpedcan_v15",
         "reference_genome": "hg38",
-        "display_name": "Open Pediatric Cancer (OpenPedCan) Project v14",
+        "display_name": "Open Pediatric Cancer (OpenPedCan) Project v15",
         "type_of_cancer": "mixed",
         "short_name": "openpedcan_v15"