From cffbc8c7ffd3ad6dc5c95808b7bca8269fa0e9ac Mon Sep 17 00:00:00 2001 From: Miguel Brown Date: Wed, 21 Feb 2024 10:20:42 -0500 Subject: [PATCH 01/13] :pencil: start v14 updates --- COLLABORATIONS/openTARGETS/README.md | 9 +- .../openpedcan_v14_case_meta_config.json | 169 ++++++++++++++++++ 2 files changed, 175 insertions(+), 3 deletions(-) create mode 100644 COLLABORATIONS/openTARGETS/openpedcan_v14_case_meta_config.json diff --git a/COLLABORATIONS/openTARGETS/README.md b/COLLABORATIONS/openTARGETS/README.md index 45da505..f959b46 100644 --- a/COLLABORATIONS/openTARGETS/README.md +++ b/COLLABORATIONS/openTARGETS/README.md @@ -13,14 +13,15 @@ Genomic data generally obtained as such: - Copy number: tsv file with copy number, ploidy, and GISTIC-style information in maf-like format (each call is a row) - RNA expression: tpm values from rsem stored an `.rds` object - RNA fusion: annoFuse output -For example, for v12, bucket s3://d3b-openaccess-us-east-1-prd-pbta/open-targets/v12/: +For example, for v14, bucket s3://d3b-openaccess-us-east-1-prd-pbta/open-targets/v14/: ``` -consensus_wgs_plus_cnvkit_wxs.tsv.gz +consensus_wgs_plus_cnvkit_wxs_plus_freec_tumor_only.tsv.gz fusion-dgd.tsv.gz fusion-putative-oncogenic.tsv gene-expression-rsem-tpm-collapsed.rds tcga-gene-expression-rsem-tpm-collapsed.rds snv-consensus-plus-hotspots.maf.tsv.gz +snv-mutect2-tumor-only-plus-hotspots.maf.tsv.gz ``` ### Prep work @@ -68,13 +69,15 @@ To create the histologies file, recommended method is to: union select participant_id, formatted_sample_id, specimen_id, analyte_types, normal_bs_id, normal_sample_id from prod_cbio.chdm_phs001643_2018_cbio_sample + select participant_id, formatted_sample_id, specimen_id, analyte_types, normal_bs_id, normal_sample_id + from prod_cbio.pbta_mioncoseq_cbio_sample ``` 1. Get a blacklist from D3b Warehouse, exporting table `bix_workflows.cbio_hide_reasons ### Run as standalone 1. 
Download from https://github.com/PediatricOpenTargets/OpenPedCan-analysis the `analyses/pedcbio-sample-name/pedcbio_sample_name_col.R` or run from repo if you have it -1. Run `Rscript --vanilla pedcbio_sample_name_col.R -i path-to-histolgies-file.tsv -n path-to-cbio-names.csv -b Methylation` +1. Run `Rscript --vanilla pedcbio_sample_name_col.R -i path-to-histologies-file.tsv -n path-to-cbio-names.csv -b Methylation` OR ### Run in repo 1. Either run an interactive docker or using your local R, and ensure to mount a volume that will have the repo and whatever input histologies file you end up using, i.e. `docker run -it --mount type=bind,source=/home/ubuntu,target=/WORK pgc-images.sbgenomics.com/d3b-bixu/open-pedcan:latest /bin/bash` diff --git a/COLLABORATIONS/openTARGETS/openpedcan_v14_case_meta_config.json b/COLLABORATIONS/openTARGETS/openpedcan_v14_case_meta_config.json new file mode 100644 index 0000000..c53384e --- /dev/null +++ b/COLLABORATIONS/openTARGETS/openpedcan_v14_case_meta_config.json @@ -0,0 +1,169 @@ +{ + "merged_mafs": { + "dir": "study_build", + "dtypes": { + "mutation": { + "ext": "maf", + "cbio_name": "data_mutations_extended.txt", + "meta_file_attr": { + "stable_id": "mutations", + "profile_name": "Mutations", + "profile_description": "Consensus calls from strelka2, mutect2, lancet, and VarDict Java. Two or more callers required to pass, < 0.001 frequency in gnomAD, and min read depth 8 in normal sample.", + "genetic_alteration_type": "MUTATION_EXTENDED", + "variant_classification_filter": "Silent,Intron,3'UTR,3'Flank,5'UTR,IGR,RNA", + "datatype": "MAF", + "show_profile_in_analysis_tab": "true" + } + } + } + }, + "merged_cnvs": { + "dir": "study_build", + "dtypes": { + "linear": { + "ext": "predicted_cnv.txt", + "cbio_name": "data_linear_CNA.txt", + "meta_file_attr": { + "stable_id": "linear_CNA", + "profile_name": "copy-number values", + "profile_description": "Predicted copy number values from WGS and WXS (Continuous). 
openPBTA consensus method used", + "genetic_alteration_type": "COPY_NUMBER_ALTERATION", + "datatype": "CONTINUOUS", + "show_profile_in_analysis_tab": "false" + } + + }, + "discrete": { + "ext": "discrete_cnvs.txt", + "cbio_name": "data_CNA.txt", + "meta_file_attr": { + "stable_id": "cna", + "profile_name": "Binned copy-number values", + "profile_description": "Predicted copy number values from WGS and WXS sequencing (Discrete). Values: -2 = homozygous deletion; -1 = hemizygous deletion; 0 = neutral / no change; 1 = gain; 2 = high level amplification. openPBTA consensus method used", + "genetic_alteration_type": "COPY_NUMBER_ALTERATION", + "datatype": "DISCRETE", + "show_profile_in_analysis_tab": "true" + } + } + } + }, + "merged_rsem": { + "dir": "study_build", + "dtypes": { + "counts": { + "ext": "rsem_merged.txt", + "cbio_name": "data_mrna_seq_v2_rsem.txt", + "meta_file_attr": { + "stable_id": "rna_seq_v2_mrna", + "profile_name": "RNA expression", + "profile_description": "Expression levels from RNA-Seq (rsem TPM). 
Only common transcripts between OpenPedCan harmonized (GENCODE27) and TCGA (GENCODE36) were kept.", + "genetic_alteration_type": "MRNA_EXPRESSION", + "datatype": "CONTINUOUS", + "show_profile_in_analysis_tab": "false" + } + }, + "zscore": { + "ext": "rsem_merged_zscore.txt", + "cbio_name": "data_mrna_seq_v2_rsem_zscores_ref_all_samples.txt", + "meta_file_attr": { + "stable_id": "rna_seq_v2_mrna_median_Zscores", + "profile_name": "RNA expression z-scores", + "profile_description": "Expression levels from RNA-Seq, Z scores of log2(TPM + 1) values", + "genetic_alteration_type": "MRNA_EXPRESSION", + "datatype": "Z-SCORE", + "show_profile_in_analysis_tab": "true" + } + } + } + }, + "merged_fusion": { + "dir": "study_build", + "dtypes": { + "fusion": { + "ext": "fusions.txt", + "cbio_name": "data_sv.txt", + "meta_file_attr": { + "stable_id": "structural_variants", + "profile_name": "Predicted RNA fusions", + "profile_description": "Fusion data, from openPBTA using arriba and STAR Fusion, annotated and filtered using annoFuse. DGD sample subset consists of panel fusions", + "genetic_alteration_type": "STRUCTURAL_VARIANT", + "datatype": "SV", + "show_profile_in_analysis_tab": "true" + } + } + } + }, + "data_sheets": { + "dir": "datasheets", + "dtypes": { + "patient": { + "cbio_name": "data_clinical_patient.txt", + "meta_file_attr": { + "genetic_alteration_type": "CLINICAL", + "datatype": "PATIENT_ATTRIBUTES" + } + }, + "sample": { + "cbio_name": "data_clinical_sample.txt", + "meta_file_attr": { + "genetic_alteration_type": "CLINICAL", + "datatype": "SAMPLE_ATTRIBUTES" + } + } + + } + }, + "study": { + "_comment": "see https://docs.cbioportal.org/5.1-data-loading/data-loading/file-formats#cancer-study for detailed specifics", + "description": "OpenPedCan is a collaborative project between the National Cancer Institute and the Children's Hospital of Philadelphia as part of the NCI's Childhood Cancer Data Initiative. 
Here, we harmonize pan-cancer data using KidsFirst Data Resource Center workflows and harness OpenPBTA analytics workflows to scale and add modules across pediatric cancer datasets. This data has been integrated into the pediatric open targets platform to assist in development and query of the FDA's Relevant Pediatric Molecular Targets List (PMTL) to identify new therapeutics for children with cancer. This is the v14 release of this effort, for v11, the latest accepted production release please see OpenPedCan v11. For study release details, please see Release Notes", + "groups": "PUBLIC", + "cancer_study_identifier": "openpedcan_v14", + "reference_genome": "hg38", + "display_name": "Open Pediatric Cancer (OpenPedCan) Project v14", + "type_of_cancer": "mixed", + "short_name": "openpedcan_v14" + + }, + "cases_3way_complete": { + "stable_id": "3way_complete", + "case_list_name": "Tumor samples with mutation, CNA and mRNA data", + "case_list_description": "All tumor samples with mutation, CNA, and mRNA data", + "case_list_category": "all_cases_with_mutation_and_cna_and_mrna_data" + }, + "cases_all": { + "stable_id": "all", + "case_list_name": "All Samples", + "case_list_description": "All samples in study", + "case_list_category": "all_cases_in_study" + }, + "cases_cnaseq": { + "stable_id": "cnaseq", + "case_list_name": "Tumor samples with mutation and CNA data", + "case_list_description": "All tumor samples with mutation and CNA data", + "case_list_category": "all_cases_with_mutation_and_cna_data" + }, + "cases_cna": { + "stable_id": "cna", + "case_list_name": "Tumor Samples with CNA data", + "case_list_description": "All tumors with CNA data", + "case_list_category": "all_cases_with_cna_data" + }, + "cases_RNA_Seq_v2_mRNA": { + "stable_id": "rna_seq_v2_mrna", + "case_list_name": "All Samples with mRNA data (RNA Seq V2)", + "case_list_description": "All samples with mRNA expression data", + "case_list_category": "all_cases_with_mrna_rnaseq_data" + }, + 
"cases_sequenced": { + "stable_id": "sequenced", + "case_list_name": "Tumor samples with mutations", + "case_list_description": "All tumor samples with mutation data", + "case_list_category": "all_cases_with_mutation_data" + }, + "cases_sv": { + "stable_id": "sv", + "case_list_name": "Tumor samples with fusions", + "case_list_description": "All tumor samples with fusion data", + "case_list_category": "all_cases_with_sv_data" + } +} \ No newline at end of file From 04ca3a8a58b6792a24fae61776baa4f35a7963a9 Mon Sep 17 00:00:00 2001 From: Miguel Brown Date: Wed, 21 Feb 2024 19:43:34 +0000 Subject: [PATCH 02/13] :wrench: fixed drop list behavior --- COLLABORATIONS/openTARGETS/pedcbio_sample_name_col.R | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/COLLABORATIONS/openTARGETS/pedcbio_sample_name_col.R b/COLLABORATIONS/openTARGETS/pedcbio_sample_name_col.R index 0668011..7b3f55a 100644 --- a/COLLABORATIONS/openTARGETS/pedcbio_sample_name_col.R +++ b/COLLABORATIONS/openTARGETS/pedcbio_sample_name_col.R @@ -59,11 +59,9 @@ cbio_names_list <- lapply(files_list, function(cbio_names){ histology_df <- readr::read_tsv(hist_file, guess_max = 100000) message("Read histologies file") if (!is.null(opt$blacklist_strategy)){ - drop_list <- strsplit(opt$blacklist_strategy, split = ",") - for (drop in drop_list){ - histology_df <- dplyr::filter(histology_df, experimental_strategy != drop) - message(paste0("Dropping ", drop," as specified")) - } + drop_list <- as.list(strsplit(opt$blacklist_strategy, split = ",")[[1]]) + message(paste0("Dropping ", opt$blacklist_strategy," as specified\n")) + histology_df <- histology_df %>% dplyr::filter(!experimental_strategy %in% drop_list) } # tmp update for broad histology bug, to be fixed in v11 histology_df <- histology_df %>% From 58bd8e897747335f7136a236c55451909704698b Mon Sep 17 00:00:00 2001 From: Miguel Brown Date: Thu, 22 Feb 2024 18:10:50 +0000 Subject: [PATCH 03/13] :construction: WIP - improved DGD disambguity, 
corrected PFS to EFS --- COLLABORATIONS/openTARGETS/clinical_to_datasheets.py | 2 +- COLLABORATIONS/openTARGETS/header_desc.tsv | 6 ++++-- COLLABORATIONS/openTARGETS/pedcbio_sample_name_col.R | 8 +++++--- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/COLLABORATIONS/openTARGETS/clinical_to_datasheets.py b/COLLABORATIONS/openTARGETS/clinical_to_datasheets.py index b8be856..5047111 100644 --- a/COLLABORATIONS/openTARGETS/clinical_to_datasheets.py +++ b/COLLABORATIONS/openTARGETS/clinical_to_datasheets.py @@ -214,7 +214,7 @@ def build_header(header_list, entry): # age_at_last_known = float(value) # age_at_last_known = str(math.floor(age_at_last_known/365.25)) value = str(math.floor(float(value)/365.25)) - elif header[i] == "PFS_days" and value != "NA": + elif header[i] == "EFS_days" and value != "NA": value = str(math.floor(float(value)/30.5)) # d_free_mos = value elif header[i] == "tumor_descriptor": diff --git a/COLLABORATIONS/openTARGETS/header_desc.tsv b/COLLABORATIONS/openTARGETS/header_desc.tsv index 9baaef5..d847927 100644 --- a/COLLABORATIONS/openTARGETS/header_desc.tsv +++ b/COLLABORATIONS/openTARGETS/header_desc.tsv @@ -3,7 +3,8 @@ germline_sex_estimate 0 1 STRING 1 Female cancer_predispositions 0 1 STRING 2 None documented OS_days OS_MONTHS Overall survival in months since initial diagnosis 0 1 NUMBER 3 NA just convert to months OS_status OS_STATUS Overall patient survival status 0 1 STRING 4 LIVING -PFS_days PFS_MONTHS Progression free (months) since initial treatment 0 1 NUMBER 5 Not Reported just convert to months +EFS_days EFS_MONTHS Event free (months) since initial treatment 0 1 NUMBER 5 Not Reported just convert to months +EFS_event_type EFS_STATUS Event free status (months) since initial treatment 0 1 STRING 5 Not Reported ethnicity ETHNICITY 0 1 STRING 6 Not Hispanic or Latino race RACE 0 1 STRING 7 White primary_site TUMOR_SITE 0 1 STRING 8 @@ -18,7 +19,8 @@ harmonized_diagnosis CANCER_TYPE_DETAILED 1 0 STRING 10 Adenoma 
primary_site TUMOR_TISSUE_SITE 1 0 STRING 9 Suprasellar/Hypothalamic/Pituitary tumor_descriptor TUMOR_TYPE 1 0 STRING 8 Initial CNS Tumor composition SAMPLE_TYPE 1 0 STRING 7 Solid Tissue -cohort COHORT Source study cohort name 1 0 STRING 6 +cohort COHORT Source study cohort name 1 0 STRING 6 +sub_cohort SUB_COHORT Source study sub-cohort name 1 0 STRING 6 CNS_region 1 0 STRING 5 Suprasellar tumor_ploidy 1 0 NUMBER 4 3 tumor_fraction 1 0 NUMBER 3 0.476369391 diff --git a/COLLABORATIONS/openTARGETS/pedcbio_sample_name_col.R b/COLLABORATIONS/openTARGETS/pedcbio_sample_name_col.R index 7b3f55a..306681c 100644 --- a/COLLABORATIONS/openTARGETS/pedcbio_sample_name_col.R +++ b/COLLABORATIONS/openTARGETS/pedcbio_sample_name_col.R @@ -119,7 +119,7 @@ message("Collated samples missing a cBio ID") #### Handle each cohort at a time - start with PBTA # get all sample IDs in the PBTA cohort sample_ids_pbta <- histology_df_no_format_id %>% - dplyr::filter(cohort == "PBTA") %>% + dplyr::filter(cohort == "PBTA", sub_cohort != "DGD") %>% pull(sample_id) %>% unique() @@ -267,8 +267,10 @@ message("FINALIZE NAMES") no_need_for_tiebreaks <- histology_df %>% dplyr::filter(!Kids_First_Biospecimen_ID %in% all_tiebreaks$Kids_First_Biospecimen_ID) %>% dplyr::mutate(formatted_sample_id = case_when( - cohort == "PBTA" ~ sample_id, - cohort == "DGD" ~ gsub("(^.*DGD)_\\w+_(\\d+$)", "\\1_\\2", aliquot_id), + (cohort == "PBTA" & sub_cohort != "DGD") ~ sample_id, + sub_cohort == "DGD" ~ gsub("(^.*DGD)_\\w+_(\\d+$)", "\\1_\\2", aliquot_id), + ((cohort == "Maris" | cohort != "PPTC") & composition == "Derived Cell Line") ~ paste0(Kids_First_Participant_ID,"-CL"), + ((cohort == "Maris" | cohort != "PPTC") & composition == "Patient Derived Xenograft") ~ paste0(Kids_First_Participant_ID,"-PDX"), TRUE ~ Kids_First_Participant_ID )) From dde232c7bab2c9b103fcaf7d5f12995ab9046f13 Mon Sep 17 00:00:00 2001 From: Miguel Brown Date: Thu, 29 Feb 2024 11:20:53 -0500 Subject: [PATCH 04/13] :wrench: fixed deprecated 
regex and how PPTC and Maris models are named --- COLLABORATIONS/openTARGETS/README.md | 5 +++-- COLLABORATIONS/openTARGETS/clinical_to_datasheets.py | 2 +- COLLABORATIONS/openTARGETS/pedcbio_sample_name_col.R | 4 ++-- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/COLLABORATIONS/openTARGETS/README.md b/COLLABORATIONS/openTARGETS/README.md index f959b46..4649390 100644 --- a/COLLABORATIONS/openTARGETS/README.md +++ b/COLLABORATIONS/openTARGETS/README.md @@ -25,7 +25,7 @@ snv-mutect2-tumor-only-plus-hotspots.maf.tsv.gz ``` ### Prep work -The histologies file needs `formatted_sample_id` added and likely a blacklist from the D3b Warehouse or some other source to supress duplicate RNA libraries from different sequencing methods. +The histologies file needs `formatted_sample_id` added and likely a blacklist from the D3b Warehouse or some other source to suppress duplicate RNA libraries from different sequencing methods. Since we are not handling `Methylation` yet, it is recommended those entries be removed ahead of time. To create the histologies file, recommended method is to: 1. `docker pull pgc-images.sbgenomics.com/d3b-bixu/open-pedcan:latest` OR if you have R installed locally, ensure the following libraries are installed: @@ -69,6 +69,7 @@ To create the histologies file, recommended method is to: union select participant_id, formatted_sample_id, specimen_id, analyte_types, normal_bs_id, normal_sample_id from prod_cbio.chdm_phs001643_2018_cbio_sample + union select participant_id, formatted_sample_id, specimen_id, analyte_types, normal_bs_id, normal_sample_id from prod_cbio.pbta_mioncoseq_cbio_sample @@ -77,7 +78,7 @@ To create the histologies file, recommended method is to: ### Run as standalone 1. Download from https://github.com/PediatricOpenTargets/OpenPedCan-analysis the `analyses/pedcbio-sample-name/pedcbio_sample_name_col.R` or run from repo if you have it -1. 
Run `Rscript --vanilla pedcbio_sample_name_col.R -i path-to-histologies-file.tsv -n path-to-cbio-names.csv -b Methylation` +1. Run `Rscript --vanilla pedcbio_sample_name_col.R -i path-to-histologies-file.tsv -n path-to-cbio-names.csv -b 'Methylation,Phospho-Proteomics,Whole Cell Proteomics,miRNA-Seq'` OR ### Run in repo 1. Either run an interactive docker or using your local R, and ensure to mount a volume that will have the repo and whatever input histologies file you end up using, i.e. `docker run -it --mount type=bind,source=/home/ubuntu,target=/WORK pgc-images.sbgenomics.com/d3b-bixu/open-pedcan:latest /bin/bash` diff --git a/COLLABORATIONS/openTARGETS/clinical_to_datasheets.py b/COLLABORATIONS/openTARGETS/clinical_to_datasheets.py index 5047111..8f6615e 100644 --- a/COLLABORATIONS/openTARGETS/clinical_to_datasheets.py +++ b/COLLABORATIONS/openTARGETS/clinical_to_datasheets.py @@ -185,7 +185,7 @@ def build_header(header_list, entry): # if DGD DNA, add to gene matrix elif info[cohort] == 'DGD': # parse aliquot for panel type, i.e. 
ET_242MFKXW_DGD_STNGS_93 - test = re.match('.*_DGD_(\w+)_\d+', info[a_idx]) + test = re.match(r'.*_DGD_(\w+)_\d+', info[a_idx]) data_gene.write(info[cbio_id] + '\tCHOP-' + test.group(1) + '\n') pt_id = info[header.index("Kids_First_Participant_ID")] diff --git a/COLLABORATIONS/openTARGETS/pedcbio_sample_name_col.R b/COLLABORATIONS/openTARGETS/pedcbio_sample_name_col.R index 306681c..3fcddc1 100644 --- a/COLLABORATIONS/openTARGETS/pedcbio_sample_name_col.R +++ b/COLLABORATIONS/openTARGETS/pedcbio_sample_name_col.R @@ -269,8 +269,8 @@ no_need_for_tiebreaks <- histology_df %>% dplyr::mutate(formatted_sample_id = case_when( (cohort == "PBTA" & sub_cohort != "DGD") ~ sample_id, sub_cohort == "DGD" ~ gsub("(^.*DGD)_\\w+_(\\d+$)", "\\1_\\2", aliquot_id), - ((cohort == "Maris" | cohort != "PPTC") & composition == "Derived Cell Line") ~ paste0(Kids_First_Participant_ID,"-CL"), - ((cohort == "Maris" | cohort != "PPTC") & composition == "Patient Derived Xenograft") ~ paste0(Kids_First_Participant_ID,"-PDX"), + ((cohort == "Maris" | cohort == "PPTC") & composition == "Derived Cell Line") ~ paste0(sample_id,"-CL"), + ((cohort == "Maris" | cohort == "PPTC") & composition == "Patient Derived Xenograft") ~ paste0(sample_id,"-PDX"), TRUE ~ Kids_First_Participant_ID )) From 57fff1e8b27148834c016ea6e7e577f21c9b5f64 Mon Sep 17 00:00:00 2001 From: Miguel Brown Date: Tue, 5 Mar 2024 14:55:27 +0000 Subject: [PATCH 05/13] :pencil: rename to v15 --- COLLABORATIONS/openTARGETS/README.md | 4 ++-- ...eta_config.json => openpedcan_v15_case_meta_config.json} | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) rename COLLABORATIONS/openTARGETS/{openpedcan_v14_case_meta_config.json => openpedcan_v15_case_meta_config.json} (98%) diff --git a/COLLABORATIONS/openTARGETS/README.md b/COLLABORATIONS/openTARGETS/README.md index 4649390..4867046 100644 --- a/COLLABORATIONS/openTARGETS/README.md +++ b/COLLABORATIONS/openTARGETS/README.md @@ -19,7 +19,7 @@ 
consensus_wgs_plus_cnvkit_wxs_plus_freec_tumor_only.tsv.gz fusion-dgd.tsv.gz fusion-putative-oncogenic.tsv gene-expression-rsem-tpm-collapsed.rds -tcga-gene-expression-rsem-tpm-collapsed.rds +tcga_gene-expression-rsem-tpm-collapsed.rds snv-consensus-plus-hotspots.maf.tsv.gz snv-mutect2-tumor-only-plus-hotspots.maf.tsv.gz ``` @@ -44,7 +44,7 @@ To create the histologies file, recommended method is to: from prod_cbio.aml_sd_pet7q6f2_2018_cbio_sample union select participant_id, formatted_sample_id, specimen_id, analyte_types, normal_bs_id, normal_sample_id - from prod_cbio.aml_sd_z6mwd3h0_2018_cbio_sample + from prod_cbio.bllnos_sd_z6mwd3h0_2018_cbio_sample union select participant_id, formatted_sample_id, specimen_id, analyte_types, normal_bs_id, normal_sample_id from prod_cbio.x01_fy16_nbl_maris_cbio_sample diff --git a/COLLABORATIONS/openTARGETS/openpedcan_v14_case_meta_config.json b/COLLABORATIONS/openTARGETS/openpedcan_v15_case_meta_config.json similarity index 98% rename from COLLABORATIONS/openTARGETS/openpedcan_v14_case_meta_config.json rename to COLLABORATIONS/openTARGETS/openpedcan_v15_case_meta_config.json index c53384e..ada2b57 100644 --- a/COLLABORATIONS/openTARGETS/openpedcan_v14_case_meta_config.json +++ b/COLLABORATIONS/openTARGETS/openpedcan_v15_case_meta_config.json @@ -115,13 +115,13 @@ }, "study": { "_comment": "see https://docs.cbioportal.org/5.1-data-loading/data-loading/file-formats#cancer-study for detailed specifics", - "description": "OpenPedCan is a collaborative project between the National Cancer Institute and the Children's Hospital of Philadelphia as part of the NCI's Childhood Cancer Data Initiative. Here, we harmonize pan-cancer data using KidsFirst Data Resource Center workflows and harness OpenPBTA analytics workflows to scale and add modules across pediatric cancer datasets. 
This data has been integrated into the pediatric open targets platform to assist in development and query of the FDA's Relevant Pediatric Molecular Targets List (PMTL) to identify new therapeutics for children with cancer. This is the v14 release of this effort, for v11, the latest accepted production release please see OpenPedCan v11. For study release details, please see Release Notes", + "description": "OpenPedCan is a collaborative project between the National Cancer Institute and the Children's Hospital of Philadelphia as part of the NCI's Childhood Cancer Data Initiative. Here, we harmonize pan-cancer data using KidsFirst Data Resource Center workflows and harness OpenPBTA analytics workflows to scale and add modules across pediatric cancer datasets. This data has been integrated into the pediatric open targets platform to assist in development and query of the FDA's Relevant Pediatric Molecular Targets List (PMTL) to identify new therapeutics for children with cancer. This is the v15 release of this effort, for v11, the latest accepted production release please see OpenPedCan v11. 
For study release details, please see Release Notes", "groups": "PUBLIC", - "cancer_study_identifier": "openpedcan_v14", + "cancer_study_identifier": "openpedcan_v15", "reference_genome": "hg38", "display_name": "Open Pediatric Cancer (OpenPedCan) Project v14", "type_of_cancer": "mixed", - "short_name": "openpedcan_v14" + "short_name": "openpedcan_v15" }, "cases_3way_complete": { From 81073dbf67a375716c8539ef4ec1304cd15d50ce Mon Sep 17 00:00:00 2001 From: Miguel Brown Date: Tue, 5 Mar 2024 13:31:03 -0500 Subject: [PATCH 06/13] :hammer: rename append maf script :sparkles: added v15 maf header :hammer: updated data clinical to better handle added DGD inputs --- COLLABORATIONS/openTARGETS/README.md | 4 +- ...penpedcan.py => append_maf_to_existing.py} | 4 +- .../openTARGETS/clinical_to_datasheets.py | 55 +++++++++++-------- .../openTARGETS/maf_openpedcan_v15_header.txt | 2 + 4 files changed, 38 insertions(+), 27 deletions(-) rename COLLABORATIONS/openTARGETS/{add_dgd_maf_to_openpedcan.py => append_maf_to_existing.py} (97%) create mode 100644 COLLABORATIONS/openTARGETS/maf_openpedcan_v15_header.txt diff --git a/COLLABORATIONS/openTARGETS/README.md b/COLLABORATIONS/openTARGETS/README.md index 4867046..6aae989 100644 --- a/COLLABORATIONS/openTARGETS/README.md +++ b/COLLABORATIONS/openTARGETS/README.md @@ -74,7 +74,7 @@ To create the histologies file, recommended method is to: from prod_cbio.pbta_mioncoseq_cbio_sample ``` -1. Get a blacklist from D3b Warehouse, exporting table `bix_workflows.cbio_hide_reasons +1. Get a blacklist from D3b Warehouse, exporting table `bix_workflows.cbio_hide_reasons` ### Run as standalone 1. 
Download from https://github.com/PediatricOpenTargets/OpenPedCan-analysis the `analyses/pedcbio-sample-name/pedcbio_sample_name_col.R` or run from repo if you have it @@ -144,7 +144,7 @@ optional arguments: ``` _NOTE_ for v11 input, I ran the following command `zcat snv-dgd.maf.tsv.gz | perl -e '$skip = <>; $skip= <>; while(<>){print $_;}' | gzip -c >> snv-consensus-plus-hotspots.maf.tsv.gz` to add DGD data -_NOTE_ for v12 input,I would have following command `python3 ~/tools/kf-cbioportal-etl/COLLABORATIONS/openTARGETS/add_dgd_maf_to_openpedcan.py -i /home/ubuntu/tools/kf-cbioportal-etl/COLLABORATIONS/openTARGETS/maf_openpedcan_v12_header.txt -c openpedcan_v12.maf -t ../bs_id_sample_map.txt -m ../GF_INPUTS/snv-dgd.maf.tsv.gz` to add DGD data, which is more robust - however, there are data issues with DGD, so it was left out +_NOTE_ for v15 input,I would have following command `python3 ~/tools/kf-cbioportal-etl/COLLABORATIONS/openTARGETS/append_maf_to_existing.py -i /home/ubuntu/tools/kf-cbioportal-etl/COLLABORATIONS/openTARGETS/maf_openpedcan_v15_header.txt -c openpedcan_v15.maf -t ../bs_id_sample_map.txt -m ...INPUT_PREP/snv-mutect2-tumor-only-plus-hotspots.maf.tsv.gz` to add tumor-only data, which is more robust Example run: `python3 COLLABORATIONS/openTARGETS/rename_filter_maf.py -m bs_id_sample_map.txt -v snv-consensus-plus-hotspots.maf.tsv.gz -s 1 -n openpedcan_v12` diff --git a/COLLABORATIONS/openTARGETS/add_dgd_maf_to_openpedcan.py b/COLLABORATIONS/openTARGETS/append_maf_to_existing.py similarity index 97% rename from COLLABORATIONS/openTARGETS/add_dgd_maf_to_openpedcan.py rename to COLLABORATIONS/openTARGETS/append_maf_to_existing.py index b4999fb..67d35a2 100644 --- a/COLLABORATIONS/openTARGETS/add_dgd_maf_to_openpedcan.py +++ b/COLLABORATIONS/openTARGETS/append_maf_to_existing.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 """ -Helper script to append DGD data to an existing merged maf file. +Helper script to append a maf to an existing maf file. 
Uses filter_entry to filter out undesired calls like in other mafs """ import sys @@ -100,6 +100,6 @@ else: skipped += 1 sys.stderr.write("Processed " + maf_fn + "\n") - sys.stderr.write("Skipped " + str(skipped) + " entries meeting exlusion criteria\n") + sys.stderr.write("Skipped " + str(skipped) + " entries meeting exclusion criteria\n") append_maf.close() \ No newline at end of file diff --git a/COLLABORATIONS/openTARGETS/clinical_to_datasheets.py b/COLLABORATIONS/openTARGETS/clinical_to_datasheets.py index 8f6615e..3cc55c8 100644 --- a/COLLABORATIONS/openTARGETS/clinical_to_datasheets.py +++ b/COLLABORATIONS/openTARGETS/clinical_to_datasheets.py @@ -1,6 +1,5 @@ import sys import argparse -import json import math import re import pdb @@ -153,17 +152,17 @@ def build_header(header_list, entry): # track some sample info so that somatic events can be collapsed samp_dict = {} -s_type = header.index('sample_type') -comp = header.index('composition') -bs_id = header.index('Kids_First_Biospecimen_ID') -exp = header.index('experimental_strategy') +s_type_index = header.index('sample_type') +comp_index = header.index('composition') +bs_id_index = header.index('Kids_First_Biospecimen_ID') +exp_index = header.index('experimental_strategy') # experimental strategy may be too vague, use to tell if RNA # related to recent addition of DGD samples, need gene matrix file -rna_lib = header.index('RNA_library') -cohort = header.index('cohort') +rna_lib_index = header.index('RNA_library') +cohort_index = header.index('cohort') a_idx = header.index('aliquot_id') -cbio_id = header.index('formatted_sample_id') +cbio_id_index = header.index('formatted_sample_id') data_gene = open('data_gene_matrix_CHOP.txt', 'w') data_gene.write('SAMPLE_ID\tmutations\n') @@ -173,20 +172,20 @@ def build_header(header_list, entry): for data in clin_data: info = data.rstrip('\n').split('\t') - if info[s_type] == "Normal" and info[exp] != 'RNA-Seq': + if info[s_type_index] == "Normal" and info[exp_index] 
!= 'RNA-Seq': continue - if info[bs_id] in blacklist_dict: - sys.stderr.write('Skipping output of ' + info[bs_id] + ' because in blacklist for reason ' + blacklist_dict[info[bs_id]] + '\n') + if info[bs_id_index] in blacklist_dict: + sys.stderr.write('Skipping output of ' + info[bs_id_index] + ' because in blacklist for reason ' + blacklist_dict[info[bs_id_index]] + '\n') continue # adjust exp value if targeted sequencing - if info[exp] == 'Targeted Sequencing': - if info[rna_lib] != 'NA': - info[exp] = 'RNA-Seq' + if info[exp_index] == 'Targeted Sequencing': + if info[rna_lib_index] != 'NA': + info[exp_index] = 'RNA-Seq' # if DGD DNA, add to gene matrix - elif info[cohort] == 'DGD': + elif info[cohort_index] == 'DGD': # parse aliquot for panel type, i.e. ET_242MFKXW_DGD_STNGS_93 test = re.match(r'.*_DGD_(\w+)_\d+', info[a_idx]) - data_gene.write(info[cbio_id] + '\tCHOP-' + test.group(1) + '\n') + data_gene.write(info[cbio_id_index] + '\tCHOP-' + test.group(1) + '\n') pt_id = info[header.index("Kids_First_Participant_ID")] if pt_id in pt_id_dict: @@ -238,11 +237,11 @@ def build_header(header_list, entry): if samp_id not in samp_dict: samp_dict[samp_id] = sample_to_print id_mapping[samp_id] = [] - id_mapping[samp_id].append(info[bs_id]) - if info[exp] == "RNA-Seq": - bs_type[info[bs_id]] = "RNA" + id_mapping[samp_id].append(info[bs_id_index]) + if info[exp_index] == "RNA-Seq": + bs_type[info[bs_id_index]] = "RNA" else: - bs_type[info[bs_id]] = "DNA" + bs_type[info[bs_id_index]] = "DNA" # cycle through sample IDs to see if there's matched DNA/RNA and if one can be made check = {} for samp_id in id_mapping: @@ -256,8 +255,18 @@ def build_header(header_list, entry): spec = id_mapping[samp_id][1] + ";" + id_mapping[samp_id][0] samp_dict[samp_id][0] = spec elif len(id_mapping[samp_id]) > 2: - # QC check, only one or two biospec per sample ID + # QC check, only one or two biospec per sample ID, unless it's new DGD RNA + separate fusion biospecimen sys.stderr.write("Saw 
more than two biospecimens for " + samp_id + ": " + ",".join(id_mapping[samp_id]) + "\n") + if "DGD" in samp_id: + # If two RNA types and is DGD, throw a note to check + check_type = {"DNA": [], "RNA": []} + for bs_id in id_mapping[samp_id]: + check_type[bs_type[bs_id]].append(bs_id) + if len(check_type["DNA"]) == 1 and len(check_type["RNA"]) == 2: + spec = ";".join(check_type["DNA"] + check_type["RNA"]) + samp_dict[samp_id][0] = spec + sys.stderr.write("Could be a DGD fusion + bulk RNA, may be ok\n") + # exit(1) else: # skip cell line re-matching @@ -272,9 +281,9 @@ def build_header(header_list, entry): mapping_file = open("bs_id_sample_map.txt", "w") mapping_file.write("BS_ID\tSample Type\tCbio ID\n") for samp_id in id_mapping: - for bs_id in id_mapping[samp_id]: + for bs_id_index in id_mapping[samp_id]: try: - mapping_file.write("\t".join([bs_id, bs_type[bs_id], samp_id]) + "\n") + mapping_file.write("\t".join([bs_id_index, bs_type[bs_id_index], samp_id]) + "\n") except Exception as e: sys.stderr.write(str(e) + "\n") pdb.set_trace() diff --git a/COLLABORATIONS/openTARGETS/maf_openpedcan_v15_header.txt b/COLLABORATIONS/openTARGETS/maf_openpedcan_v15_header.txt new file mode 100644 index 0000000..0c0a0bc --- /dev/null +++ b/COLLABORATIONS/openTARGETS/maf_openpedcan_v15_header.txt @@ -0,0 +1,2 @@ +#version 2.4 +Hugo_Symbol Entrez_Gene_Id Center NCBI_Build Chromosome Start_Position End_Position Strand Variant_Classification Variant_Type Reference_Allele Tumor_Seq_Allele1 Tumor_Seq_Allele2 dbSNP_RS dbSNP_Val_Status Tumor_Sample_Barcode Matched_Norm_Sample_Barcode Match_Norm_Seq_Allele1 Match_Norm_Seq_Allele2 Tumor_Validation_Allele1 Tumor_Validation_Allele2 Match_Norm_Validation_Allele1 Match_Norm_Validation_Allele2 Verification_Status Validation_Status Mutation_Status Sequencing_Phase Sequence_Source Validation_Method Score BAM_File Sequencer Tumor_Sample_UUID Matched_Norm_Sample_UUID HGVSc HGVSp HGVSp_Short Transcript_ID Exon_Number t_depth t_ref_count t_alt_count 
n_depth n_ref_count n_alt_count all_effects Allele Gene Feature Feature_type Consequence cDNA_position CDS_position Protein_position Amino_acids Codons Existing_variation ALLELE_NUM DISTANCE STRAND_VEP SYMBOL SYMBOL_SOURCE HGNC_ID BIOTYPE CANONICAL CCDS ENSP SWISSPROT TREMBL UNIPARC RefSeq SIFT PolyPhen EXON INTRON DOMAINS AF AFR_AF AMR_AF ASN_AF EAS_AF EUR_AF SAS_AF AA_AF EA_AF CLIN_SIG SOMATIC PUBMED MOTIF_NAME MOTIF_POS HIGH_INF_POS MOTIF_SCORE_CHANGE IMPACT PICKVARIANT_CLASS TSL HGVS_OFFSET PHENO MINIMISED GENE_PHENO FILTER flanking_bps vcf_id vcf_qual gnomAD_AF gnomAD_AFR_AF gnomAD_AMR_AF gnomAD_ASJ_AF gnomAD_EAS_AF gnomAD_FIN_AF gnomAD_NFE_AF gnomAD_OTH_AF gnomAD_SAS_AF HGVSg vcf_pos gnomad_3_1_1_AC gnomad_3_1_1_AN gnomad_3_1_1_AF gnomad_3_1_1_nhomalt gnomad_3_1_1_AC_popmax gnomad_3_1_1_AN_popmax gnomad_3_1_1_AF_popmax gnomad_3_1_1_nhomalt_popmax gnomad_3_1_1_AC_controls_and_biobanks gnomad_3_1_1_AN_controls_and_biobanks gnomad_3_1_1_AF_controls_and_biobanks gnomad_3_1_1_AF_non_cancer gnomad_3_1_1_primate_ai_score gnomad_3_1_1_splice_ai_consequence MQ MQ0 CAL HotSpotAllele From ecd0d889264c5de176a899c67e29f8e56067a068 Mon Sep 17 00:00:00 2001 From: Miguel Brown Date: Wed, 6 Mar 2024 21:49:12 +0000 Subject: [PATCH 07/13] :pencil: update readme with latest changes :hammer: adjust cnv script to change in input :wrench: fix bug in handling header in maf script :bug: to-do fix DGD sampe naming --- COLLABORATIONS/openTARGETS/README.md | 6 ++-- .../openTARGETS/append_maf_to_existing.py | 29 +++++++++++++------ COLLABORATIONS/openTARGETS/cnv_to_tables.py | 9 +++--- 3 files changed, 29 insertions(+), 15 deletions(-) diff --git a/COLLABORATIONS/openTARGETS/README.md b/COLLABORATIONS/openTARGETS/README.md index 6aae989..b67930d 100644 --- a/COLLABORATIONS/openTARGETS/README.md +++ b/COLLABORATIONS/openTARGETS/README.md @@ -39,7 +39,7 @@ To create the histologies file, recommended method is to: 1. 
Pull the OpenPedCan repo (warning, it's 12GB ): https://github.com/PediatricOpenTargets/OpenPedCan-analysis, or just download the script from `analyses/pedcbio-sample-name/pedcbio_sample_name_col.R` 1. Export from D3b Warehouse the latest existing cBio IDs to use for population. Ensure that the output is csv double-quoted. Currently that can be obtained using the sql command: ```sql - + with custom as ( select participant_id, formatted_sample_id, specimen_id, analyte_types, normal_bs_id, normal_sample_id from prod_cbio.aml_sd_pet7q6f2_2018_cbio_sample union @@ -72,6 +72,8 @@ To create the histologies file, recommended method is to: union select participant_id, formatted_sample_id, specimen_id, analyte_types, normal_bs_id, normal_sample_id from prod_cbio.pbta_mioncoseq_cbio_sample + ) + select * from custom ``` 1. Get a blacklist from D3b Warehouse, exporting table `bix_workflows.cbio_hide_reasons` @@ -144,7 +146,7 @@ optional arguments: ``` _NOTE_ for v11 input, I ran the following command `zcat snv-dgd.maf.tsv.gz | perl -e '$skip = <>; $skip= <>; while(<>){print $_;}' | gzip -c >> snv-consensus-plus-hotspots.maf.tsv.gz` to add DGD data -_NOTE_ for v15 input,I would have following command `python3 ~/tools/kf-cbioportal-etl/COLLABORATIONS/openTARGETS/append_maf_to_existing.py -i /home/ubuntu/tools/kf-cbioportal-etl/COLLABORATIONS/openTARGETS/maf_openpedcan_v15_header.txt -c openpedcan_v15.maf -t ../bs_id_sample_map.txt -m ...INPUT_PREP/snv-mutect2-tumor-only-plus-hotspots.maf.tsv.gz` to add tumor-only data, which is more robust +_NOTE_ for v15 input, I would run the following command `python3 ~/tools/kf-cbioportal-etl/COLLABORATIONS/openTARGETS/append_maf_to_existing.py -i /home/ubuntu/tools/kf-cbioportal-etl/COLLABORATIONS/openTARGETS/maf_openpedcan_v15_header.txt -c openpedcan_v15.maf -t ../INPUT_PREP/bs_id_sample_map.txt -m ../INPUTS/snv-mutect2-tumor-only-plus-hotspots.maf.tsv.gz` to add tumor-only data, which is more robust Example run: `python3
COLLABORATIONS/openTARGETS/rename_filter_maf.py -m bs_id_sample_map.txt -v snv-consensus-plus-hotspots.maf.tsv.gz -s 1 -n openpedcan_v12` diff --git a/COLLABORATIONS/openTARGETS/append_maf_to_existing.py b/COLLABORATIONS/openTARGETS/append_maf_to_existing.py index 67d35a2..93c732f 100644 --- a/COLLABORATIONS/openTARGETS/append_maf_to_existing.py +++ b/COLLABORATIONS/openTARGETS/append_maf_to_existing.py @@ -82,6 +82,12 @@ v_idx = m_header.index("Variant_Classification") h_idx = m_header.index("Hugo_Symbol") + # need to also pop entrez ID if exists, as process_maf_entry() will do that to data + try: + m_header.pop(m_header.index("Entrez_Gene_Id")) + except Exception as e: + print(e, file=sys.stderr) + for i in range(len(m_header)): if m_header[i] in h_dict: h_dict[m_header[i]] = i @@ -90,15 +96,20 @@ to_print = [] datum = process_maf_entry(data, maf_exc, v_idx, h_idx, tid_idx, eid_idx, bs_cbio_key) # Set tumor barcode to cBio ID - if datum: - for item in header: - if h_dict[item] != None: - to_print.append(datum[h_dict[item]]) - else: - to_print.append("") - print("\t".join(to_print), file=append_maf) - else: - skipped += 1 + try: + if datum: + for item in header: + if h_dict[item] != None: + to_print.append(datum[h_dict[item]]) + else: + to_print.append("") + print("\t".join(to_print), file=append_maf) + else: + skipped += 1 + except Exception as e: + print (e) + pdb.set_trace() + hold = 1 sys.stderr.write("Processed " + maf_fn + "\n") sys.stderr.write("Skipped " + str(skipped) + " entries meeting exclusion criteria\n") diff --git a/COLLABORATIONS/openTARGETS/cnv_to_tables.py b/COLLABORATIONS/openTARGETS/cnv_to_tables.py index 588c7dd..c2af3fc 100644 --- a/COLLABORATIONS/openTARGETS/cnv_to_tables.py +++ b/COLLABORATIONS/openTARGETS/cnv_to_tables.py @@ -52,12 +52,13 @@ def collate_data(cnv_fn): ploidy = data[p_idx] cn = data[c_idx] try: - gistic = qual_to_gistic[data[s_idx]] + qual = data[s_idx] + if qual != "NA": + qual = qual.lower() + gistic = 
qual_to_gistic[qual] except Exception as e: - sys.stderr.write(str(e) + "\nInvalid value for gistic, skipping " + line.decode()) + sys.stderr.write(str(e) + "\nInvalid value for gistic, skipping " + line) continue - # pdb.set_trace() - # hold=1 if samp_id not in ploidy_dict: ploidy_dict[samp_id] = ploidy if gene not in cn_dict: From 993a5af9f9b9f98d9aa6f3847faaf8447d70bd86 Mon Sep 17 00:00:00 2001 From: Miguel Brown Date: Thu, 7 Mar 2024 15:00:45 -0500 Subject: [PATCH 08/13] :wrench: fixed DGD naming issue --- COLLABORATIONS/openTARGETS/pedcbio_sample_name_col.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/COLLABORATIONS/openTARGETS/pedcbio_sample_name_col.R b/COLLABORATIONS/openTARGETS/pedcbio_sample_name_col.R index 3fcddc1..130231a 100644 --- a/COLLABORATIONS/openTARGETS/pedcbio_sample_name_col.R +++ b/COLLABORATIONS/openTARGETS/pedcbio_sample_name_col.R @@ -119,7 +119,7 @@ message("Collated samples missing a cBio ID") #### Handle each cohort at a time - start with PBTA # get all sample IDs in the PBTA cohort sample_ids_pbta <- histology_df_no_format_id %>% - dplyr::filter(cohort == "PBTA", sub_cohort != "DGD") %>% + dplyr::filter(cohort == "PBTA" & sub_cohort != "DGD") %>% pull(sample_id) %>% unique() @@ -132,7 +132,7 @@ for (i in 1:length(sample_ids_pbta)){ # find the number of compositions each_specimen_need_tiebreak <- histology_df_no_format_id %>% - dplyr::filter(sample_type == "Tumor") %>% + dplyr::filter(sample_type == "Tumor" & sub_cohort != "DGD") %>% dplyr::filter(sample_id == sample_id_of_interest) %>% group_by(experimental_strategy) %>% dplyr::mutate(n_sample_type = n()) %>% From 63670e0b49a5f68807f8381461443ae01df59067 Mon Sep 17 00:00:00 2001 From: Miguel Brown Date: Fri, 8 Mar 2024 14:58:56 +0000 Subject: [PATCH 09/13] :wrench: fix var instantiation and config value --- COLLABORATIONS/openTARGETS/openpedcan_v15_case_meta_config.json | 2 +- scripts/organize_upload_packages.py | 2 +- 2 files changed, 2 insertions(+), 2 
deletions(-) diff --git a/COLLABORATIONS/openTARGETS/openpedcan_v15_case_meta_config.json b/COLLABORATIONS/openTARGETS/openpedcan_v15_case_meta_config.json index ada2b57..6d47197 100644 --- a/COLLABORATIONS/openTARGETS/openpedcan_v15_case_meta_config.json +++ b/COLLABORATIONS/openTARGETS/openpedcan_v15_case_meta_config.json @@ -94,7 +94,7 @@ } }, "data_sheets": { - "dir": "datasheets", + "dir": "study_build", "dtypes": { "patient": { "cbio_name": "data_clinical_patient.txt", diff --git a/scripts/organize_upload_packages.py b/scripts/organize_upload_packages.py index 000d35b..2d6ca0d 100755 --- a/scripts/organize_upload_packages.py +++ b/scripts/organize_upload_packages.py @@ -244,8 +244,8 @@ def create_case_lists(data_dict, output_dir): sys.stderr.write(out_dir + " already exists.\n") try: + study_id = config_data["study"]["cancer_study_identifier"] if os.path.isdir(config_data["data_sheets"]["dir"]): - study_id = config_data["study"]["cancer_study_identifier"] cur_dir = out_dir + config_data["study"]["cancer_study_identifier"] + "/" try: os.mkdir(cur_dir) From 554ec4e4192cd246e27b7fa1773562a155c831b3 Mon Sep 17 00:00:00 2001 From: Miguel Brown Date: Fri, 8 Mar 2024 10:45:29 -0500 Subject: [PATCH 10/13] :wrench: fixed bug in clinical data for pt id and efs status --- COLLABORATIONS/openTARGETS/clinical_to_datasheets.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/COLLABORATIONS/openTARGETS/clinical_to_datasheets.py b/COLLABORATIONS/openTARGETS/clinical_to_datasheets.py index 3cc55c8..4751828 100644 --- a/COLLABORATIONS/openTARGETS/clinical_to_datasheets.py +++ b/COLLABORATIONS/openTARGETS/clinical_to_datasheets.py @@ -219,6 +219,13 @@ def build_header(header_list, entry): elif header[i] == "tumor_descriptor": if value in tumor_descriptor_dict: value = tumor_descriptor_dict[value] + elif header[i] == "EFS_event_type": + if value == "Not Applicable": + value = "0:No Event" + elif value == "Not Reported": + value = "NA" + else: + value 
= "1:" + value # replace status with NA if value not acceptable elif header[i] == "OS_status": if value not in ["LIVING", "DECEASED", "NA"]: @@ -253,7 +260,7 @@ def build_header(header_list, entry): spec = id_mapping[samp_id][0] + ";" + id_mapping[samp_id][1] if bs_type[id_mapping[samp_id][0]] == "RNA": spec = id_mapping[samp_id][1] + ";" + id_mapping[samp_id][0] - samp_dict[samp_id][0] = spec + samp_dict[samp_id][1] = spec elif len(id_mapping[samp_id]) > 2: # QC check, only one or two biospec per sample ID, unless it's new DGD RNA + separate fusion biospecimen sys.stderr.write("Saw more than two biospecimens for " + samp_id + ": " + ",".join(id_mapping[samp_id]) + "\n") @@ -264,7 +271,7 @@ def build_header(header_list, entry): check_type[bs_type[bs_id]].append(bs_id) if len(check_type["DNA"]) == 1 and len(check_type["RNA"]) == 2: spec = ";".join(check_type["DNA"] + check_type["RNA"]) - samp_dict[samp_id][0] = spec + samp_dict[samp_id][1] = spec sys.stderr.write("Could be a DGD fusion + bulk RNA, may be ok\n") # exit(1) From ef3fd0f6886b823845538a9995eb2b8a8f64108f Mon Sep 17 00:00:00 2001 From: Miguel Brown Date: Fri, 8 Mar 2024 22:01:48 +0000 Subject: [PATCH 11/13] :wrench: fixed header file :hammer: rafactoreed and added bug fix for fusion input data --- .../openTARGETS/maf_openpedcan_v15_header.txt | 2 +- .../openpedcan_v15_case_meta_config.json | 2 +- .../openTARGETS/rename_filter_maf.py | 7 + scripts/convert_fusion_as_sv.py | 277 ++++++++++-------- 4 files changed, 167 insertions(+), 121 deletions(-) diff --git a/COLLABORATIONS/openTARGETS/maf_openpedcan_v15_header.txt b/COLLABORATIONS/openTARGETS/maf_openpedcan_v15_header.txt index 0c0a0bc..615c0a2 100644 --- a/COLLABORATIONS/openTARGETS/maf_openpedcan_v15_header.txt +++ b/COLLABORATIONS/openTARGETS/maf_openpedcan_v15_header.txt @@ -1,2 +1,2 @@ #version 2.4 -Hugo_Symbol Entrez_Gene_Id Center NCBI_Build Chromosome Start_Position End_Position Strand Variant_Classification Variant_Type Reference_Allele 
Tumor_Seq_Allele1 Tumor_Seq_Allele2 dbSNP_RS dbSNP_Val_Status Tumor_Sample_Barcode Matched_Norm_Sample_Barcode Match_Norm_Seq_Allele1 Match_Norm_Seq_Allele2 Tumor_Validation_Allele1 Tumor_Validation_Allele2 Match_Norm_Validation_Allele1 Match_Norm_Validation_Allele2 Verification_Status Validation_Status Mutation_Status Sequencing_Phase Sequence_Source Validation_Method Score BAM_File Sequencer Tumor_Sample_UUID Matched_Norm_Sample_UUID HGVSc HGVSp HGVSp_Short Transcript_ID Exon_Number t_depth t_ref_count t_alt_count n_depth n_ref_count n_alt_count all_effects Allele Gene Feature Feature_type Consequence cDNA_position CDS_position Protein_position Amino_acids Codons Existing_variation ALLELE_NUM DISTANCE STRAND_VEP SYMBOL SYMBOL_SOURCE HGNC_ID BIOTYPE CANONICAL CCDS ENSP SWISSPROT TREMBL UNIPARC RefSeq SIFT PolyPhen EXON INTRON DOMAINS AF AFR_AF AMR_AF ASN_AF EAS_AF EUR_AF SAS_AF AA_AF EA_AF CLIN_SIG SOMATIC PUBMED MOTIF_NAME MOTIF_POS HIGH_INF_POS MOTIF_SCORE_CHANGE IMPACT PICKVARIANT_CLASS TSL HGVS_OFFSET PHENO MINIMISED GENE_PHENO FILTER flanking_bps vcf_id vcf_qual gnomAD_AF gnomAD_AFR_AF gnomAD_AMR_AF gnomAD_ASJ_AF gnomAD_EAS_AF gnomAD_FIN_AF gnomAD_NFE_AF gnomAD_OTH_AF gnomAD_SAS_AF HGVSg vcf_pos gnomad_3_1_1_AC gnomad_3_1_1_AN gnomad_3_1_1_AF gnomad_3_1_1_nhomalt gnomad_3_1_1_AC_popmax gnomad_3_1_1_AN_popmax gnomad_3_1_1_AF_popmax gnomad_3_1_1_nhomalt_popmax gnomad_3_1_1_AC_controls_and_biobanks gnomad_3_1_1_AN_controls_and_biobanks gnomad_3_1_1_AF_controls_and_biobanks gnomad_3_1_1_AF_non_cancer gnomad_3_1_1_primate_ai_score gnomad_3_1_1_splice_ai_consequence MQ MQ0 CAL HotSpotAllele +Hugo_Symbol Entrez_Gene_Id Center NCBI_Build Chromosome Start_Position End_Position Strand Variant_Classification Variant_Type Reference_Allele Tumor_Seq_Allele1 Tumor_Seq_Allele2 dbSNP_RS dbSNP_Val_Status Tumor_Sample_Barcode Matched_Norm_Sample_Barcode Match_Norm_Seq_Allele1 Match_Norm_Seq_Allele2 Tumor_Validation_Allele1 Tumor_Validation_Allele2 Match_Norm_Validation_Allele1 
Match_Norm_Validation_Allele2 Verification_Status Validation_Status Mutation_Status Sequencing_Phase Sequence_Source Validation_Method Score BAM_File Sequencer Tumor_Sample_UUID Matched_Norm_Sample_UUID HGVSc HGVSp HGVSp_Short Transcript_ID Exon_Number t_depth t_ref_count t_alt_count n_depth n_ref_count n_alt_count all_effects Allele Gene Feature Feature_type Consequence cDNA_position CDS_position Protein_position Amino_acids Codons Existing_variation ALLELE_NUM DISTANCE STRAND_VEP SYMBOL SYMBOL_SOURCE HGNC_ID BIOTYPE CANONICAL CCDS ENSP SWISSPROT TREMBL UNIPARC RefSeq SIFT PolyPhen EXON INTRON DOMAINS AF AFR_AF AMR_AF ASN_AF EAS_AF EUR_AF SAS_AF AA_AF EA_AF CLIN_SIG SOMATIC PUBMED MOTIF_NAME MOTIF_POS HIGH_INF_POS MOTIF_SCORE_CHANGE IMPACT PICK VARIANT_CLASS TSL HGVS_OFFSET PHENO MINIMISED GENE_PHENO FILTER flanking_bps vcf_id vcf_qual gnomAD_AF gnomAD_AFR_AF gnomAD_AMR_AF gnomAD_ASJ_AF gnomAD_EAS_AF gnomAD_FIN_AF gnomAD_NFE_AF gnomAD_OTH_AF gnomAD_SAS_AF HGVSg vcf_pos gnomad_3_1_1_AC gnomad_3_1_1_AN gnomad_3_1_1_AF gnomad_3_1_1_nhomalt gnomad_3_1_1_AC_popmax gnomad_3_1_1_AN_popmax gnomad_3_1_1_AF_popmax gnomad_3_1_1_nhomalt_popmax gnomad_3_1_1_AC_controls_and_biobanks gnomad_3_1_1_AN_controls_and_biobanks gnomad_3_1_1_AF_controls_and_biobanks gnomad_3_1_1_AF_non_cancer gnomad_3_1_1_primate_ai_score gnomad_3_1_1_splice_ai_consequence MQ MQ0 CAL HotSpotAllele diff --git a/COLLABORATIONS/openTARGETS/openpedcan_v15_case_meta_config.json b/COLLABORATIONS/openTARGETS/openpedcan_v15_case_meta_config.json index 6d47197..c5c01bc 100644 --- a/COLLABORATIONS/openTARGETS/openpedcan_v15_case_meta_config.json +++ b/COLLABORATIONS/openTARGETS/openpedcan_v15_case_meta_config.json @@ -115,7 +115,7 @@ }, "study": { "_comment": "see https://docs.cbioportal.org/5.1-data-loading/data-loading/file-formats#cancer-study for detailed specifics", - "description": "OpenPedCan is a collaborative project between the National Cancer Institute and the Children's Hospital of Philadelphia as 
part of the NCI's Childhood Cancer Data Initiative. Here, we harmonize pan-cancer data using KidsFirst Data Resource Center workflows and harness OpenPBTA analytics workflows to scale and add modules across pediatric cancer datasets. This data has been integrated into the pediatric open targets platform to assist in development and query of the FDA's Relevant Pediatric Molecular Targets List (PMTL) to identify new therapeutics for children with cancer. This is the v15 release of this effort, for v11, the latest accepted production release please see OpenPedCan v11. For study release details, please see Release Notes", + "description": "OpenPedCan is a collaborative project between the National Cancer Institute and the Children's Hospital of Philadelphia as part of the NCI's Childhood Cancer Data Initiative. Here, we harmonize pan-cancer data using KidsFirst Data Resource Center workflows and harness OpenPBTA analytics workflows to scale and add modules across pediatric cancer datasets. This data has been integrated into the pediatric open targets platform to assist in development and query of the FDA's Relevant Pediatric Molecular Targets List (PMTL) to identify new therapeutics for children with cancer. This is the v15 release of this effort. V11 previous prod release here. 
For study release details, please see Release Notes", "groups": "PUBLIC", "cancer_study_identifier": "openpedcan_v15", "reference_genome": "hg38", diff --git a/COLLABORATIONS/openTARGETS/rename_filter_maf.py b/COLLABORATIONS/openTARGETS/rename_filter_maf.py index 5636ba0..227c57b 100644 --- a/COLLABORATIONS/openTARGETS/rename_filter_maf.py +++ b/COLLABORATIONS/openTARGETS/rename_filter_maf.py @@ -67,6 +67,9 @@ def process_maf_entry(data, maf_exc, v_idx, h_idx, tid_idx, eid_idx, map_dict): h_idx = header.index('Hugo_Symbol') eid_idx = header.index('Entrez_Gene_Id') header.pop(eid_idx) + # bug fix for OpenPedCan, position will be one less after process_maf_entry + n_ref_ct_idx = header.index('n_ref_count') + n_ref_alt_idx = header.index('n_alt_count') print("\t".join(header), file=maf_out) sys.stderr.write("Filtering entries and renaming samples\n") @@ -74,6 +77,10 @@ def process_maf_entry(data, maf_exc, v_idx, h_idx, tid_idx, eid_idx, map_dict): for line in maf_file: to_print = process_maf_entry(line.decode(), maf_exc, v_idx, h_idx, tid_idx, eid_idx, map_dict) if to_print: + # bug fix for maf format in OpenPedCan + for i in [n_ref_ct_idx, n_ref_alt_idx]: + if to_print[i] == "NA": + to_print[i] = "" print("\t".join(to_print), file=maf_out) sys.stderr.write("Fin.\n") diff --git a/scripts/convert_fusion_as_sv.py b/scripts/convert_fusion_as_sv.py index 0b7c35c..ab99fe2 100755 --- a/scripts/convert_fusion_as_sv.py +++ b/scripts/convert_fusion_as_sv.py @@ -14,7 +14,159 @@ import pdb -if __name__ == "__main__": +def collapse_and_format(fusion_data): + # Sort as a cheat to easily select preferred annot later + print("Sorting for collapse by caller and counts", file=sys.stderr) + fusion_data = fusion_data.sort_values(["Sample", "LeftBreakpoint", "RightBreakpoint", "Caller"]) + # Merge rows that have the exact same fusion from different callers - thanks Natasha! 
+ key_cols = ["Sample","Gene1A","LeftBreakpoint","Gene1B","RightBreakpoint","FusionName","Fusion_anno"] + print("Grouping for collapse and counts", file=sys.stderr) + fusion_data["groupby_key"] = fusion_data.apply( + lambda row: "\t".join([str(row[col]) for col in key_cols]), + axis=1 + ) + # "JunctionReadCount","SpanningFragCount","annots" + collapsed_list = [] + + for g in fusion_data.groupby(by="groupby_key"): + values, df_group = g + df_group["Caller"] = ",".join(set(df_group["Caller"].tolist())) + df_group["JunctionReadCount"] = df_group["JunctionReadCount"].mean() + df_group["SpanningFragCount"] = df_group["SpanningFragCount"].mean() + # Go with the ceiling of the mean + df_group["JunctionReadCount"] = df_group["JunctionReadCount"].apply(np.ceil) + df_group["SpanningFragCount"] = df_group["SpanningFragCount"].apply(np.ceil) + collapsed_list.append(df_group[key_cols + [ "JunctionReadCount","SpanningFragCount","annots", "Fusion_Type", "Caller"]].head(1)) + del fusion_data + fusion_data_collapsed = pd.concat(collapsed_list) + del collapsed_list + # Should be int + fusion_data_collapsed["JunctionReadCount"] = fusion_data_collapsed["JunctionReadCount"].astype(int) + fusion_data_collapsed["SpanningFragCount"] = fusion_data_collapsed["SpanningFragCount"].astype(int) + + + fusion_data_collapsed["Caller"] = fusion_data_collapsed["Caller"].str.upper() + return fusion_data_collapsed + + +def openx_annot_bug(fusion_data): + # Sort as a cheat to easily select preferred annot later + print("Sorting for collapse bug fix", file=sys.stderr) + fusion_data = fusion_data.sort_values(["Sample", "LeftBreakpoint", "RightBreakpoint", "Caller"]) + # Merge rows that have the exact same fusion from different callers - thanks Natasha! 
+ key_cols = ["Sample","Gene1A","LeftBreakpoint","Gene1B","RightBreakpoint","FusionName","Fusion_anno"] + remain_cols = list(set(fusion_data.columns.to_list()) - set(key_cols)) + print("Grouping for collapse bug fix", file=sys.stderr) + fusion_data["groupby_key"] = fusion_data.apply( + lambda row: "\t".join([str(row[col]) for col in key_cols]), + axis=1 + ) + # "JunctionReadCount","SpanningFragCount","annots" + collapsed_list = [] + print("Collapsing annotations for bug fix", file=sys.stderr) + for g in fusion_data.groupby(by="groupby_key"): + values, df_group = g + df_group["Caller"] = ",".join(set(df_group["Caller"].tolist())) + df_group["Gene1A_anno"] = ", ".join(set(df_group["Gene1A_anno"].tolist())) + df_group["Gene1B_anno"] = ", ".join(set(df_group["Gene1B_anno"].tolist())) + collapsed_list.append(df_group[key_cols + remain_cols].head(1)) + del fusion_data + fusion_data_collapsed = pd.concat(collapsed_list) + del collapsed_list + # Should be int + fusion_data_collapsed["Caller"] = fusion_data_collapsed["Caller"].str.upper() + print("Bug fix completed", file=sys.stderr) + return fusion_data_collapsed + + +def filter_and_format_annots(sample_renamed_df, drop_low): + """ + Applies a filter to remove entries with ARRIBA confidence "low" when set, then formats annots file to include Gene1A_anno and Gene1B_anno + """ + if drop_low: + sample_renamed_df = sample_renamed_df[sample_renamed_df.Confidence != 'low'] + else: + # not drop low is a OpenX feature, applt repeat annotation bug fix + print("Applying OpenX annot bug fix", file=sys.stderr) + sample_renamed_df = openx_annot_bug(sample_renamed_df) + sample_renamed_df["annots"] = sample_renamed_df.apply( + lambda row: "Gene1: " + ",".join( + set(list(row["Gene1A_anno"].split(", ")))) + + "; Gene2: " + ",".join(set(list(row["Gene1B_anno"].split(", "))) + ), + axis=1 + ) + ";" + sample_renamed_df["annots"] + return sample_renamed_df + + +def init_cbio_master(fusion_results, mode, rna_metadata): + """ + Use data frame 
subset on RNA fusion files to find and merge result files + """ + desired = [ + "Sample", + "Gene1A", + "LeftBreakpoint", + "Gene1B", + "RightBreakpoint", + "Fusion_Type", + "JunctionReadCount", + "SpanningFragCount", + "annots", + "FusionName", + "Fusion_anno", + "Caller" + ] + if mode == "openX" or mode == "dgd": + openx_data = pd.read_csv(fusion_results, sep="\t", keep_default_na=False, na_values=[""]) + # Merge so that sample names can be cBio names - thanks Natasha! + merged = pd.merge( + openx_data, rna_metadata[["T_CL_BS_ID", "Cbio_Tumor_Name"]], left_on="Sample", right_on="T_CL_BS_ID", how="left" + ) + merged["Sample"] = merged.apply( + lambda row: row["Cbio_Tumor_Name"], axis=1 + ) + # OpenX data may not have all annoFuse cols + present = [] + # openPBTA...and maybe open pedcan uses this + if 'CalledBy' in merged.columns: + merged.rename( + columns={"CalledBy": "Caller"}, + inplace=True + ) + elif mode == "dgd": + merged["Caller"] = "Archer" + # Also merge existing annotations in Gene1A_anno, Gene1B_anno into annots + merged = filter_and_format_annots(merged, False) + for col in desired: + if col in merged.columns: + present.append(col) + openx_data = merged[present] + # only if read counts there, collapse + if "JunctionReadCount" in openx_data.columns: + openx_data = collapse_and_format(openx_data) + return openx_data, present + else: + flist = rna_metadata.File_Name + frame_list = [] + for i in range(0, len(flist), 1): + try: + # concat annofuse file, rename Sample Column according to cBio name + ann_file = pd.read_csv(fusion_results + "/" + flist[i], sep="\t", keep_default_na=False, na_values=[""]) + ann_file = ann_file.assign(Sample=rna_metadata.at[i, "Cbio_Tumor_Name"]) + frame_list.append(ann_file) + except Exception as e: + sys.stderr.write(str(e) + '\n') + exit(1) + concat_frame = pd.concat(frame_list) + concat_frame = filter_and_format_annots(concat_frame, True) + del frame_list + fusion_data = concat_frame[desired] + del concat_frame + return 
collapse_and_format(fusion_data), desired + + +def main(): parser = argparse.ArgumentParser( description="Convert openPBTA fusion table OR list of annofuse files to cbio format." ) @@ -67,124 +219,6 @@ + "\n" ) exit(1) - - - def collapse_and_format(fusion_data): - # Sort as a cheat to easily select preferred annot later - fusion_data = fusion_data.sort_values(["Sample", "LeftBreakpoint", "RightBreakpoint", "Caller"]) - # Merge rows that have the exact same fusion from different callers - thanks Natasha! - key_cols = ["Sample","Gene1A","LeftBreakpoint","Gene1B","RightBreakpoint","FusionName","Fusion_anno"] - fusion_data["groupby_key"] = fusion_data.apply( - lambda row: "\t".join([str(row[col]) for col in key_cols]), - axis=1 - ) - # "JunctionReadCount","SpanningFragCount","annots" - collapsed_list = [] - - for g in fusion_data.groupby(by="groupby_key"): - values, df_group = g - df_group["Caller"] = ",".join(set(df_group["Caller"].tolist())) - df_group["JunctionReadCount"] = df_group["JunctionReadCount"].mean() - df_group["SpanningFragCount"] = df_group["SpanningFragCount"].mean() - # Go with the ceiling of the mean - df_group["JunctionReadCount"] = df_group["JunctionReadCount"].apply(np.ceil) - df_group["SpanningFragCount"] = df_group["SpanningFragCount"].apply(np.ceil) - collapsed_list.append(df_group[key_cols + [ "JunctionReadCount","SpanningFragCount","annots", "Fusion_Type", "Caller"]].head(1)) - del fusion_data - fusion_data_collapsed = pd.concat(collapsed_list) - del collapsed_list - # Should be int - fusion_data_collapsed["JunctionReadCount"] = fusion_data_collapsed["JunctionReadCount"].astype(int) - fusion_data_collapsed["SpanningFragCount"] = fusion_data_collapsed["SpanningFragCount"].astype(int) - - - fusion_data_collapsed["Caller"] = fusion_data_collapsed["Caller"].str.upper() - return fusion_data_collapsed - - - - def filter_and_format_annots(sample_renamed_df, drop_low): - """ - Applies a filter to remove entries with ARRIBA confidence "low" when set, 
then formats annots file to include Gene1A_anno and Gene1B_anno - """ - if drop_low: - sample_renamed_df = sample_renamed_df[sample_renamed_df.Confidence != 'low'] - sample_renamed_df["annots"] = sample_renamed_df.apply( - lambda row: "Gene1: " + ",".join( - set(list(row["Gene1A_anno"].split(", ")))) - + "; Gene2: " + ",".join(set(list(row["Gene1B_anno"].split(", "))) - ), - axis=1 - ) + ";" + sample_renamed_df["annots"] - return sample_renamed_df - - - def init_cbio_master(fusion_results, mode, rna_metadata): - """ - Use data frame subset on RNA fusion files to find and merge result files - """ - desired = [ - "Sample", - "Gene1A", - "LeftBreakpoint", - "Gene1B", - "RightBreakpoint", - "Fusion_Type", - "JunctionReadCount", - "SpanningFragCount", - "annots", - "FusionName", - "Fusion_anno", - "Caller" - ] - if mode == "openX" or mode == "dgd": - openx_data = pd.read_csv(fusion_results, sep="\t", keep_default_na=False, na_values=[""]) - # Merge so that sample names can be cBio names - thanks Natasha! 
- merged = pd.merge( - openx_data, rna_metadata[["T_CL_BS_ID", "Cbio_Tumor_Name"]], left_on="Sample", right_on="T_CL_BS_ID", how="left" - ) - merged["Sample"] = merged.apply( - lambda row: row["Cbio_Tumor_Name"], axis=1 - ) - # OpenX data may not have all annoFuse cols - present = [] - # openPBTA...and maybe open pedcan uses this - if 'CalledBy' in merged.columns: - merged.rename( - columns={"CalledBy": "Caller"}, - inplace=True - ) - elif mode == "dgd": - merged["Caller"] = "Archer" - # Also merge existing annotations in Gene1A_anno, Gene1B_anno into annots - merged = filter_and_format_annots(merged, False) - for col in desired: - if col in merged.columns: - present.append(col) - openx_data = merged[present] - # only if read counts there, collapse - if "JunctionReadCount" in openx_data.columns: - openx_data = collapse_and_format(openx_data) - return openx_data, present - else: - flist = rna_metadata.File_Name - frame_list = [] - for i in range(0, len(flist), 1): - try: - # concat annofuse file, rename Sample Column according to cBio name - ann_file = pd.read_csv(fusion_results + "/" + flist[i], sep="\t", keep_default_na=False, na_values=[""]) - ann_file = ann_file.assign(Sample=rna_metadata.at[i, "Cbio_Tumor_Name"]) - frame_list.append(ann_file) - except Exception as e: - sys.stderr.write(str(e) + '\n') - exit(1) - concat_frame = pd.concat(frame_list) - concat_frame = filter_and_format_annots(concat_frame, True) - del frame_list - fusion_data = concat_frame[desired] - del concat_frame - return collapse_and_format(fusion_data), desired - out_dir = args.out_dir try: os.mkdir(out_dir) @@ -317,3 +351,8 @@ def init_cbio_master(fusion_results, mode, rna_metadata): fus_tbl = fus_tbl[existing.columns] fus_tbl.set_index("Sample_Id", inplace=True) fus_tbl.to_csv(fus_fname, sep="\t", mode="a", index=True, quoting=csv.QUOTE_NONE, header=None) + + +if __name__ == "__main__": + main() + From 4cecd9e0ec796bebb7346906a624b640d998c097 Mon Sep 17 00:00:00 2001 From: Miguel Brown 
Date: Mon, 11 Mar 2024 15:39:29 +0000 Subject: [PATCH 12/13] :wrench: apply bug fix to the right script --- COLLABORATIONS/openTARGETS/append_maf_to_existing.py | 7 +++++++ COLLABORATIONS/openTARGETS/rename_filter_maf.py | 8 -------- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/COLLABORATIONS/openTARGETS/append_maf_to_existing.py b/COLLABORATIONS/openTARGETS/append_maf_to_existing.py index 93c732f..9461ca0 100644 --- a/COLLABORATIONS/openTARGETS/append_maf_to_existing.py +++ b/COLLABORATIONS/openTARGETS/append_maf_to_existing.py @@ -87,6 +87,9 @@ m_header.pop(m_header.index("Entrez_Gene_Id")) except Exception as e: print(e, file=sys.stderr) + # bug fix for OpenPedCan, position will be one less after process_maf_entry + n_ref_ct_idx = m_header.index('n_ref_count') + n_alt_ct_idx = m_header.index('n_alt_count') for i in range(len(m_header)): if m_header[i] in h_dict: @@ -103,6 +106,10 @@ to_print.append(datum[h_dict[item]]) else: to_print.append("") + # bug fix for maf format in OpenPedCan + for i in [n_ref_ct_idx, n_alt_ct_idx]: + if to_print[i] == "NA": + to_print[i] = "" print("\t".join(to_print), file=append_maf) else: skipped += 1 diff --git a/COLLABORATIONS/openTARGETS/rename_filter_maf.py b/COLLABORATIONS/openTARGETS/rename_filter_maf.py index 227c57b..8d24c47 100644 --- a/COLLABORATIONS/openTARGETS/rename_filter_maf.py +++ b/COLLABORATIONS/openTARGETS/rename_filter_maf.py @@ -67,9 +67,6 @@ def process_maf_entry(data, maf_exc, v_idx, h_idx, tid_idx, eid_idx, map_dict): h_idx = header.index('Hugo_Symbol') eid_idx = header.index('Entrez_Gene_Id') header.pop(eid_idx) - # bug fix for OpenPedCan, position will be one less after process_maf_entry - n_ref_ct_idx = header.index('n_ref_count') - n_ref_alt_idx = header.index('n_alt_count') print("\t".join(header), file=maf_out) sys.stderr.write("Filtering entries and renaming samples\n") @@ -77,11 +74,6 @@ def process_maf_entry(data, maf_exc, v_idx, h_idx, tid_idx, eid_idx, map_dict): for line in 
maf_file: to_print = process_maf_entry(line.decode(), maf_exc, v_idx, h_idx, tid_idx, eid_idx, map_dict) if to_print: - # bug fix for maf format in OpenPedCan - for i in [n_ref_ct_idx, n_ref_alt_idx]: - if to_print[i] == "NA": - to_print[i] = "" print("\t".join(to_print), file=maf_out) - sys.stderr.write("Fin.\n") maf_out.close() \ No newline at end of file From 3874f3dec78e3696031a39100521e743afc05a0f Mon Sep 17 00:00:00 2001 From: Miguel Brown Date: Thu, 14 Mar 2024 14:07:44 +0000 Subject: [PATCH 13/13] :pencil: update docs and config --- COLLABORATIONS/openTARGETS/README.md | 9 +++++++-- .../openTARGETS/openpedcan_v15_case_meta_config.json | 2 +- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/COLLABORATIONS/openTARGETS/README.md b/COLLABORATIONS/openTARGETS/README.md index b67930d..9871ef9 100644 --- a/COLLABORATIONS/openTARGETS/README.md +++ b/COLLABORATIONS/openTARGETS/README.md @@ -20,6 +20,7 @@ fusion-dgd.tsv.gz fusion-putative-oncogenic.tsv gene-expression-rsem-tpm-collapsed.rds tcga_gene-expression-rsem-tpm-collapsed.rds +gtex_gene-expression-rsem-tpm-collapsed.rds snv-consensus-plus-hotspots.maf.tsv.gz snv-mutect2-tumor-only-plus-hotspots.maf.tsv.gz ``` @@ -92,7 +93,11 @@ TCGA data are kept in a seprate matrix from everything else. 
We need to merge th ```sh Rscript COLLABORATIONS/openTARGETS/merge_rsem_rds.R --first_file gene-expression-rsem-tpm-collapsed.rds --second_file tcga-gene-expression-rsem-tpm-collapsed.rds --output_fn gene_tcga_expression_common_merge.rds ``` - +UPDATE: GTEx is also in a separate matrix, so run the merge script again to make the "final" merge before conversion +```sh +Rscript COLLABORATIONS/openTARGETS/merge_rsem_rds.R --first_file gene_tcga_expression_common_merge.rds --second_file gtex_gene-expression-rsem-tpm-collapsed.rds --output_fn gene_tcga_gtex_expression_common_merge.rds +``` + ### File Transformation It's recommended to put datasheets in a dir called `datasheets`, downloaded files in it's own dir (in v12 it's `GF_INPUTS`) and the rest of the processed outputs into it's own dir (`study_build` for v12) to keep things sane and also be able to leverage existing study build script in `scripts/organize_upload_packages.py` @@ -195,7 +200,7 @@ Options: Show this help message and exit ``` Example run: -`Rscript COLLABORATIONS/openTARGETS/rename_export_rsem.R --rna_rds gene_tcga_expression_common_merge.rds --map_id bs_id_sample_map.txt --type openpedcan_v11 --computeZscore R 2> rna_convert.errs` +`Rscript COLLABORATIONS/openTARGETS/rename_export_rsem.R --rna_rds gene_tcga_gtex_expression_common_merge.rds --map_id bs_id_sample_map.txt --type openpedcan_v15 --computeZscore R 2> rna_convert.errs` #### 5. 
scripts/convert_fusion_as_sv.py diff --git a/COLLABORATIONS/openTARGETS/openpedcan_v15_case_meta_config.json b/COLLABORATIONS/openTARGETS/openpedcan_v15_case_meta_config.json index c5c01bc..805d5ca 100644 --- a/COLLABORATIONS/openTARGETS/openpedcan_v15_case_meta_config.json +++ b/COLLABORATIONS/openTARGETS/openpedcan_v15_case_meta_config.json @@ -119,7 +119,7 @@ "groups": "PUBLIC", "cancer_study_identifier": "openpedcan_v15", "reference_genome": "hg38", - "display_name": "Open Pediatric Cancer (OpenPedCan) Project v14", + "display_name": "Open Pediatric Cancer (OpenPedCan) Project v15", "type_of_cancer": "mixed", "short_name": "openpedcan_v15"