From c8404f5fd4a634304d5e8d77ab5f9f4ad97fdda7 Mon Sep 17 00:00:00 2001 From: Tiago Silva Date: Sat, 6 May 2023 11:00:13 -0400 Subject: [PATCH 1/9] Starting to remove legacy archive from TCGAbiolinks, since it will be shut down by GDC --- DESCRIPTION | 2 +- NEWS | 5 + R/clinical.R | 9 +- R/download.R | 21 +-- R/internal.R | 145 ++++++------------ R/prepare.R | 92 +++++------ R/query.R | 102 ++++--------- R/visualize.R | 2 - man/GDCdownload.Rd | 13 +- man/GDCquery.Rd | 39 +---- man/TCGAvisualize_oncoprint.Rd | 1 - tests/testthat/test-prepare-download.R | 103 +++++-------- tests/testthat/test-query-clinical.R | 4 +- tests/testthat/test-query.R | 94 +++--------- vignettes/download_prepare.Rmd | 201 +------------------------ vignettes/query.Rmd | 149 ++---------------- 16 files changed, 219 insertions(+), 763 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 31d9c76d3..f3358f98a 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: TCGAbiolinks Type: Package Title: TCGAbiolinks: An R/Bioconductor package for integrative analysis with GDC data -Version: 2.29.0 +Version: 2.29.1 Date: 2022-17-08 Author: Antonio Colaprico, Tiago Chedraoui Silva, diff --git a/NEWS b/NEWS index 61ec4e710..458f3daa1 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,8 @@ +CHANGES IN VERSION 2.29.1 +------------------------- + +* Removing support for the legacy archive since it will be shut down by GDC soon. 
+ CHANGES IN VERSION 2.21.1 ------------------------- diff --git a/R/clinical.R b/R/clinical.R index 6773831cc..1419a5a13 100644 --- a/R/clinical.R +++ b/R/clinical.R @@ -344,13 +344,16 @@ GDCquery_clinic <- function( } else { # HTMCP-03-06-02061 has two diagnosis x$submitter_id <- gsub("_diagnosis.*","",x$submitter_id) + # If there are two rows for the same submitter_id + # we will collapse them into one single row + # concatanating all columns using ; aux <- x %>% dplyr::group_by(submitter_id) %>% - dplyr::summarise_each(funs(paste(unique(.), collapse = ";"))) + summarise(across(everything(),~ paste(unique(.), collapse = ";"))) aux$treatments <- list(dplyr::bind_rows(x$treatments)) aux } } - ),fill = T + ), fill = TRUE ) #df$submitter_id <- gsub("^d|_diagnosis|diag-|-DX|-DIAG|-diagnosis","", df$submitter_id) # ^d ORGANOID-PANCREATIC @@ -500,7 +503,7 @@ GDCprepare_clinic <- function( } # Get all the clincal xml files - source <- ifelse(query$legacy,"legacy","harmonized") + source <- "harmonized" files <- file.path( query$results[[1]]$project, source, gsub(" ","_",query$results[[1]]$data_category), diff --git a/R/download.R b/R/download.R index e0dc437b1..be812ff9f 100644 --- a/R/download.R +++ b/R/download.R @@ -16,15 +16,6 @@ #' @importFrom methods is #' @export #' @examples -#' query <- GDCquery( -#' project = "TCGA-ACC", -#' data.category = "Copy number variation", -#' legacy = TRUE, -#' file.type = "hg19.seg", -#' barcode = c("TCGA-OR-A5LR-01A-11D-A29H-01", "TCGA-OR-A5LJ-10A-01D-A29K-01") -#' ) -#' # data will be saved in GDCdata/TCGA-ACC/legacy/Copy_number_variation/Copy_number_segmentation -#' GDCdownload(query, method = "api") #' \dontrun{ #' # Download clinical data from XML #' query <- GDCquery(project = "TCGA-COAD", data.category = "Clinical") @@ -39,14 +30,14 @@ #' # data will be saved in: #' # example_data_dir/TARGET-AML/harmonized/Transcriptome_Profiling/miRNA_Expression_Quantification #' GDCdownload(query, method = "client", directory = 
"example_data_dir") -#' acc.gbm <- GDCquery( +#' query_acc_gbm <- GDCquery( #' project = c("TCGA-ACC","TCGA-GBM"), #' data.category = "Transcriptome Profiling", #' data.type = "Gene Expression Quantification", #' workflow.type = "STAR - Counts" #' ) #' GDCdownload( -#' query = acc.gbm, +#' query = query_acc_gbm, #' method = "api", #' directory = "example", #' files.per.chunk = 50 @@ -73,7 +64,7 @@ GDCdownload <- function( stop("We can only download one data type. Please use data.type argument in GDCquery to filter results.") } - source <- ifelse(query$legacy,"legacy","harmonized") + source <- "harmonized" dir.create(directory, showWarnings = FALSE, recursive = TRUE) for(proj in unique(unlist(query$project))){ @@ -152,11 +143,7 @@ GDCdownload <- function( ) } - server <- ifelse( - query$legacy, - "https://api.gdc.cancer.gov/legacy/data/", - "https://api.gdc.cancer.gov/data/" - ) + server <- "https://api.gdc.cancer.gov/data/" if (is.null(files.per.chunk) & sum(as.numeric(manifest$size)) > 10^9) { message("The total size of files is big. 
We will download files in chunks") diff --git a/R/internal.R b/R/internal.R index bea2f02d5..dd35d78a8 100644 --- a/R/internal.R +++ b/R/internal.R @@ -67,107 +67,52 @@ checkProjectInput <- function(project){ } } -checkLegacyPlatform <- function(project,data.category, legacy = FALSE){ - project.summary <- getProjectSummary(project, legacy) - if(missing(data.category)) { - print(knitr::kable(project.summary$data_categories)) - stop("Please set a data.category argument from the column data_category above") - } - if(!(data.category %in% project.summary$data_categories$data_category)) { - print(knitr::kable(project.summary$data_categories)) - stop("Please set a valid data.category argument from the column data_category above") - } -} +checkDataTypeInput <- function(data.type){ + + harmonized.data.type <- c( + "Aggregated Somatic Mutation", + "Aligned Reads", + "Gene Expression Quantification", + "Raw CGI Variant", + "Methylation Beta Value", + "Differential Gene Expression", + "Splice Junction Quantification", + "Protein Expression Quantification", + "Annotated Somatic Mutation", + "Raw Simple Somatic Mutation", + "Masked Somatic Mutation", + "Copy Number Segment", + "Masked Intensities", + "Allele-specific Copy Number Segment", + "Masked Copy Number Segment", + "Isoform Expression Quantification", + "miRNA Expression Quantification", + "Gene Level Copy Number", + "Biospecimen Supplement", + "Gene Level Copy Number Scores", + "Protein Expression Quantification", + "Clinical Supplement", + "Single Cell Analysis", + "Masked Somatic Mutation", + "Slide Image" + ) -checkDataTypeInput <- function(legacy, data.type){ - if(legacy){ - legacy.data.type <- c("Copy number segmentation", - "Raw intensities", - "Aligned reads", - "Copy number estimate", - "Simple nucleotide variation", - "Gene expression quantification", - "Coverage WIG", - "miRNA gene quantification", - "Genotypes", - "miRNA isoform quantification", - "Normalized copy numbers", - "Isoform expression 
quantification", - "Normalized intensities", - "Tissue slide image", - "Exon quantification", - "Exon junction quantification", - "Methylation beta value", - "Unaligned reads", - "Diagnostic image", - "CGH array QC", - "Biospecimen Supplement", - "Pathology report", - "Clinical Supplement", - "Intensities", - "Protein expression quantification", - "Microsatellite instability", - "Structural variation", - "Auxiliary test", - "Copy number QC metrics", - "Intensities Log2Ratio", - "Methylation array QC metrics", - "Clinical data", - "Copy number variation", - "ABI sequence trace", - "Protein Expression Quantification", - "Biospecimen data", - "Simple somatic mutation", - "Bisulfite sequence alignment", - "Methylation percentage", - "Sequencing tag", - "Sequencing tag counts", - "LOH") - if(!data.type %in% legacy.data.type) { - print(knitr::kable(as.data.frame(sort(legacy.data.type)))) - stop("Please set a data.type argument from the column legacy.data.type above") - } - } else { - harmonized.data.type <- c( - "Aggregated Somatic Mutation", - "Aligned Reads", - "Gene Expression Quantification", - "Raw CGI Variant", - "Methylation Beta Value", - "Differential Gene Expression", - "Splice Junction Quantification", - "Protein Expression Quantification", - "Annotated Somatic Mutation", - "Raw Simple Somatic Mutation", - "Masked Somatic Mutation", - "Copy Number Segment", - "Masked Intensities", - "Allele-specific Copy Number Segment", - "Masked Copy Number Segment", - "Isoform Expression Quantification", - "miRNA Expression Quantification", - "Gene Level Copy Number", - "Biospecimen Supplement", - "Gene Level Copy Number Scores", - "Protein Expression Quantification", - "Clinical Supplement", - "Single Cell Analysis", - "Masked Somatic Mutation", - "Slide Image") - if(!data.type %in% harmonized.data.type) { - print(knitr::kable(as.data.frame(sort(harmonized.data.type)))) - stop("Please set a data.type argument from the column harmonized.data.type above") - } + if 
(!data.type %in% harmonized.data.type) { + print(knitr::kable(as.data.frame(sort(harmonized.data.type)))) + stop("Please set a data.type argument from the column harmonized.data.type above") } } -checkDataCategoriesInput <- function(project,data.category, legacy = FALSE){ +checkDataCategoriesInput <- function(project,data.category){ + for(proj in project){ - project.summary <- getProjectSummary(proj, legacy) + + project.summary <- getProjectSummary(proj) if(missing(data.category)) { print(knitr::kable(project.summary$data_categories)) stop("Please set a data.category argument from the column data_category above") } + if(!(data.category %in% project.summary$data_categories$data_category)) { print(knitr::kable(project.summary$data_categories)) stop("Please set a valid data.category argument from the column data_category above. We could not validade the data.category for project ", proj) @@ -618,13 +563,10 @@ get.mutation <- function( if(missing(genes)) stop("Argument genes is missing") # Get mutation annotation file - library(maftools) - library(dplyr) query <- GDCquery( project = project, data.category = "Simple Nucleotide Variation", access = "open", - legacy = FALSE, data.type = "Masked Somatic Mutation", workflow.type = "Aliquot Ensemble Somatic Variant Merging and Masking" ) @@ -638,8 +580,9 @@ get.mutation <- function( unlist( sapply( mutant_variant_classification, - function(x) grep(x,maf$Variant_Classification, - ignore.case = TRUE) + function(x) { + grep(x,maf$Variant_Classification,ignore.case = TRUE) + } ) ) ) @@ -648,8 +591,10 @@ get.mutation <- function( mut <- NULL for(i in genes) { if(!i %in% maf$Hugo_Symbol) next - aux <- data.frame(patient = substr(unique(maf[i == maf$Hugo_Symbol,]$Tumor_Sample_Barcode),1,15), - mut = TRUE) + aux <- data.frame( + patient = substr(unique(maf[i == maf$Hugo_Symbol,]$Tumor_Sample_Barcode),1,15), + mut = TRUE + ) colnames(aux)[2] <- paste0("mut_hg38_",i) if(is.null(mut)) { mut <- aux @@ -668,6 +613,7 @@ get.mutation <- 
function( return(mut) } + get.mut.gistc <- function( project, genes, @@ -694,6 +640,7 @@ get.mut.gistc <- function( } else if(is.null(mut) & !is.null(cnv)) { return(cnv) } + return(NULL) } get.mut.gistc.information <- function( diff --git a/R/prepare.R b/R/prepare.R index 55cbbbcf2..fbfc3f172 100644 --- a/R/prepare.R +++ b/R/prepare.R @@ -91,7 +91,7 @@ GDCprepare <- function( stop("To remove the files, please set save to TRUE. Otherwise, the data will be lost") } # We save the files in project/source/data.category/data.type/file_id/file_name - source <- ifelse(query$legacy,"legacy","harmonized") + source <- "harmonized" files <- file.path( query$results[[1]]$project, source, gsub(" ","_",query$results[[1]]$data_category), @@ -174,8 +174,7 @@ GDCprepare <- function( files = files, cases = cases, summarizedExperiment = summarizedExperiment, - platform = unique(query$results[[1]]$platform), - legacy = query$legacy + platform = unique(query$results[[1]]$platform) ) } else if (grepl("Raw intensities|Masked Intensities",query$data.type, ignore.case = TRUE)) { # preparing IDAT files @@ -183,8 +182,7 @@ GDCprepare <- function( files = files, barcode = cases, summarizedExperiment = summarizedExperiment, - platform = unique(query$results[[1]]$platform), - legacy = query$legacy + platform = unique(query$results[[1]]$platform) ) } else if (grepl("Proteome Profiling",query$data.category,ignore.case = TRUE)) { @@ -199,7 +197,7 @@ GDCprepare <- function( } else if (grepl("Simple Nucleotide Variation",query$data.category,ignore.case = TRUE)) { - if(grepl("Masked Somatic Mutation",query$results[[1]]$data_type[1],ignore.case = TRUE) | source == "legacy"){ + if(grepl("Masked Somatic Mutation",query$results[[1]]$data_type[1],ignore.case = TRUE)){ data <- readSimpleNucleotideVariationMaf(files) } @@ -212,7 +210,7 @@ GDCprepare <- function( files = files, cases = cases, summarizedExperiment = summarizedExperiment, - genome = ifelse(query$legacy,"hg19","hg38"), + genome = "hg38", 
experimental.strategy = unique(query$results[[1]]$experimental_strategy) ) @@ -221,7 +219,7 @@ GDCprepare <- function( files = files, cases = cases, summarizedExperiment = FALSE, - genome = ifelse(query$legacy,"hg19","hg38"), + genome = "hg38", experimental.strategy = unique(query$results[[1]]$experimental_strategy) ) @@ -713,14 +711,13 @@ readIDATDNAmethylation <- function( files, barcode, summarizedExperiment, - platform, - legacy + platform ) { check_package("sesame") # Check if moved files would be moved outside of scope folder, if so, path doesn't change - moved.files <- sapply(files,USE.NAMES=FALSE,function(x){ + moved.files <- sapply(files,USE.NAMES = FALSE,function(x){ if (grepl("Raw_intensities|Masked_Intensities",dirname(dirname(x)))) { return(file.path(dirname(dirname(x)), basename(x))) } @@ -753,7 +750,7 @@ readIDATDNAmethylation <- function( betas <- makeSEFromDNAMethylationMatrix( betas = betas, - genome = ifelse(legacy,"hg19","hg38"), + genome ="hg38", met.platform = platform ) colData(betas) <- DataFrame(colDataPrepare(colnames(betas))) @@ -774,8 +771,7 @@ readDNAmethylation <- function( files, cases, summarizedExperiment = TRUE, - platform, - legacy + platform ){ if(length(platform) > 1){ @@ -847,7 +843,7 @@ readDNAmethylation <- function( df <- makeSEFromDNAMethylationMatrix( betas = df, - genome = ifelse(legacy,"hg19","hg38"), + genome = "hg38", met.platform = platform ) } @@ -1056,31 +1052,37 @@ colDataPrepareTCGA <- function(barcode){ # For the moment this will work only for TCGA Data # We should search what TARGET data means - code <- c('01','02','03','04','05','06','07','08','09','10','11', - '12','13','14','20','40','50','60','61') - shortLetterCode <- c("TP","TR","TB","TRBM","TAP","TM","TAM","THOC", - "TBM","NB","NT","NBC","NEBV","NBM","CELLC","TRB", - "CELL","XP","XCL") - - definition <- c("Primary solid Tumor", # 01 - "Recurrent Solid Tumor", # 02 - "Primary Blood Derived Cancer - Peripheral Blood", # 03 - "Recurrent Blood Derived Cancer 
- Bone Marrow", # 04 - "Additional - New Primary", # 05 - "Metastatic", # 06 - "Additional Metastatic", # 07 - "Human Tumor Original Cells", # 08 - "Primary Blood Derived Cancer - Bone Marrow", # 09 - "Blood Derived Normal", # 10 - "Solid Tissue Normal", # 11 - "Buccal Cell Normal", # 12 - "EBV Immortalized Normal", # 13 - "Bone Marrow Normal", # 14 - "Control Analyte", # 20 - "Recurrent Blood Derived Cancer - Peripheral Blood", # 40 - "Cell Lines", # 50 - "Primary Xenograft Tissue", # 60 - "Cell Line Derived Xenograft Tissue") # 61 + code <- c( + '01','02','03','04','05','06','07','08','09','10','11', + '12','13','14','20','40','50','60','61' + ) + shortLetterCode <- c( + "TP","TR","TB","TRBM","TAP","TM","TAM","THOC", + "TBM","NB","NT","NBC","NEBV","NBM","CELLC","TRB", + "CELL","XP","XCL" + ) + + definition <- c( + "Primary solid Tumor", # 01 + "Recurrent Solid Tumor", # 02 + "Primary Blood Derived Cancer - Peripheral Blood", # 03 + "Recurrent Blood Derived Cancer - Bone Marrow", # 04 + "Additional - New Primary", # 05 + "Metastatic", # 06 + "Additional Metastatic", # 07 + "Human Tumor Original Cells", # 08 + "Primary Blood Derived Cancer - Bone Marrow", # 09 + "Blood Derived Normal", # 10 + "Solid Tissue Normal", # 11 + "Buccal Cell Normal", # 12 + "EBV Immortalized Normal", # 13 + "Bone Marrow Normal", # 14 + "Control Analyte", # 20 + "Recurrent Blood Derived Cancer - Peripheral Blood", # 40 + "Cell Lines", # 50 + "Primary Xenograft Tissue", # 60 + "Cell Line Derived Xenograft Tissue" + ) # 61 aux <- DataFrame(code = code,shortLetterCode,definition) # in case multiple equal barcode @@ -1088,10 +1090,12 @@ colDataPrepareTCGA <- function(barcode){ "-[:alnum:]{3}-[:alnum:]{3}-[:alnum:]{4}-[:alnum:]{2}") samples <- str_match(barcode,regex)[,1] - ret <- DataFrame(barcode = barcode, - patient = substr(barcode, 1, 12), - sample = substr(barcode, 1, 16), - code = substr(barcode, 14, 15)) + ret <- DataFrame( + barcode = barcode, + patient = substr(barcode, 1, 12), + 
sample = substr(barcode, 1, 16), + code = substr(barcode, 14, 15) + ) ret <- merge(ret,aux, by = "code", sort = FALSE) ret <- ret[match(barcode,ret$barcode),] rownames(ret) <- gsub("\\.","-",make.names(ret$barcode,unique=TRUE)) diff --git a/R/query.R b/R/query.R index 3b394d7af..5a86d06f1 100644 --- a/R/query.R +++ b/R/query.R @@ -3,7 +3,6 @@ #' Uses GDC API to search for search, it searches for both controlled and #' open-access data. #' For GDC data arguments project, data.category, data.type and workflow.type should be used -#' For the legacy data arguments project, data.category, platform and/or file.extension should be used. #' Please, see the vignette for a table with the possibilities. #' @param project A list of valid project (see list with TCGAbiolinks:::getGDCprojects()$project_id)] #' \itemize{ @@ -75,33 +74,15 @@ #' \item{ Simple Nucleotide Variation } #' \item{ Transcriptome Profiling } #' } -#' List for legacy archive -#' \itemize{ -#' \item{ Biospecimen } -#' \item{ Clinical } -#' \item{ Copy number variation } -#' \item{ DNA methylation } -#' \item{ Gene expression } -#' \item{ Protein expression } -#' \item{ Raw microarray data } -#' \item{ Raw sequencing data } -#' \item{ Simple nucleotide variation } -#' } #' @param data.type A data type to filter the files to download #' For the complete list please check the vignette. #' @param sample.type A sample type to filter the files to download #' @param barcode A list of barcodes to filter the files to download -#' @param legacy Search in the legacy repository #' @param data.format Data format filter ("VCF", "TXT", "BAM","SVS","BCR XML","BCR SSF XML", #' "TSV", "BCR Auxiliary XML", "BCR OMF XML", "BCR Biotab", "MAF", "BCR PPS XML", "XLSX") -#' @param file.type To be used in the legacy database for some platforms, -#' to define which file types to be used. #' @param workflow.type GDC workflow type -#' @param experimental.strategy Filter to experimental strategy. 
Harmonized: WXS, RNA-Seq, miRNA-Seq, Genotyping Array. -#' Legacy: WXS, RNA-Seq, miRNA-Seq, Genotyping Array, -#' DNA-Seq, Methylation array, Protein expression array, WXS,CGH array, VALIDATION, Gene expression array,WGS, -#' MSI-Mono-Dinucleotide Assay, miRNA expression array, Mixed strategies, AMPLICON, Exon array, -#' Total RNA-Seq, Capillary sequencing, Bisulfite-Seq +#' @param experimental.strategy Filter to experimental strategy. +#' Harmonized: WXS, RNA-Seq, miRNA-Seq, Genotyping Array. #' @param access Filter by access type. Possible values: controlled, open #' @param platform Example: #' \tabular{ll}{ @@ -157,19 +138,6 @@ #' data.type = "Masked Copy Number Segment", #' sample.type = c("Primary Tumor") #' ) -#' query.met <- GDCquery( -#' project = c("TCGA-GBM","TCGA-LGG"), -#' legacy = TRUE, -#' data.category = "DNA methylation", -#' platform = "Illumina Human Methylation 450" -#' ) -#' query <- GDCquery( -#' project = "TCGA-ACC", -#' data.category = "Copy number variation", -#' legacy = TRUE, -#' file.type = "hg19.seg", -#' barcode = c("TCGA-OR-A5LR-01A-11D-A29H-01") -#' ) #' } #' @return A data frame with the results and the parameters used #' @importFrom jsonlite fromJSON @@ -183,7 +151,6 @@ GDCquery <- function( data.category, data.type, workflow.type, - legacy = FALSE, access, platform, file.type, @@ -243,11 +210,11 @@ GDCquery <- function( } }) print.header("GDCquery: Searching in GDC database","section") - message("Genome of reference: ",ifelse(legacy,"hg19","hg38")) + message("Genome of reference: hg38") # Check arguments checkProjectInput(project) - checkDataCategoriesInput(project, data.category, legacy) - if(!is.na(data.type)) checkDataTypeInput(legacy = legacy, data.type = data.type) + checkDataCategoriesInput(project, data.category) + if(!is.na(data.type)) checkDataTypeInput(data.type = data.type) if(!any(is.na(sample.type))) checkBarcodeDefinition(sample.type) results <- NULL @@ -257,7 +224,6 @@ GDCquery <- function( project = proj, 
data.category = data.category, data.type = data.type, - legacy = legacy, workflow.type = workflow.type, platform = platform, file.type = file.type, @@ -279,7 +245,6 @@ GDCquery <- function( project = proj, data.category = data.category, data.type = data.type, - legacy = legacy, workflow.type = NA, platform = NA, file.type = file.type, @@ -621,17 +586,6 @@ GDCquery <- function( message("ooo By sample.type") results <- results[tolower(results$sample_type) %in% tolower(sample.type),] } - # some how there are duplicated files in GDC we should remove them - # Example of problematic query - # query.exp <- GDCquery(project = "TCGA-BRCA", - # legacy = TRUE, - # data.category = "Gene expression", - # data.type = "Gene expression quantification", - # platform = "Illumina HiSeq", - # file.type = "results", - # experimental_strategy = "RNA-Seq", - # sample.type = c("Primary solid Tumor","Solid Tissue Normal")) - # print.header("Checking data","subsection") message("ooo Checking if there are duplicated cases") @@ -665,7 +619,6 @@ GDCquery <- function( project = I(list(project)), data.category = data.category, data.type = data.type, - legacy = legacy, access = I(list(access)), experimental.strategy = I(list(experimental.strategy)), file.type = file.type, @@ -677,37 +630,41 @@ GDCquery <- function( return(ret) } -getGDCquery <- function(project, data.category, data.type, legacy, workflow.type,platform,file.type,files.access,sample.type,experimental.strategy){ +getGDCquery <- function( + project, + data.category, + data.type, + workflow.type, + platform, + file.type, + files.access, + sample.type, + experimental.strategy +){ # Get manifest using the API - baseURL <- ifelse(legacy,"https://api.gdc.cancer.gov/legacy/files/?","https://api.gdc.cancer.gov/files/?") + baseURL <- "https://api.gdc.cancer.gov/files/?" 
options.pretty <- "pretty=true" - if(data.category == "Protein expression" & legacy) { - options.expand <- "fields=archive.revision,archive.file_name,md5sum,state,data_category,file_id,platform,file_name,file_size,md5sum,submitter_id,data_type&expand=cases.samples.portions,cases.project,center,analysis" - } else if(data.category %in% c("Clinical","Biospecimen")) { + if(data.category %in% c("Clinical","Biospecimen")) { options.expand <- "expand=cases,cases.project,center,analysis" } else { options.expand <- "expand=cases,cases.samples.portions.analytes.aliquots,cases.project,center,analysis,cases.samples" } - option.size <- paste0("size=",getNbFiles(project,data.category,legacy)) + option.size <- paste0("size=",getNbFiles(project,data.category)) option.format <- paste0("format=JSON") - options.filter <- paste0("filters=", - URLencode('{"op":"and","content":['), # Start json request - URLencode('{"op":"in","content":{"field":"cases.project.project_id","value":["'), - project, - URLencode('"]}}')) + options.filter <- paste0( + "filters=", + URLencode('{"op":"and","content":['), # Start json request + URLencode('{"op":"in","content":{"field":"cases.project.project_id","value":["'), + project, + URLencode('"]}}') + ) - if(!is.na(experimental.strategy)) options.filter <- paste0(options.filter,addFilter("files.experimental_strategy", experimental.strategy)) + if(!is.na(experimental.strategy)) options.filter <- paste0(options.filter,addFilter("files.experimental_strategy", experimental.strategy)) if(!is.na(data.category)) options.filter <- paste0(options.filter,addFilter("files.data_category", data.category)) if(!is.na(data.type)) options.filter <- paste0(options.filter,addFilter("files.data_type", data.type)) if(!is.na(workflow.type)) options.filter <- paste0(options.filter,addFilter("files.analysis.workflow_type", workflow.type)) if(!any(is.na(platform))) options.filter <- paste0(options.filter,addFilter("files.platform", platform)) - if(!any(is.na(file.type))) { - 
if(file.type == "results" & legacy) options.filter <- paste0(options.filter,addFilter("files.tags", "unnormalized")) - if(file.type == "normalized_results" & legacy) options.filter <- paste0(options.filter,addFilter("files.tags", "normalized")) - if(file.type == "nocnv_hg19.seg" & legacy) options.filter <- paste0(options.filter,addFilter("files.tags", "nocnv")) - if(file.type == "hg19.isoform" & legacy) options.filter <- paste0(options.filter,addFilter("files.tags", "hg19")) - } if(!any(is.na(files.access))) { options.filter <- paste0(options.filter,addFilter("files.access", files.access)) } @@ -1028,12 +985,11 @@ GDCquery_ATAC_seq <- function( results$data_category <- "ATAC-seq" results$project <- "ATAC-seq" ret <- data.frame( - results=I(list(results)), + results = I(list(results)), tumor = I(list(tumor)), project = I(list("ATAC-seq")), data.type = I(list("ATAC-seq")), - data.category = I(list("ATAC-seq")), - legacy = I(list(FALSE)) + data.category = I(list("ATAC-seq")) ) return(ret) diff --git a/R/visualize.R b/R/visualize.R index 488417ed0..8da71a19a 100644 --- a/R/visualize.R +++ b/R/visualize.R @@ -871,7 +871,6 @@ unlistlabels <- function(lab) { #' @importFrom data.table dcast setDT setDF := #' @examples #' \dontrun{ -#' library(maftools) #' library(dplyr) #' query <- GDCquery( #' project = "TCGA-CHOL", @@ -929,7 +928,6 @@ TCGAvisualize_oncoprint <- function( annotation.legend.side = "bottom" ){ - check_package("ComplexHeatmap") check_package("circlize") check_package("grid") diff --git a/man/GDCdownload.Rd b/man/GDCdownload.Rd index 004555f18..cd66a9921 100644 --- a/man/GDCdownload.Rd +++ b/man/GDCdownload.Rd @@ -34,15 +34,6 @@ Uses GDC API or GDC transfer tool to download gdc data The data from query will be save in a folder: project/data.category } \examples{ -query <- GDCquery( - project = "TCGA-ACC", - data.category = "Copy number variation", - legacy = TRUE, - file.type = "hg19.seg", - barcode = c("TCGA-OR-A5LR-01A-11D-A29H-01", 
"TCGA-OR-A5LJ-10A-01D-A29K-01") - ) -# data will be saved in GDCdata/TCGA-ACC/legacy/Copy_number_variation/Copy_number_segmentation -GDCdownload(query, method = "api") \dontrun{ # Download clinical data from XML query <- GDCquery(project = "TCGA-COAD", data.category = "Clinical") @@ -57,14 +48,14 @@ GDCdownload(query, method = "api") # data will be saved in: # example_data_dir/TARGET-AML/harmonized/Transcriptome_Profiling/miRNA_Expression_Quantification GDCdownload(query, method = "client", directory = "example_data_dir") - acc.gbm <- GDCquery( + query_acc_gbm <- GDCquery( project = c("TCGA-ACC","TCGA-GBM"), data.category = "Transcriptome Profiling", data.type = "Gene Expression Quantification", workflow.type = "STAR - Counts" ) GDCdownload( - query = acc.gbm, + query = query_acc_gbm, method = "api", directory = "example", files.per.chunk = 50 diff --git a/man/GDCquery.Rd b/man/GDCquery.Rd index d3b5e5241..10153d843 100644 --- a/man/GDCquery.Rd +++ b/man/GDCquery.Rd @@ -9,7 +9,6 @@ GDCquery( data.category, data.type, workflow.type, - legacy = FALSE, access, platform, file.type, @@ -90,18 +89,6 @@ List for harmonized database: \item{ Sequencing Reads } \item{ Simple Nucleotide Variation } \item{ Transcriptome Profiling } -} -List for legacy archive -\itemize{ -\item{ Biospecimen } -\item{ Clinical } -\item{ Copy number variation } -\item{ DNA methylation } -\item{ Gene expression } -\item{ Protein expression } -\item{ Raw microarray data } -\item{ Raw sequencing data } -\item{ Simple nucleotide variation } }} \item{data.type}{A data type to filter the files to download @@ -109,8 +96,6 @@ For the complete list please check the vignette.} \item{workflow.type}{GDC workflow type} -\item{legacy}{Search in the legacy repository} - \item{access}{Filter by access type. 
Possible values: controlled, open} \item{platform}{Example: @@ -140,19 +125,13 @@ HumanMethylation27 \tab Mixed_DNASeq_Cont_curated \cr IlluminaHiSeq_RNASeqV2 \tab Mixed_DNASeq_Cont }} -\item{file.type}{To be used in the legacy database for some platforms, -to define which file types to be used.} - \item{barcode}{A list of barcodes to filter the files to download} \item{data.format}{Data format filter ("VCF", "TXT", "BAM","SVS","BCR XML","BCR SSF XML", "TSV", "BCR Auxiliary XML", "BCR OMF XML", "BCR Biotab", "MAF", "BCR PPS XML", "XLSX")} -\item{experimental.strategy}{Filter to experimental strategy. Harmonized: WXS, RNA-Seq, miRNA-Seq, Genotyping Array. -Legacy: WXS, RNA-Seq, miRNA-Seq, Genotyping Array, -DNA-Seq, Methylation array, Protein expression array, WXS,CGH array, VALIDATION, Gene expression array,WGS, -MSI-Mono-Dinucleotide Assay, miRNA expression array, Mixed strategies, AMPLICON, Exon array, -Total RNA-Seq, Capillary sequencing, Bisulfite-Seq} +\item{experimental.strategy}{Filter to experimental strategy. +Harmonized: WXS, RNA-Seq, miRNA-Seq, Genotyping Array.} \item{sample.type}{A sample type to filter the files to download} } @@ -163,7 +142,6 @@ A data frame with the results and the parameters used Uses GDC API to search for search, it searches for both controlled and open-access data. For GDC data arguments project, data.category, data.type and workflow.type should be used - For the legacy data arguments project, data.category, platform and/or file.extension should be used. Please, see the vignette for a table with the possibilities. 
} \examples{ @@ -193,19 +171,6 @@ query <- GDCquery( data.type = "Masked Copy Number Segment", sample.type = c("Primary Tumor") ) -query.met <- GDCquery( - project = c("TCGA-GBM","TCGA-LGG"), - legacy = TRUE, - data.category = "DNA methylation", - platform = "Illumina Human Methylation 450" -) -query <- GDCquery( - project = "TCGA-ACC", - data.category = "Copy number variation", - legacy = TRUE, - file.type = "hg19.seg", - barcode = c("TCGA-OR-A5LR-01A-11D-A29H-01") -) } } \author{ diff --git a/man/TCGAvisualize_oncoprint.Rd b/man/TCGAvisualize_oncoprint.Rd index 001f1124b..505608cd6 100644 --- a/man/TCGAvisualize_oncoprint.Rd +++ b/man/TCGAvisualize_oncoprint.Rd @@ -87,7 +87,6 @@ Creating a oncoprint } \examples{ \dontrun{ -library(maftools) library(dplyr) query <- GDCquery( project = "TCGA-CHOL", diff --git a/tests/testthat/test-prepare-download.R b/tests/testthat/test-prepare-download.R index 6a58c48eb..6b5ee2f9c 100644 --- a/tests/testthat/test-prepare-download.R +++ b/tests/testthat/test-prepare-download.R @@ -1,17 +1,16 @@ -context("Download AND PREPARE") - - +context("Download and prepare") test_that("GDCdownload API method is working ", { skip_on_bioc() skip_if_offline() - cases <- c( + cases <- c( "TCGA-PA-A5YG-01A-11R-A29S-07", "TCGA-OR-A5JX-01A-11R-A29S-07", "TCGA-PK-A5HA-01A-11R-A29S-07", "TCGA-OR-A5KY-01A-11R-A29S-07" ) + acc <- GDCquery( project = c("TCGA-ACC"), data.category = "Transcriptome Profiling", @@ -20,8 +19,8 @@ test_that("GDCdownload API method is working ", { barcode = substr(cases,1,12) ) GDCdownload(acc, method = "api", directory = "ex") - obj <- GDCprepare(acc, directory = "ex",summarizedExperiment = TRUE) + expect_true(all(substr(colnames(obj),1,12) == substr(cases,1,12))) expect_true(all(obj$barcode == cases)) @@ -46,9 +45,6 @@ test_that("GDCdownload API method is working ", { expect_true(all(query$results[[1]]$sample.submitter_id == data$sample_submitter_id)) }) - - - test_that("getBarcodeInfo works", { skip_on_bioc() 
skip_if_offline() @@ -61,11 +57,14 @@ test_that("getBarcodeInfo works", { x <- getBarcodeInfo(c("TARGET-20-PARUDL-03A")) expect_true(all(cols %in% colnames(x))) - samples <- c("HCM-CSHL-0063-C18-85A", - "HCM-CSHL-0065-C20-06A", - "HCM-CSHL-0065-C20-85A", - "HCM-CSHL-0063-C18-01A") + samples <- c( + "HCM-CSHL-0063-C18-85A", + "HCM-CSHL-0065-C20-06A", + "HCM-CSHL-0065-C20-85A", + "HCM-CSHL-0063-C18-01A" + ) x <- colDataPrepare(samples) + expect_true(all(rownames(x) == samples)) expect_true(x[x$sample_submitter_id == "HCM-CSHL-0065-C20-06A","gender"] == "male") expect_true(x[x$sample_submitter_id == "HCM-CSHL-0065-C20-06A","tumor_grade"] == "G2") @@ -102,22 +101,29 @@ test_that("colDataPrepare handle replicates", { test_that("GDCprepare accepts more than one project", { skip_on_bioc() skip_if_offline() - cases <- c("TCGA-OR-A5JX-01A", "TCGA-OR-A5J3-01A", - "TCGA-06-0680-11A","TCGA-14-0871-01A") + cases <- c( + "TCGA-OR-A5JX-01A", + "TCGA-OR-A5J3-01A", + "TCGA-06-0680-11A", + "TCGA-14-0871-01A" + ) expect_true(all(c("TCGA-ACC","TCGA-GBM") %in% colDataPrepare(cases)$project_id)) - acc.gbm <- GDCquery(project = c("TCGA-ACC","TCGA-GBM"), - data.category = "Transcriptome Profiling", - data.type = "Gene Expression Quantification", - workflow.type = "STAR - Counts", - barcode = substr(cases,1,12)) - GDCdownload(acc.gbm, method = "api", directory = "ex") - obj <- GDCprepare(acc.gbm, directory = "ex") + query_acc_gbm <- GDCquery( + project = c("TCGA-ACC","TCGA-GBM"), + data.category = "Transcriptome Profiling", + data.type = "Gene Expression Quantification", + workflow.type = "STAR - Counts", + barcode = substr(cases, 1, 12) + ) + GDCdownload(query_acc_gbm, method = "api", directory = "ex") + obj <- GDCprepare(query_acc_gbm, directory = "ex") expect_true(all(c("TCGA-ACC","TCGA-GBM") %in% SummarizedExperiment::colData(obj)$project_id)) }) test_that("Non TCGA data is processed", { skip_on_bioc() skip_if_offline() + proj <- "MMRF-COMMPASS" query <- GDCquery( project = proj, @@ 
-132,8 +138,6 @@ test_that("Non TCGA data is processed", { workflow.type = "STAR - Counts", barcode = getResults(query)$cases[1:4] ) - #GDCdownload(query) - #data <- GDCprepare(query) }) test_that("Gene Level Copy Number is being correctly prepare", { @@ -151,7 +155,7 @@ test_that("Gene Level Copy Number is being correctly prepare", { data <- GDCprepare(query,directory = "ex") expect_true(all(substr(colnames(data),1,12) == c("TCGA-OR-A5JD","TCGA-OR-A5J7"))) - unlink("ex",recursive = TRUE,force = TRUE) + unlink("ex", recursive = TRUE, force = TRUE) }) test_that("DNAm files is processed correctly", { @@ -170,28 +174,6 @@ test_that("DNAm files is processed correctly", { expect_lt(abs(assay(data.hg38)["cg16739396","TCGA-E2-A158-01A-11D-A12E-05"] - 0.0688655418909783),10^-10) }) -test_that("IDAT files is processed", { - skip_on_bioc() - skip_if_offline() - - proj <- "TCGA-LUAD" - query <- GDCquery( - project = proj, - data.category = "Raw microarray data", - data.type = "Raw intensities", - experimental.strategy = "Methylation array", - legacy = TRUE, - file.type = ".idat", - barcode = "TCGA-55-7724", - platform = "Illumina Human Methylation 450" - ) - #tryCatch(GDCdownload(query, method = "api", files.per.chunk = 20), - # error = function(e) GDCdownload(query, method = "client")) - #betas <- GDCprepare(query) - #expect_true(nrow(betas) == 485577) - #expect_true(ncol(betas) == 1) -}) - test_that("Prepare samples without clinical data", { skip_on_bioc() skip_if_offline() @@ -214,30 +196,10 @@ test_that("Prepare multiple samples from the same patient", { expect_true("age_at_diagnosis" %in% colnames(x)) }) -test_that("Preparing HT_HG-U133A as SE works", { - skip_on_bioc() - skip_if_offline() - - query <- GDCquery( - project = "TCGA-GBM", - legacy = TRUE, - data.category = "Gene expression", - data.type = "Gene expression quantification", - platform = c("HT_HG-U133A") - ) - query$results[[1]] <- query$results[[1]][1:2,] - GDCdownload(query, method = "api", files.per.chunk = 
100) - se <- GDCprepare(query, summarizedExperiment = TRUE) - - expect_true(is(se,"SummarizedExperiment")) -}) - - test_that("Preparing RRPA files with number of proteins works", { skip_on_bioc() skip_if_offline() - query_rppa <- GDCquery( project = c("TCGA-COAD"), data.category = "Proteome Profiling", @@ -249,9 +211,12 @@ test_that("Preparing RRPA files with number of proteins works", { GDCdownload(query_rppa) - expect_message(object = { - data_rppa <- GDCprepare(query_rppa) - },regexp = "Some files have a different number of proteins, we will introduce NA for the missing values") + expect_message( + object = { + data_rppa <- GDCprepare(query_rppa) + }, + regexp = "Some files have a different number of proteins, we will introduce NA for the missing values" + ) expect_true(is(data_rppa,"data.frame")) }) diff --git a/tests/testthat/test-query-clinical.R b/tests/testthat/test-query-clinical.R index d35b3d326..e55ead608 100644 --- a/tests/testthat/test-query-clinical.R +++ b/tests/testthat/test-query-clinical.R @@ -11,7 +11,7 @@ test_that("TCGAquery_SampleTypes returns the correct barcodes", { test_that("GDCquery_clinic populates correctly the data", { skip_on_bioc() - results <- GDCquery_clinic( "BEATAML1.0-COHORT") + results <- GDCquery_clinic(project = "BEATAML1.0-COHORT") results.2028 <- results[results$submitter_id == "2028",] expect_equal(results.2028$vital_status,"Alive") expect_true( @@ -27,7 +27,7 @@ test_that("GDCquery_clinic populates correctly the data", { expect_equal(results.42$ethnicity,"not hispanic or latino") expect_equal(as.integer(results.2028$age_at_diagnosis %>% as.numeric() / 365.25),56) - results <- GDCquery_clinic( "TCGA-LUAD") + results <- GDCquery_clinic(project = "TCGA-LUAD") results.sample <- results[results$submitter_id == "TCGA-80-5608",] expect_equal(results.sample$vital_status,"Alive") expect_equal(results.sample$gender,"female") diff --git a/tests/testthat/test-query.R b/tests/testthat/test-query.R index a8925365b..f51c2297a 100644 
--- a/tests/testthat/test-query.R +++ b/tests/testthat/test-query.R @@ -20,16 +20,19 @@ test_that("GDCquery accepts more than one project", { data.category = "Copy Number Variation", data.type = "Copy Number Segment" ) + gbm <- GDCquery( project = "TCGA-GBM", data.category = "Copy Number Variation", data.type = "Copy Number Segment" ) + acc.gbm <- GDCquery( project = c("TCGA-ACC","TCGA-GBM"), data.category = "Copy Number Variation", data.type = "Copy Number Segment" ) + expect_equal(unique(acc.gbm$results[[1]]$data_type),"Copy Number Segment") expect_equal(nrow(acc.gbm$results[[1]]), sum(nrow(acc$results[[1]]),nrow(gbm$results[[1]]))) expect_true(nrow(dplyr::anti_join(acc$results[[1]],acc.gbm$results[[1]], by = "file_id")) == 0) @@ -51,34 +54,24 @@ test_that("GDCquery can filter by sample.type", { expect_equal(as.character(unique(query$results[[1]]$sample_type)),sample.type) sample.type <- "Solid Tissue Normal" - query <- GDCquery(project = "TCGA-ACC", - data.category = "Copy Number Variation", - data.type = "Masked Copy Number Segment", - sample.type = sample.type) - expect_equal(as.character(unique(query$results[[1]]$sample_type)),sample.type) - - sample.type <- "Solid Tissue Normal" - query <- GDCquery(project = c("TCGA-COAD"), - data.category = "Transcriptome Profiling", - data.type = "Gene Expression Quantification", - workflow.type = "STAR - Counts", - sample.type = sample.type) + query <- GDCquery( + project = "TCGA-ACC", + data.category = "Copy Number Variation", + data.type = "Masked Copy Number Segment", + sample.type = sample.type + ) expect_equal(as.character(unique(query$results[[1]]$sample_type)),sample.type) - sample.type <- "Solid Tissue Normal" - query <- GDCquery(project = "TCGA-BRCA", - legacy = TRUE, - data.category = "Gene expression", - data.type = "Gene expression quantification", - platform = "Illumina HiSeq", - file.type = "results", - experimental.strategy = "RNA-Seq", - sample.type = sample.type) + query <- GDCquery( + project = 
c("TCGA-COAD"), + data.category = "Transcriptome Profiling", + data.type = "Gene Expression Quantification", + workflow.type = "STAR - Counts", + sample.type = sample.type + ) expect_equal(as.character(unique(query$results[[1]]$sample_type)),sample.type) - - sample.type <- c("Solid Tissue Normal", "Primary Tumor") query <- GDCquery( project = "TCGA-ACC", @@ -121,56 +114,6 @@ test_that("GDCquery can filter by barcode", { expect_true(!all(c("TCGA-3C-AALK","TCGA-A2-A04Q","TCGA-A4-A04Q") %in% query$results[[1]]$cases)) }) -test_that("GDCquery can filter copy number from legacy data by file type. Case: nocnv_hg18", { - skip_on_bioc() - skip_if_offline() - - query <- GDCquery(project = "TCGA-ACC", - data.category = "Copy number variation", - legacy = TRUE, - file.type = "nocnv_hg18.seg", - barcode = c("TCGA-OR-A5LR-01A-11D-A29H-01")) - expect_equal(query$results[[1]]$file_name,"AQUAE_p_TCGA_112_304_b2_N_GenomeWideSNP_6_D10_1348300.nocnv_hg18.seg.txt") -}) - -test_that("GDCquery can filter copy number from legacy data by file type. Case: hg18", { - skip_on_bioc() - skip_if_offline() - - query <- GDCquery(project = "TCGA-ACC", - data.category = "Copy number variation", - legacy = TRUE, - file.type = "hg18.seg", - barcode = c("TCGA-OR-A5LR-01A-11D-A29H-01")) - expect_equal(query$results[[1]]$file_name,"AQUAE_p_TCGA_112_304_b2_N_GenomeWideSNP_6_D10_1348300.hg18.seg.txt") -}) - -test_that("GDCquery can filter copy number from legacy data by file type. Case: hg19", { - skip_on_bioc() - skip_if_offline() - - query <- GDCquery(project = "TCGA-ACC", - data.category = "Copy number variation", - legacy = TRUE, - file.type = "hg19.seg", - barcode = c("TCGA-OR-A5LR-01A-11D-A29H-01")) - expect_equal(query$results[[1]]$file_name,"AQUAE_p_TCGA_112_304_b2_N_GenomeWideSNP_6_D10_1348300.hg19.seg.txt") -}) - - -test_that("GDCquery can filter copy number from legacy data by file type. 
Case: nocnv_hg19", { - skip_on_bioc() - skip_if_offline() - - query <- GDCquery(project = "TCGA-ACC", - data.category = "Copy number variation", - legacy = TRUE, - file.type = "nocnv_hg19.seg", - barcode = c("TCGA-OR-A5LR-01A-11D-A29H-01")) - expect_equal(query$results[[1]]$file_name,"AQUAE_p_TCGA_112_304_b2_N_GenomeWideSNP_6_D10_1348300.nocnv_hg19.seg.txt") - -}) - test_that("GDCquery can filter by access level", { skip_on_bioc() @@ -186,15 +129,12 @@ test_that("GDCquery can filter by access level", { expect_equal(unique(query$results[[1]]$access),"controlled") }) - - - test_that("getNbFiles and getNbCases works", { skip_on_bioc() skip_if_offline() aux <- getProjectSummary("TCGA-LUAD",TRUE) - files <- getNbFiles("TCGA-LUAD","Raw microarray data",legacy = T) + files <- getNbFiles("TCGA-LUAD","Raw microarray data") cases <- getNbCases("TCGA-LUAD","Raw microarray data") expect_true(cases < files) }) diff --git a/vignettes/download_prepare.Rmd b/vignettes/download_prepare.Rmd index 39af10ab5..8194b68da 100644 --- a/vignettes/download_prepare.Rmd +++ b/vignettes/download_prepare.Rmd @@ -72,18 +72,8 @@ which defines the output type a Summarized Experiment (default option) or a data To create a summarized Experiment object we annotate the data with genomic positions with last patch release version of the genome available. -For legacy data (data aligned to hg19) TCGAbiolinks is using GRCh37.p13 and for -harmonized data (data aligned to hg38) now it is using Gencode version 36. -Unfortunately, some of the updates changes/remove gene symbols, change coordinates, etc. -Which might introduce some loss of data. For example, if the gene was removed we cannot map -it anymore and that information will be lost in the `SummarizedExperiment`. - -If you set `SummarizedExperiment` to `FALSE`, you will get the data unmodified -just as they are in the files and ad your own annotation. - -Also, there are no updated for DNA methylation data. 
But the last metadata available can be found -here: [http://zwdzwd.github.io/InfiniumAnnotation](http://zwdzwd.github.io/InfiniumAnnotation) +Also, the latest DNA methylation metadata is available at: [http://zwdzwd.github.io/InfiniumAnnotation](http://zwdzwd.github.io/InfiniumAnnotation) @@ -132,48 +122,6 @@ in `GDCprepare` and `GDCdownload` | mut.pipeline | If add.gistic2.mut is not NULL this field will be taken in consideration. Four separate variant calling pipelines are implemented for GDC data harmonization. Options: muse, varscan2, somaticsniper, MuTect2. For more information: https://gdc-docs.nci.nih.gov/Data/Bioinformatics_Pipelines/DNA_Seq_Variant_Calling_Pipeline/ | | mutant_variant_classification | List of mutant_variant_classification that will be consider a sample mutant or not. Default: "Frame_Shift_Del", "Frame_Shift_Ins", "Missense_Mutation", "Nonsense_Mutation", "Splice_Site", "In_Frame_Del", "In_Frame_Ins", "Translation_Start_Site", "Nonstop_Mutation" | -## Search and download data from legacy database using GDC api method - -In this example we will download gene expression data from legacy database (data -aligned against genome of reference hg19) using GDC api method and we will show object data and metadata. -```{r results = 'hide', message=FALSE, warning=FALSE, eval = F} -query <- GDCquery( - project = "TCGA-GBM", - data.category = "Gene expression", - data.type = "Gene expression quantification", - platform = "Illumina HiSeq", - file.type = "normalized_results", - experimental.strategy = "RNA-Seq", - barcode = c("TCGA-14-0736-02A-01R-2005-01", "TCGA-06-0211-02A-02R-2005-01"), - legacy = TRUE -) -GDCdownload( - query = query, - method = "api", - files.per.chunk = 10 -) -data <- GDCprepare(query = query) -``` - -```{r message=FALSE, warning=FALSE, include=FALSE} -data <- gbm.exp.legacy -``` - -```{r message=FALSE, warning=FALSE} -# Gene expression aligned against hg19. 
-datatable( - as.data.frame(colData(data)), - options = list(scrollX = TRUE, keys = TRUE, pageLength = 5), - rownames = FALSE) -# Only first 20 rows to make render faster -datatable( - assay(data)[1:20,], - options = list(scrollX = TRUE, keys = TRUE, pageLength = 5), - rownames = TRUE -) - -rowRanges(data) -``` ## Search and download data for two samples from database @@ -238,44 +186,6 @@ Examples of query, download, prepare can be found in this [gist](https://gist.gi | Biospecimen | Biospecimen Supplement | | | | Clinical | | | | -## Legacy data -| Data.category | Data.type | Platform | file.type | Status | -|-----------------------------|-----------------------------------|-------------------------------------|--------------------|-----------------| -| Transcriptome Profiling | | | | | -| Copy number variation | - | Affymetrix SNP Array 6.0 | nocnv_hg18.seg | Working | -| | - | Affymetrix SNP Array 6.0 | hg18.seg | Working | -| | - | Affymetrix SNP Array 6.0 | nocnv_hg19.seg | Working | -| | - | Affymetrix SNP Array 6.0 | hg19.seg | Working | -| | - | Illumina HiSeq | Several | Working | -| Simple Nucleotide Variation | Simple somatic mutation | | | | -| Raw Sequencing Data | | | | | -| Biospecimen | | | | | -| Clinical | | | | | -| Protein expression | | MDA RPPA Core | - | Working | -| Gene expression | Gene expression quantification | Illumina HiSeq | normalized_results | Working | -| | | Illumina HiSeq | results | Working | -| | | HT_HG-U133A | - | Working | -| | | AgilentG4502A_07_2 | - | Data frame only | -| | | AgilentG4502A_07_1 | - | Data frame only | -| | | HuEx-1_0-st-v2 | FIRMA.txt | Not Preparing | -| | | | gene.txt | Not Preparing | -| | Isoform expression quantification | | | | -| | miRNA gene quantification | | | | -| | Exon junction quantification | | | | -| | Exon quantification | | | | -| | miRNA isoform quantification | | | | -| | | | | | -| DNA methylation | | Illumina Human Methylation 450 | Not used | Working | -| | | Illumina Human 
Methylation 27 | Not used | Working | -| | | Illumina DNA Methylation OMA003 CPI | Not used | Working | -| | | Illumina DNA Methylation OMA002 CPI | Not used | Working | -| | | Illumina Hi Seq | | Not working | -| Raw Microarray Data | | | | | -| Structural Rearrangement | | | | | -| Other | | | | | - - - # Examples @@ -444,8 +354,7 @@ query <- GDCquery( project = "TCGA-BRCA", data.category = "DNA Methylation", data.type = "Masked Intensities", - platform = "Illumina Human Methylation 27", - legacy = FALSE + platform = "Illumina Human Methylation 27" ) GDCdownload(query, files.per.chunk=10) betas <- GDCprepare(query) @@ -454,10 +363,9 @@ query <- GDCquery( project = "HCMI-CMDC", data.category = "DNA Methylation", data.type = "Masked Intensities", - platform = "Illumina Methylation Epic", - legacy = FALSE + platform = "Illumina Methylation Epic" ) -GDCdownload(query, files.per.chunk=10) +GDCdownload(query, files.per.chunk = 10) betas <- GDCprepare(query) @@ -465,8 +373,7 @@ query <- GDCquery( project = "CPTAC-3", data.category = "DNA Methylation", data.type = "Masked Intensities", - platform = "Illumina Methylation Epic", - legacy = FALSE + platform = "Illumina Methylation Epic" ) GDCdownload(query, files.per.chunk=10) betas <- GDCprepare(query) @@ -475,10 +382,9 @@ query <- GDCquery( project = "TCGA-BRCA", data.category = "DNA Methylation", data.type = "Masked Intensities", - platform = "Illumina Methylation Epic", - legacy = FALSE + platform = "Illumina Methylation Epic" ) -GDCdownload(query, files.per.chunk=10) +GDCdownload(query, files.per.chunk = 10) betas <- GDCprepare(query) @@ -571,7 +477,6 @@ https://docs.gdc.cancer.gov/Data/Bioinformatics_Pipelines/Expression_mRNA_Pipeli query.sc.analysis <- GDCquery( project = "CPTAC-3", data.category = "Transcriptome Profiling", - legacy = FALSE, access = "open", data.type = "Single Cell Analysis", data.format = "TSV" @@ -584,7 +489,6 @@ Single.Cell.Analysis.list <- GDCprepare(query.sc.analysis) query.hdF5 <- GDCquery( 
project = "CPTAC-3", data.category = "Transcriptome Profiling", - legacy = FALSE, access = "open", data.type = "Single Cell Analysis", barcode = c("CPT0167860015","CPT0206880004"), @@ -598,7 +502,6 @@ df.HDF5 <- GDCprepare(query.hdF5) query.raw.counts <- GDCquery( project = "CPTAC-3", data.category = "Transcriptome Profiling", - legacy = FALSE, access = "open", data.type = "Gene Expression Quantification", barcode = c("CPT0167860015","CPT0206880004"), @@ -612,7 +515,6 @@ raw.counts.list <- GDCprepare(query.raw.counts) query.filtered.counts <- GDCquery( project = "CPTAC-3", data.category = "Transcriptome Profiling", - legacy = FALSE, access = "open", data.type = "Gene Expression Quantification", barcode = c("CPT0167860015","CPT0206880004"), @@ -627,7 +529,6 @@ filtered.counts.list <- GDCprepare(query.filtered.counts) query.sc.dea <- GDCquery( project = "CPTAC-3", data.category = "Transcriptome Profiling", - legacy = FALSE, access = "open", data.type = "Differential Gene Expression", barcode = c("CPT0167860015","CPT0206880004"), @@ -636,91 +537,3 @@ query.sc.dea <- GDCquery( GDCdownload(query.sc.dea) sc.dea.list <- GDCprepare(query.sc.dea) ``` - -## Legacy archive: data aligned against hg19 - -### DNA methylation: Get all TCGA IDAT files - -```{r message=FALSE, warning=FALSE, eval =FALSE} -#------------------------------------------------------- -# Example to idat files from TCGA projects -#------------------------------------------------------- -projects <- TCGAbiolinks:::getGDCprojects()$project_id -projects <- projects[grepl('^TCGA',projects,perl=T)] -match.file.cases.all <- NULL -for(proj in projects){ - print(proj) - query <- GDCquery( - project = proj, - data.category = "Raw microarray data", - data.type = "Raw intensities", - experimental.strategy = "Methylation array", - legacy = TRUE, - file.type = ".idat", - platform = "Illumina Human Methylation 450" - ) - match.file.cases <- getResults(query,cols=c("cases","file_name")) - match.file.cases$project <- proj 
- match.file.cases.all <- rbind(match.file.cases.all,match.file.cases) - tryCatch( - GDCdownload(query, method = "api", files.per.chunk = 20), - error = function(e) GDCdownload(query, method = "client") - ) -} -# This will create a map between idat file name, cases (barcode) and project -readr::write_tsv(match.file.cases.all, path = "idat_filename_case.txt") -# code to move all files to local folder -for(file in dir(".",pattern = ".idat", recursive = T)){ - TCGAbiolinks::move(file,basename(file)) -} -``` - - -### DNA methylation - -```{r, eval = FALSE} -query_meth.hg19 <- GDCquery( - project= "TCGA-LGG", - data.category = "DNA methylation", - platform = "Illumina Human Methylation 450", - barcode = c("TCGA-HT-8111-01A-11D-2399-05","TCGA-HT-A5R5-01A-11D-A28N-05"), - legacy = TRUE -) -GDCdownload(query_meth.hg19) -data.hg19 <- GDCprepare(query_meth.hg19) -``` - - -### Protein expression -```{r, eval = FALSE} -query <- GDCquery( - project = "TCGA-GBM", - data.category = "Protein expression", - legacy = TRUE, - barcode = c("TCGA-OX-A56R-01A-21-A44T-20","TCGA-08-0357-01A-21-1898-20") -) -GDCdownload(query) -data <- GDCprepare( - query, save = TRUE, - save.filename = "gbmProteinExpression.rda", - remove.files.prepared = TRUE -) -``` - - -### Gene expression -```{r, eval = FALSE} -# Aligned against Hg19 -query.exp.hg19 <- GDCquery( - project = "TCGA-GBM", - data.category = "Gene expression", - data.type = "Gene expression quantification", - platform = "Illumina HiSeq", - file.type = "normalized_results", - experimental.strategy = "RNA-Seq", - barcode = c("TCGA-14-0736-02A-01R-2005-01", "TCGA-06-0211-02A-02R-2005-01"), - legacy = TRUE -) -GDCdownload(query.exp.hg19) -data <- GDCprepare(query.exp.hg19) -``` diff --git a/vignettes/query.Rmd b/vignettes/query.Rmd index ac3e6a43d..3dc0400a4 100644 --- a/vignettes/query.Rmd +++ b/vignettes/query.Rmd @@ -18,8 +18,6 @@ knitr::opts_knit$set(progress = FALSE) **TCGAbiolinks** has provided a few functions to search GDC database. 
-This section starts by explaining the different GDC sources (Harmonized and Legacy Archive), followed by some examples -how to access them. --- @@ -33,23 +31,6 @@ library(DT) # Useful information -
-
Different sources: Legacy vs Harmonized
-
- - -There are two available sources to download GDC data using TCGAbiolinks: - -- GDC Legacy Archive : provides access to an unmodified copy of data that was previously stored in -[CGHub](https://cghub.ucsc.edu/) and in the TCGA Data Portal hosted by the TCGA Data Coordinating Center (DCC), in which uses -as references GRCh37 (hg19) and GRCh36 (hg18). -- GDC harmonized database: data available was harmonized against GRCh38 (hg38) using GDC Bioinformatics Pipelines -which provides methods to the standardization of biospecimen and -clinical data. - -
-
-
Understanding the barcode
@@ -79,7 +60,6 @@ with the following arguments: | data.category | A valid project (see list with TCGAbiolinks:::getProjectSummary(project)) | | | data.type | A data type to filter the files to download | | | workflow.type | GDC workflow type | | -| legacy | Search in the legacy repository | | | access | Filter by access type. Possible values: controlled, open | | | platform | Example: | | | | CGH- 1x1M_G4447A | IlluminaGA_RNASeqV2 | @@ -107,7 +87,7 @@ with the following arguments: | | IlluminaHiSeq_RNASeqV2 | Mixed_DNASeq_Cont | | file.type | To be used in the legacy database for some platforms, to define which file types to be used. | | | barcode | A list of barcodes to filter the files to download | | -| experimental.strategy | Filter to experimental strategy. Harmonized: WXS, RNA-Seq, miRNA-Seq, Genotyping Array. Legacy: WXS, RNA-Seq, miRNA-Seq, Genotyping Array, DNA-Seq, Methylation array, Protein expression array, WXS,CGH array, VALIDATION, Gene expression array,WGS, MSI-Mono-Dinucleotide Assay, miRNA expression array, Mixed strategies, AMPLICON, Exon array, Total RNA-Seq, Capillary sequencing, Bisulfite-Seq | | +| experimental.strategy | Filter to experimental strategy. Harmonized: WXS, RNA-Seq, miRNA-Seq, Genotyping Array. | | | sample.type | A sample type to filter the files to download | | @@ -138,7 +118,7 @@ datatable( The other fields (data.category, data.type, workflow.type, platform, file.type) can be found below. Please, note that these tables are still incomplete. 
-## Harmonized data options (`legacy = FALSE`) +## Harmonized data options ```{r, echo=FALSE} datatable( @@ -149,21 +129,12 @@ datatable( ) ``` -## Legacy archive data options (`legacy = TRUE`) -```{r, echo=FALSE} -datatable( - readr::read_csv("https://docs.google.com/spreadsheets/d/1f98kFdj9mxVDc1dv4xTZdx8iWgUiDYO-qiFJINvmTZs/export?format=csv&gid=1817673686",col_types = readr::cols()), - filter = 'top', - options = list(scrollX = TRUE, keys = TRUE, pageLength = 40), - rownames = FALSE -) -``` # Harmonized database examples ## DNA methylation data: Recurrent tumor samples -In this example we will access the harmonized database (`legacy = FALSE`) +In this example we will access the harmonized database and search for all DNA methylation data for recurrent glioblastoma multiform (GBM) and low grade gliomas (LGG) samples. @@ -172,7 +143,6 @@ and low grade gliomas (LGG) samples. query <- GDCquery( project = c("TCGA-GBM", "TCGA-LGG"), data.category = "DNA Methylation", - legacy = FALSE, platform = c("Illumina Human Methylation 450"), sample.type = "Recurrent Tumor" ) @@ -186,19 +156,18 @@ datatable( ## Samples with DNA methylation and gene expression data -In this example we will access the harmonized database (`legacy = FALSE`) +In this example we will access the harmonized database and search for all patients with DNA methylation (platform HumanMethylation450k) and gene expression data for Colon Adenocarcinoma tumor (TCGA-COAD). ```{r message=FALSE, warning = FALSE, eval = FALSE} -query.met <- GDCquery( +query_met <- GDCquery( project = "TCGA-COAD", data.category = "DNA Methylation", - legacy = FALSE, platform = c("Illumina Human Methylation 450") ) -query.exp <- GDCquery( +query_exp <- GDCquery( project = "TCGA-COAD", data.category = "Transcriptome Profiling", data.type = "Gene Expression Quantification", @@ -207,20 +176,19 @@ query.exp <- GDCquery( # Get all patients that have DNA methylation and gene expression. 
common.patients <- intersect( - substr(getResults(query.met, cols = "cases"), 1, 12), - substr(getResults(query.exp, cols = "cases"), 1, 12) + substr(getResults(query_met, cols = "cases"), 1, 12), + substr(getResults(query_exp, cols = "cases"), 1, 12) ) # Only seelct the first 5 patients -query.met <- GDCquery( +query_met <- GDCquery( project = "TCGA-COAD", data.category = "DNA Methylation", - legacy = FALSE, platform = c("Illumina Human Methylation 450"), barcode = common.patients[1:5] ) -query.exp <- GDCquery( +query_exp <- GDCquery( project = "TCGA-COAD", data.category = "Transcriptome Profiling", data.type = "Gene Expression Quantification", @@ -231,13 +199,13 @@ query.exp <- GDCquery( ```{r results_matched, message=FALSE, warning=FALSE, eval = FALSE} datatable( - getResults(query.met, cols = c("data_type","cases")), + getResults(query_met, cols = c("data_type","cases")), filter = 'top', options = list(scrollX = TRUE, keys = TRUE, pageLength = 5), rownames = FALSE ) datatable( - getResults(query.exp, cols = c("data_type","cases")), + getResults(query_exp, cols = c("data_type","cases")), filter = 'top', options = list(scrollX = TRUE, keys = TRUE, pageLength = 5), rownames = FALSE @@ -327,98 +295,13 @@ datatable( ``` -# Legacy archive examples - -## DNA methylation - -### Array-based assays - -This example shows how the user can search for glioblastoma multiform (GBM) -and DNA methylation data -for platform Illumina Human Methylation 450 and Illumina Human Methylation 27. 
- -```{r message=FALSE, warning=FALSE} -query <- GDCquery( - project = c("TCGA-GBM"), - legacy = TRUE, - data.category = "DNA methylation", - platform = c("Illumina Human Methylation 450", "Illumina Human Methylation 27") -) -datatable( - getResults(query, rows = 1:100), - filter = 'top', - options = list(scrollX = TRUE, keys = TRUE, pageLength = 5), - rownames = FALSE -) -``` - -### whole-genome bisulfite sequencing (WGBS) - -```{r message = FALSE, warning = FALSE, eval = FALSE} - -query <- GDCquery( - project = c("TCGA-LUAD"), - legacy = TRUE, - data.category = "DNA methylation", - data.type = "Methylation percentage", - experimental.strategy = "Bisulfite-Seq" -) - -# VCF - controlled data -query <- GDCquery( - project = c("TCGA-LUAD"), - legacy = TRUE, - data.category = "DNA methylation", - data.type = "Bisulfite sequence alignment", - experimental.strategy = "Bisulfite-Seq" -) - - -# WGBS BAM files - controlled data -query <- GDCquery( - project = c("TCGA-LUAD"), - legacy = TRUE, - data.type = "Aligned reads", - data.category = "Raw sequencing data", - experimental.strategy = "Bisulfite-Seq" -) -``` - - -## Gene expression - -This exmaple shows how the user can search for glioblastoma multiform (GBM) -gene expression data with the normalized results for expression of a gene. -For more information about file.types check [GDC TCGA file types](https://gdc.cancer.gov/resources-tcga-users/legacy-archive-tcga-tag-descriptions) - -```{r message=FALSE, warning=FALSE} -# Gene expression aligned against hg19. 
-query.exp.hg19 <- GDCquery( - project = "TCGA-GBM", - data.category = "Gene expression", - data.type = "Gene expression quantification", - platform = "Illumina HiSeq", - file.type = "normalized_results", - experimental.strategy = "RNA-Seq", - barcode = c("TCGA-14-0736-02A-01R-2005-01", "TCGA-06-0211-02A-02R-2005-01"), - legacy = TRUE -) - -datatable( - getResults(query.exp.hg19), - filter = 'top', - options = list(scrollX = TRUE, keys = TRUE, pageLength = 5), - rownames = FALSE -) -``` - # Get Manifest file If you want to get the manifest file from the query object you can use the function *getManifest*. If you -set save to TRUEm a txt file that can be used with GDC-client Data transfer tool (DTT) or with its GUI version [ddt-ui](https://github.com/NCI-GDC/dtt-ui) will be created. +set save to `TRUE`, a txt file that can be used with GDC-client Data transfer tool (DTT) or with its GUI version [dtt-ui](https://github.com/NCI-GDC/dtt-ui) will be created. ```{r message=FALSE, warning=FALSE} -getManifest(query.exp.hg19,save = FALSE) +getManifest(query,save = FALSE) ``` # ATAC-seq data @@ -440,10 +323,10 @@ datatable( You can use the function `GDCquery_ATAC_seq` filter the manifest table and use `GDCdownload` to save the data locally. 
```{r message=FALSE, warning=FALSE,eval = FALSE} query <- TCGAbiolinks:::GDCquery_ATAC_seq(file.type = "rds") -GDCdownload(query,method = "client") +GDCdownload(query, method = "client") query <- TCGAbiolinks:::GDCquery_ATAC_seq(file.type = "bigWigs") -GDCdownload(query,method = "client") +GDCdownload(query, method = "client") ``` From d7020f777caf2e515573da53d2f87150944be20d Mon Sep 17 00:00:00 2001 From: Tiago Silva Date: Mon, 8 May 2023 12:53:01 -0400 Subject: [PATCH 2/9] Removing legacy argument from vignette --- vignettes/analysis.Rmd | 13 ++-- vignettes/clinical.Rmd | 155 ----------------------------------------- vignettes/mutation.Rmd | 54 +------------- 3 files changed, 8 insertions(+), 214 deletions(-) diff --git a/vignettes/analysis.Rmd b/vignettes/analysis.Rmd index daf877dff..bdbbdd413 100755 --- a/vignettes/analysis.Rmd +++ b/vignettes/analysis.Rmd @@ -656,13 +656,13 @@ to create a mean DNA methylation boxplot with the function ```{r,include=FALSE,echo=FALSE, fig.height=5, message=FALSE, warning=FALSE,eval=FALSE} query <- GDCquery( project = "TCGA-GBM", - data.category = "DNA methylation", + data.category = "DNA Methylation", platform = "Illumina Human Methylation 27", - legacy = TRUE, - barcode = c("TCGA-02-0058-01A-01D-0186-05", "TCGA-12-1597-01B-01D-0915-05", - "TCGA-12-0829-01A-01D-0392-05", "TCGA-06-0155-01B-01D-0521-05", - "TCGA-02-0099-01A-01D-0199-05", "TCGA-19-4068-01A-01D-1228-05", - "TCGA-19-1788-01A-01D-0595-05", "TCGA-16-0848-01A-01D-0392-05" + barcode = c( + "TCGA-02-0058-01A-01D-0186-05", "TCGA-12-1597-01B-01D-0915-05", + "TCGA-12-0829-01A-01D-0392-05", "TCGA-06-0155-01B-01D-0521-05", + "TCGA-02-0099-01A-01D-0199-05", "TCGA-19-4068-01A-01D-1228-05", + "TCGA-19-1788-01A-01D-0595-05", "TCGA-16-0848-01A-01D-0392-05" ) ) GDCdownload(query, method = "api") @@ -676,7 +676,6 @@ query <- GDCquery( project = "TCGA-GBM", data.category = "DNA methylation", platform = "Illumina Human Methylation 27", - legacy = TRUE, barcode = c( 
"TCGA-02-0058-01A-01D-0186-05", "TCGA-12-1597-01B-01D-0915-05", "TCGA-12-0829-01A-01D-0392-05", "TCGA-06-0155-01B-01D-0521-05", diff --git a/vignettes/clinical.Rmd b/vignettes/clinical.Rmd index c37f5a70e..33edd3033 100644 --- a/vignettes/clinical.Rmd +++ b/vignettes/clinical.Rmd @@ -267,65 +267,6 @@ clinical.admin %>% ``` -# Microsatellite data - - -MSI-Mono-Dinucleotide Assay is performed to test a panel of four mononucleotide repeat loci (polyadenine tracts BAT25, BAT26, BAT40, and transforming growth factor receptor type II) and three dinucleotide repeat loci (CA repeats in D2S123, D5S346, and D17S250). Two additional pentanucleotide loci (Penta D and Penta E) are included in this assay to evaluate sample identity. Multiplex fluorescent-labeled PCR and capillary electrophoresis were used to identify MSI if a variation in the number of microsatellite repeats was detected between tumor and matched non-neoplastic tissue or mononuclear blood cells. Equivocal or failed markers were re-evaluated by singleplex PCR. - -classifications: microsatellite-stable (MSS), low level MSI (MSI-L) if less than 40% of markers were altered and high level MSI (MSI-H) if greater than 40% of markers were altered. 
- -Reference: [TCGA wiki](https://wiki.nci.nih.gov/display/TCGA/Microsatellite+data) - -Level 3 data is included in BCR clinical-based submissions and can be downloaded as follows: - -```{r msi, results = 'hide', echo=TRUE, message=FALSE, warning=FALSE,eval = F} -query <- GDCquery( - project = "TCGA-COAD", - data.category = "Other", - legacy = TRUE, - access = "open", - data.type = "Auxiliary test", - barcode = c("TCGA-AD-A5EJ","TCGA-DM-A0X9") -) -GDCdownload(query) -msi_results <- GDCprepare_clinic(query, "msi") -``` - -```{r echo=TRUE, message=FALSE, warning=FALSE} -msi_results %>% DT::datatable(options = list(scrollX = TRUE, keys = TRUE)) -``` - - -# Tissue slide image (SVS format) - -```{r results = 'hide', echo=TRUE, message=FALSE, warning=FALSE} -# Tissue slide image files from legacy database -query_legacy <- GDCquery( - project = "TCGA-COAD", - data.category = "Clinical", - data.type = "Tissue slide image", - legacy = TRUE, - barcode = c("TCGA-RU-A8FL","TCGA-AA-3972") -) - -# Tissue slide image files from harmonized database -query_harmonized <- GDCquery( - project = "TCGA-OV", - data.category = "Biospecimen", - data.type = 'Slide Image' -) -``` - -```{r echo=TRUE, message=FALSE, warning=FALSE} -query_legacy %>% - getResults %>% - DT::datatable(options = list(scrollX = TRUE, keys = TRUE)) - -query_harmonized %>% - getResults %>% - head %>% - DT::datatable(options = list(scrollX = TRUE, keys = TRUE)) -``` # Diagnostic Slide (SVS format) @@ -348,102 +289,6 @@ query_harmonized %>% ``` - -# Legacy archive files -The clinical data types available in legacy database are: - -* Biospecimen data (Biotab format) -* Tissue slide image (SVS format) -* Clinical Supplement (XML format) -* Pathology report (PDF) -* Clinical data (Biotab format) - -## Pathology report (PDF) -```{r results = 'hide', echo=TRUE, message=FALSE, warning=FALSE} -# Pathology report from legacy portal -query_legacy <- GDCquery( - project = "TCGA-COAD", - data.category = "Clinical", - data.type = 
"Pathology report", - legacy = TRUE, - barcode = c("TCGA-RU-A8FL","TCGA-AA-3972") -) -``` - -```{r echo=TRUE, message=FALSE, warning=FALSE} -query_legacy %>% - getResults %>% - DT::datatable(options = list(scrollX = TRUE, keys = TRUE)) -``` - -## Tissue slide image (SVS format) - -```{r results = 'hide', echo=TRUE, message=FALSE, warning=FALSE, eval=FALSE} -# Tissue slide image -query <- GDCquery( - project = "TCGA-COAD", - data.category = "Clinical", - data.type = "Tissue slide image", - legacy = TRUE, - barcode = c("TCGA-RU-A8FL","TCGA-AA-3972") -) -``` - -```{r echo = TRUE, message = FALSE, warning = FALSE} -query %>% - getResults %>% - DT::datatable(options = list(scrollX = TRUE, keys = TRUE)) -``` - -## Clinical Supplement (XML format) - -```{r results = 'hide', echo = TRUE, message = FALSE, warning = FALSE} -# Clinical Supplement -query <- GDCquery( - project = "TCGA-COAD", - data.category = "Clinical", - data.type = "Clinical Supplement", - legacy = TRUE, - barcode = c("TCGA-RU-A8FL","TCGA-AA-3972") -) -``` - -```{r echo=TRUE, message=FALSE, warning=FALSE} -query %>% - getResults %>% - DT::datatable(options = list(scrollX = TRUE, keys = TRUE)) -``` - - -## Clinical data (Biotab format) -```{r biotab_legacy,results = 'hide', echo=TRUE, message=FALSE, warning=FALSE} -# Clinical data -query <- GDCquery( - project = "TCGA-COAD", - data.category = "Clinical", - data.type = "Clinical data", - legacy = TRUE, - file.type = "txt" -) -``` - -```{r echo=TRUE, message=FALSE, warning=FALSE} -query %>% - getResults %>% - select(-matches("cases"))%>% - DT::datatable(options = list(scrollX = TRUE, keys = TRUE)) -``` - -```{r results = 'hide', echo=TRUE, message=FALSE, warning=FALSE, eval = FALSE} -GDCdownload(query) -clinical_biotab <- GDCprepare(query) -``` - -```{r echo=TRUE, message=FALSE, warning=FALSE, eval = FALSE} -names(clinical_biotab) -datatable(clinical_biotab$clinical_radiation_coad, options = list(scrollX = TRUE, keys = TRUE)) -``` - # Filter functions Also, 
some functions to work with clinical data are provided. diff --git a/vignettes/mutation.Rmd b/vignettes/mutation.Rmd index 9371fb1c5..8559b346a 100644 --- a/vignettes/mutation.Rmd +++ b/vignettes/mutation.Rmd @@ -40,8 +40,7 @@ For more information please access https://github.com/NCI-GDC/gdc-maf-tool and query <- GDCquery( project = "TCGA-CHOL", data.category = "Simple Nucleotide Variation", - access = "open", - legacy = FALSE, + access = "open", data.type = "Masked Somatic Mutation", workflow.type = "Aliquot Ensemble Somatic Variant Merging and Masking" ) @@ -60,54 +59,6 @@ datatable(maf[1:20,], rownames = FALSE) ``` -## Mutation data (hg19) - -This example will download MAF (mutation annotation files) aligned against hg19 (Old TCGA maf files) - - -```{r results = 'hide', echo=TRUE, message=FALSE, warning=FALSE} -query.maf.hg19 <- GDCquery( - project = "TCGA-CHOL", - data.category = "Simple nucleotide variation", - data.type = "Simple somatic mutation", - access = "open", - legacy = TRUE -) -``` -```{r echo = TRUE, message = FALSE, warning = FALSE} -# Check maf availables -getResults(query.maf.hg19) %>% - dplyr::select(-contains("sample_type")) %>% - dplyr::select(-contains("cases")) %>% - DT::datatable( - filter = 'top', - options = list(scrollX = TRUE, keys = TRUE, pageLength = 10), - rownames = FALSE - ) -``` -```{r results = 'hide', echo=TRUE, message=FALSE, warning=FALSE,eval=FALSE} -query.maf.hg19 <- GDCquery(project = "TCGA-CHOL", - data.category = "Simple nucleotide variation", - data.type = "Simple somatic mutation", - access = "open", - file.type = "bcgsc.ca_CHOL.IlluminaHiSeq_DNASeq.1.somatic.maf", - legacy = TRUE) -GDCdownload(query.maf.hg19) -maf <- GDCprepare(query.maf.hg19) -``` - -```{r message=FALSE, warning=FALSE, include=FALSE} -data <- bcgsc.ca_CHOL.IlluminaHiSeq_DNASeq.1.somatic.maf -``` -```{r echo = TRUE, message = FALSE, warning = FALSE} -# Only first 50 to make render faster -datatable(maf[1:20,], - filter = 'top', - options = list(scrollX 
= TRUE, keys = TRUE, pageLength = 5), - rownames = FALSE) -``` - - ## Mutation data MC3 file This will download the MC3 MAF file from https://gdc.cancer.gov/about-data/publications/mc3-2017, @@ -128,8 +79,7 @@ library(dplyr) query <- GDCquery( project = "TCGA-CHOL", data.category = "Simple Nucleotide Variation", - access = "open", - legacy = FALSE, + access = "open", data.type = "Masked Somatic Mutation", workflow.type = "Aliquot Ensemble Somatic Variant Merging and Masking" ) From 727c6028cb5392777f6a10701ed82aa8c09ae771 Mon Sep 17 00:00:00 2001 From: Tiago Silva Date: Mon, 8 May 2023 12:57:46 -0400 Subject: [PATCH 3/9] Version bump --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index f3358f98a..2cffcc63d 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: TCGAbiolinks Type: Package Title: TCGAbiolinks: An R/Bioconductor package for integrative analysis with GDC data -Version: 2.29.1 +Version: 2.29.2 Date: 2022-17-08 Author: Antonio Colaprico, Tiago Chedraoui Silva, From 3ca02453f3dc1936aa60efdc09687bdf323ffcec Mon Sep 17 00:00:00 2001 From: Tiago Silva Date: Wed, 10 May 2023 09:41:15 -0400 Subject: [PATCH 4/9] Removing legacy option --- DESCRIPTION | 2 +- R/analyze.R | 60 +++++++++++++------------------------------- man/matchedMetExp.Rd | 6 ++--- 3 files changed, 20 insertions(+), 48 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 2cffcc63d..a60661c85 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: TCGAbiolinks Type: Package Title: TCGAbiolinks: An R/Bioconductor package for integrative analysis with GDC data -Version: 2.29.2 +Version: 2.29.3 Date: 2022-17-08 Author: Antonio Colaprico, Tiago Chedraoui Silva, diff --git a/R/analyze.R b/R/analyze.R index ee2a64384..f7a41b4b4 100644 --- a/R/analyze.R +++ b/R/analyze.R @@ -1987,59 +1987,33 @@ getAdjacencyBiogrid <- function(tmp.biogrid, names.genes = NULL) { #' DNA methylation and Gene expression data from GDC 
database #' @param project A GDC project #' @param n Number of samples to return. If NULL return all (default) -#' @param legacy Access legacy (hg19) or harmonized database (hg38). #' @return A vector of barcodes #' @export #' @examples #' # Get ACC samples with both DNA methylation (HM450K) and gene expression aligned to hg19 -#' samples <- matchedMetExp("TCGA-UCS", legacy = TRUE) +#' samples <- matchedMetExp("TCGA-UCS") matchedMetExp <- function( project, - legacy = FALSE, n = NULL ) { - if (legacy) { - # get primary solid tumor samples: DNA methylation - message("Download DNA methylation information") - met450k <- GDCquery( - project = project, - data.category = "DNA methylation", - platform = "Illumina Human Methylation 450", - legacy = TRUE, - sample.type = c("Primary Tumor") - ) - - # get primary solid tumor samples: RNAseq - message("Download gene expression information") - exp <- GDCquery( - project = project, - data.category = "Gene expression", - data.type = "Gene expression quantification", - platform = "Illumina HiSeq", - file.type = "results", - sample.type = c("Primary Tumor"), - legacy = TRUE - ) - } else { - # get primary solid tumor samples: DNA methylation - message("Download DNA methylation information") - met450k <- GDCquery( - project = project, - data.category = "DNA Methylation", - platform = "Illumina Human Methylation 450", - sample.type = c("Primary Tumor") - ) - # get primary solid tumor samples: RNAseq - message("Download gene expression information") - exp <- GDCquery( - project = project, - data.category = "Transcriptome Profiling", - data.type = "Gene Expression Quantification", - workflow.type = "STAR - Counts" - ) + # get primary solid tumor samples: DNA methylation + message("Download DNA methylation information") + met450k <- GDCquery( + project = project, + data.category = "DNA Methylation", + platform = "Illumina Human Methylation 450", + sample.type = c("Primary Tumor") + ) - } + # get primary solid tumor samples: RNAseq + 
message("Download gene expression information") + exp <- GDCquery( + project = project, + data.category = "Transcriptome Profiling", + data.type = "Gene Expression Quantification", + workflow.type = "STAR - Counts" + ) # Get patients with samples in both platforms met450k_tp <- met450k$results[[1]]$cases diff --git a/man/matchedMetExp.Rd b/man/matchedMetExp.Rd index baaf77a6b..538a5b9f6 100644 --- a/man/matchedMetExp.Rd +++ b/man/matchedMetExp.Rd @@ -4,13 +4,11 @@ \alias{matchedMetExp} \title{Get GDC primary tumors samples with both DNA methylation (HM450K) and Gene expression data} \usage{ -matchedMetExp(project, legacy = FALSE, n = NULL) +matchedMetExp(project, n = NULL) } \arguments{ \item{project}{A GDC project} -\item{legacy}{Access legacy (hg19) or harmonized database (hg38).} - \item{n}{Number of samples to return. If NULL return all (default)} } \value{ @@ -22,5 +20,5 @@ DNA methylation and Gene expression data from GDC database } \examples{ # Get ACC samples with both DNA methylation (HM450K) and gene expression aligned to hg19 -samples <- matchedMetExp("TCGA-UCS", legacy = TRUE) +samples <- matchedMetExp("TCGA-UCS") } From 73c8c6a25eca5eebe130314f078a2190951a013e Mon Sep 17 00:00:00 2001 From: Tiago Silva Date: Fri, 12 May 2023 20:42:17 -0400 Subject: [PATCH 5/9] Removing legacy from API functions --- R/api.R | 142 ++++++++++++++++++++-------------- man/getDataCategorySummary.Rd | 6 +- man/getNbCases.Rd | 4 +- man/getNbFiles.Rd | 4 +- man/getProjectSummary.Rd | 4 +- man/getSampleFilesSummary.Rd | 4 +- 6 files changed, 90 insertions(+), 74 deletions(-) diff --git a/R/api.R b/R/api.R index f3b4acbbe..072cbb86b 100644 --- a/R/api.R +++ b/R/api.R @@ -5,7 +5,6 @@ #' data_category + data_type + experimental_strategy + platform #' Almost like https://portal.gdc.cancer.gov/exploration #' @param project A GDC project -#' @param legacy Access legacy database ? Deafult: FALSE #' @param files.access Filter by file access ("open" or "controlled"). 
#' Default: no filter #' @export @@ -18,13 +17,16 @@ #' @importFrom tidyr spread unite #' @importFrom plyr ldply count #' @author Tiago Chedraoui Silva -getSampleFilesSummary <- function(project, legacy = FALSE, files.access = NA) { +getSampleFilesSummary <- function( + project, + files.access = NA +) { out <- NULL for(proj in project){ checkProjectInput(proj) message("Accessing information for project: ", proj) - url <- getSampleSummaryUrl(proj,legacy) + url <- getSampleSummaryUrl(proj) x <- getURL(url,fromJSON,simplifyDataFrame = TRUE) y <- x$data$hits$files names(y) <- x$data$hits$submitter_id @@ -46,75 +48,95 @@ getSampleFilesSummary <- function(project, legacy = FALSE, files.access = NA) { return(out) } -getSampleSummaryUrl <- function(project,legacy = FALSE, files.access = NA){ +getSampleSummaryUrl <- function( + project, + files.access = NA +){ # Get manifest using the API - baseURL <- ifelse(legacy,"https://api.gdc.cancer.gov/legacy/cases/?","https://api.gdc.cancer.gov/cases/?") + baseURL <- "https://api.gdc.cancer.gov/cases/?" 
options.pretty <- "pretty=true" options.expand <- "expand=summary,summary.data_categories,files" - #option.size <- paste0("size=",getNbFiles(project,data.category,legacy)) option.size <- paste0("size=",1000) option.format <- paste0("format=JSON") - options.filter <- paste0("filters=", - URLencode('{"op":"and","content":['), # Start json request - URLencode('{"op":"in","content":{"field":"cases.project.project_id","value":["'), - project, - URLencode('"]}}')) + options.filter <- paste0( + "filters=", + URLencode('{"op":"and","content":['), # Start json request + URLencode('{"op":"in","content":{"field":"cases.project.project_id","value":["'), + project, + URLencode('"]}}') + ) if(!any(is.na(files.access))) { options.filter <- paste0(options.filter,addFilter("files.access", files.access)) } # Close json request options.filter <- paste0(options.filter, URLencode(']}')) - url <- paste0(baseURL,paste(options.pretty, - options.expand, - option.size, - options.filter, - option.format, - sep = "&")) + url <- paste0( + baseURL, + paste(options.pretty, + options.expand, + option.size, + options.filter, + option.format, + sep = "&" + ) + ) return(url) } -getSubmitterIDUrl <- function(project,legacy = FALSE, files.access = NA){ +getSubmitterIDUrl <- function( + project, + files.access = NA +){ # Get manifest using the API - baseURL <- ifelse(legacy,"https://api.gdc.cancer.gov/legacy/cases/?","https://api.gdc.cancer.gov/cases/?") + baseURL <- "https://api.gdc.cancer.gov/cases/?" 
options.pretty <- "pretty=true" options.expand <- "expand=files.access" - #option.size <- paste0("size=",getNbFiles(project,data.category,legacy)) option.fields = "fields=submitter_id" option.size <- paste0("size=",1000) option.format <- paste0("format=JSON") - options.filter <- paste0("filters=", - URLencode('{"op":"and","content":['), # Start json request - URLencode('{"op":"in","content":{"field":"cases.project.project_id","value":["'), - project, - URLencode('"]}}')) + options.filter <- paste0( + "filters=", + URLencode('{"op":"and","content":['), # Start json request + URLencode('{"op":"in","content":{"field":"cases.project.project_id","value":["'), + project, + URLencode('"]}}') + ) if(!any(is.na(files.access))) { options.filter <- paste0(options.filter,addFilter("files.access", files.access)) } # Close json request options.filter <- paste0(options.filter, URLencode(']}')) - url <- paste0(baseURL,paste(options.pretty, - options.expand, - option.fields, - option.size, - options.filter, - option.format, - sep = "&")) + url <- paste0( + baseURL, + paste( + options.pretty, + options.expand, + option.fields, + option.size, + options.filter, + option.format, + sep = "&" + ) + ) return(url) } # getSubmitterID("TCGA-BRCA") # getSubmitterID("MMRF-COMPASS") -getSubmitterID <- function(project,legacy = FALSE, files.access = NA){ +getSubmitterID <- function( + project, + files.access = NA +){ - url <- getSubmitterIDUrl(project,legacy,files.access) + url <- getSubmitterIDUrl(project,files.access) json <- tryCatch( getURL(url,fromJSON,timeout(600),simplifyDataFrame = TRUE), @@ -186,8 +208,8 @@ getBarcodefromAliquot <- function(aliquot){ #' @param FUN function that calls the API #' @author Tiago Chedraoui Silva splitAPICall <- function(FUN, step = 20, items){ - info <- NULL - info <- tryCatch({ + info <- NULL + info <- tryCatch({ for(i in 0:(ceiling(length(items)/step) - 1)){ start <- 1 + step * i end <- ifelse(((i + 1) * step) > length(items), length(items),((i + 1) * 
step)) @@ -197,7 +219,7 @@ splitAPICall <- function(FUN, step = 20, items){ info <- plyr::rbind.fill(info, FUN(items[start:end])) } } - info + info }, error = function(e) { step <- 2 for(i in 0:(ceiling(length(items)/step) - 1)){ @@ -210,7 +232,7 @@ splitAPICall <- function(FUN, step = 20, items){ } } }) - info + info } @@ -220,24 +242,28 @@ splitAPICall <- function(FUN, step = 20, items){ #' Create a Summary table for each sample in a project saying if it contains #' or not files for a certain data category #' @param project A GDC project -#' @param legacy Access legacy (hg19) or harmonized database (hg38). #' @return A data frame #' @export #' @importFrom stats xtabs #' @examples -#' summary <- getDataCategorySummary("TCGA-ACC", legacy = TRUE) +#' summary <- getDataCategorySummary("TCGA-ACC") #' @author Tiago Chedraoui Silva -getDataCategorySummary <- function(project, legacy = FALSE){ - baseURL <- ifelse(legacy,"https://api.gdc.cancer.gov/legacy/files/?","https://api.gdc.cancer.gov/files/?") - url <- paste0(baseURL,"&expand=cases&size=100000&fields=cases.submitter_id,data_category&filters=", - URLencode('{"op":"and","content":[{"op":"in","content":{"field":"cases.project.project_id","value":["'), - URLencode(project), - URLencode('"]}}]}')) +getDataCategorySummary <- function(project){ + baseURL <- "https://api.gdc.cancer.gov/files/?" 
+ url <- paste0( + baseURL,"&expand=cases&size=100000&fields=cases.submitter_id,data_category&filters=", + URLencode('{"op":"and","content":[{"op":"in","content":{"field":"cases.project.project_id","value":["'), + URLencode(project), + URLencode('"]}}]}') + ) json <- tryCatch( getURL(url,fromJSON,timeout(600),simplifyDataFrame = TRUE), error = function(e) { - fromJSON(content(getURL(url,GET,timeout(600)), as = "text", encoding = "UTF-8"), simplifyDataFrame = TRUE) + fromJSON( + content(getURL(url,GET,timeout(600)), as = "text", encoding = "UTF-8"), + simplifyDataFrame = TRUE + ) } ) json <- json$data$hits @@ -251,7 +277,6 @@ getDataCategorySummary <- function(project, legacy = FALSE){ #' @title Get Project Summary from GDC #' @param project A GDC project -#' @param legacy Select between Harmonized or Legacy database #' @examples #' getProjectSummary("TCGA-ACC") #' \dontrun{ @@ -259,9 +284,9 @@ getDataCategorySummary <- function(project, legacy = FALSE){ #' } #' @export #' @author Tiago Chedraoui Silva -getProjectSummary <- function(project, legacy = FALSE){ +getProjectSummary <- function(project){ checkProjectInput(project) - baseURL <- ifelse(legacy,"https://api.gdc.cancer.gov/legacy/projects/","https://api.gdc.cancer.gov/projects/") + baseURL <- "https://api.gdc.cancer.gov/projects/" url <- paste0(baseURL, project,"?expand=summary,summary.data_categories&pretty=true") return(fromJSON(url,simplifyDataFrame = TRUE)$data$summary) } @@ -269,17 +294,19 @@ getProjectSummary <- function(project, legacy = FALSE){ #' @title Get Number of cases in GDC for a project #' @param project A GDC project #' @param data.category A GDC project data category -#' @param legacy Select between Harmonized or Legacy database #' @examples #' \dontrun{ #' getNbCases("TCGA-ACC","Clinical") #' getNbCases("CPTAC-2","Clinical") #' } #' @author Tiago Chedraoui Silva -getNbCases <- function(project, data.category, legacy = FALSE){ - summary <- getProjectSummary(project, legacy) +getNbCases <- 
function( + project, + data.category +){ + summary <- getProjectSummary(project) if(data.category %in% summary$data_categories$data_category){ - summary <- getProjectSummary(project, legacy)$data_categories + summary <- getProjectSummary(project)$data_categories nb <- summary[summary$data_category == data.category,"case_count"] } else { nb <- summary$case_count @@ -290,17 +317,16 @@ getNbCases <- function(project, data.category, legacy = FALSE){ #' @title Get Number of files in GDC for a project #' @param project A GDC project #' @param data.category A GDC project data category -#' @param legacy Select between Harmonized or Legacy database #' @examples #' \dontrun{ #' getNbFiles("TCGA-ACC","Clinical") #' getNbFiles("CPTAC-2","Clinical") #' } #' @author Tiago Chedraoui Silva -getNbFiles <- function(project, data.category, legacy = FALSE){ - summary <- getProjectSummary(project, legacy) +getNbFiles <- function(project, data.category){ + summary <- getProjectSummary(project) if(data.category %in% summary$data_categories$data_category){ - summary <- getProjectSummary(project, legacy)$data_categories + summary <- getProjectSummary(project)$data_categories nb <- summary[summary$data_category == data.category,"file_count"] } else { nb <- summary$file_count diff --git a/man/getDataCategorySummary.Rd b/man/getDataCategorySummary.Rd index 55fd6d55d..f3783b4ac 100644 --- a/man/getDataCategorySummary.Rd +++ b/man/getDataCategorySummary.Rd @@ -5,12 +5,10 @@ \title{Create a Summary table for each sample in a project saying if it contains or not files for a certain data category} \usage{ -getDataCategorySummary(project, legacy = FALSE) +getDataCategorySummary(project) } \arguments{ \item{project}{A GDC project} - -\item{legacy}{Access legacy (hg19) or harmonized database (hg38).} } \value{ A data frame @@ -20,7 +18,7 @@ Create a Summary table for each sample in a project saying if it contains or not files for a certain data category } \examples{ -summary <- 
getDataCategorySummary("TCGA-ACC", legacy = TRUE) +summary <- getDataCategorySummary("TCGA-ACC") } \author{ Tiago Chedraoui Silva diff --git a/man/getNbCases.Rd b/man/getNbCases.Rd index a5fd531f2..87541e5c6 100644 --- a/man/getNbCases.Rd +++ b/man/getNbCases.Rd @@ -4,14 +4,12 @@ \alias{getNbCases} \title{Get Number of cases in GDC for a project} \usage{ -getNbCases(project, data.category, legacy = FALSE) +getNbCases(project, data.category) } \arguments{ \item{project}{A GDC project} \item{data.category}{A GDC project data category} - -\item{legacy}{Select between Harmonized or Legacy database} } \description{ Get Number of cases in GDC for a project diff --git a/man/getNbFiles.Rd b/man/getNbFiles.Rd index 6c5224104..cf951f336 100644 --- a/man/getNbFiles.Rd +++ b/man/getNbFiles.Rd @@ -4,14 +4,12 @@ \alias{getNbFiles} \title{Get Number of files in GDC for a project} \usage{ -getNbFiles(project, data.category, legacy = FALSE) +getNbFiles(project, data.category) } \arguments{ \item{project}{A GDC project} \item{data.category}{A GDC project data category} - -\item{legacy}{Select between Harmonized or Legacy database} } \description{ Get Number of files in GDC for a project diff --git a/man/getProjectSummary.Rd b/man/getProjectSummary.Rd index 3d570c3ae..ab0622be1 100644 --- a/man/getProjectSummary.Rd +++ b/man/getProjectSummary.Rd @@ -4,12 +4,10 @@ \alias{getProjectSummary} \title{Get Project Summary from GDC} \usage{ -getProjectSummary(project, legacy = FALSE) +getProjectSummary(project) } \arguments{ \item{project}{A GDC project} - -\item{legacy}{Select between Harmonized or Legacy database} } \description{ Get Project Summary from GDC diff --git a/man/getSampleFilesSummary.Rd b/man/getSampleFilesSummary.Rd index aa27a956e..0071196b1 100644 --- a/man/getSampleFilesSummary.Rd +++ b/man/getSampleFilesSummary.Rd @@ -4,13 +4,11 @@ \alias{getSampleFilesSummary} \title{Retrieve summary of files per sample in a project} \usage{ -getSampleFilesSummary(project, legacy = 
FALSE, files.access = NA) +getSampleFilesSummary(project, files.access = NA) } \arguments{ \item{project}{A GDC project} -\item{legacy}{Access legacy database ? Deafult: FALSE} - \item{files.access}{Filter by file access ("open" or "controlled"). Default: no filter} } From d0a43d3e9b9c640ef778283e52b3267ff0f3b008 Mon Sep 17 00:00:00 2001 From: Tiago Silva Date: Fri, 12 May 2023 20:48:27 -0400 Subject: [PATCH 6/9] Update News --- NEWS | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/NEWS b/NEWS index 458f3daa1..2f4f54c80 100644 --- a/NEWS +++ b/NEWS @@ -32,16 +32,16 @@ CHANGES IN VERSION 2.7.13 CHANGES IN VERSION 2.5.0 ------------------------- -Changing chunks.per.download to files.per.chunk +* Changing chunks.per.download to files.per.chunk CHANGES IN VERSION 2.2.0 ------------------------- NEW FEATURES - o Add GISTIC2 information and Mutation information in the summarizedExperiment while preparing GDC data +* Add GISTIC2 information and Mutation information in the summarizedExperiment while preparing GDC data SIGNIFICANT USER-LEVEL CHANGES - o Speed up GDCquery +* Speed up GDCquery TCGAbiolinks 2.0 --------------------------------------------------------------- From b2bc706f2275867fbedb62707f0fe1c71a7bb26a Mon Sep 17 00:00:00 2001 From: Tiago Silva Date: Thu, 18 May 2023 10:53:48 -0400 Subject: [PATCH 7/9] Making some changes to the latest version. 
Removing legacy/harmonized from folder structure Fixing #573 Fixing #574 Needs more checking with CNV and CPTAC-3 projects --- NAMESPACE | 1 + NEWS | 1 + R/clinical.R | 36 +++-- R/download.R | 4 +- R/prepare.R | 191 +++++++++++++++---------- R/query.R | 88 ++++-------- man/GDCprepare_clinic.Rd | 4 +- man/GDCquery.Rd | 1 - tests/testthat/test-prepare-download.R | 39 +++-- tests/testthat/test-query.R | 48 ++++--- vignettes/clinical.Rmd | 15 +- 11 files changed, 232 insertions(+), 196 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index d39dce727..9437c6330 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -109,6 +109,7 @@ importFrom(dplyr,mutate_all) importFrom(dplyr,pull) importFrom(dplyr,row_number) importFrom(dplyr,slice) +importFrom(dplyr,summarise) importFrom(grDevices,dev.list) importFrom(grDevices,dev.off) importFrom(grDevices,pdf) diff --git a/NEWS b/NEWS index 2f4f54c80..e57e3bca2 100644 --- a/NEWS +++ b/NEWS @@ -2,6 +2,7 @@ CHANGES IN VERSION 2.29.1 ------------------------- * Removing support to legacy archive since it will be shutdown by GDC soon. 
+* When saving files we will not include folders prefix legacy/harmonized anymore CHANGES IN VERSION 2.21.1 ------------------------- diff --git a/R/clinical.R b/R/clinical.R index 1419a5a13..79fe7506b 100644 --- a/R/clinical.R +++ b/R/clinical.R @@ -170,6 +170,7 @@ TCGAquery_MatchedCoupledSampleTypes <- function(barcode,typesample){ #' @export #' @importFrom data.table rbindlist as.data.table #' @importFrom jsonlite fromJSON +#' @importFrom dplyr summarise #' @examples #' clinical <- GDCquery_clinic( #' project = "TCGA-ACC", @@ -348,7 +349,10 @@ GDCquery_clinic <- function( # we will collapse them into one single row # concatanating all columns using ; aux <- x %>% dplyr::group_by(submitter_id) %>% - summarise(across(everything(),~ paste(unique(.), collapse = ";"))) + dplyr::summarise( + across(everything(),~ paste(unique(.), collapse = ";")) + ) + aux$treatments <- list(dplyr::bind_rows(x$treatments)) aux } @@ -440,7 +444,7 @@ GDCquery_clinic <- function( #' query <- GDCquery( #' project = "TCGA-COAD", #' data.category = "Clinical", -#' file.type = "xml", +#' data.format = "bcr xml", #' barcode = c("TCGA-RU-A8FL","TCGA-AA-3972") #' ) #' GDCdownload(query) @@ -452,7 +456,7 @@ GDCquery_clinic <- function( #' query <- GDCquery( #' project = "TCGA-COAD", #' data.category = "Biospecimen", -#' file.type = "xml", +#' data.format = "bcr xml", #' data.type = "Biospecimen Supplement", #' barcode = c("TCGA-RU-A8FL","TCGA-AA-3972") #' ) @@ -503,9 +507,9 @@ GDCprepare_clinic <- function( } # Get all the clincal xml files - source <- "harmonized" + files <- file.path( - query$results[[1]]$project, source, + query$results[[1]]$project, gsub(" ","_",query$results[[1]]$data_category), gsub(" ","_",query$results[[1]]$data_type), gsub(" ","_",query$results[[1]]$file_id), @@ -586,16 +590,18 @@ GDCprepare_clinic <- function( message("Updating days_to_last_followup and vital_status from follow_up information using last entry") followup <- parseFollowup(files,xpath,clinical.info) - 
followup_last <- followup %>% dplyr::group_by(bcr_patient_barcode) %>% dplyr::summarise( - days_to_last_followup = max(as.numeric(days_to_last_followup),na.rm = TRUE), - vital_status = vital_status[ - ifelse( - any(followup$days_to_last_followup %in% ""), - which(followup$days_to_last_followup %in% ""), - which.max(days_to_last_followup) - ) - ] - ) + followup_last <- followup %>% + dplyr::group_by(bcr_patient_barcode) %>% + dplyr::summarise( + days_to_last_followup = max(as.numeric(days_to_last_followup),na.rm = TRUE), + vital_status = vital_status[ + ifelse( + any(followup$days_to_last_followup %in% ""), + which(followup$days_to_last_followup %in% ""), + which.max(days_to_last_followup) + ) + ] + ) clin$days_to_last_followup <- followup_last$days_to_last_followup[match(clin$bcr_patient_barcode,followup_last$bcr_patient_barcode)] clin$vital_status <- followup_last$vital_status[match(clin$bcr_patient_barcode,followup_last$bcr_patient_barcode)] } diff --git a/R/download.R b/R/download.R index be812ff9f..16beabc96 100644 --- a/R/download.R +++ b/R/download.R @@ -64,8 +64,6 @@ GDCdownload <- function( stop("We can only download one data type. 
Please use data.type argument in GDCquery to filter results.") } - source <- "harmonized" - dir.create(directory, showWarnings = FALSE, recursive = TRUE) for(proj in unique(unlist(query$project))){ message("Downloading data for project ", proj) @@ -77,7 +75,7 @@ GDCdownload <- function( path <- unique( file.path( - proj, source, + proj, gsub(" ","_", results$data_category), gsub(" ","_",results$data_type)) ) diff --git a/R/prepare.R b/R/prepare.R index fbfc3f172..92b419ee9 100644 --- a/R/prepare.R +++ b/R/prepare.R @@ -60,21 +60,20 @@ GDCprepare <- function( isServeOK() if(missing(query)) stop("Please set query parameter") - test.duplicated.cases <- - ( - any( - duplicated(query$results[[1]]$cases)) & # any duplicated - !(query$data.type %in% c( - "Clinical data", - "Protein expression quantification", - "Raw intensities", - "Masked Intensities", - "Clinical Supplement", - "Masked Somatic Mutation", - "Biospecimen Supplement" - ) - ) - ) + test.duplicated.cases <- ( + any( + duplicated(query$results[[1]]$cases)) & # any duplicated + !(query$data.type %in% c( + "Clinical data", + "Protein expression quantification", + "Raw intensities", + "Masked Intensities", + "Clinical Supplement", + "Masked Somatic Mutation", + "Biospecimen Supplement" + ) + ) + ) if(test.duplicated.cases) { @@ -90,10 +89,10 @@ GDCprepare <- function( if (!save & remove.files.prepared) { stop("To remove the files, please set save to TRUE. 
Otherwise, the data will be lost") } - # We save the files in project/source/data.category/data.type/file_id/file_name - source <- "harmonized" + + # We save the files in project/data.category/data.type/file_id/file_name files <- file.path( - query$results[[1]]$project, source, + query$results[[1]]$project, gsub(" ","_",query$results[[1]]$data_category), gsub(" ","_",query$results[[1]]$data_type), gsub(" ","_",query$results[[1]]$file_id), @@ -105,14 +104,19 @@ GDCprepare <- function( # For IDAT prepare since we need to put all IDATs in the same folder the code below will not work # a second run if (!all(file.exists(files))) { - # We have to check we movedthe files - if (query$data.type == "Masked Intensities" | query$data.category == "Raw microarray data"){ + # We have to check we moved the files + if ( + unique(query$results[[1]]$data_type) == "Masked Intensities" | + unique(query$results[[1]]$data_category) == "Raw microarray data" + ){ + files.idat <- file.path( - query$results[[1]]$project, source, + query$results[[1]]$project, gsub(" ","_",query$results[[1]]$data_category), gsub(" ","_",query$results[[1]]$data_type), gsub(" ","_",query$results[[1]]$file_name) ) + files.idat <- file.path(directory, files.idat) if (!all(file.exists(files) | file.exists(files.idat))) { stop( @@ -135,7 +139,7 @@ GDCprepare <- function( } cases <- ifelse( - grepl("TCGA|TARGET|CGCI-HTMCP-CC",query$results[[1]]$project %>% unlist()), + grepl("TCGA|TARGET|CGCI-HTMCP-CC|CPTAC-3",query$results[[1]]$project %>% unlist()), query$results[[1]]$cases, query$results[[1]]$sample.submitter_id ) @@ -165,9 +169,15 @@ GDCprepare <- function( if (unique(query$results[[1]]$data_type) == "Gene Level Copy Number Scores") { data <- readGISTIC(files, query$results[[1]]$cases) } else if (unique(query$results[[1]]$data_type) == "Gene Level Copy Number") { - data <- readGeneLevelCopyNumber(files, query$results[[1]]$cases,summarizedExperiment = summarizedExperiment) + data <- read_gene_level_copy_number( + 
files = files, + cases = query$results[[1]]$sample.submitter_id, + summarizedExperiment = summarizedExperiment + ) } else { - data <- readCopyNumberVariation(files, query$results[[1]]$cases) + data <- read_copy_number_variation( + files = files, cases = query$results[[1]]$cases + ) } } else if (grepl("Methylation Beta Value",unique(query$results[[1]]$data_type), ignore.case = TRUE)) { data <- readDNAmethylation( @@ -184,29 +194,30 @@ GDCprepare <- function( summarizedExperiment = summarizedExperiment, platform = unique(query$results[[1]]$platform) ) - } else if (grepl("Proteome Profiling",query$data.category,ignore.case = TRUE)) { + } else if (grepl("Proteome Profiling",query$data.category,ignore.case = TRUE)) { data <- readProteomeProfiling(files, cases = cases) - } else if (grepl("Protein expression",query$data.category,ignore.case = TRUE)) { + } else if (grepl("Protein expression",query$data.category,ignore.case = TRUE)) { data <- readProteinExpression(files, cases = cases) - if(summarizedExperiment) { + + if (summarizedExperiment) { message("SummarizedExperiment not implemented, if you need samples metadata use the function TCGAbiolinks:::colDataPrepare") } } else if (grepl("Simple Nucleotide Variation",query$data.category,ignore.case = TRUE)) { - if(grepl("Masked Somatic Mutation",query$results[[1]]$data_type[1],ignore.case = TRUE)){ + if (grepl("Masked Somatic Mutation",query$results[[1]]$data_type[1],ignore.case = TRUE)){ data <- readSimpleNucleotideVariationMaf(files) } } else if (grepl("Clinical|Biospecimen", query$data.category, ignore.case = TRUE)){ - data <- readClinical(files, query$data.type, cases = cases) + data <- read_clinical(files, query$data.type, cases = cases) summarizedExperiment <- FALSE } else if (grepl("Gene expression",query$data.category,ignore.case = TRUE)) { if (query$data.type == "Gene expression quantification") - data <- readGeneExpressionQuantification( + data <- read_gene_expression_quantification( files = files, cases = cases, 
summarizedExperiment = summarizedExperiment, @@ -215,7 +226,7 @@ GDCprepare <- function( ) if (query$data.type == "miRNA gene quantification") - data <- readGeneExpressionQuantification( + data <- read_gene_expression_quantification( files = files, cases = cases, summarizedExperiment = FALSE, @@ -288,22 +299,22 @@ GDCprepare <- function( # save is true, due to the check in the beggining of the code if(remove.files.prepared){ # removes files and empty directories - remove.files.recursively(files) + remove_files_recursively(files) } } return(data) } -remove.files.recursively <- function(files){ +remove_files_recursively <- function(files){ files2rm <- dirname(files) unlink(files2rm,recursive = TRUE) files2rm <- dirname(files2rm) # data category - if(length(list.files(files2rm)) == 0) remove.files.recursively(files2rm) + if(length(list.files(files2rm)) == 0) remove_files_recursively(files2rm) } -readClinical <- function(files, data.type, cases){ +read_clinical <- function(files, data.type, cases){ if(data.type == "Clinical data"){ suppressMessages({ ret <- plyr::alply(files,.margins = 1,readr::read_tsv, .progress = "text") @@ -366,14 +377,17 @@ readSingleCellAnalysis <- function( } #' @importFrom tidyr separate -readExonQuantification <- function (files, cases, summarizedExperiment = TRUE){ +readExonQuantification <- function ( + files, + cases, + summarizedExperiment = TRUE +){ pb <- txtProgressBar(min = 0, max = length(files), style = 3) assay.list <- NULL for (i in seq_along(files)) { data <- fread(files[i], header = TRUE, sep = "\t", stringsAsFactors = FALSE) - if(!missing(cases)) { assay.list <- gsub(" |\\(|\\)|\\/","_",colnames(data)[2:ncol(data)]) # We will use this because there might be more than one col for each samples @@ -385,12 +399,14 @@ readExonQuantification <- function (files, cases, summarizedExperiment = TRUE){ } else { df <- merge(df, data, by=colnames(data)[1], all = TRUE) } + setTxtProgressBar(pb, i) } setDF(df) rownames(df) <- df[,1] df <- df 
%>% separate(exon,into = c("seqnames","coordinates","strand"),sep = ":") %>% separate(coordinates,into = c("start","end"),sep = "-") + if(summarizedExperiment) { suppressWarnings({ assays <- lapply(assay.list, function (x) { @@ -398,8 +414,10 @@ readExonQuantification <- function (files, cases, summarizedExperiment = TRUE){ }) }) names(assays) <- assay.list - regex <- paste0("[:alnum:]{4}-[:alnum:]{2}-[:alnum:]{4}", - "-[:alnum:]{3}-[:alnum:]{3}-[:alnum:]{4}-[:alnum:]{2}") + regex <- paste0( + "[:alnum:]{4}-[:alnum:]{2}-[:alnum:]{4}", + "-[:alnum:]{3}-[:alnum:]{3}-[:alnum:]{4}-[:alnum:]{2}" + ) samples <- na.omit(unique(str_match(colnames(df),regex)[,1])) colData <- colDataPrepare(samples) assays <- lapply(assays, function(x){ @@ -509,7 +527,7 @@ readSimpleNucleotideVariationMaf <- function(files){ return(ret) } -readGeneExpressionQuantification <- function( +read_gene_expression_quantification <- function( files, cases, genome = "hg19", @@ -558,7 +576,7 @@ readGeneExpressionQuantification <- function( df <- bind_cols(ret[[1]][,1],df) if (summarizedExperiment) { - df <- makeSEfromGeneExpressionQuantification(df, assay.list, genome = genome) + df <- make_se_from_gene_exoression_quantification(df, assay.list, genome = genome) } else { rownames(df) <- df$gene_id df$gene_id <- NULL @@ -567,7 +585,7 @@ readGeneExpressionQuantification <- function( } -makeSEfromGeneExpressionQuantification <- function( +make_se_from_gene_exoression_quantification <- function( df, assay.list, genome = "hg19" @@ -1127,10 +1145,10 @@ colDataPrepare <- function(barcode){ # Check if this breaks the package if(any(grepl("C3N-|C3L-",barcode))) { ret <- data.frame( - sample = sapply(barcode, function(x) stringr::str_split(x,";") %>% unlist()) %>% - unlist %>% unique,stringsAsFactors = FALSE + sample = map(barcode,.f = function(x) stringr::str_split(x,";") %>% unlist) %>% unlist() ) } + if(is.null(ret)) { ret <- data.frame( sample = barcode %>% unique, @@ -1147,14 +1165,12 @@ colDataPrepare <- 
function(barcode){ step = 10, items = ret$sample ) - if(!is.null(patient.info)) { ret$sample_submitter_id <- ret$sample %>% as.character() ret <- left_join(ret %>% as.data.frame, patient.info %>% unique, by = "sample_submitter_id") } ret$bcr_patient_barcode <- ret$sample %>% as.character() ret$sample_submitter_id <- ret$sample %>% as.character() - if(!"project_id" %in% colnames(ret)) { if("disease_type" %in% colnames(ret)){ aux <- getGDCprojects()[,c(5,7)] @@ -1168,7 +1184,15 @@ colDataPrepare <- function(barcode){ } # na.omit should not be here, exceptional case - if(is.null(ret)) return(data.frame(row.names = barcode, barcode,stringsAsFactors = FALSE)) + if(is.null(ret)) { + return( + data.frame( + row.names = barcode, + barcode, + stringsAsFactors = FALSE + ) + ) + } # Add purity information from http://www.nature.com/articles/ncomms9971 # purity <- getPurityinfo() @@ -1193,16 +1217,18 @@ colDataPrepare <- function(barcode){ if(any(ret$project_id == "CPTAC-3",na.rm = T)) { + print(ret) + save(ret,file = "test.rda") # only merge mixed samples - mixed.samples <- grep(";",barcode,value = T) - if(length(mixed.samples) > 0){ - mixed.samples <- unique(unlist(str_split(mixed.samples,";"))) + mixed_samples <- grep(";",barcode,value = T) + if(length(mixed_samples) > 0){ + mixed_samples <- mixed_samples %>% str_split(";") %>% unlist %>% unique - ret.mixed.samples <- ret %>% dplyr::filter(sample_submitter_id %in% mixed.samples) %>% - dplyr::group_by(submitter_id,sample_type) %>% + ret_mixed_samples <- ret %>% dplyr::filter(sample_submitter_id %in% mixed_samples) %>% + dplyr::group_by(submitter_id) %>% dplyr::summarise_all(~trimws(paste(unique(.), collapse = ';'))) %>% as.data.frame() - ret <- rbind(ret.mixed.samples,ret) + ret <- rbind(ret_mixed_samples,ret) } idx <- match(barcode,ret$bcr_patient_barcode) @@ -1567,7 +1593,11 @@ readTranscriptomeProfiling <- function( return(df) } -readGeneLevelCopyNumber <- function(files, cases, summarizedExperiment = FALSE){ 
+read_gene_level_copy_number <- function( + files, + cases, + summarizedExperiment = FALSE +){ message("Reading Gene Level Copy Number files") gistic.df <- NULL gistic.list <- plyr::alply(files,1,.fun = function(file) { @@ -1594,13 +1624,13 @@ readGeneLevelCopyNumber <- function(files, cases, summarizedExperiment = FALSE){ ) if(summarizedExperiment) { - se <- makeSEfromGeneLevelCopyNumber(df, cases) + se <- make_se_from_gene_level_copy_number(df, cases) return(se) } return(df) } -makeSEfromGeneLevelCopyNumber <- function(df, cases){ +make_se_from_gene_level_copy_number <- function(df, cases){ message("Creating a SummarizedExperiment object") rowRanges <- GRanges( seqnames = df$chromosome, @@ -1660,7 +1690,7 @@ readGISTIC <- function(files, cases){ # Reads Copy Number Variation files to a data frame, basically it will rbind it #' @importFrom purrr map2_dfr -readCopyNumberVariation <- function(files, cases){ +read_copy_number_variation <- function(files, cases){ message("Reading copy number variation files") col_types <- ifelse(any(grepl('ascat2', files)),"ccnnnnn","ccnnnd") @@ -1700,10 +1730,12 @@ getFFPE <- function(patient){ options.pretty <- "pretty=true" options.expand <- "expand=samples" option.size <- paste0("size=",length(patient)) - options.filter <- paste0("filters=", - URLencode('{"op":"and","content":[{"op":"in","content":{"field":"cases.submitter_id","value":['), - paste0('"',paste(patient,collapse = '","')), - URLencode('"]}}]}')) + options.filter <- paste0( + "filters=", + URLencode('{"op":"and","content":[{"op":"in","content":{"field":"cases.submitter_id","value":['), + paste0('"',paste(patient,collapse = '","')), + URLencode('"]}}]}') + ) url <- paste0(baseURL,paste(options.pretty,options.expand, option.size, options.filter, sep = "&")) json <- tryCatch( getURL(url,fromJSON,timeout(600),simplifyDataFrame = TRUE), @@ -1727,10 +1759,12 @@ getAliquot_ids <- function(barcode){ option.size <- paste0("size=",length(barcode)) #message(paste(barcode,collapse 
= '","')) #message(paste0('"',paste(barcode,collapse = '","'))) - options.filter <- paste0("filters=", - URLencode('{"op":"and","content":[{"op":"in","content":{"field":"cases.submitter_id","value":['), - paste0('"',paste(barcode,collapse = '","')), - URLencode('"]}}]}')) + options.filter <- paste0( + "filters=", + URLencode('{"op":"and","content":[{"op":"in","content":{"field":"cases.submitter_id","value":['), + paste0('"',paste(barcode,collapse = '","')), + URLencode('"]}}]}') + ) #message(paste0(baseURL,paste(options.pretty,options.expand, option.size, options.filter, sep = "&"))) url <- paste0(baseURL,paste(options.pretty,options.fields, option.size, options.filter, sep = "&")) json <- tryCatch( @@ -1852,7 +1886,10 @@ getBarcodeInfo <- function(barcode) { } } if(!is.null(results$exposures)) { - exposures <- rbindlist(lapply(results$exposures, function(x) if(is.null(x)) data.frame(NA) else x),fill = TRUE) + exposures <- rbindlist( + lapply(results$exposures, function(x) if(is.null(x)) data.frame(NA) else x), + fill = TRUE + ) exposures[,c("updated_datetime","created_datetime","state")] <- NULL if(any(grepl("submitter_id", colnames(exposures)))) { exposures$submitter_id <- gsub("-exposure|_exposure.*|-EXP","", exposures$submitter_id) @@ -1872,7 +1909,11 @@ getBarcodeInfo <- function(barcode) { demographic <- results$demographic demographic[,c("updated_datetime","created_datetime","state")] <- NULL if(any(grepl("submitter_id", colnames(demographic)))) { - demographic$submitter_id <- gsub("-demographic|_demographic.*|-DEMO|demo-","", results$demographic$submitter_id) + demographic$submitter_id <- gsub( + "-demographic|_demographic.*|-DEMO|demo-", + "", + results$demographic$submitter_id + ) } else { demographic$submitter_id <- submitter_id } @@ -1893,7 +1934,9 @@ getBarcodeInfo <- function(barcode) { .fun = function(x){ demographic[x,] %>% # replicate diagnoses the number of samples as.data.frame() %>% - dplyr::slice(rep(dplyr::row_number(), 
sum(results$submitter_sample_ids[[x]] %in% barcode)))}) + dplyr::slice( + rep(dplyr::row_number(), sum(results$submitter_sample_ids[[x]] %in% barcode))) + }) } df <- dplyr::bind_cols(df %>% as.data.frame,demographic) @@ -1910,9 +1953,11 @@ getBarcodeInfo <- function(barcode) { projects.info <- cbind("submitter_id" = submitter_id, projects.info) suppressWarnings({ - df <- left_join(df, - projects.info, - by = "submitter_id") + df <- left_join( + df, + projects.info, + by = "submitter_id" + ) }) } else { @@ -1922,7 +1967,9 @@ getBarcodeInfo <- function(barcode) { .fun = function(x){ projects.info[x,] %>% # replicate diagnoses the number of samples as.data.frame() %>% - dplyr::slice(rep(dplyr::row_number(), sum(results$submitter_sample_ids[[x]] %in% barcode)))}) + dplyr::slice( + rep(dplyr::row_number(), sum(results$submitter_sample_ids[[x]] %in% barcode))) + }) } df <- dplyr::bind_cols(df,projects.info) diff --git a/R/query.R b/R/query.R index 5a86d06f1..020eb13ae 100644 --- a/R/query.R +++ b/R/query.R @@ -153,7 +153,6 @@ GDCquery <- function( workflow.type, access, platform, - file.type, barcode, data.format, experimental.strategy, @@ -168,31 +167,31 @@ GDCquery <- function( } else if(all(sample.type == FALSE)) { sample.type <- NA } + if(missing(data.type)) { data.type <- NA } else if(data.type == FALSE) { data.type <- NA } + if(missing(barcode)) { barcode <- NA } else if(length(barcode) == 1) { if(barcode == FALSE) barcode <- NA } + if(missing(platform)) { platform <- NA } else if(any(platform == FALSE)) { platform <- NA } - if(missing(file.type)) { - file.type <- NA - } else if(file.type == FALSE) { - file.type <- NA - } + if(missing(workflow.type)) { workflow.type <- NA } else if(workflow.type == FALSE) { workflow.type <- NA } + if(missing(experimental.strategy)) { experimental.strategy <- NA } else if(experimental.strategy == FALSE) { @@ -203,6 +202,7 @@ GDCquery <- function( } else if(access == FALSE) { access <- NA } + if(missing(data.format)) { data.format <- 
NA } else if(data.format == FALSE) { @@ -226,7 +226,6 @@ GDCquery <- function( data.type = data.type, workflow.type = workflow.type, platform = platform, - file.type = file.type, files.access = access, experimental.strategy = experimental.strategy, sample.type = sample.type @@ -247,7 +246,6 @@ GDCquery <- function( data.type = data.type, workflow.type = NA, platform = NA, - file.type = file.type, experimental.strategy = experimental.strategy, files.access = access, sample.type = sample.type @@ -332,8 +330,12 @@ GDCquery <- function( message("ooo By experimental.strategy") results <- results[tolower(results$experimental_strategy) %in% tolower(experimental.strategy),] } else { - message(paste0("The argument experimental_strategy does not match any of the results.\nPossible values:", - paste(unique(results$experimental_strategy),collapse = "\n=>"))) + message( + paste0( + "The argument experimental_strategy does not match any of the results.\nPossible values:", + paste(unique(results$experimental_strategy),collapse = "\n=>") + ) + ) } } @@ -342,8 +344,12 @@ GDCquery <- function( message("ooo By data.format") results <- results[tolower(results$data_format) %in% tolower(data.format),] } else { - message(paste0("The argument experimental_strategy does not match any of the results.\nPossible values:", - paste(unique(results$data_format),collapse = "\n=>"))) + message( + paste0( + "The argument experimental_strategy does not match any of the results.\nPossible values:", + paste(unique(results$data_format),collapse = "\n=>") + ) + ) } } @@ -367,57 +373,12 @@ GDCquery <- function( results <- results[results$analysis_workflow_type %in% workflow.type,] } - - # Filter by file.type - if(!is.na(file.type)){ - message("ooo By file.type") - pat <- file.type - invert <- FALSE - - # RNA-seq - if(file.type == "normalized_results") pat <- "normalized_results" - if(file.type == "results") pat <- "[^normalized_]results" - - - if(file.type == "nocnv_hg18" | file.type == "nocnv_hg18.seg") 
pat <- "nocnv_hg18" - if(file.type == "cnv_hg18" | file.type == "hg18.seg") pat <- "[^nocnv_]hg18.seg" - if(file.type == "nocnv_hg19" | file.type == "nocnv_hg19.seg") pat <- "nocnv_hg19" - if(file.type == "cnv_hg19" | file.type == "hg19.seg") pat <- "[^nocnv_]hg19.seg" - - # miRNA-seq - # examples: - # TCGA-E9-A1R5-01A-11R-A14L-13.mirna.quantification.txt - if(file.type == "mirna") { - pat <- "hg19.*mirna" - invert <- TRUE - } - # TCGA-F5-6464-01A-11H-1735-13.hg19.mirna.quantification.txt - if(file.type == "hg19.mirna") pat <- "hg19.mirna" - - # TCGA-AC-A4ZE-01A-11R-A41G-13.hg19.mirbase20.mirna.quantification.txt - if(file.type == "hg19.mirbase20.mirna") pat <- "hg19.mirbase20.mirna" - - # TCGA-CJ-4878-01A-01R-1304-13.isoform.quantification.txt - if(file.type == "hg19.isoform") pat <- "hg19.*isoform" - if(file.type == "isoform") { - pat <- "hg19.*isoform" - invert <- TRUE - } - idx <- grep(pat,results$file_name,invert = invert) - if(length(idx) == 0) { - print(knitr::kable(sort(results$file_name)[1:10],col.names = "Files")) - stop("We were not able to filter using this file type. Examples of available files are above. Please check the vignette for possible entries") - } - results <- results[idx,] - } - # get barcode of the samples # 1) Normally for each sample we will have only single information # however the mutation call uses both normal and tumor which are both # reported by the API if(!data.category %in% c( "Clinical", - "Copy Number Variation", "Biospecimen", "Other", "Simple Nucleotide Variation", @@ -495,11 +456,12 @@ GDCquery <- function( # Auxiliary test files does not have information linked toit. 
# get frm file names results$cases <- str_extract_all(results$file_name,"TCGA-[:alnum:]{2}-[:alnum:]{4}") %>% unlist - } else if(data.category %in% c( - "Copy Number Variation", - "Simple nucleotide variation", - "Simple Nucleotide Variation") - ){ + } else if( + data.category %in% c( + "Simple nucleotide variation", + "Simple Nucleotide Variation" + ) + ) { cases <- plyr::laply( .data = results$cases, .fun = function(x) { @@ -621,7 +583,6 @@ GDCquery <- function( data.type = data.type, access = I(list(access)), experimental.strategy = I(list(experimental.strategy)), - file.type = file.type, platform = I(list(platform)), sample.type = I(list(sample.type)), barcode = I(list(barcode)), @@ -636,7 +597,6 @@ getGDCquery <- function( data.type, workflow.type, platform, - file.type, files.access, sample.type, experimental.strategy diff --git a/man/GDCprepare_clinic.Rd b/man/GDCprepare_clinic.Rd index d0f51f05c..c24daa1ca 100644 --- a/man/GDCprepare_clinic.Rd +++ b/man/GDCprepare_clinic.Rd @@ -26,7 +26,7 @@ based on the desired information query <- GDCquery( project = "TCGA-COAD", data.category = "Clinical", - file.type = "xml", + data.format = "bcr xml", barcode = c("TCGA-RU-A8FL","TCGA-AA-3972") ) GDCdownload(query) @@ -38,7 +38,7 @@ clinical.admin <- GDCprepare_clinic(query,"admin") query <- GDCquery( project = "TCGA-COAD", data.category = "Biospecimen", - file.type = "xml", + data.format = "bcr xml", data.type = "Biospecimen Supplement", barcode = c("TCGA-RU-A8FL","TCGA-AA-3972") ) diff --git a/man/GDCquery.Rd b/man/GDCquery.Rd index 10153d843..4cb973423 100644 --- a/man/GDCquery.Rd +++ b/man/GDCquery.Rd @@ -11,7 +11,6 @@ GDCquery( workflow.type, access, platform, - file.type, barcode, data.format, experimental.strategy, diff --git a/tests/testthat/test-prepare-download.R b/tests/testthat/test-prepare-download.R index 6b5ee2f9c..4ebaa0bac 100644 --- a/tests/testthat/test-prepare-download.R +++ b/tests/testthat/test-prepare-download.R @@ -124,20 +124,21 @@ 
test_that("Non TCGA data is processed", { skip_on_bioc() skip_if_offline() - proj <- "MMRF-COMMPASS" query <- GDCquery( - project = proj, - data.category = "Transcriptome Profiling", - data.type = "Gene Expression Quantification", - workflow.type = "STAR - Counts" - ) - query <- GDCquery( - project = proj, + project = "MMRF-COMMPASS", data.category = "Transcriptome Profiling", data.type = "Gene Expression Quantification", workflow.type = "STAR - Counts", - barcode = getResults(query)$cases[1:4] + barcode = c( + "MMRF_2737_1_BM_CD138pos_T2_TSMRU_L14993", + "MMRF_2739_1_BM_CD138pos_T2_TSMRU_L15000", + "MMRF_1865_1_BM_CD138pos_T2_TSMRU_L05342" + ) ) + GDCdownload(query,directory = "ex") + data <- GDCprepare(query,directory = "ex") + expect_true(ncol(data) == 3) + unlink("ex", recursive = TRUE, force = TRUE) }) test_that("Gene Level Copy Number is being correctly prepare", { @@ -155,6 +156,26 @@ test_that("Gene Level Copy Number is being correctly prepare", { data <- GDCprepare(query,directory = "ex") expect_true(all(substr(colnames(data),1,12) == c("TCGA-OR-A5JD","TCGA-OR-A5J7"))) + expect_true(data$days_to_last_follow_up == c(3038,NA)) + unlink("ex", recursive = TRUE, force = TRUE) +}) + +test_that("Gene Level Copy Number is being correctly prepare for CPTAC-3", { + skip_on_bioc() + skip_if_offline() + + query_CPTAC = GDCquery( + project = "CPTAC-3", + data.category = "Copy Number Variation", + data.type = "Gene Level Copy Number", + barcode = c("CPT0115240002","CPT0088960002") + ) + + GDCdownload(query_CPTAC,directory = "ex") + data <- GDCprepare(query_CPTAC,directory = "ex") + expect_true(ncol(data) == 2) + expect_true(data$submitter_id == c("C3L-02544","C3N-01179")) + expect_true(data$days_to_last_follow_up == c("889","1816")) unlink("ex", recursive = TRUE, force = TRUE) }) diff --git a/tests/testthat/test-query.R b/tests/testthat/test-query.R index f51c2297a..79fea5a56 100644 --- a/tests/testthat/test-query.R +++ b/tests/testthat/test-query.R @@ -96,21 +96,29 @@ 
test_that("GDCquery can filter by barcode", { ) expect_true(all(sort(barcode) == sort(unique(query$results[[1]]$cases)))) barcode <- c( "TCGA-OR-A5KU-01A-11D-A29H-01", "TCGA-OR-A5JK-01A-11D-A29H-01") - query <- GDCquery(project = "TCGA-ACC", - data.category = "Copy Number Variation", - data.type = "Copy Number Segment", - barcode = barcode) + query <- GDCquery( + project = "TCGA-ACC", + data.category = "Copy Number Variation", + data.type = "Copy Number Segment", + barcode = barcode + ) expect_true(all(sort(barcode) == sort(unique(query$results[[1]]$cases)))) barcode <- c("TCGA-OR-A5KU", "TCGA-OR-A5JK") - query <- GDCquery(project = "TCGA-ACC", - data.category = "Clinical", - file.type = "xml", - barcode = barcode) + query <- GDCquery( + project = "TCGA-ACC", + data.category = "Clinical", + data.format = "bcr xml", + barcode = barcode + ) expect_true(all(sort(barcode) == sort(unique(query$results[[1]]$cases)))) # Will work if barcode was not found - query <- GDCquery(project = "TCGA-BRCA", data.category = "Clinical",file.type = "xml", - barcode = c("TCGA-3C-AALK","TCGA-A2-A04Q","TCGA-A4-A04Q")) + query <- GDCquery( + project = "TCGA-BRCA", + data.category = "Clinical", + data.format = "bcr xml", + barcode = c("TCGA-3C-AALK","TCGA-A2-A04Q","TCGA-A4-A04Q") + ) expect_true(!all(c("TCGA-3C-AALK","TCGA-A2-A04Q","TCGA-A4-A04Q") %in% query$results[[1]]$cases)) }) @@ -119,13 +127,19 @@ test_that("GDCquery can filter by access level", { skip_on_bioc() skip_if_offline() - query <- GDCquery(project = "TCGA-KIRP", - data.category = "Simple Nucleotide Variation", - access = "open") + query <- GDCquery( + project = "TCGA-KIRP", + data.category = "Simple Nucleotide Variation", + access = "open" + ) expect_equal(unique(query$results[[1]]$access),"open") - query <- GDCquery(project = "TCGA-KIRP", - data.category = "Simple Nucleotide Variation", - access = "controlled") + + query <- GDCquery( + project = "TCGA-KIRP", + data.category = "Simple Nucleotide Variation", + data.type = 
"Raw Simple Somatic Mutation", + access = "controlled" + ) expect_equal(unique(query$results[[1]]$access),"controlled") }) @@ -133,7 +147,7 @@ test_that("getNbFiles and getNbCases works", { skip_on_bioc() skip_if_offline() - aux <- getProjectSummary("TCGA-LUAD",TRUE) + aux <- getProjectSummary(project = "TCGA-LUAD") files <- getNbFiles("TCGA-LUAD","Raw microarray data") cases <- getNbCases("TCGA-LUAD","Raw microarray data") expect_true(cases < files) diff --git a/vignettes/clinical.Rmd b/vignettes/clinical.Rmd index 33edd3033..6ab770fd1 100644 --- a/vignettes/clinical.Rmd +++ b/vignettes/clinical.Rmd @@ -74,17 +74,6 @@ query <- GDCquery( GDCdownload(query) clinical.BCRtab.all <- GDCprepare(query) names(clinical.BCRtab.all) - -query <- GDCquery( - project = "TCGA-ACC", - data.category = "Clinical", - data.type = "Clinical Supplement", - data.format = "BCR Biotab", - file.type = "radiation" -) -GDCdownload(query) -clinical.BCRtab.radiation <- GDCprepare(query) - ``` ```{r echo=TRUE, message=FALSE, warning=FALSE} @@ -219,7 +208,7 @@ Below are several examples fetching clinical data directly from the clinical XML query <- GDCquery( project = "TCGA-COAD", data.category = "Clinical", - file.type = "xml", + data.format = "bcr xml", barcode = c("TCGA-RU-A8FL","TCGA-AA-3972") ) GDCdownload(query) @@ -376,7 +365,7 @@ getclinical <- function(proj){ message(proj) while(1){ result = tryCatch({ - query <- GDCquery(project = proj, data.category = "Clinical",file.type = "xml") + query <- GDCquery(project = proj, data.category = "Clinical",data.format = "bcr xml") GDCdownload(query) clinical <- GDCprepare_clinic(query, clinical.info = "patient") for(i in c("admin","radiation","follow_up","drug","new_tumor_event")){ From b6b7e5aac6f27c66b0cc5e3fda9f4569a240d86e Mon Sep 17 00:00:00 2001 From: Tiago Silva Date: Thu, 18 May 2023 11:33:11 -0400 Subject: [PATCH 8/9] Removing debug code, increasing timeout for query by default. 
--- R/prepare.R | 2 -- R/query.R | 39 ++++++++++++++++++++++++++++--------- tests/testthat/test-query.R | 17 ++++++++++++++++ vignettes/query.Rmd | 1 - 4 files changed, 47 insertions(+), 12 deletions(-) diff --git a/R/prepare.R b/R/prepare.R index 92b419ee9..9d3d4d2f1 100644 --- a/R/prepare.R +++ b/R/prepare.R @@ -1217,8 +1217,6 @@ colDataPrepare <- function(barcode){ if(any(ret$project_id == "CPTAC-3",na.rm = T)) { - print(ret) - save(ret,file = "test.rda") # only merge mixed samples mixed_samples <- grep(";",barcode,value = T) if(length(mixed_samples) > 0){ diff --git a/R/query.R b/R/query.R index 020eb13ae..b3152fd6d 100644 --- a/R/query.R +++ b/R/query.R @@ -231,12 +231,20 @@ GDCquery <- function( sample.type = sample.type ) message("ooo Project: ", proj) + original_timeout <- getOption('timeout') + options(timeout=600) json <- tryCatch( getURL(url,fromJSON,timeout(600),simplifyDataFrame = TRUE), error = function(e) { message(paste("Error: ", e, sep = " ")) message("We will retry to access GDC!") - fromJSON(content(getURL(url,GET,timeout(600)), as = "text", encoding = "UTF-8"), simplifyDataFrame = TRUE) + fromJSON( + content( + getURL(url,GET,timeout(600)), + as = "text", + encoding = "UTF-8" + ), simplifyDataFrame = TRUE + ) } ) if(json$data$pagination$count == 0) { @@ -259,6 +267,7 @@ GDCquery <- function( } ) } + options(timeout=original_timeout) json$data$hits$acl <- NULL @@ -551,12 +560,12 @@ GDCquery <- function( print.header("Checking data","subsection") message("ooo Checking if there are duplicated cases") - if(any(duplicated(results$cases))) { + if (any(duplicated(results$cases))) { message("Warning: There are more than one file for the same case. Please verify query results. 
You can use the command View(getResults(query)) in rstudio") } message("ooo Checking if there are results for the query") - if(nrow(results) == 0) stop("Sorry, no results were found for this query") + if (nrow(results) == 0) stop("Sorry, no results were found for this query") # Try ordering (needs dplyr 1.0 - still not published) results <- tryCatch({ @@ -674,11 +683,19 @@ expandBarcodeInfo <- function(barcode){ ret <- ret[match(barcode,ret$barcode),] } if(any(grepl("TCGA",barcode))) { - ret <- data.frame(barcode = barcode, - patient = substr(barcode, 1, 12), - sample = substr(barcode, 1, 16), - tissue.code = substr(barcode, 14, 15)) - ret <- merge(ret,getBarcodeDefinition(), by = "tissue.code", sort = FALSE, all.x = TRUE) + ret <- data.frame( + barcode = barcode, + patient = substr(barcode, 1, 12), + sample = substr(barcode, 1, 16), + tissue.code = substr(barcode, 14, 15) + ) + ret <- merge( + ret, + getBarcodeDefinition(), + by = "tissue.code", + sort = FALSE, + all.x = TRUE + ) ret <- ret[match(barcode,ret$barcode),] } return(ret) @@ -717,7 +734,11 @@ getBarcodeDefinition <- function(type = "TCGA"){ "Cell Lines", "Primary Xenograft Tissue", "Cell Line Derived Xenograft Tissue") - aux <- data.frame(tissue.code = tissue.code,shortLetterCode,tissue.definition) + aux <- data.frame( + tissue.code = tissue.code, + shortLetterCode, + tissue.definition + ) } else { tissue.code <- c( diff --git a/tests/testthat/test-query.R b/tests/testthat/test-query.R index 79fea5a56..d7f795913 100644 --- a/tests/testthat/test-query.R +++ b/tests/testthat/test-query.R @@ -152,3 +152,20 @@ test_that("getNbFiles and getNbCases works", { cases <- getNbCases("TCGA-LUAD","Raw microarray data") expect_true(cases < files) }) + +test_that("getNbFiles and getNbCases works", { + skip_on_bioc() + skip_if_offline() + + # This test was added for further study of the TARGET-AML data + # There are multiple files for the same patient and the query + # gives a warning although the cases are different + 
# should we change to verification and warning output ? + query_target <- GDCquery( + project = "TARGET-AML", + data.category = "Transcriptome Profiling", + data.type = "Gene Expression Quantification", + workflow.type = "STAR - Counts", + barcode = c("TARGET-20-PANLXK","TARGET-20-PATIAK") + ) +}) diff --git a/vignettes/query.Rmd b/vignettes/query.Rmd index 3dc0400a4..fadb1bb88 100644 --- a/vignettes/query.Rmd +++ b/vignettes/query.Rmd @@ -31,7 +31,6 @@ library(DT) # Useful information -
Understanding the barcode
From 3b51a98761ab8b596d20f1e019f2758389962060 Mon Sep 17 00:00:00 2001 From: Tiago Silva Date: Mon, 22 May 2023 14:17:55 -0400 Subject: [PATCH 9/9] Fixing tests --- R/prepare.R | 2 +- tests/testthat/test-prepare-download.R | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/R/prepare.R b/R/prepare.R index 9d3d4d2f1..6ac1f27de 100644 --- a/R/prepare.R +++ b/R/prepare.R @@ -139,7 +139,7 @@ GDCprepare <- function( } cases <- ifelse( - grepl("TCGA|TARGET|CGCI-HTMCP-CC|CPTAC-3",query$results[[1]]$project %>% unlist()), + grepl("TCGA|TARGET|CGCI-HTMCP-CC",query$results[[1]]$project %>% unlist()), query$results[[1]]$cases, query$results[[1]]$sample.submitter_id ) diff --git a/tests/testthat/test-prepare-download.R b/tests/testthat/test-prepare-download.R index 4ebaa0bac..b9db0ced1 100644 --- a/tests/testthat/test-prepare-download.R +++ b/tests/testthat/test-prepare-download.R @@ -156,7 +156,7 @@ test_that("Gene Level Copy Number is being correctly prepare", { data <- GDCprepare(query,directory = "ex") expect_true(all(substr(colnames(data),1,12) == c("TCGA-OR-A5JD","TCGA-OR-A5J7"))) - expect_true(data$days_to_last_follow_up == c(3038,NA)) + expect_equal(data$days_to_last_follow_up,c(3038,NA)) unlink("ex", recursive = TRUE, force = TRUE) }) @@ -174,8 +174,8 @@ test_that("Gene Level Copy Number is being correctly prepare for CPTAC-3", { GDCdownload(query_CPTAC,directory = "ex") data <- GDCprepare(query_CPTAC,directory = "ex") expect_true(ncol(data) == 2) - expect_true(data$submitter_id == c("C3L-02544","C3N-01179")) - expect_true(data$days_to_last_follow_up == c("889","1816")) + expect_equal(data$submitter_id, c("C3L-02544","C3N-01179")) + expect_equal(data$days_to_last_follow_up, c("889","1816")) unlink("ex", recursive = TRUE, force = TRUE) })