From b2bc706f2275867fbedb62707f0fe1c71a7bb26a Mon Sep 17 00:00:00 2001 From: Tiago Silva Date: Thu, 18 May 2023 10:53:48 -0400 Subject: [PATCH] Making some changes to the latest version. Removing legacy/harmonized from folder structure Fixing #573 Fixing #574 Needs more checking with CNV and CPTAC-3 projects --- NAMESPACE | 1 + NEWS | 1 + R/clinical.R | 36 +++-- R/download.R | 4 +- R/prepare.R | 191 +++++++++++++++---------- R/query.R | 88 ++++-------- man/GDCprepare_clinic.Rd | 4 +- man/GDCquery.Rd | 1 - tests/testthat/test-prepare-download.R | 39 +++-- tests/testthat/test-query.R | 48 ++++--- vignettes/clinical.Rmd | 15 +- 11 files changed, 232 insertions(+), 196 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index d39dce727..9437c6330 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -109,6 +109,7 @@ importFrom(dplyr,mutate_all) importFrom(dplyr,pull) importFrom(dplyr,row_number) importFrom(dplyr,slice) +importFrom(dplyr,summarise) importFrom(grDevices,dev.list) importFrom(grDevices,dev.off) importFrom(grDevices,pdf) diff --git a/NEWS b/NEWS index 2f4f54c80..e57e3bca2 100644 --- a/NEWS +++ b/NEWS @@ -2,6 +2,7 @@ CHANGES IN VERSION 2.29.1 ------------------------- * Removing support to legacy archive since it will be shutdown by GDC soon. +* When saving files we will not include folders prefix legacy/harmonized anymore CHANGES IN VERSION 2.21.1 ------------------------- diff --git a/R/clinical.R b/R/clinical.R index 1419a5a13..79fe7506b 100644 --- a/R/clinical.R +++ b/R/clinical.R @@ -170,6 +170,7 @@ TCGAquery_MatchedCoupledSampleTypes <- function(barcode,typesample){ #' @export #' @importFrom data.table rbindlist as.data.table #' @importFrom jsonlite fromJSON +#' @importFrom dplyr summarise #' @examples #' clinical <- GDCquery_clinic( #' project = "TCGA-ACC", @@ -348,7 +349,10 @@ GDCquery_clinic <- function( # we will collapse them into one single row # concatanating all columns using ; aux <- x %>% dplyr::group_by(submitter_id) %>% - summarise(across(everything(),~ paste(unique(.), collapse = ";"))) + dplyr::summarise( + across(everything(),~ paste(unique(.), collapse = ";")) + ) + aux$treatments <- list(dplyr::bind_rows(x$treatments)) aux } @@ -440,7 +444,7 @@ GDCquery_clinic <- function( #' query <- GDCquery( #' project = "TCGA-COAD", #' data.category = "Clinical", -#' file.type = "xml", +#' data.format = "bcr xml", #' barcode = c("TCGA-RU-A8FL","TCGA-AA-3972") #' ) #' GDCdownload(query) @@ -452,7 +456,7 @@ GDCquery_clinic <- function( #' query <- GDCquery( #' project = "TCGA-COAD", #' data.category = "Biospecimen", -#' file.type = "xml", +#' data.format = "bcr xml", #' data.type = "Biospecimen Supplement", #' barcode = c("TCGA-RU-A8FL","TCGA-AA-3972") #' ) @@ -503,9 +507,9 @@ GDCprepare_clinic <- function( } # Get all the clincal xml files - source <- "harmonized" + files <- file.path( - query$results[[1]]$project, source, + query$results[[1]]$project, gsub(" ","_",query$results[[1]]$data_category), gsub(" ","_",query$results[[1]]$data_type), gsub(" ","_",query$results[[1]]$file_id), @@ -586,16 +590,18 @@ GDCprepare_clinic <- function( message("Updating days_to_last_followup and vital_status from follow_up information using last entry") followup <- parseFollowup(files,xpath,clinical.info) - followup_last <- followup %>% dplyr::group_by(bcr_patient_barcode) %>% dplyr::summarise( - days_to_last_followup = max(as.numeric(days_to_last_followup),na.rm = TRUE), - vital_status = vital_status[ - ifelse( - any(followup$days_to_last_followup %in% ""), - which(followup$days_to_last_followup %in% ""), - which.max(days_to_last_followup) - ) - ] - ) + followup_last <- followup %>% + dplyr::group_by(bcr_patient_barcode) %>% + dplyr::summarise( + days_to_last_followup = max(as.numeric(days_to_last_followup),na.rm = TRUE), + vital_status = vital_status[ + ifelse( + any(followup$days_to_last_followup %in% ""), + which(followup$days_to_last_followup %in% ""), + which.max(days_to_last_followup) + ) + ] + ) clin$days_to_last_followup <- followup_last$days_to_last_followup[match(clin$bcr_patient_barcode,followup_last$bcr_patient_barcode)] clin$vital_status <- followup_last$vital_status[match(clin$bcr_patient_barcode,followup_last$bcr_patient_barcode)] } diff --git a/R/download.R b/R/download.R index be812ff9f..16beabc96 100644 --- a/R/download.R +++ b/R/download.R @@ -64,8 +64,6 @@ GDCdownload <- function( stop("We can only download one data type. Please use data.type argument in GDCquery to filter results.") } - source <- "harmonized" - dir.create(directory, showWarnings = FALSE, recursive = TRUE) for(proj in unique(unlist(query$project))){ message("Downloading data for project ", proj) @@ -77,7 +75,7 @@ GDCdownload <- function( path <- unique( file.path( - proj, source, + proj, gsub(" ","_", results$data_category), gsub(" ","_",results$data_type)) ) diff --git a/R/prepare.R b/R/prepare.R index fbfc3f172..92b419ee9 100644 --- a/R/prepare.R +++ b/R/prepare.R @@ -60,21 +60,20 @@ GDCprepare <- function( isServeOK() if(missing(query)) stop("Please set query parameter") - test.duplicated.cases <- - ( - any( - duplicated(query$results[[1]]$cases)) & # any duplicated - !(query$data.type %in% c( - "Clinical data", - "Protein expression quantification", - "Raw intensities", - "Masked Intensities", - "Clinical Supplement", - "Masked Somatic Mutation", - "Biospecimen Supplement" - ) - ) - ) + test.duplicated.cases <- ( + any( + duplicated(query$results[[1]]$cases)) & # any duplicated + !(query$data.type %in% c( + "Clinical data", + "Protein expression quantification", + "Raw intensities", + "Masked Intensities", + "Clinical Supplement", + "Masked Somatic Mutation", + "Biospecimen Supplement" + ) + ) + ) if(test.duplicated.cases) { @@ -90,10 +89,10 @@ GDCprepare <- function( if (!save & remove.files.prepared) { stop("To remove the files, please set save to TRUE. Otherwise, the data will be lost") } - # We save the files in project/source/data.category/data.type/file_id/file_name - source <- "harmonized" + + # We save the files in project/data.category/data.type/file_id/file_name files <- file.path( - query$results[[1]]$project, source, + query$results[[1]]$project, gsub(" ","_",query$results[[1]]$data_category), gsub(" ","_",query$results[[1]]$data_type), gsub(" ","_",query$results[[1]]$file_id), @@ -105,14 +104,19 @@ GDCprepare <- function( # For IDAT prepare since we need to put all IDATs in the same folder the code below will not work # a second run if (!all(file.exists(files))) { - # We have to check we movedthe files - if (query$data.type == "Masked Intensities" | query$data.category == "Raw microarray data"){ + # We have to check we moved the files + if ( + unique(query$results[[1]]$data_type) == "Masked Intensities" | + unique(query$results[[1]]$data_category) == "Raw microarray data" + ){ + files.idat <- file.path( - query$results[[1]]$project, source, + query$results[[1]]$project, gsub(" ","_",query$results[[1]]$data_category), gsub(" ","_",query$results[[1]]$data_type), gsub(" ","_",query$results[[1]]$file_name) ) + files.idat <- file.path(directory, files.idat) if (!all(file.exists(files) | file.exists(files.idat))) { stop( @@ -135,7 +139,7 @@ GDCprepare <- function( } cases <- ifelse( - grepl("TCGA|TARGET|CGCI-HTMCP-CC",query$results[[1]]$project %>% unlist()), + grepl("TCGA|TARGET|CGCI-HTMCP-CC|CPTAC-3",query$results[[1]]$project %>% unlist()), query$results[[1]]$cases, query$results[[1]]$sample.submitter_id ) @@ -165,9 +169,15 @@ GDCprepare <- function( if (unique(query$results[[1]]$data_type) == "Gene Level Copy Number Scores") { data <- readGISTIC(files, query$results[[1]]$cases) } else if (unique(query$results[[1]]$data_type) == "Gene Level Copy Number") { - data <- readGeneLevelCopyNumber(files, query$results[[1]]$cases,summarizedExperiment = summarizedExperiment) + data <- read_gene_level_copy_number( + files = files, + cases = query$results[[1]]$sample.submitter_id, + summarizedExperiment = summarizedExperiment + ) } else { - data <- readCopyNumberVariation(files, query$results[[1]]$cases) + data <- read_copy_number_variation( + files = files, cases = query$results[[1]]$cases + ) } } else if (grepl("Methylation Beta Value",unique(query$results[[1]]$data_type), ignore.case = TRUE)) { data <- readDNAmethylation( @@ -184,29 +194,30 @@ GDCprepare <- function( summarizedExperiment = summarizedExperiment, platform = unique(query$results[[1]]$platform) ) - } else if (grepl("Proteome Profiling",query$data.category,ignore.case = TRUE)) { + } else if (grepl("Proteome Profiling",query$data.category,ignore.case = TRUE)) { data <- readProteomeProfiling(files, cases = cases) - } else if (grepl("Protein expression",query$data.category,ignore.case = TRUE)) { + } else if (grepl("Protein expression",query$data.category,ignore.case = TRUE)) { data <- readProteinExpression(files, cases = cases) - if(summarizedExperiment) { + + if (summarizedExperiment) { message("SummarizedExperiment not implemented, if you need samples metadata use the function TCGAbiolinks:::colDataPrepare") } } else if (grepl("Simple Nucleotide Variation",query$data.category,ignore.case = TRUE)) { - if(grepl("Masked Somatic Mutation",query$results[[1]]$data_type[1],ignore.case = TRUE)){ + if (grepl("Masked Somatic Mutation",query$results[[1]]$data_type[1],ignore.case = TRUE)){ data <- readSimpleNucleotideVariationMaf(files) } } else if (grepl("Clinical|Biospecimen", query$data.category, ignore.case = TRUE)){ - data <- readClinical(files, query$data.type, cases = cases) + data <- read_clinical(files, query$data.type, cases = cases) summarizedExperiment <- FALSE } else if (grepl("Gene expression",query$data.category,ignore.case = TRUE)) { if (query$data.type == "Gene expression quantification") - data <- readGeneExpressionQuantification( + data <- read_gene_expression_quantification( files = files, cases = cases, summarizedExperiment = summarizedExperiment, @@ -215,7 +226,7 @@ GDCprepare <- function( ) if (query$data.type == "miRNA gene quantification") - data <- readGeneExpressionQuantification( + data <- read_gene_expression_quantification( files = files, cases = cases, summarizedExperiment = FALSE, @@ -288,22 +299,22 @@ GDCprepare <- function( # save is true, due to the check in the beggining of the code if(remove.files.prepared){ # removes files and empty directories - remove.files.recursively(files) + remove_files_recursively(files) } } return(data) } -remove.files.recursively <- function(files){ +remove_files_recursively <- function(files){ files2rm <- dirname(files) unlink(files2rm,recursive = TRUE) files2rm <- dirname(files2rm) # data category - if(length(list.files(files2rm)) == 0) remove.files.recursively(files2rm) + if(length(list.files(files2rm)) == 0) remove_files_recursively(files2rm) } -readClinical <- function(files, data.type, cases){ +read_clinical <- function(files, data.type, cases){ if(data.type == "Clinical data"){ suppressMessages({ ret <- plyr::alply(files,.margins = 1,readr::read_tsv, .progress = "text") @@ -366,14 +377,17 @@ readSingleCellAnalysis <- function( } #' @importFrom tidyr separate -readExonQuantification <- function (files, cases, summarizedExperiment = TRUE){ +readExonQuantification <- function ( + files, + cases, + summarizedExperiment = TRUE +){ pb <- txtProgressBar(min = 0, max = length(files), style = 3) assay.list <- NULL for (i in seq_along(files)) { data <- fread(files[i], header = TRUE, sep = "\t", stringsAsFactors = FALSE) - if(!missing(cases)) { assay.list <- gsub(" |\\(|\\)|\\/","_",colnames(data)[2:ncol(data)]) # We will use this because there might be more than one col for each samples @@ -385,12 +399,14 @@ readExonQuantification <- function (files, cases, summarizedExperiment = TRUE){ } else { df <- merge(df, data, by=colnames(data)[1], all = TRUE) } + setTxtProgressBar(pb, i) } setDF(df) rownames(df) <- df[,1] df <- df %>% separate(exon,into = c("seqnames","coordinates","strand"),sep = ":") %>% separate(coordinates,into = c("start","end"),sep = "-") + if(summarizedExperiment) { suppressWarnings({ assays <- lapply(assay.list, function (x) { @@ -398,8 +414,10 @@ readExonQuantification <- function (files, cases, summarizedExperiment = TRUE){ }) }) names(assays) <- assay.list - regex <- paste0("[:alnum:]{4}-[:alnum:]{2}-[:alnum:]{4}", - "-[:alnum:]{3}-[:alnum:]{3}-[:alnum:]{4}-[:alnum:]{2}") + regex <- paste0( + "[:alnum:]{4}-[:alnum:]{2}-[:alnum:]{4}", + "-[:alnum:]{3}-[:alnum:]{3}-[:alnum:]{4}-[:alnum:]{2}" + ) samples <- na.omit(unique(str_match(colnames(df),regex)[,1])) colData <- colDataPrepare(samples) assays <- lapply(assays, function(x){ @@ -509,7 +527,7 @@ readSimpleNucleotideVariationMaf <- function(files){ return(ret) } -readGeneExpressionQuantification <- function( +read_gene_expression_quantification <- function( files, cases, genome = "hg19", @@ -558,7 +576,7 @@ readGeneExpressionQuantification <- function( df <- bind_cols(ret[[1]][,1],df) if (summarizedExperiment) { - df <- makeSEfromGeneExpressionQuantification(df, assay.list, genome = genome) + df <- make_se_from_gene_exoression_quantification(df, assay.list, genome = genome) } else { rownames(df) <- df$gene_id df$gene_id <- NULL @@ -567,7 +585,7 @@ readGeneExpressionQuantification <- function( } -makeSEfromGeneExpressionQuantification <- function( +make_se_from_gene_exoression_quantification <- function( df, assay.list, genome = "hg19" @@ -1127,10 +1145,10 @@ colDataPrepare <- function(barcode){ # Check if this breaks the package if(any(grepl("C3N-|C3L-",barcode))) { ret <- data.frame( - sample = sapply(barcode, function(x) stringr::str_split(x,";") %>% unlist()) %>% - unlist %>% unique,stringsAsFactors = FALSE + sample = map(barcode,.f = function(x) stringr::str_split(x,";") %>% unlist) %>% unlist() ) } + if(is.null(ret)) { ret <- data.frame( sample = barcode %>% unique, @@ -1147,14 +1165,12 @@ colDataPrepare <- function(barcode){ step = 10, items = ret$sample ) - if(!is.null(patient.info)) { ret$sample_submitter_id <- ret$sample %>% as.character() ret <- left_join(ret %>% as.data.frame, patient.info %>% unique, by = "sample_submitter_id") } ret$bcr_patient_barcode <- ret$sample %>% as.character() ret$sample_submitter_id <- ret$sample %>% as.character() - if(!"project_id" %in% colnames(ret)) { if("disease_type" %in% colnames(ret)){ aux <- getGDCprojects()[,c(5,7)] @@ -1168,7 +1184,15 @@ colDataPrepare <- function(barcode){ } # na.omit should not be here, exceptional case - if(is.null(ret)) return(data.frame(row.names = barcode, barcode,stringsAsFactors = FALSE)) + if(is.null(ret)) { + return( + data.frame( + row.names = barcode, + barcode, + stringsAsFactors = FALSE + ) + ) + } # Add purity information from http://www.nature.com/articles/ncomms9971 # purity <- getPurityinfo() @@ -1193,16 +1217,18 @@ colDataPrepare <- function(barcode){ if(any(ret$project_id == "CPTAC-3",na.rm = T)) { + print(ret) + save(ret,file = "test.rda") # only merge mixed samples - mixed.samples <- grep(";",barcode,value = T) - if(length(mixed.samples) > 0){ - mixed.samples <- unique(unlist(str_split(mixed.samples,";"))) + mixed_samples <- grep(";",barcode,value = T) + if(length(mixed_samples) > 0){ + mixed_samples <- mixed_samples %>% str_split(";") %>% unlist %>% unique - ret.mixed.samples <- ret %>% dplyr::filter(sample_submitter_id %in% mixed.samples) %>% - dplyr::group_by(submitter_id,sample_type) %>% + ret_mixed_samples <- ret %>% dplyr::filter(sample_submitter_id %in% mixed_samples) %>% + dplyr::group_by(submitter_id) %>% dplyr::summarise_all(~trimws(paste(unique(.), collapse = ';'))) %>% as.data.frame() - ret <- rbind(ret.mixed.samples,ret) + ret <- rbind(ret_mixed_samples,ret) } idx <- match(barcode,ret$bcr_patient_barcode) @@ -1567,7 +1593,11 @@ readTranscriptomeProfiling <- function( return(df) } -readGeneLevelCopyNumber <- function(files, cases, summarizedExperiment = FALSE){ +read_gene_level_copy_number <- function( + files, + cases, + summarizedExperiment = FALSE +){ message("Reading Gene Level Copy Number files") gistic.df <- NULL gistic.list <- plyr::alply(files,1,.fun = function(file) { @@ -1594,13 +1624,13 @@ readGeneLevelCopyNumber <- function(files, cases, summarizedExperiment = FALSE){ ) if(summarizedExperiment) { - se <- makeSEfromGeneLevelCopyNumber(df, cases) + se <- make_se_from_gene_level_copy_number(df, cases) return(se) } return(df) } -makeSEfromGeneLevelCopyNumber <- function(df, cases){ +make_se_from_gene_level_copy_number <- function(df, cases){ message("Creating a SummarizedExperiment object") rowRanges <- GRanges( seqnames = df$chromosome, @@ -1660,7 +1690,7 @@ readGISTIC <- function(files, cases){ # Reads Copy Number Variation files to a data frame, basically it will rbind it #' @importFrom purrr map2_dfr -readCopyNumberVariation <- function(files, cases){ +read_copy_number_variation <- function(files, cases){ message("Reading copy number variation files") col_types <- ifelse(any(grepl('ascat2', files)),"ccnnnnn","ccnnnd") @@ -1700,10 +1730,12 @@ getFFPE <- function(patient){ options.pretty <- "pretty=true" options.expand <- "expand=samples" option.size <- paste0("size=",length(patient)) - options.filter <- paste0("filters=", - URLencode('{"op":"and","content":[{"op":"in","content":{"field":"cases.submitter_id","value":['), - paste0('"',paste(patient,collapse = '","')), - URLencode('"]}}]}')) + options.filter <- paste0( + "filters=", + URLencode('{"op":"and","content":[{"op":"in","content":{"field":"cases.submitter_id","value":['), + paste0('"',paste(patient,collapse = '","')), + URLencode('"]}}]}') + ) url <- paste0(baseURL,paste(options.pretty,options.expand, option.size, options.filter, sep = "&")) json <- tryCatch( getURL(url,fromJSON,timeout(600),simplifyDataFrame = TRUE), @@ -1727,10 +1759,12 @@ getAliquot_ids <- function(barcode){ option.size <- paste0("size=",length(barcode)) #message(paste(barcode,collapse = '","')) #message(paste0('"',paste(barcode,collapse = '","'))) - options.filter <- paste0("filters=", - URLencode('{"op":"and","content":[{"op":"in","content":{"field":"cases.submitter_id","value":['), - paste0('"',paste(barcode,collapse = '","')), - URLencode('"]}}]}')) + options.filter <- paste0( + "filters=", + URLencode('{"op":"and","content":[{"op":"in","content":{"field":"cases.submitter_id","value":['), + paste0('"',paste(barcode,collapse = '","')), + URLencode('"]}}]}') + ) #message(paste0(baseURL,paste(options.pretty,options.expand, option.size, options.filter, sep = "&"))) url <- paste0(baseURL,paste(options.pretty,options.fields, option.size, options.filter, sep = "&")) json <- tryCatch( @@ -1852,7 +1886,10 @@ getBarcodeInfo <- function(barcode) { } } if(!is.null(results$exposures)) { - exposures <- rbindlist(lapply(results$exposures, function(x) if(is.null(x)) data.frame(NA) else x),fill = TRUE) + exposures <- rbindlist( + lapply(results$exposures, function(x) if(is.null(x)) data.frame(NA) else x), + fill = TRUE + ) exposures[,c("updated_datetime","created_datetime","state")] <- NULL if(any(grepl("submitter_id", colnames(exposures)))) { exposures$submitter_id <- gsub("-exposure|_exposure.*|-EXP","", exposures$submitter_id) @@ -1872,7 +1909,11 @@ getBarcodeInfo <- function(barcode) { demographic <- results$demographic demographic[,c("updated_datetime","created_datetime","state")] <- NULL if(any(grepl("submitter_id", colnames(demographic)))) { - demographic$submitter_id <- gsub("-demographic|_demographic.*|-DEMO|demo-","", results$demographic$submitter_id) + demographic$submitter_id <- gsub( + "-demographic|_demographic.*|-DEMO|demo-", + "", + results$demographic$submitter_id + ) } else { demographic$submitter_id <- submitter_id } @@ -1893,7 +1934,9 @@ getBarcodeInfo <- function(barcode) { .fun = function(x){ demographic[x,] %>% # replicate diagnoses the number of samples as.data.frame() %>% - dplyr::slice(rep(dplyr::row_number(), sum(results$submitter_sample_ids[[x]] %in% barcode)))}) + dplyr::slice( + rep(dplyr::row_number(), sum(results$submitter_sample_ids[[x]] %in% barcode))) + }) } df <- dplyr::bind_cols(df %>% as.data.frame,demographic) @@ -1910,9 +1953,11 @@ getBarcodeInfo <- function(barcode) { projects.info <- cbind("submitter_id" = submitter_id, projects.info) suppressWarnings({ - df <- left_join(df, - projects.info, - by = "submitter_id") + df <- left_join( + df, + projects.info, + by = "submitter_id" + ) }) } else { @@ -1922,7 +1967,9 @@ getBarcodeInfo <- function(barcode) { .fun = function(x){ projects.info[x,] %>% # replicate diagnoses the number of samples as.data.frame() %>% - dplyr::slice(rep(dplyr::row_number(), sum(results$submitter_sample_ids[[x]] %in% barcode)))}) + dplyr::slice( + rep(dplyr::row_number(), sum(results$submitter_sample_ids[[x]] %in% barcode))) + }) } df <- dplyr::bind_cols(df,projects.info) diff --git a/R/query.R b/R/query.R index 5a86d06f1..020eb13ae 100644 --- a/R/query.R +++ b/R/query.R @@ -153,7 +153,6 @@ GDCquery <- function( workflow.type, access, platform, - file.type, barcode, data.format, experimental.strategy, @@ -168,31 +167,31 @@ GDCquery <- function( } else if(all(sample.type == FALSE)) { sample.type <- NA } + if(missing(data.type)) { data.type <- NA } else if(data.type == FALSE) { data.type <- NA } + if(missing(barcode)) { barcode <- NA } else if(length(barcode) == 1) { if(barcode == FALSE) barcode <- NA } + if(missing(platform)) { platform <- NA } else if(any(platform == FALSE)) { platform <- NA } - if(missing(file.type)) { - file.type <- NA - } else if(file.type == FALSE) { - file.type <- NA - } + if(missing(workflow.type)) { workflow.type <- NA } else if(workflow.type == FALSE) { workflow.type <- NA } + if(missing(experimental.strategy)) { experimental.strategy <- NA } else if(experimental.strategy == FALSE) { @@ -203,6 +202,7 @@ GDCquery <- function( } else if(access == FALSE) { access <- NA } + if(missing(data.format)) { data.format <- NA } else if(data.format == FALSE) { @@ -226,7 +226,6 @@ GDCquery <- function( data.type = data.type, workflow.type = workflow.type, platform = platform, - file.type = file.type, files.access = access, experimental.strategy = experimental.strategy, sample.type = sample.type @@ -247,7 +246,6 @@ GDCquery <- function( data.type = data.type, workflow.type = NA, platform = NA, - file.type = file.type, experimental.strategy = experimental.strategy, files.access = access, sample.type = sample.type @@ -332,8 +330,12 @@ GDCquery <- function( message("ooo By experimental.strategy") results <- results[tolower(results$experimental_strategy) %in% tolower(experimental.strategy),] } else { - message(paste0("The argument experimental_strategy does not match any of the results.\nPossible values:", - paste(unique(results$experimental_strategy),collapse = "\n=>"))) + message( + paste0( + "The argument experimental_strategy does not match any of the results.\nPossible values:", + paste(unique(results$experimental_strategy),collapse = "\n=>") + ) + ) } } @@ -342,8 +344,12 @@ GDCquery <- function( message("ooo By data.format") results <- results[tolower(results$data_format) %in% tolower(data.format),] } else { - message(paste0("The argument experimental_strategy does not match any of the results.\nPossible values:", - paste(unique(results$data_format),collapse = "\n=>"))) + message( + paste0( + "The argument experimental_strategy does not match any of the results.\nPossible values:", + paste(unique(results$data_format),collapse = "\n=>") + ) + ) } } @@ -367,57 +373,12 @@ GDCquery <- function( results <- results[results$analysis_workflow_type %in% workflow.type,] } - - # Filter by file.type - if(!is.na(file.type)){ - message("ooo By file.type") - pat <- file.type - invert <- FALSE - - # RNA-seq - if(file.type == "normalized_results") pat <- "normalized_results" - if(file.type == "results") pat <- "[^normalized_]results" - - - if(file.type == "nocnv_hg18" | file.type == "nocnv_hg18.seg") pat <- "nocnv_hg18" - if(file.type == "cnv_hg18" | file.type == "hg18.seg") pat <- "[^nocnv_]hg18.seg" - if(file.type == "nocnv_hg19" | file.type == "nocnv_hg19.seg") pat <- "nocnv_hg19" - if(file.type == "cnv_hg19" | file.type == "hg19.seg") pat <- "[^nocnv_]hg19.seg" - - # miRNA-seq - # examples: - # TCGA-E9-A1R5-01A-11R-A14L-13.mirna.quantification.txt - if(file.type == "mirna") { - pat <- "hg19.*mirna" - invert <- TRUE - } - # TCGA-F5-6464-01A-11H-1735-13.hg19.mirna.quantification.txt - if(file.type == "hg19.mirna") pat <- "hg19.mirna" - - # TCGA-AC-A4ZE-01A-11R-A41G-13.hg19.mirbase20.mirna.quantification.txt - if(file.type == "hg19.mirbase20.mirna") pat <- "hg19.mirbase20.mirna" - - # TCGA-CJ-4878-01A-01R-1304-13.isoform.quantification.txt - if(file.type == "hg19.isoform") pat <- "hg19.*isoform" - if(file.type == "isoform") { - pat <- "hg19.*isoform" - invert <- TRUE - } - idx <- grep(pat,results$file_name,invert = invert) - if(length(idx) == 0) { - print(knitr::kable(sort(results$file_name)[1:10],col.names = "Files")) - stop("We were not able to filter using this file type. Examples of available files are above. Please check the vignette for possible entries") - } - results <- results[idx,] - } - # get barcode of the samples # 1) Normally for each sample we will have only single information # however the mutation call uses both normal and tumor which are both # reported by the API if(!data.category %in% c( "Clinical", - "Copy Number Variation", "Biospecimen", "Other", "Simple Nucleotide Variation", @@ -495,11 +456,12 @@ GDCquery <- function( # Auxiliary test files does not have information linked toit. # get frm file names results$cases <- str_extract_all(results$file_name,"TCGA-[:alnum:]{2}-[:alnum:]{4}") %>% unlist - } else if(data.category %in% c( - "Copy Number Variation", - "Simple nucleotide variation", - "Simple Nucleotide Variation") - ){ + } else if( + data.category %in% c( + "Simple nucleotide variation", + "Simple Nucleotide Variation" + ) + ) { cases <- plyr::laply( .data = results$cases, .fun = function(x) { @@ -621,7 +583,6 @@ GDCquery <- function( data.type = data.type, access = I(list(access)), experimental.strategy = I(list(experimental.strategy)), - file.type = file.type, platform = I(list(platform)), sample.type = I(list(sample.type)), barcode = I(list(barcode)), @@ -636,7 +597,6 @@ getGDCquery <- function( data.type, workflow.type, platform, - file.type, files.access, sample.type, experimental.strategy diff --git a/man/GDCprepare_clinic.Rd b/man/GDCprepare_clinic.Rd index d0f51f05c..c24daa1ca 100644 --- a/man/GDCprepare_clinic.Rd +++ b/man/GDCprepare_clinic.Rd @@ -26,7 +26,7 @@ based on the desired information query <- GDCquery( project = "TCGA-COAD", data.category = "Clinical", - file.type = "xml", + data.format = "bcr xml", barcode = c("TCGA-RU-A8FL","TCGA-AA-3972") ) GDCdownload(query) @@ -38,7 +38,7 @@ clinical.admin <- GDCprepare_clinic(query,"admin") query <- GDCquery( project = "TCGA-COAD", data.category = "Biospecimen", - file.type = "xml", + data.format = "bcr xml", data.type = "Biospecimen Supplement", barcode = c("TCGA-RU-A8FL","TCGA-AA-3972") ) diff --git a/man/GDCquery.Rd b/man/GDCquery.Rd index 10153d843..4cb973423 100644 --- a/man/GDCquery.Rd +++ b/man/GDCquery.Rd @@ -11,7 +11,6 @@ GDCquery( workflow.type, access, platform, - file.type, barcode, data.format, experimental.strategy, diff --git a/tests/testthat/test-prepare-download.R b/tests/testthat/test-prepare-download.R index 6b5ee2f9c..4ebaa0bac 100644 --- a/tests/testthat/test-prepare-download.R +++ b/tests/testthat/test-prepare-download.R @@ -124,20 +124,21 @@ test_that("Non TCGA data is processed", { skip_on_bioc() skip_if_offline() - proj <- "MMRF-COMMPASS" query <- GDCquery( - project = proj, - data.category = "Transcriptome Profiling", - data.type = "Gene Expression Quantification", - workflow.type = "STAR - Counts" - ) - query <- GDCquery( - project = proj, + project = "MMRF-COMMPASS", data.category = "Transcriptome Profiling", data.type = "Gene Expression Quantification", workflow.type = "STAR - Counts", - barcode = getResults(query)$cases[1:4] + barcode = c( + "MMRF_2737_1_BM_CD138pos_T2_TSMRU_L14993", + "MMRF_2739_1_BM_CD138pos_T2_TSMRU_L15000", + "MMRF_1865_1_BM_CD138pos_T2_TSMRU_L05342" + ) ) + GDCdownload(query,directory = "ex") + data <- GDCprepare(query,directory = "ex") + expect_true(ncol(data) == 3) + unlink("ex", recursive = TRUE, force = TRUE) }) test_that("Gene Level Copy Number is being correctly prepare", { @@ -155,6 +156,26 @@ test_that("Gene Level Copy Number is being correctly prepare", { data <- GDCprepare(query,directory = "ex") expect_true(all(substr(colnames(data),1,12) == c("TCGA-OR-A5JD","TCGA-OR-A5J7"))) + expect_true(data$days_to_last_follow_up == c(3038,NA)) + unlink("ex", recursive = TRUE, force = TRUE) +}) + +test_that("Gene Level Copy Number is being correctly prepare for CPTAC-3", { + skip_on_bioc() + skip_if_offline() + + query_CPTAC = GDCquery( + project = "CPTAC-3", + data.category = "Copy Number Variation", + data.type = "Gene Level Copy Number", + barcode = c("CPT0115240002","CPT0088960002") + ) + + GDCdownload(query_CPTAC,directory = "ex") + data <- GDCprepare(query_CPTAC,directory = "ex") + expect_true(ncol(data) == 2) + expect_true(data$submitter_id == c("C3L-02544","C3N-01179")) + expect_true(data$days_to_last_follow_up == c("889","1816")) unlink("ex", recursive = TRUE, force = TRUE) }) diff --git a/tests/testthat/test-query.R b/tests/testthat/test-query.R index f51c2297a..79fea5a56 100644 --- a/tests/testthat/test-query.R +++ b/tests/testthat/test-query.R @@ -96,21 +96,29 @@ test_that("GDCquery can filter by barcode", { ) expect_true(all(sort(barcode) == sort(unique(query$results[[1]]$cases)))) barcode <- c( "TCGA-OR-A5KU-01A-11D-A29H-01", "TCGA-OR-A5JK-01A-11D-A29H-01") - query <- GDCquery(project = "TCGA-ACC", - data.category = "Copy Number Variation", - data.type = "Copy Number Segment", - barcode = barcode) + query <- GDCquery( + project = "TCGA-ACC", + data.category = "Copy Number Variation", + data.type = "Copy Number Segment", + barcode = barcode + ) expect_true(all(sort(barcode) == sort(unique(query$results[[1]]$cases)))) barcode <- c("TCGA-OR-A5KU", "TCGA-OR-A5JK") - query <- GDCquery(project = "TCGA-ACC", - data.category = "Clinical", - file.type = "xml", - barcode = barcode) + query <- GDCquery( + project = "TCGA-ACC", + data.category = "Clinical", + data.format = "bcr xml", + barcode = barcode + ) expect_true(all(sort(barcode) == sort(unique(query$results[[1]]$cases)))) # Will work if barcode was not found - query <- GDCquery(project = "TCGA-BRCA", data.category = "Clinical",file.type = "xml", - barcode = c("TCGA-3C-AALK","TCGA-A2-A04Q","TCGA-A4-A04Q")) + query <- GDCquery( + project = "TCGA-BRCA", + data.category = "Clinical", + data.format = "bcr xml", + barcode = c("TCGA-3C-AALK","TCGA-A2-A04Q","TCGA-A4-A04Q") + ) expect_true(!all(c("TCGA-3C-AALK","TCGA-A2-A04Q","TCGA-A4-A04Q") %in% query$results[[1]]$cases)) }) @@ -119,13 +127,19 @@ test_that("GDCquery can filter by access level", { skip_on_bioc() skip_if_offline() - query <- GDCquery(project = "TCGA-KIRP", - data.category = "Simple Nucleotide Variation", - access = "open") + query <- GDCquery( + project = "TCGA-KIRP", + data.category = "Simple Nucleotide Variation", + access = "open" + ) expect_equal(unique(query$results[[1]]$access),"open") - query <- GDCquery(project = "TCGA-KIRP", - data.category = "Simple Nucleotide Variation", - access = "controlled") + + query <- GDCquery( + project = "TCGA-KIRP", + data.category = "Simple Nucleotide Variation", + data.type = "Raw Simple Somatic Mutation", + access = "controlled" + ) expect_equal(unique(query$results[[1]]$access),"controlled") }) @@ -133,7 +147,7 @@ test_that("getNbFiles and getNbCases works", { skip_on_bioc() skip_if_offline() - aux <- getProjectSummary("TCGA-LUAD",TRUE) + aux <- getProjectSummary(project = "TCGA-LUAD") files <- getNbFiles("TCGA-LUAD","Raw microarray data") cases <- getNbCases("TCGA-LUAD","Raw microarray data") expect_true(cases < files) diff --git a/vignettes/clinical.Rmd b/vignettes/clinical.Rmd index 33edd3033..6ab770fd1 100644 --- a/vignettes/clinical.Rmd +++ b/vignettes/clinical.Rmd @@ -74,17 +74,6 @@ query <- GDCquery( GDCdownload(query) clinical.BCRtab.all <- GDCprepare(query) names(clinical.BCRtab.all) - -query <- GDCquery( - project = "TCGA-ACC", - data.category = "Clinical", - data.type = "Clinical Supplement", - data.format = "BCR Biotab", - file.type = "radiation" -) -GDCdownload(query) -clinical.BCRtab.radiation <- GDCprepare(query) - ``` ```{r echo=TRUE, message=FALSE, warning=FALSE} @@ -219,7 +208,7 @@ Below are several examples fetching clinical data directly from the clinical XML query <- GDCquery( project = "TCGA-COAD", data.category = "Clinical", - file.type = "xml", + data.format = "bcr xml", barcode = c("TCGA-RU-A8FL","TCGA-AA-3972") ) GDCdownload(query) @@ -376,7 +365,7 @@ getclinical <- function(proj){ message(proj) while(1){ result = tryCatch({ - query <- GDCquery(project = proj, data.category = "Clinical",file.type = "xml") + query <- GDCquery(project = proj, data.category = "Clinical",data.format = "bcr xml") GDCdownload(query) clinical <- GDCprepare_clinic(query, clinical.info = "patient") for(i in c("admin","radiation","follow_up","drug","new_tumor_event")){