From b2bc706f2275867fbedb62707f0fe1c71a7bb26a Mon Sep 17 00:00:00 2001
From: Tiago Silva <tiagosilva@Tiagos-MBP-2.centurylink.net>
Date: Thu, 18 May 2023 10:53:48 -0400
Subject: [PATCH] Making some changes to the latest version. Removing
 legacy/harmonized from folder structure

Fixing #573
Fixing #574

Needs more checking with CNV and CPTAC-3 projects
---
 NAMESPACE                              |   1 +
 NEWS                                   |   1 +
 R/clinical.R                           |  36 +++--
 R/download.R                           |   4 +-
 R/prepare.R                            | 191 +++++++++++++++----------
 R/query.R                              |  88 ++++--------
 man/GDCprepare_clinic.Rd               |   4 +-
 man/GDCquery.Rd                        |   1 -
 tests/testthat/test-prepare-download.R |  39 +++--
 tests/testthat/test-query.R            |  48 ++++---
 vignettes/clinical.Rmd                 |  15 +-
 11 files changed, 232 insertions(+), 196 deletions(-)

diff --git a/NAMESPACE b/NAMESPACE
index d39dce727..9437c6330 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -109,6 +109,7 @@ importFrom(dplyr,mutate_all)
 importFrom(dplyr,pull)
 importFrom(dplyr,row_number)
 importFrom(dplyr,slice)
+importFrom(dplyr,summarise)
 importFrom(grDevices,dev.list)
 importFrom(grDevices,dev.off)
 importFrom(grDevices,pdf)
diff --git a/NEWS b/NEWS
index 2f4f54c80..e57e3bca2 100644
--- a/NEWS
+++ b/NEWS
@@ -2,6 +2,7 @@ CHANGES IN VERSION 2.29.1
 -------------------------
 
 * Removing support to legacy archive since it will be shutdown by GDC soon.
+* When saving files we will not include folders prefix  legacy/harmonized anymore
 
 CHANGES IN VERSION 2.21.1
 -------------------------
diff --git a/R/clinical.R b/R/clinical.R
index 1419a5a13..79fe7506b 100644
--- a/R/clinical.R
+++ b/R/clinical.R
@@ -170,6 +170,7 @@ TCGAquery_MatchedCoupledSampleTypes <- function(barcode,typesample){
 #' @export
 #' @importFrom data.table rbindlist as.data.table
 #' @importFrom jsonlite fromJSON
+#' @importFrom dplyr summarise
 #' @examples
 #' clinical <- GDCquery_clinic(
 #'    project = "TCGA-ACC",
@@ -348,7 +349,10 @@ GDCquery_clinic <- function(
                                 # we will collapse them into one single row
                                 # concatanating all columns using ;
                                 aux <- x %>% dplyr::group_by(submitter_id) %>%
-                                    summarise(across(everything(),~ paste(unique(.), collapse = ";")))
+                                    dplyr::summarise(
+                                        across(everything(),~ paste(unique(.), collapse = ";"))
+                                    )
+
                                 aux$treatments <- list(dplyr::bind_rows(x$treatments))
                                 aux
                             }
@@ -440,7 +444,7 @@ GDCquery_clinic <- function(
 #' query <- GDCquery(
 #'   project = "TCGA-COAD",
 #'   data.category = "Clinical",
-#'   file.type = "xml",
+#'   data.format = "bcr xml",
 #'   barcode = c("TCGA-RU-A8FL","TCGA-AA-3972")
 #' )
 #' GDCdownload(query)
@@ -452,7 +456,7 @@ GDCquery_clinic <- function(
 #' query <- GDCquery(
 #'    project = "TCGA-COAD",
 #'    data.category = "Biospecimen",
-#'    file.type = "xml",
+#'    data.format = "bcr xml",
 #'    data.type = "Biospecimen Supplement",
 #'    barcode = c("TCGA-RU-A8FL","TCGA-AA-3972")
 #' )
@@ -503,9 +507,9 @@ GDCprepare_clinic <- function(
     }
 
     # Get all the clincal xml files
-    source <- "harmonized"
+
     files <- file.path(
-        query$results[[1]]$project, source,
+        query$results[[1]]$project,
         gsub(" ","_",query$results[[1]]$data_category),
         gsub(" ","_",query$results[[1]]$data_type),
         gsub(" ","_",query$results[[1]]$file_id),
@@ -586,16 +590,18 @@ GDCprepare_clinic <- function(
         message("Updating days_to_last_followup and vital_status from follow_up information using last entry")
         followup <- parseFollowup(files,xpath,clinical.info)
 
-        followup_last <- followup %>% dplyr::group_by(bcr_patient_barcode) %>% dplyr::summarise(
-            days_to_last_followup = max(as.numeric(days_to_last_followup),na.rm = TRUE),
-            vital_status = vital_status[
-                ifelse(
-                    any(followup$days_to_last_followup %in% ""),
-                    which(followup$days_to_last_followup %in% ""),
-                    which.max(days_to_last_followup)
-                )
-            ]
-        )
+        followup_last <- followup %>%
+            dplyr::group_by(bcr_patient_barcode) %>%
+            dplyr::summarise(
+                days_to_last_followup = max(as.numeric(days_to_last_followup),na.rm = TRUE),
+                vital_status = vital_status[
+                    ifelse(
+                        any(followup$days_to_last_followup %in% ""),
+                        which(followup$days_to_last_followup %in% ""),
+                        which.max(days_to_last_followup)
+                    )
+                ]
+            )
         clin$days_to_last_followup <- followup_last$days_to_last_followup[match(clin$bcr_patient_barcode,followup_last$bcr_patient_barcode)]
         clin$vital_status <- followup_last$vital_status[match(clin$bcr_patient_barcode,followup_last$bcr_patient_barcode)]
     }
diff --git a/R/download.R b/R/download.R
index be812ff9f..16beabc96 100644
--- a/R/download.R
+++ b/R/download.R
@@ -64,8 +64,6 @@ GDCdownload <- function(
         stop("We can only download one data type. Please use data.type argument in GDCquery to filter results.")
     }
 
-    source <- "harmonized"
-
     dir.create(directory, showWarnings = FALSE, recursive = TRUE)
     for(proj in unique(unlist(query$project))){
         message("Downloading data for project ", proj)
@@ -77,7 +75,7 @@ GDCdownload <- function(
 
         path <- unique(
             file.path(
-                proj, source,
+                proj,
                 gsub(" ","_", results$data_category),
                 gsub(" ","_",results$data_type))
         )
diff --git a/R/prepare.R b/R/prepare.R
index fbfc3f172..92b419ee9 100644
--- a/R/prepare.R
+++ b/R/prepare.R
@@ -60,21 +60,20 @@ GDCprepare <- function(
     isServeOK()
     if(missing(query)) stop("Please set query parameter")
 
-    test.duplicated.cases <-
-        (
-            any(
-                duplicated(query$results[[1]]$cases)) & # any duplicated
-                !(query$data.type %in% c(
-                    "Clinical data",
-                    "Protein expression quantification",
-                    "Raw intensities",
-                    "Masked Intensities",
-                    "Clinical Supplement",
-                    "Masked Somatic Mutation",
-                    "Biospecimen Supplement"
-                )
-                )
-        )
+    test.duplicated.cases <- (
+        any(
+            duplicated(query$results[[1]]$cases)) & # any duplicated
+            !(query$data.type %in% c(
+                "Clinical data",
+                "Protein expression quantification",
+                "Raw intensities",
+                "Masked Intensities",
+                "Clinical Supplement",
+                "Masked Somatic Mutation",
+                "Biospecimen Supplement"
+            )
+            )
+    )
 
 
     if(test.duplicated.cases) {
@@ -90,10 +89,10 @@ GDCprepare <- function(
     if (!save & remove.files.prepared) {
         stop("To remove the files, please set save to TRUE. Otherwise, the data will be lost")
     }
-    # We save the files in project/source/data.category/data.type/file_id/file_name
-    source <- "harmonized"
+
+    # We save the files in project/data.category/data.type/file_id/file_name
     files <- file.path(
-        query$results[[1]]$project, source,
+        query$results[[1]]$project,
         gsub(" ","_",query$results[[1]]$data_category),
         gsub(" ","_",query$results[[1]]$data_type),
         gsub(" ","_",query$results[[1]]$file_id),
@@ -105,14 +104,19 @@ GDCprepare <- function(
     # For IDAT prepare since we need to put all IDATs in the same folder the code below will not work
     # a second run
     if (!all(file.exists(files))) {
-        # We have to check we movedthe files
-        if (query$data.type == "Masked Intensities" | query$data.category == "Raw microarray data"){
+        # We have to check we moved the files
+        if (
+            unique(query$results[[1]]$data_type) == "Masked Intensities" |
+            unique(query$results[[1]]$data_category) == "Raw microarray data"
+        ){
+
             files.idat <- file.path(
-                query$results[[1]]$project, source,
+                query$results[[1]]$project,
                 gsub(" ","_",query$results[[1]]$data_category),
                 gsub(" ","_",query$results[[1]]$data_type),
                 gsub(" ","_",query$results[[1]]$file_name)
             )
+
             files.idat <- file.path(directory, files.idat)
             if (!all(file.exists(files) | file.exists(files.idat))) {
                 stop(
@@ -135,7 +139,7 @@ GDCprepare <- function(
     }
 
     cases <- ifelse(
-        grepl("TCGA|TARGET|CGCI-HTMCP-CC",query$results[[1]]$project %>% unlist()),
+        grepl("TCGA|TARGET|CGCI-HTMCP-CC|CPTAC-3",query$results[[1]]$project %>% unlist()),
         query$results[[1]]$cases,
         query$results[[1]]$sample.submitter_id
     )
@@ -165,9 +169,15 @@ GDCprepare <- function(
         if (unique(query$results[[1]]$data_type) == "Gene Level Copy Number Scores") {
             data <- readGISTIC(files, query$results[[1]]$cases)
         } else if (unique(query$results[[1]]$data_type) == "Gene Level Copy Number") {
-            data <- readGeneLevelCopyNumber(files, query$results[[1]]$cases,summarizedExperiment = summarizedExperiment)
+            data <- read_gene_level_copy_number(
+                files = files,
+                cases = query$results[[1]]$sample.submitter_id,
+                summarizedExperiment = summarizedExperiment
+            )
         } else {
-            data <- readCopyNumberVariation(files, query$results[[1]]$cases)
+            data <- read_copy_number_variation(
+                files = files, cases = query$results[[1]]$cases
+            )
         }
     }  else if (grepl("Methylation Beta Value",unique(query$results[[1]]$data_type), ignore.case = TRUE)) {
         data <- readDNAmethylation(
@@ -184,29 +194,30 @@ GDCprepare <- function(
             summarizedExperiment = summarizedExperiment,
             platform =  unique(query$results[[1]]$platform)
         )
-    }  else if (grepl("Proteome Profiling",query$data.category,ignore.case = TRUE)) {
+    } else if (grepl("Proteome Profiling",query$data.category,ignore.case = TRUE)) {
 
         data <- readProteomeProfiling(files, cases = cases)
 
-    }  else if (grepl("Protein expression",query$data.category,ignore.case = TRUE)) {
+    } else if (grepl("Protein expression",query$data.category,ignore.case = TRUE)) {
 
         data <- readProteinExpression(files, cases = cases)
-        if(summarizedExperiment) {
+
+        if (summarizedExperiment) {
             message("SummarizedExperiment not implemented, if you need samples metadata use the function TCGAbiolinks:::colDataPrepare")
         }
 
     }  else if (grepl("Simple Nucleotide Variation",query$data.category,ignore.case = TRUE)) {
 
-        if(grepl("Masked Somatic Mutation",query$results[[1]]$data_type[1],ignore.case = TRUE)){
+        if (grepl("Masked Somatic Mutation",query$results[[1]]$data_type[1],ignore.case = TRUE)){
             data <- readSimpleNucleotideVariationMaf(files)
         }
 
     }  else if (grepl("Clinical|Biospecimen", query$data.category, ignore.case = TRUE)){
-        data <- readClinical(files, query$data.type, cases = cases)
+        data <- read_clinical(files, query$data.type, cases = cases)
         summarizedExperiment <- FALSE
     } else if (grepl("Gene expression",query$data.category,ignore.case = TRUE)) {
         if (query$data.type == "Gene expression quantification")
-            data <- readGeneExpressionQuantification(
+            data <- read_gene_expression_quantification(
                 files = files,
                 cases = cases,
                 summarizedExperiment = summarizedExperiment,
@@ -215,7 +226,7 @@ GDCprepare <- function(
             )
 
         if (query$data.type == "miRNA gene quantification")
-            data <- readGeneExpressionQuantification(
+            data <- read_gene_expression_quantification(
                 files = files,
                 cases = cases,
                 summarizedExperiment = FALSE,
@@ -288,22 +299,22 @@ GDCprepare <- function(
         # save is true, due to the check in the beggining of the code
         if(remove.files.prepared){
             # removes files and empty directories
-            remove.files.recursively(files)
+            remove_files_recursively(files)
         }
     }
 
     return(data)
 }
 
-remove.files.recursively <- function(files){
+remove_files_recursively <- function(files){
     files2rm <- dirname(files)
     unlink(files2rm,recursive = TRUE)
     files2rm <- dirname(files2rm) # data category
-    if(length(list.files(files2rm)) == 0) remove.files.recursively(files2rm)
+    if(length(list.files(files2rm)) == 0) remove_files_recursively(files2rm)
 }
 
 
-readClinical <- function(files, data.type, cases){
+read_clinical <- function(files, data.type, cases){
     if(data.type == "Clinical data"){
         suppressMessages({
             ret <- plyr::alply(files,.margins = 1,readr::read_tsv, .progress = "text")
@@ -366,14 +377,17 @@ readSingleCellAnalysis <- function(
 }
 
 #' @importFrom tidyr separate
-readExonQuantification <- function (files, cases, summarizedExperiment = TRUE){
+readExonQuantification <- function (
+        files,
+        cases,
+        summarizedExperiment = TRUE
+){
     pb <- txtProgressBar(min = 0, max = length(files), style = 3)
     assay.list <- NULL
 
     for (i in seq_along(files)) {
         data <- fread(files[i], header = TRUE, sep = "\t", stringsAsFactors = FALSE)
 
-
         if(!missing(cases)) {
             assay.list <- gsub(" |\\(|\\)|\\/","_",colnames(data)[2:ncol(data)])
             # We will use this because there might be more than one col for each samples
@@ -385,12 +399,14 @@ readExonQuantification <- function (files, cases, summarizedExperiment = TRUE){
         } else {
             df <- merge(df, data, by=colnames(data)[1], all = TRUE)
         }
+
         setTxtProgressBar(pb, i)
     }
     setDF(df)
     rownames(df) <- df[,1]
     df <- df %>% separate(exon,into = c("seqnames","coordinates","strand"),sep = ":") %>%
         separate(coordinates,into = c("start","end"),sep = "-")
+
     if(summarizedExperiment) {
         suppressWarnings({
             assays <- lapply(assay.list, function (x) {
@@ -398,8 +414,10 @@ readExonQuantification <- function (files, cases, summarizedExperiment = TRUE){
             })
         })
         names(assays) <- assay.list
-        regex <- paste0("[:alnum:]{4}-[:alnum:]{2}-[:alnum:]{4}",
-                        "-[:alnum:]{3}-[:alnum:]{3}-[:alnum:]{4}-[:alnum:]{2}")
+        regex <- paste0(
+            "[:alnum:]{4}-[:alnum:]{2}-[:alnum:]{4}",
+            "-[:alnum:]{3}-[:alnum:]{3}-[:alnum:]{4}-[:alnum:]{2}"
+        )
         samples <- na.omit(unique(str_match(colnames(df),regex)[,1]))
         colData <-  colDataPrepare(samples)
         assays <- lapply(assays, function(x){
@@ -509,7 +527,7 @@ readSimpleNucleotideVariationMaf <- function(files){
     return(ret)
 }
 
-readGeneExpressionQuantification <- function(
+read_gene_expression_quantification <- function(
         files,
         cases,
         genome = "hg19",
@@ -558,7 +576,7 @@ readGeneExpressionQuantification <- function(
     df <- bind_cols(ret[[1]][,1],df)
 
     if (summarizedExperiment) {
-        df <- makeSEfromGeneExpressionQuantification(df, assay.list, genome = genome)
+        df <- make_se_from_gene_exoression_quantification(df, assay.list, genome = genome)
     } else {
         rownames(df) <- df$gene_id
         df$gene_id <- NULL
@@ -567,7 +585,7 @@ readGeneExpressionQuantification <- function(
 }
 
 
-makeSEfromGeneExpressionQuantification <- function(
+make_se_from_gene_exoression_quantification <- function(
         df,
         assay.list,
         genome = "hg19"
@@ -1127,10 +1145,10 @@ colDataPrepare <- function(barcode){
     # Check if this breaks the package
     if(any(grepl("C3N-|C3L-",barcode))) {
         ret <- data.frame(
-            sample =  sapply(barcode, function(x) stringr::str_split(x,";") %>% unlist()) %>%
-                unlist %>% unique,stringsAsFactors = FALSE
+            sample = map(barcode,.f = function(x) stringr::str_split(x,";") %>% unlist) %>% unlist()
         )
     }
+
     if(is.null(ret)) {
         ret <- data.frame(
             sample = barcode %>% unique,
@@ -1147,14 +1165,12 @@ colDataPrepare <- function(barcode){
         step = 10,
         items = ret$sample
     )
-
     if(!is.null(patient.info)) {
         ret$sample_submitter_id <- ret$sample %>% as.character()
         ret <- left_join(ret %>% as.data.frame, patient.info %>% unique, by = "sample_submitter_id")
     }
     ret$bcr_patient_barcode <- ret$sample %>% as.character()
     ret$sample_submitter_id <- ret$sample %>% as.character()
-
     if(!"project_id" %in% colnames(ret)) {
         if("disease_type" %in% colnames(ret)){
             aux <- getGDCprojects()[,c(5,7)]
@@ -1168,7 +1184,15 @@ colDataPrepare <- function(barcode){
     }
 
     # na.omit should not be here, exceptional case
-    if(is.null(ret)) return(data.frame(row.names = barcode, barcode,stringsAsFactors = FALSE))
+    if(is.null(ret)) {
+        return(
+            data.frame(
+                row.names = barcode,
+                barcode,
+                stringsAsFactors = FALSE
+            )
+        )
+    }
 
     # Add purity information from http://www.nature.com/articles/ncomms9971
     # purity  <- getPurityinfo()
@@ -1193,16 +1217,18 @@ colDataPrepare <- function(barcode){
 
     if(any(ret$project_id == "CPTAC-3",na.rm = T)) {
 
+        print(ret)
+        save(ret,file = "test.rda")
         # only merge mixed samples
-        mixed.samples <- grep(";",barcode,value = T)
-        if(length(mixed.samples) > 0){
-            mixed.samples <- unique(unlist(str_split(mixed.samples,";")))
+        mixed_samples <- grep(";",barcode,value = T)
+        if(length(mixed_samples) > 0){
+            mixed_samples <- mixed_samples %>% str_split(";") %>% unlist %>% unique
 
-            ret.mixed.samples <- ret %>% dplyr::filter(sample_submitter_id %in% mixed.samples) %>%
-                dplyr::group_by(submitter_id,sample_type) %>%
+            ret_mixed_samples <- ret %>% dplyr::filter(sample_submitter_id %in% mixed_samples) %>%
+                dplyr::group_by(submitter_id) %>%
                 dplyr::summarise_all(~trimws(paste(unique(.), collapse = ';'))) %>%
                 as.data.frame()
-            ret <- rbind(ret.mixed.samples,ret)
+            ret <- rbind(ret_mixed_samples,ret)
         }
         idx <- match(barcode,ret$bcr_patient_barcode)
 
@@ -1567,7 +1593,11 @@ readTranscriptomeProfiling <- function(
     return(df)
 }
 
-readGeneLevelCopyNumber <- function(files, cases, summarizedExperiment = FALSE){
+read_gene_level_copy_number <- function(
+        files,
+        cases,
+        summarizedExperiment = FALSE
+){
     message("Reading Gene Level Copy Number files")
     gistic.df <- NULL
     gistic.list <- plyr::alply(files,1,.fun = function(file) {
@@ -1594,13 +1624,13 @@ readGeneLevelCopyNumber <- function(files, cases, summarizedExperiment = FALSE){
     )
 
     if(summarizedExperiment) {
-        se <- makeSEfromGeneLevelCopyNumber(df, cases)
+        se <- make_se_from_gene_level_copy_number(df, cases)
         return(se)
     }
     return(df)
 }
 
-makeSEfromGeneLevelCopyNumber <- function(df, cases){
+make_se_from_gene_level_copy_number <- function(df, cases){
     message("Creating a SummarizedExperiment object")
     rowRanges <- GRanges(
         seqnames = df$chromosome,
@@ -1660,7 +1690,7 @@ readGISTIC <- function(files, cases){
 
 # Reads Copy Number Variation files to a data frame, basically it will rbind it
 #' @importFrom purrr map2_dfr
-readCopyNumberVariation <- function(files, cases){
+read_copy_number_variation <- function(files, cases){
     message("Reading copy number variation files")
 
     col_types <- ifelse(any(grepl('ascat2', files)),"ccnnnnn","ccnnnd")
@@ -1700,10 +1730,12 @@ getFFPE <- function(patient){
     options.pretty <- "pretty=true"
     options.expand <- "expand=samples"
     option.size <- paste0("size=",length(patient))
-    options.filter <- paste0("filters=",
-                             URLencode('{"op":"and","content":[{"op":"in","content":{"field":"cases.submitter_id","value":['),
-                             paste0('"',paste(patient,collapse = '","')),
-                             URLencode('"]}}]}'))
+    options.filter <- paste0(
+        "filters=",
+        URLencode('{"op":"and","content":[{"op":"in","content":{"field":"cases.submitter_id","value":['),
+        paste0('"',paste(patient,collapse = '","')),
+        URLencode('"]}}]}')
+    )
     url <- paste0(baseURL,paste(options.pretty,options.expand, option.size, options.filter, sep = "&"))
     json  <- tryCatch(
         getURL(url,fromJSON,timeout(600),simplifyDataFrame = TRUE),
@@ -1727,10 +1759,12 @@ getAliquot_ids <- function(barcode){
     option.size <- paste0("size=",length(barcode))
     #message(paste(barcode,collapse = '","'))
     #message(paste0('"',paste(barcode,collapse = '","')))
-    options.filter <- paste0("filters=",
-                             URLencode('{"op":"and","content":[{"op":"in","content":{"field":"cases.submitter_id","value":['),
-                             paste0('"',paste(barcode,collapse = '","')),
-                             URLencode('"]}}]}'))
+    options.filter <- paste0(
+        "filters=",
+        URLencode('{"op":"and","content":[{"op":"in","content":{"field":"cases.submitter_id","value":['),
+        paste0('"',paste(barcode,collapse = '","')),
+        URLencode('"]}}]}')
+    )
     #message(paste0(baseURL,paste(options.pretty,options.expand, option.size, options.filter, sep = "&")))
     url <- paste0(baseURL,paste(options.pretty,options.fields, option.size, options.filter, sep = "&"))
     json  <- tryCatch(
@@ -1852,7 +1886,10 @@ getBarcodeInfo <- function(barcode) {
         }
     }
     if(!is.null(results$exposures)) {
-        exposures <- rbindlist(lapply(results$exposures, function(x) if(is.null(x)) data.frame(NA) else x),fill = TRUE)
+        exposures <- rbindlist(
+            lapply(results$exposures, function(x) if(is.null(x)) data.frame(NA) else x),
+            fill = TRUE
+        )
         exposures[,c("updated_datetime","created_datetime","state")] <- NULL
         if(any(grepl("submitter_id", colnames(exposures)))) {
             exposures$submitter_id <- gsub("-exposure|_exposure.*|-EXP","", exposures$submitter_id)
@@ -1872,7 +1909,11 @@ getBarcodeInfo <- function(barcode) {
         demographic <- results$demographic
         demographic[,c("updated_datetime","created_datetime","state")] <- NULL
         if(any(grepl("submitter_id", colnames(demographic)))) {
-            demographic$submitter_id <- gsub("-demographic|_demographic.*|-DEMO|demo-","", results$demographic$submitter_id)
+            demographic$submitter_id <- gsub(
+                "-demographic|_demographic.*|-DEMO|demo-",
+                "",
+                results$demographic$submitter_id
+            )
         } else {
             demographic$submitter_id <- submitter_id
         }
@@ -1893,7 +1934,9 @@ getBarcodeInfo <- function(barcode) {
                     .fun = function(x){
                         demographic[x,] %>% # replicate diagnoses the number of samples
                             as.data.frame() %>%
-                            dplyr::slice(rep(dplyr::row_number(), sum(results$submitter_sample_ids[[x]] %in% barcode)))})
+                            dplyr::slice(
+                                rep(dplyr::row_number(), sum(results$submitter_sample_ids[[x]] %in% barcode)))
+                    })
             }
 
             df <- dplyr::bind_cols(df  %>% as.data.frame,demographic)
@@ -1910,9 +1953,11 @@ getBarcodeInfo <- function(barcode) {
         projects.info <-  cbind("submitter_id" = submitter_id, projects.info)
 
         suppressWarnings({
-            df <- left_join(df,
-                            projects.info,
-                            by = "submitter_id")
+            df <- left_join(
+                df,
+                projects.info,
+                by = "submitter_id"
+            )
         })
     } else {
 
@@ -1922,7 +1967,9 @@ getBarcodeInfo <- function(barcode) {
                 .fun = function(x){
                     projects.info[x,] %>% # replicate diagnoses the number of samples
                         as.data.frame() %>%
-                        dplyr::slice(rep(dplyr::row_number(), sum(results$submitter_sample_ids[[x]] %in% barcode)))})
+                        dplyr::slice(
+                            rep(dplyr::row_number(), sum(results$submitter_sample_ids[[x]] %in% barcode)))
+                })
         }
 
         df <- dplyr::bind_cols(df,projects.info)
diff --git a/R/query.R b/R/query.R
index 5a86d06f1..020eb13ae 100644
--- a/R/query.R
+++ b/R/query.R
@@ -153,7 +153,6 @@ GDCquery <- function(
         workflow.type,
         access,
         platform,
-        file.type,
         barcode,
         data.format,
         experimental.strategy,
@@ -168,31 +167,31 @@ GDCquery <- function(
         } else if(all(sample.type == FALSE)) {
             sample.type <- NA
         }
+
         if(missing(data.type)) {
             data.type <- NA
         } else if(data.type == FALSE) {
             data.type <- NA
         }
+
         if(missing(barcode)) {
             barcode <- NA
         } else if(length(barcode) == 1) {
             if(barcode == FALSE) barcode <- NA
         }
+
         if(missing(platform)) {
             platform <- NA
         } else if(any(platform == FALSE)) {
             platform <- NA
         }
-        if(missing(file.type)) {
-            file.type <- NA
-        } else if(file.type == FALSE) {
-            file.type <- NA
-        }
+
         if(missing(workflow.type)) {
             workflow.type <- NA
         } else if(workflow.type == FALSE) {
             workflow.type <- NA
         }
+
         if(missing(experimental.strategy)) {
             experimental.strategy <- NA
         } else if(experimental.strategy == FALSE) {
@@ -203,6 +202,7 @@ GDCquery <- function(
         } else if(access == FALSE) {
             access <- NA
         }
+
         if(missing(data.format)) {
             data.format <- NA
         } else if(data.format == FALSE) {
@@ -226,7 +226,6 @@ GDCquery <- function(
             data.type = data.type,
             workflow.type = workflow.type,
             platform = platform,
-            file.type = file.type,
             files.access = access,
             experimental.strategy = experimental.strategy,
             sample.type = sample.type
@@ -247,7 +246,6 @@ GDCquery <- function(
                 data.type = data.type,
                 workflow.type = NA,
                 platform = NA,
-                file.type = file.type,
                 experimental.strategy = experimental.strategy,
                 files.access = access,
                 sample.type = sample.type
@@ -332,8 +330,12 @@ GDCquery <- function(
             message("ooo By experimental.strategy")
             results <- results[tolower(results$experimental_strategy) %in% tolower(experimental.strategy),]
         } else {
-            message(paste0("The argument experimental_strategy does not match any of the results.\nPossible values:",
-                           paste(unique(results$experimental_strategy),collapse = "\n=>")))
+            message(
+                paste0(
+                    "The argument experimental_strategy does not match any of the results.\nPossible values:",
+                    paste(unique(results$experimental_strategy),collapse = "\n=>")
+                )
+            )
         }
     }
 
@@ -342,8 +344,12 @@ GDCquery <- function(
             message("ooo By data.format")
             results <- results[tolower(results$data_format) %in% tolower(data.format),]
         } else {
-            message(paste0("The argument experimental_strategy does not match any of the results.\nPossible values:",
-                           paste(unique(results$data_format),collapse = "\n=>")))
+            message(
+                paste0(
+                    "The argument experimental_strategy does not match any of the results.\nPossible values:",
+                    paste(unique(results$data_format),collapse = "\n=>")
+                )
+            )
         }
     }
 
@@ -367,57 +373,12 @@ GDCquery <- function(
         results <- results[results$analysis_workflow_type %in% workflow.type,]
     }
 
-
-    # Filter by file.type
-    if(!is.na(file.type)){
-        message("ooo By file.type")
-        pat <- file.type
-        invert <- FALSE
-
-        # RNA-seq
-        if(file.type == "normalized_results") pat <- "normalized_results"
-        if(file.type == "results") pat <- "[^normalized_]results"
-
-
-        if(file.type == "nocnv_hg18" | file.type == "nocnv_hg18.seg") pat <- "nocnv_hg18"
-        if(file.type == "cnv_hg18" | file.type == "hg18.seg") pat <- "[^nocnv_]hg18.seg"
-        if(file.type == "nocnv_hg19" | file.type == "nocnv_hg19.seg") pat <- "nocnv_hg19"
-        if(file.type == "cnv_hg19" | file.type == "hg19.seg") pat <- "[^nocnv_]hg19.seg"
-
-        # miRNA-seq
-        # examples:
-        # TCGA-E9-A1R5-01A-11R-A14L-13.mirna.quantification.txt
-        if(file.type == "mirna") {
-            pat <-  "hg19.*mirna"
-            invert <- TRUE
-        }
-        # TCGA-F5-6464-01A-11H-1735-13.hg19.mirna.quantification.txt
-        if(file.type == "hg19.mirna") pat <- "hg19.mirna"
-
-        # TCGA-AC-A4ZE-01A-11R-A41G-13.hg19.mirbase20.mirna.quantification.txt
-        if(file.type == "hg19.mirbase20.mirna") pat <- "hg19.mirbase20.mirna"
-
-        # TCGA-CJ-4878-01A-01R-1304-13.isoform.quantification.txt
-        if(file.type == "hg19.isoform") pat <- "hg19.*isoform"
-        if(file.type == "isoform") {
-            pat <-  "hg19.*isoform"
-            invert <- TRUE
-        }
-        idx <- grep(pat,results$file_name,invert = invert)
-        if(length(idx) == 0) {
-            print(knitr::kable(sort(results$file_name)[1:10],col.names = "Files"))
-            stop("We were not able to filter using this file type. Examples of available files are above. Please check the vignette for possible entries")
-        }
-        results <- results[idx,]
-    }
-
     # get barcode of the samples
     # 1) Normally for each sample we will have only single information
     # however the mutation call uses both normal and tumor which are both
     # reported by the API
     if(!data.category %in% c(
         "Clinical",
-        "Copy Number Variation",
         "Biospecimen",
         "Other",
         "Simple Nucleotide Variation",
@@ -495,11 +456,12 @@ GDCquery <- function(
         # Auxiliary test files does not have information linked toit.
         # get frm file names
         results$cases <- str_extract_all(results$file_name,"TCGA-[:alnum:]{2}-[:alnum:]{4}") %>% unlist
-    } else if(data.category %in% c(
-        "Copy Number Variation",
-        "Simple nucleotide variation",
-        "Simple Nucleotide Variation")
-    ){
+    } else if(
+        data.category %in% c(
+            "Simple nucleotide variation",
+            "Simple Nucleotide Variation"
+        )
+    ) {
         cases <- plyr::laply(
             .data = results$cases,
             .fun =  function(x) {
@@ -621,7 +583,6 @@ GDCquery <- function(
         data.type = data.type,
         access = I(list(access)),
         experimental.strategy =  I(list(experimental.strategy)),
-        file.type = file.type,
         platform = I(list(platform)),
         sample.type = I(list(sample.type)),
         barcode = I(list(barcode)),
@@ -636,7 +597,6 @@ getGDCquery <- function(
         data.type,
         workflow.type,
         platform,
-        file.type,
         files.access,
         sample.type,
         experimental.strategy
diff --git a/man/GDCprepare_clinic.Rd b/man/GDCprepare_clinic.Rd
index d0f51f05c..c24daa1ca 100644
--- a/man/GDCprepare_clinic.Rd
+++ b/man/GDCprepare_clinic.Rd
@@ -26,7 +26,7 @@ based on the desired information
 query <- GDCquery(
   project = "TCGA-COAD",
   data.category = "Clinical",
-  file.type = "xml",
+  data.format = "bcr xml",
   barcode = c("TCGA-RU-A8FL","TCGA-AA-3972")
 )
 GDCdownload(query)
@@ -38,7 +38,7 @@ clinical.admin <- GDCprepare_clinic(query,"admin")
 query <- GDCquery(
    project = "TCGA-COAD",
    data.category = "Biospecimen",
-   file.type = "xml",
+   data.format = "bcr xml",
    data.type = "Biospecimen Supplement",
    barcode = c("TCGA-RU-A8FL","TCGA-AA-3972")
 )
diff --git a/man/GDCquery.Rd b/man/GDCquery.Rd
index 10153d843..4cb973423 100644
--- a/man/GDCquery.Rd
+++ b/man/GDCquery.Rd
@@ -11,7 +11,6 @@ GDCquery(
   workflow.type,
   access,
   platform,
-  file.type,
   barcode,
   data.format,
   experimental.strategy,
diff --git a/tests/testthat/test-prepare-download.R b/tests/testthat/test-prepare-download.R
index 6b5ee2f9c..4ebaa0bac 100644
--- a/tests/testthat/test-prepare-download.R
+++ b/tests/testthat/test-prepare-download.R
@@ -124,20 +124,21 @@ test_that("Non TCGA data is processed", {
     skip_on_bioc()
     skip_if_offline()
 
-    proj <- "MMRF-COMMPASS"
     query <- GDCquery(
-        project = proj,
-        data.category = "Transcriptome Profiling",
-        data.type = "Gene Expression Quantification",
-        workflow.type = "STAR - Counts"
-    )
-    query <- GDCquery(
-        project = proj,
+        project = "MMRF-COMMPASS",
         data.category = "Transcriptome Profiling",
         data.type = "Gene Expression Quantification",
         workflow.type = "STAR - Counts",
-        barcode = getResults(query)$cases[1:4]
+        barcode = c(
+            "MMRF_2737_1_BM_CD138pos_T2_TSMRU_L14993",
+            "MMRF_2739_1_BM_CD138pos_T2_TSMRU_L15000",
+            "MMRF_1865_1_BM_CD138pos_T2_TSMRU_L05342"
+        )
     )
+    GDCdownload(query,directory = "ex")
+    data <- GDCprepare(query,directory = "ex")
+    expect_true(ncol(data) == 3)
+    unlink("ex", recursive = TRUE, force = TRUE)
 })
 
 test_that("Gene Level Copy Number is being correctly prepare", {
@@ -155,6 +156,26 @@ test_that("Gene Level Copy Number is being correctly prepare", {
     data <- GDCprepare(query,directory = "ex")
 
     expect_true(all(substr(colnames(data),1,12) == c("TCGA-OR-A5JD","TCGA-OR-A5J7")))
+    expect_true(data$days_to_last_follow_up == c(3038,NA))
+    unlink("ex", recursive = TRUE, force = TRUE)
+})
+
+test_that("Gene Level Copy Number is being correctly prepare for CPTAC-3", {
+    skip_on_bioc()
+    skip_if_offline()
+
+    query_CPTAC = GDCquery(
+        project = "CPTAC-3",
+        data.category = "Copy Number Variation",
+        data.type = "Gene Level Copy Number",
+        barcode = c("CPT0115240002","CPT0088960002")
+    )
+
+    GDCdownload(query_CPTAC,directory = "ex")
+    data <- GDCprepare(query_CPTAC,directory = "ex")
+    expect_true(ncol(data) == 2)
+    expect_true(data$submitter_id == c("C3L-02544","C3N-01179"))
+    expect_true(data$days_to_last_follow_up == c("889","1816"))
     unlink("ex", recursive = TRUE, force = TRUE)
 })
 
diff --git a/tests/testthat/test-query.R b/tests/testthat/test-query.R
index f51c2297a..79fea5a56 100644
--- a/tests/testthat/test-query.R
+++ b/tests/testthat/test-query.R
@@ -96,21 +96,29 @@ test_that("GDCquery can filter by barcode", {
     )
     expect_true(all(sort(barcode) == sort(unique(query$results[[1]]$cases))))
     barcode <- c( "TCGA-OR-A5KU-01A-11D-A29H-01", "TCGA-OR-A5JK-01A-11D-A29H-01")
-    query <- GDCquery(project = "TCGA-ACC",
-                      data.category = "Copy Number Variation",
-                      data.type = "Copy Number Segment",
-                      barcode = barcode)
+    query <- GDCquery(
+        project = "TCGA-ACC",
+        data.category = "Copy Number Variation",
+        data.type = "Copy Number Segment",
+        barcode = barcode
+    )
     expect_true(all(sort(barcode) == sort(unique(query$results[[1]]$cases))))
     barcode <- c("TCGA-OR-A5KU", "TCGA-OR-A5JK")
-    query <- GDCquery(project = "TCGA-ACC",
-                      data.category = "Clinical",
-                      file.type = "xml",
-                      barcode = barcode)
+    query <- GDCquery(
+        project = "TCGA-ACC",
+        data.category = "Clinical",
+        data.format = "bcr xml",
+        barcode = barcode
+    )
     expect_true(all(sort(barcode) == sort(unique(query$results[[1]]$cases))))
 
     # Will work if barcode was not found
-    query <- GDCquery(project = "TCGA-BRCA", data.category = "Clinical",file.type = "xml",
-                      barcode = c("TCGA-3C-AALK","TCGA-A2-A04Q","TCGA-A4-A04Q"))
+    query <- GDCquery(
+        project = "TCGA-BRCA",
+        data.category = "Clinical",
+        data.format = "bcr xml",
+        barcode = c("TCGA-3C-AALK","TCGA-A2-A04Q","TCGA-A4-A04Q")
+    )
     expect_true(!all(c("TCGA-3C-AALK","TCGA-A2-A04Q","TCGA-A4-A04Q") %in% query$results[[1]]$cases))
 })
 
@@ -119,13 +127,19 @@ test_that("GDCquery can filter by access level", {
     skip_on_bioc()
     skip_if_offline()
 
-    query <- GDCquery(project = "TCGA-KIRP",
-                      data.category = "Simple Nucleotide Variation",
-                      access = "open")
+    query <- GDCquery(
+        project = "TCGA-KIRP",
+        data.category = "Simple Nucleotide Variation",
+        access = "open"
+    )
     expect_equal(unique(query$results[[1]]$access),"open")
-    query <- GDCquery(project = "TCGA-KIRP",
-                      data.category = "Simple Nucleotide Variation",
-                      access = "controlled")
+
+    query <- GDCquery(
+        project = "TCGA-KIRP",
+        data.category = "Simple Nucleotide Variation",
+        data.type = "Raw Simple Somatic Mutation",
+        access = "controlled"
+    )
     expect_equal(unique(query$results[[1]]$access),"controlled")
 })
 
@@ -133,7 +147,7 @@ test_that("getNbFiles and getNbCases works", {
     skip_on_bioc()
     skip_if_offline()
 
-    aux <- getProjectSummary("TCGA-LUAD",TRUE)
+    aux <- getProjectSummary(project = "TCGA-LUAD")
     files <- getNbFiles("TCGA-LUAD","Raw microarray data")
     cases <- getNbCases("TCGA-LUAD","Raw microarray data")
     expect_true(cases < files)
diff --git a/vignettes/clinical.Rmd b/vignettes/clinical.Rmd
index 33edd3033..6ab770fd1 100644
--- a/vignettes/clinical.Rmd
+++ b/vignettes/clinical.Rmd
@@ -74,17 +74,6 @@ query <- GDCquery(
 GDCdownload(query)
 clinical.BCRtab.all <- GDCprepare(query)
 names(clinical.BCRtab.all)
-
-query <- GDCquery(
-    project = "TCGA-ACC", 
-    data.category = "Clinical",
-    data.type = "Clinical Supplement", 
-    data.format = "BCR Biotab",
-    file.type = "radiation"
-)
-GDCdownload(query)
-clinical.BCRtab.radiation <- GDCprepare(query)
-
 ```
 
 ```{r  echo=TRUE, message=FALSE, warning=FALSE}
@@ -219,7 +208,7 @@ Below are several examples fetching clinical data directly from the clinical XML
 query <- GDCquery(
     project = "TCGA-COAD", 
     data.category = "Clinical", 
-    file.type = "xml", 
+    data.format = "bcr xml",
     barcode = c("TCGA-RU-A8FL","TCGA-AA-3972")
 )
 GDCdownload(query)
@@ -376,7 +365,7 @@ getclinical <- function(proj){
     message(proj)
     while(1){
         result = tryCatch({
-            query <- GDCquery(project = proj, data.category = "Clinical",file.type = "xml")
+            query <- GDCquery(project = proj, data.category = "Clinical",data.format = "bcr xml")
             GDCdownload(query)
             clinical <- GDCprepare_clinic(query, clinical.info = "patient")
             for(i in c("admin","radiation","follow_up","drug","new_tumor_event")){