Fixed bug for trueA and updated vignette

CompEpigen · Dec 5, 2018 · cd867fd · cd867fd
1 parent 38c98f4
commit cd867fd
Show file tree

Hide file tree

Showing 8 changed files with 487 additions and 194 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -16,4 +16,4 @@ Depends:
 	R.utils
 VignetteBuilder: knitr    
 License: GPL-3
-RoxygenNote: 6.1.0
+RoxygenNote: 6.1.1
diff --git a/R/data_preparation.R b/R/data_preparation.R
@@ -176,6 +176,7 @@ prepare_data<-function(
 		if(!is.na(TRUE_A_TOKEN)){
 
 			trueA<-t(na.omit(pd[subs,grep(TRUE_A_TOKEN, colnames(pd))]))
+			trueA <- apply(trueA,c(1,2),as.numeric)
 			rownames(trueA)<-gsub(TRUE_A_TOKEN, "", colnames(pd)[grep(TRUE_A_TOKEN, colnames(pd))])
 
 			save(trueA, file=sprintf("%s/trueA.RData", OUTPUTDIR))
@@ -184,10 +185,11 @@ prepare_data<-function(
 
 		if(!is.na(HOUSEMAN_A_TOKEN)){
 
-			Ahouseman2012<-t(na.omit(pd[,grep(HOUSEMAN_A_TOKEN, colnames(pd))]))
-			rownames(Ahouseman2012)<-NULL
+			trueA<-t(na.omit(pd[,grep(HOUSEMAN_A_TOKEN, colnames(pd))]))
+			trueA <- apply(trueA,c(1,2),as.numeric)
+			rownames(trueA)<-gsub(HOUSEMAN_A_TOKEN, "", colnames(pd)[grep(HOUSEMAN_A_TOKEN, colnames(pd))])
 
-			save(Ahouseman2012, file=sprintf("%s/Ahouseman2012.RData", OUTPUTDIR))
+			save(trueA, file=sprintf("%s/trueA.RData", OUTPUTDIR))
 
 		}else if(ESTIMATE_HOUSEMAN_PROP){
 
@@ -199,12 +201,12 @@ prepare_data<-function(
 			print("Estimating proportions using the Houseman et al, 2012 method")
 			res<-estimateProportionsCP(rnb.set, REF_CT_COLUMN, NA, 2000, full.output = TRUE)
 
-			Ahouseman2012<-t(res$contributions.nonneg)
-			Ahouseman2012[Ahouseman2012<1e-5]<-0
+			trueA<-t(res$contributions.nonneg)
+			trueA[trueA<1e-5]<-0
 
-			Ahouseman2012<-sweep(Ahouseman2012, 2, colSums(Ahouseman2012),"/")
+			trueA<-sweep(trueA, 2, colSums(trueA),"/")
 
-			save(Ahouseman2012, file=sprintf("%s/Ahouseman2012.RData", OUTPUTDIR))
+			save(trueA, file=sprintf("%s/trueA.RData", OUTPUTDIR))
 		}
 	}
 
@@ -587,8 +589,12 @@ filter.annotation.biseq<-function(
 #' @param REF_CT_COLUMN Column name in \code{RNB_SET} used to extract methylation information on the reference cell types.
 #' @param PHENO_COLUMNS Vector of column names in the phenotypic table of \code{RNB_SET} that is kept and exported for further 
 #'                 exploration.
+#' @param PREPARE_TRUE_PROPORTIONS Flag indicating if true proportions are either available in \code{RNB_SET} or to be estimated 
+#'                          with Houseman's reference-based deconvolution approach.
 #' @param TRUE_A_TOKEN String present in the column names of \code{RNB_SET} used for selecting the true proportions of the corresponding
 #'                      cell types.
+#' @param HOUSEMAN_A_TOKEN Similar to \code{TRUE_A_TOKEN}, but not containing the true proportions, rather the estimated proportions
+#'                      by Houseman's method.
 #' @param ID_COLUMN Sample-specific ID column name in \code{RNB_SET}
 #' @param FILTER_COVERAGE Flag indicating, if site-filtering based on coverage is to be conducted.
 #' @param MIN_COVERAGE Minimum number of reads required in each sample for the site to be considered for adding to MeDeCom.
@@ -612,7 +618,9 @@ prepare_data_BS <- function(
 		SAMPLE_SELECTION_GREP=NA,
 		REF_CT_COLUMN=NA,
 		PHENO_COLUMNS=NA,
+		PREPARE_TRUE_PROPORTIONS=FALSE,
 		TRUE_A_TOKEN=NA,
+		HOUSEMAN_A_TOKEN=NA,
 		ID_COLUMN=rnb.getOption("identifiers.column"),
 		FILTER_COVERAGE = hasCovg(RNB_SET),
 		MIN_COVERAGE=5,
@@ -659,13 +667,26 @@ prepare_data_BS <- function(
 		sample_ids<-pd[,ID_COLUMN]
 		saveRDS(sample_ids, file=sprintf("%s/sample_ids.RDS", OUTPUTDIR))	
 	}
-	if(!is.na(TRUE_A_TOKEN)){
-
-	  trueA<-t(na.omit(pd[subs,grep(TRUE_A_TOKEN, colnames(pd))]))
-	  rownames(trueA)<-gsub(TRUE_A_TOKEN, "", colnames(pd)[grep(TRUE_A_TOKEN, colnames(pd))])
-
-	  save(trueA, file=sprintf("%s/trueA.RData", OUTPUTDIR))
+	if(PREPARE_TRUE_PROPORTIONS){
+	  if(!is.na(TRUE_A_TOKEN)){
+
+	    trueA<-t(na.omit(pd[subs,grep(TRUE_A_TOKEN, colnames(pd))]))
+	    trueA <- apply(trueA,c(1,2),as.numeric)
+	    rownames(trueA)<-gsub(TRUE_A_TOKEN, "", colnames(pd)[grep(TRUE_A_TOKEN, colnames(pd))])
+
+	    save(trueA, file=sprintf("%s/trueA.RData", OUTPUTDIR))
+
+	  }
 
+	  if(!is.na(HOUSEMAN_A_TOKEN)){
+
+	    trueA<-t(na.omit(pd[,grep(HOUSEMAN_A_TOKEN, colnames(pd))]))
+	    trueA <- apply(trueA,c(1,2),as.numeric)
+	    rownames(trueA)<-gsub(HOUSEMAN_A_TOKEN, "", colnames(pd)[grep(HOUSEMAN_A_TOKEN, colnames(pd))])
+
+	    save(trueA, file=sprintf("%s/trueA.RData", OUTPUTDIR))
+
+	  }
 	}
 	if(!is.na(REF_CT_COLUMN)){
 	  ct<-pd[[REF_CT_COLUMN]]

diff --git a/R/start_analysis.R b/R/start_analysis.R
@@ -357,8 +357,11 @@ start_medecom_analysis<-function(
 	    ref.meth <- trueT
 	    save(ref.meth,file=file.path(store.path,"ref_meth.RData"))
 	  }
+	  if(!is.null(trueA)){
+	    ref.props <- trueA
+	    save(ref.props,file=file.path(store.path,"ref_props.RData"))
+	  }
 	}
-
 	return(result)
 }
 
@@ -544,7 +547,9 @@ start_decomp_pipeline <- function(rnb.set,
                                 SAMPLE_SELECTION_GREP = sample.selection.grep,
                                 REF_CT_COLUMN=ref.ct.column,
                                 PHENO_COLUMNS=pheno.cols,
+                                PREPARE_TRUE_PROPORTIONS=prepare.true.proportions,
                                 TRUE_A_TOKEN=true.A.token,
+                                HOUSEMAN_A_TOKEN=houseman.A.token,
                                 ID_COLUMN=id.column,
                                 FILTER_COVERAGE = filter.coverage,
                                 MIN_COVERAGE=min.coverage,

diff --git a/README.md b/README.md
@@ -2,7 +2,7 @@
 Large automated pipeline for running MeDeCom 
 
 # Using Decomp
-*DecompPipeline* includes three major steps, all of them are extensively documented. A more detailed introduction into *DecompPipeline* can be found in the package vignette (https://github.com/lutsik/DecompPipeline/blob/master/vignettes/DecompPipeline.Rmd).
+*DecompPipeline* includes three major steps, all of them are extensively documented. A more detailed introduction into *DecompPipeline* can be found in the package vignette (https://github.com/lutsik/DecompPipeline/blob/master/vignettes/DecompPipeline.md).
 
 ## CpG filtering
 There are dedicated preprocessing steps for both array-based data sets (```prepare_data```) and sequencing-based data sets (```prepare_data_BS```).

diff --git a/man/prepare_data_BS.Rd b/man/prepare_data_BS.Rd
diff --git a/vignettes/DecompPipeline.Rmd b/vignettes/DecompPipeline.Rmd
@@ -1,18 +1,17 @@
 ---
-title: "DecompPipeline: Preprocessing of DNA Methylation data for MeDeCom"
+title: 'DecompPipeline: Preprocessing of DNA Methylation data for MeDeCom'
 author: "Michael Scherer, Pavlo Lutsik"
-date: "`r Sys.Date()`"
+date: '`r Sys.Date()`'
 output:
-  rmarkdown::html_document:
-    mathjax: default
-    toc: true
-    number_sections: false
-    fig_width: 5
+  html_document:
     fig_height: 5
-vignette: >
-  %\VignetteIndexEntry{MeDeCom}
-  %\VignetteEngine{knitr::rmarkdown}
-  \usepackage[utf8]{inputenc}
+    fig_width: 5
+    keep_md: yes
+    mathjax: default
+    number_sections: no
+    toc: yes
+  pdf_document:
+    toc: yes
 bibliography: biblio.bib
 ---
 
@@ -66,7 +65,7 @@ names(data.prep)
 
 For bisulfite sequencing data sets, different filtering criteria apply. First, a absolute coverage threshold can be specified with ```MIN_COVERAGE``` to remove all sites with lower coverage. Similar to array-based data sets, upper and lower quantile of coverage can be omitted using ```MIN_COVG_QUANT``` and ```MAX_COVG_QUANT```. In complete accordance with array-based data sets, sites having missing values, located at annotated SNPs and on sex chromosomes can be removed.
 
-```{r}
+```{r, eval=T}
 rnb.set <- load.rnb.set(system.file("extdata/small_rnbSet.zip",package="DecompPipeline"))
 data.prep.bs <- prepare_data_BS(RNB_SET = rnb.set,
                                 MIN_COVERAGE = 5,
@@ -96,7 +95,7 @@ Since performing MeDeCom on complete 450k/EPIC or BS datasets is still computati
 
 For most of the options (except for **houseman2012**, **jaffe2014**, and **range**) the number of selected sites can be specified using the parameter ```N_MARKERS```. In contrast to CpG filtering, subset selection is independent of the data type (array-based and BS). The function returns a list, with each entry containing row indices of the selected sites:
 
-```{r}
+```{r, eval=T}
 cg_subsets <- prepare_CG_subsets(rnb.set=data.prep$rnb.set.filtered,
                                  MARKER_SELECTION = c("houseman2012","var"),
                                  N_MARKERS = 1000
@@ -108,7 +107,7 @@ lengths(cg_subsets)
 
 After these preprocessing steps, you are ready to perfom the actual MeDeCom analysis using the ```start_medecom_analysis``` function. To store output in a format that is later on readable by FactorViz, you need to set the flag ```factorviz.outputs```. Further parameters are described in detail in the reference manual.
 
-```{r}
+```{r, eval=F}
 md.res <- start_medecom_analysis(rnb.set=data.prep$rnb.set.filtered,
                                  cg_groups = cg_subsets,
                                  Ks=2:5,
@@ -120,7 +119,7 @@ md.res <- start_medecom_analysis(rnb.set=data.prep$rnb.set.filtered,
 
 You can also peform all the steps above, by just calling a single function:
 
-```{r}
+```{r, eval=F}
 md.res <- start_decomp_pipeline(rnb.set=rnb.set,
                                 Ks=2:5,
                                 lambda.grid = c(0.01,0.001),