Skip to content

Commit

Permalink
Fixed bug for trueA and updated vignette
Browse files Browse the repository at this point in the history
  • Loading branch information
schmic05 committed Dec 5, 2018
1 parent 38c98f4 commit cd867fd
Show file tree
Hide file tree
Showing 8 changed files with 487 additions and 194 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -16,4 +16,4 @@ Depends:
R.utils
VignetteBuilder: knitr
License: GPL-3
RoxygenNote: 6.1.0
RoxygenNote: 6.1.1
47 changes: 34 additions & 13 deletions R/data_preparation.R
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,7 @@ prepare_data<-function(
if(!is.na(TRUE_A_TOKEN)){

trueA<-t(na.omit(pd[subs,grep(TRUE_A_TOKEN, colnames(pd))]))
trueA <- apply(trueA,c(1,2),as.numeric)
rownames(trueA)<-gsub(TRUE_A_TOKEN, "", colnames(pd)[grep(TRUE_A_TOKEN, colnames(pd))])

save(trueA, file=sprintf("%s/trueA.RData", OUTPUTDIR))
Expand All @@ -184,10 +185,11 @@ prepare_data<-function(

if(!is.na(HOUSEMAN_A_TOKEN)){

Ahouseman2012<-t(na.omit(pd[,grep(HOUSEMAN_A_TOKEN, colnames(pd))]))
rownames(Ahouseman2012)<-NULL
trueA<-t(na.omit(pd[,grep(HOUSEMAN_A_TOKEN, colnames(pd))]))
trueA <- apply(trueA,c(1,2),as.numeric)
rownames(trueA)<-gsub(HOUSEMAN_A_TOKEN, "", colnames(pd)[grep(HOUSEMAN_A_TOKEN, colnames(pd))])

save(Ahouseman2012, file=sprintf("%s/Ahouseman2012.RData", OUTPUTDIR))
save(trueA, file=sprintf("%s/trueA.RData", OUTPUTDIR))

}else if(ESTIMATE_HOUSEMAN_PROP){

Expand All @@ -199,12 +201,12 @@ prepare_data<-function(
print("Estimating proportions using the Houseman et al, 2012 method")
res<-estimateProportionsCP(rnb.set, REF_CT_COLUMN, NA, 2000, full.output = TRUE)

Ahouseman2012<-t(res$contributions.nonneg)
Ahouseman2012[Ahouseman2012<1e-5]<-0
trueA<-t(res$contributions.nonneg)
trueA[trueA<1e-5]<-0

Ahouseman2012<-sweep(Ahouseman2012, 2, colSums(Ahouseman2012),"/")
trueA<-sweep(trueA, 2, colSums(trueA),"/")

save(Ahouseman2012, file=sprintf("%s/Ahouseman2012.RData", OUTPUTDIR))
save(trueA, file=sprintf("%s/trueA.RData", OUTPUTDIR))
}
}

Expand Down Expand Up @@ -587,8 +589,12 @@ filter.annotation.biseq<-function(
#' @param REF_CT_COLUMN Column name in \code{RNB_SET} used to extract methylation information on the reference cell types.
#' @param PHENO_COLUMNS Vector of column names in the phenotypic table of \code{RNB_SET} that is kept and exported for further
#' exploration.
#' @param PREPARE_TRUE_PROPORTIONS Flag indicating if true proportions are either available in \code{RNB_SET} or to be estimated
#' with Houseman's reference-based deconvolution approach.
#' @param TRUE_A_TOKEN String present in the column names of \code{RNB_SET} used for selecting the true proportions of the corresponding
#' cell types.
#' @param HOUSEMAN_A_TOKEN Similar to \code{TRUE_A_TOKEN}, but not containing the true proportions, rather the estimated proportions
#' by Houseman's method.
#' @param ID_COLUMN Sample-specific ID column name in \code{RNB_SET}
#' @param FILTER_COVERAGE Flag indicating, if site-filtering based on coverage is to be conducted.
#' @param MIN_COVERAGE Minimum number of reads required in each sample for the site to be considered for adding to MeDeCom.
Expand All @@ -612,7 +618,9 @@ prepare_data_BS <- function(
SAMPLE_SELECTION_GREP=NA,
REF_CT_COLUMN=NA,
PHENO_COLUMNS=NA,
PREPARE_TRUE_PROPORTIONS=FALSE,
TRUE_A_TOKEN=NA,
HOUSEMAN_A_TOKEN=NA,
ID_COLUMN=rnb.getOption("identifiers.column"),
FILTER_COVERAGE = hasCovg(RNB_SET),
MIN_COVERAGE=5,
Expand Down Expand Up @@ -659,13 +667,26 @@ prepare_data_BS <- function(
sample_ids<-pd[,ID_COLUMN]
saveRDS(sample_ids, file=sprintf("%s/sample_ids.RDS", OUTPUTDIR))
}
if(!is.na(TRUE_A_TOKEN)){

trueA<-t(na.omit(pd[subs,grep(TRUE_A_TOKEN, colnames(pd))]))
rownames(trueA)<-gsub(TRUE_A_TOKEN, "", colnames(pd)[grep(TRUE_A_TOKEN, colnames(pd))])

save(trueA, file=sprintf("%s/trueA.RData", OUTPUTDIR))
if(PREPARE_TRUE_PROPORTIONS){
if(!is.na(TRUE_A_TOKEN)){

trueA<-t(na.omit(pd[subs,grep(TRUE_A_TOKEN, colnames(pd))]))
trueA <- apply(trueA,c(1,2),as.numeric)
rownames(trueA)<-gsub(TRUE_A_TOKEN, "", colnames(pd)[grep(TRUE_A_TOKEN, colnames(pd))])

save(trueA, file=sprintf("%s/trueA.RData", OUTPUTDIR))

}

if(!is.na(HOUSEMAN_A_TOKEN)){

trueA<-t(na.omit(pd[,grep(HOUSEMAN_A_TOKEN, colnames(pd))]))
trueA <- apply(trueA,c(1,2),as.numeric)
rownames(trueA)<-gsub(HOUSEMAN_A_TOKEN, "", colnames(pd)[grep(HOUSEMAN_A_TOKEN, colnames(pd))])

save(trueA, file=sprintf("%s/trueA.RData", OUTPUTDIR))

}
}
if(!is.na(REF_CT_COLUMN)){
ct<-pd[[REF_CT_COLUMN]]
Expand Down
7 changes: 6 additions & 1 deletion R/start_analysis.R
Original file line number Diff line number Diff line change
Expand Up @@ -357,8 +357,11 @@ start_medecom_analysis<-function(
ref.meth <- trueT
save(ref.meth,file=file.path(store.path,"ref_meth.RData"))
}
if(!is.null(trueA)){
ref.props <- trueA
save(ref.props,file=file.path(store.path,"ref_props.RData"))
}
}

return(result)
}

Expand Down Expand Up @@ -544,7 +547,9 @@ start_decomp_pipeline <- function(rnb.set,
SAMPLE_SELECTION_GREP = sample.selection.grep,
REF_CT_COLUMN=ref.ct.column,
PHENO_COLUMNS=pheno.cols,
PREPARE_TRUE_PROPORTIONS=prepare.true.proportions,
TRUE_A_TOKEN=true.A.token,
HOUSEMAN_A_TOKEN=houseman.A.token,
ID_COLUMN=id.column,
FILTER_COVERAGE = filter.coverage,
MIN_COVERAGE=min.coverage,
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
Large automated pipeline for running MeDeCom

# Using Decomp
*DecompPipeline* includes three major steps, all of them are extensively documented. A more detailed introduction into *DecompPipeline* can be found in the package vignette (https://github.com/lutsik/DecompPipeline/blob/master/vignettes/DecompPipeline.Rmd).
*DecompPipeline* includes three major steps, all of them are extensively documented. A more detailed introduction into *DecompPipeline* can be found in the package vignette (https://github.com/lutsik/DecompPipeline/blob/master/vignettes/DecompPipeline.md).

## CpG filtering
There are dedicated preprocessing steps for both array-based data sets (```prepare_data```) and sequencing-based data sets (```prepare_data_BS```).
Expand Down
10 changes: 9 additions & 1 deletion man/prepare_data_BS.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

29 changes: 14 additions & 15 deletions vignettes/DecompPipeline.Rmd
Original file line number Diff line number Diff line change
@@ -1,18 +1,17 @@
---
title: "DecompPipeline: Preprocessing of DNA Methylation data for MeDeCom"
title: 'DecompPipeline: Preprocessing of DNA Methylation data for MeDeCom'
author: "Michael Scherer, Pavlo Lutsik"
date: "`r Sys.Date()`"
date: '`r Sys.Date()`'
output:
rmarkdown::html_document:
mathjax: default
toc: true
number_sections: false
fig_width: 5
html_document:
fig_height: 5
vignette: >
%\VignetteIndexEntry{MeDeCom}
%\VignetteEngine{knitr::rmarkdown}
\usepackage[utf8]{inputenc}
fig_width: 5
keep_md: yes
mathjax: default
number_sections: no
toc: yes
pdf_document:
toc: yes
bibliography: biblio.bib
---

Expand Down Expand Up @@ -66,7 +65,7 @@ names(data.prep)

For bisulfite sequencing data sets, different filtering criteria apply. First, a absolute coverage threshold can be specified with ```MIN_COVERAGE``` to remove all sites with lower coverage. Similar to array-based data sets, upper and lower quantile of coverage can be omitted using ```MIN_COVG_QUANT``` and ```MAX_COVG_QUANT```. In complete accordance with array-based data sets, sites having missing values, located at annotated SNPs and on sex chromosomes can be removed.

```{r}
```{r, eval=T}
rnb.set <- load.rnb.set(system.file("extdata/small_rnbSet.zip",package="DecompPipeline"))
data.prep.bs <- prepare_data_BS(RNB_SET = rnb.set,
MIN_COVERAGE = 5,
Expand Down Expand Up @@ -96,7 +95,7 @@ Since performing MeDeCom on complete 450k/EPIC or BS datasets is still computati

For most of the options (except for **houseman2012**, **jaffe2014**, and **range**) the number of selected sites can be specified using the parameter ```N_MARKERS```. In contrast to CpG filtering, subset selection is independent of the data type (array-based and BS). The function returns a list, with each entry containing row indices of the selected sites:

```{r}
```{r, eval=T}
cg_subsets <- prepare_CG_subsets(rnb.set=data.prep$rnb.set.filtered,
MARKER_SELECTION = c("houseman2012","var"),
N_MARKERS = 1000
Expand All @@ -108,7 +107,7 @@ lengths(cg_subsets)

After these preprocessing steps, you are ready to perfom the actual MeDeCom analysis using the ```start_medecom_analysis``` function. To store output in a format that is later on readable by FactorViz, you need to set the flag ```factorviz.outputs```. Further parameters are described in detail in the reference manual.

```{r}
```{r, eval=F}
md.res <- start_medecom_analysis(rnb.set=data.prep$rnb.set.filtered,
cg_groups = cg_subsets,
Ks=2:5,
Expand All @@ -120,7 +119,7 @@ md.res <- start_medecom_analysis(rnb.set=data.prep$rnb.set.filtered,

You can also peform all the steps above, by just calling a single function:

```{r}
```{r, eval=F}
md.res <- start_decomp_pipeline(rnb.set=rnb.set,
Ks=2:5,
lambda.grid = c(0.01,0.001),
Expand Down
Loading

0 comments on commit cd867fd

Please sign in to comment.