diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 9693bf4..c2a3d98 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -4,7 +4,7 @@ exclude: | (?x)( ^assets/| ^docs/.*.html| - ^data-raw/*.txt| + ^inst/extdata| ^man/ ) repos: diff --git a/DESCRIPTION b/DESCRIPTION index 40c34ff..97a201c 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -16,9 +16,9 @@ BugReports: https://github.com/CCBR/reneeTools/issues Depends: R (>= 2.10) Imports: - assertthat, DESeq2, dplyr, + S7, tidyr Suggests: readr, diff --git a/NAMESPACE b/NAMESPACE index 024a199..fedbab0 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,7 +1,11 @@ # Generated by roxygen2: do not edit by hand export("%>%") -export(create_deseq_obj) +export(counts_dat_to_matrix) export(filter_low_counts) -export(read_raw_counts) +export(meta_tbl_to_dat) +export(reneeDataSetFromDataFrames) +export(reneeDataSetFromFiles) +export(run_deseq2) +if (getRversion() < "4.3.0") importFrom("S7", "@") importFrom(dplyr,"%>%") diff --git a/NEWS.md b/NEWS.md index cd5e27e..fc8ce76 100644 --- a/NEWS.md +++ b/NEWS.md @@ -2,7 +2,10 @@ - Create a `NEWS.md` file to track changes to the package. -## New functions +## Main functions & classes +- `reneeDataSet` (#16) + - `reneeDataSetFromFiles()` (#16) + - `reneeDataSetFromDataFrames()` (#16) + - `run_deseq2()` (#16) - `filter_low_counts()` (#10) -- `create_deseq_obj()` (#13) diff --git a/R/counts.R b/R/counts.R new file mode 100644 index 0000000..8bb3a9b --- /dev/null +++ b/R/counts.R @@ -0,0 +1,25 @@ +#' Convert a data frame of gene counts to a matrix +#' +#' @param counts_tbl expected gene counts from RSEM as a data frame or tibble. +#' +#' @return matrix of gene counts with rows as gene IDs +#' @export +#' +#' @examples +#' counts_dat_to_matrix(head(gene_counts)) +counts_dat_to_matrix <- function(counts_tbl) { + gene_id <- GeneName <- NULL + counts_dat <- counts_tbl %>% + # deseq2 requires integer counts + dplyr::mutate(dplyr::across( + dplyr::where(is.numeric), + \(x) as.integer(round(x, 0)) + )) %>% + as.data.frame() + row.names(counts_dat) <- counts_dat %>% dplyr::pull("gene_id") + # convert counts tibble to matrix + counts_mat <- counts_dat %>% + dplyr::select(-c(gene_id, GeneName)) %>% + as.matrix() + return(counts_mat) +} diff --git a/R/deseq2.R b/R/deseq2.R index c6459e8..5a97575 100644 --- a/R/deseq2.R +++ b/R/deseq2.R @@ -1,37 +1,29 @@ -#' Create DESeq2 object from gene counts and sample metadata +#' Run DESeq2 on a reneeDataSet #' -#' @param counts_tbl expected gene counts from RSEM as a data frame or tibble. -#' @param meta_dat sample metadata as a data frame with rownames as sample IDs. -#' @param design model formula for experimental design. Columns must exist in `meta_dat`. +#' @param renee_ds reneeDataSet object +#' @param design model formula for experimental design. Columns must exist in `meta_dat`. +#' @param ... remaining variables are forwarded to `DESeq2::DESeq()`. #' -#' @return DESeqDataSet +#' @return reneeDataSet object with DESeq2 slot filled #' @export #' #' @examples -#' sample_meta <- data.frame( -#' row.names = c("KO_S3", "KO_S4", "WT_S1", "WT_S2"), -#' condition = factor(c("knockout", "knockout", "wildtype", "wildtype"), -#' levels = c("wildtype", "knockout") +#' rds <- reneeDataSetFromFiles( +#' system.file("extdata", +#' "RSEM.genes.expected_count.all_samples.txt", +#' package = "reneeTools" +#' ), +#' system.file("extdata", "sample_metadata.tsv", +#' package = "reneeTools" #' ) #' ) -#' dds <- create_deseq_obj(gene_counts, sample_meta, ~condition) -create_deseq_obj <- function(counts_tbl, meta_dat, design) { - gene_id <- GeneName <- NULL - counts_dat <- counts_tbl %>% - # deseq2 requires integer counts - dplyr::mutate(dplyr::across( - dplyr::where(is.numeric), - \(x) as.integer(round(x, 0)) - )) %>% - as.data.frame() - row.names(counts_dat) <- counts_dat %>% dplyr::pull("gene_id") - # convert counts tibble to matrix - counts_mat <- counts_dat %>% - dplyr::select(-c(gene_id, GeneName)) %>% - as.matrix() - - # sample IDs must be in the same order - assertthat::are_equal(colnames(counts_mat), rownames(meta_dat)) - - return(DESeq2::DESeqDataSetFromMatrix(counts_mat, meta_dat, design)) +#' run_deseq2(rds, ~condition) +run_deseq2 <- function(renee_ds, design, ...) { + dds <- DESeq2::DESeqDataSetFromMatrix( + renee_ds@counts, + renee_ds@sample_meta, + design + ) + renee_ds@analyses$deseq2_ds <- DESeq2::DESeq(dds, ...) + return(renee_ds) } diff --git a/R/metadata.R b/R/metadata.R new file mode 100644 index 0000000..730edad --- /dev/null +++ b/R/metadata.R @@ -0,0 +1,22 @@ +#' Convert sample metadata from a tibble to a dataframe with sample IDs as row names +#' +#' @param meta_tbl tibble with `sample_id` column +#' +#' @return dataframe where row names are the sample IDs +#' @export +#' +#' @examples +#' sample_meta_tbl <- readr::read_tsv(system.file("extdata", +#' "sample_metadata.tsv", +#' package = "reneeTools" +#' )) +#' head(sample_meta_tbl) +#' meta_tbl_to_dat(sample_meta_tbl) +meta_tbl_to_dat <- function(meta_tbl) { + sample_id <- NULL + meta_dat <- meta_tbl %>% + as.data.frame() %>% + dplyr::select(-sample_id) + rownames(meta_dat) <- meta_tbl %>% dplyr::pull(sample_id) + return(meta_dat) +} diff --git a/R/renee-class.R b/R/renee-class.R new file mode 100644 index 0000000..f908a68 --- /dev/null +++ b/R/renee-class.R @@ -0,0 +1,64 @@ +reneeDataSet <- S7::new_class("renee", + properties = list( + counts = S7::new_S3_class("matrix"), + sample_meta = S7::class_data.frame, + analyses = S7::class_list + ), + constructor = function(count_matrix, sample_meta_dat) { + S7::new_object(S7::S7_object(), + counts = count_matrix, + sample_meta = sample_meta_dat, + analyses = list() + ) + } +) + +#' Construct a reneeDataSet object from tsv files. +#' +#' @param gene_counts_filepath path to tsv file of expected gene counts from RSEM. +#' @param sample_meta_filepath path to tsv file with sample IDs and metadata for differential analysis. +#' +#' @return reneeDataSet object +#' @export +#' +#' @examples +#' reneeDataSetFromFiles( +#' system.file("extdata", "RSEM.genes.expected_count.all_samples.txt", package = "reneeTools"), +#' system.file("extdata", "sample_metadata.tsv", package = "reneeTools") +#' ) +reneeDataSetFromFiles <- function(gene_counts_filepath, sample_meta_filepath) { + count_dat <- readr::read_tsv(gene_counts_filepath) + sample_meta_dat <- readr::read_tsv(sample_meta_filepath) + return(reneeDataSetFromDataFrames(count_dat, sample_meta_dat)) +} + +#' Construct a reneeDataSet object from data frames +#' +#' @param gene_counts_dat expected gene counts from RSEM as a data frame or tibble. +#' Must contain a `gene_id` column and a column for each sample ID in the metadata. +#' @param sample_meta_dat sample metadata as a data frame or tibble. +#' Must contain a `sample_ID` column. +#' +#' @return reneeDataSet object +#' @export +#' +#' @examples +#' sample_meta <- data.frame( +#' sample_id = c("KO_S3", "KO_S4", "WT_S1", "WT_S2"), +#' condition = factor( +#' c("knockout", "knockout", "wildtype", "wildtype"), +#' levels = c("wildtype", "knockout") +#' ) +#' ) +#' reneeDataSetFromDataFrames(gene_counts, sample_meta) +reneeDataSetFromDataFrames <- function(gene_counts_dat, sample_meta_dat) { + count_mat <- gene_counts_dat %>% counts_dat_to_matrix() + sample_meta_dat <- sample_meta_dat %>% meta_tbl_to_dat() + + # sample IDs must be in the same order + if (!all(colnames(count_mat) == rownames(sample_meta_dat))) { + stop("Not all columns in the count matrix equal the rows in the sample metadata. Sample IDs must be in the same order.") + } + + return(reneeDataSet(count_mat, sample_meta_dat)) +} diff --git a/R/zzz.R b/R/zzz.R new file mode 100644 index 0000000..3a6f2a2 --- /dev/null +++ b/R/zzz.R @@ -0,0 +1,9 @@ +# source: https://rconsortium.github.io/S7/articles/packages.html#method-registration +.onLoad <- function(...) { + S7::methods_register() +} + +# enable usage of @name in package code +# source: https://rconsortium.github.io/S7/articles/packages.html#backward-compatibility +#' @rawNamespace if (getRversion() < "4.3.0") importFrom("S7", "@") +NULL diff --git a/data-raw/gene_counts.R b/data-raw/gene_counts.R index 6b68dc8..36d2942 100644 --- a/data-raw/gene_counts.R +++ b/data-raw/gene_counts.R @@ -1,3 +1,3 @@ # WT_S1.RSEM.genes.results was generated from running RENEE v2.5.3 on the test dataset https://github.com/CCBR/RENEE/tree/e08f7db6c6e638cfd330caa182f64665d2ef37fa/.tests -gene_counts <- readr::read_tsv("data-raw/RSEM.genes.expected_count.all_samples.txt") +gene_counts <- readr::read_tsv(system.file("inst", "extdata", "RSEM.genes.expected_count.all_samples.txt", package = "reneeTools")) usethis::use_data(gene_counts, overwrite = TRUE) diff --git a/data-raw/RSEM.genes.expected_count.all_samples.txt b/inst/extdata/RSEM.genes.expected_count.all_samples.txt similarity index 100% rename from data-raw/RSEM.genes.expected_count.all_samples.txt rename to inst/extdata/RSEM.genes.expected_count.all_samples.txt diff --git a/inst/extdata/sample_metadata.tsv b/inst/extdata/sample_metadata.tsv new file mode 100644 index 0000000..48f3de4 --- /dev/null +++ b/inst/extdata/sample_metadata.tsv @@ -0,0 +1,5 @@ +sample_id condition +KO_S3 knockout +KO_S4 knockout +WT_S1 wildtype +WT_S2 wildtype diff --git a/man/counts_dat_to_matrix.Rd b/man/counts_dat_to_matrix.Rd new file mode 100644 index 0000000..7f4917a --- /dev/null +++ b/man/counts_dat_to_matrix.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/counts.R +\name{counts_dat_to_matrix} +\alias{counts_dat_to_matrix} +\title{Convert a data frame of gene counts to a matrix} +\usage{ +counts_dat_to_matrix(counts_tbl) +} +\arguments{ +\item{counts_tbl}{expected gene counts from RSEM as a data frame or tibble.} +} +\value{ +matrix of gene counts with rows as gene IDs +} +\description{ +Convert a data frame of gene counts to a matrix +} +\examples{ +counts_dat_to_matrix(head(gene_counts)) +} diff --git a/man/create_deseq_obj.Rd b/man/create_deseq_obj.Rd deleted file mode 100644 index e0b3d83..0000000 --- a/man/create_deseq_obj.Rd +++ /dev/null @@ -1,30 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/deseq2.R -\name{create_deseq_obj} -\alias{create_deseq_obj} -\title{Create DESeq2 object from gene counts and sample metadata} -\usage{ -create_deseq_obj(counts_tbl, meta_dat, design) -} -\arguments{ -\item{counts_tbl}{expected gene counts from RSEM as a data frame or tibble.} - -\item{meta_dat}{sample metadata as a data frame with rownames as sample IDs.} - -\item{design}{model formula for experimental design. Columns must exist in \code{meta_dat}.} -} -\value{ -DESeqDataSet -} -\description{ -Create DESeq2 object from gene counts and sample metadata -} -\examples{ -sample_meta <- data.frame( - row.names = c("KO_S3", "KO_S4", "WT_S1", "WT_S2"), - condition = factor(c("knockout", "knockout", "wildtype", "wildtype"), - levels = c("wildtype", "knockout") - ) -) -dds <- create_deseq_obj(gene_counts, sample_meta, ~condition) -} diff --git a/man/meta_tbl_to_dat.Rd b/man/meta_tbl_to_dat.Rd new file mode 100644 index 0000000..162cbd7 --- /dev/null +++ b/man/meta_tbl_to_dat.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/metadata.R +\name{meta_tbl_to_dat} +\alias{meta_tbl_to_dat} +\title{Convert sample metadata from a tibble to a dataframe with sample IDs as row names} +\usage{ +meta_tbl_to_dat(meta_tbl) +} +\arguments{ +\item{meta_tbl}{tibble with \code{sample_id} column} +} +\value{ +dataframe where row names are the sample IDs +} +\description{ +Convert sample metadata from a tibble to a dataframe with sample IDs as row names +} +\examples{ +sample_meta_tbl <- readr::read_tsv(system.file("extdata", + "sample_metadata.tsv", + package = "reneeTools" +)) +head(sample_meta_tbl) +meta_tbl_to_dat(sample_meta_tbl) +} diff --git a/man/read_raw_counts.Rd b/man/read_raw_counts.Rd deleted file mode 100644 index fb9fb09..0000000 --- a/man/read_raw_counts.Rd +++ /dev/null @@ -1,19 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/read_raw_counts.R -\name{read_raw_counts} -\alias{read_raw_counts} -\title{read_raw_counts} -\usage{ -read_raw_counts(pathtofile, delimiter = "\\t") -} -\arguments{ -\item{pathtofile}{string absolute file path to raw counts matrix} - -\item{delimiter}{string default is tab} -} -\value{ -raw_counts_matrix -} -\description{ -read_raw_counts -} diff --git a/man/reneeDataSetFromDataFrames.Rd b/man/reneeDataSetFromDataFrames.Rd new file mode 100644 index 0000000..f2f959d --- /dev/null +++ b/man/reneeDataSetFromDataFrames.Rd @@ -0,0 +1,31 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/renee-class.R +\name{reneeDataSetFromDataFrames} +\alias{reneeDataSetFromDataFrames} +\title{Construct a reneeDataSet object from data frames} +\usage{ +reneeDataSetFromDataFrames(gene_counts_dat, sample_meta_dat) +} +\arguments{ +\item{gene_counts_dat}{expected gene counts from RSEM as a data frame or tibble. +Must contain a \code{gene_id} column and a column for each sample ID in the metadata.} + +\item{sample_meta_dat}{sample metadata as a data frame or tibble. +Must contain a \code{sample_ID} column.} +} +\value{ +reneeDataSet object +} +\description{ +Construct a reneeDataSet object from data frames +} +\examples{ +sample_meta <- data.frame( + sample_id = c("KO_S3", "KO_S4", "WT_S1", "WT_S2"), + condition = factor( + c("knockout", "knockout", "wildtype", "wildtype"), + levels = c("wildtype", "knockout") + ) +) +reneeDataSetFromDataFrames(gene_counts, sample_meta) +} diff --git a/man/reneeDataSetFromFiles.Rd b/man/reneeDataSetFromFiles.Rd new file mode 100644 index 0000000..5ecffe1 --- /dev/null +++ b/man/reneeDataSetFromFiles.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/renee-class.R +\name{reneeDataSetFromFiles} +\alias{reneeDataSetFromFiles} +\title{Construct a reneeDataSet object from tsv files.} +\usage{ +reneeDataSetFromFiles(gene_counts_filepath, sample_meta_filepath) +} +\arguments{ +\item{gene_counts_filepath}{path to tsv file of expected gene counts from RSEM.} + +\item{sample_meta_filepath}{path to tsv file with sample IDs and metadata for differential analysis.} +} +\value{ +reneeDataSet object +} +\description{ +Construct a reneeDataSet object from tsv files. +} +\examples{ +reneeDataSetFromFiles( + system.file("extdata", "RSEM.genes.expected_count.all_samples.txt", package = "reneeTools"), + system.file("extdata", "sample_metadata.tsv", package = "reneeTools") +) +} diff --git a/man/run_deseq2.Rd b/man/run_deseq2.Rd new file mode 100644 index 0000000..8dd04b8 --- /dev/null +++ b/man/run_deseq2.Rd @@ -0,0 +1,33 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/deseq2.R +\name{run_deseq2} +\alias{run_deseq2} +\title{Run DESeq2 on a reneeDataSet} +\usage{ +run_deseq2(renee_ds, design, ...) +} +\arguments{ +\item{renee_ds}{reneeDataSet object} + +\item{design}{model formula for experimental design. Columns must exist in \code{meta_dat}.} + +\item{...}{remaining variables are forwarded to \code{DESeq2::DESeq()}.} +} +\value{ +reneeDataSet object with DESeq2 slot filled +} +\description{ +Run DESeq2 on a reneeDataSet +} +\examples{ +rds <- reneeDataSetFromFiles( + system.file("extdata", + "RSEM.genes.expected_count.all_samples.txt", + package = "reneeTools" + ), + system.file("extdata", "sample_metadata.tsv", + package = "reneeTools" + ) +) +run_deseq2(rds, ~condition) +} diff --git a/tests/testthat/test-deseq2.R b/tests/testthat/test-deseq2.R index 321bb01..c62e2cb 100644 --- a/tests/testthat/test-deseq2.R +++ b/tests/testthat/test-deseq2.R @@ -1,26 +1,40 @@ set.seed(20231228) -test_that("create_deseq_obj works", { - sample_meta <- - data.frame( - row.names = c("KO_S3", "KO_S4", "WT_S1", "WT_S2"), - condition = factor( - c("knockout", "knockout", "wildtype", "wildtype"), - levels = c("wildtype", "knockout") - ) +test_that("run_deseq2 works", { + renee_ds <- reneeDataSetFromFiles( + system.file( + "extdata", + "RSEM.genes.expected_count.all_samples.txt", + package = "reneeTools" + ), + system.file("extdata", "sample_metadata.tsv", + package = "reneeTools" ) - dds <- create_deseq_obj(gene_counts, sample_meta, ~condition) - + ) + renee_ds@sample_meta <- renee_ds@sample_meta %>% + dplyr::mutate(condition = factor(condition, + levels = c("wildtype", "knockout") + )) + renee_ds <- run_deseq2(renee_ds, design = ~condition, fitType = "local") + dds <- renee_ds@analyses$deseq2_ds expect_equal( dds@colData %>% as.data.frame(), structure( - list(condition = structure( - c(2L, 2L, 1L, 1L), - levels = c( - "wildtype", - "knockout" + list( + condition = structure( + c(2L, 2L, 1L, 1L), + levels = c( + "wildtype", + "knockout" + ), + class = "factor" ), - class = "factor" - )), + sizeFactor = c( + 0.759835685651593, + 0.718608223926169, + 1.24466595457696, + 1.68179283050743 + ) + ), class = "data.frame", row.names = c( "KO_S3", @@ -33,9 +47,31 @@ test_that("create_deseq_obj works", { structure( list( counts.KO_S3 = c(25L, 16L, 19L), - counts.KO_S4 = c(22L, 10L, 26L), + counts.KO_S4 = c( + 22L, + 10L, 26L + ), counts.WT_S1 = c(74L, 0L, 10L), - counts.WT_S2 = c(104L, 0L, 8L) + counts.WT_S2 = c( + 104L, + 0L, 8L + ), + mu.1 = c(24.1516682192168, 13.3092987017581, 23.1890203839711), + mu.2 = c(22.8412375617529, 12.5871575689046, 21.9308214491443), + mu.3 = c(75.6181929866826, 0.158242295472865, 7.74702804399805), + mu.4 = c(102.175314069834, 0.213817014139956, 10.4677854923446), + H.1 = c(0.511844332569779, 0.504138999697003, 0.50668916671393), + H.2 = c(0.48815561528062, 0.495860729372594, 0.493310737076545), + H.3 = c(0.454896663393873, 0.499997728127273, 0.447711231007698), + H.4 = c(0.545103297274594, 0.499997728127273, 0.552288438770429), + cooks.1 = c( + 0.00787282712563543, + 0.0262422886768066, + 0.0478403297046226 + ), + cooks.2 = c(0.00740399240405431, 0.025741204681852, 0.0464156166543046), + cooks.3 = c(0.00250160501520332, 0.127533456822227, 0.0779681548830785), + cooks.4 = c(0.00307311736197857, 0.161328557073165, 0.100411826759898) ), class = "data.frame", row.names = c( diff --git a/tests/testthat/test-renee-class.R b/tests/testthat/test-renee-class.R new file mode 100644 index 0000000..9193620 --- /dev/null +++ b/tests/testthat/test-renee-class.R @@ -0,0 +1,39 @@ +test_that("reneeDataSetFromFiles works", { + rds <- reneeDataSetFromFiles( + system.file("extdata", "RSEM.genes.expected_count.all_samples.txt", package = "reneeTools"), + system.file("extdata", "sample_metadata.tsv", package = "reneeTools") + ) + expect_equal( + rds@counts %>% head(), + structure(c( + 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, + 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L + ), dim = c(6L, 4L), dimnames = list( + c( + "ENSG00000121410.11", "ENSG00000268895.5", "ENSG00000148584.15", + "ENSG00000175899.14", "ENSG00000245105.3", "ENSG00000166535.20" + ), c("KO_S3", "KO_S4", "WT_S1", "WT_S2") + )) + ) + expect_equal( + rds@sample_meta, + structure(list(condition = c( + "knockout", "knockout", "wildtype", + "wildtype" + )), row.names = c("KO_S3", "KO_S4", "WT_S1", "WT_S2"), class = "data.frame") + ) +}) + +test_that("reneeDataSetFromDataFrames detect problems", { + sample_meta <- data.frame( + sample_id = c("KO_S3", "KO_S4", "WT_S1", "WT_S2"), + condition = factor( + c("knockout", "knockout", "wildtype", "wildtype"), + levels = c("wildtype", "knockout") + ) + ) + expect_error( + reneeDataSetFromDataFrames(gene_counts[, 1:4], sample_meta), + "Not all columns" + ) +})