Add unit tests

morrislab · Apr 18, 2019 · 655e17d · 655e17d
1 parent 6190e0d
commit 655e17d
Show file tree

Hide file tree

Showing 8 changed files with 310 additions and 0 deletions.
diff --git a/tests/R/test_apa_id.R b/tests/R/test_apa_id.R
@@ -0,0 +1,17 @@
+context("Test APA_ID generation")
+# Shared fixture (also used by the suffix tests further down): one ID twice in a row, a different ID, then the first ID again.
+x <- c("ENSG00000001", "ENSG00000001", "ENSG00000999", "ENSG00000001")
+# Expected value = input ID + "_" + counter; the 1,2,1,1 case suggests the counter restarts whenever the ID changes -- confirm against apa_id().
+test_that("A number is added to Ensembl ID", {
+    expect_equal(apa_id(x), paste(x, c(1,2,1,1), sep="_"))
+    expect_equal(apa_id(sort(x)), paste(sort(x), c(1:3,1), sep="_"))
+    expect_equal(apa_id(x[3]), paste(x[3], "1", sep="_"))
+})
+
+context("Test APA_ID suffix update")
+# Expected suffixes: "_S" for a lone entry, "_P"/"_D" for a pair (presumably single/proximal/distal; the meaning of the two numeric arguments is not visible here -- confirm against update_apa_id()).
+test_that("A suffix is added for single UTR", {
+    expect_equal(update_apa_id(x[3], 5, 10), paste(x[3], "S", sep="_"))
+    expect_equal(update_apa_id(x[1:2], c(5, 5), c(10, 9)), paste(x[1:2], c("D", "P"), sep="_"))
+    expect_equal(update_apa_id(x[1:2], c(5, 5), c(4, 3)), paste(x[1:2], c("P", "D"), sep="_"))
+})
diff --git a/tests/R/test_format_multi_ensembl_ids.R b/tests/R/test_format_multi_ensembl_ids.R
@@ -0,0 +1,80 @@
+library(stringr)
+context("Test formatting of single-Ensembl IDs")
+# A single underscore-delimited ID (looks like <transcript>_<gene>_<build>_<chr>) with no comma-separated parts.
+id <- "ENSMUST0000011043_ENSMUSG00000111044_mm9_chr1"
+# With nothing to merge, the output must equal the input.
+test_that("A single Ensembl ID remains unchanged", {
+    expect_equal(format_multi_ensembl_ids(id), id)
+})
+# Same expectation, element-wise, for a vector input.
+test_that("A vector of single Ensembl IDs remains unchanged", {
+    expect_equal(format_multi_ensembl_ids(c(id, id)), c(id, id))
+})
+
+context("Test formatting of multi-Ensembl IDs")
+# Input: two comma-separated transcript_gene pairs sharing one gene, followed by _mm9_chr1.
+id <- "ENSMUST00000111043_ENSMUSG00000048482,ENSMUST00000111044_ENSMUSG00000048482_mm9_chr1"
+expected <- "ENSMUST00000111043,ENSMUST00000111044_ENSMUSG00000048482_mm9_chr1"
+# Expected output lists the transcripts first, then the shared gene once.
+test_that("A multi-Ensembl ID is re-formatted", {
+    expect_equal(format_multi_ensembl_ids(id), expected)
+})
+# Same expectation, element-wise, for a vector input.
+test_that("A vector of multi-Ensembl IDs is re-formatted", {
+    expect_equal(format_multi_ensembl_ids(c(id, id)), c(expected, expected))
+})
+
+context("Test vector of mixed (single and multi) Ensembl IDs")
+# Three cases: two pairs sharing a gene, a single pair, and two pairs with different genes.
+id <- c("ENSMUST00000111043_ENSMUSG00000048482,ENSMUST00000111044_ENSMUSG00000048482_hg19_chr1",
+        "ENSMUST00000100011_ENSMUSG00000048481_hg19_chr2",
+        "ENSMUST00000111043_ENSMUSG00000048480,ENSMUST00000111044_ENSMUSG00000048482_hg19_chr1")
+# The single pair must pass through unchanged; when genes differ, both genes are expected comma-joined after the transcripts.
+test_that("A vector of mixed Ensembl IDs", {
+    expect_equal(format_multi_ensembl_ids(id), 
+                 c("ENSMUST00000111043,ENSMUST00000111044_ENSMUSG00000048482_hg19_chr1",
+                   id[2],
+                   "ENSMUST00000111043,ENSMUST00000111044_ENSMUSG00000048480,ENSMUSG00000048482_hg19_chr1")
+    )
+})
+
+
+context("Test non-Ensembl IDs")
+# Transcript and gene names here contain dots and bare digits and carry no ENS prefix.
+test_that("Non-Ensembl transcript ID is accepted", {
+    id <- c("XY.00000027036_000..2_mm10_chr1",
+            "FF.22_1.z,0101_1.z_mm10_chr1",
+            "1_.,2_.,3_._mm10_chr1")
+    expect_equal(format_multi_ensembl_ids(id),
+                 c("XY.00000027036_000..2_mm10_chr1",
+                   "FF.22,0101_1.z_mm10_chr1",
+                   "1,2,3_._mm10_chr1"))
+})
+# A leading "_" in the ID is expected to make format_multi_ensembl_ids() raise an error.
+test_that("Underscore at beginning of ID will fail", {
+    expect_error(format_multi_ensembl_ids("_ENSMUST00000111043_ENSMUSG00000048482,ENSMUST00000111044_ENSMUSG00000048482_hg19_chr1"))
+})
+# Same three ID shapes as the mixed-ID test, but the last field is "1", "2" or "z" instead of "chrN".
+context("Test chromosomes without chr prefix")
+test_that("Chromosome without chr prefix is accepted", {
+    id <- c("ENSMUST00000111043_ENSMUSG00000048482,ENSMUST00000111044_ENSMUSG00000048482_hg19_1",
+            "ENSMUST00000100011_ENSMUSG00000048481_hg19_2",
+            "ENSMUST00000111043_ENSMUSG00000048480,ENSMUST00000111044_ENSMUSG00000048482_hg19_z")
+    expect_equal(format_multi_ensembl_ids(id),
+                 c("ENSMUST00000111043,ENSMUST00000111044_ENSMUSG00000048482_hg19_1",
+                   id[2],
+                   "ENSMUST00000111043,ENSMUST00000111044_ENSMUSG00000048480,ENSMUSG00000048482_hg19_z")
+    )
+})
+# The build field is "unk" here instead of a genome name such as hg19 or mm10.
+context("Test unk species")
+test_that("Non-hg19 and non-mm10 species are allowed as unk", {
+    expect_equal(format_multi_ensembl_ids("1_0,2_0_unk_chr1"),
+                 "1,2_0_unk_chr1")
+})
+# Both the build ("ut") and the chromosome ("z") are non-standard tokens in this case.
+test_that("Complex unknown species and non-standard chr is accepted", {
+    expect_equal(format_multi_ensembl_ids("1_0,2_0_ut_z"),
+                 "1,2_0_ut_z")
+})
+
diff --git a/tests/R/test_get_first_sample_ix.R b/tests/R/test_get_first_sample_ix.R
@@ -0,0 +1,30 @@
+context("Test getting the index of the first sample in merged data")
+# Synthetic merged table: 8 annotation columns (APA_ID..Length) followed by N = 3 sample columns.
+set.seed(123)
+nr <- 10
+N <- 3
+df <- data.frame(APA_ID = letters[1:nr],
+                 Ensembl_Gene = sample(letters, nr),
+                 Gene_Name = rainbow(nr),
+                 Chr = "chr1",
+                 Start = round(runif(nr, 1000, 2000)),
+                 End = round(runif(nr, 3000, 5000)),
+                 Strand = "+",
+                 Length = round(runif(nr, 100, 500)),
+                 SampleA = rnorm(nr),
+                 SampleB = rnorm(nr),
+                 SampleC = rnorm(nr)
+)
+# SampleA is the 9th column, directly after the 8 annotation columns.
+test_that("Gets index of first sample column in data frame", {
+    expected <- 9
+    expect_equal(get_first_sample_ix(df), expected)
+})
+
+
+context("Test getting the number of samples from merged data")
+# The same df is reused; it has exactly N (= 3) Sample* columns.
+test_that("Can calculate number of samples in data frame", {
+    expected <- N
+    expect_equal(get_num_samples(df), expected)
+})
diff --git a/tests/R/test_separate_ensembl_field.R b/tests/R/test_separate_ensembl_field.R
@@ -0,0 +1,55 @@
+context("Test splitting of underscore-delimited input ensembl field")
+# Two rows: one transcript_gene pair, and two comma-separated transcripts of the same gene.
+set.seed(123)
+df <- data.table(Transcript=
+                     c("ENSMUST00000027036_Lypla1_mm10_chr1_4844962_4846739_+_utr_4845016_4846739::chr1:4844962-4846739(+)",
+                       "ENSMUST00000081551_Tcea1,ENSMUST00000165720_Tcea1_mm10_chr1_4896355_4897910_+_utr_4896364_4897910::chr1:4896355-4897910(+)"),
+                 Length=c(100, 1000),
+                 SampleA=runif(2),
+                 SampleB=runif(2),
+                 SampleC=rnorm(2),
+                 stringsAsFactors=FALSE
+)
+# separate_ensembl_field() is called for its side effect (return value unused), so it presumably modifies the data.table by reference -- confirm.
+test_that("'Transcript' field is split into components for 1-row data frame", {
+    exp.df <- df[1,]
+    separate_ensembl_field(exp.df)
+    # expect_equal(ncol(exp.df), 12)
+    # expect_match(exp.df$Transcript[1], "^ENS.*\\d$")
+    expected_cols <- union(c("Transcript", "Gene", "Chr", "LastExon.Start",
+                             "LastExon.End", "Strand", "UTR3.Start", "UTR3.End"),
+                           colnames(df)) 
+    expect_true(all(expected_cols %in% colnames(exp.df)))
+})
+# copy(df) gives the test its own table, so the shared fixture keeps its original columns.
+test_that("'Transcript' field is split into components for multi-line data frame", {
+    exp.df <- copy(df)
+    separate_ensembl_field(exp.df)
+    # expect_equal(ncol(exp.df), 10)
+    expect_is(exp.df$Transcript, "character")
+    expected_cols <- union(c("Transcript", "Gene", "Chr", "LastExon.Start",
+                             "LastExon.End", "Strand", "UTR3.Start", "UTR3.End"),
+                           colnames(df)) 
+    expect_true(all(expected_cols %in% colnames(exp.df)))
+})
+
+context("Test non-Ensembl IDs")  # transcript/gene names with dots and digits instead of ENS* identifiers
+set.seed(123)
+df <- data.table(Transcript=
+                     c("XY.00000027036_000..2_mm10_chr1_4844962_4846739_+_utr_4845016_4846739::chr1:4844962-4846739(+)",
+                       "FF.22_1.z,0101_1.z_mm10_chr1_4896355_4897910_+_utr_4896364_4897910::chr1:4896355-4897910(+)"),
+                 Length=c(100, 1000),
+                 SampleA=runif(2),
+                 SampleB=runif(2),
+                 SampleC=rnorm(2),
+                 stringsAsFactors=FALSE
+)
+# Failure messages go through `info =`; a third positional argument to expect_equal() lands in `...` (forwarded to the comparison) and is not used as a message.
+test_that("Non-Ensembl transcript ID is accepted", {
+    exp.df <- copy(df)
+    separate_ensembl_field(exp.df)
+    expect_equal(exp.df$Gene, c("000..2", "1.z"), info = "Genes do not match")
+    expect_equal(exp.df$Transcript, c("XY.00000027036", "FF.22,0101"),
+                 info = "Transcripts do not match")
+})
+
diff --git a/tests/python/Row_test.py b/tests/python/Row_test.py
@@ -0,0 +1,42 @@
+import unittest
+import sys
+from qapa import extract as ex
+
+class RowTestCase(unittest.TestCase):
+    # Tests for qapa.extract.Row, built from one tab-separated annotation row (Mecp2, minus strand).
+    def setUp(self):
+        example_row = '143	ENSMUST00000100750.9	chrX_random	-	74026591	74085669	74035416	74079934	4	74026591,74036981,74079908,74085509,	74036494,74037332,74080032,74085669,	0	Mecp2	cmpl	cmpl	2,2,0,-1,'
+        self.row = ex.Row(example_row)
+    # For this minus-strand row the expected 3'UTR length is cdsStart - txStart.
+    def test_3utr_length(self):
+        target = self.row.get_3utr_length()
+        expected = self.row.cdsStart - self.row.txStart
+        self.assertEqual(target, expected)
+
+    # The setUp row is on chrX_random, so it must be flagged as a random chromosome.
+    def test_random_chromosome(self):
+        target = self.row.is_on_random_chromosome()
+        self.assertTrue(target)
+    # A bare "X" (no chr prefix) must not be flagged as random.
+    def test_random_chromosome_no_chr(self):
+        my_row = '143	ENSMUST00000100750.9	X	-	74026591	74085669	74035416	74079934	4	74026591,74036981,74079908,74085509,	74036494,74037332,74080032,74085669,	0	Mecp2	cmpl	cmpl	2,2,0,-1,'
+        my_row = ex.Row(my_row)
+        target = my_row.is_on_random_chromosome()
+        self.assertFalse(target)
+    # A regular chrY must not be flagged as random.
+    def test_chrY(self):
+        my_row = '143	ENSMUST00000100750.9	chrY	-	74026591	74085669	74035416	74079934	4	74026591,74036981,74079908,74085509,	74036494,74037332,74080032,74085669,	0	Mecp2	cmpl	cmpl	2,2,0,-1,'
+        my_row = ex.Row(my_row)
+        target = my_row.is_on_random_chromosome()
+        self.assertFalse(target)
+
+    # The result is indexed positionally: [1] is checked against the start, [2] against the end coordinate.
+    def test_extract_last_exon(self):
+        target = self.row.extract_last_exon(n=1, min_utr_length=0)
+        self.assertEqual(target[1], 74026591, "Start coord not equal")
+        self.assertEqual(target[2], 74036494, "End coord not equal")
+
+
+if __name__ == '__main__':
+    # Run every test case when the file is executed directly (tests/run_py_tests.sh invokes it with `python`).
+    unittest.main()
diff --git a/tests/python/annotate_test.py b/tests/python/annotate_test.py
@@ -0,0 +1,50 @@
+import unittest
+import sys
+import pybedtools
+from qapa import annotate as anno
+
+class AnnotateTestCase(unittest.TestCase):
+    # Tests for qapa.annotate.extend_feature / restore_feature on pybedtools features.
+    def setUp(self):
+        example = "chrX	74026591	74036494	ENSMUST00000100750_Mecp2	8825	-	74035416	74036494	Mecp2	74026591,74036981,74079908,74085509	74036494,74037332,74080032,74085669"
+        self.bed = pybedtools.BedTool(example, from_string=True)
+
+    # Extending by l is expected to lower start by l and leave end unchanged.
+    def test_extend_feature(self):
+        l = 1
+        feature = self.bed[0]
+        target = anno.extend_feature(feature, length=l)
+        self.assertEqual(target.start, 74026591 - l)
+        self.assertEqual(target.end, 74036494)
+
+    # Feature already starts at 0: extending must keep start at 0 (no negative coordinate).
+    def test_gene_at_beginning_of_chr_1(self):
+        example = "chr17_KI270861v1_alt	0	5793	ENST00000634102_SLC43A2 5631	-	5631	5793	SLC43A2 0,6624,8277,13231,15940,20829,21296,21593,23214,43222,45006,46589,57742,58821 5793,6748,8351,13364,16079,20976,21499,21727,23307,43299,45062,46797,57948,58864"
+        bed = pybedtools.BedTool(example, from_string=True)
+        l = 24
+        feature = bed[0]
+        target = anno.extend_feature(feature, length=l)
+        self.assertEqual(target.start, 0)
+        self.assertEqual(target.end, 5793)
+        # Restoring must give back the original start.
+        target = anno.restore_feature(target, length=l)
+        self.assertEqual(target.start, 0)
+    # Feature starts at 24 (equal to l): extending reaches exactly 0.
+    def test_gene_at_beginning_of_chr_2(self):
+        example = "chr17_KI270861v1_alt	24	5793	ENST00000634102_SLC43A2 5631	-	5631	5793	SLC43A2 24,6624,8277,13231,15940,20829,21296,21593,23214,43222,45006,46589,57742,58821 5793,6748,8351,13364,16079,20976,21499,21727,23307,43299,45062,46797,57948,58864"
+        bed = pybedtools.BedTool(example, from_string=True)
+        l = 24
+        feature = bed[0]
+        target = anno.extend_feature(feature, length=l)
+        self.assertEqual(target.start, 0)
+        self.assertEqual(target.end, 5793)
+        # Restoring must give back the original start.
+        target = anno.restore_feature(target, length=l)
+        self.assertEqual(target.start, 24)
+
+
+
+
+if __name__ == '__main__':
+    # Run every test case when the file is executed directly (tests/run_py_tests.sh invokes it with `python`).
+    unittest.main()
diff --git a/tests/run_R_tests.R b/tests/run_R_tests.R
@@ -0,0 +1,28 @@
+#!/usr/bin/env Rscript
+# Extract the top-level function definitions (`name <- function(...)`) from an R
+# script without running anything else in it. Adapted from
+# https://stackoverflow.com/a/29132294
+#   x: path to the R source file. Returns an expression vector suitable for eval().
+source_funcs <- function(x) {
+    cmds <- parse(x)
+    is_fun_def <- function(cmd) {
+        # is.call() guards the indexing: in `a <- b` the right-hand side is a symbol and cannot be subset.
+        is.call(cmd) && identical(cmd[[1]], as.name("<-")) &&
+            is.call(cmd[[3]]) && identical(cmd[[3]][[1]], as.name("function"))
+    }
+    # vapply() keeps the mask logical even when the script has no expressions.
+    cmds[vapply(cmds, is_fun_def, logical(1))]
+}
+# All paths below are relative, so this script is meant to be run from the tests/ directory.
+files <- c("../scripts/create_merged_data.R",
+           "../scripts/compute_pau.R")
+for (f in files) {
+    cmds <- source_funcs(f)
+    eval(cmds)  # defines only the extracted functions; the rest of each script is not run
+}
+# Packages the tests rely on (e.g. data.table() in the fixtures); loaded before test_dir() runs the files in R/.
+suppressPackageStartupMessages(library(stringr))
+suppressPackageStartupMessages(library(data.table))
+suppressPackageStartupMessages(library(dplyr))
+library(testthat)
+test_dir("R/")
diff --git a/tests/run_py_tests.sh b/tests/run_py_tests.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+
+for i in python/*_test.py
+do
+    python $i
+    echo -e "\n"
+done
+