From f9a7ab3a7f051f3841ff6a5e3eaa45f7c7bea92c Mon Sep 17 00:00:00 2001 From: cpockrandt Date: Wed, 8 Aug 2018 15:56:01 +0200 Subject: [PATCH] [TEST] Unidirectional FM index and iterator --- test/unit/index/CMakeLists.txt | 4 + test/unit/index/fm_index_iterator_test.cpp | 355 +++++++++++++++++++++ test/unit/index/fm_index_test.cpp | 138 ++++++++ 3 files changed, 497 insertions(+) create mode 100644 test/unit/index/CMakeLists.txt create mode 100644 test/unit/index/fm_index_iterator_test.cpp create mode 100644 test/unit/index/fm_index_test.cpp diff --git a/test/unit/index/CMakeLists.txt b/test/unit/index/CMakeLists.txt new file mode 100644 index 00000000000..bb0e1ba5a28 --- /dev/null +++ b/test/unit/index/CMakeLists.txt @@ -0,0 +1,4 @@ +add_subdirectories() + +seqan3_test (fm_index_test.cpp) +seqan3_test (fm_index_iterator_test.cpp) diff --git a/test/unit/index/fm_index_iterator_test.cpp b/test/unit/index/fm_index_iterator_test.cpp new file mode 100644 index 00000000000..e88f7748d69 --- /dev/null +++ b/test/unit/index/fm_index_iterator_test.cpp @@ -0,0 +1,355 @@ +// ============================================================================ +// SeqAn - The Library for Sequence Analysis +// ============================================================================ +// +// Copyright (c) 2006-2018, Knut Reinert & Freie Universitaet Berlin +// Copyright (c) 2016-2018, Knut Reinert & MPI Molekulare Genetik +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. 
+// * Neither the name of Knut Reinert or the FU Berlin nor the names of +// its contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL KNUT REINERT OR THE FU BERLIN BE LIABLE +// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +// OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH +// DAMAGE. 
+//
+// ============================================================================
+
+#include
+
+#include
+
+#include
+#include
+
+#include
+
+using namespace seqan3;
+using namespace seqan3::literal;
+
+// NOTE: this test is a typed test for now since we will add bidirectional iterators later
+template
+class fm_index_iterator_test : public ::testing::Test
+{};
+
+using fm_index_iterator_types = ::testing::Types>>>;
+
+TYPED_TEST_CASE(fm_index_iterator_test, fm_index_iterator_types);
+
+TYPED_TEST(fm_index_iterator_test, ctr)
+{
+    using text_type = std::vector;
+    using index_type = fm_index>;
+    using iterator_type = TypeParam;
+
+    text_type text{"ACGACG"_dna4};
+    index_type sa{text};
+
+    // custom constructor
+    iterator_type it0{sa};
+    EXPECT_EQ(it0.depth(), 0);
+    EXPECT_EQ(it0.locate().size(), sa.size());
+
+    // default construction (does not set the iterator to the root node)
+    iterator_type it1;
+
+    // copy construction
+    iterator_type it2{it0};
+    EXPECT_EQ(it0, it2);
+
+    // copy assignment (assign to an existing object so that operator= is called, not the copy constructor)
+    it1 = it0;
+    EXPECT_EQ(it0, it1);
+
+    // move construction (it0 is moved-from afterwards, so compare against the independent copy it2)
+    iterator_type it4{std::move(it0)};
+    EXPECT_EQ(it2, it4);
+
+    // move assignment (assigning to the moved-from it0 is valid and gives it a defined state again)
+    it0 = std::move(it4);
+    EXPECT_EQ(it2, it0);
+}
+
+TYPED_TEST(fm_index_iterator_test, root_node)
+{
+    typename TypeParam::index_type::text_type text{"ACGACG"_dna4};
+    typename TypeParam::index_type sa{text};
+
+    // root
+    TypeParam it(sa);
+    auto result = it.locate();
+    std::sort(result.begin(), result.end());
+    EXPECT_EQ(result, (std::vector{0, 1, 2, 3, 4, 5, 6})); // sentinel position included
+    EXPECT_EQ(it.depth(), 0);
+    EXPECT_EQ(it.count(), 7);
+}
+
+TYPED_TEST(fm_index_iterator_test, down_range)
+{
+    typename TypeParam::index_type::text_type text{"ACGACG"_dna4};
+    typename TypeParam::index_type sa{text};
+
+    // successful down(range)
+    TypeParam it(sa);
+    EXPECT_TRUE(it.down("CG"_dna4));
+    auto result = it.locate();
+    std::sort(result.begin(), result.end());
+    EXPECT_EQ(result, (std::vector{1, 4}));
+    EXPECT_EQ(it.depth(), 2);
+    EXPECT_EQ(it.count(), 2);
+
+    EXPECT_TRUE(it.down("A"_dna4));
+    EXPECT_EQ(it.locate(), (std::vector{1}));
+    EXPECT_EQ(it.depth(), 3);
+    EXPECT_EQ(it.count(), 1);
+
+    // unsuccessful down(range), it remains untouched
+    TypeParam it_cpy = it;
+    EXPECT_FALSE(it.down("A"_dna4));
+    EXPECT_EQ(it, it_cpy);
+
+    // down(range) does not take an empty range
+    it_cpy = it;
+    ASSERT_DEATH(it.down(""_dna4), "");
+    EXPECT_EQ(it, it_cpy);
+}
+
+TYPED_TEST(fm_index_iterator_test, down_char)
+{
+    typename TypeParam::index_type::text_type text{"ACGACG"_dna4};
+    typename TypeParam::index_type sa{text};
+
+    // successful down(char)
+    TypeParam it(sa);
+    EXPECT_TRUE(it.down(dna4::A));
+    auto result = it.locate();
+    std::sort(result.begin(), result.end());
+    EXPECT_EQ(result, (std::vector{0, 3}));
+    EXPECT_EQ(it.depth(), 1);
+
+    EXPECT_TRUE(it.down(dna4::C));
+    result = it.locate();
+    std::sort(result.begin(), result.end());
+    EXPECT_EQ(result, (std::vector{0, 3}));
+    EXPECT_EQ(it.depth(), 2);
+
+    // unsuccessful down(char), it remains untouched
+    TypeParam it_cpy = it;
+    EXPECT_FALSE(it.down(dna4::C));
+    EXPECT_EQ(it, it_cpy);
+}
+
+TYPED_TEST(fm_index_iterator_test, down_range_and_right)
+{
+    typename TypeParam::index_type::text_type text{"ACGAACGC"_dna4};
+    typename TypeParam::index_type sa{text};
+
+    // successful down() and right()
+    TypeParam it(sa);
+    EXPECT_TRUE(it.down("ACGA"_dna4));
+    EXPECT_EQ(it.locate(), (std::vector{0}));
+    EXPECT_EQ(it.depth(), 4);
+
+    EXPECT_TRUE(it.right());
+    EXPECT_EQ(it.locate(), (std::vector{4}));
+    EXPECT_EQ(it.depth(), 4);
+}
+
+TYPED_TEST(fm_index_iterator_test, down_char_and_right)
+{
+    typename TypeParam::index_type::text_type text{"ACGAACGC"_dna4};
+    typename TypeParam::index_type sa{text};
+
+    // successful down() and right()
+    TypeParam it(sa);
+    EXPECT_TRUE(it.down(dna4::A));
+    auto result = it.locate();
+    std::sort(result.begin(), result.end());
+    EXPECT_EQ(result, (std::vector{0, 3, 4}));
+    EXPECT_EQ(it.depth(), 1);
+
+    EXPECT_TRUE(it.right());
+    result = it.locate();
+    std::sort(result.begin(), result.end());
+    EXPECT_EQ(result, (std::vector{1, 5, 7}));
+    EXPECT_EQ(it.depth(), 1);
+}
+
+TYPED_TEST(fm_index_iterator_test, down_and_right)
+{
+    typename TypeParam::index_type::text_type text{"ACGACG"_dna4};
+    typename TypeParam::index_type sa{text};
+
+    // successful down() and right()
+    TypeParam it(sa);
+    EXPECT_TRUE(it.down());
+    auto result = it.locate();
+    std::sort(result.begin(), result.end());
+    EXPECT_EQ(result, (std::vector{0, 3}));
+    EXPECT_EQ(it.depth(), 1);
+
+    EXPECT_TRUE(it.right());
+    result = it.locate();
+    std::sort(result.begin(), result.end());
+    EXPECT_EQ(result, (std::vector{1, 4}));
+    EXPECT_EQ(it.depth(), 1);
+
+    EXPECT_TRUE(it.down());
+    result = it.locate();
+    std::sort(result.begin(), result.end());
+    EXPECT_EQ(result, (std::vector{1, 4}));
+    EXPECT_EQ(it.depth(), 2);
+
+    // unsuccessful right(), it remains untouched
+    TypeParam it_cpy = it;
+    EXPECT_FALSE(it.right());
+    EXPECT_EQ(it, it_cpy);
+
+    // unsuccessful down(), it remains untouched
+    it = TypeParam(sa);
+    EXPECT_TRUE(it.down("GACG"_dna4));
+    it_cpy = it;
+    EXPECT_FALSE(it.down());
+    EXPECT_EQ(it, it_cpy);
+
+    // right() cannot be called on the root node
+    it = TypeParam(sa);
+    ASSERT_DEATH(it.right(), "");
+    EXPECT_EQ(it, TypeParam(sa));
+}
+
+template
+auto get_all_child_iterators(iterator_type it, index_type sa)
+{
+    std::array> result;
+
+    uint8_t i = 0;
+    if (it.down())
+    {
+        do
+        {
+            result[i++] = it;
+        } while (it.right());
+    }
+
+    // fill the rest with iterators pointing to root
+    while (i < iterator_type::index_type::char_type::value_size)
+    {
+        result[i++] = fm_index_iterator(sa);
+    }
+
+    return result;
+}
+
+TYPED_TEST(fm_index_iterator_test, children)
+{
+    typename TypeParam::index_type::text_type text{"ACGTAGGT"_dna4};
+    typename TypeParam::index_type sa{text};
+
+    // all children
+    TypeParam it(sa);
+    EXPECT_EQ(it.children(), get_all_child_iterators(it, sa));
+
+    // some children
+    EXPECT_TRUE(it.down(dna4::A));
+    EXPECT_EQ(it.children(), get_all_child_iterators(it, sa));
+
+    // one child
+    EXPECT_TRUE(it.down(dna4::G));
+    EXPECT_EQ(it.children(), get_all_child_iterators(it, sa));
+
+    // no children
+    EXPECT_TRUE(it.down("GT"_dna4));
+    EXPECT_EQ(it.children(), get_all_child_iterators(it, sa));
+}
+
+TYPED_TEST(fm_index_iterator_test, path_label)
+{
+    typename TypeParam::index_type::text_type text{"ACGACG"_dna4};
+    typename TypeParam::index_type sa{text};
+
+    // path_label()
+    TypeParam it(sa);
+    EXPECT_TRUE(it.down("ACG"_dna4));
+    EXPECT_TRUE(ranges::equal(*it, "ACG"_dna4));
+    EXPECT_TRUE(ranges::equal(it.path_label(), "ACG"_dna4));
+    // TODO: better this way? EXPECT_EQ(std::vector{it.path_label()}, "ACG"_dna4);
+}
+
+TYPED_TEST(fm_index_iterator_test, incomplete_alphabet)
+{
+    using text_type = std::vector;
+    using index_type = fm_index>;
+    using iterator_type = TypeParam;
+
+    // search a char that does not occur in the text (higher rank than largest char occurring in text)
+    {
+        text_type text{"ACGACG"_dna4};
+        index_type sa{text};
+        iterator_type it = iterator_type(sa);
+        EXPECT_FALSE(it.down(dna4::T));
+        EXPECT_EQ(it, iterator_type(sa));
+    }
+
+    // search a char that does not occur in the text (smaller rank than smallest char occurring in text)
+    {
+        text_type text{"CGTCGT"_dna4};
+        index_type sa{text};
+        iterator_type it = iterator_type(sa);
+        EXPECT_FALSE(it.down(dna4::A));
+        EXPECT_EQ(it, iterator_type(sa));
+    }
+
+    // search a char that does not occur in the text
+    // (some rank that is neither the smallest nor the highest occurring in text)
+    {
+        text_type text{"ATATAT"_dna4};
+        index_type sa{text};
+        iterator_type it = iterator_type(sa);
+        EXPECT_FALSE(it.down(dna4::C));
+        EXPECT_FALSE(it.down(dna4::G));
+        EXPECT_FALSE(it.down("ACGT"_dna4));
+        EXPECT_FALSE(it.down("G"_dna4));
+        EXPECT_EQ(it, iterator_type(sa));
+
+        EXPECT_TRUE(it.down(dna4::A));
+        EXPECT_TRUE(it.right());
+        EXPECT_TRUE(ranges::equal(it.path_label(), "T"_dna4));
+        // TODO: better this way? EXPECT_EQ(std::vector{it.path_label()}, "T"_dna4);
+    }
+}
+
+TEST(fm_index_iterator, lazy_locate)
+{
+    std::vector text{"ACGTACGT"_dna4};
+    fm_index> sa{text};
+
+    auto it = sa.root();
+    EXPECT_TRUE(it.down("ACG"_dna4));
+
+    EXPECT_TRUE(ranges::equal(it.locate(), it.lazy_locate()));
+}
+
+// TODO: tests for other alphabets
+
+TEST(fm_index, concepts)
+{
+    EXPECT_TRUE(fm_index_iterator_concept>>>);
+}
diff --git a/test/unit/index/fm_index_test.cpp b/test/unit/index/fm_index_test.cpp
new file mode 100644
index 00000000000..a40dc7266e6
--- /dev/null
+++ b/test/unit/index/fm_index_test.cpp
@@ -0,0 +1,138 @@
+// ============================================================================
+// SeqAn - The Library for Sequence Analysis
+// ============================================================================
+//
+// Copyright (c) 2006-2018, Knut Reinert & Freie Universitaet Berlin
+// Copyright (c) 2016-2018, Knut Reinert & MPI Molekulare Genetik
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+// * Neither the name of Knut Reinert or the FU Berlin nor the names of
+// its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. 
IN NO EVENT SHALL KNUT REINERT OR THE FU BERLIN BE LIABLE
+// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+// OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+// DAMAGE.
+//
+// ============================================================================
+
+#include
+
+#include
+#include
+
+#include
+
+using namespace seqan3;
+using namespace seqan3::literal;
+
+// TODO: EXPECT_EQ is not supported by sdsl
+// TODO: test bitcompressed vector for construction
+
+TEST(fm_index, ctr)
+{
+    // default/zero construction
+    [[maybe_unused]] fm_index> aa27_sa;
+    // [[maybe_unused]] fm_index> bool_sa; TODO: bool does not satisfy alphabet concept
+    [[maybe_unused]] fm_index> char_sa;
+
+    fm_index> dna_sa0;
+    std::vector text{"ACGT"_dna4};
+    dna_sa0.construct(text);
+
+    // copy construction
+    fm_index> dna_sa1{dna_sa0};
+    EXPECT_EQ(dna_sa0.size(), dna_sa1.size());
+    // EXPECT_EQ(dna_sa0, dna_sa1);
+
+    // copy initialisation (`=` in a declaration invokes the copy constructor, not operator=)
+    fm_index> dna_sa2 = dna_sa0;
+    EXPECT_EQ(dna_sa0.size(), dna_sa2.size());
+    // EXPECT_EQ(dna_sa0, dna_sa2);
+
+    // move construction (dna_sa0 is moved-from afterwards, so compare against the copy dna_sa1)
+    fm_index> dna_sa3{std::move(dna_sa0)};
+    EXPECT_EQ(dna_sa1.size(), dna_sa3.size());
+    // EXPECT_EQ(dna_sa1, dna_sa3);
+
+    // move assignment (assigning to the moved-from dna_sa0 is valid and gives it a defined state again)
+    dna_sa0 = std::move(dna_sa3);
+    EXPECT_EQ(dna_sa1.size(), dna_sa0.size());
+    // EXPECT_EQ(dna_sa1, dna_sa0);
+
+    // container constructor
+    fm_index> dna_sa5{text};
+    EXPECT_EQ(dna_sa0.size(), dna_sa5.size());
+    // EXPECT_EQ(dna_sa0, dna_sa5);
+}
+
+TEST(fm_index, swap)
+{
+    std::vector textA{"ACGT"_dna4};
+    std::vector textB{"ACGTACGT"_dna4};
+
+    fm_index> sa0{textA};
+    fm_index> sa1{textB};
+    fm_index> sa2{sa0};
+    fm_index> sa3{sa1};
+
+    EXPECT_EQ(sa0.size(), sa2.size());
+    EXPECT_EQ(sa1.size(), sa3.size());
+    EXPECT_NE(sa0.size(), sa1.size());
+    // EXPECT_EQ(sa0, sa2);
+
+    std::swap(sa1, sa2);
+
+    EXPECT_EQ(sa0.size(), sa1.size());
+    EXPECT_EQ(sa2.size(), sa3.size());
+    EXPECT_NE(sa0.size(), sa2.size());
+}
+
+TEST(fm_index, size)
+{
+    fm_index> sa;
+    EXPECT_TRUE(sa.empty());
+
+    std::vector test{"ACGTACGT"_dna4};
+    sa.construct(test);
+    EXPECT_EQ(sa.size(), 9); // including a sentinel character
+}
+
+TEST(fm_index, serialization)
+{
+    std::vector text{"ACGTACGT"_dna4};
+    fm_index> sa0{text/*"ACGT"_dna4*/};
+
+    test::tmp_filename filename{"sa_index"};
+    auto const & path = filename.get_path();
+
+    EXPECT_TRUE(sa0.store(path));
+
+    fm_index> sa1{};
+    EXPECT_TRUE(sa1.load(path));
+
+    EXPECT_EQ(sa1.size(), 9);
+    // EXPECT_EQ(sa0, sa1);
+}
+
+TEST(fm_index, concepts)
+{
+    EXPECT_TRUE(fm_index_concept>>);
+    EXPECT_TRUE(fm_index_traits_concept);
+}