Skip to content

Commit

Permalink
Incorporate match consolidation into search (#95)
Browse files Browse the repository at this point in the history
* Incorporate match consolidation into search

* Update consolidation test data
  • Loading branch information
eaasna authored Sep 13, 2023
1 parent 9dc6d47 commit 88f91b3
Show file tree
Hide file tree
Showing 50 changed files with 604 additions and 986 deletions.
12 changes: 0 additions & 12 deletions include/utilities/argument_parsing/consolidate.hpp

This file was deleted.

13 changes: 0 additions & 13 deletions include/utilities/consolidate/consolidate.hpp

This file was deleted.

21 changes: 21 additions & 0 deletions include/utilities/consolidate/consolidate_matches.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
#pragma once

#include <filesystem>

#include <valik/shared.hpp>
#include <valik/split/database_metadata.hpp>

#include <utilities/consolidate/io.hpp>

namespace valik
{

/**
 * @brief Removes duplicates from split Stellar search results and writes the final output file.
 *
 * The function returns `void`: it has no return value with which to report a failure.
 *
 * @param arguments Command line arguments of the search. The implementation reads
 *                  `ref_meta_path` and `all_matches` and writes the result to `out_file`.
 */
void consolidate_matches(search_arguments const & arguments);

} // namespace valik
1 change: 0 additions & 1 deletion include/utilities/consolidate/merge_processes.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@ namespace valik
* @param var_pack Environmental variables, this function calls the merge executable.
* @return false if merge failed.
*/

bool merge_processes(search_arguments const & arguments,
app::search_time_statistics & time_statistics,
app::execution_metadata & exec_meta,
Expand Down
7 changes: 0 additions & 7 deletions include/utilities/shared.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,4 @@ std::vector<field_t> get_line_vector(std::string const line, char const delim)
return line_vec;
}

// Aggregate of the three filesystem paths used by the consolidation step.
// All members are value-initialised, i.e. start out as empty paths.
// NOTE(review): the member descriptions below are inferred from the names only — confirm against the callers.
struct consolidation_arguments
{
// Presumably the file with the (possibly duplicated) matches to read.
std::filesystem::path matches_in{};
// Presumably the reference metadata file describing the split reference.
std::filesystem::path ref_meta_path{};
// Presumably the file the consolidated matches are written to.
std::filesystem::path matches_out{};
};

} // namespace valik
2 changes: 2 additions & 0 deletions include/valik/search/search.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
#include <valik/search/search_time_statistics.hpp>
#include <valik/shared.hpp>

#include <utilities/consolidate/consolidate_matches.hpp>

namespace valik::app
{

Expand Down
1 change: 1 addition & 0 deletions include/valik/shared.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,7 @@ struct search_arguments final : public minimiser_threshold_arguments, public ste
std::vector<std::vector<std::string>> bin_path{};
std::filesystem::path query_file{};
std::filesystem::path index_file{};
std::filesystem::path all_matches{};
std::filesystem::path out_file{"search.gff"};

bool compressed{false};
Expand Down
14 changes: 5 additions & 9 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -39,15 +39,15 @@ target_link_libraries ("raptor_threshold" PUBLIC "${PROJECT_NAME}_interface")
add_library ("${PROJECT_NAME}_search_lib" STATIC valik_search.cpp)
target_link_libraries ("${PROJECT_NAME}_search_lib" PUBLIC "raptor_threshold")

# Valik consolidate
add_library ("${PROJECT_NAME}_consolidation_lib" STATIC valik_consolidate.cpp)
target_link_libraries ("${PROJECT_NAME}_consolidation_lib" PUBLIC "${PROJECT_NAME}_interface")
# Consolidating matches from split Stellar runs
add_library ("${PROJECT_NAME}_consolidation_lib" STATIC consolidate/consolidate_matches.cpp)
target_link_libraries ("${PROJECT_NAME}_consolidation_lib" PUBLIC "raptor_threshold")

add_library ("${PROJECT_NAME}_consolidation_io_lib" STATIC consolidate/io.cpp)
target_link_libraries ("${PROJECT_NAME}_consolidation_io_lib" PUBLIC "${PROJECT_NAME}_interface")
target_link_libraries ("${PROJECT_NAME}_consolidation_io_lib" PUBLIC "raptor_threshold")

add_library ("${PROJECT_NAME}_merge_processes_lib" STATIC consolidate/merge_processes.cpp)
target_link_libraries ("${PROJECT_NAME}_merge_processes_lib" PUBLIC "${PROJECT_NAME}_interface")
target_link_libraries ("${PROJECT_NAME}_merge_processes_lib" PUBLIC "raptor_threshold")

# Sliding window argument parsing
add_library ("${PROJECT_NAME}_argument_parsing_shared_lib" STATIC argument_parsing/shared.cpp)
Expand All @@ -62,9 +62,6 @@ target_link_libraries ("${PROJECT_NAME}_argument_parsing_build_lib" PUBLIC "${PR
add_library ("${PROJECT_NAME}_argument_parsing_search_lib" STATIC argument_parsing/search.cpp)
target_link_libraries ("${PROJECT_NAME}_argument_parsing_search_lib" PUBLIC "${PROJECT_NAME}_argument_parsing_shared_lib")

add_library ("${PROJECT_NAME}_argument_parsing_consolidation_lib" STATIC argument_parsing/consolidate.cpp)
target_link_libraries ("${PROJECT_NAME}_argument_parsing_consolidation_lib" PUBLIC "${PROJECT_NAME}_argument_parsing_shared_lib")

add_library ("${PROJECT_NAME}_argument_parsing_top_level_lib" STATIC argument_parsing/top_level.cpp)
target_link_libraries ("${PROJECT_NAME}_argument_parsing_top_level_lib" PUBLIC "${PROJECT_NAME}_argument_parsing_shared_lib")

Expand All @@ -73,7 +70,6 @@ add_library ("${PROJECT_NAME}_lib" INTERFACE)
target_link_libraries ("${PROJECT_NAME}_lib" INTERFACE "${PROJECT_NAME}_argument_parsing_split_lib")
target_link_libraries ("${PROJECT_NAME}_lib" INTERFACE "${PROJECT_NAME}_argument_parsing_build_lib")
target_link_libraries ("${PROJECT_NAME}_lib" INTERFACE "${PROJECT_NAME}_argument_parsing_search_lib")
target_link_libraries ("${PROJECT_NAME}_lib" INTERFACE "${PROJECT_NAME}_argument_parsing_consolidation_lib")
target_link_libraries ("${PROJECT_NAME}_lib" INTERFACE "${PROJECT_NAME}_argument_parsing_shared_lib")
target_link_libraries ("${PROJECT_NAME}_lib" INTERFACE "${PROJECT_NAME}_argument_parsing_top_level_lib")
target_link_libraries ("${PROJECT_NAME}_lib" INTERFACE "${PROJECT_NAME}_split_lib")
Expand Down
38 changes: 0 additions & 38 deletions src/argument_parsing/consolidate.cpp

This file was deleted.

9 changes: 9 additions & 0 deletions src/argument_parsing/search.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -214,6 +214,15 @@ void run_search(sharg::parser & parser)
}
}

// ==========================================
// Create temporary file path for merging distributed Stellar runs.
// ==========================================
if (!arguments.ref_meta_path.empty())
{
arguments.all_matches = arguments.out_file;
arguments.all_matches += ".preliminary";
}

// ==========================================
// More checks.
// ==========================================
Expand Down
17 changes: 17 additions & 0 deletions src/consolidate/consolidate_matches.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#include <utilities/consolidate/consolidate_matches.hpp>

namespace valik
{

/**
 * @brief Deduplicates the preliminary Stellar matches and writes them to the final output file.
 *
 * Reads the matches stored in `arguments.all_matches` (interpreted with the reference metadata
 * loaded from `arguments.ref_meta_path`), removes repeated matches and writes the remaining
 * ones to `arguments.out_file`.
 *
 * @param arguments Search arguments providing the metadata, input and output paths.
 */
void consolidate_matches(search_arguments const & arguments)
{
    database_metadata const reference_metadata(arguments.ref_meta_path, false);
    auto stellar_matches = read_stellar_output(arguments.all_matches, reference_metadata);

    // std::unique only collapses runs of consecutive equal elements, so the matches are
    // ordered (descending) first.
    std::sort(stellar_matches.begin(), stellar_matches.end(), std::greater<stellar_match>{});

    auto const duplicates_begin = std::unique(stellar_matches.begin(), stellar_matches.end());
    stellar_matches.erase(duplicates_begin, stellar_matches.end());

    write_stellar_output(arguments.out_file, stellar_matches);
}

} // namespace valik
9 changes: 7 additions & 2 deletions src/consolidate/io.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,20 @@ namespace valik
{

std::vector<stellar_match> read_stellar_output(std::filesystem::path const & match_path,
database_metadata const & reference,
std::ios_base::openmode const mode /* = std::ios_base::in */)
database_metadata const & reference,
std::ios_base::openmode const mode /* = std::ios_base::in */)
{
std::vector<stellar_match> matches;
std::ifstream fin(match_path, mode);
std::string line;
while (std::getline(fin, line))
{
auto line_vec = valik::get_line_vector<std::string>(line, '\t');

//!WORKAROUND: for valik_search_segments test that writes output file names instead of matches
if (line_vec.size() == 1)
break;

assert(line_vec.size() == 9); // Stellar GFF format output has 9 columns
stellar_match match(line_vec, reference);
matches.push_back(match);
Expand Down
2 changes: 1 addition & 1 deletion src/consolidate/merge_processes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ bool merge_processes(search_arguments const & arguments,

external_process merge(merge_process_args);

std::ofstream matches_out(arguments.out_file);
std::ofstream matches_out(arguments.all_matches);
matches_out << merge.cout();

return check_external_process_success(merge_process_args, merge);
Expand Down
18 changes: 0 additions & 18 deletions src/valik_consolidate.cpp

This file was deleted.

6 changes: 1 addition & 5 deletions src/valik_main.cpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
#include <seqan3/argument_parser/all.hpp>

#include <utilities/argument_parsing/consolidate.hpp>

#include <valik/argument_parsing/split.hpp>
#include <valik/argument_parsing/build.hpp>
#include <valik/argument_parsing/search.hpp>
Expand All @@ -13,7 +11,7 @@ int main(int argc, char ** argv)
{
try
{
sharg::parser top_level_parser{"valik", argc, argv, sharg::update_notifications::off, {"split", "build", "search", "consolidate"}};
sharg::parser top_level_parser{"valik", argc, argv, sharg::update_notifications::off, {"split", "build", "search"}};
valik::app::init_top_level_parser(top_level_parser);

valik::app::try_parsing(top_level_parser);
Expand All @@ -25,8 +23,6 @@ int main(int argc, char ** argv)
valik::app::run_build(sub_parser);
if (sub_parser.info.app_name == std::string_view{"valik-search"})
valik::app::run_search(sub_parser);
if (sub_parser.info.app_name == std::string_view{"valik-consolidate"})
valik::app::run_consolidation(sub_parser);
}
catch(std::exception const& e)
{
Expand Down
13 changes: 11 additions & 2 deletions src/valik_search.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -44,12 +44,21 @@ void valik_search(search_arguments const & arguments)
}
}

// Consolidate matches (not necessary when searching a metagenomic database)
if (!arguments.ref_meta_path.empty())
{
consolidate_matches(arguments);
const bool error_in_delete = !std::filesystem::remove(arguments.all_matches);
if (error_in_delete)
std::cerr << "Could not clean up intermediate file: \t" << std::string(arguments.all_matches) << '\n';
failed = failed || error_in_delete;
}

if (arguments.write_time)
write_time_statistics(time_statistics, arguments.out_file.string() + ".time");

if (failed) {
if (failed)
throw std::runtime_error("valik_search failed. Run didn't complete correctly.");
}
}

} // namespace valik::app
1 change: 1 addition & 0 deletions test/api/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,5 @@ macro (add_api_test test_filename)
add_dependencies (api_test ${target})
endmacro ()

add_subdirectory(utilities)
add_subdirectory(valik)
1 change: 1 addition & 0 deletions test/api/utilities/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
add_subdirectory(consolidate)
7 changes: 7 additions & 0 deletions test/api/utilities/consolidate/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# API test for the consolidation of split Stellar matches.
add_api_test (consolidate_matches_test.cpp)

# Data files attached to the test target: match files and reference metadata for the
# 8-bin and 16-bin (50 overlap) configurations, plus a Stellar truth file.
# NOTE(review): target_use_datasources is defined elsewhere; presumably it makes the listed
# files available to the test at build/run time — confirm against its definition.
target_use_datasources (consolidate_matches_test FILES 8bins50overlap_dream_all.gff)
target_use_datasources (consolidate_matches_test FILES 16bins50overlap_dream_all.gff)
target_use_datasources (consolidate_matches_test FILES 8bins50overlap_reference_metadata.tsv)
target_use_datasources (consolidate_matches_test FILES 16bins50overlap_reference_metadata.tsv)
target_use_datasources (consolidate_matches_test FILES stellar_truth.gff)
Loading

0 comments on commit 88f91b3

Please sign in to comment.