From 11ff4436690cc9b180f51bbc4aef6b01ff26f018 Mon Sep 17 00:00:00 2001 From: Benjamin Redelings Date: Tue, 28 Jan 2020 12:46:17 -0500 Subject: [PATCH 001/620] Note that we always return the most tipward witness for partial_path_of. --- otc/conflict.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/otc/conflict.h b/otc/conflict.h index a8accac2..57664937 100644 --- a/otc/conflict.h +++ b/otc/conflict.h @@ -240,6 +240,9 @@ inline void perform_conflict_analysis(ConflictTree& induced_tree1, log_supported_by(MRCA, nd1); } else { for(auto nd2 = MRCA; nd2 and nd2->get_data().n_tips == MRCA->get_data().n_tips; nd2 = nd2->get_parent()) { + // The name of nd1 is used as a key to insert into std::map. + // std::map ignores inserts if the kep already exists, and so later (and more parental) nodes are not retained. + // RESULT: This means that we report as a "witness" only the most tip-ward node in the sequence of nodes nd2 from tree2 that map to nd1 from tree1. log_partial_path_of(nd2, nd1); } } From 36f41457e82e5f2aad6e94e6cbf805eff3ef5ed2 Mon Sep 17 00:00:00 2001 From: Benjamin Redelings Date: Thu, 30 Jan 2020 18:07:52 -0500 Subject: [PATCH 002/620] Bump subtree limit for newick (but not arguson) from 25k -> 100k tips. 
--- otc/ws/tolws.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/otc/ws/tolws.cpp b/otc/ws/tolws.cpp index e5b55cca..aac3b39c 100644 --- a/otc/ws/tolws.cpp +++ b/otc/ws/tolws.cpp @@ -845,7 +845,7 @@ string newick_subtree_ws_method(const TreesToServe & tts, bool include_all_node_labels, int height_limit) { - const uint32_t NEWICK_TIP_LIMIT = 25000; + const uint32_t NEWICK_TIP_LIMIT = 100000; auto focal = get_node_for_subtree(tree_ptr, node_id, height_limit, NEWICK_TIP_LIMIT); auto locked_taxonomy = tts.get_readable_taxonomy(); const auto & taxonomy = locked_taxonomy.first; From 60ece0573e26b7e31135ce7166f3d54ea1a8506e Mon Sep 17 00:00:00 2001 From: Benjamin Redelings Date: Tue, 11 Feb 2020 14:44:48 -0500 Subject: [PATCH 003/620] Add a function to determine if a taxon is extinct from its flags. --- otc/taxonomy/flags.cpp | 7 +++++++ otc/taxonomy/flags.h | 2 ++ 2 files changed, 9 insertions(+) diff --git a/otc/taxonomy/flags.cpp b/otc/taxonomy/flags.cpp index a25d4e64..c2ee7fd4 100644 --- a/otc/taxonomy/flags.cpp +++ b/otc/taxonomy/flags.cpp @@ -178,4 +178,11 @@ std::vector flags_to_string_vec(const std::bitset<32> flags) return f; } +bool is_extinct(tax_flags flags) +{ + static int extinct_bit = flag_from_string("extinct"); + static int extinct_inherited_bit = flag_from_string("extinct_inherited"); + return flags.test(extinct_bit) or flags.test(extinct_inherited_bit); +} + } // namespace otc diff --git a/otc/taxonomy/flags.h b/otc/taxonomy/flags.h index 4a89a2d7..22f5059a 100644 --- a/otc/taxonomy/flags.h +++ b/otc/taxonomy/flags.h @@ -20,6 +20,8 @@ std::vector flags_to_string_vec(const tax_flags flags); std::bitset<32> cleaning_flags_from_config_file(const std::string& filename); std::bitset<32> regrafting_flags_from_config_file(const std::string& filename); +bool is_extinct(tax_flags); + } //namespace otc #endif From baa06ca46c59039d51b1c2c24b333d64b04e877b Mon Sep 17 00:00:00 2001 From: Benjamin Redelings Date: Tue, 11 Feb 2020 14:45:54 -0500 
Subject: [PATCH 004/620] Add is_extinct() members to TaxonomyRecord and RTRichTaxNodeData --- otc/taxonomy/taxonomy.cpp | 9 +++++++++ otc/taxonomy/taxonomy.h | 2 ++ 2 files changed, 11 insertions(+) diff --git a/otc/taxonomy/taxonomy.cpp b/otc/taxonomy/taxonomy.cpp index 7bfdde68..e5fc330b 100644 --- a/otc/taxonomy/taxonomy.cpp +++ b/otc/taxonomy/taxonomy.cpp @@ -160,6 +160,10 @@ const std::string empty_string; const set indexed_source_prefixes = {"ncbi", "gbif", "worms", "if", "irmng"}; std::set rank_strings; +bool TaxonomyRecord::is_extinct() const +{ + return ::is_extinct(flags); +} TaxonomyRecord::TaxonomyRecord(const string& line_) :line(line_) { @@ -579,6 +583,11 @@ RichTaxonomy load_rich_taxonomy(const variables_map& args) { return {taxonomy_dir, cleaning_flags, keep_root}; } +bool RTRichTaxNodeData::is_extinct() const +{ + return ::is_extinct(flags); +} + void RichTaxonomy::read_synonyms() { RTRichTaxTreeData & tree_data = this->tree->get_data(); diff --git a/otc/taxonomy/taxonomy.h b/otc/taxonomy/taxonomy.h index 0154db07..dc10950b 100644 --- a/otc/taxonomy/taxonomy.h +++ b/otc/taxonomy/taxonomy.h @@ -145,6 +145,7 @@ struct TaxonomyRecord { TaxonomyRecord& operator=(const TaxonomyRecord& tr) = delete; TaxonomyRecord(TaxonomyRecord&& tr) = default; TaxonomyRecord(TaxonomyRecord& tr) = delete; + bool is_extinct() const; explicit TaxonomyRecord(const std::string& line); std::vector sourceinfoAsVec() const { std::string si = std::string(sourceinfo); @@ -256,6 +257,7 @@ class RTRichTaxNodeData { auto vs = this->sourceinfoAsVec(); return sources_vec_as_json(vs); } + bool is_extinct() const; }; typedef RootedTreeNode RTRichTaxNode; From 83cc3ec9fb8df69b445212194f0fe125ae246088 Mon Sep 17 00:00:00 2001 From: Benjamin Redelings Date: Tue, 11 Feb 2020 14:47:27 -0500 Subject: [PATCH 005/620] Add flag on SumTreeNodeData to record extinctness. 
--- otc/ws/tolws.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/otc/ws/tolws.h b/otc/ws/tolws.h index c1988a6b..8c97c0bb 100644 --- a/otc/ws/tolws.h +++ b/otc/ws/tolws.h @@ -63,6 +63,8 @@ class SumTreeNodeData { vec_src_node_ids terminal; # endif bool was_uncontested = false; + bool extinct_mark = false; // extinctness means that the node has >= 1 descendant (including itself), and all descendants are extinct. + bool is_extinct() const {return extinct_mark;} uint32_t num_tips = 0; }; From 3ab720fb18c1ffa180b3034cdf7bf030977569cd Mon Sep 17 00:00:00 2001 From: Benjamin Redelings Date: Tue, 11 Feb 2020 14:50:06 -0500 Subject: [PATCH 006/620] Simplify destructuring pair to be more readable. --- ws/tolwsbooting.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/ws/tolwsbooting.cpp b/ws/tolwsbooting.cpp index 2bbb02c4..5bbb29f8 100644 --- a/ws/tolwsbooting.cpp +++ b/ws/tolwsbooting.cpp @@ -1197,10 +1197,8 @@ bool read_tree_and_annotations(const fs::path & config_path, auto tax_mem = calc_memory_used(taxonomy, tax_mem_b); write_memory_bookkeeping(LOG(INFO), tax_mem_b, "taxonomy", tax_mem); # endif - auto tree_and_ann = tts.get_new_tree_and_annotations(config_path.native(), tree_path.native()); + auto [tree,sta] = tts.get_new_tree_and_annotations(config_path.native(), tree_path.native()); try { - SummaryTree_t & tree = tree_and_ann.first; - SummaryTreeAnnotation & sta = tree_and_ann.second; sta = annotations_obj; json tref; tref["taxonomy"] = taxonomy.get_version(); From d5867becdc46f3a0ccf583cac0c4136f337d8bd4 Mon Sep 17 00:00:00 2001 From: Benjamin Redelings Date: Tue, 11 Feb 2020 14:53:35 -0500 Subject: [PATCH 007/620] Compute extinctness of synth-tree nodes at boot time via post-order traversal. 
--- ws/tolwsbooting.cpp | 42 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 41 insertions(+), 1 deletion(-) diff --git a/ws/tolwsbooting.cpp b/ws/tolwsbooting.cpp index 5bbb29f8..61623259 100644 --- a/ws/tolwsbooting.cpp +++ b/ws/tolwsbooting.cpp @@ -1153,7 +1153,44 @@ inline std::size_t calc_memory_used(const RichTaxonomy &rt, MemoryBookkeeper &mb #endif - +void mark_summary_tree_nodes_extinct(SummaryTree_t& tree, const RichTaxonomy& taxonomy) +{ + // compute extinctness for each node. Post means that a node is only visited after all its children. + for (auto node: iter_post(tree)) + { + auto& node_data = node->get_data(); + if (node->is_tip()) + { + auto id = node->get_ott_id(); + auto& taxon = taxonomy.included_taxon_from_id(id)->get_data(); + node_data.extinct_mark = taxon.is_extinct(); + } + else + { + // If any child is not extinct, then this node is not extinct either. + node_data.extinct_mark = true; + for (auto c : iter_child_const(*node)) + if (not c->get_data().is_extinct()) + node_data.extinct_mark = false; + + // Complain about higher taxa with extinctness that doesn't match the computed extinctness. 
+ if (node->has_ott_id()) + { + auto id = node->get_ott_id(); + auto& taxon = taxonomy.included_taxon_from_id(id)->get_data(); + if (node_data.is_extinct() != taxon.is_extinct()) + { + LOG(WARNING)<<"Higher taxon "<get_name()<<" is NOT extinct!"; + else + LOG(WARNING)<<" Child "<get_name()<<" is EXTINCT!"; + } + } + } + } +} bool read_tree_and_annotations(const fs::path & config_path, const fs::path & tree_path, @@ -1199,6 +1236,9 @@ bool read_tree_and_annotations(const fs::path & config_path, # endif auto [tree,sta] = tts.get_new_tree_and_annotations(config_path.native(), tree_path.native()); try { + + mark_summary_tree_nodes_extinct(tree, taxonomy); + sta = annotations_obj; json tref; tref["taxonomy"] = taxonomy.get_version(); From 552a0c43fc5c8952cfcfcb0219dbd20b2bdaae3e Mon Sep 17 00:00:00 2001 From: Benjamin Redelings Date: Tue, 11 Feb 2020 14:53:53 -0500 Subject: [PATCH 008/620] Record extinctness on arguson output. --- otc/ws/tolws.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/otc/ws/tolws.cpp b/otc/ws/tolws.cpp index aac3b39c..6640a9d8 100644 --- a/otc/ws/tolws.cpp +++ b/otc/ws/tolws.cpp @@ -97,6 +97,9 @@ void add_basic_node_info(const RichTaxonomy & taxonomy, const SumTreeNode_t & nd else noderepr["num_tips"] = nd.get_data().num_tips; + if (is_arguson) + noderepr["extinct"] = nd.get_data().is_extinct(); + if (nd.has_ott_id()) { auto nd_id = nd.get_ott_id(); const auto * nd_taxon = taxonomy.included_taxon_from_id(nd_id); From b9adf2ff65c0b495dcddc630baa8f7a69cfac8b3 Mon Sep 17 00:00:00 2001 From: Benjamin Redelings Date: Wed, 12 Feb 2020 16:41:55 -0500 Subject: [PATCH 009/620] Add stub file for prune-trees.cpp. It works but doesn't prune anything yet. 
otc-prune-trees cleaned_phylo/ot_934@tree1.tre:ot_934@tree1 --out-dir temp --config=config --- tools/meson.build | 1 + tools/prune-trees.cpp | 221 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 222 insertions(+) create mode 100644 tools/prune-trees.cpp diff --git a/tools/meson.build b/tools/meson.build index ee951703..f3cc8721 100644 --- a/tools/meson.build +++ b/tools/meson.build @@ -49,6 +49,7 @@ programs = [ ['regrafttaxonomygenerator', 'regraft-taxonomy-generator'], ['relabel-tree', 'relabel-tree'], ['tree-tool', 'tree-tool'], + ['prune-trees', 'prune-trees'], ['broken-taxa', 'broken-taxa'], ['unprune-solution-and-name-unnamed-nodes', 'unprune-solution-and-name-unnamed-nodes'], ] diff --git a/tools/prune-trees.cpp b/tools/prune-trees.cpp new file mode 100644 index 00000000..c0cdc0b9 --- /dev/null +++ b/tools/prune-trees.cpp @@ -0,0 +1,221 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "otc/error.h" +#include "otc/tree.h" +#include "otc/otcli.h" +#include "otc/tree_operations.h" +#include "otc/taxonomy/taxonomy.h" +#include "otc/taxonomy/flags.h" + +#include + +#include + +namespace fs = boost::filesystem; + +INITIALIZE_EASYLOGGINGPP + +using namespace otc; + +using std::string; +using std::vector; +using std::cout; +using std::cerr; +using std::endl; +using std::unique_ptr; + +using boost::spirit::qi::symbols; +using namespace boost::spirit; + +using Tree_t = RootedTree; + +namespace po = boost::program_options; +using po::variables_map; + +variables_map parse_cmd_line(int argc,char* argv[]) { + using namespace po; + + // named options + options_description invisible("Invisible options"); + invisible.add_options() + ("trees", value>()->composing(),"Filenames for newick trees") + ; + + options_description output("Output options"); + output.add_options() + ("out-dir",value(),"Output directory for the newick files") + ; + + options_description taxonomy("Taxonomy options"); + taxonomy.add_options() + 
("config,c",value(),"Config file containing flags to filter") + ("clean",value(),"Comma-separated string of flags to filter") + ("root,r", value(), "OTT id of root node of subtree to keep") + ("taxonomy", value(),"Directory name for the taxonomy") + ; + + + options_description visible; + visible.add(output).add(taxonomy).add(otc::standard_options()); + + // positional options + positional_options_description p; + p.add("trees", -1); + + variables_map vm = otc::parse_cmd_line_standard(argc, argv, + "Usage: prune-trees : : ... [OPTIONS]\n" + "Prune flagged taxa and remove unmapped tips, writing resulting Newick files to out-dir.", + visible, invisible, p); + return vm; +} + +unique_ptr get_tree(const string& filename) { + vector> trees; + std::function)> a = [&](unique_ptr t) {trees.push_back(std::move(t));return true;}; + ParsingRules rules; + rules.require_ott_ids = false; + otc::process_trees(filename,rules,a);//[&](unique_ptr t) {trees.push_back(std::move(t));return true;}); + return std::move(trees[0]); +} + +Tree_t::node_type* find_node_by_ott_id(Tree_t& tree, OttId root_ott_id, bool throw_if_not_found=true) { + for(auto nd: iter_pre(tree)) { + if (nd->has_ott_id() and nd->get_ott_id() == root_ott_id) { + return nd; + } + } + if (throw_if_not_found) { + throw OTCError() << "Can't find node with id " << root_ott_id << " in tree '" << tree.get_name() << "'"; + } + return nullptr; +} + +Tree_t::node_type* find_node_by_name(Tree_t& tree, const string& name) { + for(auto nd: iter_pre(tree)) { + if (nd->get_name().size() and nd->get_name() == name) { + return nd; + } + } + throw OTCError() << "Can't find node with name '" << name << "' in tree '" << tree.get_name() << "'"; +} + +unique_ptr truncate_to_subtree_by_ott_id(unique_ptr tree, OttId root_ott_id) { + auto root = find_node_by_ott_id(*tree, root_ott_id); + root->detach_this_node(); + unique_ptr tree2 (new Tree_t); + tree2->_set_root(root); + return tree2; +} + +void prune_from_tree(Tree_t & tree, const 
std::vector & tips) { + OttIdSet tipset; + for (auto t : tips) { + tipset.insert(t); + } + std::set todel; + for(auto nd: iter_pre(tree)) { + if (nd->has_ott_id() and tipset.count(nd->get_ott_id()) > 0) { + todel.insert(nd); + } + } + for (auto tdn : todel) { + tdn->detach_this_node(); + } +} + +void prune_tree(Tree_t & tree, const std::vector & tips) { + for (auto t_ott_id : tips) { + auto tn = find_node_by_ott_id(tree, t_ott_id); + if (tn && !tn->is_tip()) { + const auto children = all_children(tn); + for (auto c : children) { + c->detach_this_node(); + } + } + } +} + +std::string remove_ott_suffix(std::string name) { + static std::regex ott("(.*)[_ ]ott.*"); + std::smatch matches; + if (std::regex_match(name,matches,ott)) { + name = matches[1]; + } + return name; +} + +void write_tree(const Tree_t& tree, const fs::path& out_path) +{ + std::ofstream out_file( out_path.string() ); + if (not out_file) + { + throw OTCError() << "Could not create empty file '" << out_path.string() << "'"; + } + write_tree_as_newick(out_file, tree); +} + +std::pair split_on_last(const string& s, char c) +{ + auto pos = s.rfind(c); + if (pos == string::npos) + return {s,""}; + else + return {s.substr(0,pos),s.substr(pos)}; +} + +int main(int argc, char* argv[]) { + std::ios::sync_with_stdio(false); + try { + variables_map args = parse_cmd_line(argc,argv); + + // Where does this load the taxonomy from? + // Should I remove the --config argument? Or should we pass the propinquity config here? + auto taxonomy = load_taxonomy(args); + + if (not args.count("trees")) throw OTCError() << "No trees given!"; + + auto filenames = args["trees"].as>(); + + if (not args.count("out-dir")) + throw OTCError()<<"output directory not specified! Use --out-dir="; + + auto out_dir = fs::path(args["out-dir"].as()); + + for(auto& filename_name: filenames) + { + // We should split this on ':', and use the last part as a name to write out. 
+ auto [in_filename, out_name] = split_on_last(filename_name, ':'); + if (out_name.empty()) + throw OTCError()<<"tree file '"<:"; + + auto tree = get_tree(in_filename); + + // Uh... what tree are we supposed to write here? + write_tree(*tree, out_dir / (out_name + "-taxonomy.tre")); + + // Write out the pruned tree + write_tree(*tree, out_dir / (out_name + ".tre")); + } + } catch (std::exception& e) { + cerr << "otc-prune-trees: Error! " << e.what() << std::endl; + exit(1); + } +} + +// 1. Write a parser to read the lines faster +// 2. Avoid memory allocation -- by mmapping the taxonomy file? +// 3. Convert the flags into a bitmask +// 4. Should the Rank be a converted to an integer? +// 5. Can we assign OTT IDs to internal nodes of a tree while accounting for Incertae Sedis taxa? +// * What are the triplet-inference rules for the Incertae Sedis problem? + +// TODO: mmap via BOOST https://techoverflow.net/blog/2013/03/31/mmap-with-boost-iostreams-a-minimalist-example/ +// TODO: write out a reduced taxonomy + From f87cee9f5fae14bea1325616acfbdbabcd4d9e66 Mon Sep 17 00:00:00 2001 From: Benjamin Redelings Date: Tue, 18 Feb 2020 13:47:50 -0500 Subject: [PATCH 010/620] Add shared method to BaseTaxonomy for getting forwarded ids. 
--- otc/taxonomy/taxonomy.cpp | 28 ++++++++++++++++++++++++++++ otc/taxonomy/taxonomy.h | 14 ++++++++++++++ 2 files changed, 42 insertions(+) diff --git a/otc/taxonomy/taxonomy.cpp b/otc/taxonomy/taxonomy.cpp index 7bfdde68..94f0606f 100644 --- a/otc/taxonomy/taxonomy.cpp +++ b/otc/taxonomy/taxonomy.cpp @@ -337,6 +337,24 @@ BaseTaxonomy::BaseTaxonomy(const string& dir, } } +std::optional BaseTaxonomy::get_forwarded_id(OttId id) const +{ + auto id_or_reason = get_forwarded_id_or_reason(id); + if (std::holds_alternative(id_or_reason)) + return std::get(id_or_reason); + else + return {}; +} + +std::variant Taxonomy::get_forwarded_id_or_reason(OttId id) const +{ + if (index.count(id)) + return id; + if (auto iter = forwards.find(id); iter != forwards.end()) + return iter->second; + return reason_missing::unknown; +} + Taxonomy::Taxonomy(const string& dir, bitset<32> cf, OttId kr) @@ -411,6 +429,16 @@ Taxonomy::Taxonomy(const string& dir, read_forwards_file(path + "/forwards.tsv"); } +std::variant RichTaxonomy::get_forwarded_id_or_reason(OttId id) const +{ + const auto & td = tree->get_data(); + if (td.id_to_node.count(id)) + return id; + if (auto iter = forwards.find(id); iter != forwards.end()) + return iter->second; + return reason_missing::unknown; +} + RichTaxonomy::RichTaxonomy(const std::string& dir, std::bitset<32> cf, OttId kr) :BaseTaxonomy(dir, cf, kr) { { //braced to reduce scope of light_taxonomy to reduced memory diff --git a/otc/taxonomy/taxonomy.h b/otc/taxonomy/taxonomy.h index 0154db07..b0f2a7bb 100644 --- a/otc/taxonomy/taxonomy.h +++ b/otc/taxonomy/taxonomy.h @@ -17,6 +17,7 @@ #include #include #include +#include #include "otc/taxonomy/flags.h" #include "otc/error.h" @@ -161,6 +162,8 @@ inline std::vector get_index_vec(std::size_t sz) { return iv; } +enum class reason_missing {unknown, not_an_id, never_minted_id, deprecated, pruned, forwarded, broken}; + class BaseTaxonomy { protected: std::unordered_map forwards; @@ -179,6 +182,12 @@ class 
BaseTaxonomy { const std::string & get_version_number() const { return version_number; } + + // We should probably generalize this to record EITHER an OttId OR a reason why the OttId isn't found. + virtual std::variant get_forwarded_id_or_reason(OttId id) const = 0; + std::optional get_forwarded_id(OttId id) const; + + virtual ~BaseTaxonomy() = default; }; class Taxonomy: public std::vector, public BaseTaxonomy { @@ -192,6 +201,8 @@ class Taxonomy: public std::vector, public BaseTaxonomy { public: template std::unique_ptr get_tree(std::function) const; + std::variant get_forwarded_id_or_reason(OttId id) const; + TaxonomyRecord& record_from_id(OttId id); const TaxonomyRecord& record_from_id(OttId id) const; @@ -321,6 +332,9 @@ class RichTaxonomy: public BaseTaxonomy { /// Load the taxonomy from directory dir, and apply cleaning flags cf, and keep subtree below kr RichTaxonomy(const std::string& dir, std::bitset<32> cf = std::bitset<32>(), OttId kr = -1); RichTaxonomy(RichTaxonomy &&) = default; + + std::variant get_forwarded_id_or_reason(OttId id) const; + const RTRichTaxNode * included_taxon_from_id(OttId ott_id) const { //Returns node * or nullptr if not found. const auto & td = tree->get_data(); From f01ade654d56ab6624c0238603beea36cd306b48 Mon Sep 17 00:00:00 2001 From: Benjamin Redelings Date: Tue, 18 Feb 2020 13:49:03 -0500 Subject: [PATCH 011/620] Move prune_unmapped_leaves(tree, taxonomy) to new header ws/prune.h Also use BaseTaxonomy, so that we can share code using Taxonomy and RichTaxonomy. 
--- otc/ws/conflictws.cpp | 42 +------------------------------ otc/ws/prune.h | 57 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 58 insertions(+), 41 deletions(-) create mode 100644 otc/ws/prune.h diff --git a/otc/ws/conflictws.cpp b/otc/ws/conflictws.cpp index b1818b9c..e056a3e2 100644 --- a/otc/ws/conflictws.cpp +++ b/otc/ws/conflictws.cpp @@ -4,6 +4,7 @@ #include "otc/ws/trees_to_serve.h" #include "otc/conflict.h" #include "otc/ws/nexson/nexson.h" +#include "otc/ws/prune.h" #include #include @@ -500,47 +501,6 @@ void check_all_nodes_have_node_names(const ConflictTree& query_tree) { } -template -pair prune_unmapped_leaves(Tree& tree, const RichTaxonomy& tax) { - int mapped_leaves = 0; - int unmapped_leaves = 0; - vector leaves; - for(auto leaf: iter_leaf(tree)) { - if (leaf->has_ott_id()) { - auto tax_node = tax.included_taxon_from_id(leaf->get_ott_id()); - if (tax_node) { - // Handle forwards - auto id = tax_node->get_ott_id(); - if (id != leaf->get_ott_id()) { - leaf->set_ott_id(id); - } - // Count as mapped - mapped_leaves++; - continue; - } - } - // Mark leaf for deletion - leaves.push_back(leaf); - unmapped_leaves++; - } - for(auto leaf: leaves) { - while (leaf and leaf->is_tip()) { - auto parent = leaf->get_parent(); - if (parent) { - leaf->detach_this_node(); - delete leaf; - leaf = parent; - } else { - delete leaf; - tree._set_root(nullptr); - } - } - assert(tree.get_root()); - } - return {mapped_leaves, unmapped_leaves}; -} - - // Find the smallest set C of taxa (leaf or internal) that we need to add as children of `id` so that // i) all descendents of id are descendants of one of these children // ii) all taxa in C are present in the summary tree diff --git a/otc/ws/prune.h b/otc/ws/prune.h new file mode 100644 index 00000000..424cdc62 --- /dev/null +++ b/otc/ws/prune.h @@ -0,0 +1,57 @@ +#ifndef PRUNE_H +#define PRUNE_H + +#include "otc/tree.h" +#include "otc/taxonomy/taxonomy.h" + +namespace otc { + +template +std::pair 
prune_unmapped_leaves(Tree& tree, const BaseTaxonomy& tax) +{ + int mapped_leaves = 0; + int unmapped_leaves = 0; + + std::vector leaves; + for(auto leaf: iter_leaf(tree)) + { + if (leaf->has_ott_id()) + { + auto id1 = leaf->get_ott_id(); + auto id2 = tax.get_forwarded_id(id1); + if (id2) + { + // Handle forwards + if (*id2 != id1) + leaf->set_ott_id(*id2); + + // Count as mapped + mapped_leaves++; + continue; + } + } + // Mark leaf for deletion + leaves.push_back(leaf); + unmapped_leaves++; + } + for(auto leaf: leaves) { + while (leaf and leaf->is_tip()) { + auto parent = leaf->get_parent(); + if (parent) { + leaf->detach_this_node(); + delete leaf; + leaf = parent; + } else { + delete leaf; + tree._set_root(nullptr); + } + } + assert(tree.get_root()); + } + return {mapped_leaves, unmapped_leaves}; +} + + +} + +#endif From 5481997ece09da53dec84ccedbb5061ddefba58c Mon Sep 17 00:00:00 2001 From: Benjamin Redelings Date: Tue, 18 Feb 2020 13:49:14 -0500 Subject: [PATCH 012/620] Prune unmapped leaves in prune-trees.cpp --- tools/prune-trees.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tools/prune-trees.cpp b/tools/prune-trees.cpp index c0cdc0b9..bc83a89f 100644 --- a/tools/prune-trees.cpp +++ b/tools/prune-trees.cpp @@ -10,6 +10,7 @@ #include "otc/error.h" #include "otc/tree.h" #include "otc/otcli.h" +#include "otc/newick.h" #include "otc/tree_operations.h" #include "otc/taxonomy/taxonomy.h" #include "otc/taxonomy/flags.h" @@ -18,6 +19,8 @@ #include +#include "otc/ws/prune.h" + namespace fs = boost::filesystem; INITIALIZE_EASYLOGGINGPP @@ -197,6 +200,8 @@ int main(int argc, char* argv[]) { auto tree = get_tree(in_filename); + prune_unmapped_leaves(*tree, taxonomy); + // Uh... what tree are we supposed to write here? 
write_tree(*tree, out_dir / (out_name + "-taxonomy.tre")); From 20c77f9583dcbf60f6f7ff8cb5ff13383a3a7934 Mon Sep 17 00:00:00 2001 From: Benjamin Redelings Date: Tue, 18 Feb 2020 14:17:51 -0500 Subject: [PATCH 013/620] Fix inconsistent name: get_forwarded_id -> get_unforwarded_id( ). record_from_unforwarded_id( ) uses "unforwarded" to mean an id that will not be forwarded, as opposed to an id that has already been forwarded. Let's be consistent. --- otc/taxonomy/taxonomy.cpp | 8 ++++---- otc/taxonomy/taxonomy.h | 8 ++++---- otc/ws/prune.h | 3 +-- 3 files changed, 9 insertions(+), 10 deletions(-) diff --git a/otc/taxonomy/taxonomy.cpp b/otc/taxonomy/taxonomy.cpp index 94f0606f..baf164fa 100644 --- a/otc/taxonomy/taxonomy.cpp +++ b/otc/taxonomy/taxonomy.cpp @@ -337,16 +337,16 @@ BaseTaxonomy::BaseTaxonomy(const string& dir, } } -std::optional BaseTaxonomy::get_forwarded_id(OttId id) const +std::optional BaseTaxonomy::get_unforwarded_id(OttId id) const { - auto id_or_reason = get_forwarded_id_or_reason(id); + auto id_or_reason = get_unforwarded_id_or_reason(id); if (std::holds_alternative(id_or_reason)) return std::get(id_or_reason); else return {}; } -std::variant Taxonomy::get_forwarded_id_or_reason(OttId id) const +std::variant Taxonomy::get_unforwarded_id_or_reason(OttId id) const { if (index.count(id)) return id; @@ -429,7 +429,7 @@ Taxonomy::Taxonomy(const string& dir, read_forwards_file(path + "/forwards.tsv"); } -std::variant RichTaxonomy::get_forwarded_id_or_reason(OttId id) const +std::variant RichTaxonomy::get_unforwarded_id_or_reason(OttId id) const { const auto & td = tree->get_data(); if (td.id_to_node.count(id)) diff --git a/otc/taxonomy/taxonomy.h b/otc/taxonomy/taxonomy.h index b0f2a7bb..6ad72b94 100644 --- a/otc/taxonomy/taxonomy.h +++ b/otc/taxonomy/taxonomy.h @@ -184,8 +184,8 @@ class BaseTaxonomy { } // We should probably generalize this to record EITHER an OttId OR a reason why the OttId isn't found. 
- virtual std::variant get_forwarded_id_or_reason(OttId id) const = 0; - std::optional get_forwarded_id(OttId id) const; + virtual std::variant get_unforwarded_id_or_reason(OttId id) const = 0; + std::optional get_unforwarded_id(OttId id) const; virtual ~BaseTaxonomy() = default; }; @@ -201,7 +201,7 @@ class Taxonomy: public std::vector, public BaseTaxonomy { public: template std::unique_ptr get_tree(std::function) const; - std::variant get_forwarded_id_or_reason(OttId id) const; + std::variant get_unforwarded_id_or_reason(OttId id) const; TaxonomyRecord& record_from_id(OttId id); @@ -333,7 +333,7 @@ class RichTaxonomy: public BaseTaxonomy { RichTaxonomy(const std::string& dir, std::bitset<32> cf = std::bitset<32>(), OttId kr = -1); RichTaxonomy(RichTaxonomy &&) = default; - std::variant get_forwarded_id_or_reason(OttId id) const; + std::variant get_unforwarded_id_or_reason(OttId id) const; const RTRichTaxNode * included_taxon_from_id(OttId ott_id) const { //Returns node * or nullptr if not found. 
diff --git a/otc/ws/prune.h b/otc/ws/prune.h index 424cdc62..87156b9e 100644 --- a/otc/ws/prune.h +++ b/otc/ws/prune.h @@ -18,7 +18,7 @@ std::pair prune_unmapped_leaves(Tree& tree, const BaseTaxonomy& tax) if (leaf->has_ott_id()) { auto id1 = leaf->get_ott_id(); - auto id2 = tax.get_forwarded_id(id1); + auto id2 = tax.get_unforwarded_id(id1); if (id2) { // Handle forwards @@ -51,7 +51,6 @@ std::pair prune_unmapped_leaves(Tree& tree, const BaseTaxonomy& tax) return {mapped_leaves, unmapped_leaves}; } - } #endif From 2d157f0031466c4008e32cb9637e2a233f38b90e Mon Sep 17 00:00:00 2001 From: Benjamin Redelings Date: Tue, 18 Feb 2020 16:42:02 -0500 Subject: [PATCH 014/620] Move some general-use functions out of otc/ws/tolws.h --- otc/induced_tree.h | 17 ++++++++++++++++ otc/tree_operations.h | 26 +++++++++++++++++++++++++ otc/ws/tolws.h | 45 ------------------------------------------- 3 files changed, 43 insertions(+), 45 deletions(-) diff --git a/otc/induced_tree.h b/otc/induced_tree.h index 4fb9e5a2..6494a1b1 100644 --- a/otc/induced_tree.h +++ b/otc/induced_tree.h @@ -177,6 +177,23 @@ std::unique_ptr get_induced_tree(const Tree_In1_t& T1, return induced_tree; } +// Get a list of nodes in T2 that are leaves in T1. +// The nodes in T2 do NOT need to be leaves of T2. 
+ +template +auto get_induced_nodes(const Tree1_t& T1, const Tree2_t& T2) { + auto& ott_to_nodes2 = T2.get_data().id_to_node; + std::vector nodes; + for(auto leaf: iter_leaf_const(T1)) { + auto id = leaf->get_ott_id(); + auto it = ott_to_nodes2.find(id); + if (it != ott_to_nodes2.end()) { + nodes.push_back(it->second); + } + } + return nodes; +} + }// namespace otc #endif diff --git a/otc/tree_operations.h b/otc/tree_operations.h index e9f8035e..a0a0e4ae 100644 --- a/otc/tree_operations.h +++ b/otc/tree_operations.h @@ -36,6 +36,17 @@ std::string newick(const T &t); template T* bisect_branch_with_new_child(T* x); +template +inline std::size_t n_leaves(const Tree& T) { +#pragma clang diagnostic ignored "-Wunused-variable" +#pragma GCC diagnostic ignored "-Wunused-variable" + std::size_t count = 0; + for(auto nd: iter_leaf_const(T)){ + count++; + } + return count; +} + template unsigned int count_polytomies(const T & tree); @@ -1510,6 +1521,21 @@ inline void set_name_and_maybe_ott_id(const T & src_node, T & dest_node) { } } +template +void delete_tip_and_monotypic_ancestors(T& tree, typename T::node_type* node) { + assert(node->is_tip()); + while (node and node->is_tip()) { + auto parent = node->get_parent(); + if (not parent) { + tree._set_root(nullptr); + } else { + node->detach_this_node(); + } + delete node; + node = parent; + } +} + }// namespace otc #endif diff --git a/otc/ws/tolws.h b/otc/ws/tolws.h index c1988a6b..409b1e65 100644 --- a/otc/ws/tolws.h +++ b/otc/ws/tolws.h @@ -423,50 +423,5 @@ inline void add_taxon_record_info(const RichTaxonomy & , taxonrepr["ott_id"] = record.id; } -template -inline std::size_t n_leaves(const Tree_t& T) { -#pragma clang diagnostic ignored "-Wunused-variable" -#pragma GCC diagnostic ignored "-Wunused-variable" - std::size_t count = 0; - for(auto nd: iter_leaf_const(T)){ - count++; - } - return count; -} - - -// Get a list of nodes in T2 that are leaves in T1. -// The nodes in T2 do NOT need to be leaves of T2. 
- -template -auto get_induced_nodes(const Tree1_t& T1, const Tree2_t& T2) { - auto& ott_to_nodes2 = T2.get_data().id_to_node; - std::vector nodes; - for(auto leaf: iter_leaf_const(T1)) { - auto id = leaf->get_ott_id(); - auto it = ott_to_nodes2.find(id); - if (it != ott_to_nodes2.end()) { - nodes.push_back(it->second); - } - } - return nodes; -} - -template -void delete_tip_and_monotypic_ancestors(T& tree, typename T::node_type* node) { - assert(node->is_tip()); - while (node and node->is_tip()) { - auto parent = node->get_parent(); - if (not parent) { - tree._set_root(nullptr); - } else { - node->detach_this_node(); - } - delete node; - node = parent; - } -} - - } // namespace otc #endif From 49bf1f271e8d0a3313f69dfe64c22f82293ea46a Mon Sep 17 00:00:00 2001 From: Benjamin Redelings Date: Tue, 18 Feb 2020 16:42:23 -0500 Subject: [PATCH 015/620] Prune ancestral leaves in otc-prune-trees. --- tools/prune-trees.cpp | 62 ++++++++++++++++++++++++++++++++++--------- 1 file changed, 50 insertions(+), 12 deletions(-) diff --git a/tools/prune-trees.cpp b/tools/prune-trees.cpp index bc83a89f..e9204ea8 100644 --- a/tools/prune-trees.cpp +++ b/tools/prune-trees.cpp @@ -12,6 +12,7 @@ #include "otc/otcli.h" #include "otc/newick.h" #include "otc/tree_operations.h" +#include "otc/induced_tree.h" #include "otc/taxonomy/taxonomy.h" #include "otc/taxonomy/flags.h" @@ -37,7 +38,11 @@ using std::unique_ptr; using boost::spirit::qi::symbols; using namespace boost::spirit; -using Tree_t = RootedTree; +struct RTNodeDepth { + int depth = 0; // depth = number of nodes to the root of the tree including the endpoints (so depth of root = 1) +}; + +using Tree_t = RootedTree; namespace po = boost::program_options; using po::variables_map; @@ -164,6 +169,43 @@ void write_tree(const Tree_t& tree, const fs::path& out_path) write_tree_as_newick(out_file, tree); } + +// OK, so we need to make an induced tree from a taxonomy. 
+ +// Remove leaves (and their monotypic ancestors) in the query tree +// that are the ancestors of other leaves in the query tree. + +void prune_ancestral_leaves(Tree_t& query_tree, const Tree_t& taxonomy_tree, const std::unordered_map& tax_id_to_node) +{ + // 1. Get an induced taxonomy tree starting from the leaves of the query tree + auto query_id_to_node = get_ottid_to_const_node_map(query_tree); + auto induced_leaves = get_induced_leaves(query_tree, query_id_to_node, taxonomy_tree, tax_id_to_node); + auto mrca = [](const Tree_t::node_type* n1, const Tree_t::node_type* n2) {return mrca_from_depth(n1,n2);}; + auto induced_taxonomy = get_induced_tree(induced_leaves, mrca); + + LOG(WARNING)<<"induced taxonomy has "< nodes_to_prune; + for(auto leaf: iter_leaf(query_tree)) + { + auto ottid = leaf->get_ott_id(); + auto tax_node = ottid_to_induced_tax_node.at(ottid); + if (not tax_node->is_tip()) { + nodes_to_prune.push_back(leaf); + } + } + + // 3. Prune the leaves that we selected while walking the query tree + for(auto node: nodes_to_prune) + delete_tip_and_monotypic_ancestors(query_tree, node); + + LOG(WARNING)<<"query tree pruned down to "< split_on_last(const string& s, char c) { auto pos = s.rfind(c); @@ -182,6 +224,10 @@ int main(int argc, char* argv[]) { // Should I remove the --config argument? Or should we pass the propinquity config here? auto taxonomy = load_taxonomy(args); + auto taxonomy_tree = taxonomy.get_tree([](auto&){return "";}); + compute_depth(*taxonomy_tree); + auto tax_node_map = get_ottid_to_const_node_map(*taxonomy_tree); + if (not args.count("trees")) throw OTCError() << "No trees given!"; auto filenames = args["trees"].as>(); @@ -199,9 +245,12 @@ int main(int argc, char* argv[]) { throw OTCError()<<"tree file '"<:"; auto tree = get_tree(in_filename); + compute_depth(*tree); prune_unmapped_leaves(*tree, taxonomy); + prune_ancestral_leaves(*tree, *taxonomy_tree, tax_node_map); + // Uh... what tree are we supposed to write here? 
write_tree(*tree, out_dir / (out_name + "-taxonomy.tre")); @@ -213,14 +262,3 @@ int main(int argc, char* argv[]) { exit(1); } } - -// 1. Write a parser to read the lines faster -// 2. Avoid memory allocation -- by mmapping the taxonomy file? -// 3. Convert the flags into a bitmask -// 4. Should the Rank be a converted to an integer? -// 5. Can we assign OTT IDs to internal nodes of a tree while accounting for Incertae Sedis taxa? -// * What are the triplet-inference rules for the Incertae Sedis problem? - -// TODO: mmap via BOOST https://techoverflow.net/blog/2013/03/31/mmap-with-boost-iostreams-a-minimalist-example/ -// TODO: write out a reduced taxonomy - From d6428eda1e9ffed795fe44f9107aa8d2e89d4161 Mon Sep 17 00:00:00 2001 From: Benjamin Redelings Date: Wed, 19 Feb 2020 17:36:10 -0500 Subject: [PATCH 016/620] Move prune_duplicate_ottids( ) from conflictws.cpp to otc/ws/prune.h --- otc/ws/conflictws.cpp | 22 ---------------------- otc/ws/prune.h | 22 ++++++++++++++++++++++ 2 files changed, 22 insertions(+), 22 deletions(-) diff --git a/otc/ws/conflictws.cpp b/otc/ws/conflictws.cpp index e056a3e2..e396099d 100644 --- a/otc/ws/conflictws.cpp +++ b/otc/ws/conflictws.cpp @@ -451,28 +451,6 @@ void prune_ancestral_leaves(ConflictTree& query_tree, const RichTaxonomy& taxono LOG(WARNING)<<"query tree pruned down to "< -void prune_duplicate_ottids(Tree& tree) { - vector leaves; - for(auto leaf: iter_leaf(tree)) { - leaves.push_back(leaf); - } - map node_ptrs; - for(auto leaf: leaves) { - if (not leaf->has_ott_id()) { - continue; - } - auto id = leaf->get_ott_id(); - // If the OTT id is new, then add the node as canonical representative of the OTT id - if (not node_ptrs.count(id)) { - node_ptrs.insert({id, leaf}); - } else { - // Otherwise delete the non-canonical OTT id and its ancestors - delete_tip_and_monotypic_ancestors(tree, leaf); - } - } -} - void check_all_leaves_have_ott_ids(const ConflictTree& query_tree) { for(auto leaf: iter_leaf_const(query_tree)) { if 
(leaf->has_ott_id()) { diff --git a/otc/ws/prune.h b/otc/ws/prune.h index 87156b9e..15e33a35 100644 --- a/otc/ws/prune.h +++ b/otc/ws/prune.h @@ -51,6 +51,28 @@ std::pair prune_unmapped_leaves(Tree& tree, const BaseTaxonomy& tax) return {mapped_leaves, unmapped_leaves}; } +template +void prune_duplicate_ottids(Tree& tree) { + std::vector leaves; + for(auto leaf: iter_leaf(tree)) { + leaves.push_back(leaf); + } + std::map node_ptrs; + for(auto leaf: leaves) { + if (not leaf->has_ott_id()) { + continue; + } + auto id = leaf->get_ott_id(); + // If the OTT id is new, then add the node as canonical representative of the OTT id + if (not node_ptrs.count(id)) { + node_ptrs.insert({id, leaf}); + } else { + // Otherwise delete the non-canonical OTT id and its ancestors + delete_tip_and_monotypic_ancestors(tree, leaf); + } + } +} + } #endif From 5f075082c985b78ac138b9f130d9014cebbcbd49 Mon Sep 17 00:00:00 2001 From: Benjamin Redelings Date: Wed, 19 Feb 2020 17:36:33 -0500 Subject: [PATCH 017/620] Now prune duplicate OTT ids. --- tools/prune-trees.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/prune-trees.cpp b/tools/prune-trees.cpp index e9204ea8..a2116f20 100644 --- a/tools/prune-trees.cpp +++ b/tools/prune-trees.cpp @@ -251,6 +251,8 @@ int main(int argc, char* argv[]) { prune_ancestral_leaves(*tree, *taxonomy_tree, tax_node_map); + prune_duplicate_ottids(*tree); + // Uh... what tree are we supposed to write here? write_tree(*tree, out_dir / (out_name + "-taxonomy.tre")); From ce19fc333bff749fd803368909a27d0eca4fb7ff Mon Sep 17 00:00:00 2001 From: Benjamin Redelings Date: Wed, 19 Feb 2020 18:37:31 -0500 Subject: [PATCH 018/620] Fix problems forwarding to id -1. 
--- otc/taxonomy/taxonomy.cpp | 58 +++++++++++++++++++++++---------------- 1 file changed, 34 insertions(+), 24 deletions(-) diff --git a/otc/taxonomy/taxonomy.cpp b/otc/taxonomy/taxonomy.cpp index baf164fa..b8d5d13a 100644 --- a/otc/taxonomy/taxonomy.cpp +++ b/otc/taxonomy/taxonomy.cpp @@ -484,7 +484,9 @@ RichTaxonomy::RichTaxonomy(const std::string& dir, std::bitset<32> cf, OttId kr) } -void Taxonomy::read_forwards_file(string filepath) { +void Taxonomy::read_forwards_file(string filepath) +{ + // 1. Read forwards file and create id -> forwarded_id map ifstream forwards_stream(filepath); if (forwards_stream) { string line; @@ -500,33 +502,41 @@ void Taxonomy::read_forwards_file(string filepath) { forwards[check_ott_id_size(old_id)] = check_ott_id_size(new_id); } } - // walk through the full forwards table, and try to find any that are multiple + + + // 2. walk through the full forwards table, and try to find any that are multiple // step paths of forwards. - unordered_set need_iterating; - for (auto old_new : forwards) { - auto new_id = old_new.second; - if (new_id >= 0) { - OttId nnid = this->map(new_id); - if (nnid != new_id) { - need_iterating.insert(old_new.first); - } + unordered_set failed_forwards; + for (auto& [old_id,new_id] : forwards) + { + unordered_set visited; + visited.insert(old_id); + visited.insert(new_id); + + // Iterate the new_id + assert(new_id > 0); + while(forwards.count(new_id)) + { + new_id = forwards.at(new_id); + assert(new_id > 0); + if (visited.count(new_id)) + throw OTCError()<<"forwarding loop from id "< scratch; - for (auto old_id: need_iterating) { - auto fm_it = forwards.find(old_id); - assert(fm_it != forwards.end()); - auto curr_new_id = fm_it->second; - OttId nnid = this->map(fm_it->second); - assert(nnid != fm_it->second); - fm_it->second = nnid; - if (nnid > 0 && nnid != this->map(nnid)) { - scratch.insert(old_id); - } + + if (not index.count(new_id)) + { + LOG(DEBUG) << "OTT id "<= 0); } + + // 3. 
Remove forwards that forward to ids that are non-existent for any reason + for(auto id: failed_forwards) + forwards.erase(id); } std::string get_taxonomy_dir(const variables_map& args) { From 61fc848b4df45aa2958e52daadd93831ff6f9be1 Mon Sep 17 00:00:00 2001 From: Benjamin Redelings Date: Wed, 19 Feb 2020 18:51:15 -0500 Subject: [PATCH 019/620] Colon is not part of tree name. --- tools/prune-trees.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/prune-trees.cpp b/tools/prune-trees.cpp index a2116f20..3e35304e 100644 --- a/tools/prune-trees.cpp +++ b/tools/prune-trees.cpp @@ -212,7 +212,7 @@ std::pair split_on_last(const string& s, char c) if (pos == string::npos) return {s,""}; else - return {s.substr(0,pos),s.substr(pos)}; + return {s.substr(0,pos),s.substr(pos+1)}; } int main(int argc, char* argv[]) { From ac52c916b906c4033fdd82971d21f64dfafae2b8 Mon Sep 17 00:00:00 2001 From: Benjamin Redelings Date: Wed, 19 Feb 2020 18:51:29 -0500 Subject: [PATCH 020/620] Don't write -taxonomy.tre --- tools/prune-trees.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/prune-trees.cpp b/tools/prune-trees.cpp index 3e35304e..14dc5908 100644 --- a/tools/prune-trees.cpp +++ b/tools/prune-trees.cpp @@ -254,7 +254,7 @@ int main(int argc, char* argv[]) { prune_duplicate_ottids(*tree); // Uh... what tree are we supposed to write here? - write_tree(*tree, out_dir / (out_name + "-taxonomy.tre")); + // write_tree(*tree, out_dir / (out_name + "-taxonomy.tre")); // Write out the pruned tree write_tree(*tree, out_dir / (out_name + ".tre")); From ceb4c16157827eaf06d98c9ec0b05e4da871cc8e Mon Sep 17 00:00:00 2001 From: Benjamin Redelings Date: Wed, 19 Feb 2020 18:51:48 -0500 Subject: [PATCH 021/620] Log some progress info and statistics on pruned leaves. 
--- tools/prune-trees.cpp | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/tools/prune-trees.cpp b/tools/prune-trees.cpp index 14dc5908..472e20e1 100644 --- a/tools/prune-trees.cpp +++ b/tools/prune-trees.cpp @@ -183,8 +183,6 @@ void prune_ancestral_leaves(Tree_t& query_tree, const Tree_t& taxonomy_tree, con auto mrca = [](const Tree_t::node_type* n1, const Tree_t::node_type* n2) {return mrca_from_depth(n1,n2);}; auto induced_taxonomy = get_induced_tree(induced_leaves, mrca); - LOG(WARNING)<<"induced taxonomy has "<([](auto&){return "";}); compute_depth(*taxonomy_tree); auto tax_node_map = get_ottid_to_const_node_map(*taxonomy_tree); + LOG(INFO)<<"done."; if (not args.count("trees")) throw OTCError() << "No trees given!"; @@ -247,17 +247,28 @@ int main(int argc, char* argv[]) { auto tree = get_tree(in_filename); compute_depth(*tree); + int n_leaves1 = n_leaves(*tree); + prune_unmapped_leaves(*tree, taxonomy); + int n_leaves2 = n_leaves(*tree); + prune_ancestral_leaves(*tree, *taxonomy_tree, tax_node_map); + int n_leaves3 = n_leaves(*tree); + prune_duplicate_ottids(*tree); + int n_leaves4 = n_leaves(*tree); + // Uh... what tree are we supposed to write here? 
// write_tree(*tree, out_dir / (out_name + "-taxonomy.tre")); // Write out the pruned tree write_tree(*tree, out_dir / (out_name + ".tre")); + + LOG(INFO)<<"Pruning tree '"< Date: Tue, 25 Feb 2020 15:28:34 -0500 Subject: [PATCH 022/620] Rename otc-prune-trees to otc-prune-clean --- tools/meson.build | 2 +- tools/{prune-trees.cpp => prune-clean.cpp} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename tools/{prune-trees.cpp => prune-clean.cpp} (100%) diff --git a/tools/meson.build b/tools/meson.build index f3cc8721..dddc35fc 100644 --- a/tools/meson.build +++ b/tools/meson.build @@ -49,7 +49,7 @@ programs = [ ['regrafttaxonomygenerator', 'regraft-taxonomy-generator'], ['relabel-tree', 'relabel-tree'], ['tree-tool', 'tree-tool'], - ['prune-trees', 'prune-trees'], + ['prune-clean', 'prune-clean'], ['broken-taxa', 'broken-taxa'], ['unprune-solution-and-name-unnamed-nodes', 'unprune-solution-and-name-unnamed-nodes'], ] diff --git a/tools/prune-trees.cpp b/tools/prune-clean.cpp similarity index 100% rename from tools/prune-trees.cpp rename to tools/prune-clean.cpp From 91d2d5abdaaee56dae28a23c1debff7c919c25c6 Mon Sep 17 00:00:00 2001 From: Benjamin Redelings Date: Tue, 25 Feb 2020 15:41:07 -0500 Subject: [PATCH 023/620] Fix removing leaves that are ancestors of other leaves. --- tools/prune-clean.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/prune-clean.cpp b/tools/prune-clean.cpp index 472e20e1..c6dfcd8f 100644 --- a/tools/prune-clean.cpp +++ b/tools/prune-clean.cpp @@ -179,7 +179,7 @@ void prune_ancestral_leaves(Tree_t& query_tree, const Tree_t& taxonomy_tree, con { // 1. 
Get an induced taxonomy tree starting from the leaves of the query tree auto query_id_to_node = get_ottid_to_const_node_map(query_tree); - auto induced_leaves = get_induced_leaves(query_tree, query_id_to_node, taxonomy_tree, tax_id_to_node); + auto induced_leaves = get_induced_leaves(taxonomy_tree, tax_id_to_node, query_tree, query_id_to_node); auto mrca = [](const Tree_t::node_type* n1, const Tree_t::node_type* n2) {return mrca_from_depth(n1,n2);}; auto induced_taxonomy = get_induced_tree(induced_leaves, mrca); @@ -253,11 +253,11 @@ int main(int argc, char* argv[]) { int n_leaves2 = n_leaves(*tree); - prune_ancestral_leaves(*tree, *taxonomy_tree, tax_node_map); + prune_duplicate_ottids(*tree); int n_leaves3 = n_leaves(*tree); - prune_duplicate_ottids(*tree); + prune_ancestral_leaves(*tree, *taxonomy_tree, tax_node_map); int n_leaves4 = n_leaves(*tree); @@ -268,7 +268,7 @@ int main(int argc, char* argv[]) { write_tree(*tree, out_dir / (out_name + ".tre")); LOG(INFO)<<"Pruning tree '"< Date: Wed, 4 Mar 2020 18:43:12 -0500 Subject: [PATCH 024/620] Print the correct leaf label after forwarding ott ids. --- tools/prune-clean.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tools/prune-clean.cpp b/tools/prune-clean.cpp index c6dfcd8f..7e5af460 100644 --- a/tools/prune-clean.cpp +++ b/tools/prune-clean.cpp @@ -251,6 +251,14 @@ int main(int argc, char* argv[]) { prune_unmapped_leaves(*tree, taxonomy); + // Clear the existing names to generate ott_XXXX names from the ids, + // since the ids have just been forwarded by prune_unmapped_leaves( ). + for(auto leaf: iter_leaf(*tree)) + { + assert(leaf->has_ott_id()); + leaf->set_name(""); + } + int n_leaves2 = n_leaves(*tree); prune_duplicate_ottids(*tree); From a757667691b0883b84bb882393a8533f243b60b1 Mon Sep 17 00:00:00 2001 From: Benjamin Redelings Date: Wed, 4 Mar 2020 20:04:25 -0500 Subject: [PATCH 025/620] Linking against libotcetera requires linking against restbed as well. 
--- otc/meson.build | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/otc/meson.build b/otc/meson.build index 2c32900f..cca1856c 100644 --- a/otc/meson.build +++ b/otc/meson.build @@ -41,5 +41,5 @@ otc_lib = shared_library('otcetera', dependencies: [boost, restbed], install: true) -libotcetera = declare_dependency(include_directories: otc_inc, link_with: otc_lib) +libotcetera = declare_dependency(include_directories: otc_inc, link_with: otc_lib, dependencies: [restbed]) From 414bded85a48b9d35fc08d25aebf5923401e9247 Mon Sep 17 00:00:00 2001 From: "Mark T. Holder" Date: Mon, 23 Mar 2020 13:39:55 -0500 Subject: [PATCH 026/620] date on copyright --- LICENSE | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/LICENSE b/LICENSE index 3df8c8c2..96bb1d88 100644 --- a/LICENSE +++ b/LICENSE @@ -4,7 +4,7 @@ flag, then you can choose between a GPL or BSD license. If you use the --with-webservices=yes configure flag, only the GPL is permitted. -Copyright (c) 2015-2017 Mark T. Holder and Benjamin D. Redelings +Copyright (c) 2015-2020 Mark T. Holder and Benjamin D. Redelings All rights reserved. otcetera builds upon NCL and is distributed under the same license: From bbbb87207022d77f079bc3df71e4c1ecccb2f066 Mon Sep 17 00:00:00 2001 From: Benjamin Redelings Date: Wed, 1 Apr 2020 12:03:32 -0700 Subject: [PATCH 027/620] Use structured binding. 
--- otc/ws/tnrsws.cpp | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/otc/ws/tnrsws.cpp b/otc/ws/tnrsws.cpp index 0c853bac..7a5acf63 100644 --- a/otc/ws/tnrsws.cpp +++ b/otc/ws/tnrsws.cpp @@ -580,14 +580,15 @@ std::string tnrs_contexts_ws_method() { // curl -X POST https://api.opentreeoflife.org/v3/tnrs/infer_context -H "content-type:application/json" -d '{"names":["Pan","Homo","Mus","Bufo","Drosophila"]}' // curl -X POST http://localhost:1984/v3/tnrs/infer_context -H "content-type:application/json" -d '{"names":["Pan","Homo","Mus","Bufo","Drosophila"]}' -string tnrs_infer_context_ws_method(const vector& names, const RichTaxonomy& taxonomy) { - auto results = infer_context_and_ambiguous_names(taxonomy, names); - auto& context = results.first; - auto& ambiguous_names = results.second; +string tnrs_infer_context_ws_method(const vector& names, const RichTaxonomy& taxonomy) +{ + auto [context, ambiguous_names] = infer_context_and_ambiguous_names(taxonomy, names); + json response; - response["context_name"] = results.first->name; - response["context_ott_id"] = results.first->ott_id; + response["context_name"] = context->name; + response["context_ott_id"] = context->ott_id; response["ambiguous_names"] = ambiguous_names; + return response.dump(1); } From de60dd55b5ed78020c7af5ebf8c56842410cd71d Mon Sep 17 00:00:00 2001 From: Benjamin Redelings Date: Wed, 1 Apr 2020 13:30:11 -0700 Subject: [PATCH 028/620] Fix lcase_match_prefix( ). 
--- otc/util.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/otc/util.h b/otc/util.h index d38377b6..29cd68e0 100644 --- a/otc/util.h +++ b/otc/util.h @@ -564,7 +564,7 @@ inline bool lcase_string_equals(const std::string_view& s1, const T& s2) { } inline bool lcase_match_prefix(const std::string_view& s, const std::string_view & prefix) { - if (prefix.size() < s.size()) { + if (prefix.size() > s.size()) { return false; } return lcase_string_equals(s.substr(prefix.size()), prefix); From 4109f5a4de281ad504e4f67bb358c42fef1411f5 Mon Sep 17 00:00:00 2001 From: Benjamin Redelings Date: Wed, 1 Apr 2020 15:01:54 -0700 Subject: [PATCH 029/620] Move functions out of header files. --- otc/ctrie/ctrie_db.cpp | 82 +++++++++++++++++++++++ otc/ctrie/ctrie_db.h | 78 ---------------------- otc/ctrie/ctrie_node.cpp | 130 +++++++++++++++++++++++++++++++++++++ otc/ctrie/ctrie_node.h | 137 +++++---------------------------------- otc/meson.build | 2 + 5 files changed, 232 insertions(+), 197 deletions(-) create mode 100644 otc/ctrie/ctrie_db.cpp create mode 100644 otc/ctrie/ctrie_node.cpp diff --git a/otc/ctrie/ctrie_db.cpp b/otc/ctrie/ctrie_db.cpp new file mode 100644 index 00000000..d944cd46 --- /dev/null +++ b/otc/ctrie/ctrie_db.cpp @@ -0,0 +1,82 @@ +#include "otc/ctrie/ctrie_db.h" + +namespace otc { + +std::set CompressedTrieBasedDB::fuzzy_query(const std::string & query_str) const { + auto conv_query = to_u32string(query_str); + unsigned int max_dist; + // defaults taken from taxomachine... + const unsigned int SHORT_NAME_LENGTH = 9; + const unsigned int MEDIUM_NAME_LENGTH = 14; + const unsigned int LONG_NAME_LENGTH = 19; + std::size_t iql = conv_query.length(); + if (iql < SHORT_NAME_LENGTH) { + max_dist = 1; + } else if (iql < MEDIUM_NAME_LENGTH) { + max_dist = 2; + } else { + max_dist = (iql < LONG_NAME_LENGTH ? 
3 : 4); + } + std::set sorted; + + auto from_thin = thin_trie.fuzzy_matches(conv_query, max_dist); + sorted.insert(std::begin(from_thin), std::end(from_thin)); + + auto from_full = wide_trie.fuzzy_matches(conv_query, max_dist); + sorted.insert(std::begin(from_full), std::end(from_full)); + return sorted; +} + + +void CompressedTrieBasedDB::initialize(const std::set & keys) { + ctrie_init_set_t for_wide; + ctrie_init_set_t for_thin; + // could fit a couple more non-funky, if we want <- 76, I think... + auto nonfunky = " \'()-.0123456789:,_aAbBcCdDeEfFgGhHiIjJkKlLmMnNoOpPqQrRsStTuUvVwWxXyYzZ/?"; + std::ostream & out = std::cout; + std::map letter_counts; + std::set thin_letter_set; + unsigned mem_str = 0; + out << keys.size() << " keys\n"; + for (auto i : keys) { + mem_str += i.length(); + auto widestr = to_u32string(i); + bool has_funky = false; + for (auto c : i) { + if (std::strchr(nonfunky, c) == nullptr) { + has_funky = true; + break; + } + } + if (has_funky) { + for_wide.insert(widestr); + for (auto letter : widestr) { + letter_counts[letter] += 1; + } + } else { + for_thin.insert(widestr); + for (auto letter : widestr) { + thin_letter_set.insert(letter); + } + } + //std::cerr << glob_conv8.to_bytes(widestr) << '\n'; + } + stored_str_t wide_letters; + stored_str_t thin_letters; + thin_letters.insert(std::begin(thin_letters), std::begin(thin_letter_set), std::end(thin_letter_set)); + std::map by_count; + for (auto lcp : letter_counts) { + wide_letters.push_back(lcp.first); + by_count[lcp.second].push_back(lcp.first); + } + //std::cerr << "set size = " << (sizeof(std::string *) + sizeof(char *) + 8)*keys.size() + mem_str << "bytes\n"; + wide_trie.init(for_wide, wide_letters); + thin_trie.init(for_thin, thin_letters); + + /* + wide_trie.db_write_words(std::cerr); + thin_trie.db_write_words(std::cerr); + */ +} + +} diff --git a/otc/ctrie/ctrie_db.h b/otc/ctrie/ctrie_db.h index 1142ed1f..18499bb4 100644 --- a/otc/ctrie/ctrie_db.h +++ b/otc/ctrie/ctrie_db.h @@ -19,83 
+19,5 @@ class CompressedTrieBasedDB { }; -inline std::set CompressedTrieBasedDB::fuzzy_query(const std::string & query_str) const { - auto conv_query = to_u32string(query_str); - unsigned int max_dist; - // defaults taken from taxomachine... - const unsigned int SHORT_NAME_LENGTH = 9; - const unsigned int MEDIUM_NAME_LENGTH = 14; - const unsigned int LONG_NAME_LENGTH = 19; - std::size_t iql = conv_query.length(); - if (iql < SHORT_NAME_LENGTH) { - max_dist = 1; - } else if (iql < MEDIUM_NAME_LENGTH) { - max_dist = 2; - } else { - max_dist = (iql < LONG_NAME_LENGTH ? 3 : 4); - } - std::set sorted; - - auto from_thin = thin_trie.fuzzy_matches(conv_query, max_dist); - sorted.insert(std::begin(from_thin), std::end(from_thin)); - - auto from_full = wide_trie.fuzzy_matches(conv_query, max_dist); - sorted.insert(std::begin(from_full), std::end(from_full)); - return sorted; -} - - -inline void CompressedTrieBasedDB::initialize(const std::set & keys) { - ctrie_init_set_t for_wide; - ctrie_init_set_t for_thin; - // could fit a couple more non-funky, if we want <- 76, I think... 
- auto nonfunky = " \'()-.0123456789:,_aAbBcCdDeEfFgGhHiIjJkKlLmMnNoOpPqQrRsStTuUvVwWxXyYzZ/?"; - std::ostream & out = std::cout; - std::map letter_counts; - std::set thin_letter_set; - unsigned mem_str = 0; - out << keys.size() << " keys\n"; - for (auto i : keys) { - mem_str += i.length(); - auto widestr = to_u32string(i); - bool has_funky = false; - for (auto c : i) { - if (std::strchr(nonfunky, c) == nullptr) { - has_funky = true; - break; - } - } - if (has_funky) { - for_wide.insert(widestr); - for (auto letter : widestr) { - letter_counts[letter] += 1; - } - } else { - for_thin.insert(widestr); - for (auto letter : widestr) { - thin_letter_set.insert(letter); - } - } - //std::cerr << glob_conv8.to_bytes(widestr) << '\n'; - } - stored_str_t wide_letters; - stored_str_t thin_letters; - thin_letters.insert(std::begin(thin_letters), std::begin(thin_letter_set), std::end(thin_letter_set)); - std::map by_count; - for (auto lcp : letter_counts) { - wide_letters.push_back(lcp.first); - by_count[lcp.second].push_back(lcp.first); - } - //std::cerr << "set size = " << (sizeof(std::string *) + sizeof(char *) + 8)*keys.size() + mem_str << "bytes\n"; - wide_trie.init(for_wide, wide_letters); - thin_trie.init(for_thin, thin_letters); - - /* - wide_trie.db_write_words(std::cerr); - thin_trie.db_write_words(std::cerr); - */ -} - - } // namespace otc #endif diff --git a/otc/ctrie/ctrie_node.cpp b/otc/ctrie/ctrie_node.cpp new file mode 100644 index 00000000..df536823 --- /dev/null +++ b/otc/ctrie/ctrie_node.cpp @@ -0,0 +1,130 @@ +#include "otc/ctrie/ctrie_node.h" + +namespace otc { + +//@TODO use lookup table +void fill_letter_and_node_indices(unsigned char curr_byte, + int offset, + vec_ind_pair_t & ret, + uint64_t & node_index) +{ + if (curr_byte == 0) { + return; + } + unsigned char curr_bit = FIRST_BIT_OF_BYTE; + for (unsigned char i = 0; i < 8; ++i) { + //std::cerr << "fill_letter_and_node_indices byte=" << std::hex << (unsigned int)curr_byte << " bit=" << std::hex << 
(unsigned int)curr_bit << '\n' << std::dec; + if (curr_byte & curr_bit) { + ret.push_back(ind_pair_t{i + offset, node_index++}); + } + curr_bit >>= 1; + } +} + +void fill_letter_and_node_indices_64(uint64_t masked_workspace, + int offset, + vec_ind_pair_t & ret, + uint64_t & node_index) +{ + int bitshift = 56; + uint64_t blot_out_masker; + for (unsigned char i = 0U; i < 8; ++i) { + if (masked_workspace == 0) { + return; + } + unsigned char curr_byte = masked_workspace >> bitshift; + if (curr_byte != 0) { + blot_out_masker = curr_byte; + blot_out_masker <<= bitshift; + // 0 out the bits in masked_workspace that we're dealing with in this iteration. + masked_workspace ^= blot_out_masker; + fill_letter_and_node_indices(curr_byte, offset, ret, node_index); + } + bitshift -= 8; + offset += 8; + } +} + +template <> +void CTrieNode::flag_letter(unsigned int i) { + uint64_t bit = ONE_64; + //log_state(); + if (i < LETTER_INDEX_OF_FIRST_BIT_IN_SECOND_WORD) { + const uint64_t shifted = (bit << (LETTER_INDEX_OF_FIRST_BIT_IN_SECOND_WORD - 1 - i)); + data.top |= shifted; + } else if (i < LETTER_INDEX_OF_FIRST_BIT_IN_THIRD_WORD) { + bit <<= (LETTER_INDEX_OF_FIRST_BIT_IN_SECOND_WORD - 1 - i); + data.mid |= bit; + } else { + assert(i < DATA_TYPE::END_LETTER_INDEX); + bit <<= (LETTER_INDEX_OF_FIRST_BIT_IN_THIRD_WORD -1 - i); + data.bot |= bit; + } +} + +template <> +vec_ind_pair_t CTrieNode::get_letter_and_node_indices_for_on_bits() const { + assert(!is_terminal()); + vec_ind_pair_t ret; + ret.reserve(DATA_TYPE::END_LETTER_INDEX); + uint64_t node_index = get_index(); + uint64_t masked = data.top & TOP_LETTER_MASK; + fill_letter_and_node_indices_64(masked, LETTER_INDEX_OF_FIRST_BIT_IN_FIRST_WORD, ret, node_index); + masked = data.mid; + fill_letter_and_node_indices_64(masked, LETTER_INDEX_OF_FIRST_BIT_IN_SECOND_WORD, ret, node_index); + masked = data.bot & BOTTOM_LETTER_MASK; + fill_letter_and_node_indices_64(masked, LETTER_INDEX_OF_FIRST_BIT_IN_THIRD_WORD, ret, node_index); + 
return ret; +} + +template <> +void CTrieNode::flag_letter(unsigned int i) { + uint64_t bit = ONE_64; + if (i < LETTER_INDEX_OF_FIRST_BIT_IN_SECOND_WORD) { + bit <<= (LETTER_INDEX_OF_FIRST_BIT_IN_SECOND_WORD - 1 - i); + data.top |= bit; + } else { + assert(i < DATA_TYPE::END_LETTER_INDEX); + bit <<= (LETTER_INDEX_OF_FIRST_BIT_IN_SECOND_WORD - 1 - i); + data.bot |= bit; + } +} + +template <> +vec_ind_pair_t CTrieNode::get_letter_and_node_indices_for_on_bits() const { + //std::cerr << "get_letter_and_node_indices_for_on_bits top=" + // << std::hex << top << " bot=" << std::hex << bot << std::dec << '\n'; + assert(!is_terminal()); + vec_ind_pair_t ret; + ret.reserve(DATA_TYPE::END_LETTER_INDEX); + u_int64_t node_index = get_index(); + uint64_t masked = data.top & TOP_LETTER_MASK; + fill_letter_and_node_indices_64(masked, LETTER_INDEX_OF_FIRST_BIT_IN_FIRST_WORD, ret, node_index); + masked = data.bot & BOTTOM_LETTER_MASK; + fill_letter_and_node_indices_64(masked, LETTER_INDEX_OF_FIRST_BIT_IN_SECOND_WORD, ret, node_index); + return ret; +} + + +template <> +void CTrieNode::flag_letter(unsigned int i) { + uint64_t bit = ONE_64; + assert(i < DATA_TYPE::END_LETTER_INDEX); + bit <<= (LETTER_INDEX_OF_FIRST_BIT_IN_SECOND_WORD - 1 - i); + data.top |= bit; +} + +template <> +vec_ind_pair_t CTrieNode::get_letter_and_node_indices_for_on_bits() const { + //std::cerr << "get_letter_and_node_indices_for_on_bits top=" + // << std::hex << top << " bot=" << std::hex << bot << std::dec << '\n'; + assert(!is_terminal()); + vec_ind_pair_t ret; + ret.reserve(DATA_TYPE::END_LETTER_INDEX); + u_int64_t node_index = get_index(); + uint64_t masked = data.top & TOP_LETTER_MASK; + fill_letter_and_node_indices_64(masked, LETTER_INDEX_OF_FIRST_BIT_IN_FIRST_WORD, ret, node_index); + return ret; +} + +} diff --git a/otc/ctrie/ctrie_node.h b/otc/ctrie/ctrie_node.h index 6d333ac3..9723f552 100644 --- a/otc/ctrie/ctrie_node.h +++ b/otc/ctrie/ctrie_node.h @@ -51,10 +51,11 @@ constexpr unsigned int 
LETTER_INDEX_OF_FIRST_BIT_IN_SECOND_WORD = 64 + LETTER_IN constexpr unsigned int LETTER_INDEX_OF_FIRST_BIT_IN_THIRD_WORD = 64 + LETTER_INDEX_OF_FIRST_BIT_IN_SECOND_WORD; class CTrie3NodeData { - public: - uint64_t top, mid, bot; +public: static constexpr unsigned int END_LETTER_INDEX = LETTER_INDEX_OF_FIRST_BIT_IN_THIRD_WORD + 64 - NUM_INDEX_BITS; + uint64_t top, mid, bot; + CTrie3NodeData() :top{ZERO_64}, mid{ZERO_64}, bot{ZERO_64} { } uint64_t & get_flag_word() { @@ -82,9 +83,10 @@ class CTrie3NodeData { }; class CTrie2NodeData { - public: - uint64_t top, bot; +public: static constexpr unsigned int END_LETTER_INDEX = LETTER_INDEX_OF_FIRST_BIT_IN_THIRD_WORD - NUM_INDEX_BITS; + + uint64_t top, bot; CTrie2NodeData() :top{ZERO_64}, bot{ZERO_64} { } @@ -107,9 +109,10 @@ class CTrie2NodeData { }; class CTrie1NodeData { - public: - uint64_t top; +public: static constexpr unsigned int END_LETTER_INDEX = LETTER_INDEX_OF_FIRST_BIT_IN_SECOND_WORD - NUM_INDEX_BITS; + + uint64_t top; CTrie1NodeData() :top{ZERO_64} { } @@ -134,53 +137,13 @@ class CTrie1NodeData { using ind_pair_t = std::pair; using vec_ind_pair_t = std::vector; -//@TODO use lookup table -inline void fill_letter_and_node_indices(unsigned char curr_byte, - int offset, - vec_ind_pair_t & ret, - uint64_t & node_index) { - if (curr_byte == 0) { - return; - } - unsigned char curr_bit = FIRST_BIT_OF_BYTE; - for (unsigned char i = 0; i < 8; ++i) { - //std::cerr << "fill_letter_and_node_indices byte=" << std::hex << (unsigned int)curr_byte << " bit=" << std::hex << (unsigned int)curr_bit << '\n' << std::dec; - if (curr_byte & curr_bit) { - ret.push_back(ind_pair_t{i + offset, node_index++}); - } - curr_bit >>= 1; - } -} - -inline void fill_letter_and_node_indices_64(uint64_t masked_workspace, - int offset, - vec_ind_pair_t & ret, - uint64_t & node_index) { - int bitshift = 56; - uint64_t blot_out_masker; - for (unsigned char i = 0U; i < 8; ++i) { - if (masked_workspace == 0) { - return; - } - unsigned char curr_byte = 
masked_workspace >> bitshift; - if (curr_byte != 0) { - blot_out_masker = curr_byte; - blot_out_masker <<= bitshift; - // 0 out the bits in masked_workspace that we're dealing with in this iteration. - masked_workspace ^= blot_out_masker; - fill_letter_and_node_indices(curr_byte, offset, ret, node_index); - } - bitshift -= 8; - offset += 8; - } -} - template class CTrieNode { - private: +private: T data; - public: +public: using DATA_TYPE = T; + CTrieNode() { } @@ -236,86 +199,22 @@ class CTrieNode { template <> -inline void CTrieNode::flag_letter(unsigned int i) { - uint64_t bit = ONE_64; - //log_state(); - if (i < LETTER_INDEX_OF_FIRST_BIT_IN_SECOND_WORD) { - const uint64_t shifted = (bit << (LETTER_INDEX_OF_FIRST_BIT_IN_SECOND_WORD - 1 - i)); - data.top |= shifted; - } else if (i < LETTER_INDEX_OF_FIRST_BIT_IN_THIRD_WORD) { - bit <<= (LETTER_INDEX_OF_FIRST_BIT_IN_SECOND_WORD - 1 - i); - data.mid |= bit; - } else { - assert(i < DATA_TYPE::END_LETTER_INDEX); - bit <<= (LETTER_INDEX_OF_FIRST_BIT_IN_THIRD_WORD -1 - i); - data.bot |= bit; - } -} +void CTrieNode::flag_letter(unsigned int i); template <> -inline vec_ind_pair_t CTrieNode::get_letter_and_node_indices_for_on_bits() const { - assert(!is_terminal()); - vec_ind_pair_t ret; - ret.reserve(DATA_TYPE::END_LETTER_INDEX); - uint64_t node_index = get_index(); - uint64_t masked = data.top & TOP_LETTER_MASK; - fill_letter_and_node_indices_64(masked, LETTER_INDEX_OF_FIRST_BIT_IN_FIRST_WORD, ret, node_index); - masked = data.mid; - fill_letter_and_node_indices_64(masked, LETTER_INDEX_OF_FIRST_BIT_IN_SECOND_WORD, ret, node_index); - masked = data.bot & BOTTOM_LETTER_MASK; - fill_letter_and_node_indices_64(masked, LETTER_INDEX_OF_FIRST_BIT_IN_THIRD_WORD, ret, node_index); - return ret; -} +vec_ind_pair_t CTrieNode::get_letter_and_node_indices_for_on_bits() const; template <> -inline void CTrieNode::flag_letter(unsigned int i) { - uint64_t bit = ONE_64; - if (i < LETTER_INDEX_OF_FIRST_BIT_IN_SECOND_WORD) { - bit <<= 
(LETTER_INDEX_OF_FIRST_BIT_IN_SECOND_WORD - 1 - i); - data.top |= bit; - } else { - assert(i < DATA_TYPE::END_LETTER_INDEX); - bit <<= (LETTER_INDEX_OF_FIRST_BIT_IN_SECOND_WORD - 1 - i); - data.bot |= bit; - } -} +void CTrieNode::flag_letter(unsigned int i); template <> -inline vec_ind_pair_t CTrieNode::get_letter_and_node_indices_for_on_bits() const { - //std::cerr << "get_letter_and_node_indices_for_on_bits top=" - // << std::hex << top << " bot=" << std::hex << bot << std::dec << '\n'; - assert(!is_terminal()); - vec_ind_pair_t ret; - ret.reserve(DATA_TYPE::END_LETTER_INDEX); - u_int64_t node_index = get_index(); - uint64_t masked = data.top & TOP_LETTER_MASK; - fill_letter_and_node_indices_64(masked, LETTER_INDEX_OF_FIRST_BIT_IN_FIRST_WORD, ret, node_index); - masked = data.bot & BOTTOM_LETTER_MASK; - fill_letter_and_node_indices_64(masked, LETTER_INDEX_OF_FIRST_BIT_IN_SECOND_WORD, ret, node_index); - return ret; -} - +vec_ind_pair_t CTrieNode::get_letter_and_node_indices_for_on_bits() const; template <> -inline void CTrieNode::flag_letter(unsigned int i) { - uint64_t bit = ONE_64; - assert(i < DATA_TYPE::END_LETTER_INDEX); - bit <<= (LETTER_INDEX_OF_FIRST_BIT_IN_SECOND_WORD - 1 - i); - data.top |= bit; -} +void CTrieNode::flag_letter(unsigned int i); template <> -inline vec_ind_pair_t CTrieNode::get_letter_and_node_indices_for_on_bits() const { - //std::cerr << "get_letter_and_node_indices_for_on_bits top=" - // << std::hex << top << " bot=" << std::hex << bot << std::dec << '\n'; - assert(!is_terminal()); - vec_ind_pair_t ret; - ret.reserve(DATA_TYPE::END_LETTER_INDEX); - u_int64_t node_index = get_index(); - uint64_t masked = data.top & TOP_LETTER_MASK; - fill_letter_and_node_indices_64(masked, LETTER_INDEX_OF_FIRST_BIT_IN_FIRST_WORD, ret, node_index); - return ret; -} +vec_ind_pair_t CTrieNode::get_letter_and_node_indices_for_on_bits() const; using CTrie3Node = CTrieNode; using CTrie2Node = CTrieNode; diff --git a/otc/meson.build b/otc/meson.build index 
2c32900f..26d98ab9 100644 --- a/otc/meson.build +++ b/otc/meson.build @@ -2,6 +2,8 @@ libotcetera_sources = [ 'config_file.cpp', 'ctrie/context_ctrie_db.cpp', 'ctrie/str_utils.cpp', + 'ctrie/ctrie_db.cpp', + 'ctrie/ctrie_node.cpp', 'embedded_tree.cpp', 'forest.cpp', 'ftree.cpp', From 925e9599a0d1ebd7e0289d264260da69a802e755 Mon Sep 17 00:00:00 2001 From: Benjamin Redelings Date: Wed, 8 Apr 2020 11:43:55 -0700 Subject: [PATCH 030/620] Fix lcase_match_prefix( ). --- otc/util.h | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/otc/util.h b/otc/util.h index 29cd68e0..6ca84077 100644 --- a/otc/util.h +++ b/otc/util.h @@ -563,11 +563,9 @@ inline bool lcase_string_equals(const std::string_view& s1, const T& s2) { return true; } -inline bool lcase_match_prefix(const std::string_view& s, const std::string_view & prefix) { - if (prefix.size() > s.size()) { - return false; - } - return lcase_string_equals(s.substr(prefix.size()), prefix); +inline bool lcase_match_prefix(const std::string_view& s, const std::string_view & prefix) +{ + return lcase_string_equals(s.substr(0, prefix.size()), prefix); } } //namespace otc From 537400a8bb82ceed7d857e3d6a6f1c9f9adf0da2 Mon Sep 17 00:00:00 2001 From: Benjamin Redelings Date: Wed, 8 Apr 2020 12:02:32 -0700 Subject: [PATCH 031/620] Allow nested test dirs. 
--- ws/test_web_services.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/ws/test_web_services.py b/ws/test_web_services.py index 9ddf1957..61f9313e 100644 --- a/ws/test_web_services.py +++ b/ws/test_web_services.py @@ -355,6 +355,16 @@ def run_tests(dirs_to_run, test_threads): time.sleep(0.1) return num_passed, num_failed, num_errors + +def get_test_dirs_under(top_test_dir): + test_dirs = [] + for root, dirs, files in os.walk(top_test_dir): + if "method.json" in files: + path = os.path.relpath(root, top_test_dir) + test_dirs.insert(0,path) + return test_dirs + + if __name__ == '__main__': import argparse import codecs @@ -389,7 +399,7 @@ def run_tests(dirs_to_run, test_threads): if args.test_name is not None: e_dir_list = [args.test_name] else: - e_dir_list = os.listdir(test_par) + e_dir_list = get_test_dirs_under(test_par) e_dir_list.sort() SERVER_PORT = args.server_port From e911bf4d414ef7f7239bca8bb912debeddcac972 Mon Sep 17 00:00:00 2001 From: Benjamin Redelings Date: Wed, 8 Apr 2020 12:21:15 -0700 Subject: [PATCH 032/620] Use os.path.relpath to get test names. 
--- ws/test_web_services.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/ws/test_web_services.py b/ws/test_web_services.py index 61f9313e..971900ff 100644 --- a/ws/test_web_services.py +++ b/ws/test_web_services.py @@ -139,7 +139,7 @@ def start_worker(num_workers): 'accept' : 'application/json', } class WebServiceTestJob(object): - def __init__(self, test_description, service_prefix): + def __init__(self, test_par, test_description, service_prefix): self.url_fragment = test_description["url_fragment"] self.arguments = test_description["arguments"] v = test_description.get("verb", "GET").upper() @@ -152,8 +152,9 @@ def __init__(self, test_description, service_prefix): self.passed = False self.failed = False self.erred = False + self.test_par = test_par self.test_dir = test_description.get("test_dir") - self.test_subdir = os.path.split(self.test_dir)[-1] + self.test_subdir = os.path.relpath(self.test_dir, self.test_par) self.name = test_description.get("name", self.test_subdir or self.url_fragment) self.stat_lock = RLock() @@ -296,7 +297,7 @@ def kill_server(exe_dir): FAILED_TESTS, ERRORED_TESTS = [], [] -def run_tests(dirs_to_run, test_threads): +def run_tests(test_par, dirs_to_run, test_threads): assert test_threads > 0 td_list = [] for test_dir in dirs_to_run: @@ -318,7 +319,7 @@ def run_tests(dirs_to_run, test_threads): start_worker(test_threads) service_prefix = "http://127.0.0.1:{}/".format(SERVER_PORT) - all_jobs = [WebServiceTestJob(test_description=td, service_prefix=service_prefix) for td in td_list] + all_jobs = [WebServiceTestJob(test_par=test_par, test_description=td, service_prefix=service_prefix) for td in td_list] running_jobs = list(all_jobs) for j in all_jobs: _jobq.put(j) @@ -403,6 +404,7 @@ def get_test_dirs_under(top_test_dir): e_dir_list.sort() SERVER_PORT = args.server_port + # Get test paths to_run = [] for e_subdir_name in e_dir_list: e_path = os.path.join(test_par, e_subdir_name) @@ -416,6 +418,8 @@ def 
get_test_dirs_under(top_test_dir): to_run.append(e_path) if not to_run: sys.exit("No test were found!") + + # Check that there are no PIDfiles in the way pidfile_path = os.path.join(exe_dir, PIDFILE_NAME) if os.path.exists(pidfile_path): recheck = 0 @@ -426,13 +430,15 @@ def get_test_dirs_under(top_test_dir): break if os.path.exists(pidfile_path): sys.exit("{} is in the way!\n".format(pidfile_path)) + + # try launching otc-tol-ws and running the tests against it. for i in range(2): if launch_server(exe_dir=exe_dir, taxonomy_dir=taxonomy_dir, synth_par=synth_par_path, server_threads=args.server_threads): try: - num_passed, nf, ne = run_tests(to_run, args.test_threads) + num_passed, nf, ne = run_tests(test_par, to_run, args.test_threads) finally: kill_server(exe_dir) NUM_TESTS = nf + ne + num_passed From 6a782da752eeb127d487a194b1543b2e4bde6345 Mon Sep 17 00:00:00 2001 From: Benjamin Redelings Date: Wed, 8 Apr 2020 12:23:13 -0700 Subject: [PATCH 033/620] Organize TNRS tests. --- .../{infer_contexts_ambig => infer_contexts/ambig}/expected.json | 0 .../{infer_contexts_ambig => infer_contexts/ambig}/method.json | 0 .../ambig_plant}/expected.json | 0 .../ambig_plant}/method.json | 0 .../animals}/expected.json | 0 .../animals}/method.json | 0 .../{infer_contexts_life => infer_contexts/life}/expected.json | 0 .../{infer_contexts_life => infer_contexts/life}/method.json | 0 .../{match_names_fuzzy => match_names/fuzzy}/expected.json | 0 .../{match_names_fuzzy => match_names/fuzzy}/method.json | 0 .../fuzzy_contexts}/expected.json | 0 .../fuzzy_contexts}/method.json | 0 12 files changed, 0 insertions(+), 0 deletions(-) rename test/expectedws-induced/{infer_contexts_ambig => infer_contexts/ambig}/expected.json (100%) rename test/expectedws-induced/{infer_contexts_ambig => infer_contexts/ambig}/method.json (100%) rename test/expectedws-induced/{infer_contexts_ambig_plant => infer_contexts/ambig_plant}/expected.json (100%) rename 
test/expectedws-induced/{infer_contexts_ambig_plant => infer_contexts/ambig_plant}/method.json (100%) rename test/expectedws-induced/{infer_contexts_animals => infer_contexts/animals}/expected.json (100%) rename test/expectedws-induced/{infer_contexts_animals => infer_contexts/animals}/method.json (100%) rename test/expectedws-induced/{infer_contexts_life => infer_contexts/life}/expected.json (100%) rename test/expectedws-induced/{infer_contexts_life => infer_contexts/life}/method.json (100%) rename test/expectedws-induced/{match_names_fuzzy => match_names/fuzzy}/expected.json (100%) rename test/expectedws-induced/{match_names_fuzzy => match_names/fuzzy}/method.json (100%) rename test/expectedws-induced/{match_names_fuzzy_contexts => match_names/fuzzy_contexts}/expected.json (100%) rename test/expectedws-induced/{match_names_fuzzy_contexts => match_names/fuzzy_contexts}/method.json (100%) diff --git a/test/expectedws-induced/infer_contexts_ambig/expected.json b/test/expectedws-induced/infer_contexts/ambig/expected.json similarity index 100% rename from test/expectedws-induced/infer_contexts_ambig/expected.json rename to test/expectedws-induced/infer_contexts/ambig/expected.json diff --git a/test/expectedws-induced/infer_contexts_ambig/method.json b/test/expectedws-induced/infer_contexts/ambig/method.json similarity index 100% rename from test/expectedws-induced/infer_contexts_ambig/method.json rename to test/expectedws-induced/infer_contexts/ambig/method.json diff --git a/test/expectedws-induced/infer_contexts_ambig_plant/expected.json b/test/expectedws-induced/infer_contexts/ambig_plant/expected.json similarity index 100% rename from test/expectedws-induced/infer_contexts_ambig_plant/expected.json rename to test/expectedws-induced/infer_contexts/ambig_plant/expected.json diff --git a/test/expectedws-induced/infer_contexts_ambig_plant/method.json b/test/expectedws-induced/infer_contexts/ambig_plant/method.json similarity index 100% rename from 
test/expectedws-induced/infer_contexts_ambig_plant/method.json rename to test/expectedws-induced/infer_contexts/ambig_plant/method.json diff --git a/test/expectedws-induced/infer_contexts_animals/expected.json b/test/expectedws-induced/infer_contexts/animals/expected.json similarity index 100% rename from test/expectedws-induced/infer_contexts_animals/expected.json rename to test/expectedws-induced/infer_contexts/animals/expected.json diff --git a/test/expectedws-induced/infer_contexts_animals/method.json b/test/expectedws-induced/infer_contexts/animals/method.json similarity index 100% rename from test/expectedws-induced/infer_contexts_animals/method.json rename to test/expectedws-induced/infer_contexts/animals/method.json diff --git a/test/expectedws-induced/infer_contexts_life/expected.json b/test/expectedws-induced/infer_contexts/life/expected.json similarity index 100% rename from test/expectedws-induced/infer_contexts_life/expected.json rename to test/expectedws-induced/infer_contexts/life/expected.json diff --git a/test/expectedws-induced/infer_contexts_life/method.json b/test/expectedws-induced/infer_contexts/life/method.json similarity index 100% rename from test/expectedws-induced/infer_contexts_life/method.json rename to test/expectedws-induced/infer_contexts/life/method.json diff --git a/test/expectedws-induced/match_names_fuzzy/expected.json b/test/expectedws-induced/match_names/fuzzy/expected.json similarity index 100% rename from test/expectedws-induced/match_names_fuzzy/expected.json rename to test/expectedws-induced/match_names/fuzzy/expected.json diff --git a/test/expectedws-induced/match_names_fuzzy/method.json b/test/expectedws-induced/match_names/fuzzy/method.json similarity index 100% rename from test/expectedws-induced/match_names_fuzzy/method.json rename to test/expectedws-induced/match_names/fuzzy/method.json diff --git a/test/expectedws-induced/match_names_fuzzy_contexts/expected.json 
b/test/expectedws-induced/match_names/fuzzy_contexts/expected.json similarity index 100% rename from test/expectedws-induced/match_names_fuzzy_contexts/expected.json rename to test/expectedws-induced/match_names/fuzzy_contexts/expected.json diff --git a/test/expectedws-induced/match_names_fuzzy_contexts/method.json b/test/expectedws-induced/match_names/fuzzy_contexts/method.json similarity index 100% rename from test/expectedws-induced/match_names_fuzzy_contexts/method.json rename to test/expectedws-induced/match_names/fuzzy_contexts/method.json From 6bb05855abc865344e70f7cb591b516250cb148b Mon Sep 17 00:00:00 2001 From: Benjamin Redelings Date: Wed, 8 Apr 2020 12:23:41 -0700 Subject: [PATCH 034/620] Fix default context for autocomplete_name --- ws/tolwsbooting.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ws/tolwsbooting.cpp b/ws/tolwsbooting.cpp index 2bbb02c4..ee0deff6 100644 --- a/ws/tolwsbooting.cpp +++ b/ws/tolwsbooting.cpp @@ -432,6 +432,7 @@ const int MAX_NONFUZZY_QUERY_STRINGS = 10000; const int MAX_FUZZY_QUERY_STRINGS = 250; static string LIFE_NODE_NAME = "life"; +static string LIFE_CONTEXT_NAME = "All life"; string tnrs_match_names_handler( const json& parsedargs ) { // 1. 
Requred argument: "names" @@ -454,7 +455,7 @@ string tnrs_match_names_handler( const json& parsedargs ) { string tnrs_autocomplete_name_handler( const json& parsedargs ) { string name = extract_required_argument(parsedargs, "name"); - string context_name = extract_argument_or_default(parsedargs, "context_name", LIFE_NODE_NAME); + string context_name = extract_argument_or_default(parsedargs, "context_name", LIFE_CONTEXT_NAME); bool include_suppressed = extract_argument_or_default(parsedargs, "include_suppressed", false); auto locked_taxonomy = tts.get_readable_taxonomy(); const auto & taxonomy = locked_taxonomy.first; From 6de0948f804f8cf250feb7ddd987f9059e6ca69a Mon Sep 17 00:00:00 2001 From: Benjamin Redelings Date: Wed, 8 Apr 2020 12:25:26 -0700 Subject: [PATCH 035/620] Add autocomplete_name tests. --- .../autocomplete_name/1/expected.json | 38 +++++++++++++++++++ .../autocomplete_name/1/method.json | 8 ++++ .../autocomplete_name/2/expected.json | 38 +++++++++++++++++++ .../autocomplete_name/2/method.json | 9 +++++ 4 files changed, 93 insertions(+) create mode 100644 test/expectedws-induced/autocomplete_name/1/expected.json create mode 100644 test/expectedws-induced/autocomplete_name/1/method.json create mode 100644 test/expectedws-induced/autocomplete_name/2/expected.json create mode 100644 test/expectedws-induced/autocomplete_name/2/method.json diff --git a/test/expectedws-induced/autocomplete_name/1/expected.json b/test/expectedws-induced/autocomplete_name/1/expected.json new file mode 100644 index 00000000..4b1dfb3c --- /dev/null +++ b/test/expectedws-induced/autocomplete_name/1/expected.json @@ -0,0 +1,38 @@ +[ + { + "is_higher": true, + "is_suppressed": false, + "ott_id": 655592, + "unique_name": "Amaryllis (genus in kingdom Archaeplastida)" + }, + { + "is_higher": true, + "is_suppressed": false, + "ott_id": 267867, + "unique_name": "Amaryllidoideae" + }, + { + "is_higher": true, + "is_suppressed": false, + "ott_id": 258473, + "unique_name": 
"Amaryllidaceae" + }, + { + "is_higher": true, + "is_suppressed": false, + "ott_id": 4671782, + "unique_name": "Amaryllis (genus in Opisthokonta)" + }, + { + "is_higher": true, + "is_suppressed": false, + "ott_id": 5764812, + "unique_name": "Amaryllidinae" + }, + { + "is_higher": true, + "is_suppressed": false, + "ott_id": 5092163, + "unique_name": "Amaryllididae" + } +] \ No newline at end of file diff --git a/test/expectedws-induced/autocomplete_name/1/method.json b/test/expectedws-induced/autocomplete_name/1/method.json new file mode 100644 index 00000000..4135cd5c --- /dev/null +++ b/test/expectedws-induced/autocomplete_name/1/method.json @@ -0,0 +1,8 @@ +{ + "url_fragment": "v3/tnrs/autocomplete_name", + "verb": "POST", + "arguments": { + "name": "Amary" + } +} + diff --git a/test/expectedws-induced/autocomplete_name/2/expected.json b/test/expectedws-induced/autocomplete_name/2/expected.json new file mode 100644 index 00000000..724277ad --- /dev/null +++ b/test/expectedws-induced/autocomplete_name/2/expected.json @@ -0,0 +1,38 @@ +[ + { + "is_higher": true, + "is_suppressed": false, + "ott_id": 655592, + "unique_name": "Amaryllis (genus in kingdom Archaeplastida)" + }, + { + "is_higher": true, + "is_suppressed": false, + "ott_id": 267867, + "unique_name": "Amaryllidoideae" + }, + { + "is_higher": true, + "is_suppressed": false, + "ott_id": 258473, + "unique_name": "Amaryllidaceae" + }, + { + "is_higher": true, + "is_suppressed": false, + "ott_id": 4671782, + "unique_name": "Amaryllis (genus in Opisthokonta)" + }, + { + "is_higher": true, + "is_suppressed": false, + "ott_id": 5764812, + "unique_name": "Amaryllidinae" + }, + { + "is_higher": true, + "is_suppressed": false, + "ott_id": 5092163, + "unique_name": "Amaryllididae" + } +] \ No newline at end of file diff --git a/test/expectedws-induced/autocomplete_name/2/method.json b/test/expectedws-induced/autocomplete_name/2/method.json new file mode 100644 index 00000000..7ffe5ed3 --- /dev/null +++ 
b/test/expectedws-induced/autocomplete_name/2/method.json @@ -0,0 +1,9 @@ +{ + "url_fragment": "v3/tnrs/autocomplete_name", + "verb": "POST", + "arguments": { + "name": "Amary", + "context_name": "All life" + } +} + From 674eaaf103a3311d6b7dcb5ec2eee77df174e88f Mon Sep 17 00:00:00 2001 From: Benjamin Redelings Date: Wed, 8 Apr 2020 14:26:43 -0700 Subject: [PATCH 036/620] Return [ ] for an empty autocomplete_name response. --- otc/ws/tnrsws.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/otc/ws/tnrsws.cpp b/otc/ws/tnrsws.cpp index 7a5acf63..c72eeee1 100644 --- a/otc/ws/tnrsws.cpp +++ b/otc/ws/tnrsws.cpp @@ -504,7 +504,8 @@ string tnrs_autocomplete_name_ws_method(const string& name, const string& context_name, bool include_suppressed, const RichTaxonomy& taxonomy) { - json response; + // An empty response should be `[ ]`, not `null`. + json response = json::array(); // We need to escape the query string. auto escaped_query = escape_query_string(name); // This corresponds to a SingleNamePrefixQuery in taxomachine. From bfeea3ab02dfdccb0c32cc3965bef9bf74893244 Mon Sep 17 00:00:00 2001 From: Benjamin Redelings Date: Wed, 8 Apr 2020 14:47:39 -0700 Subject: [PATCH 037/620] Add missing fuzzy matching for autocomplete_name. --- otc/ws/tnrsws.cpp | 54 +++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 50 insertions(+), 4 deletions(-) diff --git a/otc/ws/tnrsws.cpp b/otc/ws/tnrsws.cpp index c72eeee1..15365058 100644 --- a/otc/ws/tnrsws.cpp +++ b/otc/ws/tnrsws.cpp @@ -404,7 +404,6 @@ autocompleting name fields on forms, use the `match_names` service. */ - //FIXME: how is "suppressed_names" different from "deprecated_taxa"? 
// $ curl -X POST https://api.opentreeoflife.org/v3/tnrs/match_names -H "content-type:application/json" -d '{"names":["Aster","Symphyotrichum","Barnadesia"]}' @@ -474,7 +473,14 @@ inline void add_hits(json& j, const RichTaxonomy& taxonomy, const vector& fuzzy_query_results) +{ + for(auto fuzzy_query_result: fuzzy_query_results) { + j.push_back(autocomplete_json(taxonomy, fuzzy_query_result.get_taxon())); + } +} + inline void add_hits(json& j, const RichTaxonomy& taxonomy, const vec_tax_str_pair_t taxa) { for(auto [taxon, synonym]: taxa) { j.push_back(autocomplete_json(taxonomy, taxon)); @@ -498,6 +504,30 @@ vector prefix_search_species_in_genus(const Taxon* genus, return match_species; } +/* + * Fuzzy matching DOES occur in autocomplete_name: + * + * curl -X POST https://api.opentreeoflife.org/v3/tnrs/autocomplete_name -H "content-type:application/json" -d '{"name":"Homo salien"}' +[ { + "is_suppressed" : false, + "unique_name" : "Homo sapiens", + "ott_id" : 770315, + "is_higher" : false +} ] + * + * However, fuzzy matching does not find `Homo sapiens neanderthalensis` and `Homo sapiens subsp. 'Denisova'`, whereas direct matching does. 
+ * + * curl -X POST https://api.opentreeoflife.org/v3/tnrs/autocomplete_name -H "content-type:application/json" -d '{"name":"Homo sapien"}' + * + * However, it appears that a difference of 2 chars is allowed on an exact match, instead of doing a prefix-query on the fuzzy index: + * + * "Hono saliens" -> "Homo sapiens" + "Neobodo saliens" + * "Hono salien" -> nothing + * "Homo salens" -> "Homo sapiens" + * "Homo salen" -> nothing + */ + + // curl -X POST https://api.opentreeoflife.org/v3/tnrs/autocomplete_name -H "content-type:application/json" -d '{"name":"Endoxyla","context_name":"All life"}' string tnrs_autocomplete_name_ws_method(const string& name, @@ -547,7 +577,15 @@ string tnrs_autocomplete_name_ws_method(const string& name, if (not response.empty()) { return response.dump(1); } - // fuzzy search on names and synonyms + // fuzzy search on names and synonyms (BDR -- not a prefix search?) + { + auto ctp = taxonomy.get_fuzzy_matcher(); + if (ctp == nullptr) { + throw OTCError() << "Fuzzy matching has not been enabled in the taxonomy, but was requested in match_name."; + } + auto fuzzy_results = ctp->fuzzy_query_to_taxa(escaped_query, context_root, taxonomy, include_suppressed); + add_hits(response, taxonomy, fuzzy_results); + } } else { // does not contain a space at all add_hits(response, taxonomy, exact_name_search_higher(taxonomy, context_root, escaped_query, include_suppressed)); add_hits(response, taxonomy, exact_synonym_search_higher(taxonomy, context_root, escaped_query, include_suppressed)); @@ -564,7 +602,15 @@ string tnrs_autocomplete_name_ws_method(const string& name, if (not response.empty()) { return response.dump(1); } - // fuzzy search on higher names and synonyms + // fuzzy search on HIGHER names and synonyms (BDR -- not a prefix search?) 
+ { + auto ctp = taxonomy.get_fuzzy_matcher(); + if (ctp == nullptr) { + throw OTCError() << "Fuzzy matching has not been enabled in the taxonomy, but was requested in match_name."; + } + auto fuzzy_results = ctp->fuzzy_query_to_taxa(escaped_query, context_root, taxonomy, include_suppressed); + add_hits(response, taxonomy, fuzzy_results); + } } return response.dump(1); } From 2c5c39ab189023806425b35239c8db047f1ca4ba Mon Sep 17 00:00:00 2001 From: Benjamin Redelings Date: Wed, 8 Apr 2020 14:52:02 -0700 Subject: [PATCH 038/620] Add test for fuzzy autocomplete_name. --- .../autocomplete_name/fuzzy/1/expected.json | 8 ++++++++ .../autocomplete_name/fuzzy/1/method.json | 8 ++++++++ 2 files changed, 16 insertions(+) create mode 100644 test/expectedws-induced/autocomplete_name/fuzzy/1/expected.json create mode 100644 test/expectedws-induced/autocomplete_name/fuzzy/1/method.json diff --git a/test/expectedws-induced/autocomplete_name/fuzzy/1/expected.json b/test/expectedws-induced/autocomplete_name/fuzzy/1/expected.json new file mode 100644 index 00000000..014abc62 --- /dev/null +++ b/test/expectedws-induced/autocomplete_name/fuzzy/1/expected.json @@ -0,0 +1,8 @@ +[ + { + "is_higher": false, + "is_suppressed": false, + "ott_id": 200683, + "unique_name": "Amaryllis paradisicola" + } +] \ No newline at end of file diff --git a/test/expectedws-induced/autocomplete_name/fuzzy/1/method.json b/test/expectedws-induced/autocomplete_name/fuzzy/1/method.json new file mode 100644 index 00000000..3d18b117 --- /dev/null +++ b/test/expectedws-induced/autocomplete_name/fuzzy/1/method.json @@ -0,0 +1,8 @@ +{ + "url_fragment": "v3/tnrs/autocomplete_name", + "verb": "POST", + "arguments": { + "name": "Amarylis paradisicola" + } +} + From f4ccd5ac7f053845034c88d73291c581c6c59e08 Mon Sep 17 00:00:00 2001 From: Benjamin Redelings Date: Wed, 8 Apr 2020 16:42:32 -0700 Subject: [PATCH 039/620] Give clearer name to test directories. 
--- .../data/taxonomy-induced}/about.json | 0 .../data/taxonomy-induced}/conflicts.tsv | 0 .../data/taxonomy-induced}/deprecated.tsv | 0 .../data/taxonomy-induced}/synonyms.tsv | 0 .../data/taxonomy-induced}/taxonomy.tsv | 0 .../data/taxonomy-induced}/version.txt | 0 .../data/taxonomy-tree}/out1.0/README.md | 0 .../taxonomy-tree}/out1.0/annotated_supertree/README.md | 0 .../out1.0/annotated_supertree/annotations.json | 0 .../out1.0/annotated_supertree/annotations1.json | 0 .../out1.0/annotated_supertree/annotations2.json | 0 .../taxonomy-tree}/out1.0/annotated_supertree/index.html | 0 .../taxonomy-tree}/out1.0/annotated_supertree/index.json | 0 .../data/taxonomy-tree}/out1.0/assessments/README.md | 0 .../data/taxonomy-tree}/out1.0/assessments/index.html | 0 .../data/taxonomy-tree}/out1.0/assessments/index.json | 0 .../data/taxonomy-tree}/out1.0/assessments/log.txt | 0 .../data/taxonomy-tree}/out1.0/assessments/lost_taxa.txt | 0 .../data/taxonomy-tree}/out1.0/assessments/summary.json | 0 .../out1.0/assessments/supertree_degree_distribution.txt | 0 .../out1.0/assessments/taxonomy_degree_distribution.txt | 0 .../data/taxonomy-tree}/out1.0/cleaned_ott/README.md | 0 .../out1.0/cleaned_ott/cleaned_not_updated_ott.tre | 0 .../data/taxonomy-tree}/out1.0/cleaned_ott/cleaned_ott.json | 0 .../data/taxonomy-tree}/out1.0/cleaned_ott/cleaned_ott.tre | 0 .../taxonomy-tree}/out1.0/cleaned_ott/cleaned_ott_1.json | 0 .../out1.0/cleaned_ott/cleaned_ott_pruned_nonflagged.json | 0 .../out1.0/cleaned_ott/cleaned_ott_with_hiddenbarren.tre | 0 .../taxonomy-tree}/out1.0/cleaned_ott/cleaning_flags.txt | 0 .../out1.0/cleaned_ott/flagged_in_cleaned.json | 0 .../data/taxonomy-tree}/out1.0/cleaned_ott/index.html | 0 .../data/taxonomy-tree}/out1.0/cleaned_ott/index.json | 0 .../out1.0/cleaned_ott/move_extinct_higher_log.json | 0 .../data/taxonomy-tree}/out1.0/cleaned_ott/ott_version.txt | 0 .../data/taxonomy-tree}/out1.0/cleaned_ott/root_ott_id.txt | 0 
.../data/taxonomy-tree}/out1.0/cleaned_phylo/README.md | 0 .../taxonomy-tree}/out1.0/cleaned_phylo/cleaning_flags.txt | 0 .../data/taxonomy-tree}/out1.0/cleaned_phylo/index.html | 0 .../data/taxonomy-tree}/out1.0/cleaned_phylo/index.json | 0 .../taxonomy-tree}/out1.0/cleaned_phylo/needs_updating.txt | 0 .../out1.0/cleaned_phylo/phylo_inputs_cleaned.txt | 0 .../taxonomy-tree}/out1.0/cleaned_phylo/root_ott_id.txt | 0 .../taxonomy-tree}/out1.0/cleaned_phylo/xx_1@1-taxonomy.tre | 0 .../data/taxonomy-tree}/out1.0/cleaned_phylo/xx_1@1.json | 0 .../data/taxonomy-tree}/out1.0/cleaned_phylo/xx_1@1.tre | 0 .../taxonomy-tree}/out1.0/cleaned_phylo/xx_2@1-taxonomy.tre | 0 .../data/taxonomy-tree}/out1.0/cleaned_phylo/xx_2@1.json | 0 .../data/taxonomy-tree}/out1.0/cleaned_phylo/xx_2@1.tre | 0 .../data/taxonomy-tree}/out1.0/config | 0 .../data/taxonomy-tree}/out1.0/exemplified_phylo/README.md | 0 .../data/taxonomy-tree}/out1.0/exemplified_phylo/args.txt | 0 .../out1.0/exemplified_phylo/exemplified_log.json | 0 .../out1.0/exemplified_phylo/incertae_sedis.txt | 0 .../data/taxonomy-tree}/out1.0/exemplified_phylo/index.html | 0 .../data/taxonomy-tree}/out1.0/exemplified_phylo/index.json | 0 .../out1.0/exemplified_phylo/nonempty_trees.txt | 0 .../exemplified_phylo/pruned_for_regraft_cleaned_ott.json | 0 .../pruned_taxonomy_degree_distribution.txt | 0 .../out1.0/exemplified_phylo/regraft_cleaned_ott.tre | 0 .../taxonomy-tree}/out1.0/exemplified_phylo/taxonomy.tre | 0 .../data/taxonomy-tree}/out1.0/exemplified_phylo/xx_1@1.tre | 0 .../data/taxonomy-tree}/out1.0/exemplified_phylo/xx_2@1.tre | 0 .../data/taxonomy-tree}/out1.0/grafted_solution/README.md | 0 .../out1.0/grafted_solution/grafted_solution.tre | 0 .../out1.0/grafted_solution/grafted_solution_ottnames.tre | 0 .../data/taxonomy-tree}/out1.0/grafted_solution/index.html | 0 .../data/taxonomy-tree}/out1.0/grafted_solution/index.json | 0 .../data/taxonomy-tree}/out1.0/index.html | 0 .../data/taxonomy-tree}/out1.0/index.json | 0 
.../data/taxonomy-tree}/out1.0/labelled_supertree/README.md | 0 .../out1.0/labelled_supertree/broken_taxa.json | 0 .../taxonomy-tree}/out1.0/labelled_supertree/index.html | 0 .../taxonomy-tree}/out1.0/labelled_supertree/index.json | 0 .../out1.0/labelled_supertree/input_output_stats.json | 0 .../out1.0/labelled_supertree/labelled_supertree.tre | 0 .../labelled_supertree/labelled_supertree_ottnames.tre | 0 .../labelled_supertree_ottnames_without_monotypic.tre | 0 .../labelled_supertree_out_degree_distribution.txt | 0 .../labelled_supertree_simplified_ottnames.tre | 0 ...lled_supertree_simplified_ottnames_without_monotypic.tre | 0 .../out1.0/labelled_supertree/simplified_ottnames.log | 0 .../simplified_ottnames_without_monotypic.log | 0 .../data/taxonomy-tree}/out1.0/phylo_input/README.md | 0 .../data/taxonomy-tree}/out1.0/phylo_input/collections.txt | 0 .../data/taxonomy-tree}/out1.0/phylo_input/index.html | 0 .../data/taxonomy-tree}/out1.0/phylo_input/index.json | 0 .../taxonomy-tree}/out1.0/phylo_input/rank_collection.json | 0 .../taxonomy-tree}/out1.0/phylo_input/study_tree_pairs.txt | 0 .../data/taxonomy-tree}/out1.0/phylo_snapshot/README.md | 0 .../out1.0/phylo_snapshot/collections_git_shas.txt | 0 .../out1.0/phylo_snapshot/concrete_rank_collection.json | 0 .../data/taxonomy-tree}/out1.0/phylo_snapshot/git_shas.txt | 0 .../data/taxonomy-tree}/out1.0/phylo_snapshot/index.html | 0 .../data/taxonomy-tree}/out1.0/phylo_snapshot/index.json | 0 .../data/taxonomy-tree}/out1.0/phylo_snapshot/xx_1@1.json | 0 .../data/taxonomy-tree}/out1.0/phylo_snapshot/xx_2@1.json | 0 .../taxonomy-tree}/out1.0/subproblem_solutions/README.md | 0 .../taxonomy-tree}/out1.0/subproblem_solutions/index.html | 0 .../taxonomy-tree}/out1.0/subproblem_solutions/index.json | 0 .../out1.0/subproblem_solutions/ott5268475.tre | 0 .../out1.0/subproblem_solutions/ott805080.tre | 0 .../subproblem_solutions/solution-degree-distributions.txt | 0 .../out1.0/subproblem_solutions/solution-ids.txt | 0 
.../data/taxonomy-tree}/out1.0/subproblems/README.md | 0 .../out1.0/subproblems/checksummed-subproblem-ids.txt | 0 .../taxonomy-tree}/out1.0/subproblems/contesting-trees.json | 0 .../out1.0/subproblems/dumped-subproblem-ids.txt | 0 .../data/taxonomy-tree}/out1.0/subproblems/index.html | 0 .../data/taxonomy-tree}/out1.0/subproblems/index.json | 0 .../out1.0/subproblems/ott5268475-tree-names.txt | 0 .../data/taxonomy-tree}/out1.0/subproblems/ott5268475.md5 | 0 .../data/taxonomy-tree}/out1.0/subproblems/ott5268475.tre | 0 .../out1.0/subproblems/ott805080-tree-names.txt | 0 .../data/taxonomy-tree}/out1.0/subproblems/ott805080.md5 | 0 .../data/taxonomy-tree}/out1.0/subproblems/ott805080.tre | 0 .../taxonomy-tree}/out1.0/subproblems/scratch/README.md | 0 .../data/taxonomy-tree}/out1.0/subproblems/scratch/args.txt | 0 .../out1.0/subproblems/scratch/ott5268475-tree-names.txt | 0 .../out1.0/subproblems/scratch/ott5268475.md5 | 0 .../out1.0/subproblems/scratch/ott5268475.tre | 0 .../out1.0/subproblems/scratch/ott805080-tree-names.txt | 0 .../taxonomy-tree}/out1.0/subproblems/scratch/ott805080.md5 | 0 .../taxonomy-tree}/out1.0/subproblems/scratch/ott805080.tre | 0 .../taxonomy-tree}/out1.0/subproblems/subproblem-ids.txt | 0 test/{ => tnrs}/expectedws-induced/.gitignore | 0 .../expectedws-induced/autocomplete_name/1/expected.json | 0 .../expectedws-induced/autocomplete_name/1/method.json | 0 .../expectedws-induced/autocomplete_name/2/expected.json | 0 .../expectedws-induced/autocomplete_name/2/method.json | 0 .../autocomplete_name/fuzzy/1/expected.json | 0 .../autocomplete_name/fuzzy/1/method.json | 0 test/{ => tnrs}/expectedws-induced/contexts/expected.json | 0 test/{ => tnrs}/expectedws-induced/contexts/method.json | 0 .../expectedws-induced/infer_contexts/ambig/expected.json | 0 .../expectedws-induced/infer_contexts/ambig/method.json | 0 .../infer_contexts/ambig_plant/expected.json | 0 .../infer_contexts/ambig_plant/method.json | 0 
.../expectedws-induced/infer_contexts/animals/expected.json | 0 .../expectedws-induced/infer_contexts/animals/method.json | 0 .../expectedws-induced/infer_contexts/life/expected.json | 0 .../expectedws-induced/infer_contexts/life/method.json | 0 .../expectedws-induced/match_names/fuzzy/expected.json | 0 .../expectedws-induced/match_names/fuzzy/method.json | 0 .../match_names/fuzzy_contexts/expected.json | 0 .../match_names/fuzzy_contexts/method.json | 0 ws/meson.build | 6 +++--- 146 files changed, 3 insertions(+), 3 deletions(-) rename test/{data/tnrs-taxonomy-induced => tnrs/data/taxonomy-induced}/about.json (100%) rename test/{data/tnrs-taxonomy-induced => tnrs/data/taxonomy-induced}/conflicts.tsv (100%) rename test/{data/tnrs-taxonomy-induced => tnrs/data/taxonomy-induced}/deprecated.tsv (100%) rename test/{data/tnrs-taxonomy-induced => tnrs/data/taxonomy-induced}/synonyms.tsv (100%) rename test/{data/tnrs-taxonomy-induced => tnrs/data/taxonomy-induced}/taxonomy.tsv (100%) rename test/{data/tnrs-taxonomy-induced => tnrs/data/taxonomy-induced}/version.txt (100%) rename test/{data/tnrs-taxonomy-tree => tnrs/data/taxonomy-tree}/out1.0/README.md (100%) rename test/{data/tnrs-taxonomy-tree => tnrs/data/taxonomy-tree}/out1.0/annotated_supertree/README.md (100%) rename test/{data/tnrs-taxonomy-tree => tnrs/data/taxonomy-tree}/out1.0/annotated_supertree/annotations.json (100%) rename test/{data/tnrs-taxonomy-tree => tnrs/data/taxonomy-tree}/out1.0/annotated_supertree/annotations1.json (100%) rename test/{data/tnrs-taxonomy-tree => tnrs/data/taxonomy-tree}/out1.0/annotated_supertree/annotations2.json (100%) rename test/{data/tnrs-taxonomy-tree => tnrs/data/taxonomy-tree}/out1.0/annotated_supertree/index.html (100%) rename test/{data/tnrs-taxonomy-tree => tnrs/data/taxonomy-tree}/out1.0/annotated_supertree/index.json (100%) rename test/{data/tnrs-taxonomy-tree => tnrs/data/taxonomy-tree}/out1.0/assessments/README.md (100%) rename test/{data/tnrs-taxonomy-tree => 
tnrs/data/taxonomy-tree}/out1.0/assessments/index.html (100%) rename test/{data/tnrs-taxonomy-tree => tnrs/data/taxonomy-tree}/out1.0/assessments/index.json (100%) rename test/{data/tnrs-taxonomy-tree => tnrs/data/taxonomy-tree}/out1.0/assessments/log.txt (100%) rename test/{data/tnrs-taxonomy-tree => tnrs/data/taxonomy-tree}/out1.0/assessments/lost_taxa.txt (100%) rename test/{data/tnrs-taxonomy-tree => tnrs/data/taxonomy-tree}/out1.0/assessments/summary.json (100%) rename test/{data/tnrs-taxonomy-tree => tnrs/data/taxonomy-tree}/out1.0/assessments/supertree_degree_distribution.txt (100%) rename test/{data/tnrs-taxonomy-tree => tnrs/data/taxonomy-tree}/out1.0/assessments/taxonomy_degree_distribution.txt (100%) rename test/{data/tnrs-taxonomy-tree => tnrs/data/taxonomy-tree}/out1.0/cleaned_ott/README.md (100%) rename test/{data/tnrs-taxonomy-tree => tnrs/data/taxonomy-tree}/out1.0/cleaned_ott/cleaned_not_updated_ott.tre (100%) rename test/{data/tnrs-taxonomy-tree => tnrs/data/taxonomy-tree}/out1.0/cleaned_ott/cleaned_ott.json (100%) rename test/{data/tnrs-taxonomy-tree => tnrs/data/taxonomy-tree}/out1.0/cleaned_ott/cleaned_ott.tre (100%) rename test/{data/tnrs-taxonomy-tree => tnrs/data/taxonomy-tree}/out1.0/cleaned_ott/cleaned_ott_1.json (100%) rename test/{data/tnrs-taxonomy-tree => tnrs/data/taxonomy-tree}/out1.0/cleaned_ott/cleaned_ott_pruned_nonflagged.json (100%) rename test/{data/tnrs-taxonomy-tree => tnrs/data/taxonomy-tree}/out1.0/cleaned_ott/cleaned_ott_with_hiddenbarren.tre (100%) rename test/{data/tnrs-taxonomy-tree => tnrs/data/taxonomy-tree}/out1.0/cleaned_ott/cleaning_flags.txt (100%) rename test/{data/tnrs-taxonomy-tree => tnrs/data/taxonomy-tree}/out1.0/cleaned_ott/flagged_in_cleaned.json (100%) rename test/{data/tnrs-taxonomy-tree => tnrs/data/taxonomy-tree}/out1.0/cleaned_ott/index.html (100%) rename test/{data/tnrs-taxonomy-tree => tnrs/data/taxonomy-tree}/out1.0/cleaned_ott/index.json (100%) rename test/{data/tnrs-taxonomy-tree => 
tnrs/data/taxonomy-tree}/out1.0/cleaned_ott/move_extinct_higher_log.json (100%) rename test/{data/tnrs-taxonomy-tree => tnrs/data/taxonomy-tree}/out1.0/cleaned_ott/ott_version.txt (100%) rename test/{data/tnrs-taxonomy-tree => tnrs/data/taxonomy-tree}/out1.0/cleaned_ott/root_ott_id.txt (100%) rename test/{data/tnrs-taxonomy-tree => tnrs/data/taxonomy-tree}/out1.0/cleaned_phylo/README.md (100%) rename test/{data/tnrs-taxonomy-tree => tnrs/data/taxonomy-tree}/out1.0/cleaned_phylo/cleaning_flags.txt (100%) rename test/{data/tnrs-taxonomy-tree => tnrs/data/taxonomy-tree}/out1.0/cleaned_phylo/index.html (100%) rename test/{data/tnrs-taxonomy-tree => tnrs/data/taxonomy-tree}/out1.0/cleaned_phylo/index.json (100%) rename test/{data/tnrs-taxonomy-tree => tnrs/data/taxonomy-tree}/out1.0/cleaned_phylo/needs_updating.txt (100%) rename test/{data/tnrs-taxonomy-tree => tnrs/data/taxonomy-tree}/out1.0/cleaned_phylo/phylo_inputs_cleaned.txt (100%) rename test/{data/tnrs-taxonomy-tree => tnrs/data/taxonomy-tree}/out1.0/cleaned_phylo/root_ott_id.txt (100%) rename test/{data/tnrs-taxonomy-tree => tnrs/data/taxonomy-tree}/out1.0/cleaned_phylo/xx_1@1-taxonomy.tre (100%) rename test/{data/tnrs-taxonomy-tree => tnrs/data/taxonomy-tree}/out1.0/cleaned_phylo/xx_1@1.json (100%) rename test/{data/tnrs-taxonomy-tree => tnrs/data/taxonomy-tree}/out1.0/cleaned_phylo/xx_1@1.tre (100%) rename test/{data/tnrs-taxonomy-tree => tnrs/data/taxonomy-tree}/out1.0/cleaned_phylo/xx_2@1-taxonomy.tre (100%) rename test/{data/tnrs-taxonomy-tree => tnrs/data/taxonomy-tree}/out1.0/cleaned_phylo/xx_2@1.json (100%) rename test/{data/tnrs-taxonomy-tree => tnrs/data/taxonomy-tree}/out1.0/cleaned_phylo/xx_2@1.tre (100%) rename test/{data/tnrs-taxonomy-tree => tnrs/data/taxonomy-tree}/out1.0/config (100%) rename test/{data/tnrs-taxonomy-tree => tnrs/data/taxonomy-tree}/out1.0/exemplified_phylo/README.md (100%) rename test/{data/tnrs-taxonomy-tree => tnrs/data/taxonomy-tree}/out1.0/exemplified_phylo/args.txt (100%) 
rename test/{data/tnrs-taxonomy-tree => tnrs/data/taxonomy-tree}/out1.0/exemplified_phylo/exemplified_log.json (100%) rename test/{data/tnrs-taxonomy-tree => tnrs/data/taxonomy-tree}/out1.0/exemplified_phylo/incertae_sedis.txt (100%) rename test/{data/tnrs-taxonomy-tree => tnrs/data/taxonomy-tree}/out1.0/exemplified_phylo/index.html (100%) rename test/{data/tnrs-taxonomy-tree => tnrs/data/taxonomy-tree}/out1.0/exemplified_phylo/index.json (100%) rename test/{data/tnrs-taxonomy-tree => tnrs/data/taxonomy-tree}/out1.0/exemplified_phylo/nonempty_trees.txt (100%) rename test/{data/tnrs-taxonomy-tree => tnrs/data/taxonomy-tree}/out1.0/exemplified_phylo/pruned_for_regraft_cleaned_ott.json (100%) rename test/{data/tnrs-taxonomy-tree => tnrs/data/taxonomy-tree}/out1.0/exemplified_phylo/pruned_taxonomy_degree_distribution.txt (100%) rename test/{data/tnrs-taxonomy-tree => tnrs/data/taxonomy-tree}/out1.0/exemplified_phylo/regraft_cleaned_ott.tre (100%) rename test/{data/tnrs-taxonomy-tree => tnrs/data/taxonomy-tree}/out1.0/exemplified_phylo/taxonomy.tre (100%) rename test/{data/tnrs-taxonomy-tree => tnrs/data/taxonomy-tree}/out1.0/exemplified_phylo/xx_1@1.tre (100%) rename test/{data/tnrs-taxonomy-tree => tnrs/data/taxonomy-tree}/out1.0/exemplified_phylo/xx_2@1.tre (100%) rename test/{data/tnrs-taxonomy-tree => tnrs/data/taxonomy-tree}/out1.0/grafted_solution/README.md (100%) rename test/{data/tnrs-taxonomy-tree => tnrs/data/taxonomy-tree}/out1.0/grafted_solution/grafted_solution.tre (100%) rename test/{data/tnrs-taxonomy-tree => tnrs/data/taxonomy-tree}/out1.0/grafted_solution/grafted_solution_ottnames.tre (100%) rename test/{data/tnrs-taxonomy-tree => tnrs/data/taxonomy-tree}/out1.0/grafted_solution/index.html (100%) rename test/{data/tnrs-taxonomy-tree => tnrs/data/taxonomy-tree}/out1.0/grafted_solution/index.json (100%) rename test/{data/tnrs-taxonomy-tree => tnrs/data/taxonomy-tree}/out1.0/index.html (100%) rename test/{data/tnrs-taxonomy-tree => 
tnrs/data/taxonomy-tree}/out1.0/index.json (100%) rename test/{data/tnrs-taxonomy-tree => tnrs/data/taxonomy-tree}/out1.0/labelled_supertree/README.md (100%) rename test/{data/tnrs-taxonomy-tree => tnrs/data/taxonomy-tree}/out1.0/labelled_supertree/broken_taxa.json (100%) rename test/{data/tnrs-taxonomy-tree => tnrs/data/taxonomy-tree}/out1.0/labelled_supertree/index.html (100%) rename test/{data/tnrs-taxonomy-tree => tnrs/data/taxonomy-tree}/out1.0/labelled_supertree/index.json (100%) rename test/{data/tnrs-taxonomy-tree => tnrs/data/taxonomy-tree}/out1.0/labelled_supertree/input_output_stats.json (100%) rename test/{data/tnrs-taxonomy-tree => tnrs/data/taxonomy-tree}/out1.0/labelled_supertree/labelled_supertree.tre (100%) rename test/{data/tnrs-taxonomy-tree => tnrs/data/taxonomy-tree}/out1.0/labelled_supertree/labelled_supertree_ottnames.tre (100%) rename test/{data/tnrs-taxonomy-tree => tnrs/data/taxonomy-tree}/out1.0/labelled_supertree/labelled_supertree_ottnames_without_monotypic.tre (100%) rename test/{data/tnrs-taxonomy-tree => tnrs/data/taxonomy-tree}/out1.0/labelled_supertree/labelled_supertree_out_degree_distribution.txt (100%) rename test/{data/tnrs-taxonomy-tree => tnrs/data/taxonomy-tree}/out1.0/labelled_supertree/labelled_supertree_simplified_ottnames.tre (100%) rename test/{data/tnrs-taxonomy-tree => tnrs/data/taxonomy-tree}/out1.0/labelled_supertree/labelled_supertree_simplified_ottnames_without_monotypic.tre (100%) rename test/{data/tnrs-taxonomy-tree => tnrs/data/taxonomy-tree}/out1.0/labelled_supertree/simplified_ottnames.log (100%) rename test/{data/tnrs-taxonomy-tree => tnrs/data/taxonomy-tree}/out1.0/labelled_supertree/simplified_ottnames_without_monotypic.log (100%) rename test/{data/tnrs-taxonomy-tree => tnrs/data/taxonomy-tree}/out1.0/phylo_input/README.md (100%) rename test/{data/tnrs-taxonomy-tree => tnrs/data/taxonomy-tree}/out1.0/phylo_input/collections.txt (100%) rename test/{data/tnrs-taxonomy-tree => 
tnrs/data/taxonomy-tree}/out1.0/phylo_input/index.html (100%) rename test/{data/tnrs-taxonomy-tree => tnrs/data/taxonomy-tree}/out1.0/phylo_input/index.json (100%) rename test/{data/tnrs-taxonomy-tree => tnrs/data/taxonomy-tree}/out1.0/phylo_input/rank_collection.json (100%) rename test/{data/tnrs-taxonomy-tree => tnrs/data/taxonomy-tree}/out1.0/phylo_input/study_tree_pairs.txt (100%) rename test/{data/tnrs-taxonomy-tree => tnrs/data/taxonomy-tree}/out1.0/phylo_snapshot/README.md (100%) rename test/{data/tnrs-taxonomy-tree => tnrs/data/taxonomy-tree}/out1.0/phylo_snapshot/collections_git_shas.txt (100%) rename test/{data/tnrs-taxonomy-tree => tnrs/data/taxonomy-tree}/out1.0/phylo_snapshot/concrete_rank_collection.json (100%) rename test/{data/tnrs-taxonomy-tree => tnrs/data/taxonomy-tree}/out1.0/phylo_snapshot/git_shas.txt (100%) rename test/{data/tnrs-taxonomy-tree => tnrs/data/taxonomy-tree}/out1.0/phylo_snapshot/index.html (100%) rename test/{data/tnrs-taxonomy-tree => tnrs/data/taxonomy-tree}/out1.0/phylo_snapshot/index.json (100%) rename test/{data/tnrs-taxonomy-tree => tnrs/data/taxonomy-tree}/out1.0/phylo_snapshot/xx_1@1.json (100%) rename test/{data/tnrs-taxonomy-tree => tnrs/data/taxonomy-tree}/out1.0/phylo_snapshot/xx_2@1.json (100%) rename test/{data/tnrs-taxonomy-tree => tnrs/data/taxonomy-tree}/out1.0/subproblem_solutions/README.md (100%) rename test/{data/tnrs-taxonomy-tree => tnrs/data/taxonomy-tree}/out1.0/subproblem_solutions/index.html (100%) rename test/{data/tnrs-taxonomy-tree => tnrs/data/taxonomy-tree}/out1.0/subproblem_solutions/index.json (100%) rename test/{data/tnrs-taxonomy-tree => tnrs/data/taxonomy-tree}/out1.0/subproblem_solutions/ott5268475.tre (100%) rename test/{data/tnrs-taxonomy-tree => tnrs/data/taxonomy-tree}/out1.0/subproblem_solutions/ott805080.tre (100%) rename test/{data/tnrs-taxonomy-tree => tnrs/data/taxonomy-tree}/out1.0/subproblem_solutions/solution-degree-distributions.txt (100%) rename test/{data/tnrs-taxonomy-tree => 
tnrs/data/taxonomy-tree}/out1.0/subproblem_solutions/solution-ids.txt (100%) rename test/{data/tnrs-taxonomy-tree => tnrs/data/taxonomy-tree}/out1.0/subproblems/README.md (100%) rename test/{data/tnrs-taxonomy-tree => tnrs/data/taxonomy-tree}/out1.0/subproblems/checksummed-subproblem-ids.txt (100%) rename test/{data/tnrs-taxonomy-tree => tnrs/data/taxonomy-tree}/out1.0/subproblems/contesting-trees.json (100%) rename test/{data/tnrs-taxonomy-tree => tnrs/data/taxonomy-tree}/out1.0/subproblems/dumped-subproblem-ids.txt (100%) rename test/{data/tnrs-taxonomy-tree => tnrs/data/taxonomy-tree}/out1.0/subproblems/index.html (100%) rename test/{data/tnrs-taxonomy-tree => tnrs/data/taxonomy-tree}/out1.0/subproblems/index.json (100%) rename test/{data/tnrs-taxonomy-tree => tnrs/data/taxonomy-tree}/out1.0/subproblems/ott5268475-tree-names.txt (100%) rename test/{data/tnrs-taxonomy-tree => tnrs/data/taxonomy-tree}/out1.0/subproblems/ott5268475.md5 (100%) rename test/{data/tnrs-taxonomy-tree => tnrs/data/taxonomy-tree}/out1.0/subproblems/ott5268475.tre (100%) rename test/{data/tnrs-taxonomy-tree => tnrs/data/taxonomy-tree}/out1.0/subproblems/ott805080-tree-names.txt (100%) rename test/{data/tnrs-taxonomy-tree => tnrs/data/taxonomy-tree}/out1.0/subproblems/ott805080.md5 (100%) rename test/{data/tnrs-taxonomy-tree => tnrs/data/taxonomy-tree}/out1.0/subproblems/ott805080.tre (100%) rename test/{data/tnrs-taxonomy-tree => tnrs/data/taxonomy-tree}/out1.0/subproblems/scratch/README.md (100%) rename test/{data/tnrs-taxonomy-tree => tnrs/data/taxonomy-tree}/out1.0/subproblems/scratch/args.txt (100%) rename test/{data/tnrs-taxonomy-tree => tnrs/data/taxonomy-tree}/out1.0/subproblems/scratch/ott5268475-tree-names.txt (100%) rename test/{data/tnrs-taxonomy-tree => tnrs/data/taxonomy-tree}/out1.0/subproblems/scratch/ott5268475.md5 (100%) rename test/{data/tnrs-taxonomy-tree => tnrs/data/taxonomy-tree}/out1.0/subproblems/scratch/ott5268475.tre (100%) rename test/{data/tnrs-taxonomy-tree => 
tnrs/data/taxonomy-tree}/out1.0/subproblems/scratch/ott805080-tree-names.txt (100%) rename test/{data/tnrs-taxonomy-tree => tnrs/data/taxonomy-tree}/out1.0/subproblems/scratch/ott805080.md5 (100%) rename test/{data/tnrs-taxonomy-tree => tnrs/data/taxonomy-tree}/out1.0/subproblems/scratch/ott805080.tre (100%) rename test/{data/tnrs-taxonomy-tree => tnrs/data/taxonomy-tree}/out1.0/subproblems/subproblem-ids.txt (100%) rename test/{ => tnrs}/expectedws-induced/.gitignore (100%) rename test/{ => tnrs}/expectedws-induced/autocomplete_name/1/expected.json (100%) rename test/{ => tnrs}/expectedws-induced/autocomplete_name/1/method.json (100%) rename test/{ => tnrs}/expectedws-induced/autocomplete_name/2/expected.json (100%) rename test/{ => tnrs}/expectedws-induced/autocomplete_name/2/method.json (100%) rename test/{ => tnrs}/expectedws-induced/autocomplete_name/fuzzy/1/expected.json (100%) rename test/{ => tnrs}/expectedws-induced/autocomplete_name/fuzzy/1/method.json (100%) rename test/{ => tnrs}/expectedws-induced/contexts/expected.json (100%) rename test/{ => tnrs}/expectedws-induced/contexts/method.json (100%) rename test/{ => tnrs}/expectedws-induced/infer_contexts/ambig/expected.json (100%) rename test/{ => tnrs}/expectedws-induced/infer_contexts/ambig/method.json (100%) rename test/{ => tnrs}/expectedws-induced/infer_contexts/ambig_plant/expected.json (100%) rename test/{ => tnrs}/expectedws-induced/infer_contexts/ambig_plant/method.json (100%) rename test/{ => tnrs}/expectedws-induced/infer_contexts/animals/expected.json (100%) rename test/{ => tnrs}/expectedws-induced/infer_contexts/animals/method.json (100%) rename test/{ => tnrs}/expectedws-induced/infer_contexts/life/expected.json (100%) rename test/{ => tnrs}/expectedws-induced/infer_contexts/life/method.json (100%) rename test/{ => tnrs}/expectedws-induced/match_names/fuzzy/expected.json (100%) rename test/{ => tnrs}/expectedws-induced/match_names/fuzzy/method.json (100%) rename test/{ => 
tnrs}/expectedws-induced/match_names/fuzzy_contexts/expected.json (100%) rename test/{ => tnrs}/expectedws-induced/match_names/fuzzy_contexts/method.json (100%) diff --git a/test/data/tnrs-taxonomy-induced/about.json b/test/tnrs/data/taxonomy-induced/about.json similarity index 100% rename from test/data/tnrs-taxonomy-induced/about.json rename to test/tnrs/data/taxonomy-induced/about.json diff --git a/test/data/tnrs-taxonomy-induced/conflicts.tsv b/test/tnrs/data/taxonomy-induced/conflicts.tsv similarity index 100% rename from test/data/tnrs-taxonomy-induced/conflicts.tsv rename to test/tnrs/data/taxonomy-induced/conflicts.tsv diff --git a/test/data/tnrs-taxonomy-induced/deprecated.tsv b/test/tnrs/data/taxonomy-induced/deprecated.tsv similarity index 100% rename from test/data/tnrs-taxonomy-induced/deprecated.tsv rename to test/tnrs/data/taxonomy-induced/deprecated.tsv diff --git a/test/data/tnrs-taxonomy-induced/synonyms.tsv b/test/tnrs/data/taxonomy-induced/synonyms.tsv similarity index 100% rename from test/data/tnrs-taxonomy-induced/synonyms.tsv rename to test/tnrs/data/taxonomy-induced/synonyms.tsv diff --git a/test/data/tnrs-taxonomy-induced/taxonomy.tsv b/test/tnrs/data/taxonomy-induced/taxonomy.tsv similarity index 100% rename from test/data/tnrs-taxonomy-induced/taxonomy.tsv rename to test/tnrs/data/taxonomy-induced/taxonomy.tsv diff --git a/test/data/tnrs-taxonomy-induced/version.txt b/test/tnrs/data/taxonomy-induced/version.txt similarity index 100% rename from test/data/tnrs-taxonomy-induced/version.txt rename to test/tnrs/data/taxonomy-induced/version.txt diff --git a/test/data/tnrs-taxonomy-tree/out1.0/README.md b/test/tnrs/data/taxonomy-tree/out1.0/README.md similarity index 100% rename from test/data/tnrs-taxonomy-tree/out1.0/README.md rename to test/tnrs/data/taxonomy-tree/out1.0/README.md diff --git a/test/data/tnrs-taxonomy-tree/out1.0/annotated_supertree/README.md b/test/tnrs/data/taxonomy-tree/out1.0/annotated_supertree/README.md similarity 
index 100% rename from test/data/tnrs-taxonomy-tree/out1.0/annotated_supertree/README.md rename to test/tnrs/data/taxonomy-tree/out1.0/annotated_supertree/README.md diff --git a/test/data/tnrs-taxonomy-tree/out1.0/annotated_supertree/annotations.json b/test/tnrs/data/taxonomy-tree/out1.0/annotated_supertree/annotations.json similarity index 100% rename from test/data/tnrs-taxonomy-tree/out1.0/annotated_supertree/annotations.json rename to test/tnrs/data/taxonomy-tree/out1.0/annotated_supertree/annotations.json diff --git a/test/data/tnrs-taxonomy-tree/out1.0/annotated_supertree/annotations1.json b/test/tnrs/data/taxonomy-tree/out1.0/annotated_supertree/annotations1.json similarity index 100% rename from test/data/tnrs-taxonomy-tree/out1.0/annotated_supertree/annotations1.json rename to test/tnrs/data/taxonomy-tree/out1.0/annotated_supertree/annotations1.json diff --git a/test/data/tnrs-taxonomy-tree/out1.0/annotated_supertree/annotations2.json b/test/tnrs/data/taxonomy-tree/out1.0/annotated_supertree/annotations2.json similarity index 100% rename from test/data/tnrs-taxonomy-tree/out1.0/annotated_supertree/annotations2.json rename to test/tnrs/data/taxonomy-tree/out1.0/annotated_supertree/annotations2.json diff --git a/test/data/tnrs-taxonomy-tree/out1.0/annotated_supertree/index.html b/test/tnrs/data/taxonomy-tree/out1.0/annotated_supertree/index.html similarity index 100% rename from test/data/tnrs-taxonomy-tree/out1.0/annotated_supertree/index.html rename to test/tnrs/data/taxonomy-tree/out1.0/annotated_supertree/index.html diff --git a/test/data/tnrs-taxonomy-tree/out1.0/annotated_supertree/index.json b/test/tnrs/data/taxonomy-tree/out1.0/annotated_supertree/index.json similarity index 100% rename from test/data/tnrs-taxonomy-tree/out1.0/annotated_supertree/index.json rename to test/tnrs/data/taxonomy-tree/out1.0/annotated_supertree/index.json diff --git a/test/data/tnrs-taxonomy-tree/out1.0/assessments/README.md 
b/test/tnrs/data/taxonomy-tree/out1.0/assessments/README.md similarity index 100% rename from test/data/tnrs-taxonomy-tree/out1.0/assessments/README.md rename to test/tnrs/data/taxonomy-tree/out1.0/assessments/README.md diff --git a/test/data/tnrs-taxonomy-tree/out1.0/assessments/index.html b/test/tnrs/data/taxonomy-tree/out1.0/assessments/index.html similarity index 100% rename from test/data/tnrs-taxonomy-tree/out1.0/assessments/index.html rename to test/tnrs/data/taxonomy-tree/out1.0/assessments/index.html diff --git a/test/data/tnrs-taxonomy-tree/out1.0/assessments/index.json b/test/tnrs/data/taxonomy-tree/out1.0/assessments/index.json similarity index 100% rename from test/data/tnrs-taxonomy-tree/out1.0/assessments/index.json rename to test/tnrs/data/taxonomy-tree/out1.0/assessments/index.json diff --git a/test/data/tnrs-taxonomy-tree/out1.0/assessments/log.txt b/test/tnrs/data/taxonomy-tree/out1.0/assessments/log.txt similarity index 100% rename from test/data/tnrs-taxonomy-tree/out1.0/assessments/log.txt rename to test/tnrs/data/taxonomy-tree/out1.0/assessments/log.txt diff --git a/test/data/tnrs-taxonomy-tree/out1.0/assessments/lost_taxa.txt b/test/tnrs/data/taxonomy-tree/out1.0/assessments/lost_taxa.txt similarity index 100% rename from test/data/tnrs-taxonomy-tree/out1.0/assessments/lost_taxa.txt rename to test/tnrs/data/taxonomy-tree/out1.0/assessments/lost_taxa.txt diff --git a/test/data/tnrs-taxonomy-tree/out1.0/assessments/summary.json b/test/tnrs/data/taxonomy-tree/out1.0/assessments/summary.json similarity index 100% rename from test/data/tnrs-taxonomy-tree/out1.0/assessments/summary.json rename to test/tnrs/data/taxonomy-tree/out1.0/assessments/summary.json diff --git a/test/data/tnrs-taxonomy-tree/out1.0/assessments/supertree_degree_distribution.txt b/test/tnrs/data/taxonomy-tree/out1.0/assessments/supertree_degree_distribution.txt similarity index 100% rename from test/data/tnrs-taxonomy-tree/out1.0/assessments/supertree_degree_distribution.txt 
rename to test/tnrs/data/taxonomy-tree/out1.0/assessments/supertree_degree_distribution.txt diff --git a/test/data/tnrs-taxonomy-tree/out1.0/assessments/taxonomy_degree_distribution.txt b/test/tnrs/data/taxonomy-tree/out1.0/assessments/taxonomy_degree_distribution.txt similarity index 100% rename from test/data/tnrs-taxonomy-tree/out1.0/assessments/taxonomy_degree_distribution.txt rename to test/tnrs/data/taxonomy-tree/out1.0/assessments/taxonomy_degree_distribution.txt diff --git a/test/data/tnrs-taxonomy-tree/out1.0/cleaned_ott/README.md b/test/tnrs/data/taxonomy-tree/out1.0/cleaned_ott/README.md similarity index 100% rename from test/data/tnrs-taxonomy-tree/out1.0/cleaned_ott/README.md rename to test/tnrs/data/taxonomy-tree/out1.0/cleaned_ott/README.md diff --git a/test/data/tnrs-taxonomy-tree/out1.0/cleaned_ott/cleaned_not_updated_ott.tre b/test/tnrs/data/taxonomy-tree/out1.0/cleaned_ott/cleaned_not_updated_ott.tre similarity index 100% rename from test/data/tnrs-taxonomy-tree/out1.0/cleaned_ott/cleaned_not_updated_ott.tre rename to test/tnrs/data/taxonomy-tree/out1.0/cleaned_ott/cleaned_not_updated_ott.tre diff --git a/test/data/tnrs-taxonomy-tree/out1.0/cleaned_ott/cleaned_ott.json b/test/tnrs/data/taxonomy-tree/out1.0/cleaned_ott/cleaned_ott.json similarity index 100% rename from test/data/tnrs-taxonomy-tree/out1.0/cleaned_ott/cleaned_ott.json rename to test/tnrs/data/taxonomy-tree/out1.0/cleaned_ott/cleaned_ott.json diff --git a/test/data/tnrs-taxonomy-tree/out1.0/cleaned_ott/cleaned_ott.tre b/test/tnrs/data/taxonomy-tree/out1.0/cleaned_ott/cleaned_ott.tre similarity index 100% rename from test/data/tnrs-taxonomy-tree/out1.0/cleaned_ott/cleaned_ott.tre rename to test/tnrs/data/taxonomy-tree/out1.0/cleaned_ott/cleaned_ott.tre diff --git a/test/data/tnrs-taxonomy-tree/out1.0/cleaned_ott/cleaned_ott_1.json b/test/tnrs/data/taxonomy-tree/out1.0/cleaned_ott/cleaned_ott_1.json similarity index 100% rename from 
test/data/tnrs-taxonomy-tree/out1.0/cleaned_ott/cleaned_ott_1.json rename to test/tnrs/data/taxonomy-tree/out1.0/cleaned_ott/cleaned_ott_1.json diff --git a/test/data/tnrs-taxonomy-tree/out1.0/cleaned_ott/cleaned_ott_pruned_nonflagged.json b/test/tnrs/data/taxonomy-tree/out1.0/cleaned_ott/cleaned_ott_pruned_nonflagged.json similarity index 100% rename from test/data/tnrs-taxonomy-tree/out1.0/cleaned_ott/cleaned_ott_pruned_nonflagged.json rename to test/tnrs/data/taxonomy-tree/out1.0/cleaned_ott/cleaned_ott_pruned_nonflagged.json diff --git a/test/data/tnrs-taxonomy-tree/out1.0/cleaned_ott/cleaned_ott_with_hiddenbarren.tre b/test/tnrs/data/taxonomy-tree/out1.0/cleaned_ott/cleaned_ott_with_hiddenbarren.tre similarity index 100% rename from test/data/tnrs-taxonomy-tree/out1.0/cleaned_ott/cleaned_ott_with_hiddenbarren.tre rename to test/tnrs/data/taxonomy-tree/out1.0/cleaned_ott/cleaned_ott_with_hiddenbarren.tre diff --git a/test/data/tnrs-taxonomy-tree/out1.0/cleaned_ott/cleaning_flags.txt b/test/tnrs/data/taxonomy-tree/out1.0/cleaned_ott/cleaning_flags.txt similarity index 100% rename from test/data/tnrs-taxonomy-tree/out1.0/cleaned_ott/cleaning_flags.txt rename to test/tnrs/data/taxonomy-tree/out1.0/cleaned_ott/cleaning_flags.txt diff --git a/test/data/tnrs-taxonomy-tree/out1.0/cleaned_ott/flagged_in_cleaned.json b/test/tnrs/data/taxonomy-tree/out1.0/cleaned_ott/flagged_in_cleaned.json similarity index 100% rename from test/data/tnrs-taxonomy-tree/out1.0/cleaned_ott/flagged_in_cleaned.json rename to test/tnrs/data/taxonomy-tree/out1.0/cleaned_ott/flagged_in_cleaned.json diff --git a/test/data/tnrs-taxonomy-tree/out1.0/cleaned_ott/index.html b/test/tnrs/data/taxonomy-tree/out1.0/cleaned_ott/index.html similarity index 100% rename from test/data/tnrs-taxonomy-tree/out1.0/cleaned_ott/index.html rename to test/tnrs/data/taxonomy-tree/out1.0/cleaned_ott/index.html diff --git a/test/data/tnrs-taxonomy-tree/out1.0/cleaned_ott/index.json 
b/test/tnrs/data/taxonomy-tree/out1.0/cleaned_ott/index.json similarity index 100% rename from test/data/tnrs-taxonomy-tree/out1.0/cleaned_ott/index.json rename to test/tnrs/data/taxonomy-tree/out1.0/cleaned_ott/index.json diff --git a/test/data/tnrs-taxonomy-tree/out1.0/cleaned_ott/move_extinct_higher_log.json b/test/tnrs/data/taxonomy-tree/out1.0/cleaned_ott/move_extinct_higher_log.json similarity index 100% rename from test/data/tnrs-taxonomy-tree/out1.0/cleaned_ott/move_extinct_higher_log.json rename to test/tnrs/data/taxonomy-tree/out1.0/cleaned_ott/move_extinct_higher_log.json diff --git a/test/data/tnrs-taxonomy-tree/out1.0/cleaned_ott/ott_version.txt b/test/tnrs/data/taxonomy-tree/out1.0/cleaned_ott/ott_version.txt similarity index 100% rename from test/data/tnrs-taxonomy-tree/out1.0/cleaned_ott/ott_version.txt rename to test/tnrs/data/taxonomy-tree/out1.0/cleaned_ott/ott_version.txt diff --git a/test/data/tnrs-taxonomy-tree/out1.0/cleaned_ott/root_ott_id.txt b/test/tnrs/data/taxonomy-tree/out1.0/cleaned_ott/root_ott_id.txt similarity index 100% rename from test/data/tnrs-taxonomy-tree/out1.0/cleaned_ott/root_ott_id.txt rename to test/tnrs/data/taxonomy-tree/out1.0/cleaned_ott/root_ott_id.txt diff --git a/test/data/tnrs-taxonomy-tree/out1.0/cleaned_phylo/README.md b/test/tnrs/data/taxonomy-tree/out1.0/cleaned_phylo/README.md similarity index 100% rename from test/data/tnrs-taxonomy-tree/out1.0/cleaned_phylo/README.md rename to test/tnrs/data/taxonomy-tree/out1.0/cleaned_phylo/README.md diff --git a/test/data/tnrs-taxonomy-tree/out1.0/cleaned_phylo/cleaning_flags.txt b/test/tnrs/data/taxonomy-tree/out1.0/cleaned_phylo/cleaning_flags.txt similarity index 100% rename from test/data/tnrs-taxonomy-tree/out1.0/cleaned_phylo/cleaning_flags.txt rename to test/tnrs/data/taxonomy-tree/out1.0/cleaned_phylo/cleaning_flags.txt diff --git a/test/data/tnrs-taxonomy-tree/out1.0/cleaned_phylo/index.html b/test/tnrs/data/taxonomy-tree/out1.0/cleaned_phylo/index.html 
similarity index 100% rename from test/data/tnrs-taxonomy-tree/out1.0/cleaned_phylo/index.html rename to test/tnrs/data/taxonomy-tree/out1.0/cleaned_phylo/index.html diff --git a/test/data/tnrs-taxonomy-tree/out1.0/cleaned_phylo/index.json b/test/tnrs/data/taxonomy-tree/out1.0/cleaned_phylo/index.json similarity index 100% rename from test/data/tnrs-taxonomy-tree/out1.0/cleaned_phylo/index.json rename to test/tnrs/data/taxonomy-tree/out1.0/cleaned_phylo/index.json diff --git a/test/data/tnrs-taxonomy-tree/out1.0/cleaned_phylo/needs_updating.txt b/test/tnrs/data/taxonomy-tree/out1.0/cleaned_phylo/needs_updating.txt similarity index 100% rename from test/data/tnrs-taxonomy-tree/out1.0/cleaned_phylo/needs_updating.txt rename to test/tnrs/data/taxonomy-tree/out1.0/cleaned_phylo/needs_updating.txt diff --git a/test/data/tnrs-taxonomy-tree/out1.0/cleaned_phylo/phylo_inputs_cleaned.txt b/test/tnrs/data/taxonomy-tree/out1.0/cleaned_phylo/phylo_inputs_cleaned.txt similarity index 100% rename from test/data/tnrs-taxonomy-tree/out1.0/cleaned_phylo/phylo_inputs_cleaned.txt rename to test/tnrs/data/taxonomy-tree/out1.0/cleaned_phylo/phylo_inputs_cleaned.txt diff --git a/test/data/tnrs-taxonomy-tree/out1.0/cleaned_phylo/root_ott_id.txt b/test/tnrs/data/taxonomy-tree/out1.0/cleaned_phylo/root_ott_id.txt similarity index 100% rename from test/data/tnrs-taxonomy-tree/out1.0/cleaned_phylo/root_ott_id.txt rename to test/tnrs/data/taxonomy-tree/out1.0/cleaned_phylo/root_ott_id.txt diff --git a/test/data/tnrs-taxonomy-tree/out1.0/cleaned_phylo/xx_1@1-taxonomy.tre b/test/tnrs/data/taxonomy-tree/out1.0/cleaned_phylo/xx_1@1-taxonomy.tre similarity index 100% rename from test/data/tnrs-taxonomy-tree/out1.0/cleaned_phylo/xx_1@1-taxonomy.tre rename to test/tnrs/data/taxonomy-tree/out1.0/cleaned_phylo/xx_1@1-taxonomy.tre diff --git a/test/data/tnrs-taxonomy-tree/out1.0/cleaned_phylo/xx_1@1.json b/test/tnrs/data/taxonomy-tree/out1.0/cleaned_phylo/xx_1@1.json similarity index 100% rename from 
test/data/tnrs-taxonomy-tree/out1.0/cleaned_phylo/xx_1@1.json rename to test/tnrs/data/taxonomy-tree/out1.0/cleaned_phylo/xx_1@1.json diff --git a/test/data/tnrs-taxonomy-tree/out1.0/cleaned_phylo/xx_1@1.tre b/test/tnrs/data/taxonomy-tree/out1.0/cleaned_phylo/xx_1@1.tre similarity index 100% rename from test/data/tnrs-taxonomy-tree/out1.0/cleaned_phylo/xx_1@1.tre rename to test/tnrs/data/taxonomy-tree/out1.0/cleaned_phylo/xx_1@1.tre diff --git a/test/data/tnrs-taxonomy-tree/out1.0/cleaned_phylo/xx_2@1-taxonomy.tre b/test/tnrs/data/taxonomy-tree/out1.0/cleaned_phylo/xx_2@1-taxonomy.tre similarity index 100% rename from test/data/tnrs-taxonomy-tree/out1.0/cleaned_phylo/xx_2@1-taxonomy.tre rename to test/tnrs/data/taxonomy-tree/out1.0/cleaned_phylo/xx_2@1-taxonomy.tre diff --git a/test/data/tnrs-taxonomy-tree/out1.0/cleaned_phylo/xx_2@1.json b/test/tnrs/data/taxonomy-tree/out1.0/cleaned_phylo/xx_2@1.json similarity index 100% rename from test/data/tnrs-taxonomy-tree/out1.0/cleaned_phylo/xx_2@1.json rename to test/tnrs/data/taxonomy-tree/out1.0/cleaned_phylo/xx_2@1.json diff --git a/test/data/tnrs-taxonomy-tree/out1.0/cleaned_phylo/xx_2@1.tre b/test/tnrs/data/taxonomy-tree/out1.0/cleaned_phylo/xx_2@1.tre similarity index 100% rename from test/data/tnrs-taxonomy-tree/out1.0/cleaned_phylo/xx_2@1.tre rename to test/tnrs/data/taxonomy-tree/out1.0/cleaned_phylo/xx_2@1.tre diff --git a/test/data/tnrs-taxonomy-tree/out1.0/config b/test/tnrs/data/taxonomy-tree/out1.0/config similarity index 100% rename from test/data/tnrs-taxonomy-tree/out1.0/config rename to test/tnrs/data/taxonomy-tree/out1.0/config diff --git a/test/data/tnrs-taxonomy-tree/out1.0/exemplified_phylo/README.md b/test/tnrs/data/taxonomy-tree/out1.0/exemplified_phylo/README.md similarity index 100% rename from test/data/tnrs-taxonomy-tree/out1.0/exemplified_phylo/README.md rename to test/tnrs/data/taxonomy-tree/out1.0/exemplified_phylo/README.md diff --git 
a/test/data/tnrs-taxonomy-tree/out1.0/exemplified_phylo/args.txt b/test/tnrs/data/taxonomy-tree/out1.0/exemplified_phylo/args.txt similarity index 100% rename from test/data/tnrs-taxonomy-tree/out1.0/exemplified_phylo/args.txt rename to test/tnrs/data/taxonomy-tree/out1.0/exemplified_phylo/args.txt diff --git a/test/data/tnrs-taxonomy-tree/out1.0/exemplified_phylo/exemplified_log.json b/test/tnrs/data/taxonomy-tree/out1.0/exemplified_phylo/exemplified_log.json similarity index 100% rename from test/data/tnrs-taxonomy-tree/out1.0/exemplified_phylo/exemplified_log.json rename to test/tnrs/data/taxonomy-tree/out1.0/exemplified_phylo/exemplified_log.json diff --git a/test/data/tnrs-taxonomy-tree/out1.0/exemplified_phylo/incertae_sedis.txt b/test/tnrs/data/taxonomy-tree/out1.0/exemplified_phylo/incertae_sedis.txt similarity index 100% rename from test/data/tnrs-taxonomy-tree/out1.0/exemplified_phylo/incertae_sedis.txt rename to test/tnrs/data/taxonomy-tree/out1.0/exemplified_phylo/incertae_sedis.txt diff --git a/test/data/tnrs-taxonomy-tree/out1.0/exemplified_phylo/index.html b/test/tnrs/data/taxonomy-tree/out1.0/exemplified_phylo/index.html similarity index 100% rename from test/data/tnrs-taxonomy-tree/out1.0/exemplified_phylo/index.html rename to test/tnrs/data/taxonomy-tree/out1.0/exemplified_phylo/index.html diff --git a/test/data/tnrs-taxonomy-tree/out1.0/exemplified_phylo/index.json b/test/tnrs/data/taxonomy-tree/out1.0/exemplified_phylo/index.json similarity index 100% rename from test/data/tnrs-taxonomy-tree/out1.0/exemplified_phylo/index.json rename to test/tnrs/data/taxonomy-tree/out1.0/exemplified_phylo/index.json diff --git a/test/data/tnrs-taxonomy-tree/out1.0/exemplified_phylo/nonempty_trees.txt b/test/tnrs/data/taxonomy-tree/out1.0/exemplified_phylo/nonempty_trees.txt similarity index 100% rename from test/data/tnrs-taxonomy-tree/out1.0/exemplified_phylo/nonempty_trees.txt rename to test/tnrs/data/taxonomy-tree/out1.0/exemplified_phylo/nonempty_trees.txt 
diff --git a/test/data/tnrs-taxonomy-tree/out1.0/exemplified_phylo/pruned_for_regraft_cleaned_ott.json b/test/tnrs/data/taxonomy-tree/out1.0/exemplified_phylo/pruned_for_regraft_cleaned_ott.json similarity index 100% rename from test/data/tnrs-taxonomy-tree/out1.0/exemplified_phylo/pruned_for_regraft_cleaned_ott.json rename to test/tnrs/data/taxonomy-tree/out1.0/exemplified_phylo/pruned_for_regraft_cleaned_ott.json diff --git a/test/data/tnrs-taxonomy-tree/out1.0/exemplified_phylo/pruned_taxonomy_degree_distribution.txt b/test/tnrs/data/taxonomy-tree/out1.0/exemplified_phylo/pruned_taxonomy_degree_distribution.txt similarity index 100% rename from test/data/tnrs-taxonomy-tree/out1.0/exemplified_phylo/pruned_taxonomy_degree_distribution.txt rename to test/tnrs/data/taxonomy-tree/out1.0/exemplified_phylo/pruned_taxonomy_degree_distribution.txt diff --git a/test/data/tnrs-taxonomy-tree/out1.0/exemplified_phylo/regraft_cleaned_ott.tre b/test/tnrs/data/taxonomy-tree/out1.0/exemplified_phylo/regraft_cleaned_ott.tre similarity index 100% rename from test/data/tnrs-taxonomy-tree/out1.0/exemplified_phylo/regraft_cleaned_ott.tre rename to test/tnrs/data/taxonomy-tree/out1.0/exemplified_phylo/regraft_cleaned_ott.tre diff --git a/test/data/tnrs-taxonomy-tree/out1.0/exemplified_phylo/taxonomy.tre b/test/tnrs/data/taxonomy-tree/out1.0/exemplified_phylo/taxonomy.tre similarity index 100% rename from test/data/tnrs-taxonomy-tree/out1.0/exemplified_phylo/taxonomy.tre rename to test/tnrs/data/taxonomy-tree/out1.0/exemplified_phylo/taxonomy.tre diff --git a/test/data/tnrs-taxonomy-tree/out1.0/exemplified_phylo/xx_1@1.tre b/test/tnrs/data/taxonomy-tree/out1.0/exemplified_phylo/xx_1@1.tre similarity index 100% rename from test/data/tnrs-taxonomy-tree/out1.0/exemplified_phylo/xx_1@1.tre rename to test/tnrs/data/taxonomy-tree/out1.0/exemplified_phylo/xx_1@1.tre diff --git a/test/data/tnrs-taxonomy-tree/out1.0/exemplified_phylo/xx_2@1.tre 
b/test/tnrs/data/taxonomy-tree/out1.0/exemplified_phylo/xx_2@1.tre similarity index 100% rename from test/data/tnrs-taxonomy-tree/out1.0/exemplified_phylo/xx_2@1.tre rename to test/tnrs/data/taxonomy-tree/out1.0/exemplified_phylo/xx_2@1.tre diff --git a/test/data/tnrs-taxonomy-tree/out1.0/grafted_solution/README.md b/test/tnrs/data/taxonomy-tree/out1.0/grafted_solution/README.md similarity index 100% rename from test/data/tnrs-taxonomy-tree/out1.0/grafted_solution/README.md rename to test/tnrs/data/taxonomy-tree/out1.0/grafted_solution/README.md diff --git a/test/data/tnrs-taxonomy-tree/out1.0/grafted_solution/grafted_solution.tre b/test/tnrs/data/taxonomy-tree/out1.0/grafted_solution/grafted_solution.tre similarity index 100% rename from test/data/tnrs-taxonomy-tree/out1.0/grafted_solution/grafted_solution.tre rename to test/tnrs/data/taxonomy-tree/out1.0/grafted_solution/grafted_solution.tre diff --git a/test/data/tnrs-taxonomy-tree/out1.0/grafted_solution/grafted_solution_ottnames.tre b/test/tnrs/data/taxonomy-tree/out1.0/grafted_solution/grafted_solution_ottnames.tre similarity index 100% rename from test/data/tnrs-taxonomy-tree/out1.0/grafted_solution/grafted_solution_ottnames.tre rename to test/tnrs/data/taxonomy-tree/out1.0/grafted_solution/grafted_solution_ottnames.tre diff --git a/test/data/tnrs-taxonomy-tree/out1.0/grafted_solution/index.html b/test/tnrs/data/taxonomy-tree/out1.0/grafted_solution/index.html similarity index 100% rename from test/data/tnrs-taxonomy-tree/out1.0/grafted_solution/index.html rename to test/tnrs/data/taxonomy-tree/out1.0/grafted_solution/index.html diff --git a/test/data/tnrs-taxonomy-tree/out1.0/grafted_solution/index.json b/test/tnrs/data/taxonomy-tree/out1.0/grafted_solution/index.json similarity index 100% rename from test/data/tnrs-taxonomy-tree/out1.0/grafted_solution/index.json rename to test/tnrs/data/taxonomy-tree/out1.0/grafted_solution/index.json diff --git a/test/data/tnrs-taxonomy-tree/out1.0/index.html 
b/test/tnrs/data/taxonomy-tree/out1.0/index.html similarity index 100% rename from test/data/tnrs-taxonomy-tree/out1.0/index.html rename to test/tnrs/data/taxonomy-tree/out1.0/index.html diff --git a/test/data/tnrs-taxonomy-tree/out1.0/index.json b/test/tnrs/data/taxonomy-tree/out1.0/index.json similarity index 100% rename from test/data/tnrs-taxonomy-tree/out1.0/index.json rename to test/tnrs/data/taxonomy-tree/out1.0/index.json diff --git a/test/data/tnrs-taxonomy-tree/out1.0/labelled_supertree/README.md b/test/tnrs/data/taxonomy-tree/out1.0/labelled_supertree/README.md similarity index 100% rename from test/data/tnrs-taxonomy-tree/out1.0/labelled_supertree/README.md rename to test/tnrs/data/taxonomy-tree/out1.0/labelled_supertree/README.md diff --git a/test/data/tnrs-taxonomy-tree/out1.0/labelled_supertree/broken_taxa.json b/test/tnrs/data/taxonomy-tree/out1.0/labelled_supertree/broken_taxa.json similarity index 100% rename from test/data/tnrs-taxonomy-tree/out1.0/labelled_supertree/broken_taxa.json rename to test/tnrs/data/taxonomy-tree/out1.0/labelled_supertree/broken_taxa.json diff --git a/test/data/tnrs-taxonomy-tree/out1.0/labelled_supertree/index.html b/test/tnrs/data/taxonomy-tree/out1.0/labelled_supertree/index.html similarity index 100% rename from test/data/tnrs-taxonomy-tree/out1.0/labelled_supertree/index.html rename to test/tnrs/data/taxonomy-tree/out1.0/labelled_supertree/index.html diff --git a/test/data/tnrs-taxonomy-tree/out1.0/labelled_supertree/index.json b/test/tnrs/data/taxonomy-tree/out1.0/labelled_supertree/index.json similarity index 100% rename from test/data/tnrs-taxonomy-tree/out1.0/labelled_supertree/index.json rename to test/tnrs/data/taxonomy-tree/out1.0/labelled_supertree/index.json diff --git a/test/data/tnrs-taxonomy-tree/out1.0/labelled_supertree/input_output_stats.json b/test/tnrs/data/taxonomy-tree/out1.0/labelled_supertree/input_output_stats.json similarity index 100% rename from 
test/data/tnrs-taxonomy-tree/out1.0/labelled_supertree/input_output_stats.json rename to test/tnrs/data/taxonomy-tree/out1.0/labelled_supertree/input_output_stats.json diff --git a/test/data/tnrs-taxonomy-tree/out1.0/labelled_supertree/labelled_supertree.tre b/test/tnrs/data/taxonomy-tree/out1.0/labelled_supertree/labelled_supertree.tre similarity index 100% rename from test/data/tnrs-taxonomy-tree/out1.0/labelled_supertree/labelled_supertree.tre rename to test/tnrs/data/taxonomy-tree/out1.0/labelled_supertree/labelled_supertree.tre diff --git a/test/data/tnrs-taxonomy-tree/out1.0/labelled_supertree/labelled_supertree_ottnames.tre b/test/tnrs/data/taxonomy-tree/out1.0/labelled_supertree/labelled_supertree_ottnames.tre similarity index 100% rename from test/data/tnrs-taxonomy-tree/out1.0/labelled_supertree/labelled_supertree_ottnames.tre rename to test/tnrs/data/taxonomy-tree/out1.0/labelled_supertree/labelled_supertree_ottnames.tre diff --git a/test/data/tnrs-taxonomy-tree/out1.0/labelled_supertree/labelled_supertree_ottnames_without_monotypic.tre b/test/tnrs/data/taxonomy-tree/out1.0/labelled_supertree/labelled_supertree_ottnames_without_monotypic.tre similarity index 100% rename from test/data/tnrs-taxonomy-tree/out1.0/labelled_supertree/labelled_supertree_ottnames_without_monotypic.tre rename to test/tnrs/data/taxonomy-tree/out1.0/labelled_supertree/labelled_supertree_ottnames_without_monotypic.tre diff --git a/test/data/tnrs-taxonomy-tree/out1.0/labelled_supertree/labelled_supertree_out_degree_distribution.txt b/test/tnrs/data/taxonomy-tree/out1.0/labelled_supertree/labelled_supertree_out_degree_distribution.txt similarity index 100% rename from test/data/tnrs-taxonomy-tree/out1.0/labelled_supertree/labelled_supertree_out_degree_distribution.txt rename to test/tnrs/data/taxonomy-tree/out1.0/labelled_supertree/labelled_supertree_out_degree_distribution.txt diff --git 
a/test/data/tnrs-taxonomy-tree/out1.0/labelled_supertree/labelled_supertree_simplified_ottnames.tre b/test/tnrs/data/taxonomy-tree/out1.0/labelled_supertree/labelled_supertree_simplified_ottnames.tre similarity index 100% rename from test/data/tnrs-taxonomy-tree/out1.0/labelled_supertree/labelled_supertree_simplified_ottnames.tre rename to test/tnrs/data/taxonomy-tree/out1.0/labelled_supertree/labelled_supertree_simplified_ottnames.tre diff --git a/test/data/tnrs-taxonomy-tree/out1.0/labelled_supertree/labelled_supertree_simplified_ottnames_without_monotypic.tre b/test/tnrs/data/taxonomy-tree/out1.0/labelled_supertree/labelled_supertree_simplified_ottnames_without_monotypic.tre similarity index 100% rename from test/data/tnrs-taxonomy-tree/out1.0/labelled_supertree/labelled_supertree_simplified_ottnames_without_monotypic.tre rename to test/tnrs/data/taxonomy-tree/out1.0/labelled_supertree/labelled_supertree_simplified_ottnames_without_monotypic.tre diff --git a/test/data/tnrs-taxonomy-tree/out1.0/labelled_supertree/simplified_ottnames.log b/test/tnrs/data/taxonomy-tree/out1.0/labelled_supertree/simplified_ottnames.log similarity index 100% rename from test/data/tnrs-taxonomy-tree/out1.0/labelled_supertree/simplified_ottnames.log rename to test/tnrs/data/taxonomy-tree/out1.0/labelled_supertree/simplified_ottnames.log diff --git a/test/data/tnrs-taxonomy-tree/out1.0/labelled_supertree/simplified_ottnames_without_monotypic.log b/test/tnrs/data/taxonomy-tree/out1.0/labelled_supertree/simplified_ottnames_without_monotypic.log similarity index 100% rename from test/data/tnrs-taxonomy-tree/out1.0/labelled_supertree/simplified_ottnames_without_monotypic.log rename to test/tnrs/data/taxonomy-tree/out1.0/labelled_supertree/simplified_ottnames_without_monotypic.log diff --git a/test/data/tnrs-taxonomy-tree/out1.0/phylo_input/README.md b/test/tnrs/data/taxonomy-tree/out1.0/phylo_input/README.md similarity index 100% rename from 
test/data/tnrs-taxonomy-tree/out1.0/phylo_input/README.md rename to test/tnrs/data/taxonomy-tree/out1.0/phylo_input/README.md diff --git a/test/data/tnrs-taxonomy-tree/out1.0/phylo_input/collections.txt b/test/tnrs/data/taxonomy-tree/out1.0/phylo_input/collections.txt similarity index 100% rename from test/data/tnrs-taxonomy-tree/out1.0/phylo_input/collections.txt rename to test/tnrs/data/taxonomy-tree/out1.0/phylo_input/collections.txt diff --git a/test/data/tnrs-taxonomy-tree/out1.0/phylo_input/index.html b/test/tnrs/data/taxonomy-tree/out1.0/phylo_input/index.html similarity index 100% rename from test/data/tnrs-taxonomy-tree/out1.0/phylo_input/index.html rename to test/tnrs/data/taxonomy-tree/out1.0/phylo_input/index.html diff --git a/test/data/tnrs-taxonomy-tree/out1.0/phylo_input/index.json b/test/tnrs/data/taxonomy-tree/out1.0/phylo_input/index.json similarity index 100% rename from test/data/tnrs-taxonomy-tree/out1.0/phylo_input/index.json rename to test/tnrs/data/taxonomy-tree/out1.0/phylo_input/index.json diff --git a/test/data/tnrs-taxonomy-tree/out1.0/phylo_input/rank_collection.json b/test/tnrs/data/taxonomy-tree/out1.0/phylo_input/rank_collection.json similarity index 100% rename from test/data/tnrs-taxonomy-tree/out1.0/phylo_input/rank_collection.json rename to test/tnrs/data/taxonomy-tree/out1.0/phylo_input/rank_collection.json diff --git a/test/data/tnrs-taxonomy-tree/out1.0/phylo_input/study_tree_pairs.txt b/test/tnrs/data/taxonomy-tree/out1.0/phylo_input/study_tree_pairs.txt similarity index 100% rename from test/data/tnrs-taxonomy-tree/out1.0/phylo_input/study_tree_pairs.txt rename to test/tnrs/data/taxonomy-tree/out1.0/phylo_input/study_tree_pairs.txt diff --git a/test/data/tnrs-taxonomy-tree/out1.0/phylo_snapshot/README.md b/test/tnrs/data/taxonomy-tree/out1.0/phylo_snapshot/README.md similarity index 100% rename from test/data/tnrs-taxonomy-tree/out1.0/phylo_snapshot/README.md rename to 
test/tnrs/data/taxonomy-tree/out1.0/phylo_snapshot/README.md diff --git a/test/data/tnrs-taxonomy-tree/out1.0/phylo_snapshot/collections_git_shas.txt b/test/tnrs/data/taxonomy-tree/out1.0/phylo_snapshot/collections_git_shas.txt similarity index 100% rename from test/data/tnrs-taxonomy-tree/out1.0/phylo_snapshot/collections_git_shas.txt rename to test/tnrs/data/taxonomy-tree/out1.0/phylo_snapshot/collections_git_shas.txt diff --git a/test/data/tnrs-taxonomy-tree/out1.0/phylo_snapshot/concrete_rank_collection.json b/test/tnrs/data/taxonomy-tree/out1.0/phylo_snapshot/concrete_rank_collection.json similarity index 100% rename from test/data/tnrs-taxonomy-tree/out1.0/phylo_snapshot/concrete_rank_collection.json rename to test/tnrs/data/taxonomy-tree/out1.0/phylo_snapshot/concrete_rank_collection.json diff --git a/test/data/tnrs-taxonomy-tree/out1.0/phylo_snapshot/git_shas.txt b/test/tnrs/data/taxonomy-tree/out1.0/phylo_snapshot/git_shas.txt similarity index 100% rename from test/data/tnrs-taxonomy-tree/out1.0/phylo_snapshot/git_shas.txt rename to test/tnrs/data/taxonomy-tree/out1.0/phylo_snapshot/git_shas.txt diff --git a/test/data/tnrs-taxonomy-tree/out1.0/phylo_snapshot/index.html b/test/tnrs/data/taxonomy-tree/out1.0/phylo_snapshot/index.html similarity index 100% rename from test/data/tnrs-taxonomy-tree/out1.0/phylo_snapshot/index.html rename to test/tnrs/data/taxonomy-tree/out1.0/phylo_snapshot/index.html diff --git a/test/data/tnrs-taxonomy-tree/out1.0/phylo_snapshot/index.json b/test/tnrs/data/taxonomy-tree/out1.0/phylo_snapshot/index.json similarity index 100% rename from test/data/tnrs-taxonomy-tree/out1.0/phylo_snapshot/index.json rename to test/tnrs/data/taxonomy-tree/out1.0/phylo_snapshot/index.json diff --git a/test/data/tnrs-taxonomy-tree/out1.0/phylo_snapshot/xx_1@1.json b/test/tnrs/data/taxonomy-tree/out1.0/phylo_snapshot/xx_1@1.json similarity index 100% rename from test/data/tnrs-taxonomy-tree/out1.0/phylo_snapshot/xx_1@1.json rename to 
test/tnrs/data/taxonomy-tree/out1.0/phylo_snapshot/xx_1@1.json diff --git a/test/data/tnrs-taxonomy-tree/out1.0/phylo_snapshot/xx_2@1.json b/test/tnrs/data/taxonomy-tree/out1.0/phylo_snapshot/xx_2@1.json similarity index 100% rename from test/data/tnrs-taxonomy-tree/out1.0/phylo_snapshot/xx_2@1.json rename to test/tnrs/data/taxonomy-tree/out1.0/phylo_snapshot/xx_2@1.json diff --git a/test/data/tnrs-taxonomy-tree/out1.0/subproblem_solutions/README.md b/test/tnrs/data/taxonomy-tree/out1.0/subproblem_solutions/README.md similarity index 100% rename from test/data/tnrs-taxonomy-tree/out1.0/subproblem_solutions/README.md rename to test/tnrs/data/taxonomy-tree/out1.0/subproblem_solutions/README.md diff --git a/test/data/tnrs-taxonomy-tree/out1.0/subproblem_solutions/index.html b/test/tnrs/data/taxonomy-tree/out1.0/subproblem_solutions/index.html similarity index 100% rename from test/data/tnrs-taxonomy-tree/out1.0/subproblem_solutions/index.html rename to test/tnrs/data/taxonomy-tree/out1.0/subproblem_solutions/index.html diff --git a/test/data/tnrs-taxonomy-tree/out1.0/subproblem_solutions/index.json b/test/tnrs/data/taxonomy-tree/out1.0/subproblem_solutions/index.json similarity index 100% rename from test/data/tnrs-taxonomy-tree/out1.0/subproblem_solutions/index.json rename to test/tnrs/data/taxonomy-tree/out1.0/subproblem_solutions/index.json diff --git a/test/data/tnrs-taxonomy-tree/out1.0/subproblem_solutions/ott5268475.tre b/test/tnrs/data/taxonomy-tree/out1.0/subproblem_solutions/ott5268475.tre similarity index 100% rename from test/data/tnrs-taxonomy-tree/out1.0/subproblem_solutions/ott5268475.tre rename to test/tnrs/data/taxonomy-tree/out1.0/subproblem_solutions/ott5268475.tre diff --git a/test/data/tnrs-taxonomy-tree/out1.0/subproblem_solutions/ott805080.tre b/test/tnrs/data/taxonomy-tree/out1.0/subproblem_solutions/ott805080.tre similarity index 100% rename from test/data/tnrs-taxonomy-tree/out1.0/subproblem_solutions/ott805080.tre rename to 
test/tnrs/data/taxonomy-tree/out1.0/subproblem_solutions/ott805080.tre diff --git a/test/data/tnrs-taxonomy-tree/out1.0/subproblem_solutions/solution-degree-distributions.txt b/test/tnrs/data/taxonomy-tree/out1.0/subproblem_solutions/solution-degree-distributions.txt similarity index 100% rename from test/data/tnrs-taxonomy-tree/out1.0/subproblem_solutions/solution-degree-distributions.txt rename to test/tnrs/data/taxonomy-tree/out1.0/subproblem_solutions/solution-degree-distributions.txt diff --git a/test/data/tnrs-taxonomy-tree/out1.0/subproblem_solutions/solution-ids.txt b/test/tnrs/data/taxonomy-tree/out1.0/subproblem_solutions/solution-ids.txt similarity index 100% rename from test/data/tnrs-taxonomy-tree/out1.0/subproblem_solutions/solution-ids.txt rename to test/tnrs/data/taxonomy-tree/out1.0/subproblem_solutions/solution-ids.txt diff --git a/test/data/tnrs-taxonomy-tree/out1.0/subproblems/README.md b/test/tnrs/data/taxonomy-tree/out1.0/subproblems/README.md similarity index 100% rename from test/data/tnrs-taxonomy-tree/out1.0/subproblems/README.md rename to test/tnrs/data/taxonomy-tree/out1.0/subproblems/README.md diff --git a/test/data/tnrs-taxonomy-tree/out1.0/subproblems/checksummed-subproblem-ids.txt b/test/tnrs/data/taxonomy-tree/out1.0/subproblems/checksummed-subproblem-ids.txt similarity index 100% rename from test/data/tnrs-taxonomy-tree/out1.0/subproblems/checksummed-subproblem-ids.txt rename to test/tnrs/data/taxonomy-tree/out1.0/subproblems/checksummed-subproblem-ids.txt diff --git a/test/data/tnrs-taxonomy-tree/out1.0/subproblems/contesting-trees.json b/test/tnrs/data/taxonomy-tree/out1.0/subproblems/contesting-trees.json similarity index 100% rename from test/data/tnrs-taxonomy-tree/out1.0/subproblems/contesting-trees.json rename to test/tnrs/data/taxonomy-tree/out1.0/subproblems/contesting-trees.json diff --git a/test/data/tnrs-taxonomy-tree/out1.0/subproblems/dumped-subproblem-ids.txt 
b/test/tnrs/data/taxonomy-tree/out1.0/subproblems/dumped-subproblem-ids.txt similarity index 100% rename from test/data/tnrs-taxonomy-tree/out1.0/subproblems/dumped-subproblem-ids.txt rename to test/tnrs/data/taxonomy-tree/out1.0/subproblems/dumped-subproblem-ids.txt diff --git a/test/data/tnrs-taxonomy-tree/out1.0/subproblems/index.html b/test/tnrs/data/taxonomy-tree/out1.0/subproblems/index.html similarity index 100% rename from test/data/tnrs-taxonomy-tree/out1.0/subproblems/index.html rename to test/tnrs/data/taxonomy-tree/out1.0/subproblems/index.html diff --git a/test/data/tnrs-taxonomy-tree/out1.0/subproblems/index.json b/test/tnrs/data/taxonomy-tree/out1.0/subproblems/index.json similarity index 100% rename from test/data/tnrs-taxonomy-tree/out1.0/subproblems/index.json rename to test/tnrs/data/taxonomy-tree/out1.0/subproblems/index.json diff --git a/test/data/tnrs-taxonomy-tree/out1.0/subproblems/ott5268475-tree-names.txt b/test/tnrs/data/taxonomy-tree/out1.0/subproblems/ott5268475-tree-names.txt similarity index 100% rename from test/data/tnrs-taxonomy-tree/out1.0/subproblems/ott5268475-tree-names.txt rename to test/tnrs/data/taxonomy-tree/out1.0/subproblems/ott5268475-tree-names.txt diff --git a/test/data/tnrs-taxonomy-tree/out1.0/subproblems/ott5268475.md5 b/test/tnrs/data/taxonomy-tree/out1.0/subproblems/ott5268475.md5 similarity index 100% rename from test/data/tnrs-taxonomy-tree/out1.0/subproblems/ott5268475.md5 rename to test/tnrs/data/taxonomy-tree/out1.0/subproblems/ott5268475.md5 diff --git a/test/data/tnrs-taxonomy-tree/out1.0/subproblems/ott5268475.tre b/test/tnrs/data/taxonomy-tree/out1.0/subproblems/ott5268475.tre similarity index 100% rename from test/data/tnrs-taxonomy-tree/out1.0/subproblems/ott5268475.tre rename to test/tnrs/data/taxonomy-tree/out1.0/subproblems/ott5268475.tre diff --git a/test/data/tnrs-taxonomy-tree/out1.0/subproblems/ott805080-tree-names.txt b/test/tnrs/data/taxonomy-tree/out1.0/subproblems/ott805080-tree-names.txt 
similarity index 100% rename from test/data/tnrs-taxonomy-tree/out1.0/subproblems/ott805080-tree-names.txt rename to test/tnrs/data/taxonomy-tree/out1.0/subproblems/ott805080-tree-names.txt diff --git a/test/data/tnrs-taxonomy-tree/out1.0/subproblems/ott805080.md5 b/test/tnrs/data/taxonomy-tree/out1.0/subproblems/ott805080.md5 similarity index 100% rename from test/data/tnrs-taxonomy-tree/out1.0/subproblems/ott805080.md5 rename to test/tnrs/data/taxonomy-tree/out1.0/subproblems/ott805080.md5 diff --git a/test/data/tnrs-taxonomy-tree/out1.0/subproblems/ott805080.tre b/test/tnrs/data/taxonomy-tree/out1.0/subproblems/ott805080.tre similarity index 100% rename from test/data/tnrs-taxonomy-tree/out1.0/subproblems/ott805080.tre rename to test/tnrs/data/taxonomy-tree/out1.0/subproblems/ott805080.tre diff --git a/test/data/tnrs-taxonomy-tree/out1.0/subproblems/scratch/README.md b/test/tnrs/data/taxonomy-tree/out1.0/subproblems/scratch/README.md similarity index 100% rename from test/data/tnrs-taxonomy-tree/out1.0/subproblems/scratch/README.md rename to test/tnrs/data/taxonomy-tree/out1.0/subproblems/scratch/README.md diff --git a/test/data/tnrs-taxonomy-tree/out1.0/subproblems/scratch/args.txt b/test/tnrs/data/taxonomy-tree/out1.0/subproblems/scratch/args.txt similarity index 100% rename from test/data/tnrs-taxonomy-tree/out1.0/subproblems/scratch/args.txt rename to test/tnrs/data/taxonomy-tree/out1.0/subproblems/scratch/args.txt diff --git a/test/data/tnrs-taxonomy-tree/out1.0/subproblems/scratch/ott5268475-tree-names.txt b/test/tnrs/data/taxonomy-tree/out1.0/subproblems/scratch/ott5268475-tree-names.txt similarity index 100% rename from test/data/tnrs-taxonomy-tree/out1.0/subproblems/scratch/ott5268475-tree-names.txt rename to test/tnrs/data/taxonomy-tree/out1.0/subproblems/scratch/ott5268475-tree-names.txt diff --git a/test/data/tnrs-taxonomy-tree/out1.0/subproblems/scratch/ott5268475.md5 b/test/tnrs/data/taxonomy-tree/out1.0/subproblems/scratch/ott5268475.md5 
similarity index 100% rename from test/data/tnrs-taxonomy-tree/out1.0/subproblems/scratch/ott5268475.md5 rename to test/tnrs/data/taxonomy-tree/out1.0/subproblems/scratch/ott5268475.md5 diff --git a/test/data/tnrs-taxonomy-tree/out1.0/subproblems/scratch/ott5268475.tre b/test/tnrs/data/taxonomy-tree/out1.0/subproblems/scratch/ott5268475.tre similarity index 100% rename from test/data/tnrs-taxonomy-tree/out1.0/subproblems/scratch/ott5268475.tre rename to test/tnrs/data/taxonomy-tree/out1.0/subproblems/scratch/ott5268475.tre diff --git a/test/data/tnrs-taxonomy-tree/out1.0/subproblems/scratch/ott805080-tree-names.txt b/test/tnrs/data/taxonomy-tree/out1.0/subproblems/scratch/ott805080-tree-names.txt similarity index 100% rename from test/data/tnrs-taxonomy-tree/out1.0/subproblems/scratch/ott805080-tree-names.txt rename to test/tnrs/data/taxonomy-tree/out1.0/subproblems/scratch/ott805080-tree-names.txt diff --git a/test/data/tnrs-taxonomy-tree/out1.0/subproblems/scratch/ott805080.md5 b/test/tnrs/data/taxonomy-tree/out1.0/subproblems/scratch/ott805080.md5 similarity index 100% rename from test/data/tnrs-taxonomy-tree/out1.0/subproblems/scratch/ott805080.md5 rename to test/tnrs/data/taxonomy-tree/out1.0/subproblems/scratch/ott805080.md5 diff --git a/test/data/tnrs-taxonomy-tree/out1.0/subproblems/scratch/ott805080.tre b/test/tnrs/data/taxonomy-tree/out1.0/subproblems/scratch/ott805080.tre similarity index 100% rename from test/data/tnrs-taxonomy-tree/out1.0/subproblems/scratch/ott805080.tre rename to test/tnrs/data/taxonomy-tree/out1.0/subproblems/scratch/ott805080.tre diff --git a/test/data/tnrs-taxonomy-tree/out1.0/subproblems/subproblem-ids.txt b/test/tnrs/data/taxonomy-tree/out1.0/subproblems/subproblem-ids.txt similarity index 100% rename from test/data/tnrs-taxonomy-tree/out1.0/subproblems/subproblem-ids.txt rename to test/tnrs/data/taxonomy-tree/out1.0/subproblems/subproblem-ids.txt diff --git a/test/expectedws-induced/.gitignore 
b/test/tnrs/expectedws-induced/.gitignore similarity index 100% rename from test/expectedws-induced/.gitignore rename to test/tnrs/expectedws-induced/.gitignore diff --git a/test/expectedws-induced/autocomplete_name/1/expected.json b/test/tnrs/expectedws-induced/autocomplete_name/1/expected.json similarity index 100% rename from test/expectedws-induced/autocomplete_name/1/expected.json rename to test/tnrs/expectedws-induced/autocomplete_name/1/expected.json diff --git a/test/expectedws-induced/autocomplete_name/1/method.json b/test/tnrs/expectedws-induced/autocomplete_name/1/method.json similarity index 100% rename from test/expectedws-induced/autocomplete_name/1/method.json rename to test/tnrs/expectedws-induced/autocomplete_name/1/method.json diff --git a/test/expectedws-induced/autocomplete_name/2/expected.json b/test/tnrs/expectedws-induced/autocomplete_name/2/expected.json similarity index 100% rename from test/expectedws-induced/autocomplete_name/2/expected.json rename to test/tnrs/expectedws-induced/autocomplete_name/2/expected.json diff --git a/test/expectedws-induced/autocomplete_name/2/method.json b/test/tnrs/expectedws-induced/autocomplete_name/2/method.json similarity index 100% rename from test/expectedws-induced/autocomplete_name/2/method.json rename to test/tnrs/expectedws-induced/autocomplete_name/2/method.json diff --git a/test/expectedws-induced/autocomplete_name/fuzzy/1/expected.json b/test/tnrs/expectedws-induced/autocomplete_name/fuzzy/1/expected.json similarity index 100% rename from test/expectedws-induced/autocomplete_name/fuzzy/1/expected.json rename to test/tnrs/expectedws-induced/autocomplete_name/fuzzy/1/expected.json diff --git a/test/expectedws-induced/autocomplete_name/fuzzy/1/method.json b/test/tnrs/expectedws-induced/autocomplete_name/fuzzy/1/method.json similarity index 100% rename from test/expectedws-induced/autocomplete_name/fuzzy/1/method.json rename to test/tnrs/expectedws-induced/autocomplete_name/fuzzy/1/method.json diff 
--git a/test/expectedws-induced/contexts/expected.json b/test/tnrs/expectedws-induced/contexts/expected.json similarity index 100% rename from test/expectedws-induced/contexts/expected.json rename to test/tnrs/expectedws-induced/contexts/expected.json diff --git a/test/expectedws-induced/contexts/method.json b/test/tnrs/expectedws-induced/contexts/method.json similarity index 100% rename from test/expectedws-induced/contexts/method.json rename to test/tnrs/expectedws-induced/contexts/method.json diff --git a/test/expectedws-induced/infer_contexts/ambig/expected.json b/test/tnrs/expectedws-induced/infer_contexts/ambig/expected.json similarity index 100% rename from test/expectedws-induced/infer_contexts/ambig/expected.json rename to test/tnrs/expectedws-induced/infer_contexts/ambig/expected.json diff --git a/test/expectedws-induced/infer_contexts/ambig/method.json b/test/tnrs/expectedws-induced/infer_contexts/ambig/method.json similarity index 100% rename from test/expectedws-induced/infer_contexts/ambig/method.json rename to test/tnrs/expectedws-induced/infer_contexts/ambig/method.json diff --git a/test/expectedws-induced/infer_contexts/ambig_plant/expected.json b/test/tnrs/expectedws-induced/infer_contexts/ambig_plant/expected.json similarity index 100% rename from test/expectedws-induced/infer_contexts/ambig_plant/expected.json rename to test/tnrs/expectedws-induced/infer_contexts/ambig_plant/expected.json diff --git a/test/expectedws-induced/infer_contexts/ambig_plant/method.json b/test/tnrs/expectedws-induced/infer_contexts/ambig_plant/method.json similarity index 100% rename from test/expectedws-induced/infer_contexts/ambig_plant/method.json rename to test/tnrs/expectedws-induced/infer_contexts/ambig_plant/method.json diff --git a/test/expectedws-induced/infer_contexts/animals/expected.json b/test/tnrs/expectedws-induced/infer_contexts/animals/expected.json similarity index 100% rename from test/expectedws-induced/infer_contexts/animals/expected.json rename to 
test/tnrs/expectedws-induced/infer_contexts/animals/expected.json diff --git a/test/expectedws-induced/infer_contexts/animals/method.json b/test/tnrs/expectedws-induced/infer_contexts/animals/method.json similarity index 100% rename from test/expectedws-induced/infer_contexts/animals/method.json rename to test/tnrs/expectedws-induced/infer_contexts/animals/method.json diff --git a/test/expectedws-induced/infer_contexts/life/expected.json b/test/tnrs/expectedws-induced/infer_contexts/life/expected.json similarity index 100% rename from test/expectedws-induced/infer_contexts/life/expected.json rename to test/tnrs/expectedws-induced/infer_contexts/life/expected.json diff --git a/test/expectedws-induced/infer_contexts/life/method.json b/test/tnrs/expectedws-induced/infer_contexts/life/method.json similarity index 100% rename from test/expectedws-induced/infer_contexts/life/method.json rename to test/tnrs/expectedws-induced/infer_contexts/life/method.json diff --git a/test/expectedws-induced/match_names/fuzzy/expected.json b/test/tnrs/expectedws-induced/match_names/fuzzy/expected.json similarity index 100% rename from test/expectedws-induced/match_names/fuzzy/expected.json rename to test/tnrs/expectedws-induced/match_names/fuzzy/expected.json diff --git a/test/expectedws-induced/match_names/fuzzy/method.json b/test/tnrs/expectedws-induced/match_names/fuzzy/method.json similarity index 100% rename from test/expectedws-induced/match_names/fuzzy/method.json rename to test/tnrs/expectedws-induced/match_names/fuzzy/method.json diff --git a/test/expectedws-induced/match_names/fuzzy_contexts/expected.json b/test/tnrs/expectedws-induced/match_names/fuzzy_contexts/expected.json similarity index 100% rename from test/expectedws-induced/match_names/fuzzy_contexts/expected.json rename to test/tnrs/expectedws-induced/match_names/fuzzy_contexts/expected.json diff --git a/test/expectedws-induced/match_names/fuzzy_contexts/method.json 
b/test/tnrs/expectedws-induced/match_names/fuzzy_contexts/method.json similarity index 100% rename from test/expectedws-induced/match_names/fuzzy_contexts/method.json rename to test/tnrs/expectedws-induced/match_names/fuzzy_contexts/method.json diff --git a/ws/meson.build b/ws/meson.build index bbf7c75a..c4547a8c 100644 --- a/ws/meson.build +++ b/ws/meson.build @@ -49,9 +49,9 @@ test('web services test (2)', '--secs-to-recheck-pid-file=30'] ) -tax_dir = join_paths(meson.source_root(),'test/data/tnrs-taxonomy-induced') -synth_dir = join_paths(meson.source_root(),'test/data/tnrs-taxonomy-tree') -expectedws_dir = join_paths(meson.source_root(),'test/expectedws-induced') +tax_dir = join_paths(meson.source_root(),'test/tnrs/data/taxonomy-induced') +synth_dir = join_paths(meson.source_root(),'test/tnrs/data/taxonomy-tree') +expectedws_dir = join_paths(meson.source_root(),'test/tnrs/expectedws-induced') test('tnrs web services test', test_web_services, timeout: 300, From fba6b89f92c3fa785f87400895d375c98d1f1998 Mon Sep 17 00:00:00 2001 From: Benjamin Redelings Date: Wed, 8 Apr 2020 17:04:43 -0700 Subject: [PATCH 040/620] Skip trees in synth parent dir with the wrong taxonomy_version. 
--- .../annotated_supertree/annotations.json | 2 +- expectedws/about/expected.json | 2 +- ws/tolwsbooting.cpp | 31 +++++++++++++------ 3 files changed, 23 insertions(+), 12 deletions(-) diff --git a/data/ex-synth-par/ex-synth-1/annotated_supertree/annotations.json b/data/ex-synth-par/ex-synth-1/annotated_supertree/annotations.json index 6ce4af2e..ba70d187 100644 --- a/data/ex-synth-par/ex-synth-1/annotated_supertree/annotations.json +++ b/data/ex-synth-par/ex-synth-1/annotated_supertree/annotations.json @@ -117,6 +117,6 @@ "xx_1@t2" ], "synth_id": "", - "taxonomy_version": "exampletaxonomy", + "taxonomy_version": "0.0draft0", "tree_id": "" } diff --git a/expectedws/about/expected.json b/expectedws/about/expected.json index 065c6a33..2c7a9ac3 100644 --- a/expectedws/about/expected.json +++ b/expectedws/about/expected.json @@ -38,5 +38,5 @@ } }, "synth_id": "", - "taxonomy_version": "exampletaxonomy" + "taxonomy_version": "0.0draft0" } \ No newline at end of file diff --git a/ws/tolwsbooting.cpp b/ws/tolwsbooting.cpp index ee0deff6..b8f489b2 100644 --- a/ws/tolwsbooting.cpp +++ b/ws/tolwsbooting.cpp @@ -1163,14 +1163,8 @@ bool read_tree_and_annotations(const fs::path & config_path, const fs::path & contestingtrees_path, TreesToServe & tts) { - std::ifstream contestingtrees_stream(contestingtrees_path.native().c_str()); - json contestingtrees_obj; - try { - contestingtrees_stream >> contestingtrees_obj; - } catch (...) { - LOG(WARNING) << "Could not read \"" << contestingtrees_path << "\" as JSON.\n"; - throw; - } + auto locked_taxonomy = tts.get_readable_taxonomy(); + const auto & taxonomy = locked_taxonomy.first; std::string annot_str = annotations_path.native(); std::ifstream annotations_stream(annot_str.c_str()); @@ -1181,6 +1175,25 @@ bool read_tree_and_annotations(const fs::path & config_path, LOG(WARNING) << "Could not read \"" << annotations_path << "\" as JSON.\n"; throw; } + + // Check that the tree was built against the correct taxonomy. 
+ string tree_tax_version = annotations_obj["taxonomy_version"]; + string synth_id = annotations_obj["synth_id"]; + if (tree_tax_version != taxonomy.get_version()) + { + LOG(WARNING) << "Read \"" << annotations_path << "\" as JSON.\n"; + throw OTCError()<<"Tree with does not match taxonomy version '"<> contestingtrees_obj; + } catch (...) { + LOG(WARNING) << "Could not read \"" << contestingtrees_path << "\" as JSON.\n"; + throw; + } + std::string bt_str = brokentaxa_path.native(); std::ifstream brokentaxa_stream(bt_str.c_str()); json brokentaxa_obj; @@ -1190,8 +1203,6 @@ bool read_tree_and_annotations(const fs::path & config_path, LOG(WARNING) << "Could not read \"" << brokentaxa_path << "\" as JSON.\n"; throw; } - auto locked_taxonomy = tts.get_readable_taxonomy(); - const auto & taxonomy = locked_taxonomy.first; # if defined(REPORT_MEMORY_USAGE) MemoryBookkeeper tax_mem_b; std::size_t tree_mem = 0; From e4e96b24dee2af81149dedaaf559e0c9ead04a40 Mon Sep 17 00:00:00 2001 From: Benjamin Redelings Date: Wed, 15 Apr 2020 11:09:52 -0700 Subject: [PATCH 041/620] Add exact_query( ) to class CompressedTrieBasedDB. Currently this returns results for a fuzzy match. We can change that later. 
--- otc/ctrie/ctrie_db.cpp | 15 +++++++++++++++ otc/ctrie/ctrie_db.h | 1 + 2 files changed, 16 insertions(+) diff --git a/otc/ctrie/ctrie_db.cpp b/otc/ctrie/ctrie_db.cpp index d944cd46..dd982b04 100644 --- a/otc/ctrie/ctrie_db.cpp +++ b/otc/ctrie/ctrie_db.cpp @@ -27,6 +27,21 @@ std::set CompressedTrieBasedDB::fuzzy_ return sorted; } +std::set CompressedTrieBasedDB::exact_query(const std::string & query_str) const +{ + auto conv_query = to_u32string(query_str); + + std::set sorted; + + auto from_thin = thin_trie.fuzzy_matches(conv_query, 0); + sorted.insert(std::begin(from_thin), std::end(from_thin)); + + auto from_full = wide_trie.fuzzy_matches(conv_query, 0); + sorted.insert(std::begin(from_full), std::end(from_full)); + + return sorted; +} + void CompressedTrieBasedDB::initialize(const std::set & keys) { ctrie_init_set_t for_wide; diff --git a/otc/ctrie/ctrie_db.h b/otc/ctrie/ctrie_db.h index 18499bb4..1aeef7ce 100644 --- a/otc/ctrie/ctrie_db.h +++ b/otc/ctrie/ctrie_db.h @@ -13,6 +13,7 @@ class CompressedTrieBasedDB { public: void initialize(const std::set & keys); std::set fuzzy_query(const std::string & query_str) const; + std::set exact_query(const std::string & query_str) const; private: CTrie3_t wide_trie; CTrie2_t thin_trie; From f2bc73e99b7ac2f16147da38519164847f8b198e Mon Sep 17 00:00:00 2001 From: Benjamin Redelings Date: Wed, 15 Apr 2020 11:10:36 -0700 Subject: [PATCH 042/620] Add ContextAwareCTrieBasedDB::exact_query( ). 
--- otc/ctrie/context_ctrie_db.cpp | 18 ++++++++++++++++++ otc/ctrie/context_ctrie_db.h | 2 ++ 2 files changed, 20 insertions(+) diff --git a/otc/ctrie/context_ctrie_db.cpp b/otc/ctrie/context_ctrie_db.cpp index 72638c45..850c78a9 100644 --- a/otc/ctrie/context_ctrie_db.cpp +++ b/otc/ctrie/context_ctrie_db.cpp @@ -4,6 +4,10 @@ #include "otc/taxonomy/taxonomy.h" #include "otc/taxonomy/flags.h" +using std::set; +using std::string; +using std::vector; + namespace otc { ContextAwareCTrieBasedDB::ContextAwareCTrieBasedDB(const Context &context_arg, @@ -79,6 +83,20 @@ std::set ContextAwareCTrieBasedDB::fuz return sorted; } +std::set ContextAwareCTrieBasedDB::exact_query(const std::string & query_str) const { + std::set sorted; + if (context.name_matcher != nullptr) { + sorted = context.name_matcher->exact_query(query_str); + } + for (auto c :children) { + if (c->context.name_matcher) { + auto csorted = c->context.name_matcher->exact_query(query_str); + sorted.insert(std::begin(csorted), std::end(csorted)); + } + } + return sorted; +} + using vec_fqr_w_t = std::vector; vec_fqr_w_t ContextAwareCTrieBasedDB::fuzzy_query_to_taxa(const std::string & query_str, const RTRichTaxNode * context_root, diff --git a/otc/ctrie/context_ctrie_db.h b/otc/ctrie/context_ctrie_db.h index f60a41cd..5666d51a 100644 --- a/otc/ctrie/context_ctrie_db.h +++ b/otc/ctrie/context_ctrie_db.h @@ -16,6 +16,8 @@ class ContextAwareCTrieBasedDB { ContextAwareCTrieBasedDB(const Context &, const RichTaxonomy &); ContextAwareCTrieBasedDB(const Context &, const RichTaxonomy &, const std::set & keys); std::set fuzzy_query(const std::string & query_str) const; + std::set exact_query(const std::string & query_str) const; + std::vector fuzzy_query_to_taxa(const std::string & query_str, const RTRichTaxNode * context_root, const RichTaxonomy & taxonomy, From 74c2987ae9c74cb024202045cd923b5defbcbd7f Mon Sep 17 00:00:00 2001 From: Benjamin Redelings Date: Wed, 15 Apr 2020 11:11:10 -0700 Subject: [PATCH 043/620] 
Factor ContextAwareCTrieBaseDB::to_taxa( ) out of fuzzy_query_to_taxa( ). --- otc/ctrie/context_ctrie_db.cpp | 20 ++++++++++++++------ otc/ctrie/context_ctrie_db.h | 6 ++++++ 2 files changed, 20 insertions(+), 6 deletions(-) diff --git a/otc/ctrie/context_ctrie_db.cpp b/otc/ctrie/context_ctrie_db.cpp index 850c78a9..17afd9b4 100644 --- a/otc/ctrie/context_ctrie_db.cpp +++ b/otc/ctrie/context_ctrie_db.cpp @@ -98,16 +98,16 @@ std::set ContextAwareCTrieBasedDB::exa } using vec_fqr_w_t = std::vector; -vec_fqr_w_t ContextAwareCTrieBasedDB::fuzzy_query_to_taxa(const std::string & query_str, - const RTRichTaxNode * context_root, - const RichTaxonomy & /*taxonomy*/, - bool include_suppressed) const { - LOG(DEBUG) << "fuzzy_query_to_taxa(" << query_str << ", context_id = " << context_root->get_ott_id() << ", ... , included_suppressed =" << include_suppressed << ")"; +vec_fqr_w_t ContextAwareCTrieBasedDB::to_taxa(const set& sorted, + const RTRichTaxNode * context_root, + const RichTaxonomy & /*taxonomy*/, + bool include_suppressed) const { + LOG(DEBUG) << "to_taxa(context_id = " << context_root->get_ott_id() << ", ... , included_suppressed =" << include_suppressed << ")"; vec_fqr_w_t results; const auto & tax_data = context_root->get_data(); const auto filter_trav_enter = tax_data.trav_enter; const auto filter_trav_exit = tax_data.trav_exit; - const std::set sorted = fuzzy_query(query_str); + if (sorted.empty()) { LOG(DEBUG) << "no matches"; } @@ -141,4 +141,12 @@ vec_fqr_w_t ContextAwareCTrieBasedDB::fuzzy_query_to_taxa(const std::string & qu return results; } +vec_fqr_w_t ContextAwareCTrieBasedDB::fuzzy_query_to_taxa(const std::string & query_str, + const RTRichTaxNode * context_root, + const RichTaxonomy & taxonomy, + bool include_suppressed) const { + LOG(DEBUG) << "fuzzy_query_to_taxa(" << query_str << ", context_id = " << context_root->get_ott_id() << ", ... 
, included_suppressed =" << include_suppressed << ")"; + return to_taxa(fuzzy_query(query_str), context_root, taxonomy, include_suppressed); +} + } // namespace otc diff --git a/otc/ctrie/context_ctrie_db.h b/otc/ctrie/context_ctrie_db.h index 5666d51a..c42dca12 100644 --- a/otc/ctrie/context_ctrie_db.h +++ b/otc/ctrie/context_ctrie_db.h @@ -22,6 +22,12 @@ class ContextAwareCTrieBasedDB { const RTRichTaxNode * context_root, const RichTaxonomy & taxonomy, bool include_suppressed) const; + + std::vector to_taxa(const std::set& sorted_results, + const RTRichTaxNode * context_root, + const RichTaxonomy & taxonomy, + bool include_suppressed) const; + private: const Context & context; std::vector children; From 77284c4744ff2bc10a8ccaa04d2b9529301196b9 Mon Sep 17 00:00:00 2001 From: Benjamin Redelings Date: Wed, 15 Apr 2020 11:40:02 -0700 Subject: [PATCH 044/620] Add RichTaxonomy argument to lower-level exact_name_search( ) function. --- otc/taxonomy/taxonomy.cpp | 31 +++++++++++++++++++++++++++++-- otc/taxonomy/taxonomy.h | 29 ++++------------------------- otc/ws/tnrsws.cpp | 6 +++--- 3 files changed, 36 insertions(+), 30 deletions(-) diff --git a/otc/taxonomy/taxonomy.cpp b/otc/taxonomy/taxonomy.cpp index 7bfdde68..52e2fe96 100644 --- a/otc/taxonomy/taxonomy.cpp +++ b/otc/taxonomy/taxonomy.cpp @@ -820,10 +820,37 @@ const RTRichTaxNode* taxonomy_mrca(const std::vector& node return focal; } +// FIXME: move this out of here -vector exact_name_search(const RTRichTaxNode* context_root, +std::vector exact_name_search(const RichTaxonomy& taxonomy, + const RTRichTaxNode* context_root, + const std::string& query, + bool include_suppressed) +{ + if (include_suppressed) { + return exact_name_search(taxonomy, context_root, query); + } + std::function ok = [&](const RTRichTaxNode* taxon) { + return not taxonomy.node_is_suppressed_from_tnrs(taxon); + }; + return exact_name_search(taxonomy, context_root, query, ok); +} + + +std::vector exact_name_search(const RichTaxonomy& taxonomy, + 
const std::string& query, + bool include_suppressed) +{ + const RTRichTaxNode* context_root = taxonomy.get_tax_tree().get_root(); + return exact_name_search(taxonomy, context_root, query, include_suppressed); +} + + +vector exact_name_search(const RichTaxonomy& taxonomy, + const RTRichTaxNode* context_root, const std::string& query_ref, - std::function ok) { + std::function ok) +{ std::string query{query_ref}; for (auto& c: query) { c = std::tolower(c); diff --git a/otc/taxonomy/taxonomy.h b/otc/taxonomy/taxonomy.h index 0154db07..9f0e0dd6 100644 --- a/otc/taxonomy/taxonomy.h +++ b/otc/taxonomy/taxonomy.h @@ -572,36 +572,15 @@ std::vector exact_name_search(const RichTaxonomy& taxonomy const std::string& query, bool include_suppressed); -std::vector exact_name_search(const RTRichTaxNode* context_root, - const std::string& query, - std::function ok = [](const RTRichTaxNode*){return true;}); - std::vector exact_name_search(const RichTaxonomy& taxonomy, const RTRichTaxNode* context_root, const std::string& query, bool include_suppressed); - -inline std::vector exact_name_search(const RichTaxonomy& taxonomy, - const RTRichTaxNode* context_root, - const std::string& query, - bool include_suppressed) { - if (include_suppressed) { - return exact_name_search(context_root, query); - } - std::function ok = [&](const RTRichTaxNode* taxon) { - return not taxonomy.node_is_suppressed_from_tnrs(taxon); - }; - return exact_name_search(context_root, query, ok); -} - - -inline std::vector exact_name_search(const RichTaxonomy& taxonomy, - const std::string& query, - bool include_suppressed) { - const RTRichTaxNode* context_root = taxonomy.get_tax_tree().get_root(); - return exact_name_search(taxonomy, context_root, query, include_suppressed); -} +std::vector exact_name_search(const RichTaxonomy& taxonomy, + const RTRichTaxNode* context_root, + const std::string& query, + std::function ok = [](const RTRichTaxNode*){return true;}); template diff --git a/otc/ws/tnrsws.cpp 
b/otc/ws/tnrsws.cpp index 15365058..ab4e9a0d 100644 --- a/otc/ws/tnrsws.cpp +++ b/otc/ws/tnrsws.cpp @@ -140,7 +140,7 @@ vector exact_name_search_species(const RichTaxonomy& taxonomy, } return taxon_is_specific(taxon); }; - return exact_name_search(context_root, query, ok); + return exact_name_search(taxonomy, context_root, query, ok); } @@ -154,7 +154,7 @@ vector exact_name_search_genus(const RichTaxonomy& taxonomy, } return taxon_is_genus(taxon); }; - return exact_name_search(context_root, query, ok); + return exact_name_search(taxonomy, context_root, query, ok); } vector exact_name_search_higher(const RichTaxonomy& taxonomy, @@ -167,7 +167,7 @@ vector exact_name_search_higher(const RichTaxonomy& taxonomy, } return taxon_is_higher(taxon); }; - return exact_name_search(context_root, query, ok); + return exact_name_search(taxonomy, context_root, query, ok); } vector prefix_name_search(const Taxon* context_root, From 798641101fc7e12c9efd59706aaa86fd4745065e Mon Sep 17 00:00:00 2001 From: Benjamin Redelings Date: Wed, 15 Apr 2020 14:26:31 -0700 Subject: [PATCH 045/620] Don't insert {nullptr,nullptr} to match_name_to_taxon when name is homonym. 
--- otc/ctrie/context_ctrie_db.cpp | 35 +++++++++++++++++++++------------- 1 file changed, 22 insertions(+), 13 deletions(-) diff --git a/otc/ctrie/context_ctrie_db.cpp b/otc/ctrie/context_ctrie_db.cpp index 17afd9b4..3d80340e 100644 --- a/otc/ctrie/context_ctrie_db.cpp +++ b/otc/ctrie/context_ctrie_db.cpp @@ -21,36 +21,45 @@ ContextAwareCTrieBasedDB::ContextAwareCTrieBasedDB(const Context &context_arg, const auto & rt_data = rich_tax_tree.get_data(); std::set all_names; auto insert_hint = all_names.begin(); - for (auto const & name2nd : rt_data.name_to_node) { - auto nn = normalize_query(name2nd.first); - match_name_to_taxon[nn].push_back(const_rich_taxon_and_syn_ptr{name2nd.second, nullptr}); - insert_hint = all_names.insert(insert_hint, nn); + for (auto& [name, node] : rt_data.name_to_node) + { + // node could be nullptr here if this is a homonym, see note in taxonomy.h + if (node) + { + auto nn = normalize_query(name); + match_name_to_taxon[nn].push_back(const_rich_taxon_and_syn_ptr{node, nullptr}); + insert_hint = all_names.insert(insert_hint, nn); + } } insert_hint = all_names.begin(); - for (auto name2ndvec : rt_data.homonym_to_node) { - auto nn = normalize_query(name2ndvec.first); - for (auto hnp : name2ndvec.second) { + for (auto& [name, nodes] : rt_data.homonym_to_node) { + auto nn = normalize_query(name); + for (auto hnp : nodes) { + assert(hnp); match_name_to_taxon[nn].push_back(const_rich_taxon_and_syn_ptr{hnp, nullptr}); } insert_hint = all_names.insert(insert_hint, nn); } // filtered insert_hint = all_names.begin(); - for (auto name2rec : rt_data.name_to_record) { - auto nn = normalize_query(name2rec.first); - match_name_to_taxon[nn].push_back(const_rich_taxon_and_syn_ptr{nullptr, (const void *)name2rec.second}); + for (auto& [name, record] : rt_data.name_to_record) { + auto nn = normalize_query(name); + assert(record); + match_name_to_taxon[nn].push_back(const_rich_taxon_and_syn_ptr{nullptr, (const void *)record}); insert_hint = 
all_names.insert(insert_hint, nn); } insert_hint = all_names.begin(); - for (auto name2recvec : rt_data.homonym_to_record) { - auto nn = normalize_query(name2recvec.first); - for (auto hrp : name2recvec.second) { + for (auto& [name, records] : rt_data.homonym_to_record) { + auto nn = normalize_query(name); + for (auto hrp : records) { + assert(hrp); match_name_to_taxon[nn].push_back(const_rich_taxon_and_syn_ptr{nullptr, (const void *)hrp}); } insert_hint = all_names.insert(insert_hint, nn); } for (const auto & tjs : taxonomy.get_synonyms_list()) { auto nn = normalize_query(tjs.name); + assert(&tjs); match_name_to_taxon[nn].push_back(const_rich_taxon_and_syn_ptr{tjs.primary, (const void *)(&tjs)}); all_names.insert(nn); } From d3d06179123eeb8347d236381659847561b5bec3 Mon Sep 17 00:00:00 2001 From: Benjamin Redelings Date: Wed, 15 Apr 2020 15:34:42 -0700 Subject: [PATCH 046/620] Add RichTaxonomy argument to exact_synonynm_search( ) --- otc/ws/tnrsws.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/otc/ws/tnrsws.cpp b/otc/ws/tnrsws.cpp index ab4e9a0d..620c0ad4 100644 --- a/otc/ws/tnrsws.cpp +++ b/otc/ws/tnrsws.cpp @@ -86,7 +86,8 @@ bool taxon_is_higher(const Taxon* taxon) { } using vec_tax_str_pair_t = vector >; -vec_tax_str_pair_t exact_synonym_search(const Taxon* context_root, +vec_tax_str_pair_t exact_synonym_search(const RichTaxonomy& taxonomy, + const Taxon* context_root, string query, tax_pred_t ok = [](const Taxon*){return true;}) { @@ -109,12 +110,12 @@ vec_tax_str_pair_t exact_synonym_search(const RichTaxonomy& taxonomy, string query, bool include_suppressed) { if (include_suppressed) { - return exact_synonym_search(context_root, query); + return exact_synonym_search(taxonomy, context_root, query); } tax_pred_t ok = [&](const Taxon* taxon) { return not taxonomy.node_is_suppressed_from_tnrs(taxon); }; - return exact_synonym_search(context_root, query, ok); + return exact_synonym_search(taxonomy, context_root, query, ok); } 
vec_tax_str_pair_t exact_synonym_search_higher(const RichTaxonomy& taxonomy, @@ -127,7 +128,7 @@ vec_tax_str_pair_t exact_synonym_search_higher(const RichTaxonomy& taxonomy, } return taxon_is_higher(taxon); }; - return exact_synonym_search(context_root, query, ok); + return exact_synonym_search(taxonomy, context_root, query, ok); } vector exact_name_search_species(const RichTaxonomy& taxonomy, From 7e15193ace48295cca27cf39f6267059d209f38c Mon Sep 17 00:00:00 2001 From: Benjamin Redelings Date: Thu, 16 Apr 2020 14:20:35 -0700 Subject: [PATCH 047/620] Use ctrie distance 0 for exact match. --- otc/taxonomy/taxonomy.cpp | 55 ++++++++++++++++++++++++++++++++++++--- otc/ws/tnrsws.cpp | 41 ++++++++++++++++++++++++++++- 2 files changed, 91 insertions(+), 5 deletions(-) diff --git a/otc/taxonomy/taxonomy.cpp b/otc/taxonomy/taxonomy.cpp index 52e2fe96..9d9629bd 100644 --- a/otc/taxonomy/taxonomy.cpp +++ b/otc/taxonomy/taxonomy.cpp @@ -24,6 +24,7 @@ namespace fs = boost::filesystem; #include "otc/config_file.h" #include "otc/util.h" #include "otc/otc_base_includes.h" +#include "otc/ctrie/context_ctrie_db.h" using namespace otc; @@ -846,10 +847,10 @@ std::vector exact_name_search(const RichTaxonomy& taxonomy } -vector exact_name_search(const RichTaxonomy& taxonomy, - const RTRichTaxNode* context_root, - const std::string& query_ref, - std::function ok) +vector exact_name_search_slow(const RichTaxonomy& taxonomy, + const RTRichTaxNode* context_root, + const std::string& query_ref, + std::function ok) { std::string query{query_ref}; for (auto& c: query) { @@ -867,5 +868,51 @@ vector exact_name_search(const RichTaxonomy& taxonomy, return hits; } +vector exact_name_search(const RichTaxonomy& taxonomy, + const RTRichTaxNode* context_root, + const std::string& query_ref, + std::function ok) +{ + // Maybe move this into the exact_query( ) call. 
+ string query = normalize_query(query_ref); + + auto ctp = taxonomy.get_fuzzy_matcher(); + assert(ctp); + + auto fuzzy_results = ctp->to_taxa(ctp->exact_query(query), context_root, taxonomy, true); + vector hits; + for(auto& result: fuzzy_results) + { + if (not result.is_synonym()) + { + auto t = result.get_taxon(); + if (ok(t)) + hits.push_back(t); + } + } + +#ifdef DEBUG_NAME_SEARCH + { + auto hits2 = exact_name_search_slow(taxonomy, context_root, query_ref, ok); + std::sort(hits.begin(), hits.end()); + std::sort(hits2.begin(), hits2.end()); + LOG(INFO)<<"exact_name_search: query = '"<get_data().get_nonuniqname(); + LOG(INFO)<<"lcase match:"; + for(int i=0;iget_data().get_nonuniqname(); + } + else + LOG(INFO)<<"exact name search: "<to_taxa(ctp->exact_query(query), context_root, taxonomy, true); + vec_tax_str_pair_t hits2; + for(auto& result: fuzzy_results) + { + if (result.is_synonym()) + { + auto t = result.get_taxon(); + if (ok(t)) + hits2.push_back({t,query}); + } + } +// we can't sort references -- use string_view? + vector taxon_hits1; + for(auto& [taxon,_]: hits) + taxon_hits1.push_back(taxon); + vector taxon_hits2; + for(auto& [taxon,_]: hits2) + taxon_hits2.push_back(taxon); + std::sort(taxon_hits1.begin(), taxon_hits1.end()); + std::sort(taxon_hits2.begin(), taxon_hits2.end()); + LOG(INFO)<<"exact_synonym_search: query = '"<get_data().get_nonuniqname(); + LOG(INFO)<<"ctrie match:"; + for(int i=0;iget_data().get_nonuniqname(); + } + else + LOG(INFO)<<"exact synonym search: "< Date: Thu, 16 Apr 2020 14:25:19 -0700 Subject: [PATCH 048/620] Use ctrie distance 0 for exact synonym match. 
--- otc/ws/tnrsws.cpp | 52 +++++++++++++++++++++++++++++++---------------- 1 file changed, 35 insertions(+), 17 deletions(-) diff --git a/otc/ws/tnrsws.cpp b/otc/ws/tnrsws.cpp index 53b7862a..dd9eb064 100644 --- a/otc/ws/tnrsws.cpp +++ b/otc/ws/tnrsws.cpp @@ -86,10 +86,11 @@ bool taxon_is_higher(const Taxon* taxon) { } using vec_tax_str_pair_t = vector >; -vec_tax_str_pair_t exact_synonym_search(const RichTaxonomy& taxonomy, - const Taxon* context_root, - string query, - tax_pred_t ok = [](const Taxon*){return true;}) + +vec_tax_str_pair_t exact_synonym_search_slow(const RichTaxonomy& taxonomy, + const Taxon* context_root, + string query, + tax_pred_t ok = [](const Taxon*){return true;}) { query = normalize_query(query); vec_tax_str_pair_t hits; @@ -103,43 +104,60 @@ vec_tax_str_pair_t exact_synonym_search(const RichTaxonomy& taxonomy, } } } + return hits; +} + +vec_tax_str_pair_t exact_synonym_search(const RichTaxonomy& taxonomy, + const Taxon* context_root, + string query, + tax_pred_t ok = [](const Taxon*){return true;}) +{ + auto ctp = taxonomy.get_fuzzy_matcher(); - if (auto ctp = taxonomy.get_fuzzy_matcher()) + assert(ctp); + + auto fuzzy_results = ctp->to_taxa(ctp->exact_query(query), context_root, taxonomy, true); + vec_tax_str_pair_t hits; + for(auto& result: fuzzy_results) { - auto fuzzy_results = ctp->to_taxa(ctp->exact_query(query), context_root, taxonomy, true); - vec_tax_str_pair_t hits2; - for(auto& result: fuzzy_results) + if (result.is_synonym()) { - if (result.is_synonym()) - { - auto t = result.get_taxon(); - if (ok(t)) - hits2.push_back({t,query}); - } + auto t = result.get_taxon(); + if (ok(t)) + hits.push_back({t,query}); } -// we can't sort references -- use string_view? + } + +#ifdef DEBUG_NAME_SEARCH + { +// we can't sort references -- use string_view? 
+ auto hits2 = exact_synonym_search_slow(taxonomy, context_root, query, ok); + vector taxon_hits1; for(auto& [taxon,_]: hits) taxon_hits1.push_back(taxon); + vector taxon_hits2; for(auto& [taxon,_]: hits2) taxon_hits2.push_back(taxon); + std::sort(taxon_hits1.begin(), taxon_hits1.end()); std::sort(taxon_hits2.begin(), taxon_hits2.end()); LOG(INFO)<<"exact_synonym_search: query = '"<get_data().get_nonuniqname(); - LOG(INFO)<<"ctrie match:"; + LOG(INFO)<<"lcase match:"; for(int i=0;iget_data().get_nonuniqname(); } else LOG(INFO)<<"exact synonym search: "< Date: Thu, 16 Apr 2020 15:38:00 -0700 Subject: [PATCH 049/620] Fix unused var warnings. --- otc/taxonomy/taxonomy.cpp | 2 +- otc/ws/tnrsws.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/otc/taxonomy/taxonomy.cpp b/otc/taxonomy/taxonomy.cpp index 9d9629bd..61de874c 100644 --- a/otc/taxonomy/taxonomy.cpp +++ b/otc/taxonomy/taxonomy.cpp @@ -847,7 +847,7 @@ std::vector exact_name_search(const RichTaxonomy& taxonomy } -vector exact_name_search_slow(const RichTaxonomy& taxonomy, +vector exact_name_search_slow(const RichTaxonomy& /*taxonomy*/, const RTRichTaxNode* context_root, const std::string& query_ref, std::function ok) diff --git a/otc/ws/tnrsws.cpp b/otc/ws/tnrsws.cpp index dd9eb064..9460ff16 100644 --- a/otc/ws/tnrsws.cpp +++ b/otc/ws/tnrsws.cpp @@ -87,7 +87,7 @@ bool taxon_is_higher(const Taxon* taxon) { using vec_tax_str_pair_t = vector >; -vec_tax_str_pair_t exact_synonym_search_slow(const RichTaxonomy& taxonomy, +vec_tax_str_pair_t exact_synonym_search_slow(const RichTaxonomy& /*taxonomy*/, const Taxon* context_root, string query, tax_pred_t ok = [](const Taxon*){return true;}) From 1ae74779ba78dc0d467d0a3884c2ab21a2e742c9 Mon Sep 17 00:00:00 2001 From: Benjamin Redelings Date: Thu, 16 Apr 2020 15:38:18 -0700 Subject: [PATCH 050/620] This assert has to be true. 
--- otc/ctrie/context_ctrie_db.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/otc/ctrie/context_ctrie_db.cpp b/otc/ctrie/context_ctrie_db.cpp index 3d80340e..34ac7619 100644 --- a/otc/ctrie/context_ctrie_db.cpp +++ b/otc/ctrie/context_ctrie_db.cpp @@ -59,7 +59,6 @@ ContextAwareCTrieBasedDB::ContextAwareCTrieBasedDB(const Context &context_arg, } for (const auto & tjs : taxonomy.get_synonyms_list()) { auto nn = normalize_query(tjs.name); - assert(&tjs); match_name_to_taxon[nn].push_back(const_rich_taxon_and_syn_ptr{tjs.primary, (const void *)(&tjs)}); all_names.insert(nn); } From 0373194caf98cf38a99e02ac8dcf2178e7386946 Mon Sep 17 00:00:00 2001 From: Benjamin Redelings Date: Thu, 16 Apr 2020 15:38:45 -0700 Subject: [PATCH 051/620] Use result.get_matched_name() instead of query. --- otc/ws/tnrsws.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/otc/ws/tnrsws.cpp b/otc/ws/tnrsws.cpp index 9460ff16..b62dc19e 100644 --- a/otc/ws/tnrsws.cpp +++ b/otc/ws/tnrsws.cpp @@ -125,7 +125,7 @@ vec_tax_str_pair_t exact_synonym_search(const RichTaxonomy& taxonomy, { auto t = result.get_taxon(); if (ok(t)) - hits.push_back({t,query}); + hits.push_back({t,result.get_matched_name()}); } } From 397d05a9a81b5c70d28f0d71d4f3d7dbfdd55bdf Mon Sep 17 00:00:00 2001 From: Benjamin Redelings Date: Thu, 16 Apr 2020 15:38:57 -0700 Subject: [PATCH 052/620] Fix crash: store the whole string, not just a reference. 
--- otc/ws/tnrsws.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/otc/ws/tnrsws.cpp b/otc/ws/tnrsws.cpp index b62dc19e..75106f3e 100644 --- a/otc/ws/tnrsws.cpp +++ b/otc/ws/tnrsws.cpp @@ -85,7 +85,7 @@ bool taxon_is_higher(const Taxon* taxon) { return taxon->get_data().rank < TaxonomicRank::RANK_SPECIES; } -using vec_tax_str_pair_t = vector >; +using vec_tax_str_pair_t = vector >; vec_tax_str_pair_t exact_synonym_search_slow(const RichTaxonomy& /*taxonomy*/, const Taxon* context_root, From d92547d2471ff3a9880117ed21a0f1a2e6209940 Mon Sep 17 00:00:00 2001 From: Benjamin Redelings Date: Thu, 16 Apr 2020 15:55:24 -0700 Subject: [PATCH 053/620] Add some match_name tests with synonyms. The answers may change, but at least the answer isn't std::bad_alloc --- .../match_names/1/expected.json | 70 ++++++++++ .../match_names/1/method.json | 8 ++ .../match_names/2/expected.json | 132 ++++++++++++++++++ .../match_names/2/method.json | 8 ++ 4 files changed, 218 insertions(+) create mode 100644 test/tnrs/expectedws-induced/match_names/1/expected.json create mode 100644 test/tnrs/expectedws-induced/match_names/1/method.json create mode 100644 test/tnrs/expectedws-induced/match_names/2/expected.json create mode 100644 test/tnrs/expectedws-induced/match_names/2/method.json diff --git a/test/tnrs/expectedws-induced/match_names/1/expected.json b/test/tnrs/expectedws-induced/match_names/1/expected.json new file mode 100644 index 00000000..2eaa190d --- /dev/null +++ b/test/tnrs/expectedws-induced/match_names/1/expected.json @@ -0,0 +1,70 @@ +{ + "context": "Bacteria", + "governing_code": "ICNP", + "includes_approximate_matches": false, + "includes_deprecated_taxa": false, + "includes_suppressed_names": false, + "matched_names": [ + "Bacteria" + ], + "results": [ + { + "matches": [ + { + "is_approximate_match": false, + "is_synonym": false, + "matched_name": "Bacteria", + "nomenclature_code": "ICNP", + "score": 1.0, + "search_string": "bacteria", + "taxon": { + 
"flags": [], + "is_suppressed": false, + "is_suppressed_from_synth": false, + "name": "Bacteria", + "ott_id": 844192, + "rank": "domain", + "source": "ott0.0draft0", + "synonyms": [ + "GAL08", + "GOUTA4", + "JL-ETNP-Z39", + "Kazan-3B-28", + "LD1-PA38", + "MVP-21", + "NPL-UPA2", + "OC31", + "RsaHF231", + "S2R-29", + "SBYG-2791", + "SM2F11", + "WCHB1-60", + "not Bacteria Haeckel 1894", + "Monera" + ], + "tax_sources": [ + "silva:A16379/#1", + "ncbi:2", + "worms:6", + "gbif:3", + "irmng:13" + ], + "unique_name": "Bacteria" + } + } + ], + "name": "Bacteria" + } + ], + "taxonomy": { + "author": "open tree of life project", + "name": "ott", + "source": "ott0.0draft0", + "version": "0.0", + "weburl": "https://tree.opentreeoflife.org/about/taxonomy-version/ott0.0" + }, + "unambiguous_names": [ + "Bacteria" + ], + "unmatched_names": [] +} \ No newline at end of file diff --git a/test/tnrs/expectedws-induced/match_names/1/method.json b/test/tnrs/expectedws-induced/match_names/1/method.json new file mode 100644 index 00000000..d06a9f1f --- /dev/null +++ b/test/tnrs/expectedws-induced/match_names/1/method.json @@ -0,0 +1,8 @@ +{ + "url_fragment": "v3/tnrs/match_names", + "verb": "POST", + "arguments": { + "names": ["Bacteria"] + } +} + diff --git a/test/tnrs/expectedws-induced/match_names/2/expected.json b/test/tnrs/expectedws-induced/match_names/2/expected.json new file mode 100644 index 00000000..b6390b21 --- /dev/null +++ b/test/tnrs/expectedws-induced/match_names/2/expected.json @@ -0,0 +1,132 @@ +{ + "context": "Lobelia", + "governing_code": "ICN", + "includes_approximate_matches": false, + "includes_deprecated_taxa": false, + "includes_suppressed_names": false, + "matched_names": [ + "Hypsela" + ], + "results": [ + { + "matches": [ + { + "is_approximate_match": false, + "is_synonym": false, + "matched_name": "Lobelia", + "nomenclature_code": "ICN", + "score": 1.0, + "search_string": "hypsela", + "taxon": { + "flags": [], + "is_suppressed": false, + 
"is_suppressed_from_synth": false, + "name": "Lobelia", + "ott_id": 1086294, + "rank": "genus", + "source": "ott0.0draft0", + "synonyms": [ + "Hypsela", + "Heterotoma", + "Enchysia", + "Palmerella", + "Isolobus", + "Myopsia", + "Neowimmeria", + "Haynaldia", + "Dortmanna", + "Trimeris", + "Speirema", + "Colensoa", + "Piddingtonia", + "Isotoma", + "Mezleria", + "Tupa", + "Calcaratolobelia", + "Laurentia", + "Parastranthus", + "Rapuntium", + "Unigenes", + "Pratia", + "Solenopsis" + ], + "tax_sources": [ + "ncbi:4382", + "gbif:2756426", + "irmng:1083184", + "irmng:1452934", + "irmng:1308508", + "irmng:1299058", + "irmng:1295316" + ], + "unique_name": "Lobelia" + } + }, + { + "is_approximate_match": false, + "is_synonym": true, + "matched_name": "Hypsela", + "nomenclature_code": "ICN", + "score": 1.0, + "search_string": "hypsela", + "taxon": { + "flags": [], + "is_suppressed": false, + "is_suppressed_from_synth": false, + "name": "Lobelia", + "ott_id": 1086294, + "rank": "genus", + "source": "ott0.0draft0", + "synonyms": [ + "Hypsela", + "Heterotoma", + "Enchysia", + "Palmerella", + "Isolobus", + "Myopsia", + "Neowimmeria", + "Haynaldia", + "Dortmanna", + "Trimeris", + "Speirema", + "Colensoa", + "Piddingtonia", + "Isotoma", + "Mezleria", + "Tupa", + "Calcaratolobelia", + "Laurentia", + "Parastranthus", + "Rapuntium", + "Unigenes", + "Pratia", + "Solenopsis" + ], + "tax_sources": [ + "ncbi:4382", + "gbif:2756426", + "irmng:1083184", + "irmng:1452934", + "irmng:1308508", + "irmng:1299058", + "irmng:1295316" + ], + "unique_name": "Lobelia" + } + } + ], + "name": "Hypsela" + } + ], + "taxonomy": { + "author": "open tree of life project", + "name": "ott", + "source": "ott0.0draft0", + "version": "0.0", + "weburl": "https://tree.opentreeoflife.org/about/taxonomy-version/ott0.0" + }, + "unambiguous_names": [ + "Hypsela" + ], + "unmatched_names": [] +} \ No newline at end of file diff --git a/test/tnrs/expectedws-induced/match_names/2/method.json 
b/test/tnrs/expectedws-induced/match_names/2/method.json new file mode 100644 index 00000000..a1347fff --- /dev/null +++ b/test/tnrs/expectedws-induced/match_names/2/method.json @@ -0,0 +1,8 @@ +{ + "url_fragment": "v3/tnrs/match_names", + "verb": "POST", + "arguments": { + "names": ["Hypsela"] + } +} + From 13349dea39d8f2508c89a8b292d996da89f79b1e Mon Sep 17 00:00:00 2001 From: Benjamin Redelings Date: Thu, 16 Apr 2020 17:51:49 -0700 Subject: [PATCH 054/620] Rename homonym_to_node to homonym_to_nodes. --- otc/ctrie/context_ctrie_db.cpp | 2 +- otc/taxonomy/taxonomy.cpp | 4 ++-- otc/taxonomy/taxonomy.h | 4 ++-- tools/tnrs-cli.cpp | 4 ++-- ws/tolwsbooting.cpp | 4 ++-- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/otc/ctrie/context_ctrie_db.cpp b/otc/ctrie/context_ctrie_db.cpp index 34ac7619..618ddfa2 100644 --- a/otc/ctrie/context_ctrie_db.cpp +++ b/otc/ctrie/context_ctrie_db.cpp @@ -32,7 +32,7 @@ ContextAwareCTrieBasedDB::ContextAwareCTrieBasedDB(const Context &context_arg, } } insert_hint = all_names.begin(); - for (auto& [name, nodes] : rt_data.homonym_to_node) { + for (auto& [name, nodes] : rt_data.homonym_to_nodes) { auto nn = normalize_query(name); for (auto hnp : nodes) { assert(hnp); diff --git a/otc/taxonomy/taxonomy.cpp b/otc/taxonomy/taxonomy.cpp index 61de874c..b520b4b5 100644 --- a/otc/taxonomy/taxonomy.cpp +++ b/otc/taxonomy/taxonomy.cpp @@ -619,10 +619,10 @@ void RichTaxonomy::read_synonyms() { nit = tree_data.name_to_node.insert(nit, name_map_pair(name_ref, primary)); } else { if (nit->second != nullptr) { - tree_data.homonym_to_node[name_ref].push_back(nit->second); + tree_data.homonym_to_nodes[name_ref].push_back(nit->second); nit->second = nullptr; } - tree_data.homonym_to_node[name_ref].push_back(primary); + tree_data.homonym_to_nodes[name_ref].push_back(primary); } auto vs = comma_separated_as_vec(sourceinfo); diff --git a/otc/taxonomy/taxonomy.h b/otc/taxonomy/taxonomy.h index 9f0e0dd6..dba9a4ce 100644 --- 
a/otc/taxonomy/taxonomy.h +++ b/otc/taxonomy/taxonomy.h @@ -303,7 +303,7 @@ class RTRichTaxTreeData { std::map name_to_record; // for filtered std::unordered_map id_to_node; std::unordered_map id_to_record; - std::map > homonym_to_node; + std::map > homonym_to_nodes; std::map > homonym_to_record; std::map non_unique_taxon_names; @@ -516,7 +516,7 @@ inline void populate_node_from_taxonomy_record(RTRichTaxNode & nd, data.flags = tr.flags; data.rank = string_to_rank(tr.rank); register_taxon_in_maps(tree_data.name_to_node, - tree_data.homonym_to_node, + tree_data.homonym_to_nodes, data.possibly_nonunique_name, uname, this_node); diff --git a/tools/tnrs-cli.cpp b/tools/tnrs-cli.cpp index 84551998..41b71fa7 100644 --- a/tools/tnrs-cli.cpp +++ b/tools/tnrs-cli.cpp @@ -80,8 +80,8 @@ OttIdSet diagnose_name(const RTRichTaxTreeData & rt_data, if (out != nullptr) {*out << "node in taxonomy: ott_id = " << nd->get_ott_id() << " name = \"" << nd->get_name() << "\"\n";} } } - auto n2hit = rt_data.homonym_to_node.find(name); - if (n2hit != rt_data.homonym_to_node.end()) { + auto n2hit = rt_data.homonym_to_nodes.find(name); + if (n2hit != rt_data.homonym_to_nodes.end()) { const auto & ndv = n2hit->second; for (auto nd : ndv) { if (nd != nullptr) { diff --git a/ws/tolwsbooting.cpp b/ws/tolwsbooting.cpp index b8f489b2..a018f0eb 100644 --- a/ws/tolwsbooting.cpp +++ b/ws/tolwsbooting.cpp @@ -1090,7 +1090,7 @@ inline std::size_t calc_memory_used(const RTRichTaxTreeData &d, MemoryBookkeeper std::size_t nn_sz = calc_memory_used_by_map_simple(d.name_to_node, mb); std::size_t nutn_sz = calc_memory_used_by_map_simple(d.non_unique_taxon_names, mb); std::size_t htn_sz = 0; - for (auto el : d.homonym_to_node) { + for (auto el : d.homonym_to_nodes) { htn_sz += sizeof(std::string_view); htn_sz += calc_memory_used_by_vector_eqsize(el.second, sizeof(const RTRichTaxNode *), mb); } @@ -1103,7 +1103,7 @@ inline std::size_t calc_memory_used(const RTRichTaxTreeData &d, MemoryBookkeeper mb["taxonomy data 
id_to_node"] += in_sz; mb["taxonomy data name_to_node"] += nn_sz; mb["taxonomy data non_unique_taxon_names"] += nutn_sz; - mb["taxonomy data homonym_to_node"] += htn_sz; + mb["taxonomy data homonym_to_nodes"] += htn_sz; return nm_sz + gm_sz + wm_sz + fm_sz + im_sz + f2j_sz + in_sz + nn_sz + nutn_sz + htn_sz; } From 3ab0859e7c4e927a6b9ee0ba29c824cf4e08e9e9 Mon Sep 17 00:00:00 2001 From: Benjamin Redelings Date: Thu, 16 Apr 2020 17:52:18 -0700 Subject: [PATCH 055/620] Stop putting synonyms in the name maps. --- otc/taxonomy/taxonomy.cpp | 12 ---- .../match_names/2/expected.json | 60 +------------------ 2 files changed, 3 insertions(+), 69 deletions(-) diff --git a/otc/taxonomy/taxonomy.cpp b/otc/taxonomy/taxonomy.cpp index b520b4b5..74c2a235 100644 --- a/otc/taxonomy/taxonomy.cpp +++ b/otc/taxonomy/taxonomy.cpp @@ -612,18 +612,6 @@ void RichTaxonomy::read_synonyms() { this->synonyms.emplace_back(name, primary, sourceinfo); TaxonomicJuniorSynonym & tjs = *(this->synonyms.rbegin()); - auto nit = tree_data.name_to_node.lower_bound(name); - string_view name_ref = tjs.name; - typedef std::pair name_map_pair; - if (nit == tree_data.name_to_node.end() or nit->first != name_ref) { - nit = tree_data.name_to_node.insert(nit, name_map_pair(name_ref, primary)); - } else { - if (nit->second != nullptr) { - tree_data.homonym_to_nodes[name_ref].push_back(nit->second); - nit->second = nullptr; - } - tree_data.homonym_to_nodes[name_ref].push_back(primary); - } auto vs = comma_separated_as_vec(sourceinfo); process_source_info_vec(vs, tree_data, tjs, primary); diff --git a/test/tnrs/expectedws-induced/match_names/2/expected.json b/test/tnrs/expectedws-induced/match_names/2/expected.json index b6390b21..ccfa2364 100644 --- a/test/tnrs/expectedws-induced/match_names/2/expected.json +++ b/test/tnrs/expectedws-induced/match_names/2/expected.json @@ -1,6 +1,6 @@ { - "context": "Lobelia", - "governing_code": "ICN", + "context": "All life", + "governing_code": "undefined", 
"includes_approximate_matches": false, "includes_deprecated_taxa": false, "includes_suppressed_names": false, @@ -10,58 +10,6 @@ "results": [ { "matches": [ - { - "is_approximate_match": false, - "is_synonym": false, - "matched_name": "Lobelia", - "nomenclature_code": "ICN", - "score": 1.0, - "search_string": "hypsela", - "taxon": { - "flags": [], - "is_suppressed": false, - "is_suppressed_from_synth": false, - "name": "Lobelia", - "ott_id": 1086294, - "rank": "genus", - "source": "ott0.0draft0", - "synonyms": [ - "Hypsela", - "Heterotoma", - "Enchysia", - "Palmerella", - "Isolobus", - "Myopsia", - "Neowimmeria", - "Haynaldia", - "Dortmanna", - "Trimeris", - "Speirema", - "Colensoa", - "Piddingtonia", - "Isotoma", - "Mezleria", - "Tupa", - "Calcaratolobelia", - "Laurentia", - "Parastranthus", - "Rapuntium", - "Unigenes", - "Pratia", - "Solenopsis" - ], - "tax_sources": [ - "ncbi:4382", - "gbif:2756426", - "irmng:1083184", - "irmng:1452934", - "irmng:1308508", - "irmng:1299058", - "irmng:1295316" - ], - "unique_name": "Lobelia" - } - }, { "is_approximate_match": false, "is_synonym": true, @@ -125,8 +73,6 @@ "version": "0.0", "weburl": "https://tree.opentreeoflife.org/about/taxonomy-version/ott0.0" }, - "unambiguous_names": [ - "Hypsela" - ], + "unambiguous_names": [], "unmatched_names": [] } \ No newline at end of file From 9adb65268be743749288b547c7db00a578a0e19d Mon Sep 17 00:00:00 2001 From: Benjamin Redelings Date: Tue, 21 Apr 2020 14:24:00 -0700 Subject: [PATCH 056/620] Remove unused tnrs_json_methods.cpp The content of this file mostly ended up in otc/ws/tnrsws.cpp --- otc/tnrs/tnrs_json_methods.cpp | 904 --------------------------------- 1 file changed, 904 deletions(-) delete mode 100644 otc/tnrs/tnrs_json_methods.cpp diff --git a/otc/tnrs/tnrs_json_methods.cpp b/otc/tnrs/tnrs_json_methods.cpp deleted file mode 100644 index ffef36c3..00000000 --- a/otc/tnrs/tnrs_json_methods.cpp +++ /dev/null @@ -1,904 +0,0 @@ -#include "otc/tnrs/tnrs_json_methods.h" 
-#include "otc/tnrs/context.h" -#include "json.hpp" -#include "otc/ws/tolws.h" -using json=nlohmann::json; -using namespace std; -namespace otc { - -typedef RTRichTaxNode Taxon; - - -using tax_pred_t = std::function; -json get_taxon_json(const RichTaxonomy& taxonomy, const Taxon& taxon); - -enum match_status {unmatched=0, - ambiguous_match=1, - unambiguous_match=2}; - - -string escape_query_string(const string& name) { - return name; -} - -// It seems that infer_context is supposed to ignore synonyms: -// * Bacteria is NOT ambiguous: it has 1 name match, and 1 synonym match. -// * Firmiscala IS ambiguous: it has 0 name matches, and 1 synonym match. -// * Random gibberish is reported as "ambiguous". -// curl -X POST https://api.opentreeoflife.org/v2/tnrs/infer_context -H "content-type:application/json" -d '{"names":["Bacteria","Firmiscala"]}' -// - - -bool taxon_is_specific(const Taxon* taxon) { - auto rank = taxon->get_data().rank; - return rank_is_specific(rank); -} - -bool taxon_is_genus(const Taxon* taxon) { - return taxon->get_data().rank == TaxonomicRank::RANK_GENUS; -} - -bool taxon_is_higher(const Taxon* taxon) { - return taxon->get_data().rank < TaxonomicRank::RANK_SPECIES; -} - -using vec_tax_str_pair_t = vector >; -vec_tax_str_pair_t exact_synonym_search(const Taxon* context_root, - string query, - tax_pred_t ok = [](const Taxon*){return true;}) -{ - vec_tax_str_pair_t hits; - for(auto taxon: iter_post_n_const(*context_root)) { - if (not ok(taxon)) { - continue; - } - for(auto& tjs: taxon->get_data().junior_synonyms) { - if (lcase_string_equals(query, tjs->get_name())) { - hits.push_back({taxon,tjs->get_name()}); - } - } - } - return hits; -} - -vec_tax_str_pair_t exact_synonym_search(const RichTaxonomy& taxonomy, - const Taxon* context_root, - string query, - bool include_suppressed) { - if (include_suppressed) { - return exact_synonym_search(context_root, query); - } - tax_pred_t ok = [&](const Taxon* taxon) { - return not 
taxonomy.node_is_suppressed_from_tnrs(taxon); - }; - return exact_synonym_search(context_root, query, ok); -} - -vec_tax_str_pair_t exact_synonym_search_higher(const RichTaxonomy& taxonomy, - const Taxon* context_root, - string query, - bool include_suppressed) { - tax_pred_t ok = [&](const Taxon* taxon) { - if (not include_suppressed and taxonomy.node_is_suppressed_from_tnrs(taxon)) { - return false; - } - return taxon_is_higher(taxon); - }; - return exact_synonym_search(context_root, query, ok); -} - -vector exact_name_search_species(const RichTaxonomy& taxonomy, - const Taxon* context_root, - string query, - bool include_suppressed) { - tax_pred_t ok = [&](const Taxon* taxon) { - if (not include_suppressed and taxonomy.node_is_suppressed_from_tnrs(taxon)) { - return false; - } - return taxon_is_specific(taxon); - }; - return exact_name_search(context_root, query, ok); - -} - -vector exact_name_search_genus(const RichTaxonomy& taxonomy, - const Taxon* context_root, - string query, - bool include_suppressed) { - tax_pred_t ok = [&](const Taxon* taxon) { - if (not include_suppressed and taxonomy.node_is_suppressed_from_tnrs(taxon)) { - return false; - } - return taxon_is_genus(taxon); - }; - return exact_name_search(context_root, query, ok); -} - -vector exact_name_search_higher(const RichTaxonomy& taxonomy, - const Taxon* context_root, - string query, - bool include_suppressed) { - tax_pred_t ok = [&](const Taxon* taxon) { - if (not include_suppressed and taxonomy.node_is_suppressed_from_tnrs(taxon)) { - return false; - } - return taxon_is_higher(taxon); - }; - return exact_name_search(context_root, query, ok); -} - -vector prefix_name_search(const Taxon* context_root, - const string& query, - tax_pred_t ok = [](const Taxon*){return true;}) { - vector hits; - for(auto taxon: iter_post_n_const(*context_root)) { - if (not ok(taxon)) { - continue; - } - if (lcase_match_prefix(taxon->get_data().get_nonuniqname(), query)) { - hits.push_back(taxon); - } - } - return 
hits; -} - -vector prefix_name_search(const RichTaxonomy& taxonomy, - const Taxon* context_root, - const string& query, - bool include_suppressed) { - tax_pred_t ok = [&](const Taxon* taxon) { - return include_suppressed or not taxonomy.node_is_suppressed_from_tnrs(taxon); - }; - return prefix_name_search(context_root, query, ok); -} - -vector prefix_name_search_higher(const RichTaxonomy& taxonomy, - const Taxon* context_root, - const string& query, - bool include_suppressed) { - tax_pred_t ok = [&](const Taxon* taxon) { - if (not include_suppressed and taxonomy.node_is_suppressed_from_tnrs(taxon)) { - return false; - } - return taxon_is_higher(taxon); - }; - return prefix_name_search(context_root, query, ok); -} - -vec_tax_str_pair_t prefix_synonym_search(const Taxon* context_root, - string query, - tax_pred_t ok = [](const Taxon*){return true;}) -{ - vec_tax_str_pair_t hits; - for(auto taxon: iter_post_n_const(*context_root)) { - if (not ok(taxon)) { - continue; - } - for(auto& tjs: taxon->get_data().junior_synonyms) { - if (lcase_match_prefix(tjs->get_name(), query)) { - hits.push_back({taxon,tjs->get_name()}); - } - } - } - return hits; -} - -vec_tax_str_pair_t prefix_synonym_search(const RichTaxonomy& taxonomy, - const Taxon* context_root, - string query, - bool include_suppressed) { - std::function ok = [&](const Taxon* taxon) { - return include_suppressed or not taxonomy.node_is_suppressed_from_tnrs(taxon); - }; - return prefix_synonym_search(context_root, query, ok); -} - -inline json get_taxon_json(const RichTaxonomy& taxonomy, const Taxon& taxon) { - json j; - tax_service_add_taxon_info(taxonomy, taxon, j); - // What about the "is_suppressed_from_synth" flag? Do we want that? 
- return j; -} - -inline json get_taxon_record_json(const RichTaxonomy& taxonomy, const TaxonomyRecord & taxon) { - json j; - tax_service_add_suppressed_taxon_info(taxonomy, taxon, j); - return j; -} - - -inline json ContextSearcher::_base_name_match_json(const string& query, - const Taxon * taxon) const { - json result; - result["taxon"] = get_taxon_json(taxonomy, *taxon); - result["search_string"] = query; - result["nomenclature_code"] = Context::get_code_name(taxonomy, taxon); - return result; -} - -inline json ContextSearcher::_base_name_match_json(const string & query, - const TaxonomyRecord * record) const { - json result; - result["taxon"] = get_taxon_record_json(taxonomy, *record); - result["search_string"] = query; - result["nomenclature_code"] = Context::get_code_name(taxonomy, record); - return result; -} - -inline json ContextSearcher::fuzzy_name_match_json(const string& query, - const FuzzyQueryResultWithTaxon & fqrwt) const { - json result; - auto taxon = fqrwt.get_taxon(); - if (taxon != nullptr) { - result = _base_name_match_json(query, taxon); - } else { - result = _base_name_match_json(query, taxon); - } - result["score"] = fqrwt.get_score(); - result["is_approximate_match"] = true; - result["is_synonym"] = false; - result["matched_name"] = fqrwt.get_matched_name(); - return result; -} - -inline json ContextSearcher::exact_name_match_json(const string& query, - const Taxon* taxon) const { - auto result = _base_name_match_json(query, taxon); - result["score"] = 1.0; - result["is_approximate_match"] = false; - result["is_synonym"] = false; - result["matched_name"] = taxon->get_data().get_nonuniqname(); - return result; -using tax_pred_t = std::function; -json get_taxon_json(const RichTaxonomy& taxonomy, const Taxon& taxon); - -enum match_status {unmatched=0, - ambiguous_match=1, - unambiguous_match=2}; - - -string escape_query_string(const string& name) { - return name; -} - -// It seems that infer_context is supposed to ignore synonyms: -// * 
Bacteria is NOT ambiguous: it has 1 name match, and 1 synonym match. -// * Firmiscala IS ambiguous: it has 0 name matches, and 1 synonym match. -// * Random gibberish is reported as "ambiguous". -// curl -X POST https://api.opentreeoflife.org/v2/tnrs/infer_context -H "content-type:application/json" -d '{"names":["Bacteria","Firmiscala"]}' -// - - -bool taxon_is_specific(const Taxon* taxon) { - auto rank = taxon->get_data().rank; - return rank_is_specific(rank); -} - -bool taxon_is_genus(const Taxon* taxon) { - return taxon->get_data().rank == TaxonomicRank::RANK_GENUS; -} - -bool taxon_is_higher(const Taxon* taxon) { - return taxon->get_data().rank < TaxonomicRank::RANK_SPECIES; -} - -using vec_tax_str_pair_t = vector >; -vec_tax_str_pair_t exact_synonym_search(const Taxon* context_root, - string query, - tax_pred_t ok = [](const Taxon*){return true;}) -{ - vec_tax_str_pair_t hits; - for(auto taxon: iter_post_n_const(*context_root)) { - if (not ok(taxon)) { - continue; - } - for(auto& tjs: taxon->get_data().junior_synonyms) { - if (lcase_string_equals(query, tjs->get_name())) { - hits.push_back({taxon,tjs->get_name()}); - } - } - } - return hits; -} - -vec_tax_str_pair_t exact_synonym_search(const RichTaxonomy& taxonomy, - const Taxon* context_root, - string query, - bool include_suppressed) { - if (include_suppressed) { - return exact_synonym_search(context_root, query); - } - tax_pred_t ok = [&](const Taxon* taxon) { - return not taxonomy.node_is_suppressed_from_tnrs(taxon); - }; - return exact_synonym_search(context_root, query, ok); -} - -vec_tax_str_pair_t exact_synonym_search_higher(const RichTaxonomy& taxonomy, - const Taxon* context_root, - string query, - bool include_suppressed) { - tax_pred_t ok = [&](const Taxon* taxon) { - if (not include_suppressed and taxonomy.node_is_suppressed_from_tnrs(taxon)) { - return false; - } - return taxon_is_higher(taxon); - }; - return exact_synonym_search(context_root, query, ok); -} - -vector 
exact_name_search_species(const RichTaxonomy& taxonomy, - const Taxon* context_root, - string query, - bool include_suppressed) { - tax_pred_t ok = [&](const Taxon* taxon) { - if (not include_suppressed and taxonomy.node_is_suppressed_from_tnrs(taxon)) { - return false; - } - return taxon_is_specific(taxon); - }; - return exact_name_search(context_root, query, ok); - -} - -vector exact_name_search_genus(const RichTaxonomy& taxonomy, - const Taxon* context_root, - string query, - bool include_suppressed) { - tax_pred_t ok = [&](const Taxon* taxon) { - if (not include_suppressed and taxonomy.node_is_suppressed_from_tnrs(taxon)) { - return false; - } - return taxon_is_genus(taxon); - }; - return exact_name_search(context_root, query, ok); -} - -vector exact_name_search_higher(const RichTaxonomy& taxonomy, - const Taxon* context_root, - string query, - bool include_suppressed) { - tax_pred_t ok = [&](const Taxon* taxon) { - if (not include_suppressed and taxonomy.node_is_suppressed_from_tnrs(taxon)) { - return false; - } - return taxon_is_higher(taxon); - }; - return exact_name_search(context_root, query, ok); -} - -vector prefix_name_search(const Taxon* context_root, - const string& query, - tax_pred_t ok = [](const Taxon*){return true;}) { - vector hits; - for(auto taxon: iter_post_n_const(*context_root)) { - if (not ok(taxon)) { - continue; - } - if (lcase_match_prefix(taxon->get_data().get_nonuniqname(), query)) { - hits.push_back(taxon); - } - } - return hits; -} - -vector prefix_name_search(const RichTaxonomy& taxonomy, - const Taxon* context_root, - const string& query, - bool include_suppressed) { - tax_pred_t ok = [&](const Taxon* taxon) { - return include_suppressed or not taxonomy.node_is_suppressed_from_tnrs(taxon); - }; - return prefix_name_search(context_root, query, ok); -} - -vector prefix_name_search_higher(const RichTaxonomy& taxonomy, - const Taxon* context_root, - const string& query, - bool include_suppressed) { - tax_pred_t ok = [&](const Taxon* 
taxon) { - if (not include_suppressed and taxonomy.node_is_suppressed_from_tnrs(taxon)) { - return false; - } - return taxon_is_higher(taxon); - }; - return prefix_name_search(context_root, query, ok); -} - -vec_tax_str_pair_t prefix_synonym_search(const Taxon* context_root, - string query, - tax_pred_t ok = [](const Taxon*){return true;}) -{ - vec_tax_str_pair_t hits; - for(auto taxon: iter_post_n_const(*context_root)) { - if (not ok(taxon)) { - continue; - } - for(auto& tjs: taxon->get_data().junior_synonyms) { - if (lcase_match_prefix(tjs->get_name(), query)) { - hits.push_back({taxon,tjs->get_name()}); - } - } - } - return hits; -} - -vec_tax_str_pair_t prefix_synonym_search(const RichTaxonomy& taxonomy, - const Taxon* context_root, - string query, - bool include_suppressed) { - std::function ok = [&](const Taxon* taxon) { - return include_suppressed or not taxonomy.node_is_suppressed_from_tnrs(taxon); - }; - return prefix_synonym_search(context_root, query, ok); -} - -inline json get_taxon_json(const RichTaxonomy& taxonomy, const Taxon& taxon) { - json j; - tax_service_add_taxon_info(taxonomy, taxon, j); - // What about the "is_suppressed_from_synth" flag? Do we want that? 
- return j; -} - -inline json get_taxon_record_json(const RichTaxonomy& taxonomy, const TaxonomyRecord & taxon) { - json j; - tax_service_add_suppressed_taxon_info(taxonomy, taxon, j); - return j; -} - - -inline json ContextSearcher::_base_name_match_json(const string& query, - const Taxon * taxon) const { - json result; - result["taxon"] = get_taxon_json(taxonomy, *taxon); - result["search_string"] = query; - result["nomenclature_code"] = Context::get_code_name(taxonomy, taxon); - return result; -} - -inline json ContextSearcher::_base_name_match_json(const string & query, - const TaxonomyRecord * record) const { - json result; - result["taxon"] = get_taxon_record_json(taxonomy, *record); - result["search_string"] = query; - result["nomenclature_code"] = Context::get_code_name(taxonomy, record); - return result; -} - -inline json ContextSearcher::fuzzy_name_match_json(const string& query, - const FuzzyQueryResultWithTaxon & fqrwt) const { - json result; - auto taxon = fqrwt.get_taxon(); - if (taxon != nullptr) { - result = _base_name_match_json(query, taxon); - } else { - result = _base_name_match_json(query, taxon); - } - result["score"] = fqrwt.get_score(); - result["is_approximate_match"] = true; - result["is_synonym"] = false; - result["matched_name"] = fqrwt.get_matched_name(); - return result; -} - -inline json ContextSearcher::exact_name_match_json(const string& query, - const Taxon* taxon) const { - auto result = _base_name_match_json(query, taxon); - result["score"] = 1.0; - result["is_approximate_match"] = false; - result["is_synonym"] = false; - result["matched_name"] = taxon->get_data().get_nonuniqname(); - return result; -} - -inline json ContextSearcher::exact_synonym_match_json(const string& query, - const Taxon* taxon, - const string& synonym_name) const { - auto result = _base_name_match_json(query, taxon); - result["score"] = 1.0; - result["is_approximate_match"] = false; - result["is_synonym"] = true; - result["matched_name"] = synonym_name; - 
return result; -} - - -pair ContextSearcher::match_name(const string & raw_query, - bool do_approximate_matching, - bool include_suppressed) { - auto query = normalize_query(raw_query); - json results; - match_status status = unmatched; - // 1. See if we can find an exact name match - auto exact_name_matches = exact_name_search(taxonomy, context_root, query, include_suppressed); - for(auto taxon: exact_name_matches) { - results.push_back(exact_name_match_json(query, taxon)); - } - if (exact_name_matches.size() == 1) { - status = unambiguous_match; - } - // 2. See if we can find an exact name match for synonyms - auto exact_synonym_matches = exact_synonym_search(taxonomy, context_root, query, include_suppressed); - for(auto& [ taxon, synonym_name ]: exact_synonym_matches) { - results.push_back(exact_synonym_match_json(query, taxon, synonym_name)); - } - if (status == unmatched and results.size()) { - status = ambiguous_match; - } - // 3. Do fuzzy matching ONLY for names that we couldn't match - if (do_approximate_matching and status == unmatched) { - // do fuzzy matching. - auto ctp = taxonomy.get_fuzzy_matcher(); - if (ctp == nullptr) { - throw OTCError() << "Fuzzy matching has not been enabled in the taxonomy, but was requested in match_name."; - } - auto fuzzy_results = ctp->fuzzy_query_to_taxa(query, context_root, taxonomy, include_suppressed); - if (fuzzy_results.size() > 0) { - if (fuzzy_results.size() == 1) { - status = unambiguous_match; - } else { - status = ambiguous_match; - } - for (auto fqr : fuzzy_results) { - results.push_back(fuzzy_name_match_json(query, fqr)); - } - } - } - json match_results; - match_results["name"] = raw_query; - match_results["matches"] = results; - return {match_results, status}; -} - - -// return name.split("\s+",2) if the string has a space or an optional with no value otherwise. 
-optional> split_genus_species(const string& name) { - auto first_space = name.find(' '); - // Quit if there's no space - if (first_space == string::npos) { - return {}; - } - auto genus = name.substr(0,first_space); - auto non_space = name.find_first_not_of(' ', first_space+1); - if (non_space == string::npos) { - non_space = name.size(); - } - auto species = name.substr(non_space, name.size() - non_space); - return {{genus, species}}; -} - - - -/* -Note from taxomachine: tnrs_v3.java: - -Assumes the input is a taxon name that may be incomplete (i.e. the beginning of a taxon name such as 'Ast', -which would match 'Astelia', 'Astilbe', 'Aster', 'Asteroidea', 'Asteraceae', 'Astrantia', etc.). If the input -string is an exact string match to an existing taxon name, then only the exact match will be returned, (i.e. the -input 'Aster' will produce a single result 'Aster'). - -If name expansion identifies a valid genus name, the results will -not include species names from within that genus, but if a trailing space exists in the input following a valid -genus name, then species names will be returned. For example, both 'Garcin' and 'Garcinia' will match the genus name -'Garcinia' itself but will not match any species names within the genus, but 'Garcinia ' (note the trailing space) -will match all the species in the genus, and 'Garcinia m' with match all species names in Garcinia with a specific -epithet that starts with 'm'. - -**IMPORTANT NOTE: This service should not be used for general purpose TNRS queries.** It is optimized for and -(obviously) intended for use *only* with autocomplete boxes on web forms. For all name matching purposes other than -autocompleting name fields on forms, use the `match_names` service. 
-*/ - - - -json autocomplete_json(const RichTaxonomy& taxonomy, const Taxon* taxon) { - json match; - match -enum match_status {unmatched=0, - ambiguous_match=1, - unambiguous_match=2}; - -} - -inline json ContextSearcher::exact_synonym_match_json(const string& query, - const Taxon* taxon, - const string& synonym_name) const { - auto result = _base_name_match_json(query, taxon); - result["score"] = 1.0; - result["is_approximate_match"] = false; - result["is_synonym"] = true; - result["matched_name"] = synonym_name; - return result; -} - - -pair ContextSearcher::match_name(const string & raw_query, - bool do_approximate_matching, - bool include_suppressed) { - auto query = normalize_query(raw_query); - json results; - match_status status = unmatched; - // 1. See if we can find an exact name match - auto exact_name_matches = exact_name_search(taxonomy, context_root, query, include_suppressed); - for(auto taxon: exact_name_matches) { - results.push_back(exact_name_match_json(query, taxon)); - } - if (exact_name_matches.size() == 1) { - status = unambiguous_match; - } - // 2. See if we can find an exact name match for synonyms - auto exact_synonym_matches = exact_synonym_search(taxonomy, context_root, query, include_suppressed); - for(auto& [ taxon, synonym_name ]: exact_synonym_matches) { - results.push_back(exact_synonym_match_json(query, taxon, synonym_name)); - } - if (status == unmatched and results.size()) { - status = ambiguous_match; - } - // 3. Do fuzzy matching ONLY for names that we couldn't match - if (do_approximate_matching and status == unmatched) { - // do fuzzy matching. 
- auto ctp = taxonomy.get_fuzzy_matcher(); - if (ctp == nullptr) { - throw OTCError() << "Fuzzy matching has not been enabled in the taxonomy, but was requested in match_name."; - } - auto fuzzy_results = ctp->fuzzy_query_to_taxa(query, context_root, taxonomy, include_suppressed); - if (fuzzy_results.size() > 0) { - if (fuzzy_results.size() == 1) { - status = unambiguous_match; - } else { - status = ambiguous_match; - } - for (auto fqr : fuzzy_results) { - results.push_back(fuzzy_name_match_json(query, fqr)); - } - } - } - json match_results; - match_results["name"] = raw_query; - match_results["matches"] = results; - return {match_results, status}; -} - - -// return name.split("\s+",2) if the string has a space or an optional with no value otherwise. -optional> split_genus_species(const string& name) { - auto first_space = name.find(' '); - // Quit if there's no space - if (first_space == string::npos) { - return {}; - } - auto genus = name.substr(0,first_space); - auto non_space = name.find_first_not_of(' ', first_space+1); - if (non_space == string::npos) { - non_space = name.size(); - } - auto species = name.substr(non_space, name.size() - non_space); - return {{genus, species}}; -} - - - -/* -Note from taxomachine: tnrs_v3.java: - -Assumes the input is a taxon name that may be incomplete (i.e. the beginning of a taxon name such as 'Ast', -which would match 'Astelia', 'Astilbe', 'Aster', 'Asteroidea', 'Asteraceae', 'Astrantia', etc.). If the input -string is an exact string match to an existing taxon name, then only the exact match will be returned, (i.e. the -input 'Aster' will produce a single result 'Aster'). - -If name expansion identifies a valid genus name, the results will -not include species names from within that genus, but if a trailing space exists in the input following a valid -genus name, then species names will be returned. 
For example, both 'Garcin' and 'Garcinia' will match the genus name -'Garcinia' itself but will not match any species names within the genus, but 'Garcinia ' (note the trailing space) -will match all the species in the genus, and 'Garcinia m' with match all species names in Garcinia with a specific -epithet that starts with 'm'. - -**IMPORTANT NOTE: This service should not be used for general purpose TNRS queries.** It is optimized for and -(obviously) intended for use *only* with autocomplete boxes on web forms. For all name matching purposes other than -autocompleting name fields on forms, use the `match_names` service. -*/ - - - -json autocomplete_json(const RichTaxonomy& taxonomy, const Taxon* taxon) { - json match; - match -enum match_status {unmatched=0, - ambiguous_match=1, - unambiguous_match=2}; -["ott_id"] = taxon->get_ott_id(); - match["unique_name"] = get_taxon_unique_name(*taxon); - match["is_suppressed"] = taxonomy.node_is_suppressed_from_tnrs(taxon); - match["is_higher"] = taxon_is_higher(taxon); - return match; -} - -inline json autocomplete_json(const RichTaxonomy& taxonomy, const pair& p) { - return autocomplete_json(taxonomy, p.first); -} - -inline void add_hits(json& j, const RichTaxonomy& taxonomy, const vector taxa) { - for(auto taxon: taxa) { - j.push_back(autocomplete_json(taxonomy, taxon)); - } -} - -inline void add_hits(json& j, const RichTaxonomy& taxonomy, const vec_tax_str_pair_t taxa) { - for(auto [taxon, synonym]: taxa) { - j.push_back(autocomplete_json(taxonomy, taxon)); - } -} - -// Find all species in the genus that have the given prefix -vector prefix_search_species_in_genus(const Taxon* genus, - const string_view& species_prefix) { - vector match_species; - auto genus_name = genus->get_data().get_nonuniqname(); - for(auto species: iter_post_n_const(*genus)) { - if (not taxon_is_specific(species)) { - continue; - } - auto species_name = species->get_data().get_nonuniqname().substr(genus_name.size()+1); - if 
(lcase_match_prefix(species_name, species_prefix)) { - match_species.push_back(species); - } - } - return match_species; -} - - -// curl -X POST https://api.opentreeoflife.org/v3/tnrs/autocomplete_name -H "content-type:application/json" -d '{"name":"Endoxyla","context_name":"All life"}' -string tnrs_autocomplete_name_ws_method(const string& name, - const string& context_name, - bool include_suppressed, - const RichTaxonomy& taxonomy) { - json response; - // We need to escape the query string. - auto escaped_query = escape_query_string(name); - // This corresponds to a SingleNamePrefixQuery in taxomachine. - // * See org/opentree/taxonomy/plugins/tnrs_v3.java - // * See org/opentree/tnrs/queries/SingleNamePrefixQuery.java - // 0. Escape the query?? - // lower-case the name? - // 1. Determine context - auto context = determine_context(context_name); - auto context_root = taxonomy.included_taxon_from_id(context->ott_id); - if (auto query_genus_species = split_genus_species(name)) { - // 2. 
If we have a space, then assume the first part is a genus and match species names within the genus - // Search against species and synonyms - add_hits(response, taxonomy, exact_name_search_species(taxonomy, context_root, escaped_query, include_suppressed)); - add_hits(response, taxonomy, exact_synonym_search(taxonomy, context_root, escaped_query, include_suppressed)); - if (response.size()) { - return response.dump(1); - } - // no exact hit against the species index - auto genus_hits = exact_name_search_genus(taxonomy, context_root, escaped_query, include_suppressed); - if (not genus_hits.empty()) { // the first word was an exact match against the genus index - auto [query_genus,query_species] = *query_genus_species; - for(auto genus: genus_hits) { - add_hits(response, taxonomy, prefix_search_species_in_genus(genus, query_species)); - } - } - if (not response.empty()) { - return response.dump(1); - } - // no exact hit for first word against the genus index - // Hit query string against the higher taxon index... 
not sure if this is useful, since it has a space - add_hits(response, taxonomy, exact_name_search_higher(taxonomy, context_root, escaped_query, include_suppressed)); - if (not response.empty()) { - return response.dump(1); - } - // Prefix query against the synonyms and higher taxa - add_hits(response, taxonomy, prefix_name_search(taxonomy, context_root, escaped_query, include_suppressed)); - add_hits(response, taxonomy, prefix_synonym_search(taxonomy, context_root, escaped_query, include_suppressed)); - if (not response.empty()) { - return response.dump(1); - } - // fuzzy search on names and synonyms - } else { // does not contain a space at all - add_hits(response, taxonomy, exact_name_search_higher(taxonomy, context_root, escaped_query, include_suppressed)); - add_hits(response, taxonomy, exact_synonym_search_higher(taxonomy, context_root, escaped_query, include_suppressed)); - if (not response.empty()) { - return response.dump(1); - } - // Do a prefix query against the higher taxon index - add_hits(response, taxonomy, prefix_name_search_higher(taxonomy, context_root, escaped_query, include_suppressed)); - if (not response.empty()) { - return response.dump(1); - } - // Do a prefix query against the all taxa synonym index - add_hits(response, taxonomy, prefix_synonym_search(taxonomy, context_root, escaped_query, include_suppressed)); - if (not response.empty()) { - return response.dump(1); - } - // fuzzy search on higher names and synonyms - } - return response.dump(1); -} - -// curl -X POST https://api.opentreeoflife.org/v3/tnrs/contexts -// curl -X POST http://localhost:1984/v3/tnrs/contexts -std::string tnrs_contexts_ws_method() { - json response; - for(auto& context: all_contexts) { - response[context.group].push_back(context.name); - } - return response.dump(1); -} - -// curl -X POST https://api.opentreeoflife.org/v3/tnrs/infer_context -H "content-type:application/json" -d '{"names":["Pan","Homo","Mus","Bufo","Drosophila"]}' -// curl -X POST 
http://localhost:1984/v3/tnrs/infer_context -H "content-type:application/json" -d '{"names":["Pan","Homo","Mus","Bufo","Drosophila"]}' -string tnrs_infer_context_ws_method(const vector& names, const RichTaxonomy& taxonomy) { - auto results = infer_context_and_ambiguous_names(taxonomy, names); - auto& context = results.first; - auto& ambiguous_names = results.second; - json response; - response["context_name"] = results.first->name; - response["context_ott_id"] = results.first->ott_id; - response["ambiguous_names"] = ambiguous_names; - return response.dump(1); -} - -const Context* determine_context_for_names(const vector& names, - const optional& context_name, - const RichTaxonomy& taxonomy) { - if (not context_name) { - return infer_context_and_ambiguous_names(taxonomy, names).first; - } - return get_context_by_name(*context_name); -} - - - -//FIXME: how is "suppressed_names" different from "deprecated_taxa"? - -// $ curl -X POST https://api.opentreeoflife.org/v3/tnrs/match_names -H "content-type:application/json" -d '{"names":["Aster","Symphyotrichum","Barnadesia"]}' -// $ curl -X POST http://localhost:1984/v3/tnrs/match_names -H "content-type:application/json" -d '{"names":["Aster","Symphyotrichum","Barnadesia"]}' -std::string tnrs_match_names_ws_method(const vector& names, - const optional& context_name, - bool do_approximate_matching, - const optional>& /* ids */, // FIXME: Do we need to implement this? - bool include_suppressed, - const RichTaxonomy& taxonomy) { - // This corresponds to a MultiNameContextQuery in taxomachine. - // * See org/opentree/taxonomy/plugins/tnrs_v3.java - // * See org/opentree/tnrs/queries/MultiNameContextQuery.java: - // 1. Determine context - auto context = determine_context_for_names(names, context_name, taxonomy); - ContextSearcher searcher(taxonomy, *context); - // 2. Iterate over names and fill arrays `results`, `unmatched_names`, `matched_names`, and `unambiguous_names`. 
- json results = json::array(); - json unambiguous_names = json::array(); - json unmatched_names = json::array(); - json matched_names = json::array(); - for(auto& name: names) { - // Do the search - auto [result, status] = searcher.match_name(name, do_approximate_matching, include_suppressed); - // Store the result - results.push_back(result); - // Classify name as unmatched / matched / unambiguous - if (status == unmatched) { - unmatched_names.push_back(name); - } else { - matched_names.push_back(name); - if (status == unambiguous_match) { - unambiguous_names.push_back(name); - } - } - } - // 3. Construct JSON response. - json response; - response["governing_code"] = context->code.name; - response["context"] = context->name; - response["includes_approximate_matches"] = do_approximate_matching; - response["includes_deprecated_taxa"] = false; // ?? How is this different from suppressed_names? - response["includes_suppressed_names"] = include_suppressed; - response["taxonomy"] = tax_about_json(taxonomy); - response["unambiguous_names"] = unambiguous_names; - response["unmatched_names"] = unmatched_names; - response["matched_names"] = matched_names; - response["results"] = results; - return response.dump(1); -} - -} From 6536391d7cbc9da53b14cde3c189d01d892a4aee Mon Sep 17 00:00:00 2001 From: Benjamin Redelings Date: Tue, 21 Apr 2020 20:06:29 -0700 Subject: [PATCH 057/620] Further speed up exact_match queries. Just use the std::map instead of the ctrie. 
--- otc/ctrie/context_ctrie_db.cpp | 73 ++++++++++++++++++++++++----- otc/ctrie/context_ctrie_db.h | 16 +++++-- otc/ctrie/ctrie_search_impl.h | 5 +- otc/ctrie/search_data_models.h | 86 +++++++++++++++++++++------------- otc/taxonomy/taxonomy.cpp | 7 +-- otc/ws/tnrsws.cpp | 4 +- 6 files changed, 133 insertions(+), 58 deletions(-) diff --git a/otc/ctrie/context_ctrie_db.cpp b/otc/ctrie/context_ctrie_db.cpp index 618ddfa2..5326b844 100644 --- a/otc/ctrie/context_ctrie_db.cpp +++ b/otc/ctrie/context_ctrie_db.cpp @@ -7,6 +7,7 @@ using std::set; using std::string; using std::vector; +using std::optional; namespace otc { @@ -91,18 +92,13 @@ std::set ContextAwareCTrieBasedDB::fuz return sorted; } -std::set ContextAwareCTrieBasedDB::exact_query(const std::string & query_str) const { - std::set sorted; - if (context.name_matcher != nullptr) { - sorted = context.name_matcher->exact_query(query_str); - } - for (auto c :children) { - if (c->context.name_matcher) { - auto csorted = c->context.name_matcher->exact_query(query_str); - sorted.insert(std::begin(csorted), std::end(csorted)); - } - } - return sorted; +optional ContextAwareCTrieBasedDB::exact_query(const std::string & query_str) const +{ + auto nquery = normalize_query(query_str); + if (match_name_to_taxon.count(nquery)) + return nquery; + else + return {}; } using vec_fqr_w_t = std::vector; @@ -149,6 +145,59 @@ vec_fqr_w_t ContextAwareCTrieBasedDB::to_taxa(const set +ContextAwareCTrieBasedDB::to_taxa(const optional& n_query, + const RTRichTaxNode * context_root, + const RichTaxonomy & /*taxonomy*/, + bool include_suppressed) const +{ + if (not n_query) + { + LOG(DEBUG) << "no matches"; + return {}; + } + + vector results; + const auto & tax_data = context_root->get_data(); + const auto filter_trav_enter = tax_data.trav_enter; + const auto filter_trav_exit = tax_data.trav_exit; + + const auto & vec_taxon_and_syn_ptrs = match_name_to_taxon.at(*n_query); + LOG(DEBUG) << "exact_query(match=\"" << *n_query << ") -> vec size 
= " << vec_taxon_and_syn_ptrs.size(); + for (auto & [tax_ptr, rec_or_syn_ptr] : vec_taxon_and_syn_ptrs) + { + if (tax_ptr == nullptr) + { + LOG(DEBUG) << "matched suppressed and include_suppressed = " << include_suppressed; + if (include_suppressed) + { + const TaxonomyRecord * tr = (const TaxonomyRecord *) rec_or_syn_ptr; + results.push_back(TaxonResult(tr)); + } + } + else + { + const auto & res_tax_data = tax_ptr->get_data(); + LOG(DEBUG) << "matched taxon trav = (" << res_tax_data.trav_enter << ", " << res_tax_data.trav_exit << "). filter.trav = (" << filter_trav_enter << ", " << filter_trav_exit << ")"; + if (res_tax_data.trav_exit <= filter_trav_exit && res_tax_data.trav_enter >= filter_trav_enter) + { + const TaxonomicJuniorSynonym * syn_ptr = (const TaxonomicJuniorSynonym *) rec_or_syn_ptr; + if (syn_ptr == nullptr) + { + LOG(DEBUG) << "pushing non-syn"; + results.push_back(TaxonResult(tax_ptr)); + } + else + { + LOG(DEBUG) << "pushing synonym"; + results.push_back(TaxonResult(tax_ptr, syn_ptr)); + } + } + } + } + return results; +} + vec_fqr_w_t ContextAwareCTrieBasedDB::fuzzy_query_to_taxa(const std::string & query_str, const RTRichTaxNode * context_root, const RichTaxonomy & taxonomy, diff --git a/otc/ctrie/context_ctrie_db.h b/otc/ctrie/context_ctrie_db.h index c42dca12..ed5d969d 100644 --- a/otc/ctrie/context_ctrie_db.h +++ b/otc/ctrie/context_ctrie_db.h @@ -15,8 +15,12 @@ class ContextAwareCTrieBasedDB { public: ContextAwareCTrieBasedDB(const Context &, const RichTaxonomy &); ContextAwareCTrieBasedDB(const Context &, const RichTaxonomy &, const std::set & keys); - std::set fuzzy_query(const std::string & query_str) const; - std::set exact_query(const std::string & query_str) const; + + // What strings (for names or synonyms) match the normalized query string? + std::set fuzzy_query(const std::string & query_str) const; + + // Does anything match this normalized query string? 
+ std::optional exact_query(const std::string & query_str) const; std::vector fuzzy_query_to_taxa(const std::string & query_str, const RTRichTaxNode * context_root, @@ -28,7 +32,13 @@ class ContextAwareCTrieBasedDB { const RichTaxonomy & taxonomy, bool include_suppressed) const; - private: + std::vector to_taxa(const std::optional& result, + const RTRichTaxNode * context_root, + const RichTaxonomy & taxonomy, + bool include_suppressed) const; + + +private: const Context & context; std::vector children; CompressedTrieBasedDB trie; diff --git a/otc/ctrie/ctrie_search_impl.h b/otc/ctrie/ctrie_search_impl.h index 898796ac..8e5311b2 100644 --- a/otc/ctrie/ctrie_search_impl.h +++ b/otc/ctrie/ctrie_search_impl.h @@ -386,9 +386,8 @@ void CompressedTrie::extend_partial_match(const PartialMatch & pm, auto altqc = equivalent_letter[qc]; if (DB_FUZZY_MATCH) {trienode->log_state();} auto inds_on = trienode->get_letter_and_node_indices_for_on_bits(); - for (auto & x : inds_on) { - auto trie_char = x.first; - auto next_ind = x.second; + for (auto & [trie_char, next_ind] : inds_on) + { const T * next_nd = &(node_vec[next_ind]); if (trie_char == qc || trie_char == altqc) { if (DB_FUZZY_MATCH) {std::cerr << "matched " << to_char_str(letters[trie_char]) << " in pre adding extended pm.\n";} diff --git a/otc/ctrie/search_data_models.h b/otc/ctrie/search_data_models.h index ae944c0c..8286d6a0 100644 --- a/otc/ctrie/search_data_models.h +++ b/otc/ctrie/search_data_models.h @@ -40,48 +40,19 @@ class FuzzyQueryResult { }; -class FuzzyQueryResultWithTaxon { - const FuzzyQueryResult query_result; +class TaxonResult +{ const RTRichTaxNode * taxon = nullptr; const TaxonomyRecord * record = nullptr; bool matched_to_synonym; const std::string matched_name; - public: - FuzzyQueryResultWithTaxon(const FuzzyQueryResult & fqr, - const RTRichTaxNode * tax_arg) - :query_result(fqr), - taxon(tax_arg), - record(nullptr), - matched_to_synonym(false), - matched_name(tax_arg->get_data().get_nonuniqname()) 
{ - } - - FuzzyQueryResultWithTaxon(const FuzzyQueryResult & fqr, - const TaxonomyRecord * tax_rec) - :query_result(fqr), - taxon(nullptr), - record(tax_rec), - matched_to_synonym(false), - matched_name(tax_rec->name) { - } - FuzzyQueryResultWithTaxon(const FuzzyQueryResult & fqr, - const RTRichTaxNode * tax_arg, - const TaxonomicJuniorSynonym *syn) - :query_result(fqr), - taxon(tax_arg), - record(nullptr), - matched_to_synonym(true), - matched_name(syn->get_name()) { - } - - float get_score() const { - return query_result.score; - } +public: bool is_synonym() const { return matched_to_synonym; } + std::string get_matched_name() const { return matched_name; } @@ -93,7 +64,56 @@ class FuzzyQueryResultWithTaxon { const TaxonomyRecord * get_record() const { return record; } + + TaxonResult(const RTRichTaxNode * tax_arg) + :taxon(tax_arg), + matched_to_synonym(false), + matched_name(tax_arg->get_data().get_nonuniqname()) + { } + + TaxonResult(const TaxonomyRecord * tax_rec) + :record(tax_rec), + matched_to_synonym(false), + matched_name(tax_rec->name) + { } + + TaxonResult(const RTRichTaxNode * tax_arg, + const TaxonomicJuniorSynonym *syn) + :taxon(tax_arg), + matched_to_synonym(true), + matched_name(syn->get_name()) + { } +}; + + +class FuzzyQueryResultWithTaxon: public TaxonResult +{ + const FuzzyQueryResult query_result; +public: + FuzzyQueryResultWithTaxon(const FuzzyQueryResult & fqr, + const RTRichTaxNode * tax_arg) + :TaxonResult(tax_arg), + query_result(fqr) + { } + + FuzzyQueryResultWithTaxon(const FuzzyQueryResult & fqr, + const TaxonomyRecord * tax_rec) + :TaxonResult(tax_rec), + query_result(fqr) + { } + + FuzzyQueryResultWithTaxon(const FuzzyQueryResult & fqr, + const RTRichTaxNode * tax_arg, + const TaxonomicJuniorSynonym *syn) + :TaxonResult(tax_arg,syn), + query_result(fqr) + { } + + float get_score() const { + return query_result.score; + } }; + struct SortQueryResByNearness { bool operator() (const FuzzyQueryResult & lhs, const FuzzyQueryResult & rhs) 
const { diff --git a/otc/taxonomy/taxonomy.cpp b/otc/taxonomy/taxonomy.cpp index 74c2a235..3e151952 100644 --- a/otc/taxonomy/taxonomy.cpp +++ b/otc/taxonomy/taxonomy.cpp @@ -861,15 +861,12 @@ vector exact_name_search(const RichTaxonomy& taxonomy, const std::string& query_ref, std::function ok) { - // Maybe move this into the exact_query( ) call. - string query = normalize_query(query_ref); - auto ctp = taxonomy.get_fuzzy_matcher(); assert(ctp); - auto fuzzy_results = ctp->to_taxa(ctp->exact_query(query), context_root, taxonomy, true); + auto results = ctp->to_taxa(ctp->exact_query(query_ref), context_root, taxonomy, true); vector hits; - for(auto& result: fuzzy_results) + for(auto& result: results) { if (not result.is_synonym()) { diff --git a/otc/ws/tnrsws.cpp b/otc/ws/tnrsws.cpp index 75106f3e..fd667d9a 100644 --- a/otc/ws/tnrsws.cpp +++ b/otc/ws/tnrsws.cpp @@ -117,9 +117,9 @@ vec_tax_str_pair_t exact_synonym_search(const RichTaxonomy& taxonomy, assert(ctp); - auto fuzzy_results = ctp->to_taxa(ctp->exact_query(query), context_root, taxonomy, true); + auto results = ctp->to_taxa(ctp->exact_query(query), context_root, taxonomy, true); vec_tax_str_pair_t hits; - for(auto& result: fuzzy_results) + for(auto& result: results) { if (result.is_synonym()) { From 6339369de8851abd0dc0fb471ac51132ae629f25 Mon Sep 17 00:00:00 2001 From: Benjamin Redelings Date: Thu, 23 Apr 2020 22:08:13 -0700 Subject: [PATCH 058/620] Inline the walk over bits in a byte. 
--- otc/ctrie/ctrie_node.cpp | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/otc/ctrie/ctrie_node.cpp b/otc/ctrie/ctrie_node.cpp index df536823..b8469dc3 100644 --- a/otc/ctrie/ctrie_node.cpp +++ b/otc/ctrie/ctrie_node.cpp @@ -26,22 +26,34 @@ void fill_letter_and_node_indices_64(uint64_t masked_workspace, vec_ind_pair_t & ret, uint64_t & node_index) { + // Start with the most significant 8 bits int bitshift = 56; uint64_t blot_out_masker; - for (unsigned char i = 0U; i < 8; ++i) { + for (unsigned char i = 0U; i < 8; ++i) + { if (masked_workspace == 0) { return; } unsigned char curr_byte = masked_workspace >> bitshift; - if (curr_byte != 0) { + if (curr_byte != 0) + { blot_out_masker = curr_byte; blot_out_masker <<= bitshift; // 0 out the bits in masked_workspace that we're dealing with in this iteration. masked_workspace ^= blot_out_masker; - fill_letter_and_node_indices(curr_byte, offset, ret, node_index); + + // Start with the most significant bit of the byte. + unsigned char curr_bit = FIRST_BIT_OF_BYTE; + for (unsigned char j = 0; j < 8; ++j) + { + //std::cerr << "fill_letter_and_node_indices byte=" << std::hex << (unsigned int)curr_byte << " bit=" << std::hex << (unsigned int)curr_bit << '\n' << std::dec; + if (curr_byte & curr_bit) + ret.push_back(ind_pair_t{offset, node_index++}); + curr_bit >>= 1; + offset ++; + } + bitshift -= 8; } - bitshift -= 8; - offset += 8; } } From 28538a3740d94476407455624ff8177dd5d72bc9 Mon Sep 17 00:00:00 2001 From: Benjamin Redelings Date: Thu, 23 Apr 2020 22:18:06 -0700 Subject: [PATCH 059/620] Simplify walking bits in the word. 
--- otc/ctrie/ctrie_node.cpp | 55 +++++++--------------------------------- 1 file changed, 9 insertions(+), 46 deletions(-) diff --git a/otc/ctrie/ctrie_node.cpp b/otc/ctrie/ctrie_node.cpp index b8469dc3..3143fa97 100644 --- a/otc/ctrie/ctrie_node.cpp +++ b/otc/ctrie/ctrie_node.cpp @@ -2,58 +2,21 @@ namespace otc { -//@TODO use lookup table -void fill_letter_and_node_indices(unsigned char curr_byte, - int offset, - vec_ind_pair_t & ret, - uint64_t & node_index) -{ - if (curr_byte == 0) { - return; - } - unsigned char curr_bit = FIRST_BIT_OF_BYTE; - for (unsigned char i = 0; i < 8; ++i) { - //std::cerr << "fill_letter_and_node_indices byte=" << std::hex << (unsigned int)curr_byte << " bit=" << std::hex << (unsigned int)curr_bit << '\n' << std::dec; - if (curr_byte & curr_bit) { - ret.push_back(ind_pair_t{i + offset, node_index++}); - } - curr_bit >>= 1; - } -} - -void fill_letter_and_node_indices_64(uint64_t masked_workspace, +void fill_letter_and_node_indices_64(uint64_t letter_bits, int offset, vec_ind_pair_t & ret, uint64_t & node_index) { - // Start with the most significant 8 bits - int bitshift = 56; - uint64_t blot_out_masker; - for (unsigned char i = 0U; i < 8; ++i) + uint64_t curr_bit = (ONE_64<<63); + for (unsigned char i = 0; i < 64; i++) { - if (masked_workspace == 0) { - return; - } - unsigned char curr_byte = masked_workspace >> bitshift; - if (curr_byte != 0) - { - blot_out_masker = curr_byte; - blot_out_masker <<= bitshift; - // 0 out the bits in masked_workspace that we're dealing with in this iteration. - masked_workspace ^= blot_out_masker; + if (letter_bits == 0) return; + + if (letter_bits & curr_bit) + ret.push_back(ind_pair_t{offset, node_index++}); - // Start with the most significant bit of the byte. 
- unsigned char curr_bit = FIRST_BIT_OF_BYTE; - for (unsigned char j = 0; j < 8; ++j) - { - //std::cerr << "fill_letter_and_node_indices byte=" << std::hex << (unsigned int)curr_byte << " bit=" << std::hex << (unsigned int)curr_bit << '\n' << std::dec; - if (curr_byte & curr_bit) - ret.push_back(ind_pair_t{offset, node_index++}); - curr_bit >>= 1; - offset ++; - } - bitshift -= 8; - } + curr_bit >>= 1; + offset ++; } } From c722f70a278fc60f8ad5019397bf2df592cc4f54 Mon Sep 17 00:00:00 2001 From: Benjamin Redelings Date: Thu, 23 Apr 2020 22:25:00 -0700 Subject: [PATCH 060/620] Only visit set bits in word. --- otc/ctrie/ctrie_node.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/otc/ctrie/ctrie_node.cpp b/otc/ctrie/ctrie_node.cpp index 3143fa97..af8bf3c6 100644 --- a/otc/ctrie/ctrie_node.cpp +++ b/otc/ctrie/ctrie_node.cpp @@ -7,16 +7,16 @@ void fill_letter_and_node_indices_64(uint64_t letter_bits, vec_ind_pair_t & ret, uint64_t & node_index) { - uint64_t curr_bit = (ONE_64<<63); - for (unsigned char i = 0; i < 64; i++) + constexpr uint64_t left_bit = (ONE_64<<63); + while (letter_bits) { - if (letter_bits == 0) return; + int i = __builtin_clzl(letter_bits); + uint64_t curr_bit = left_bit >> i; if (letter_bits & curr_bit) - ret.push_back(ind_pair_t{offset, node_index++}); + ret.push_back(ind_pair_t{offset+i, node_index++}); - curr_bit >>= 1; - offset ++; + letter_bits &= (~curr_bit); } } From 8b9b6a3f6431afdadd6b17ec2046207266bbe40a Mon Sep 17 00:00:00 2001 From: Benjamin Redelings Date: Tue, 28 Apr 2020 14:32:39 -0700 Subject: [PATCH 061/620] Just use the string_view version. 
--- otc/ctrie/str_utils.h | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/otc/ctrie/str_utils.h b/otc/ctrie/str_utils.h index 4c777c66..520420f1 100644 --- a/otc/ctrie/str_utils.h +++ b/otc/ctrie/str_utils.h @@ -95,15 +95,5 @@ inline std::string normalize_query(const std::string & raw_query) { return query; } -inline std::string normalize_query(const std::string_view & raw_query) { - std::string query; - query.reserve(query.size()); - for (auto c: raw_query) { - query.push_back(std::tolower(c)); - } - return query; -} - - } // namespace otc #endif From 8a49f2a6a2f59ea7a09413bdb0efc816ef32fd55 Mon Sep 17 00:00:00 2001 From: Benjamin Redelings Date: Tue, 28 Apr 2020 14:33:28 -0700 Subject: [PATCH 062/620] Convert latin accents and a few other things when normalizing queries. --- otc/ctrie/str_utils.h | 78 ++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 73 insertions(+), 5 deletions(-) diff --git a/otc/ctrie/str_utils.h b/otc/ctrie/str_utils.h index 520420f1..033687c2 100644 --- a/otc/ctrie/str_utils.h +++ b/otc/ctrie/str_utils.h @@ -86,13 +86,81 @@ inline bool starts_with(const stored_str_t & full, const stored_str_t & pref) { return 0 == full.compare(0, pref.length(), pref); } -// could use this for \"e -> e as well -inline std::string normalize_query(const std::string & raw_query) { - std::string query = raw_query; - for (auto& c: query) { +// BDR: This does not handle all accented characters, but should get a fair number of them. 
+// See https://en.wikipedia.org/wiki/Latin-1_Supplement_(Unicode_block) +// See https://en.wikipedia.org/wiki/ISO/IEC_8859-1 +// See https://stackoverflow.com/questions/14094621/ +inline unsigned char normalize_latin_char(unsigned char ch) +{ + static const char* + // "ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ" + tr = "AAAAAAECEEEEIIIIDNOOOOOx0UUUUYPsaaaaaaeceeeeiiiiOnooooo/0uuuuypy"; + + if ( ch < 128) + return ch; + else if ( ch < 192) + { + if (ch == 171) return '"'; // << quote + if (ch == 187) return '"'; // >> quote + return ch; + } + + return tr[ ch-192 ]; +} + +inline char32_t normalize_greek_or_coptic_uchar(char32_t ch) +{ + // See https://unicode.org/charts/PDF/U0370.pdf + + if (ch == 945) return 'a'; // alpha + if (ch == 946) return 'b'; // beta + if (ch == 947) return 'g'; // gamma + if (ch == 948) return 'd'; // delta + if (ch == 949) return 'e'; // epsilon + if (ch == 950) return 'z'; // zeta + if (ch == 951) return 'e'; // eta + if (ch == 952) return 't'; // theta + if (ch == 953) return 'i'; // iota + if (ch == 954) return 'k'; // kappa + if (ch == 955) return 'l'; // lambda + if (ch == 956) return 'm'; // mu + + return ch; +} + +// https://www.codetable.net/decimal/8217 +inline char32_t normalize_uchar(char32_t ch) +{ + if (ch < 256) + return normalize_latin_char(ch); + + if (ch >= 880 and ch < 1024) + return normalize_greek_or_coptic_uchar(ch); + + if (ch == 352) return 'S'; // Š Latin capital S with Caron + if (ch == 382) return 'z'; // ž + if (ch == 353) return 's'; // ſ + + if (ch == 1086) return 'o'; + if (ch == 1089) return 'c'; + + if (ch == 8201) return ' '; // thin space + if (ch == 8220) return '"'; // left quote + if (ch == 8221) return '"'; // right quote + + return ch; +} + +// Currently we don't allow changing the number of chars, so we can't handle +// ligatures like fl or ae. 
+inline std::string normalize_query(const std::string_view & raw_query) +{ + auto uquery = to_u32string(raw_query); + for (auto& c: uquery) { + c = normalize_uchar(c); c = std::tolower(c); } - return query; + return to_char_str(uquery); } } // namespace otc From ef34dedf8e46f9aae04e93eb29843d9344eb1c32 Mon Sep 17 00:00:00 2001 From: Benjamin Redelings Date: Tue, 28 Apr 2020 14:52:19 -0700 Subject: [PATCH 063/620] Remove capitals from nonfunky char list, and add some additional things. --- otc/ctrie/ctrie_db.cpp | 46 +++++++++++++++++++++++++++++++++++------- 1 file changed, 39 insertions(+), 7 deletions(-) diff --git a/otc/ctrie/ctrie_db.cpp b/otc/ctrie/ctrie_db.cpp index dd982b04..4c8fb338 100644 --- a/otc/ctrie/ctrie_db.cpp +++ b/otc/ctrie/ctrie_db.cpp @@ -42,12 +42,13 @@ std::set CompressedTrieBasedDB::exact_ return sorted; } - void CompressedTrieBasedDB::initialize(const std::set & keys) { ctrie_init_set_t for_wide; ctrie_init_set_t for_thin; - // could fit a couple more non-funky, if we want <- 76, I think... - auto nonfunky = " \'()-.0123456789:,_aAbBcCdDeEfFgGhHiIjJkKlLmMnNoOpPqQrRsStTuUvVwWxXyYzZ/?"; + + // We don't need capital letters here, since we lcase queries when we normalize them. + auto nonfunky = " \"\'()[]+-%.&0123456789:<=>,^_abcdefghijklmnopqrstuvwxyz/?#*!"; + std::ostream & out = std::cout; std::map letter_counts; std::set thin_letter_set; @@ -76,14 +77,45 @@ void CompressedTrieBasedDB::initialize(const std::set & keys) { } //std::cerr << glob_conv8.to_bytes(widestr) << '\n'; } + std::cerr< by_count; - for (auto lcp : letter_counts) { - wide_letters.push_back(lcp.first); - by_count[lcp.second].push_back(lcp.first); + std::map> by_count; + + + std::cerr<<"thin letters: "< Date: Wed, 29 Apr 2020 08:00:03 -0700 Subject: [PATCH 064/620] Use CTrie2_t for the wide_trie as well. 
--- otc/ctrie/ctrie_db.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/otc/ctrie/ctrie_db.h b/otc/ctrie/ctrie_db.h index 1aeef7ce..4b2b039f 100644 --- a/otc/ctrie/ctrie_db.h +++ b/otc/ctrie/ctrie_db.h @@ -8,14 +8,13 @@ namespace otc { using CTrie3_t = CompressedTrie; using CTrie2_t = CompressedTrie; - class CompressedTrieBasedDB { public: void initialize(const std::set & keys); std::set fuzzy_query(const std::string & query_str) const; std::set exact_query(const std::string & query_str) const; private: - CTrie3_t wide_trie; + CTrie2_t wide_trie; CTrie2_t thin_trie; }; From a2f7539178a3dc9c727905b14d9a1f99118ef9ad Mon Sep 17 00:00:00 2001 From: Benjamin Redelings Date: Wed, 29 Apr 2020 08:24:35 -0700 Subject: [PATCH 065/620] Add function get_letter_bits() to each node type. --- otc/ctrie/ctrie_node.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/otc/ctrie/ctrie_node.h b/otc/ctrie/ctrie_node.h index 9723f552..19f0045e 100644 --- a/otc/ctrie/ctrie_node.h +++ b/otc/ctrie/ctrie_node.h @@ -77,6 +77,12 @@ class CTrie3NodeData { return bot; } + uint64_t get_letter_bits() const { + assert(mid<<2 == 0); + assert((bot & BOTTOM_LETTER_MASK) == 0); + return (top<<2)|(mid>>62); + } + void db_write_state(std::ostream &out) const { out << "top=" << std::bitset<64>{top} << " mid=" << std::bitset<64>{mid} << " bot=" << std::bitset<64>{bot} ; } @@ -103,6 +109,11 @@ class CTrie2NodeData { return bot; } + uint64_t get_letter_bits() const { + assert((bot & BOTTOM_LETTER_MASK)<<2 == 0); + return (top<<2)|(bot>>62); + } + void db_write_state(std::ostream &out) const { out << "top=" << std::bitset<64>{top} << " bot=" << std::bitset<64>{bot} ; } @@ -129,6 +140,10 @@ class CTrie1NodeData { return top; } + uint64_t get_letter_bits() const { + return ((top&TOP_LETTER_MASK)<<2); + } + void db_write_state(std::ostream &out) const { out << "top=" << std::bitset<64>{top} ; } From 510ddc6a4702a80b94dc021083797050efd28fc2 Mon Sep 17 00:00:00 2001 From: 
Benjamin Redelings Date: Wed, 29 Apr 2020 09:00:42 -0700 Subject: [PATCH 066/620] Add child child iterators and range class. --- otc/ctrie/ctrie_node.cpp | 4 +++ otc/ctrie/ctrie_node.h | 54 ++++++++++++++++++++++++++++++++++++++-- 2 files changed, 56 insertions(+), 2 deletions(-) diff --git a/otc/ctrie/ctrie_node.cpp b/otc/ctrie/ctrie_node.cpp index af8bf3c6..a86117b0 100644 --- a/otc/ctrie/ctrie_node.cpp +++ b/otc/ctrie/ctrie_node.cpp @@ -77,6 +77,10 @@ vec_ind_pair_t CTrieNode::get_letter_and_node_indices_for_on_bit fill_letter_and_node_indices_64(masked, LETTER_INDEX_OF_FIRST_BIT_IN_FIRST_WORD, ret, node_index); masked = data.bot & BOTTOM_LETTER_MASK; fill_letter_and_node_indices_64(masked, LETTER_INDEX_OF_FIRST_BIT_IN_SECOND_WORD, ret, node_index); + + vec_ind_pair_t ret2; + + return ret; } diff --git a/otc/ctrie/ctrie_node.h b/otc/ctrie/ctrie_node.h index 19f0045e..8a093b70 100644 --- a/otc/ctrie/ctrie_node.h +++ b/otc/ctrie/ctrie_node.h @@ -11,6 +11,7 @@ namespace otc { using stored_index_t = unsigned char; +using ind_pair_t = std::pair; // The CTrieXNode classes are the elements stored in a vector by the CTrie class. 
// To allow for random access into the vector, each node is the same size for @@ -50,6 +51,49 @@ constexpr int LETTER_INDEX_OF_FIRST_BIT_IN_FIRST_WORD = -2; // 2 bits for flags constexpr unsigned int LETTER_INDEX_OF_FIRST_BIT_IN_SECOND_WORD = 64 + LETTER_INDEX_OF_FIRST_BIT_IN_FIRST_WORD; constexpr unsigned int LETTER_INDEX_OF_FIRST_BIT_IN_THIRD_WORD = 64 + LETTER_INDEX_OF_FIRST_BIT_IN_SECOND_WORD; +class ctrie_child_sentinel +{ +}; + +class ctrie_child_iterator +{ + uint64_t letter_bits; + uint64_t index_; + + void mask_cur_letter() + { + assert(not done()); + uint64_t curr_bit = (ONE_64<<63)>>letter(); + letter_bits &= (~curr_bit); + } + + bool done() const {return not letter_bits;} + +public: + uint64_t index() const {return index_;} + stored_index_t letter() const {return __builtin_clzl(letter_bits);} + + ctrie_child_iterator operator++() {index_++; mask_cur_letter(); return (*this);} + ctrie_child_iterator operator++(int) {auto tmp = *this; (*this)++; return tmp;} + + ind_pair_t operator*() const { return ind_pair_t(letter(),index());} + + bool operator==(const ctrie_child_iterator& i2) {return index() == i2.index() and letter() == i2.letter();} + bool operator!=(const ctrie_child_iterator& i2) {return not (*this == i2);} + bool operator==(const ctrie_child_sentinel&) {return done();} + bool operator!=(const ctrie_child_sentinel&) {return not done();} + + ctrie_child_iterator(uint64_t ul, uint64_t ui): letter_bits(ul),index_(ui) {} +}; + +struct ctrie_children +{ + ctrie_child_iterator begin_; + ctrie_child_iterator begin() const {return begin_;} + ctrie_child_sentinel end() const {return {};} + ctrie_children(uint64_t ul, uint64_t ui):begin_(ul,ui) {} +}; + class CTrie3NodeData { public: static constexpr unsigned int END_LETTER_INDEX = LETTER_INDEX_OF_FIRST_BIT_IN_THIRD_WORD + 64 - NUM_INDEX_BITS; @@ -149,7 +193,7 @@ class CTrie1NodeData { } }; -using ind_pair_t = std::pair; + using vec_ind_pair_t = std::vector; template @@ -165,7 +209,13 @@ class CTrieNode { 
uint64_t get_index() const { return INDEX_MASK & data.get_index_word_const(); } - + + uint64_t get_letter_bits() const { + return data.get_letter_bits(); + } + + ctrie_children children() const {return {get_letter_bits(),get_index()};} + void log_state() const { std::cerr << " CTrieNode( "; data.db_write_state(std::cerr); std::cerr << ")\n"; } From 07557312c2d2e629ec6fe71a7c3c98012c4dfb19 Mon Sep 17 00:00:00 2001 From: Benjamin Redelings Date: Wed, 29 Apr 2020 09:18:27 -0700 Subject: [PATCH 067/620] Remove CTrie3 and CTrie1. --- otc/ctrie/ctrie_db.h | 1 - otc/ctrie/ctrie_node.cpp | 54 -------------------------- otc/ctrie/ctrie_node.h | 83 ---------------------------------------- tools/tnrs-cli.cpp | 2 +- 4 files changed, 1 insertion(+), 139 deletions(-) diff --git a/otc/ctrie/ctrie_db.h b/otc/ctrie/ctrie_db.h index 4b2b039f..d913d4cb 100644 --- a/otc/ctrie/ctrie_db.h +++ b/otc/ctrie/ctrie_db.h @@ -5,7 +5,6 @@ #include "otc/ctrie/ctrie.h" namespace otc { -using CTrie3_t = CompressedTrie; using CTrie2_t = CompressedTrie; class CompressedTrieBasedDB { diff --git a/otc/ctrie/ctrie_node.cpp b/otc/ctrie/ctrie_node.cpp index a86117b0..b13f8cea 100644 --- a/otc/ctrie/ctrie_node.cpp +++ b/otc/ctrie/ctrie_node.cpp @@ -20,38 +20,6 @@ void fill_letter_and_node_indices_64(uint64_t letter_bits, } } -template <> -void CTrieNode::flag_letter(unsigned int i) { - uint64_t bit = ONE_64; - //log_state(); - if (i < LETTER_INDEX_OF_FIRST_BIT_IN_SECOND_WORD) { - const uint64_t shifted = (bit << (LETTER_INDEX_OF_FIRST_BIT_IN_SECOND_WORD - 1 - i)); - data.top |= shifted; - } else if (i < LETTER_INDEX_OF_FIRST_BIT_IN_THIRD_WORD) { - bit <<= (LETTER_INDEX_OF_FIRST_BIT_IN_SECOND_WORD - 1 - i); - data.mid |= bit; - } else { - assert(i < DATA_TYPE::END_LETTER_INDEX); - bit <<= (LETTER_INDEX_OF_FIRST_BIT_IN_THIRD_WORD -1 - i); - data.bot |= bit; - } -} - -template <> -vec_ind_pair_t CTrieNode::get_letter_and_node_indices_for_on_bits() const { - assert(!is_terminal()); - vec_ind_pair_t ret; - 
ret.reserve(DATA_TYPE::END_LETTER_INDEX); - uint64_t node_index = get_index(); - uint64_t masked = data.top & TOP_LETTER_MASK; - fill_letter_and_node_indices_64(masked, LETTER_INDEX_OF_FIRST_BIT_IN_FIRST_WORD, ret, node_index); - masked = data.mid; - fill_letter_and_node_indices_64(masked, LETTER_INDEX_OF_FIRST_BIT_IN_SECOND_WORD, ret, node_index); - masked = data.bot & BOTTOM_LETTER_MASK; - fill_letter_and_node_indices_64(masked, LETTER_INDEX_OF_FIRST_BIT_IN_THIRD_WORD, ret, node_index); - return ret; -} - template <> void CTrieNode::flag_letter(unsigned int i) { uint64_t bit = ONE_64; @@ -84,26 +52,4 @@ vec_ind_pair_t CTrieNode::get_letter_and_node_indices_for_on_bit return ret; } - -template <> -void CTrieNode::flag_letter(unsigned int i) { - uint64_t bit = ONE_64; - assert(i < DATA_TYPE::END_LETTER_INDEX); - bit <<= (LETTER_INDEX_OF_FIRST_BIT_IN_SECOND_WORD - 1 - i); - data.top |= bit; -} - -template <> -vec_ind_pair_t CTrieNode::get_letter_and_node_indices_for_on_bits() const { - //std::cerr << "get_letter_and_node_indices_for_on_bits top=" - // << std::hex << top << " bot=" << std::hex << bot << std::dec << '\n'; - assert(!is_terminal()); - vec_ind_pair_t ret; - ret.reserve(DATA_TYPE::END_LETTER_INDEX); - u_int64_t node_index = get_index(); - uint64_t masked = data.top & TOP_LETTER_MASK; - fill_letter_and_node_indices_64(masked, LETTER_INDEX_OF_FIRST_BIT_IN_FIRST_WORD, ret, node_index); - return ret; -} - } diff --git a/otc/ctrie/ctrie_node.h b/otc/ctrie/ctrie_node.h index 8a093b70..1538bc91 100644 --- a/otc/ctrie/ctrie_node.h +++ b/otc/ctrie/ctrie_node.h @@ -94,44 +94,6 @@ struct ctrie_children ctrie_children(uint64_t ul, uint64_t ui):begin_(ul,ui) {} }; -class CTrie3NodeData { -public: - static constexpr unsigned int END_LETTER_INDEX = LETTER_INDEX_OF_FIRST_BIT_IN_THIRD_WORD + 64 - NUM_INDEX_BITS; - - uint64_t top, mid, bot; - - CTrie3NodeData() :top{ZERO_64}, mid{ZERO_64}, bot{ZERO_64} { - } - uint64_t & get_flag_word() { - return top; - } - const uint64_t 
& get_flag_word_const() const { - return top; - } - uint64_t & get_middle_word() { - return mid; - } - const uint64_t & get_middle_word_const() const { - return mid; - } - uint64_t & get_index_word() { - return bot; - } - const uint64_t & get_index_word_const() const { - return bot; - } - - uint64_t get_letter_bits() const { - assert(mid<<2 == 0); - assert((bot & BOTTOM_LETTER_MASK) == 0); - return (top<<2)|(mid>>62); - } - - void db_write_state(std::ostream &out) const { - out << "top=" << std::bitset<64>{top} << " mid=" << std::bitset<64>{mid} << " bot=" << std::bitset<64>{bot} ; - } -}; - class CTrie2NodeData { public: static constexpr unsigned int END_LETTER_INDEX = LETTER_INDEX_OF_FIRST_BIT_IN_THIRD_WORD - NUM_INDEX_BITS; @@ -163,37 +125,6 @@ class CTrie2NodeData { } }; -class CTrie1NodeData { -public: - static constexpr unsigned int END_LETTER_INDEX = LETTER_INDEX_OF_FIRST_BIT_IN_SECOND_WORD - NUM_INDEX_BITS; - - uint64_t top; - - CTrie1NodeData() :top{ZERO_64} { - } - uint64_t & get_flag_word() { - return top; - } - const uint64_t & get_flag_word_const() const { - return top; - } - uint64_t & get_index_word() { - return top; - } - const uint64_t & get_index_word_const() const { - return top; - } - - uint64_t get_letter_bits() const { - return ((top&TOP_LETTER_MASK)<<2); - } - - void db_write_state(std::ostream &out) const { - out << "top=" << std::bitset<64>{top} ; - } -}; - - using vec_ind_pair_t = std::vector; template @@ -263,27 +194,13 @@ class CTrieNode { }; -template <> -void CTrieNode::flag_letter(unsigned int i); - -template <> -vec_ind_pair_t CTrieNode::get_letter_and_node_indices_for_on_bits() const; - template <> void CTrieNode::flag_letter(unsigned int i); template <> vec_ind_pair_t CTrieNode::get_letter_and_node_indices_for_on_bits() const; -template <> -void CTrieNode::flag_letter(unsigned int i); - -template <> -vec_ind_pair_t CTrieNode::get_letter_and_node_indices_for_on_bits() const; - -using CTrie3Node = CTrieNode; using CTrie2Node = 
CTrieNode; -using CTrie1Node = CTrieNode; } // namespace otc #endif diff --git a/tools/tnrs-cli.cpp b/tools/tnrs-cli.cpp index 41b71fa7..11bfc9bc 100644 --- a/tools/tnrs-cli.cpp +++ b/tools/tnrs-cli.cpp @@ -140,7 +140,7 @@ void analyze_case_sensitivity(const RTRichTaxTreeData & rt_data, } void interactive_tests() { - const CTrie3_t testtrie{"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"}; + const CTrie2_t testtrie{"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"}; const std::string p1 = "Enter a query:\n"; const std::string p2 = "Enter a trie:\n"; const std::string p3 = "max distance:\n"; From 7b8454caa55606094918057c12aaba3fcdbcc013 Mon Sep 17 00:00:00 2001 From: Benjamin Redelings Date: Wed, 29 Apr 2020 10:12:22 -0700 Subject: [PATCH 068/620] Use children iterator. --- otc/ctrie/ctrie.h | 16 +++++++--------- otc/ctrie/ctrie_search_impl.h | 4 ++-- 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/otc/ctrie/ctrie.h b/otc/ctrie/ctrie.h index 23d4955a..a3c8587a 100644 --- a/otc/ctrie/ctrie.h +++ b/otc/ctrie/ctrie.h @@ -429,12 +429,9 @@ void CompressedTrie::init(const ctrie_init_set_t & keys, const stored_str_t & } if (DB_FUZZY_MATCH) {node_vec[0].log_state();} - auto inds_on = node_vec[0].get_letter_and_node_indices_for_on_bits(); // std::cerr << "ROOT:"; node_vec[0].log_state(); - for (auto & x : inds_on) { - auto trie_char = x.first; - auto next_ind = x.second; + for (auto [trie_char, next_ind] : node_vec[0].children()) { const T * next_nd = &(node_vec[next_ind]); // std::cerr << "ROOT child for \"" << to_char_str(letters[trie_char]) << "\" "; next_nd->log_state(); } @@ -483,10 +480,9 @@ void CompressedTrie::db_write_node(std::ostream & out, const T & nd) const { //out << " letterbits = "; //nd.db_write_state(out); //out << "\n"; - auto vipt = nd.get_letter_and_node_indices_for_on_bits(); - for (auto & ind_pair : vipt) { - out << " " << to_char_str(letters[ind_pair.first]); - out << " => node[" << std::dec << ind_pair.second << "]\n"; + 
for (auto [trie_char,index] : nd.children()) { + out << " " << to_char_str(letters[trie_char]); + out << " => node[" << std::dec << index << "]\n"; } } } @@ -520,7 +516,9 @@ void CompressedTrie::db_write_words(std::ostream & out) const { auto full = curr_nd_pref.second + suff; out << i++ << " = " << to_char_str(full) << '\n'; } else { - auto vipt = nd_ptr->get_letter_and_node_indices_for_on_bits(); + vec_ind_pair_t vipt; + for(auto x : nd_ptr->children()) + vipt.push_back(x); for (auto vipirit = vipt.rbegin(); vipirit != vipt.rend(); vipirit++) { const T * nn = &(node_vec[vipirit->second]); stored_str_t np = curr_nd_pref.second + letters[vipirit->first]; diff --git a/otc/ctrie/ctrie_search_impl.h b/otc/ctrie/ctrie_search_impl.h index 8e5311b2..05f7f519 100644 --- a/otc/ctrie/ctrie_search_impl.h +++ b/otc/ctrie/ctrie_search_impl.h @@ -385,8 +385,8 @@ void CompressedTrie::extend_partial_match(const PartialMatch & pm, auto qc = pm.query_char(); auto altqc = equivalent_letter[qc]; if (DB_FUZZY_MATCH) {trienode->log_state();} - auto inds_on = trienode->get_letter_and_node_indices_for_on_bits(); - for (auto & [trie_char, next_ind] : inds_on) + + for (auto [trie_char, next_ind] : trienode->children()) { const T * next_nd = &(node_vec[next_ind]); if (trie_char == qc || trie_char == altqc) { From eb09fb136a183148310d1d698ceff0accc6f293a Mon Sep 17 00:00:00 2001 From: Benjamin Redelings Date: Wed, 29 Apr 2020 10:14:29 -0700 Subject: [PATCH 069/620] Remove get_letter_and_node_indices_for_on_bits(). 
--- otc/ctrie/ctrie_node.cpp | 37 ------------------------------------- otc/ctrie/ctrie_node.h | 4 ---- 2 files changed, 41 deletions(-) diff --git a/otc/ctrie/ctrie_node.cpp b/otc/ctrie/ctrie_node.cpp index b13f8cea..0da50c97 100644 --- a/otc/ctrie/ctrie_node.cpp +++ b/otc/ctrie/ctrie_node.cpp @@ -2,24 +2,6 @@ namespace otc { -void fill_letter_and_node_indices_64(uint64_t letter_bits, - int offset, - vec_ind_pair_t & ret, - uint64_t & node_index) -{ - constexpr uint64_t left_bit = (ONE_64<<63); - while (letter_bits) - { - int i = __builtin_clzl(letter_bits); - - uint64_t curr_bit = left_bit >> i; - if (letter_bits & curr_bit) - ret.push_back(ind_pair_t{offset+i, node_index++}); - - letter_bits &= (~curr_bit); - } -} - template <> void CTrieNode::flag_letter(unsigned int i) { uint64_t bit = ONE_64; @@ -33,23 +15,4 @@ void CTrieNode::flag_letter(unsigned int i) { } } -template <> -vec_ind_pair_t CTrieNode::get_letter_and_node_indices_for_on_bits() const { - //std::cerr << "get_letter_and_node_indices_for_on_bits top=" - // << std::hex << top << " bot=" << std::hex << bot << std::dec << '\n'; - assert(!is_terminal()); - vec_ind_pair_t ret; - ret.reserve(DATA_TYPE::END_LETTER_INDEX); - u_int64_t node_index = get_index(); - uint64_t masked = data.top & TOP_LETTER_MASK; - fill_letter_and_node_indices_64(masked, LETTER_INDEX_OF_FIRST_BIT_IN_FIRST_WORD, ret, node_index); - masked = data.bot & BOTTOM_LETTER_MASK; - fill_letter_and_node_indices_64(masked, LETTER_INDEX_OF_FIRST_BIT_IN_SECOND_WORD, ret, node_index); - - vec_ind_pair_t ret2; - - - return ret; -} - } diff --git a/otc/ctrie/ctrie_node.h b/otc/ctrie/ctrie_node.h index 1538bc91..5772f37c 100644 --- a/otc/ctrie/ctrie_node.h +++ b/otc/ctrie/ctrie_node.h @@ -190,16 +190,12 @@ class CTrieNode { void flag_letter(unsigned int i); - vec_ind_pair_t get_letter_and_node_indices_for_on_bits() const; }; template <> void CTrieNode::flag_letter(unsigned int i); -template <> -vec_ind_pair_t 
CTrieNode::get_letter_and_node_indices_for_on_bits() const; - using CTrie2Node = CTrieNode; } // namespace otc From 2f65c92685091b713e32767820c6b099174c7cdc Mon Sep 17 00:00:00 2001 From: Benjamin Redelings Date: Wed, 29 Apr 2020 10:23:16 -0700 Subject: [PATCH 070/620] Make CTrie node a non-template. --- otc/ctrie/ctrie.h | 2 +- otc/ctrie/ctrie_node.cpp | 10 ++++----- otc/ctrie/ctrie_node.h | 47 ++++++++++++++-------------------------- 3 files changed, 22 insertions(+), 37 deletions(-) diff --git a/otc/ctrie/ctrie.h b/otc/ctrie/ctrie.h index a3c8587a..484a064c 100644 --- a/otc/ctrie/ctrie.h +++ b/otc/ctrie/ctrie.h @@ -347,7 +347,7 @@ void CompressedTrie::init(const ctrie_init_set_t & keys, const stored_str_t & std::set let_set{letter_var.begin(), letter_var.end()}; letters = stored_str_t{let_set.begin(), let_set.end()}; stored_str_t rev_letters = stored_str_t{letters.rbegin(), letters.rend()}; - if (letters.length() >= T::DATA_TYPE::END_LETTER_INDEX) { + if (letters.length() >= T::END_LETTER_INDEX) { throw OTCError() << "# of letters (" << letters.length() << ") exceeds size of CompressedTrie node type"; } if (letters.length() > 253) { diff --git a/otc/ctrie/ctrie_node.cpp b/otc/ctrie/ctrie_node.cpp index 0da50c97..ede25be3 100644 --- a/otc/ctrie/ctrie_node.cpp +++ b/otc/ctrie/ctrie_node.cpp @@ -2,16 +2,16 @@ namespace otc { -template <> -void CTrieNode::flag_letter(unsigned int i) { +void CTrieNode::flag_letter(unsigned int i) +{ uint64_t bit = ONE_64; if (i < LETTER_INDEX_OF_FIRST_BIT_IN_SECOND_WORD) { bit <<= (LETTER_INDEX_OF_FIRST_BIT_IN_SECOND_WORD - 1 - i); - data.top |= bit; + top |= bit; } else { - assert(i < DATA_TYPE::END_LETTER_INDEX); + assert(i < END_LETTER_INDEX); bit <<= (LETTER_INDEX_OF_FIRST_BIT_IN_SECOND_WORD - 1 - i); - data.bot |= bit; + bot |= bit; } } diff --git a/otc/ctrie/ctrie_node.h b/otc/ctrie/ctrie_node.h index 5772f37c..8e379e98 100644 --- a/otc/ctrie/ctrie_node.h +++ b/otc/ctrie/ctrie_node.h @@ -94,14 +94,13 @@ struct ctrie_children 
ctrie_children(uint64_t ul, uint64_t ui):begin_(ul,ui) {} }; -class CTrie2NodeData { -public: - static constexpr unsigned int END_LETTER_INDEX = LETTER_INDEX_OF_FIRST_BIT_IN_THIRD_WORD - NUM_INDEX_BITS; +using vec_ind_pair_t = std::vector; - uint64_t top, bot; +class CTrieNode { +private: + uint64_t top = 0; + uint64_t bot = 0; - CTrie2NodeData() :top{ZERO_64}, bot{ZERO_64} { - } uint64_t & get_flag_word() { return top; } @@ -115,56 +114,45 @@ class CTrie2NodeData { return bot; } - uint64_t get_letter_bits() const { - assert((bot & BOTTOM_LETTER_MASK)<<2 == 0); - return (top<<2)|(bot>>62); - } - void db_write_state(std::ostream &out) const { out << "top=" << std::bitset<64>{top} << " bot=" << std::bitset<64>{bot} ; } -}; -using vec_ind_pair_t = std::vector; - -template -class CTrieNode { -private: - T data; public: - using DATA_TYPE = T; + static constexpr unsigned int END_LETTER_INDEX = LETTER_INDEX_OF_FIRST_BIT_IN_THIRD_WORD - NUM_INDEX_BITS; CTrieNode() { } uint64_t get_index() const { - return INDEX_MASK & data.get_index_word_const(); + return INDEX_MASK & get_index_word_const(); } uint64_t get_letter_bits() const { - return data.get_letter_bits(); + assert((bot & BOTTOM_LETTER_MASK)<<2 == 0); + return (top<<2)|(bot>>62); } ctrie_children children() const {return {get_letter_bits(),get_index()};} void log_state() const { - std::cerr << " CTrieNode( "; data.db_write_state(std::cerr); std::cerr << ")\n"; + std::cerr << " CTrieNode( "; db_write_state(std::cerr); std::cerr << ")\n"; } void flag_as_key_terminating() { - data.get_flag_word() |= SECOND_HIGHEST_BIT; + get_flag_word() |= SECOND_HIGHEST_BIT; } bool is_key_terminating() const { - return data.get_flag_word_const() & SECOND_HIGHEST_BIT; + return get_flag_word_const() & SECOND_HIGHEST_BIT; } void flag_as_terminal() { - data.get_flag_word() |= HIGHEST_BIT; + get_flag_word() |= HIGHEST_BIT; } bool is_terminal() const { - return data.get_flag_word_const() & HIGHEST_BIT; + return get_flag_word_const() & 
HIGHEST_BIT; } void set_index(std::size_t index) { @@ -173,7 +161,7 @@ class CTrieNode { if (ind != (uint64_t)index) { throw OTCError() << "not enough index field to hold pos = " << index; } - auto & word = data.get_index_word(); + auto & word = get_index_word(); word &= COMP_INDEX_MASK; // sets to 0 any bits for the index word |= ind; } @@ -193,10 +181,7 @@ class CTrieNode { }; -template <> -void CTrieNode::flag_letter(unsigned int i); - -using CTrie2Node = CTrieNode; +using CTrie2Node = CTrieNode; } // namespace otc #endif From 72fd898398b5bf476e0b300a2d301b4e795f57dd Mon Sep 17 00:00:00 2001 From: Benjamin Redelings Date: Wed, 29 Apr 2020 10:30:07 -0700 Subject: [PATCH 071/620] Add some comments. --- otc/ctrie/ctrie_node.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/otc/ctrie/ctrie_node.h b/otc/ctrie/ctrie_node.h index 8e379e98..4a49a4f6 100644 --- a/otc/ctrie/ctrie_node.h +++ b/otc/ctrie/ctrie_node.h @@ -51,10 +51,15 @@ constexpr int LETTER_INDEX_OF_FIRST_BIT_IN_FIRST_WORD = -2; // 2 bits for flags constexpr unsigned int LETTER_INDEX_OF_FIRST_BIT_IN_SECOND_WORD = 64 + LETTER_INDEX_OF_FIRST_BIT_IN_FIRST_WORD; constexpr unsigned int LETTER_INDEX_OF_FIRST_BIT_IN_THIRD_WORD = 64 + LETTER_INDEX_OF_FIRST_BIT_IN_SECOND_WORD; +// This class is the type of children.end() +// It exists only to be compared against iterators. class ctrie_child_sentinel { }; +// This iterator walks forward through the children of a given node. +// The current letter is indicated by the highest non-zero bit in `letter_bits`. +// Bits are numbered in reverse order from normal, with the highest bit (bit 63) indicating letter 0; class ctrie_child_iterator { uint64_t letter_bits; @@ -67,6 +72,7 @@ class ctrie_child_iterator letter_bits &= (~curr_bit); } + // If there are no 1 bits, then we have visited all the children. 
bool done() const {return not letter_bits;} public: @@ -86,6 +92,8 @@ class ctrie_child_iterator ctrie_child_iterator(uint64_t ul, uint64_t ui): letter_bits(ul),index_(ui) {} }; +// This is the range object with begin() and end() methods for use in +// range-for loops: for(auto [letter,index] : nd->children()) struct ctrie_children { ctrie_child_iterator begin_; From 7c540650674ead56da4ef21b034dcfea1602115a Mon Sep 17 00:00:00 2001 From: Benjamin Redelings Date: Wed, 29 Apr 2020 10:35:19 -0700 Subject: [PATCH 072/620] Add some asserts. --- otc/ctrie/ctrie_node.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/otc/ctrie/ctrie_node.h b/otc/ctrie/ctrie_node.h index 4a49a4f6..6ca866fe 100644 --- a/otc/ctrie/ctrie_node.h +++ b/otc/ctrie/ctrie_node.h @@ -76,8 +76,8 @@ class ctrie_child_iterator bool done() const {return not letter_bits;} public: - uint64_t index() const {return index_;} - stored_index_t letter() const {return __builtin_clzl(letter_bits);} + stored_index_t letter() const {assert(not done()); return __builtin_clzl(letter_bits);} + uint64_t index() const {assert(not done()); return index_;} ctrie_child_iterator operator++() {index_++; mask_cur_letter(); return (*this);} ctrie_child_iterator operator++(int) {auto tmp = *this; (*this)++; return tmp;} From a7b9a85451cb2a0f78f25a8fa0280358a55d1a47 Mon Sep 17 00:00:00 2001 From: Benjamin Redelings Date: Wed, 29 Apr 2020 10:37:34 -0700 Subject: [PATCH 073/620] Remove test for clucene. --- meson.build | 2 -- 1 file changed, 2 deletions(-) diff --git a/meson.build b/meson.build index eab65ed4..85aa7bec 100644 --- a/meson.build +++ b/meson.build @@ -20,8 +20,6 @@ conf_data.set_quoted('_ARCH_', host_machine.system()+' ' + host_machine.cpu_fami conf_data.set_quoted('_COMPILER_', cpp.get_id() + ' ' + cpp.version()+' ' + host_machine.cpu_family()) configure_file(output : 'config.h', configuration : conf_data) -clucene = dependency('libclucene-core', required: false) - # Do we need this? 
# AC_PROG_LN_S From 6e5964e4c73ed59d3637467fff2bc593e9e7158a Mon Sep 17 00:00:00 2001 From: Benjamin Redelings Date: Wed, 29 Apr 2020 12:52:32 -0700 Subject: [PATCH 074/620] Use std::vector instead of std::list< >. --- otc/ctrie/ctrie.h | 8 ++++---- otc/ctrie/ctrie_search_impl.h | 13 +++++++------ otc/ctrie/search_data_models.h | 2 +- 3 files changed, 12 insertions(+), 11 deletions(-) diff --git a/otc/ctrie/ctrie.h b/otc/ctrie/ctrie.h index 484a064c..1b7374e4 100644 --- a/otc/ctrie/ctrie.h +++ b/otc/ctrie/ctrie.h @@ -87,8 +87,8 @@ class CompressedTrie { const unsigned int dist_threshold, stored_index_t prev_trie_match_char) const; - std::list fuzzy_matches(const stored_str_t & query_str, - unsigned int max_dist) const; + std::vector fuzzy_matches(const stored_str_t & query_str, + unsigned int max_dist) const; void db_write(std::ostream & out) const; @@ -186,7 +186,7 @@ class CompressedTrie { const unsigned int dist_threshold, stored_index_t prev_trie_match_char) const; void extend_partial_match(const PartialMatch &pm, - std::list & results, + std::vector & results, std::list > & next_alive) const; void db_write_pm(const char *, const PartialMatch &pm) const; unsigned int _calc_dist_impl(const PartialMatch &pm, @@ -200,7 +200,7 @@ class CompressedTrie { stored_char_t trie_match_char) const; bool _check_suffix_for_match(const PartialMatch &pm, const stored_index_t * suffix, - std::list & results) const; + std::vector & results) const; stored_index_t ctrie_get_index_for_letter(const stored_char_t & c) const { auto ltiit = letter_to_ind.find(c); diff --git a/otc/ctrie/ctrie_search_impl.h b/otc/ctrie/ctrie_search_impl.h index 05f7f519..9e2f93b9 100644 --- a/otc/ctrie/ctrie_search_impl.h +++ b/otc/ctrie/ctrie_search_impl.h @@ -321,7 +321,7 @@ inline unsigned int CompressedTrie::_dp_calc_dist_prim_impl(stored_char_t pre template bool CompressedTrie::_check_suffix_for_match(const PartialMatch &pm, const stored_index_t * trie_suff, - std::list & results) const { + 
std::vector & results) const { if (pm.has_matched_suffix(trie_suff)) { return false; } @@ -371,7 +371,7 @@ void CompressedTrie::db_write_pm(const char * context, const PartialMatch template void CompressedTrie::extend_partial_match(const PartialMatch & pm, - std::list & results, + std::vector & results, std::list > & next_alive) const { if (DB_FUZZY_MATCH) {db_write_pm("extend", pm);} const T * trienode = pm.get_next_node(); @@ -427,11 +427,11 @@ inline void CompressedTrie::_finish_query_result(FuzzyQueryResult & res) cons template -std::list CompressedTrie::fuzzy_matches(const stored_str_t & query_str, +std::vector CompressedTrie::fuzzy_matches(const stored_str_t & query_str, unsigned int max_dist) const { if (DB_FUZZY_MATCH) {std::cerr << "fuzzy_matches (within " << max_dist << " edits) of \"" << to_char_str(query_str) << "\"\n";} if (query_str.length() == 0) { - return std::list{}; + return std::vector{}; } const FQuery query{query_str, encode_as_indices(query_str), max_dist}; unsigned int num_missing_in_letters = 0; @@ -440,12 +440,13 @@ std::list CompressedTrie::fuzzy_matches(const stored_str_t num_missing_in_letters++; if (num_missing_in_letters > max_dist) { if (DB_FUZZY_MATCH) {std::cerr << "match infeasible because >= " << num_missing_in_letters << " positions in the query were not in the trie.\n";} - return std::list{}; + return std::vector{}; } } } // non-trivial case - std::list results; + std::vector results; + results.reserve(20); const T * root_nd = &(node_vec.at(0)); std::list > alive; alive.push_back(PartialMatch{query, root_nd}); diff --git a/otc/ctrie/search_data_models.h b/otc/ctrie/search_data_models.h index 8286d6a0..844bb019 100644 --- a/otc/ctrie/search_data_models.h +++ b/otc/ctrie/search_data_models.h @@ -235,7 +235,7 @@ class PartialMatch { bool has_matched_suffix(const stored_index_t * trie_suff) const { return query.has_matched_suffix(next_node, trie_suff); } - bool store_result(std::list & results, + bool store_result(std::vector & 
results, const stored_index_t * trie_suff, std::size_t suff_len, unsigned int distance) const { From 9649b662fc77d5b10edc002aaca73b6e936438e8 Mon Sep 17 00:00:00 2001 From: Benjamin Redelings Date: Tue, 5 May 2020 15:09:27 -0700 Subject: [PATCH 075/620] Untemplatize class CompressedCTrie. --- otc/ctrie/ctrie.h | 97 +++++++++++++++++------------------ otc/ctrie/ctrie_db.h | 5 +- otc/ctrie/ctrie_node.h | 3 -- otc/ctrie/ctrie_search_impl.h | 55 ++++++++------------ tools/tnrs-cli.cpp | 2 +- 5 files changed, 72 insertions(+), 90 deletions(-) diff --git a/otc/ctrie/ctrie.h b/otc/ctrie/ctrie.h index 1b7374e4..22ccbc83 100644 --- a/otc/ctrie/ctrie.h +++ b/otc/ctrie/ctrie.h @@ -11,6 +11,7 @@ #include #include "otc/otc_base_includes.h" #include "otc/ctrie/search_data_models.h" +#include "otc/ctrie/ctrie_node.h" namespace otc { constexpr bool DB_FUZZY_MATCH = false; @@ -19,17 +20,18 @@ constexpr bool DB_FUZZY_MATCH = false; */ using ctrie_init_set_t = std::set; -template -class CTrieCtorHelperTemp { - public: + +class CTrieCtorHelper +{ +public: stored_str_t prefix; - T * node_ptr; + CTrieNode* node_ptr; ctrie_init_set_t::const_iterator lower; }; using suff_map_t = std::map , std::size_t>; -template + class CompressedTrie { public: @@ -94,7 +96,7 @@ class CompressedTrie { void db_write_words(std::ostream & out) const; - void db_write_node(std::ostream & out, const T & nd) const; + void db_write_node(std::ostream & out, const CTrieNode & nd) const; std::string to_char_from_inds(const stored_index_t * p, std::size_t len) const { std::string ret; @@ -140,7 +142,6 @@ class CompressedTrie { } private: - using CTrieCtorHelper = CTrieCtorHelperTemp; void init(const ctrie_init_set_t & keys, const stored_str_t & letter_var); void fill_equivalent_letter_array(); @@ -148,18 +149,18 @@ class CompressedTrie { std::stack & todo_q, const stored_str_t & rev_letters, const ctrie_init_set_t & keys, - T & par_node, + CTrieNode & par_node, suff_map_t& suffix2index); - void _store_suffix_node(T 
& curr_node, + void _store_suffix_node(CTrieNode & curr_node, const stored_str_t & curr_str, const stored_str_t & handled, suff_map_t & suffix2index); void _finish_query_result(FuzzyQueryResult & res) const; - T & append_node() { - T empty; + CTrieNode & append_node() { + CTrieNode empty; node_list.push_back(empty); return *(node_list.rbegin()); } @@ -185,11 +186,11 @@ class CompressedTrie { const std::size_t trie_len, const unsigned int dist_threshold, stored_index_t prev_trie_match_char) const; - void extend_partial_match(const PartialMatch &pm, + void extend_partial_match(const PartialMatch &pm, std::vector & results, - std::list > & next_alive) const; - void db_write_pm(const char *, const PartialMatch &pm) const; - unsigned int _calc_dist_impl(const PartialMatch &pm, + std::list > & next_alive) const; + void db_write_pm(const char *, const PartialMatch &pm) const; + unsigned int _calc_dist_impl(const PartialMatch &pm, const stored_index_t * suffix, const std::size_t trie_len) const; unsigned int _match_cost(stored_char_t prev_q_match_char, @@ -198,7 +199,7 @@ class CompressedTrie { stored_char_t trie_match_char) const; unsigned int _match_cost_no_transp(stored_char_t q_match_char, stored_char_t trie_match_char) const; - bool _check_suffix_for_match(const PartialMatch &pm, + bool _check_suffix_for_match(const PartialMatch &pm, const stored_index_t * suffix, std::vector & results) const; @@ -213,22 +214,21 @@ class CompressedTrie { std::unordered_map letter_to_ind; std::vector equivalent_letter; stored_str_t letters; - std::list node_list; + std::list node_list; std::vector concat_suff; - std::vector node_vec; + std::vector node_vec; stored_index_t null_char_index; friend class CompressedTrieBasedDB; }; -template -void CompressedTrie::_process_prefix(const stored_str_t & curr_pref, - std::stack & todo_q, - const stored_str_t & rev_letters, - const ctrie_init_set_t & keys, - T & par_node, - suff_map_t & suffix2index) { +inline void 
CompressedTrie::_process_prefix(const stored_str_t & curr_pref, + std::stack & todo_q, + const stored_str_t & rev_letters, + const ctrie_init_set_t & keys, + CTrieNode & par_node, + suff_map_t & suffix2index) { stored_str_t next_pref; CTrieCtorHelper ctch; unsigned int curr_letter_index = 0; @@ -249,7 +249,7 @@ void CompressedTrie::_process_prefix(const stored_str_t & curr_pref, if (lb != keys.end()) { if (starts_with(*lb, next_pref)) { auto advit = lb; - T & next_node = append_node(); + CTrieNode & next_node = append_node(); ctch.node_ptr = &next_node; advit++; if (advit != keys.end() && starts_with(*advit, next_pref)) { @@ -273,11 +273,11 @@ void CompressedTrie::_process_prefix(const stored_str_t & curr_pref, } -template -void CompressedTrie::_store_suffix_node(T & curr_node, - const stored_str_t & curr_str, - const stored_str_t & handled, - suff_map_t & suffix2index) { + +inline void CompressedTrie::_store_suffix_node(CTrieNode & curr_node, + const stored_str_t & curr_str, + const stored_str_t & handled, + suff_map_t & suffix2index) { const stored_str_t suffix = curr_str.substr(handled.length() + 1); auto suff_as_inds = encode_as_indices(suffix, true); //const std::string suff_as_char = to_char_str(suffix); @@ -308,8 +308,8 @@ void CompressedTrie::_store_suffix_node(T & curr_node, } } -template -void CompressedTrie::fill_equivalent_letter_array() { + +inline void CompressedTrie::fill_equivalent_letter_array() { equivalent_letter.reserve(letters.length()); equivalent_letter.clear(); for (auto nl : letters) { @@ -336,8 +336,7 @@ void CompressedTrie::fill_equivalent_letter_array() { } } -template -void CompressedTrie::init(const ctrie_init_set_t & keys, const stored_str_t & letter_var) { +inline void CompressedTrie::init(const ctrie_init_set_t & keys, const stored_str_t & letter_var) { clear(); // max_node_index = 0; if (keys.empty()) { @@ -347,7 +346,7 @@ void CompressedTrie::init(const ctrie_init_set_t & keys, const stored_str_t & std::set 
let_set{letter_var.begin(), letter_var.end()}; letters = stored_str_t{let_set.begin(), let_set.end()}; stored_str_t rev_letters = stored_str_t{letters.rbegin(), letters.rend()}; - if (letters.length() >= T::END_LETTER_INDEX) { + if (letters.length() >= CTrieNode::END_LETTER_INDEX) { throw OTCError() << "# of letters (" << letters.length() << ") exceeds size of CompressedTrie node type"; } if (letters.length() > 253) { @@ -367,7 +366,7 @@ void CompressedTrie::init(const ctrie_init_set_t & keys, const stored_str_t & concat_suff.push_back(null_char_index); std::vector mt{1, null_char_index}; suffix2index[mt] = 0; - T & root_node = append_node(); + CTrieNode & root_node = append_node(); static const std::string TARGET_THIN_STR{"A"}; static const stored_str_t TARGET_STR = to_u32string(TARGET_THIN_STR); unsigned int target_ind = UINT_MAX; @@ -380,7 +379,7 @@ void CompressedTrie::init(const ctrie_init_set_t & keys, const stored_str_t & curr_ctch = todo_q.top(); todo_q.pop(); curr_pref = curr_ctch.prefix; - T & curr_node = *(curr_ctch.node_ptr); + CTrieNode & curr_node = *(curr_ctch.node_ptr); bool done_with_curr = false; if (*curr_ctch.lower == curr_pref) { curr_ctch.lower++; @@ -432,7 +431,7 @@ void CompressedTrie::init(const ctrie_init_set_t & keys, const stored_str_t & // std::cerr << "ROOT:"; node_vec[0].log_state(); for (auto [trie_char, next_ind] : node_vec[0].children()) { - const T * next_nd = &(node_vec[next_ind]); + const CTrieNode * next_nd = &(node_vec[next_ind]); // std::cerr << "ROOT child for \"" << to_char_str(letters[trie_char]) << "\" "; next_nd->log_state(); } @@ -443,7 +442,7 @@ void CompressedTrie::init(const ctrie_init_set_t & keys, const stored_str_t & std::cerr << to_char_str(letters[eli]) << " = " << to_char_str(letters[equivalent_letter[eli]]) << "\n"; } } - auto nvs = sizeof(T)*node_vec.size(); + auto nvs = sizeof(CTrieNode)*node_vec.size(); auto suffs = concat_suff.size(); std::cerr << "vecsize = " << nvs << " bytes\n"; std::cerr << 
"concat_suff length = " << suffs << " bytes\n"; @@ -465,14 +464,13 @@ void CompressedTrie::init(const ctrie_init_set_t & keys, const stored_str_t & -template -void CompressedTrie::db_write_node(std::ostream & out, const T & nd) const { +inline void CompressedTrie::db_write_node(std::ostream & out, const CTrieNode & nd) const { if (nd.is_terminal()) { auto suff_index = nd.get_index(); auto suff = get_suffix(suff_index); auto suff_str = to_char_str(suff); out << "TerminalNode suffix_ind=" << suff_index - << " suffix=" << suff + << " suffix=" << to_char_str(suff) << " char_str=\"" << suff_str << "\"\n"; } else { out << "InternalNode" << (nd.is_key_terminating() ? "* " : " "); @@ -487,9 +485,8 @@ void CompressedTrie::db_write_node(std::ostream & out, const T & nd) const { } } -template -void CompressedTrie::db_write(std::ostream & out) const { - out << "CompressedTrie nodes. Letters = \"" << to_char_str(letters) << "\"\n"; +inline void CompressedTrie::db_write(std::ostream & out) const { + out << "CompressedTrie nodes. 
Letters = \"" << to_char_str(letters) << "\"\n"; out << " " << node_vec.size() << " nodes:\n"; std::size_t i = 0; for (auto nd : node_vec) { @@ -499,9 +496,9 @@ void CompressedTrie::db_write(std::ostream & out) const { } } -template -void CompressedTrie::db_write_words(std::ostream & out) const { - using nd_pref_pair = std::pair; + +inline void CompressedTrie::db_write_words(std::ostream & out) const { + using nd_pref_pair = std::pair; std::deque todo; stored_str_t mt; todo.push_back(nd_pref_pair{&(node_vec[0]), mt}); @@ -520,7 +517,7 @@ void CompressedTrie::db_write_words(std::ostream & out) const { for(auto x : nd_ptr->children()) vipt.push_back(x); for (auto vipirit = vipt.rbegin(); vipirit != vipt.rend(); vipirit++) { - const T * nn = &(node_vec[vipirit->second]); + const CTrieNode * nn = &(node_vec[vipirit->second]); stored_str_t np = curr_nd_pref.second + letters[vipirit->first]; todo.push_front(nd_pref_pair{nn, np}); } diff --git a/otc/ctrie/ctrie_db.h b/otc/ctrie/ctrie_db.h index d913d4cb..30ecc30c 100644 --- a/otc/ctrie/ctrie_db.h +++ b/otc/ctrie/ctrie_db.h @@ -5,7 +5,6 @@ #include "otc/ctrie/ctrie.h" namespace otc { -using CTrie2_t = CompressedTrie; class CompressedTrieBasedDB { public: @@ -13,8 +12,8 @@ class CompressedTrieBasedDB { std::set fuzzy_query(const std::string & query_str) const; std::set exact_query(const std::string & query_str) const; private: - CTrie2_t wide_trie; - CTrie2_t thin_trie; + CompressedTrie wide_trie; + CompressedTrie thin_trie; }; diff --git a/otc/ctrie/ctrie_node.h b/otc/ctrie/ctrie_node.h index 6ca866fe..25d579fa 100644 --- a/otc/ctrie/ctrie_node.h +++ b/otc/ctrie/ctrie_node.h @@ -188,8 +188,5 @@ class CTrieNode { void flag_letter(unsigned int i); }; - -using CTrie2Node = CTrieNode; - } // namespace otc #endif diff --git a/otc/ctrie/ctrie_search_impl.h b/otc/ctrie/ctrie_search_impl.h index 9e2f93b9..a8ec2b85 100644 --- a/otc/ctrie/ctrie_search_impl.h +++ b/otc/ctrie/ctrie_search_impl.h @@ -15,8 +15,7 @@ inline std::vector 
_init_prev_row(unsigned int dist_threshold) { return prev_row; } -template -inline unsigned CompressedTrie::_calc_dist_impl(const PartialMatch &pm, +inline unsigned CompressedTrie::_calc_dist_impl(const PartialMatch &pm, const stored_index_t * trie_suff, const std::size_t trie_len) const { const stored_index_t * quer_suff = pm.query_data(); @@ -60,8 +59,7 @@ inline unsigned int _ran_out_of_trie_score(const std::vector & pre return d; } -template -inline unsigned int CompressedTrie::_match_cost(stored_char_t prev_q_match_char, +inline unsigned int CompressedTrie::_match_cost(stored_char_t prev_q_match_char, stored_char_t q_match_char, stored_char_t prev_trie_match_char, stored_char_t trie_match_char) const { @@ -83,8 +81,7 @@ inline unsigned int CompressedTrie::_match_cost(stored_char_t prev_q_match_ch return 1; } -template -inline unsigned int CompressedTrie::_match_cost_no_transp(stored_char_t q_match_char, +inline unsigned int CompressedTrie::_match_cost_no_transp(stored_char_t q_match_char, stored_char_t trie_match_char) const { if (q_match_char == NO_MATCHING_CHAR_CODE || trie_match_char == NO_MATCHING_CHAR_CODE) { return 1; @@ -97,8 +94,7 @@ inline unsigned int CompressedTrie::_match_cost_no_transp(stored_char_t q_mat -template -inline bool CompressedTrie::_are_equivalent(stored_char_t prev_q, +inline bool CompressedTrie::_are_equivalent(stored_char_t prev_q, const stored_index_t * quer_suff, const std::size_t quer_len, const stored_index_t * trie_suff, @@ -122,8 +118,7 @@ inline bool CompressedTrie::_are_equivalent(stored_char_t prev_q, } // checks for some easy optimizations, and calls dynamic programming version if needed. 
-template -inline unsigned int CompressedTrie::_calc_dist_prim_impl(stored_char_t prev_quer_char, +inline unsigned int CompressedTrie::_calc_dist_prim_impl(stored_char_t prev_quer_char, const stored_index_t * quer_suff, const std::size_t quer_len, const stored_index_t * trie_suff, @@ -215,8 +210,7 @@ inline unsigned int CompressedTrie::_calc_dist_prim_impl(stored_char_t prev_q } // called after preprocessing by _calc_dist_prim_impl -template -inline unsigned int CompressedTrie::_dp_calc_dist_prim_impl(stored_char_t prev_quer_char, +inline unsigned int CompressedTrie::_dp_calc_dist_prim_impl(stored_char_t prev_quer_char, const stored_index_t * quer_suff, const std::size_t quer_len, const stored_index_t * trie_suff, @@ -318,8 +312,7 @@ inline unsigned int CompressedTrie::_dp_calc_dist_prim_impl(stored_char_t pre } -template -bool CompressedTrie::_check_suffix_for_match(const PartialMatch &pm, +inline bool CompressedTrie::_check_suffix_for_match(const PartialMatch &pm, const stored_index_t * trie_suff, std::vector & results) const { if (pm.has_matched_suffix(trie_suff)) { @@ -351,8 +344,7 @@ bool CompressedTrie::_check_suffix_for_match(const PartialMatch &pm, } -template -void CompressedTrie::db_write_pm(const char * context, const PartialMatch &pm) const { +inline void CompressedTrie::db_write_pm(const char * context, const PartialMatch &pm) const { auto & out = std::cerr; if (context != nullptr) { out << context << " "; @@ -369,12 +361,11 @@ void CompressedTrie::db_write_pm(const char * context, const PartialMatch out << ")\n"; } -template -void CompressedTrie::extend_partial_match(const PartialMatch & pm, +inline void CompressedTrie::extend_partial_match(const PartialMatch & pm, std::vector & results, - std::list > & next_alive) const { + std::list > & next_alive) const { if (DB_FUZZY_MATCH) {db_write_pm("extend", pm);} - const T * trienode = pm.get_next_node(); + const CTrieNode * trienode = pm.get_next_node(); if (trienode->is_terminal()) { auto suffix_index = 
trienode->get_index(); _check_suffix_for_match(pm, get_suffix_as_indices(suffix_index), results); @@ -388,21 +379,21 @@ void CompressedTrie::extend_partial_match(const PartialMatch & pm, for (auto [trie_char, next_ind] : trienode->children()) { - const T * next_nd = &(node_vec[next_ind]); + const CTrieNode * next_nd = &(node_vec[next_ind]); if (trie_char == qc || trie_char == altqc) { if (DB_FUZZY_MATCH) {std::cerr << "matched " << to_char_str(letters[trie_char]) << " in pre adding extended pm.\n";} - next_alive.push_back(PartialMatch{pm, trie_char, cd, next_nd, false}); + next_alive.push_back(PartialMatch{pm, trie_char, cd, next_nd, false}); } else if (cd + 1 <= max_dist) { if (DB_FUZZY_MATCH) {std::cerr << "mismatched " << to_char_str(letters[trie_char]) << " in pre adding extended pm.\n";} - next_alive.push_back(PartialMatch{pm, trie_char, cd + 1, next_nd, true}); + next_alive.push_back(PartialMatch{pm, trie_char, cd + 1, next_nd, true}); if (pm.can_rightshift()) { - next_alive.push_back(PartialMatch{pm, cd + 1, next_nd, trie_char}); // rightshift + next_alive.push_back(PartialMatch{pm, cd + 1, next_nd, trie_char}); // rightshift } } } // frameshift if (cd + 1 <= max_dist && pm.can_downshift()) { - next_alive.push_back(PartialMatch{pm, cd + 1, trienode}); //downshift + next_alive.push_back(PartialMatch{pm, cd + 1, trienode}); //downshift } if (trienode->is_key_terminating()) { auto d = pm.num_q_char_left() + pm.curr_distance(); @@ -413,8 +404,7 @@ void CompressedTrie::extend_partial_match(const PartialMatch & pm, } -template -inline void CompressedTrie::_finish_query_result(FuzzyQueryResult & res) const { +inline void CompressedTrie::_finish_query_result(FuzzyQueryResult & res) const { res.match_wide_char.clear(); for (auto ind : res.match_coded) { assert(ind != NO_MATCHING_CHAR_CODE); @@ -426,8 +416,7 @@ inline void CompressedTrie::_finish_query_result(FuzzyQueryResult & res) cons } -template -std::vector CompressedTrie::fuzzy_matches(const stored_str_t & 
query_str, +inline std::vector CompressedTrie::fuzzy_matches(const stored_str_t & query_str, unsigned int max_dist) const { if (DB_FUZZY_MATCH) {std::cerr << "fuzzy_matches (within " << max_dist << " edits) of \"" << to_char_str(query_str) << "\"\n";} if (query_str.length() == 0) { @@ -447,12 +436,12 @@ std::vector CompressedTrie::fuzzy_matches(const stored_str_ // non-trivial case std::vector results; results.reserve(20); - const T * root_nd = &(node_vec.at(0)); - std::list > alive; - alive.push_back(PartialMatch{query, root_nd}); + const CTrieNode * root_nd = &(node_vec.at(0)); + std::list> alive; + alive.push_back(PartialMatch{query, root_nd}); while (!alive.empty()) { if (DB_FUZZY_MATCH) {std::cerr << " " << alive.size() << " alive partial matches and " << results.size() << " hits.\n";} - std::list > next_alive; + std::list> next_alive; for (const auto & pm : alive) { auto prevnalen = next_alive.size(); extend_partial_match(pm, results, next_alive); diff --git a/tools/tnrs-cli.cpp b/tools/tnrs-cli.cpp index 11bfc9bc..f4684444 100644 --- a/tools/tnrs-cli.cpp +++ b/tools/tnrs-cli.cpp @@ -140,7 +140,7 @@ void analyze_case_sensitivity(const RTRichTaxTreeData & rt_data, } void interactive_tests() { - const CTrie2_t testtrie{"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"}; + const CompressedTrie testtrie{"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"}; const std::string p1 = "Enter a query:\n"; const std::string p2 = "Enter a trie:\n"; const std::string p3 = "max distance:\n"; From 9606e27ae1de0ce1d323afb8dd169a381ca946bb Mon Sep 17 00:00:00 2001 From: Benjamin Redelings Date: Tue, 5 May 2020 15:17:53 -0700 Subject: [PATCH 076/620] Move some inline code out of header files. 
--- otc/ctrie/ctrie.cpp | 309 ++++++++++++++++++ otc/ctrie/ctrie.h | 306 ----------------- .../{ctrie_search_impl.h => search_impl.cpp} | 89 +++-- otc/meson.build | 2 + 4 files changed, 354 insertions(+), 352 deletions(-) create mode 100644 otc/ctrie/ctrie.cpp rename otc/ctrie/{ctrie_search_impl.h => search_impl.cpp} (81%) diff --git a/otc/ctrie/ctrie.cpp b/otc/ctrie/ctrie.cpp new file mode 100644 index 00000000..f67e6fc9 --- /dev/null +++ b/otc/ctrie/ctrie.cpp @@ -0,0 +1,309 @@ +#include "otc/ctrie/ctrie.h" + +namespace otc { + +void CompressedTrie::_process_prefix(const stored_str_t & curr_pref, + std::stack & todo_q, + const stored_str_t & rev_letters, + const ctrie_init_set_t & keys, + CTrieNode & par_node, + suff_map_t & suffix2index) { + stored_str_t next_pref; + CTrieCtorHelper ctch; + unsigned int curr_letter_index = 0; + std::list to_queue; + ctrie_init_set_t::const_iterator lb; + bool has_indexed_par = false; + static const std::string TARGET_THIN_STR{"A"}; + static const stored_str_t TARGET_STR = to_u32string(TARGET_THIN_STR); + bool had_target_pref = false; + for (auto letter : rev_letters) { + if (letter == '\0') { + assert(curr_letter_index == rev_letters.length() - 1); + break; + } + next_pref = curr_pref; + next_pref.push_back(letter); + lb = keys.lower_bound(next_pref); + if (lb != keys.end()) { + if (starts_with(*lb, next_pref)) { + auto advit = lb; + CTrieNode & next_node = append_node(); + ctch.node_ptr = &next_node; + advit++; + if (advit != keys.end() && starts_with(*advit, next_pref)) { + ctch.prefix = next_pref; + ctch.lower = lb; + todo_q.push(ctch); + } else { + _store_suffix_node(next_node, *lb, curr_pref, suffix2index); + } + par_node.flag_letter(curr_letter_index); + if (!has_indexed_par) { + par_node.set_first_child_index(node_list.size() - 1); + has_indexed_par = true; + } + } + } + curr_letter_index++; + } + assert(has_indexed_par); + +} + + + +void CompressedTrie::_store_suffix_node(CTrieNode & curr_node, + const stored_str_t & 
curr_str, + const stored_str_t & handled, + suff_map_t & suffix2index) { + const stored_str_t suffix = curr_str.substr(handled.length() + 1); + auto suff_as_inds = encode_as_indices(suffix, true); + //const std::string suff_as_char = to_char_str(suffix); + // std::cerr << " handled \"" << to_char_str(handled) << "\" suffix = \"" << suff_as_char << "\"\n"; + auto mit = suffix2index.find(suff_as_inds); + if (mit != suffix2index.end()) { + curr_node.flag_as_suffix(mit->second); + } else { + std::size_t pos = concat_suff.size(); + concat_suff.insert(std::end(concat_suff), std::begin(suff_as_inds), std::end(suff_as_inds)); + concat_suff.push_back(null_char_index); + curr_node.flag_as_suffix(pos); + suffix2index[suff_as_inds] = pos; + std::size_t suff_pref = 1; + auto sai_it = suff_as_inds.begin(); + sai_it++; + while (suff_pref < suff_as_inds.size() - 1) { + std::vector tmp{sai_it, suff_as_inds.end()}; + if (suffix2index.find(tmp) != suffix2index.end()) { + // std::cerr << " found tmp \"" << tmp << "\"\n"; + break; + } + // std::cerr << " adding tmp \"" << tmp << "\" pos + suff_pref = " << pos + suff_pref << "\n"; + suffix2index[tmp] = pos + suff_pref; + suff_pref++; + sai_it++; + } + } +} + + +void CompressedTrie::fill_equivalent_letter_array() { + equivalent_letter.reserve(letters.length()); + equivalent_letter.clear(); + for (auto nl : letters) { + std::string uncov = to_char_str(nl); + std::string lccov = lower_case_version(uncov); + stored_index_t char_ind = NO_MATCHING_CHAR_CODE; + if (lccov != uncov) { + auto alt = to_u32string(lccov); + if (alt.length() != 1) { + throw OTCError() << "lower case version of \"" << uncov << "\" was not one character: \"" << lccov << "\"\n"; + } + char_ind = ctrie_get_index_for_letter(alt[0]); + } else { + std::string uccov = upper_case_version(uncov); + if (uccov != uncov) { + auto alt = to_u32string(uccov); + if (alt.length() != 1) { + throw OTCError() << "lower case version of \"" << uncov << "\" was not one character: \"" << 
uccov << "\"\n"; + } + char_ind = ctrie_get_index_for_letter(alt[0]); + } + } + equivalent_letter.push_back(char_ind); + } +} + +void CompressedTrie::init(const ctrie_init_set_t & keys, const stored_str_t & letter_var) { + clear(); + // max_node_index = 0; + if (keys.empty()) { + return; + } + // sort the letters in strings, and make sure they are uniq + std::set let_set{letter_var.begin(), letter_var.end()}; + letters = stored_str_t{let_set.begin(), let_set.end()}; + stored_str_t rev_letters = stored_str_t{letters.rbegin(), letters.rend()}; + if (letters.length() >= CTrieNode::END_LETTER_INDEX) { + throw OTCError() << "# of letters (" << letters.length() << ") exceeds size of CompressedTrie node type"; + } + if (letters.length() > 253) { + throw OTCError() << "# of letters (" << letters.length() << ") exceeds 253, so letter_to_ind value type needs to be changed."; + } + stored_index_t curr_ind = 0; + for (auto nl : letters) { + letter_to_ind[nl] = curr_ind++; + } + fill_equivalent_letter_array(); + null_char_index = letters.length(); + letters.append(1, '\0'); + + std::stack todo_q; + stored_str_t curr_pref; + suff_map_t suffix2index; + concat_suff.push_back(null_char_index); + std::vector mt{1, null_char_index}; + suffix2index[mt] = 0; + CTrieNode & root_node = append_node(); + static const std::string TARGET_THIN_STR{"A"}; + static const stored_str_t TARGET_STR = to_u32string(TARGET_THIN_STR); + unsigned int target_ind = UINT_MAX; + assert(node_list.size() == 1); + // std::cerr << "ROOT before any children:"; root_node.log_state(); + _process_prefix(curr_pref, todo_q, letters, keys, root_node, suffix2index); + // std::cerr << "ROOT after first _process_prefix:"; root_node.log_state(); + CTrieCtorHelper curr_ctch; + while (!todo_q.empty()) { + curr_ctch = todo_q.top(); + todo_q.pop(); + curr_pref = curr_ctch.prefix; + CTrieNode & curr_node = *(curr_ctch.node_ptr); + bool done_with_curr = false; + if (*curr_ctch.lower == curr_pref) { + curr_ctch.lower++; + if 
(curr_ctch.lower != keys.end() && starts_with(*curr_ctch.lower, curr_pref)) { + curr_node.flag_as_key_terminating(); + } else { + done_with_curr = true; + curr_node.flag_as_terminal(); + } + } + if (!done_with_curr) { + _process_prefix(curr_pref, todo_q, letters, keys, curr_node, suffix2index); + } + if (curr_pref == TARGET_STR) { + // std::cerr << "MATCH TARGET: "; curr_node.log_state(); + std::size_t i = 0; + for (const auto & nd : node_list) { + if (&(nd) == &curr_node) { + target_ind = i; + break; + } + i++; + } + // std::cerr << "MATCH TARGET at node " << target_ind << "\n"; + } + } + + if (target_ind != UINT_MAX) { + std::size_t i = 0; + for (const auto & nd : node_list) { + if (i++ == target_ind) { + // std::cerr << "MATCH TARGET from node list spot " << target_ind << " = "; + nd.log_state(); + } + } + } + + // move to vector... + node_vec.clear(); + node_vec.insert(node_vec.begin(), node_list.begin(), node_list.end()); + node_list.clear(); + + if (target_ind != UINT_MAX) { + // std::cerr << "MATCH TARGET from node vector spot " << target_ind << " = "; + node_vec[target_ind].log_state(); + } + + if (DB_FUZZY_MATCH) {node_vec[0].log_state();} + // std::cerr << "ROOT:"; node_vec[0].log_state(); + + for (auto [trie_char, next_ind] : node_vec[0].children()) { + const CTrieNode * next_nd = &(node_vec[next_ind]); + // std::cerr << "ROOT child for \"" << to_char_str(letters[trie_char]) << "\" "; next_nd->log_state(); + } + + for (unsigned int eli = 0; eli < equivalent_letter.size(); ++eli) { + if (equivalent_letter[eli] == NO_MATCHING_CHAR_CODE) { + std::cerr << to_char_str(letters[eli]) << " = \n"; + } else { + std::cerr << to_char_str(letters[eli]) << " = " << to_char_str(letters[equivalent_letter[eli]]) << "\n"; + } + } + auto nvs = sizeof(CTrieNode)*node_vec.size(); + auto suffs = concat_suff.size(); + std::cerr << "vecsize = " << nvs << " bytes\n"; + std::cerr << "concat_suff length = " << suffs << " bytes\n"; + std::cerr << "compressed tree size = " << 
4*letters.size() + nvs + suffs << " bytes\n"; + std::cerr << "max_node_index = " << node_vec.size() << "\n"; + + /* + std::cerr << "concat_suff = \""; + for (auto c : concat_suff) { + if (c == '\0') { + std::cerr << "\\0"; + } else { + std::cerr << c; + } + } + std::cerr << "\"\n"; + */ +} + + + +void CompressedTrie::db_write_node(std::ostream & out, const CTrieNode & nd) const { + if (nd.is_terminal()) { + auto suff_index = nd.get_index(); + auto suff = get_suffix(suff_index); + auto suff_str = to_char_str(suff); + out << "TerminalNode suffix_ind=" << suff_index + << " suffix=" << to_char_str(suff) + << " char_str=\"" << suff_str << "\"\n"; + } else { + out << "InternalNode" << (nd.is_key_terminating() ? "* " : " "); + out << " offset = " << nd.get_index() << "\n"; + //out << " letterbits = "; + //nd.db_write_state(out); + //out << "\n"; + for (auto [trie_char,index] : nd.children()) { + out << " " << to_char_str(letters[trie_char]); + out << " => node[" << std::dec << index << "]\n"; + } + } +} + +void CompressedTrie::db_write(std::ostream & out) const { + out << "CompressedTrie nodes. 
Letters = \"" << to_char_str(letters) << "\"\n"; + out << " " << node_vec.size() << " nodes:\n"; + std::size_t i = 0; + for (auto nd : node_vec) { + out << "node_vec[" << i++ << "] = "; + db_write_node(out, nd); + + } + +} + +void CompressedTrie::db_write_words(std::ostream & out) const { + using nd_pref_pair = std::pair; + std::deque todo; + stored_str_t mt; + todo.push_back(nd_pref_pair{&(node_vec[0]), mt}); + std::size_t i = 0; + while (!todo.empty()) { + auto curr_nd_pref = todo.front(); + todo.pop_front(); + auto nd_ptr = curr_nd_pref.first; + if (nd_ptr->is_terminal()) { + auto suff_index = nd_ptr->get_index(); + auto suff = get_suffix(suff_index); + auto full = curr_nd_pref.second + suff; + out << i++ << " = " << to_char_str(full) << '\n'; + } else { + vec_ind_pair_t vipt; + for(auto x : nd_ptr->children()) + vipt.push_back(x); + for (auto vipirit = vipt.rbegin(); vipirit != vipt.rend(); vipirit++) { + const CTrieNode * nn = &(node_vec[vipirit->second]); + stored_str_t np = curr_nd_pref.second + letters[vipirit->first]; + todo.push_front(nd_pref_pair{nn, np}); + } + } + } +} + +} // namespace otc + +// search impl in different file just to separate init from search. 
diff --git a/otc/ctrie/ctrie.h b/otc/ctrie/ctrie.h index 22ccbc83..ad44793d 100644 --- a/otc/ctrie/ctrie.h +++ b/otc/ctrie/ctrie.h @@ -222,312 +222,6 @@ class CompressedTrie { friend class CompressedTrieBasedDB; }; - -inline void CompressedTrie::_process_prefix(const stored_str_t & curr_pref, - std::stack & todo_q, - const stored_str_t & rev_letters, - const ctrie_init_set_t & keys, - CTrieNode & par_node, - suff_map_t & suffix2index) { - stored_str_t next_pref; - CTrieCtorHelper ctch; - unsigned int curr_letter_index = 0; - std::list to_queue; - ctrie_init_set_t::const_iterator lb; - bool has_indexed_par = false; - static const std::string TARGET_THIN_STR{"A"}; - static const stored_str_t TARGET_STR = to_u32string(TARGET_THIN_STR); - bool had_target_pref = false; - for (auto letter : rev_letters) { - if (letter == '\0') { - assert(curr_letter_index == rev_letters.length() - 1); - break; - } - next_pref = curr_pref; - next_pref.push_back(letter); - lb = keys.lower_bound(next_pref); - if (lb != keys.end()) { - if (starts_with(*lb, next_pref)) { - auto advit = lb; - CTrieNode & next_node = append_node(); - ctch.node_ptr = &next_node; - advit++; - if (advit != keys.end() && starts_with(*advit, next_pref)) { - ctch.prefix = next_pref; - ctch.lower = lb; - todo_q.push(ctch); - } else { - _store_suffix_node(next_node, *lb, curr_pref, suffix2index); - } - par_node.flag_letter(curr_letter_index); - if (!has_indexed_par) { - par_node.set_first_child_index(node_list.size() - 1); - has_indexed_par = true; - } - } - } - curr_letter_index++; - } - assert(has_indexed_par); - -} - - - -inline void CompressedTrie::_store_suffix_node(CTrieNode & curr_node, - const stored_str_t & curr_str, - const stored_str_t & handled, - suff_map_t & suffix2index) { - const stored_str_t suffix = curr_str.substr(handled.length() + 1); - auto suff_as_inds = encode_as_indices(suffix, true); - //const std::string suff_as_char = to_char_str(suffix); - // std::cerr << " handled \"" << 
to_char_str(handled) << "\" suffix = \"" << suff_as_char << "\"\n"; - auto mit = suffix2index.find(suff_as_inds); - if (mit != suffix2index.end()) { - curr_node.flag_as_suffix(mit->second); - } else { - std::size_t pos = concat_suff.size(); - concat_suff.insert(std::end(concat_suff), std::begin(suff_as_inds), std::end(suff_as_inds)); - concat_suff.push_back(null_char_index); - curr_node.flag_as_suffix(pos); - suffix2index[suff_as_inds] = pos; - std::size_t suff_pref = 1; - auto sai_it = suff_as_inds.begin(); - sai_it++; - while (suff_pref < suff_as_inds.size() - 1) { - std::vector tmp{sai_it, suff_as_inds.end()}; - if (suffix2index.find(tmp) != suffix2index.end()) { - // std::cerr << " found tmp \"" << tmp << "\"\n"; - break; - } - // std::cerr << " adding tmp \"" << tmp << "\" pos + suff_pref = " << pos + suff_pref << "\n"; - suffix2index[tmp] = pos + suff_pref; - suff_pref++; - sai_it++; - } - } -} - - -inline void CompressedTrie::fill_equivalent_letter_array() { - equivalent_letter.reserve(letters.length()); - equivalent_letter.clear(); - for (auto nl : letters) { - std::string uncov = to_char_str(nl); - std::string lccov = lower_case_version(uncov); - stored_index_t char_ind = NO_MATCHING_CHAR_CODE; - if (lccov != uncov) { - auto alt = to_u32string(lccov); - if (alt.length() != 1) { - throw OTCError() << "lower case version of \"" << uncov << "\" was not one character: \"" << lccov << "\"\n"; - } - char_ind = ctrie_get_index_for_letter(alt[0]); - } else { - std::string uccov = upper_case_version(uncov); - if (uccov != uncov) { - auto alt = to_u32string(uccov); - if (alt.length() != 1) { - throw OTCError() << "lower case version of \"" << uncov << "\" was not one character: \"" << uccov << "\"\n"; - } - char_ind = ctrie_get_index_for_letter(alt[0]); - } - } - equivalent_letter.push_back(char_ind); - } -} - -inline void CompressedTrie::init(const ctrie_init_set_t & keys, const stored_str_t & letter_var) { - clear(); - // max_node_index = 0; - if (keys.empty()) { 
- return; - } - // sort the letters in strings, and make sure they are uniq - std::set let_set{letter_var.begin(), letter_var.end()}; - letters = stored_str_t{let_set.begin(), let_set.end()}; - stored_str_t rev_letters = stored_str_t{letters.rbegin(), letters.rend()}; - if (letters.length() >= CTrieNode::END_LETTER_INDEX) { - throw OTCError() << "# of letters (" << letters.length() << ") exceeds size of CompressedTrie node type"; - } - if (letters.length() > 253) { - throw OTCError() << "# of letters (" << letters.length() << ") exceeds 253, so letter_to_ind value type needs to be changed."; - } - stored_index_t curr_ind = 0; - for (auto nl : letters) { - letter_to_ind[nl] = curr_ind++; - } - fill_equivalent_letter_array(); - null_char_index = letters.length(); - letters.append(1, '\0'); - - std::stack todo_q; - stored_str_t curr_pref; - suff_map_t suffix2index; - concat_suff.push_back(null_char_index); - std::vector mt{1, null_char_index}; - suffix2index[mt] = 0; - CTrieNode & root_node = append_node(); - static const std::string TARGET_THIN_STR{"A"}; - static const stored_str_t TARGET_STR = to_u32string(TARGET_THIN_STR); - unsigned int target_ind = UINT_MAX; - assert(node_list.size() == 1); - // std::cerr << "ROOT before any children:"; root_node.log_state(); - _process_prefix(curr_pref, todo_q, letters, keys, root_node, suffix2index); - // std::cerr << "ROOT after first _process_prefix:"; root_node.log_state(); - CTrieCtorHelper curr_ctch; - while (!todo_q.empty()) { - curr_ctch = todo_q.top(); - todo_q.pop(); - curr_pref = curr_ctch.prefix; - CTrieNode & curr_node = *(curr_ctch.node_ptr); - bool done_with_curr = false; - if (*curr_ctch.lower == curr_pref) { - curr_ctch.lower++; - if (curr_ctch.lower != keys.end() && starts_with(*curr_ctch.lower, curr_pref)) { - curr_node.flag_as_key_terminating(); - } else { - done_with_curr = true; - curr_node.flag_as_terminal(); - } - } - if (!done_with_curr) { - _process_prefix(curr_pref, todo_q, letters, keys, curr_node, 
suffix2index); - } - if (curr_pref == TARGET_STR) { - // std::cerr << "MATCH TARGET: "; curr_node.log_state(); - std::size_t i = 0; - for (const auto & nd : node_list) { - if (&(nd) == &curr_node) { - target_ind = i; - break; - } - i++; - } - // std::cerr << "MATCH TARGET at node " << target_ind << "\n"; - } - } - - if (target_ind != UINT_MAX) { - std::size_t i = 0; - for (const auto & nd : node_list) { - if (i++ == target_ind) { - // std::cerr << "MATCH TARGET from node list spot " << target_ind << " = "; - nd.log_state(); - } - } - } - - // move to vector... - node_vec.clear(); - node_vec.insert(node_vec.begin(), node_list.begin(), node_list.end()); - node_list.clear(); - - if (target_ind != UINT_MAX) { - // std::cerr << "MATCH TARGET from node vector spot " << target_ind << " = "; - node_vec[target_ind].log_state(); - } - - if (DB_FUZZY_MATCH) {node_vec[0].log_state();} - // std::cerr << "ROOT:"; node_vec[0].log_state(); - - for (auto [trie_char, next_ind] : node_vec[0].children()) { - const CTrieNode * next_nd = &(node_vec[next_ind]); - // std::cerr << "ROOT child for \"" << to_char_str(letters[trie_char]) << "\" "; next_nd->log_state(); - } - - for (unsigned int eli = 0; eli < equivalent_letter.size(); ++eli) { - if (equivalent_letter[eli] == NO_MATCHING_CHAR_CODE) { - std::cerr << to_char_str(letters[eli]) << " = \n"; - } else { - std::cerr << to_char_str(letters[eli]) << " = " << to_char_str(letters[equivalent_letter[eli]]) << "\n"; - } - } - auto nvs = sizeof(CTrieNode)*node_vec.size(); - auto suffs = concat_suff.size(); - std::cerr << "vecsize = " << nvs << " bytes\n"; - std::cerr << "concat_suff length = " << suffs << " bytes\n"; - std::cerr << "compressed tree size = " << 4*letters.size() + nvs + suffs << " bytes\n"; - std::cerr << "max_node_index = " << node_vec.size() << "\n"; - - /* - std::cerr << "concat_suff = \""; - for (auto c : concat_suff) { - if (c == '\0') { - std::cerr << "\\0"; - } else { - std::cerr << c; - } - } - std::cerr << "\"\n"; - */ 
-} - - - -inline void CompressedTrie::db_write_node(std::ostream & out, const CTrieNode & nd) const { - if (nd.is_terminal()) { - auto suff_index = nd.get_index(); - auto suff = get_suffix(suff_index); - auto suff_str = to_char_str(suff); - out << "TerminalNode suffix_ind=" << suff_index - << " suffix=" << to_char_str(suff) - << " char_str=\"" << suff_str << "\"\n"; - } else { - out << "InternalNode" << (nd.is_key_terminating() ? "* " : " "); - out << " offset = " << nd.get_index() << "\n"; - //out << " letterbits = "; - //nd.db_write_state(out); - //out << "\n"; - for (auto [trie_char,index] : nd.children()) { - out << " " << to_char_str(letters[trie_char]); - out << " => node[" << std::dec << index << "]\n"; - } - } -} - -inline void CompressedTrie::db_write(std::ostream & out) const { - out << "CompressedTrie nodes. Letters = \"" << to_char_str(letters) << "\"\n"; - out << " " << node_vec.size() << " nodes:\n"; - std::size_t i = 0; - for (auto nd : node_vec) { - out << "node_vec[" << i++ << "] = "; - db_write_node(out, nd); - - } - -} - -inline void CompressedTrie::db_write_words(std::ostream & out) const { - using nd_pref_pair = std::pair; - std::deque todo; - stored_str_t mt; - todo.push_back(nd_pref_pair{&(node_vec[0]), mt}); - std::size_t i = 0; - while (!todo.empty()) { - auto curr_nd_pref = todo.front(); - todo.pop_front(); - auto nd_ptr = curr_nd_pref.first; - if (nd_ptr->is_terminal()) { - auto suff_index = nd_ptr->get_index(); - auto suff = get_suffix(suff_index); - auto full = curr_nd_pref.second + suff; - out << i++ << " = " << to_char_str(full) << '\n'; - } else { - vec_ind_pair_t vipt; - for(auto x : nd_ptr->children()) - vipt.push_back(x); - for (auto vipirit = vipt.rbegin(); vipirit != vipt.rend(); vipirit++) { - const CTrieNode * nn = &(node_vec[vipirit->second]); - stored_str_t np = curr_nd_pref.second + letters[vipirit->first]; - todo.push_front(nd_pref_pair{nn, np}); - } - } - } -} - } // namespace otc -// search impl in different file just to 
separate init from search. -#include "otc/ctrie/ctrie_search_impl.h" - #endif diff --git a/otc/ctrie/ctrie_search_impl.h b/otc/ctrie/search_impl.cpp similarity index 81% rename from otc/ctrie/ctrie_search_impl.h rename to otc/ctrie/search_impl.cpp index a8ec2b85..9052e1ae 100644 --- a/otc/ctrie/ctrie_search_impl.h +++ b/otc/ctrie/search_impl.cpp @@ -1,11 +1,8 @@ -#ifndef OTC_CTRIE_SEARCH_IMPL_H -#define OTC_CTRIE_SEARCH_IMPL_H - #include "otc/ctrie/ctrie.h" namespace otc { -inline std::vector _init_prev_row(unsigned int dist_threshold) { +std::vector _init_prev_row(unsigned int dist_threshold) { std::vector prev_row; prev_row.reserve(2 + 2*dist_threshold); unsigned int cd = 0; @@ -15,9 +12,9 @@ inline std::vector _init_prev_row(unsigned int dist_threshold) { return prev_row; } -inline unsigned CompressedTrie::_calc_dist_impl(const PartialMatch &pm, - const stored_index_t * trie_suff, - const std::size_t trie_len) const { +unsigned CompressedTrie::_calc_dist_impl(const PartialMatch &pm, + const stored_index_t * trie_suff, + const std::size_t trie_len) const { const stored_index_t * quer_suff = pm.query_data(); const unsigned int dist_threshold = pm.max_distance() - pm.curr_distance(); stored_index_t prev_trie_match_char = pm.get_prev_mismatched_trie(); @@ -36,9 +33,9 @@ inline unsigned CompressedTrie::_calc_dist_impl(const PartialMatch &p return pm.curr_distance() + d; } -inline unsigned int _ran_out_of_trie_score(const std::vector & prev_row, - std::size_t first_quer_ind, - std::size_t quer_len) { +unsigned int _ran_out_of_trie_score(const std::vector & prev_row, + std::size_t first_quer_ind, + std::size_t quer_len) { int num_q_left = quer_len - first_quer_ind; if (num_q_left < 0) { assert(prev_row.size() == 1); @@ -59,10 +56,10 @@ inline unsigned int _ran_out_of_trie_score(const std::vector & pre return d; } -inline unsigned int CompressedTrie::_match_cost(stored_char_t prev_q_match_char, - stored_char_t q_match_char, - stored_char_t prev_trie_match_char, - 
stored_char_t trie_match_char) const { +unsigned int CompressedTrie::_match_cost(stored_char_t prev_q_match_char, + stored_char_t q_match_char, + stored_char_t prev_trie_match_char, + stored_char_t trie_match_char) const { if (q_match_char == NO_MATCHING_CHAR_CODE || trie_match_char == NO_MATCHING_CHAR_CODE) { return 1; } @@ -81,7 +78,7 @@ inline unsigned int CompressedTrie::_match_cost(stored_char_t prev_q_match_char, return 1; } -inline unsigned int CompressedTrie::_match_cost_no_transp(stored_char_t q_match_char, +unsigned int CompressedTrie::_match_cost_no_transp(stored_char_t q_match_char, stored_char_t trie_match_char) const { if (q_match_char == NO_MATCHING_CHAR_CODE || trie_match_char == NO_MATCHING_CHAR_CODE) { return 1; @@ -94,12 +91,12 @@ inline unsigned int CompressedTrie::_match_cost_no_transp(stored_char_t q_match_ -inline bool CompressedTrie::_are_equivalent(stored_char_t prev_q, - const stored_index_t * quer_suff, - const std::size_t quer_len, - const stored_index_t * trie_suff, - const std::size_t trie_len, - stored_index_t prev_t) const { +bool CompressedTrie::_are_equivalent(stored_char_t prev_q, + const stored_index_t * quer_suff, + const std::size_t quer_len, + const stored_index_t * trie_suff, + const std::size_t trie_len, + stored_index_t prev_t) const { if (quer_len != trie_len) { return false; } @@ -118,13 +115,13 @@ inline bool CompressedTrie::_are_equivalent(stored_char_t prev_q, } // checks for some easy optimizations, and calls dynamic programming version if needed. 
-inline unsigned int CompressedTrie::_calc_dist_prim_impl(stored_char_t prev_quer_char, - const stored_index_t * quer_suff, - const std::size_t quer_len, - const stored_index_t * trie_suff, - const std::size_t trie_len, - const unsigned int dist_threshold, - stored_index_t prev_trie_match_char) const { +unsigned int CompressedTrie::_calc_dist_prim_impl(stored_char_t prev_quer_char, + const stored_index_t * quer_suff, + const std::size_t quer_len, + const stored_index_t * trie_suff, + const std::size_t trie_len, + const unsigned int dist_threshold, + stored_index_t prev_trie_match_char) const { if (DB_FUZZY_MATCH) { std::cerr << "_calc_dist_prim_impl(pqc=\"" << (prev_quer_char == NO_MATCHING_CHAR_CODE ? "NO_MATCHING_CHAR_CODE" : to_char_str(letters[prev_quer_char])) << ", \""; for (auto i=0U; i < quer_len; ++i) {std::cerr << (quer_suff[i] == NO_MATCHING_CHAR_CODE ? "NO_MATCHING_CHAR_CODE" : to_char_str(letters[quer_suff[i]]));} @@ -210,13 +207,13 @@ inline unsigned int CompressedTrie::_calc_dist_prim_impl(stored_char_t prev_quer } // called after preprocessing by _calc_dist_prim_impl -inline unsigned int CompressedTrie::_dp_calc_dist_prim_impl(stored_char_t prev_quer_char, - const stored_index_t * quer_suff, - const std::size_t quer_len, - const stored_index_t * trie_suff, - const std::size_t trie_len, - const unsigned int dist_threshold, - stored_index_t prev_trie_match_char) const { +unsigned int CompressedTrie::_dp_calc_dist_prim_impl(stored_char_t prev_quer_char, + const stored_index_t * quer_suff, + const std::size_t quer_len, + const stored_index_t * trie_suff, + const std::size_t trie_len, + const unsigned int dist_threshold, + stored_index_t prev_trie_match_char) const { std::size_t prev_quer_ind = 0; std::size_t trie_ind = 0; std::vector prev_row = _init_prev_row(dist_threshold); @@ -312,9 +309,9 @@ inline unsigned int CompressedTrie::_dp_calc_dist_prim_impl(stored_char_t prev_q } -inline bool CompressedTrie::_check_suffix_for_match(const PartialMatch &pm, 
- const stored_index_t * trie_suff, - std::vector & results) const { +bool CompressedTrie::_check_suffix_for_match(const PartialMatch &pm, + const stored_index_t * trie_suff, + std::vector & results) const { if (pm.has_matched_suffix(trie_suff)) { return false; } @@ -344,7 +341,7 @@ inline bool CompressedTrie::_check_suffix_for_match(const PartialMatch &pm) const { +void CompressedTrie::db_write_pm(const char * context, const PartialMatch &pm) const { auto & out = std::cerr; if (context != nullptr) { out << context << " "; @@ -361,9 +358,9 @@ inline void CompressedTrie::db_write_pm(const char * context, const PartialMatch out << ")\n"; } -inline void CompressedTrie::extend_partial_match(const PartialMatch & pm, - std::vector & results, - std::list > & next_alive) const { +void CompressedTrie::extend_partial_match(const PartialMatch & pm, + std::vector & results, + std::list > & next_alive) const { if (DB_FUZZY_MATCH) {db_write_pm("extend", pm);} const CTrieNode * trienode = pm.get_next_node(); if (trienode->is_terminal()) { @@ -404,7 +401,7 @@ inline void CompressedTrie::extend_partial_match(const PartialMatch & } -inline void CompressedTrie::_finish_query_result(FuzzyQueryResult & res) const { +void CompressedTrie::_finish_query_result(FuzzyQueryResult & res) const { res.match_wide_char.clear(); for (auto ind : res.match_coded) { assert(ind != NO_MATCHING_CHAR_CODE); @@ -416,8 +413,8 @@ inline void CompressedTrie::_finish_query_result(FuzzyQueryResult & res) const { } -inline std::vector CompressedTrie::fuzzy_matches(const stored_str_t & query_str, - unsigned int max_dist) const { +std::vector CompressedTrie::fuzzy_matches(const stored_str_t & query_str, + unsigned int max_dist) const { if (DB_FUZZY_MATCH) {std::cerr << "fuzzy_matches (within " << max_dist << " edits) of \"" << to_char_str(query_str) << "\"\n";} if (query_str.length() == 0) { return std::vector{}; @@ -457,4 +454,4 @@ inline std::vector CompressedTrie::fuzzy_matches(const stored_ } // namespace otc 
-#endif + diff --git a/otc/meson.build b/otc/meson.build index 26d98ab9..ba58c5e9 100644 --- a/otc/meson.build +++ b/otc/meson.build @@ -4,6 +4,8 @@ libotcetera_sources = [ 'ctrie/str_utils.cpp', 'ctrie/ctrie_db.cpp', 'ctrie/ctrie_node.cpp', + 'ctrie/ctrie.cpp', + 'ctrie/search_impl.cpp', 'embedded_tree.cpp', 'forest.cpp', 'ftree.cpp', From 42d8b44c80c3da80b741f23849f4673a32a5b951 Mon Sep 17 00:00:00 2001 From: Benjamin Redelings Date: Tue, 5 May 2020 15:22:29 -0700 Subject: [PATCH 077/620] Untemplatize PartialMatch. --- otc/ctrie/ctrie.h | 10 +++++----- otc/ctrie/search_data_models.h | 13 ++++++------- otc/ctrie/search_impl.cpp | 24 ++++++++++++------------ 3 files changed, 23 insertions(+), 24 deletions(-) diff --git a/otc/ctrie/ctrie.h b/otc/ctrie/ctrie.h index ad44793d..1f0ab96b 100644 --- a/otc/ctrie/ctrie.h +++ b/otc/ctrie/ctrie.h @@ -186,11 +186,11 @@ class CompressedTrie { const std::size_t trie_len, const unsigned int dist_threshold, stored_index_t prev_trie_match_char) const; - void extend_partial_match(const PartialMatch &pm, + void extend_partial_match(const PartialMatch &pm, std::vector & results, - std::list > & next_alive) const; - void db_write_pm(const char *, const PartialMatch &pm) const; - unsigned int _calc_dist_impl(const PartialMatch &pm, + std::list & next_alive) const; + void db_write_pm(const char *, const PartialMatch &pm) const; + unsigned int _calc_dist_impl(const PartialMatch &pm, const stored_index_t * suffix, const std::size_t trie_len) const; unsigned int _match_cost(stored_char_t prev_q_match_char, @@ -199,7 +199,7 @@ class CompressedTrie { stored_char_t trie_match_char) const; unsigned int _match_cost_no_transp(stored_char_t q_match_char, stored_char_t trie_match_char) const; - bool _check_suffix_for_match(const PartialMatch &pm, + bool _check_suffix_for_match(const PartialMatch &pm, const stored_index_t * suffix, std::vector & results) const; diff --git a/otc/ctrie/search_data_models.h b/otc/ctrie/search_data_models.h index 
844bb019..74582b1c 100644 --- a/otc/ctrie/search_data_models.h +++ b/otc/ctrie/search_data_models.h @@ -154,13 +154,12 @@ class FQuery { } }; -template class PartialMatch { public: enum creation_modes {MATCH, DOWN, RIGHT}; PartialMatch(const FQuery & q, - const T *nextn) + const CTrieNode *nextn) :query(q), qpos(0), distance(0), @@ -175,7 +174,7 @@ class PartialMatch { PartialMatch(const PartialMatch & prevpm, stored_index_t match_char, unsigned int start_dist, - const T *nextn, + const CTrieNode *nextn, bool was_match) :query(prevpm.query), qpos(prevpm.qpos + 1), @@ -195,7 +194,7 @@ class PartialMatch { // create a partial match from a gap, moving through query but not trie PartialMatch(const PartialMatch & prevpm, unsigned int start_dist, - const T *nextn) + const CTrieNode *nextn) :query(prevpm.query), qpos(prevpm.qpos + 1), distance(start_dist), @@ -210,7 +209,7 @@ class PartialMatch { // create a partial match from a gap, moving through trie but not query PartialMatch(const PartialMatch & prevpm, unsigned int start_dist, - const T *nextn, + const CTrieNode *nextn, stored_index_t match_char) :query(prevpm.query), qpos(prevpm.qpos), @@ -244,7 +243,7 @@ class PartialMatch { return true; } - const T * get_next_node() const { + const CTrieNode * get_next_node() const { return next_node; } @@ -290,7 +289,7 @@ class PartialMatch { std::size_t qpos; stored_str_t growing_match; unsigned int distance; - const T * next_node; + const CTrieNode * next_node; stored_index_t prev_mismatched_trie; std::vector match_coded; const creation_modes create_mode; diff --git a/otc/ctrie/search_impl.cpp b/otc/ctrie/search_impl.cpp index 9052e1ae..f88d3e42 100644 --- a/otc/ctrie/search_impl.cpp +++ b/otc/ctrie/search_impl.cpp @@ -12,7 +12,7 @@ std::vector _init_prev_row(unsigned int dist_threshold) { return prev_row; } -unsigned CompressedTrie::_calc_dist_impl(const PartialMatch &pm, +unsigned CompressedTrie::_calc_dist_impl(const PartialMatch &pm, const stored_index_t * trie_suff, const 
std::size_t trie_len) const { const stored_index_t * quer_suff = pm.query_data(); @@ -309,7 +309,7 @@ unsigned int CompressedTrie::_dp_calc_dist_prim_impl(stored_char_t prev_quer_cha } -bool CompressedTrie::_check_suffix_for_match(const PartialMatch &pm, +bool CompressedTrie::_check_suffix_for_match(const PartialMatch &pm, const stored_index_t * trie_suff, std::vector & results) const { if (pm.has_matched_suffix(trie_suff)) { @@ -341,7 +341,7 @@ bool CompressedTrie::_check_suffix_for_match(const PartialMatch &pm, } -void CompressedTrie::db_write_pm(const char * context, const PartialMatch &pm) const { +void CompressedTrie::db_write_pm(const char * context, const PartialMatch &pm) const { auto & out = std::cerr; if (context != nullptr) { out << context << " "; @@ -358,9 +358,9 @@ void CompressedTrie::db_write_pm(const char * context, const PartialMatch & pm, +void CompressedTrie::extend_partial_match(const PartialMatch & pm, std::vector & results, - std::list > & next_alive) const { + std::list & next_alive) const { if (DB_FUZZY_MATCH) {db_write_pm("extend", pm);} const CTrieNode * trienode = pm.get_next_node(); if (trienode->is_terminal()) { @@ -379,18 +379,18 @@ void CompressedTrie::extend_partial_match(const PartialMatch & pm, const CTrieNode * next_nd = &(node_vec[next_ind]); if (trie_char == qc || trie_char == altqc) { if (DB_FUZZY_MATCH) {std::cerr << "matched " << to_char_str(letters[trie_char]) << " in pre adding extended pm.\n";} - next_alive.push_back(PartialMatch{pm, trie_char, cd, next_nd, false}); + next_alive.push_back(PartialMatch{pm, trie_char, cd, next_nd, false}); } else if (cd + 1 <= max_dist) { if (DB_FUZZY_MATCH) {std::cerr << "mismatched " << to_char_str(letters[trie_char]) << " in pre adding extended pm.\n";} - next_alive.push_back(PartialMatch{pm, trie_char, cd + 1, next_nd, true}); + next_alive.push_back(PartialMatch{pm, trie_char, cd + 1, next_nd, true}); if (pm.can_rightshift()) { - next_alive.push_back(PartialMatch{pm, cd + 1, next_nd, 
trie_char}); // rightshift + next_alive.push_back(PartialMatch{pm, cd + 1, next_nd, trie_char}); // rightshift } } } // frameshift if (cd + 1 <= max_dist && pm.can_downshift()) { - next_alive.push_back(PartialMatch{pm, cd + 1, trienode}); //downshift + next_alive.push_back(PartialMatch{pm, cd + 1, trienode}); //downshift } if (trienode->is_key_terminating()) { auto d = pm.num_q_char_left() + pm.curr_distance(); @@ -434,11 +434,11 @@ std::vector CompressedTrie::fuzzy_matches(const stored_str_t & std::vector results; results.reserve(20); const CTrieNode * root_nd = &(node_vec.at(0)); - std::list> alive; - alive.push_back(PartialMatch{query, root_nd}); + std::list alive; + alive.push_back(PartialMatch{query, root_nd}); while (!alive.empty()) { if (DB_FUZZY_MATCH) {std::cerr << " " << alive.size() << " alive partial matches and " << results.size() << " hits.\n";} - std::list> next_alive; + std::list next_alive; for (const auto & pm : alive) { auto prevnalen = next_alive.size(); extend_partial_match(pm, results, next_alive); From 9499755ea53731769e61557b8c2b92fc84245332 Mon Sep 17 00:00:00 2001 From: Benjamin Redelings Date: Tue, 5 May 2020 15:32:41 -0700 Subject: [PATCH 078/620] Remove unused field PartialMatch::growing_match --- otc/ctrie/search_data_models.h | 1 - 1 file changed, 1 deletion(-) diff --git a/otc/ctrie/search_data_models.h b/otc/ctrie/search_data_models.h index 74582b1c..92badd31 100644 --- a/otc/ctrie/search_data_models.h +++ b/otc/ctrie/search_data_models.h @@ -287,7 +287,6 @@ class PartialMatch { private: const FQuery & query; std::size_t qpos; - stored_str_t growing_match; unsigned int distance; const CTrieNode * next_node; stored_index_t prev_mismatched_trie; From 9e2e5da91168d70534f2d695d6515e9004a24d91 Mon Sep 17 00:00:00 2001 From: Benjamin Redelings Date: Tue, 5 May 2020 16:08:13 -0700 Subject: [PATCH 079/620] Use depth-first search on ctrie instead of breadth-first search. 
--- otc/ctrie/ctrie.h | 6 ++-- otc/ctrie/search_impl.cpp | 72 +++++++++++++++++++++------------------ 2 files changed, 41 insertions(+), 37 deletions(-) diff --git a/otc/ctrie/ctrie.h b/otc/ctrie/ctrie.h index 1f0ab96b..7c4b9af2 100644 --- a/otc/ctrie/ctrie.h +++ b/otc/ctrie/ctrie.h @@ -186,9 +186,9 @@ class CompressedTrie { const std::size_t trie_len, const unsigned int dist_threshold, stored_index_t prev_trie_match_char) const; - void extend_partial_match(const PartialMatch &pm, - std::vector & results, - std::list & next_alive) const; + + void extend_partial_match(const PartialMatch &pm, std::vector & results) const; + void db_write_pm(const char *, const PartialMatch &pm) const; unsigned int _calc_dist_impl(const PartialMatch &pm, const stored_index_t * suffix, diff --git a/otc/ctrie/search_impl.cpp b/otc/ctrie/search_impl.cpp index f88d3e42..b163579c 100644 --- a/otc/ctrie/search_impl.cpp +++ b/otc/ctrie/search_impl.cpp @@ -358,41 +358,52 @@ void CompressedTrie::db_write_pm(const char * context, const PartialMatch &pm) c out << ")\n"; } -void CompressedTrie::extend_partial_match(const PartialMatch & pm, - std::vector & results, - std::list & next_alive) const { +void CompressedTrie::extend_partial_match(const PartialMatch & pm, std::vector & results) const +{ if (DB_FUZZY_MATCH) {db_write_pm("extend", pm);} + const CTrieNode * trienode = pm.get_next_node(); + if (trienode->is_terminal()) { auto suffix_index = trienode->get_index(); _check_suffix_for_match(pm, get_suffix_as_indices(suffix_index), results); return; } + const unsigned int max_dist = pm.max_distance(); auto cd = pm.curr_distance(); auto qc = pm.query_char(); auto altqc = equivalent_letter[qc]; if (DB_FUZZY_MATCH) {trienode->log_state();} - for (auto [trie_char, next_ind] : trienode->children()) + for (auto [letter, index] : trienode->children()) { - const CTrieNode * next_nd = &(node_vec[next_ind]); - if (trie_char == qc || trie_char == altqc) { - if (DB_FUZZY_MATCH) {std::cerr << "matched " << 
to_char_str(letters[trie_char]) << " in pre adding extended pm.\n";} - next_alive.push_back(PartialMatch{pm, trie_char, cd, next_nd, false}); - } else if (cd + 1 <= max_dist) { - if (DB_FUZZY_MATCH) {std::cerr << "mismatched " << to_char_str(letters[trie_char]) << " in pre adding extended pm.\n";} - next_alive.push_back(PartialMatch{pm, trie_char, cd + 1, next_nd, true}); - if (pm.can_rightshift()) { - next_alive.push_back(PartialMatch{pm, cd + 1, next_nd, trie_char}); // rightshift - } + const CTrieNode * next_nd = &(node_vec[index]); + if (letter == qc || letter == altqc) + { + if (DB_FUZZY_MATCH) {std::cerr << "matched " << to_char_str(letters[letter]) << " in pre adding extended pm.\n";} + + extend_partial_match(PartialMatch{pm, letter, cd, next_nd, false}, results); + } + else if (cd + 1 <= max_dist) + { + if (DB_FUZZY_MATCH) {std::cerr << "mismatched " << to_char_str(letters[letter]) << " in pre adding extended pm.\n";} + + extend_partial_match(PartialMatch{pm, letter, cd + 1, next_nd, true}, results); + + if (pm.can_rightshift()) + extend_partial_match(PartialMatch{pm, cd + 1, next_nd, letter}, results); + } } // frameshift - if (cd + 1 <= max_dist && pm.can_downshift()) { - next_alive.push_back(PartialMatch{pm, cd + 1, trienode}); //downshift + if (cd + 1 <= max_dist && pm.can_downshift()) + { + extend_partial_match(PartialMatch{pm, cd + 1, trienode}, results); } - if (trienode->is_key_terminating()) { + + if (trienode->is_key_terminating()) + { auto d = pm.num_q_char_left() + pm.curr_distance(); if (d <= max_dist) { pm.store_result(results, nullptr, 0, d); @@ -413,8 +424,8 @@ void CompressedTrie::_finish_query_result(FuzzyQueryResult & res) const { } -std::vector CompressedTrie::fuzzy_matches(const stored_str_t & query_str, - unsigned int max_dist) const { +std::vector CompressedTrie::fuzzy_matches(const stored_str_t & query_str, unsigned int max_dist) const +{ if (DB_FUZZY_MATCH) {std::cerr << "fuzzy_matches (within " << max_dist << " edits) of \"" << 
to_char_str(query_str) << "\"\n";} if (query_str.length() == 0) { return std::vector{}; @@ -433,22 +444,15 @@ std::vector CompressedTrie::fuzzy_matches(const stored_str_t & // non-trivial case std::vector results; results.reserve(20); - const CTrieNode * root_nd = &(node_vec.at(0)); - std::list alive; - alive.push_back(PartialMatch{query, root_nd}); - while (!alive.empty()) { - if (DB_FUZZY_MATCH) {std::cerr << " " << alive.size() << " alive partial matches and " << results.size() << " hits.\n";} - std::list next_alive; - for (const auto & pm : alive) { - auto prevnalen = next_alive.size(); - extend_partial_match(pm, results, next_alive); - if (DB_FUZZY_MATCH) {if (next_alive.size() != prevnalen) {std::cerr << "added " << next_alive.size() - prevnalen << " PMs.\n"; }} - } - std::swap(alive, next_alive); - } - for (auto & r : results) { + + auto root_node = &(node_vec.at(0)); + + // Do a depth-first search using the stack. + extend_partial_match(PartialMatch(query, root_node), results); + + for (auto & r : results) _finish_query_result(r); - } + return results; } From 684d62cde8da2f148deb44dcb45fa9a3ef80b71c Mon Sep 17 00:00:00 2001 From: Benjamin Redelings Date: Tue, 5 May 2020 19:48:06 -0700 Subject: [PATCH 080/620] Make sure we return [] instead of null if there are no matches. 
--- otc/ws/tnrsws.cpp | 2 +- .../match_names/empty/expected.json | 25 +++++++++++++++++++ .../match_names/empty/method.json | 7 ++++++ 3 files changed, 33 insertions(+), 1 deletion(-) create mode 100644 test/tnrs/expectedws-induced/match_names/empty/expected.json create mode 100644 test/tnrs/expectedws-induced/match_names/empty/method.json diff --git a/otc/ws/tnrsws.cpp b/otc/ws/tnrsws.cpp index fd667d9a..048accae 100644 --- a/otc/ws/tnrsws.cpp +++ b/otc/ws/tnrsws.cpp @@ -368,7 +368,7 @@ pair ContextSearcher::match_name(const string & raw_query, bool do_approximate_matching, bool include_suppressed) { auto query = normalize_query(raw_query); - json results; + json results = json::array(); match_status status = unmatched; // 1. See if we can find an exact name match auto exact_name_matches = exact_name_search(taxonomy, context_root, query, include_suppressed); diff --git a/test/tnrs/expectedws-induced/match_names/empty/expected.json b/test/tnrs/expectedws-induced/match_names/empty/expected.json new file mode 100644 index 00000000..f1cef1de --- /dev/null +++ b/test/tnrs/expectedws-induced/match_names/empty/expected.json @@ -0,0 +1,25 @@ +{ + "context": "All life", + "governing_code": "undefined", + "includes_approximate_matches": false, + "includes_deprecated_taxa": false, + "includes_suppressed_names": false, + "matched_names": [], + "results": [ + { + "matches": [], + "name": "asdfghjkl" + } + ], + "taxonomy": { + "author": "open tree of life project", + "name": "ott", + "source": "ott0.0draft0", + "version": "0.0", + "weburl": "https://tree.opentreeoflife.org/about/taxonomy-version/ott0.0" + }, + "unambiguous_names": [], + "unmatched_names": [ + "asdfghjkl" + ] +} \ No newline at end of file diff --git a/test/tnrs/expectedws-induced/match_names/empty/method.json b/test/tnrs/expectedws-induced/match_names/empty/method.json new file mode 100644 index 00000000..f39d6f3c --- /dev/null +++ b/test/tnrs/expectedws-induced/match_names/empty/method.json @@ -0,0 +1,7 @@ +{ 
+ "url_fragment": "v3/tnrs/match_names", + "verb": "POST", + "arguments": { + "names": ["asdfghjkl"] + } +} From 5d14980f067ef0f20e5bfb0d772b680bc87261be Mon Sep 17 00:00:00 2001 From: Benjamin Redelings Date: Wed, 6 May 2020 12:14:01 -0700 Subject: [PATCH 081/620] Compute match_coded when needed. --- otc/ctrie/search_data_models.h | 79 ++++++++++++++++++++-------------- 1 file changed, 46 insertions(+), 33 deletions(-) diff --git a/otc/ctrie/search_data_models.h b/otc/ctrie/search_data_models.h index 92badd31..c39dad60 100644 --- a/otc/ctrie/search_data_models.h +++ b/otc/ctrie/search_data_models.h @@ -4,6 +4,7 @@ #include #include #include +#include #include "otc/otc_base_includes.h" #include "otc/ctrie/str_utils.h" @@ -165,10 +166,9 @@ class PartialMatch { distance(0), next_node(nextn), prev_mismatched_trie(NO_MATCHING_CHAR_CODE), - create_mode(creation_modes::MATCH) { - auto max_match_len = q.max_dist + q.as_indices.size(); - match_coded.reserve(max_match_len); - } + create_mode(creation_modes::MATCH) + { + } // create a partial match previous match and a char match PartialMatch(const PartialMatch & prevpm, @@ -176,52 +176,51 @@ class PartialMatch { unsigned int start_dist, const CTrieNode *nextn, bool was_match) - :query(prevpm.query), + :prev_match(&prevpm), + letter(match_char), + query(prevpm.query), qpos(prevpm.qpos + 1), distance(start_dist), next_node(nextn), prev_mismatched_trie(NO_MATCHING_CHAR_CODE), - create_mode(creation_modes::MATCH) { - match_coded.reserve(prevpm.match_coded.capacity()); - match_coded = prevpm.match_coded; - match_coded.push_back(match_char); - if (!was_match) { - prev_mismatched_trie = match_char; + create_mode(creation_modes::MATCH) + { + if (not was_match) + prev_mismatched_trie = match_char; + assert(nextn != prevpm.next_node); } - assert(nextn != prevpm.next_node); - } // create a partial match from a gap, moving through query but not trie PartialMatch(const PartialMatch & prevpm, unsigned int start_dist, const CTrieNode 
*nextn) - :query(prevpm.query), + :prev_match(&prevpm), + query(prevpm.query), qpos(prevpm.qpos + 1), distance(start_dist), next_node(nextn), prev_mismatched_trie(NO_MATCHING_CHAR_CODE), - create_mode(creation_modes::DOWN) { - match_coded.reserve(prevpm.match_coded.capacity()); - match_coded = prevpm.match_coded; - assert(nextn == prevpm.next_node); + create_mode(creation_modes::DOWN) + { + assert(nextn == prevpm.next_node); + } - } // create a partial match from a gap, moving through trie but not query PartialMatch(const PartialMatch & prevpm, unsigned int start_dist, const CTrieNode *nextn, stored_index_t match_char) - :query(prevpm.query), + :prev_match(&prevpm), + letter(match_char), + query(prevpm.query), qpos(prevpm.qpos), distance(start_dist), next_node(nextn), prev_mismatched_trie(NO_MATCHING_CHAR_CODE), - create_mode(creation_modes::RIGHT) { - match_coded.reserve(prevpm.match_coded.capacity()); - match_coded = prevpm.match_coded; - match_coded.push_back(match_char); - assert(nextn != prevpm.next_node); - } + create_mode(creation_modes::RIGHT) + { + assert(nextn != prevpm.next_node); + } bool can_downshift() const { return create_mode != creation_modes::RIGHT; @@ -259,10 +258,6 @@ class PartialMatch { return query.max_dist; } - const std::vector & get_prev_match_coded() const { - return match_coded; - } - const stored_index_t * query_data() const { return &(query.as_indices[0]); } @@ -282,15 +277,33 @@ class PartialMatch { stored_index_t get_prev_mismatched_trie() const { return prev_mismatched_trie; } - - private: + void make_match_coded(std::vector& s) const + { + if (letter) + s.push_back(*letter); + if (prev_match) + prev_match->make_match_coded(s); + } + + std::vector get_prev_match_coded() const + { + std::vector s; + s.reserve(query.max_dist + query.as_indices.size()); + make_match_coded(s); + std::reverse(s.begin(), s.end()); + return s; + } + +private: + const PartialMatch* prev_match = nullptr; + std::optional letter; + const FQuery & query; 
std::size_t qpos; unsigned int distance; const CTrieNode * next_node; stored_index_t prev_mismatched_trie; - std::vector match_coded; const creation_modes create_mode; }; From f1889338f137b2e65029962afc06e87f3d068dab Mon Sep 17 00:00:00 2001 From: Benjamin Redelings Date: Wed, 6 May 2020 12:54:03 -0700 Subject: [PATCH 082/620] Don't use match_coded() just to get the last emitted target letter. --- otc/ctrie/search_data_models.h | 14 +++++++++----- otc/ctrie/search_impl.cpp | 3 ++- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/otc/ctrie/search_data_models.h b/otc/ctrie/search_data_models.h index c39dad60..d9340a1d 100644 --- a/otc/ctrie/search_data_models.h +++ b/otc/ctrie/search_data_models.h @@ -177,7 +177,7 @@ class PartialMatch { const CTrieNode *nextn, bool was_match) :prev_match(&prevpm), - letter(match_char), + match_letter(match_char), query(prevpm.query), qpos(prevpm.qpos + 1), distance(start_dist), @@ -211,7 +211,7 @@ class PartialMatch { const CTrieNode *nextn, stored_index_t match_char) :prev_match(&prevpm), - letter(match_char), + match_letter(match_char), query(prevpm.query), qpos(prevpm.qpos), distance(start_dist), @@ -280,8 +280,8 @@ class PartialMatch { void make_match_coded(std::vector& s) const { - if (letter) - s.push_back(*letter); + if (match_letter) + s.push_back(*match_letter); if (prev_match) prev_match->make_match_coded(s); } @@ -295,9 +295,13 @@ class PartialMatch { return s; } + std::optional prev_match_letter() const + { + return match_letter; + } private: const PartialMatch* prev_match = nullptr; - std::optional letter; + std::optional match_letter; const FQuery & query; std::size_t qpos; diff --git a/otc/ctrie/search_impl.cpp b/otc/ctrie/search_impl.cpp index b163579c..69357df2 100644 --- a/otc/ctrie/search_impl.cpp +++ b/otc/ctrie/search_impl.cpp @@ -21,7 +21,8 @@ unsigned CompressedTrie::_calc_dist_impl(const PartialMatch &pm, stored_index_t prev_query_char = NO_MATCHING_CHAR_CODE; if (prev_trie_match_char != 
NO_MATCHING_CHAR_CODE) { assert(pm.query_pos() > 0); - prev_query_char = *(pm.get_prev_match_coded().rbegin()); + assert(pm.prev_match_letter()); + prev_query_char = *pm.prev_match_letter(); } auto d = _calc_dist_prim_impl(prev_query_char, quer_suff + pm.query_pos(), From dc68c1fda37b3a61c3dfc22eee50511b2157ed4b Mon Sep 17 00:00:00 2001 From: Benjamin Redelings Date: Wed, 6 May 2020 12:54:12 -0700 Subject: [PATCH 083/620] Add some notes. --- otc/ctrie/search_data_models.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/otc/ctrie/search_data_models.h b/otc/ctrie/search_data_models.h index d9340a1d..e0b9270f 100644 --- a/otc/ctrie/search_data_models.h +++ b/otc/ctrie/search_data_models.h @@ -299,6 +299,15 @@ class PartialMatch { { return match_letter; } + +// We are calling 4 things a "match": +// * a path through the DP matrix (e.g. "PartialMatch") +// * the string we are aligning to the query (e.g. "get_prev_match_coded()") +// * creation_modes::MATCH -- to move both DOWN and RIGHT +// * if we do creation_modes::MATCH, then did both letters agree. ("bool was_match") + +// Following all paths through the DP matrix is actually exponential, not quadratic! + private: const PartialMatch* prev_match = nullptr; std::optional match_letter; From 5989d82981e279130f32d558600aae4dfae21888 Mon Sep 17 00:00:00 2001 From: Benjamin Redelings Date: Wed, 6 May 2020 14:28:20 -0700 Subject: [PATCH 084/620] It looks like the was_match argument was inverted. --- otc/ctrie/search_impl.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/otc/ctrie/search_impl.cpp b/otc/ctrie/search_impl.cpp index 69357df2..086d150b 100644 --- a/otc/ctrie/search_impl.cpp +++ b/otc/ctrie/search_impl.cpp @@ -384,13 +384,13 @@ void CompressedTrie::extend_partial_match(const PartialMatch & pm, std::vector Date: Wed, 6 May 2020 15:07:29 -0700 Subject: [PATCH 085/620] Remove equivalent_letter machinery. 
We handle this by normalizing names and queries, which maps letters to their lower-case versions. --- otc/ctrie/ctrie.cpp | 35 ----------------------------------- otc/ctrie/ctrie.h | 4 ---- otc/ctrie/search_impl.cpp | 11 +++++------ 3 files changed, 5 insertions(+), 45 deletions(-) diff --git a/otc/ctrie/ctrie.cpp b/otc/ctrie/ctrie.cpp index f67e6fc9..88d1ee11 100644 --- a/otc/ctrie/ctrie.cpp +++ b/otc/ctrie/ctrie.cpp @@ -88,33 +88,6 @@ void CompressedTrie::_store_suffix_node(CTrieNode & curr_node, } -void CompressedTrie::fill_equivalent_letter_array() { - equivalent_letter.reserve(letters.length()); - equivalent_letter.clear(); - for (auto nl : letters) { - std::string uncov = to_char_str(nl); - std::string lccov = lower_case_version(uncov); - stored_index_t char_ind = NO_MATCHING_CHAR_CODE; - if (lccov != uncov) { - auto alt = to_u32string(lccov); - if (alt.length() != 1) { - throw OTCError() << "lower case version of \"" << uncov << "\" was not one character: \"" << lccov << "\"\n"; - } - char_ind = ctrie_get_index_for_letter(alt[0]); - } else { - std::string uccov = upper_case_version(uncov); - if (uccov != uncov) { - auto alt = to_u32string(uccov); - if (alt.length() != 1) { - throw OTCError() << "lower case version of \"" << uncov << "\" was not one character: \"" << uccov << "\"\n"; - } - char_ind = ctrie_get_index_for_letter(alt[0]); - } - } - equivalent_letter.push_back(char_ind); - } -} - void CompressedTrie::init(const ctrie_init_set_t & keys, const stored_str_t & letter_var) { clear(); // max_node_index = 0; @@ -135,7 +108,6 @@ void CompressedTrie::init(const ctrie_init_set_t & keys, const stored_str_t & le for (auto nl : letters) { letter_to_ind[nl] = curr_ind++; } - fill_equivalent_letter_array(); null_char_index = letters.length(); letters.append(1, '\0'); @@ -214,13 +186,6 @@ void CompressedTrie::init(const ctrie_init_set_t & keys, const stored_str_t & le // std::cerr << "ROOT child for \"" << to_char_str(letters[trie_char]) << "\" "; 
next_nd->log_state(); } - for (unsigned int eli = 0; eli < equivalent_letter.size(); ++eli) { - if (equivalent_letter[eli] == NO_MATCHING_CHAR_CODE) { - std::cerr << to_char_str(letters[eli]) << " = \n"; - } else { - std::cerr << to_char_str(letters[eli]) << " = " << to_char_str(letters[equivalent_letter[eli]]) << "\n"; - } - } auto nvs = sizeof(CTrieNode)*node_vec.size(); auto suffs = concat_suff.size(); std::cerr << "vecsize = " << nvs << " bytes\n"; diff --git a/otc/ctrie/ctrie.h b/otc/ctrie/ctrie.h index 7c4b9af2..c45d1cec 100644 --- a/otc/ctrie/ctrie.h +++ b/otc/ctrie/ctrie.h @@ -43,7 +43,6 @@ class CompressedTrie { std::set ils{std::begin(inp_letters), std::end(inp_letters)}; std::string uniq{std::begin(ils), std::end(ils)}; letters = to_u32string(uniq); - fill_equivalent_letter_array(); null_char_index = letters.size(); } @@ -144,7 +143,6 @@ class CompressedTrie { private: void init(const ctrie_init_set_t & keys, const stored_str_t & letter_var); - void fill_equivalent_letter_array(); void _process_prefix(const stored_str_t & curr_pref, std::stack & todo_q, const stored_str_t & rev_letters, @@ -171,7 +169,6 @@ class CompressedTrie { concat_suff.clear(); node_vec.clear(); letter_to_ind.clear(); - equivalent_letter.clear(); } bool _are_equivalent(stored_char_t prev_q, const stored_index_t * quer_suff, @@ -212,7 +209,6 @@ class CompressedTrie { } std::unordered_map letter_to_ind; - std::vector equivalent_letter; stored_str_t letters; std::list node_list; std::vector concat_suff; diff --git a/otc/ctrie/search_impl.cpp b/otc/ctrie/search_impl.cpp index 086d150b..50a6b4a3 100644 --- a/otc/ctrie/search_impl.cpp +++ b/otc/ctrie/search_impl.cpp @@ -64,15 +64,15 @@ unsigned int CompressedTrie::_match_cost(stored_char_t prev_q_match_char, if (q_match_char == NO_MATCHING_CHAR_CODE || trie_match_char == NO_MATCHING_CHAR_CODE) { return 1; } - if (q_match_char == trie_match_char || q_match_char == equivalent_letter[trie_match_char]) { + if (q_match_char == trie_match_char) 
{ return 0; } if (prev_trie_match_char == NO_MATCHING_CHAR_CODE) { // transposition is not possible return 1; } - if ((prev_q_match_char == trie_match_char || prev_q_match_char == equivalent_letter[trie_match_char]) - && (q_match_char == prev_trie_match_char || q_match_char == equivalent_letter[prev_trie_match_char])) { + if ((prev_q_match_char == trie_match_char) + && (q_match_char == prev_trie_match_char)) { // transpostion, don't double penalize return 0; } @@ -84,7 +84,7 @@ unsigned int CompressedTrie::_match_cost_no_transp(stored_char_t q_match_char, if (q_match_char == NO_MATCHING_CHAR_CODE || trie_match_char == NO_MATCHING_CHAR_CODE) { return 1; } - if (q_match_char == trie_match_char || q_match_char == equivalent_letter[trie_match_char]) { + if (q_match_char == trie_match_char) { return 0; } return 1; @@ -374,13 +374,12 @@ void CompressedTrie::extend_partial_match(const PartialMatch & pm, std::vectorlog_state();} for (auto [letter, index] : trienode->children()) { const CTrieNode * next_nd = &(node_vec[index]); - if (letter == qc || letter == altqc) + if (letter == qc) { if (DB_FUZZY_MATCH) {std::cerr << "matched " << to_char_str(letters[letter]) << " in pre adding extended pm.\n";} From deb7970fced01b88f143269a456a2b580b1445a7 Mon Sep 17 00:00:00 2001 From: Benjamin Redelings Date: Tue, 12 May 2020 14:43:48 -0700 Subject: [PATCH 086/620] Add method to find child for a single letter. 
--- otc/ctrie/ctrie_node.cpp | 18 ++++++++++++++++++ otc/ctrie/ctrie_node.h | 2 ++ 2 files changed, 20 insertions(+) diff --git a/otc/ctrie/ctrie_node.cpp b/otc/ctrie/ctrie_node.cpp index ede25be3..0cdd3991 100644 --- a/otc/ctrie/ctrie_node.cpp +++ b/otc/ctrie/ctrie_node.cpp @@ -1,5 +1,7 @@ #include "otc/ctrie/ctrie_node.h" +using std::optional; + namespace otc { void CTrieNode::flag_letter(unsigned int i) @@ -15,4 +17,20 @@ void CTrieNode::flag_letter(unsigned int i) } } +optional CTrieNode::child_index_for_letter(stored_index_t letter) const +{ + auto lbits = get_letter_bits(); + + // We don't have a bit for that letter. + if ((lbits & (ONE_64<<(63-letter))) == 0) return {}; + + // remove bits for previous letters + uint64_t mask = ((uint64_t)(-1))<<(64-letter); + + // The number of letters BEFORE this letter + int delta = __builtin_popcountl(mask & lbits); + + return get_index() + delta; +} + } diff --git a/otc/ctrie/ctrie_node.h b/otc/ctrie/ctrie_node.h index 25d579fa..d02bddf3 100644 --- a/otc/ctrie/ctrie_node.h +++ b/otc/ctrie/ctrie_node.h @@ -5,6 +5,7 @@ #include #include #include +#include #include "otc/error.h" #include "otc/otc_base_includes.h" @@ -184,6 +185,7 @@ class CTrieNode { set_index(index); } + std::optional child_index_for_letter(stored_index_t letter) const; void flag_letter(unsigned int i); }; From 9c61e225e1ff43900f93ddec5619ef9591e2e48b Mon Sep 17 00:00:00 2001 From: Benjamin Redelings Date: Tue, 12 May 2020 14:44:35 -0700 Subject: [PATCH 087/620] Add method to do a prefix_query( ). 
--- otc/ctrie/ctrie.h | 3 +++ otc/ctrie/search_impl.cpp | 49 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 52 insertions(+) diff --git a/otc/ctrie/ctrie.h b/otc/ctrie/ctrie.h index c45d1cec..45d1350d 100644 --- a/otc/ctrie/ctrie.h +++ b/otc/ctrie/ctrie.h @@ -208,6 +208,9 @@ class CompressedTrie { return ltiit->second; } + void all_descendants(stored_str_t& prefix, uint64_t index, std::vector& results) const; + std::vector prefix_query(const stored_str_t& uquery) const; + std::unordered_map letter_to_ind; stored_str_t letters; std::list node_list; diff --git a/otc/ctrie/search_impl.cpp b/otc/ctrie/search_impl.cpp index 50a6b4a3..9968b5bc 100644 --- a/otc/ctrie/search_impl.cpp +++ b/otc/ctrie/search_impl.cpp @@ -1,5 +1,8 @@ #include "otc/ctrie/ctrie.h" +using std::vector; +using std::string; + namespace otc { std::vector _init_prev_row(unsigned int dist_threshold) { @@ -456,6 +459,52 @@ std::vector CompressedTrie::fuzzy_matches(const stored_str_t & return results; } +void CompressedTrie::all_descendants(stored_str_t& prefix, uint64_t index, vector& results) const +{ + auto& node = node_vec[index]; + + if (node.is_key_terminating()) + results.push_back(to_char_str(prefix)); + + if (node.is_terminal()) { + auto suffix_index = node.get_index(); + auto suffix = get_suffix(suffix_index); + results.push_back(to_char_str(prefix + suffix)); + } + else + { + for (auto [letter, next_index] : node.children()) + { + prefix.push_back(letters[letter]); + all_descendants(prefix, next_index, results); + prefix.pop_back(); + } + } +} + +vector CompressedTrie::prefix_query(const stored_str_t& uquery) const +{ + if (node_vec.empty()) return {}; + + auto letters = encode_as_indices(uquery); + + int index = 0; + for(auto letter: letters) + { + auto next_index = node_vec[index].child_index_for_letter(letter); + if (next_index) + index = *next_index; + else + return {}; + } + + vector results; + auto prefix = uquery; + all_descendants(prefix, index, results); + + return results; 
+} + } // namespace otc From d27743ce7655b2f1726b2d63be261d8efcc61639 Mon Sep 17 00:00:00 2001 From: Benjamin Redelings Date: Tue, 12 May 2020 14:45:10 -0700 Subject: [PATCH 088/620] Add prefix_query method to the ctrie db. --- otc/ctrie/ctrie_db.cpp | 18 ++++++++++++++++++ otc/ctrie/ctrie_db.h | 1 + 2 files changed, 19 insertions(+) diff --git a/otc/ctrie/ctrie_db.cpp b/otc/ctrie/ctrie_db.cpp index 4c8fb338..888b7f6a 100644 --- a/otc/ctrie/ctrie_db.cpp +++ b/otc/ctrie/ctrie_db.cpp @@ -1,5 +1,8 @@ #include "otc/ctrie/ctrie_db.h" +using std::vector; +using std::string; + namespace otc { std::set CompressedTrieBasedDB::fuzzy_query(const std::string & query_str) const { @@ -42,6 +45,21 @@ std::set CompressedTrieBasedDB::exact_ return sorted; } +vector CompressedTrieBasedDB::prefix_query(const std::string & query_str) const +{ + auto conv_query = to_u32string(query_str); + + auto sorted = thin_trie.prefix_query(conv_query); + + auto from_full = wide_trie.prefix_query(conv_query); + sorted.insert(sorted.end(), std::begin(from_full), std::end(from_full)); + + // I'm not sure this is a good idea... + std::sort(sorted.begin(), sorted.end()); + + return sorted; +} + void CompressedTrieBasedDB::initialize(const std::set & keys) { ctrie_init_set_t for_wide; ctrie_init_set_t for_thin; diff --git a/otc/ctrie/ctrie_db.h b/otc/ctrie/ctrie_db.h index 30ecc30c..c828008d 100644 --- a/otc/ctrie/ctrie_db.h +++ b/otc/ctrie/ctrie_db.h @@ -11,6 +11,7 @@ class CompressedTrieBasedDB { void initialize(const std::set & keys); std::set fuzzy_query(const std::string & query_str) const; std::set exact_query(const std::string & query_str) const; + std::vector prefix_query(const std::string & query_str) const; private: CompressedTrie wide_trie; CompressedTrie thin_trie; From 9ed587aaab2f66912fe31ab334d580fb20a0087d Mon Sep 17 00:00:00 2001 From: Benjamin Redelings Date: Tue, 12 May 2020 14:45:59 -0700 Subject: [PATCH 089/620] Add method to do a prefix_query on a ContextAware DB. 
--- otc/ctrie/context_ctrie_db.cpp | 66 ++++++++++++++++++++++++++++++++++ otc/ctrie/context_ctrie_db.h | 7 ++++ 2 files changed, 73 insertions(+) diff --git a/otc/ctrie/context_ctrie_db.cpp b/otc/ctrie/context_ctrie_db.cpp index 5326b844..43df3418 100644 --- a/otc/ctrie/context_ctrie_db.cpp +++ b/otc/ctrie/context_ctrie_db.cpp @@ -101,6 +101,15 @@ optional ContextAwareCTrieBasedDB::exact_query(const std::string & query return {}; } +vector ContextAwareCTrieBasedDB::prefix_query(const std::string & query_str) const +{ + auto nquery = normalize_query(query_str); + + if (nquery.size() < 3) return {}; + + return trie.prefix_query(nquery); +} + using vec_fqr_w_t = std::vector; vec_fqr_w_t ContextAwareCTrieBasedDB::to_taxa(const set& sorted, const RTRichTaxNode * context_root, @@ -198,6 +207,63 @@ ContextAwareCTrieBasedDB::to_taxa(const optional& n_query, return results; } +vector +ContextAwareCTrieBasedDB::to_taxa(const vector& n_queries, + const RTRichTaxNode * context_root, + const RichTaxonomy & /*taxonomy*/, + bool include_suppressed) const +{ + if (n_queries.empty()) + { + LOG(DEBUG) << "no matches"; + return {}; + } + + vector results; + + const auto & tax_data = context_root->get_data(); + const auto filter_trav_enter = tax_data.trav_enter; + const auto filter_trav_exit = tax_data.trav_exit; + + for(auto& n_query: n_queries) + { + const auto & vec_taxon_and_syn_ptrs = match_name_to_taxon.at(n_query); + LOG(DEBUG) << "prefix_query(match=\"" << n_query << ") -> vec size = " << vec_taxon_and_syn_ptrs.size(); + for (auto & [tax_ptr, rec_or_syn_ptr] : vec_taxon_and_syn_ptrs) + { + if (tax_ptr == nullptr) + { + LOG(DEBUG) << "matched suppressed and include_suppressed = " << include_suppressed; + if (include_suppressed) + { + const TaxonomyRecord * tr = (const TaxonomyRecord *) rec_or_syn_ptr; + results.push_back(TaxonResult(tr)); + } + } + else + { + const auto & res_tax_data = tax_ptr->get_data(); + LOG(DEBUG) << "matched taxon trav = (" << res_tax_data.trav_enter << 
", " << res_tax_data.trav_exit << "). filter.trav = (" << filter_trav_enter << ", " << filter_trav_exit << ")"; + if (res_tax_data.trav_exit <= filter_trav_exit && res_tax_data.trav_enter >= filter_trav_enter) + { + const TaxonomicJuniorSynonym * syn_ptr = (const TaxonomicJuniorSynonym *) rec_or_syn_ptr; + if (syn_ptr == nullptr) + { + LOG(DEBUG) << "pushing non-syn"; + results.push_back(TaxonResult(tax_ptr)); + } + else + { + LOG(DEBUG) << "pushing synonym"; + results.push_back(TaxonResult(tax_ptr, syn_ptr)); + } + } + } + } + } + return results; +} + vec_fqr_w_t ContextAwareCTrieBasedDB::fuzzy_query_to_taxa(const std::string & query_str, const RTRichTaxNode * context_root, const RichTaxonomy & taxonomy, diff --git a/otc/ctrie/context_ctrie_db.h b/otc/ctrie/context_ctrie_db.h index ed5d969d..b1293329 100644 --- a/otc/ctrie/context_ctrie_db.h +++ b/otc/ctrie/context_ctrie_db.h @@ -22,6 +22,9 @@ class ContextAwareCTrieBasedDB { // Does anything match this normalized query string? std::optional exact_query(const std::string & query_str) const; + // Does anything match this normalized query string? + std::vector prefix_query(const std::string & query_str) const; + std::vector fuzzy_query_to_taxa(const std::string & query_str, const RTRichTaxNode * context_root, const RichTaxonomy & taxonomy, @@ -37,6 +40,10 @@ class ContextAwareCTrieBasedDB { const RichTaxonomy & taxonomy, bool include_suppressed) const; + std::vector to_taxa(const std::vector& result, + const RTRichTaxNode * context_root, + const RichTaxonomy & taxonomy, + bool include_suppressed) const; private: const Context & context; From 9da04551a6d14569ce7a6c4a970fb881ece4616d Mon Sep 17 00:00:00 2001 From: Benjamin Redelings Date: Tue, 12 May 2020 14:46:30 -0700 Subject: [PATCH 090/620] Try to new prefix_query and compare to previous results. 
--- otc/ws/tnrsws.cpp | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) diff --git a/otc/ws/tnrsws.cpp b/otc/ws/tnrsws.cpp index 048accae..e07804f9 100644 --- a/otc/ws/tnrsws.cpp +++ b/otc/ws/tnrsws.cpp @@ -228,7 +228,8 @@ vector exact_name_search_higher(const RichTaxonomy& taxonomy, return exact_name_search(taxonomy, context_root, query, ok); } -vector prefix_name_search(const Taxon* context_root, +vector prefix_name_search(const RichTaxonomy& taxonomy, + const Taxon* context_root, const string& query, tax_pred_t ok = [](const Taxon*){return true;}) { vector hits; @@ -240,6 +241,24 @@ vector prefix_name_search(const Taxon* context_root, hits.push_back(taxon); } } + + auto ctp = taxonomy.get_fuzzy_matcher(); + auto results = ctp->to_taxa(ctp->prefix_query(query), context_root, taxonomy, true); + vector hits2; + for(auto& result: results) + { + if (not result.is_synonym()) + { + auto t = result.get_taxon(); + if (ok(t)) + hits2.push_back(t); + } + } + std::sort(hits.begin(), hits.end()); + std::sort(hits2.begin(), hits2.end()); + hits2.erase( unique( hits2.begin(), hits2.end() ), hits2.end() ); + assert(hits == hits2); + return hits; } @@ -250,7 +269,7 @@ vector prefix_name_search(const RichTaxonomy& taxonomy, tax_pred_t ok = [&](const Taxon* taxon) { return include_suppressed or not taxonomy.node_is_suppressed_from_tnrs(taxon); }; - return prefix_name_search(context_root, query, ok); + return prefix_name_search(taxonomy, context_root, query, ok); } vector prefix_name_search_higher(const RichTaxonomy& taxonomy, @@ -263,10 +282,11 @@ vector prefix_name_search_higher(const RichTaxonomy& taxonomy, } return taxon_is_higher(taxon); }; - return prefix_name_search(context_root, query, ok); + return prefix_name_search(taxonomy, context_root, query, ok); } -vec_tax_str_pair_t prefix_synonym_search(const Taxon* context_root, +vec_tax_str_pair_t prefix_synonym_search(const RichTaxonomy& taxonomy, + const Taxon* context_root, string query, 
tax_pred_t ok = [](const Taxon*){return true;}) { @@ -291,7 +311,7 @@ vec_tax_str_pair_t prefix_synonym_search(const RichTaxonomy& taxonomy, std::function ok = [&](const Taxon* taxon) { return include_suppressed or not taxonomy.node_is_suppressed_from_tnrs(taxon); }; - return prefix_synonym_search(context_root, query, ok); + return prefix_synonym_search(taxonomy, context_root, query, ok); } inline json get_taxon_json(const RichTaxonomy& taxonomy, const Taxon& taxon) { From f74ead9321841c35b89b11083cbff6f53f27ff5d Mon Sep 17 00:00:00 2001 From: Benjamin Redelings Date: Tue, 12 May 2020 15:02:38 -0700 Subject: [PATCH 091/620] Factor out prefix_name_search_slow( ). --- otc/ws/tnrsws.cpp | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/otc/ws/tnrsws.cpp b/otc/ws/tnrsws.cpp index e07804f9..591bcd21 100644 --- a/otc/ws/tnrsws.cpp +++ b/otc/ws/tnrsws.cpp @@ -228,10 +228,10 @@ vector exact_name_search_higher(const RichTaxonomy& taxonomy, return exact_name_search(taxonomy, context_root, query, ok); } -vector prefix_name_search(const RichTaxonomy& taxonomy, - const Taxon* context_root, - const string& query, - tax_pred_t ok = [](const Taxon*){return true;}) { +vector prefix_name_search_slow(const Taxon* context_root, + const string& query, + tax_pred_t ok = [](const Taxon*){return true;}) +{ vector hits; for(auto taxon: iter_post_n_const(*context_root)) { if (not ok(taxon)) { @@ -241,22 +241,32 @@ vector prefix_name_search(const RichTaxonomy& taxonomy, hits.push_back(taxon); } } + return hits; +} + +vector prefix_name_search(const RichTaxonomy& taxonomy, + const Taxon* context_root, + const string& query, + tax_pred_t ok = [](const Taxon*){return true;}) +{ auto ctp = taxonomy.get_fuzzy_matcher(); auto results = ctp->to_taxa(ctp->prefix_query(query), context_root, taxonomy, true); - vector hits2; + vector hits; for(auto& result: results) { if (not result.is_synonym()) { auto t = result.get_taxon(); if (ok(t)) - hits2.push_back(t); + 
hits.push_back(t); } } std::sort(hits.begin(), hits.end()); + hits.erase( unique( hits.begin(), hits.end() ), hits.end() ); + + auto hits2 = prefix_name_search_slow(context_root, query, ok); std::sort(hits2.begin(), hits2.end()); - hits2.erase( unique( hits2.begin(), hits2.end() ), hits2.end() ); assert(hits == hits2); return hits; From f7c2f0122e61268127008bf768ff836e37d5a314 Mon Sep 17 00:00:00 2001 From: Benjamin Redelings Date: Tue, 12 May 2020 15:02:58 -0700 Subject: [PATCH 092/620] Update tests for changed ordering of same results. --- .../autocomplete_name/1/expected.json | 24 +++---- .../autocomplete_name/2/expected.json | 72 +++++++++---------- 2 files changed, 48 insertions(+), 48 deletions(-) diff --git a/test/tnrs/expectedws-induced/autocomplete_name/1/expected.json b/test/tnrs/expectedws-induced/autocomplete_name/1/expected.json index 4b1dfb3c..1df868b5 100644 --- a/test/tnrs/expectedws-induced/autocomplete_name/1/expected.json +++ b/test/tnrs/expectedws-induced/autocomplete_name/1/expected.json @@ -2,37 +2,37 @@ { "is_higher": true, "is_suppressed": false, - "ott_id": 655592, - "unique_name": "Amaryllis (genus in kingdom Archaeplastida)" + "ott_id": 5092163, + "unique_name": "Amaryllididae" }, { "is_higher": true, "is_suppressed": false, - "ott_id": 267867, - "unique_name": "Amaryllidoideae" + "ott_id": 5764812, + "unique_name": "Amaryllidinae" }, { "is_higher": true, "is_suppressed": false, - "ott_id": 258473, - "unique_name": "Amaryllidaceae" + "ott_id": 4671782, + "unique_name": "Amaryllis (genus in Opisthokonta)" }, { "is_higher": true, "is_suppressed": false, - "ott_id": 4671782, - "unique_name": "Amaryllis (genus in Opisthokonta)" + "ott_id": 258473, + "unique_name": "Amaryllidaceae" }, { "is_higher": true, "is_suppressed": false, - "ott_id": 5764812, - "unique_name": "Amaryllidinae" + "ott_id": 267867, + "unique_name": "Amaryllidoideae" }, { "is_higher": true, "is_suppressed": false, - "ott_id": 5092163, - "unique_name": "Amaryllididae" + 
"ott_id": 655592, + "unique_name": "Amaryllis (genus in kingdom Archaeplastida)" } ] \ No newline at end of file diff --git a/test/tnrs/expectedws-induced/autocomplete_name/2/expected.json b/test/tnrs/expectedws-induced/autocomplete_name/2/expected.json index 724277ad..1df868b5 100644 --- a/test/tnrs/expectedws-induced/autocomplete_name/2/expected.json +++ b/test/tnrs/expectedws-induced/autocomplete_name/2/expected.json @@ -1,38 +1,38 @@ [ - { - "is_higher": true, - "is_suppressed": false, - "ott_id": 655592, - "unique_name": "Amaryllis (genus in kingdom Archaeplastida)" - }, - { - "is_higher": true, - "is_suppressed": false, - "ott_id": 267867, - "unique_name": "Amaryllidoideae" - }, - { - "is_higher": true, - "is_suppressed": false, - "ott_id": 258473, - "unique_name": "Amaryllidaceae" - }, - { - "is_higher": true, - "is_suppressed": false, - "ott_id": 4671782, - "unique_name": "Amaryllis (genus in Opisthokonta)" - }, - { - "is_higher": true, - "is_suppressed": false, - "ott_id": 5764812, - "unique_name": "Amaryllidinae" - }, - { - "is_higher": true, - "is_suppressed": false, - "ott_id": 5092163, - "unique_name": "Amaryllididae" - } + { + "is_higher": true, + "is_suppressed": false, + "ott_id": 5092163, + "unique_name": "Amaryllididae" + }, + { + "is_higher": true, + "is_suppressed": false, + "ott_id": 5764812, + "unique_name": "Amaryllidinae" + }, + { + "is_higher": true, + "is_suppressed": false, + "ott_id": 4671782, + "unique_name": "Amaryllis (genus in Opisthokonta)" + }, + { + "is_higher": true, + "is_suppressed": false, + "ott_id": 258473, + "unique_name": "Amaryllidaceae" + }, + { + "is_higher": true, + "is_suppressed": false, + "ott_id": 267867, + "unique_name": "Amaryllidoideae" + }, + { + "is_higher": true, + "is_suppressed": false, + "ott_id": 655592, + "unique_name": "Amaryllis (genus in kingdom Archaeplastida)" + } ] \ No newline at end of file From c90c4439c25b87309a5b1fd7b7505b36923620e9 Mon Sep 17 00:00:00 2001 From: Benjamin Redelings Date: Wed, 
13 May 2020 15:38:03 -0700 Subject: [PATCH 093/620] Handle cases where the prefix query extends past a terminal node into a suffix. --- otc/ctrie/search_impl.cpp | 45 +++++++++++++++++++++++++++++++++------ 1 file changed, 39 insertions(+), 6 deletions(-) diff --git a/otc/ctrie/search_impl.cpp b/otc/ctrie/search_impl.cpp index 9968b5bc..13c6cbbe 100644 --- a/otc/ctrie/search_impl.cpp +++ b/otc/ctrie/search_impl.cpp @@ -482,27 +482,60 @@ void CompressedTrie::all_descendants(stored_str_t& prefix, uint64_t index, vecto } } +// can we have both is_terminal() and key_terminating() on the same node? +// if we could have them both set and have an empty suffix, then we would match the same string twice. +// therefore perhaps, we can have them both set, but we only have is_terminal() set if there is a non-empty suffix. vector CompressedTrie::prefix_query(const stored_str_t& uquery) const { if (node_vec.empty()) return {}; - auto letters = encode_as_indices(uquery); + auto query_letters = encode_as_indices(uquery); int index = 0; - for(auto letter: letters) + int letters_matched = 0; + for(int i=0;i results; - auto prefix = uquery; - all_descendants(prefix, index, results); + // If we have not matched all the prefix letters, check the suffix + if (letters_matched results; + auto prefix = uquery; + all_descendants(prefix, index, results); + + return results; + } } From 4a7e3f466833b174473c83b60c6034db2bf81a0a Mon Sep 17 00:00:00 2001 From: Benjamin Redelings Date: Wed, 13 May 2020 15:39:15 -0700 Subject: [PATCH 094/620] Only check against the slow version #ifdef DEBUG_NAME_SEARCH --- otc/ws/tnrsws.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/otc/ws/tnrsws.cpp b/otc/ws/tnrsws.cpp index 591bcd21..6ba75cb7 100644 --- a/otc/ws/tnrsws.cpp +++ b/otc/ws/tnrsws.cpp @@ -265,9 +265,11 @@ vector prefix_name_search(const RichTaxonomy& taxonomy, std::sort(hits.begin(), hits.end()); hits.erase( unique( hits.begin(), hits.end() ), hits.end() ); +#ifdef DEBUG_NAME_SEARCH auto 
hits2 = prefix_name_search_slow(context_root, query, ok); std::sort(hits2.begin(), hits2.end()); assert(hits == hits2); +#endif return hits; } From fc6513f658400e71b925cf5ac013eb2b5ac6ca4a Mon Sep 17 00:00:00 2001 From: Benjamin Redelings Date: Wed, 13 May 2020 15:39:37 -0700 Subject: [PATCH 095/620] Speed up prefix_synonym_search as well. --- otc/ws/tnrsws.cpp | 39 ++++++++++++++++++++++++++++++++++----- 1 file changed, 34 insertions(+), 5 deletions(-) diff --git a/otc/ws/tnrsws.cpp b/otc/ws/tnrsws.cpp index 6ba75cb7..53adaeeb 100644 --- a/otc/ws/tnrsws.cpp +++ b/otc/ws/tnrsws.cpp @@ -85,7 +85,7 @@ bool taxon_is_higher(const Taxon* taxon) { return taxon->get_data().rank < TaxonomicRank::RANK_SPECIES; } -using vec_tax_str_pair_t = vector >; +using vec_tax_str_pair_t = vector>; vec_tax_str_pair_t exact_synonym_search_slow(const RichTaxonomy& /*taxonomy*/, const Taxon* context_root, @@ -297,10 +297,9 @@ vector prefix_name_search_higher(const RichTaxonomy& taxonomy, return prefix_name_search(taxonomy, context_root, query, ok); } -vec_tax_str_pair_t prefix_synonym_search(const RichTaxonomy& taxonomy, - const Taxon* context_root, - string query, - tax_pred_t ok = [](const Taxon*){return true;}) +vec_tax_str_pair_t prefix_synonym_search_slow(const Taxon* context_root, + string query, + tax_pred_t ok = [](const Taxon*){return true;}) { vec_tax_str_pair_t hits; for(auto taxon: iter_post_n_const(*context_root)) { @@ -316,6 +315,36 @@ vec_tax_str_pair_t prefix_synonym_search(const RichTaxonomy& taxonomy, return hits; } +vec_tax_str_pair_t prefix_synonym_search(const RichTaxonomy& taxonomy, + const Taxon* context_root, + string query, + tax_pred_t ok = [](const Taxon*){return true;}) +{ + auto ctp = taxonomy.get_fuzzy_matcher(); + auto results = ctp->to_taxa(ctp->prefix_query(query), context_root, taxonomy, true); + vec_tax_str_pair_t hits; + for(auto& result: results) + { + if (result.is_synonym()) + { + auto t = result.get_taxon(); + assert(t); + if (ok(t)) + 
hits.push_back({t,result.get_matched_name()}); + } + } + std::sort(hits.begin(), hits.end()); + hits.erase( unique( hits.begin(), hits.end() ), hits.end() ); + +#ifdef DEBUG_NAME_SEARCH + auto hits2 = prefix_synonym_search_slow(context_root, query, ok); + std::sort(hits2.begin(), hits2.end()); + assert(hits == hits2); +#endif + + return hits; +} + vec_tax_str_pair_t prefix_synonym_search(const RichTaxonomy& taxonomy, const Taxon* context_root, string query, From 98bcf7124eac484a63ca663452b26b19d204a411 Mon Sep 17 00:00:00 2001 From: Benjamin Redelings Date: Tue, 2 Jun 2020 12:29:13 -0700 Subject: [PATCH 096/620] Move from_json(json&, SummaryTreeAnnotation&) to otc/ws/tolws.cpp. This fixes a linking error. --- otc/ws/tolws.cpp | 53 ++++++++++++++++++++++++++++++++++++++++++++ ws/tolwsbooting.cpp | 54 --------------------------------------------- 2 files changed, 53 insertions(+), 54 deletions(-) diff --git a/otc/ws/tolws.cpp b/otc/ws/tolws.cpp index 6640a9d8..ed0574c4 100644 --- a/otc/ws/tolws.cpp +++ b/otc/ws/tolws.cpp @@ -917,6 +917,59 @@ void delete_subtree_and_monotypic_ancestors(Tree& tree, typename Tree::node_type } } +void from_json(const nlohmann::json &j, SummaryTreeAnnotation & sta) { + sta.date_completed = extract_string(j, "date_completed"); + sta.filtered_flags = extract_string(j, "filtered_flags"); + auto splitff = split_string(sta.filtered_flags, ','); + sta.filtered_flags_vec.assign(splitff.begin(), splitff.end()); + // generated_by gets converted back to a string + auto gb_el = j.find("generated_by"); + if (gb_el == j.end()) { + throw OTCError() << "Missing generated_by field.\n"; + } + sta.generated_by = gb_el->dump(); + sta.num_leaves_in_exemplified_taxonomy = extract_unsigned_long(j, "num_leaves_in_exemplified_taxonomy"); + sta.num_source_studies = extract_unsigned_long(j, "num_source_studies"); + sta.num_source_trees = extract_unsigned_long(j, "num_source_trees"); + sta.num_tips = extract_unsigned_long(j, "num_tips"); + sta.root_ott_id = 
extract_ott_id(j, "root_ott_id"); + sta.root_taxon_name = extract_string(j, "root_taxon_name"); + sta.synth_id = extract_string(j, "synth_id"); + sta.taxonomy_version = extract_string(j, "taxonomy_version"); + sta.tree_id = extract_string(j, "tree_id"); + auto sim_el = j.find("source_id_map"); + if (sim_el == j.end()) { + throw OTCError() << "Missing source_id_map field.\n"; + } + if (!sim_el->is_object()) { + throw OTCError() << "Expected \"source_id_map\" field to be an object.\n"; + } + try { + for (nlohmann::json::const_iterator sim_it = sim_el->begin(); sim_it != sim_el->end(); ++sim_it) { + sta.source_id_map[sim_it.key()] = sim_it.value(); + } + } catch (OTCError & x) { + throw OTCError() << "Error reading source_id_map field: " << x.what(); + } + sta.full_source_id_map_json = *sim_el; + auto s_el = j.find("sources"); + if (s_el == j.end()) { + throw OTCError() << "Missing sources field.\n"; + } + if (!s_el->is_array()) { + throw OTCError() << "Expected \"sources\" field to be an array.\n"; + } + sta.sources.resize(s_el->size()); + for (auto i = 0U; i < s_el->size(); ++i) { + try { + sta.sources[i] = s_el->at(i).get(); + } catch (OTCError & x) { + throw OTCError() << "Error expected each element of the sources array to be a string: " << x.what(); + } + } +} + + } //namespace otc diff --git a/ws/tolwsbooting.cpp b/ws/tolwsbooting.cpp index f07b6814..c47c37f7 100644 --- a/ws/tolwsbooting.cpp +++ b/ws/tolwsbooting.cpp @@ -963,61 +963,7 @@ po::variables_map parse_cmd_line(int argc, char* argv[]) { } /// end formerly tolwsadaptors.cpp - namespace otc { -void from_json(const nlohmann::json &j, SummaryTreeAnnotation & sta) { - sta.date_completed = extract_string(j, "date_completed"); - sta.filtered_flags = extract_string(j, "filtered_flags"); - auto splitff = split_string(sta.filtered_flags, ','); - sta.filtered_flags_vec.assign(splitff.begin(), splitff.end()); - // generated_by gets converted back to a string - auto gb_el = j.find("generated_by"); - if (gb_el == 
j.end()) { - throw OTCError() << "Missing generated_by field.\n"; - } - sta.generated_by = gb_el->dump(); - sta.num_leaves_in_exemplified_taxonomy = extract_unsigned_long(j, "num_leaves_in_exemplified_taxonomy"); - sta.num_source_studies = extract_unsigned_long(j, "num_source_studies"); - sta.num_source_trees = extract_unsigned_long(j, "num_source_trees"); - sta.num_tips = extract_unsigned_long(j, "num_tips"); - sta.root_ott_id = extract_ott_id(j, "root_ott_id"); - sta.root_taxon_name = extract_string(j, "root_taxon_name"); - sta.synth_id = extract_string(j, "synth_id"); - sta.taxonomy_version = extract_string(j, "taxonomy_version"); - sta.tree_id = extract_string(j, "tree_id"); - auto sim_el = j.find("source_id_map"); - if (sim_el == j.end()) { - throw OTCError() << "Missing source_id_map field.\n"; - } - if (!sim_el->is_object()) { - throw OTCError() << "Expected \"source_id_map\" field to be an object.\n"; - } - try { - for (nlohmann::json::const_iterator sim_it = sim_el->begin(); sim_it != sim_el->end(); ++sim_it) { - sta.source_id_map[sim_it.key()] = sim_it.value(); - } - } catch (OTCError & x) { - throw OTCError() << "Error reading source_id_map field: " << x.what(); - } - sta.full_source_id_map_json = *sim_el; - auto s_el = j.find("sources"); - if (s_el == j.end()) { - throw OTCError() << "Missing sources field.\n"; - } - if (!s_el->is_array()) { - throw OTCError() << "Expected \"sources\" field to be an array.\n"; - } - sta.sources.resize(s_el->size()); - for (auto i = 0U; i < s_el->size(); ++i) { - try { - sta.sources[i] = s_el->at(i).get(); - } catch (OTCError & x) { - throw OTCError() << "Error expected each element of the sources array to be a string: " << x.what(); - } - } -} - - bool read_tree_and_annotations(const fs::path & configpath, const fs::path & treepath, const fs::path & annotationspath, From 66344e8d2e3de24cc54176a08bfccecbfa3c0c72 Mon Sep 17 00:00:00 2001 From: Benjamin Redelings Date: Tue, 2 Jun 2020 12:32:50 -0700 Subject: [PATCH 
097/620] include boost/functional/hash.hpp to fix link error. --- otc/config_file.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/otc/config_file.cpp b/otc/config_file.cpp index cf70f412..2d6c0889 100644 --- a/otc/config_file.cpp +++ b/otc/config_file.cpp @@ -4,6 +4,7 @@ #include #include #include +#include namespace fs = boost::filesystem; From 53ada96072562bf06fa78201e53e1ca1a2270e3c Mon Sep 17 00:00:00 2001 From: Benjamin Redelings Date: Wed, 3 Jun 2020 14:44:14 -0700 Subject: [PATCH 098/620] Avoid having nlohmann::json undo our assert handler. --- otc/json.hpp | 20408 +--------------------------------------- otc/nlohmann/json.hpp | 20406 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 20408 insertions(+), 20406 deletions(-) create mode 100644 otc/nlohmann/json.hpp diff --git a/otc/json.hpp b/otc/json.hpp index c9af0bed..01f68540 100644 --- a/otc/json.hpp +++ b/otc/json.hpp @@ -1,20406 +1,2 @@ -/* - __ _____ _____ _____ - __| | __| | | | JSON for Modern C++ -| | |__ | | | | | | version 3.5.0 -|_____|_____|_____|_|___| https://github.com/nlohmann/json - -Licensed under the MIT License . -SPDX-License-Identifier: MIT -Copyright (c) 2013-2018 Niels Lohmann . - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. -*/ - -#ifndef NLOHMANN_JSON_HPP -#define NLOHMANN_JSON_HPP - -#define NLOHMANN_JSON_VERSION_MAJOR 3 -#define NLOHMANN_JSON_VERSION_MINOR 5 -#define NLOHMANN_JSON_VERSION_PATCH 0 - -#include // all_of, find, for_each -#include // assert -#include // and, not, or -#include // nullptr_t, ptrdiff_t, size_t -#include // hash, less -#include // initializer_list -#include // istream, ostream -#include // random_access_iterator_tag -#include // accumulate -#include // string, stoi, to_string -#include // declval, forward, move, pair, swap - -// #include -#ifndef NLOHMANN_JSON_FWD_HPP -#define NLOHMANN_JSON_FWD_HPP - -#include // int64_t, uint64_t -#include // map -#include // allocator -#include // string -#include // vector - -/*! -@brief namespace for Niels Lohmann -@see https://github.com/nlohmann -@since version 1.0.0 -*/ -namespace nlohmann -{ -/*! -@brief default JSONSerializer template argument - -This serializer ignores the template arguments and uses ADL -([argument-dependent lookup](https://en.cppreference.com/w/cpp/language/adl)) -for serialization. -*/ -template -struct adl_serializer; - -template class ObjectType = - std::map, - template class ArrayType = std::vector, - class StringType = std::string, class BooleanType = bool, - class NumberIntegerType = std::int64_t, - class NumberUnsignedType = std::uint64_t, - class NumberFloatType = double, - template class AllocatorType = std::allocator, - template class JSONSerializer = - adl_serializer> -class basic_json; - -/*! -@brief JSON Pointer - -A JSON pointer defines a string syntax for identifying a specific value -within a JSON document. It can be used with functions `at` and -`operator[]`. Furthermore, JSON pointers are the base for JSON patches. 
- -@sa [RFC 6901](https://tools.ietf.org/html/rfc6901) - -@since version 2.0.0 -*/ -template -class json_pointer; - -/*! -@brief default JSON class - -This type is the default specialization of the @ref basic_json class which -uses the standard template types. - -@since version 1.0.0 -*/ -using json = basic_json<>; -} // namespace nlohmann - -#endif - -// #include - - -// This file contains all internal macro definitions -// You MUST include macro_unscope.hpp at the end of json.hpp to undef all of them - -// exclude unsupported compilers -#if !defined(JSON_SKIP_UNSUPPORTED_COMPILER_CHECK) - #if defined(__clang__) - #if (__clang_major__ * 10000 + __clang_minor__ * 100 + __clang_patchlevel__) < 30400 - #error "unsupported Clang version - see https://github.com/nlohmann/json#supported-compilers" - #endif - #elif defined(__GNUC__) && !(defined(__ICC) || defined(__INTEL_COMPILER)) - #if (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) < 40800 - #error "unsupported GCC version - see https://github.com/nlohmann/json#supported-compilers" - #endif - #endif -#endif - -// disable float-equal warnings on GCC/clang -#if defined(__clang__) || defined(__GNUC__) || defined(__GNUG__) - #pragma GCC diagnostic push - #pragma GCC diagnostic ignored "-Wfloat-equal" -#endif - -// disable documentation warnings on clang -#if defined(__clang__) - #pragma GCC diagnostic push - #pragma GCC diagnostic ignored "-Wdocumentation" -#endif - -// allow for portable deprecation warnings -#if defined(__clang__) || defined(__GNUC__) || defined(__GNUG__) - #define JSON_DEPRECATED __attribute__((deprecated)) -#elif defined(_MSC_VER) - #define JSON_DEPRECATED __declspec(deprecated) -#else - #define JSON_DEPRECATED -#endif - -// allow to disable exceptions -#if (defined(__cpp_exceptions) || defined(__EXCEPTIONS) || defined(_CPPUNWIND)) && !defined(JSON_NOEXCEPTION) - #define JSON_THROW(exception) throw exception - #define JSON_TRY try - #define JSON_CATCH(exception) catch(exception) - 
#define JSON_INTERNAL_CATCH(exception) catch(exception) -#else - #define JSON_THROW(exception) std::abort() - #define JSON_TRY if(true) - #define JSON_CATCH(exception) if(false) - #define JSON_INTERNAL_CATCH(exception) if(false) -#endif - -// override exception macros -#if defined(JSON_THROW_USER) - #undef JSON_THROW - #define JSON_THROW JSON_THROW_USER -#endif -#if defined(JSON_TRY_USER) - #undef JSON_TRY - #define JSON_TRY JSON_TRY_USER -#endif -#if defined(JSON_CATCH_USER) - #undef JSON_CATCH - #define JSON_CATCH JSON_CATCH_USER - #undef JSON_INTERNAL_CATCH - #define JSON_INTERNAL_CATCH JSON_CATCH_USER -#endif -#if defined(JSON_INTERNAL_CATCH_USER) - #undef JSON_INTERNAL_CATCH - #define JSON_INTERNAL_CATCH JSON_INTERNAL_CATCH_USER -#endif - -// manual branch prediction -#if defined(__clang__) || defined(__GNUC__) || defined(__GNUG__) - #define JSON_LIKELY(x) __builtin_expect(!!(x), 1) - #define JSON_UNLIKELY(x) __builtin_expect(!!(x), 0) -#else - #define JSON_LIKELY(x) x - #define JSON_UNLIKELY(x) x -#endif - -// C++ language standard detection -#if (defined(__cplusplus) && __cplusplus >= 201703L) || (defined(_HAS_CXX17) && _HAS_CXX17 == 1) // fix for issue #464 - #define JSON_HAS_CPP_17 - #define JSON_HAS_CPP_14 -#elif (defined(__cplusplus) && __cplusplus >= 201402L) || (defined(_HAS_CXX14) && _HAS_CXX14 == 1) - #define JSON_HAS_CPP_14 -#endif - -/*! -@brief macro to briefly define a mapping between an enum and JSON -@def NLOHMANN_JSON_SERIALIZE_ENUM -@since version 3.4.0 -*/ -#define NLOHMANN_JSON_SERIALIZE_ENUM(ENUM_TYPE, ...) \ - template \ - inline void to_json(BasicJsonType& j, const ENUM_TYPE& e) \ - { \ - static_assert(std::is_enum::value, #ENUM_TYPE " must be an enum!"); \ - static const std::pair m[] = __VA_ARGS__; \ - auto it = std::find_if(std::begin(m), std::end(m), \ - [e](const std::pair& ej_pair) -> bool \ - { \ - return ej_pair.first == e; \ - }); \ - j = ((it != std::end(m)) ? 
it : std::begin(m))->second; \ - } \ - template \ - inline void from_json(const BasicJsonType& j, ENUM_TYPE& e) \ - { \ - static_assert(std::is_enum::value, #ENUM_TYPE " must be an enum!"); \ - static const std::pair m[] = __VA_ARGS__; \ - auto it = std::find_if(std::begin(m), std::end(m), \ - [j](const std::pair& ej_pair) -> bool \ - { \ - return ej_pair.second == j; \ - }); \ - e = ((it != std::end(m)) ? it : std::begin(m))->first; \ - } - -// Ugly macros to avoid uglier copy-paste when specializing basic_json. They -// may be removed in the future once the class is split. - -#define NLOHMANN_BASIC_JSON_TPL_DECLARATION \ - template class ObjectType, \ - template class ArrayType, \ - class StringType, class BooleanType, class NumberIntegerType, \ - class NumberUnsignedType, class NumberFloatType, \ - template class AllocatorType, \ - template class JSONSerializer> - -#define NLOHMANN_BASIC_JSON_TPL \ - basic_json - -// #include - - -#include // not -#include // size_t -#include // conditional, enable_if, false_type, integral_constant, is_constructible, is_integral, is_same, remove_cv, remove_reference, true_type - -namespace nlohmann -{ -namespace detail -{ -// alias templates to reduce boilerplate -template -using enable_if_t = typename std::enable_if::type; - -template -using uncvref_t = typename std::remove_cv::type>::type; - -// implementation of C++14 index_sequence and affiliates -// source: https://stackoverflow.com/a/32223343 -template -struct index_sequence -{ - using type = index_sequence; - using value_type = std::size_t; - static constexpr std::size_t size() noexcept - { - return sizeof...(Ints); - } -}; - -template -struct merge_and_renumber; - -template -struct merge_and_renumber, index_sequence> - : index_sequence < I1..., (sizeof...(I1) + I2)... 
> {}; - -template -struct make_index_sequence - : merge_and_renumber < typename make_index_sequence < N / 2 >::type, - typename make_index_sequence < N - N / 2 >::type > {}; - -template<> struct make_index_sequence<0> : index_sequence<> {}; -template<> struct make_index_sequence<1> : index_sequence<0> {}; - -template -using index_sequence_for = make_index_sequence; - -// dispatch utility (taken from ranges-v3) -template struct priority_tag : priority_tag < N - 1 > {}; -template<> struct priority_tag<0> {}; - -// taken from ranges-v3 -template -struct static_const -{ - static constexpr T value{}; -}; - -template -constexpr T static_const::value; -} // namespace detail -} // namespace nlohmann - -// #include - - -#include // not -#include // numeric_limits -#include // false_type, is_constructible, is_integral, is_same, true_type -#include // declval - -// #include - -// #include - - -#include // random_access_iterator_tag - -// #include - - -namespace nlohmann -{ -namespace detail -{ -template struct make_void -{ - using type = void; -}; -template using void_t = typename make_void::type; -} // namespace detail -} // namespace nlohmann - -// #include - - -namespace nlohmann -{ -namespace detail -{ -template -struct iterator_types {}; - -template -struct iterator_types < - It, - void_t> -{ - using difference_type = typename It::difference_type; - using value_type = typename It::value_type; - using pointer = typename It::pointer; - using reference = typename It::reference; - using iterator_category = typename It::iterator_category; -}; - -// This is required as some compilers implement std::iterator_traits in a way that -// doesn't work with SFINAE. See https://github.com/nlohmann/json/issues/1341. 
-template -struct iterator_traits -{ -}; - -template -struct iterator_traits < T, enable_if_t < !std::is_pointer::value >> - : iterator_types -{ -}; - -template -struct iterator_traits::value>> -{ - using iterator_category = std::random_access_iterator_tag; - using value_type = T; - using difference_type = ptrdiff_t; - using pointer = T*; - using reference = T&; -}; -} -} - -// #include - -// #include - - -#include - -// #include - - -// http://en.cppreference.com/w/cpp/experimental/is_detected -namespace nlohmann -{ -namespace detail -{ -struct nonesuch -{ - nonesuch() = delete; - ~nonesuch() = delete; - nonesuch(nonesuch const&) = delete; - void operator=(nonesuch const&) = delete; -}; - -template class Op, - class... Args> -struct detector -{ - using value_t = std::false_type; - using type = Default; -}; - -template class Op, class... Args> -struct detector>, Op, Args...> -{ - using value_t = std::true_type; - using type = Op; -}; - -template