This is the Pythia difficulty prediction module for Coraxlib. Since this module relies on Coraxlib types and Coraxlib cmake scopes this module is only intended for use in combination with Coraxlib.
The module provides functionality for computing and collecting the features for a given multiple sequence alignment (MSA) that are required to predict the difficulty, as well as a function for predicting the difficulty.
The prediction code in ./difficulty
is an autogenerated export of a trained scikit-learn Random Decision Forest (exported using the Treelite model compiler).
The same functionality is also available as lightweight, stand-along Python library here. If you are only interested in the difficulty of your MSA, we recommend using the Python version of Pythia. If you want to incorporate the difficulty prediction in a phylogenetic tool, we recommend using this faster C library.
#include <iostream>
#include <vector>
#include <map>
#include <tuple>
#include <math.h>
#include "corax/corax.h"
#include "difficulty.h"
std::vector<corax_split_t *> get_pars_splits(corax_msa_t *msa, int n_trees)
{
/**
* Infers n_trees parsimony trees for the given MSA using Coraxlib, creates and returns the splits.
*/
std::vector<corax_utree_t *> pars_trees(n_trees);
std::vector<corax_split_t *> splits(n_trees);
std::map<std::string, unsigned int> labelToId;
unsigned int score;
int n_taxa = msa->count;
for (int i = 0; i < n_trees; ++i)
{
pars_trees[i] = corax_utree_create_parsimony(n_taxa,
msa->length,
msa->label,
msa->sequence,
NULL, /* site weights */
corax_map_nt,
4,
0,
i, /* seed */
&score);
}
for (int i = 0; i < n_taxa; ++i)
{
labelToId.insert({std::string(pars_trees[0]->nodes[i]->label), i});
}
for (corax_utree_t * tree: pars_trees)
{
for (int i = 0; i < n_taxa; ++i)
{
auto leaf = tree->nodes[i];
auto id = labelToId.at(std::string(leaf->label));
leaf->node_index = leaf->clv_index = id;
}
}
for (int i = 0; i < n_trees; ++i)
{
splits[i] = corax_utree_split_create(pars_trees[i]->vroot, n_taxa, nullptr);
}
for (int i = 0; i < n_trees; ++i)
{
corax_utree_destroy(pars_trees[i], NULL);
}
return splits;
}
std::tuple<int, double> get_num_unique_and_rel_rfdist(std::vector<corax_split_t *> splits, int n_taxa)
{
/**
* Computes the average relative RF distance and the number of unique topologies for the given splits.
*/
int num_trees = splits.size();
int num_unique = 1;
double avg_rrf = 0.0;
double max_rf = (double)2 * (n_taxa - 3);
size_t num_pairs = 0;
for (int i = 0; i < num_trees - 1; ++i)
{
bool uniq = true;
for (int j = i + 1; j < num_trees; ++j)
{
double rf = corax_utree_split_rf_distance(splits[i], splits[j], n_taxa);
avg_rrf += ((double)rf) / max_rf;
num_pairs++;
uniq &= (rf > 0);
}
if (uniq)
num_unique++;
}
avg_rrf /= num_pairs;
return {num_unique, avg_rrf};
}
int main(int argc, char *argv[])
{
const char *filename = "path/to/msa.phy";
/*
* make sure to update this function call based on your MSA:
* - for MSAs in phylip format use corax_phylip_load and set the interleaved flag accordingly
* - for MSAs in fasta format use corax_fasta_load
*/
corax_msa_t *msa = corax_phylip_load(filename, CORAX_FALSE);
size_t _num_trees = 100;
int n_taxa = msa->count;
std::vector<corax_split_t *> splits = get_pars_splits(msa, _num_trees);
int num_unique;
double avg_rrf;
std::tie(num_unique, avg_rrf) = get_num_unique_and_rel_rfdist(splits, n_taxa);
corax_msa_features * features = corax_msa_compute_features(msa, 4, corax_map_nt);
double out_pred = corax_msa_predict_difficulty(features, avg_rrf, num_unique / _num_trees);
out_pred = round(out_pred * 100.0) / 100.0;
std::cout << "The predicted difficulty for MSA " << filename << " is: " << out_pred << "\n";
corax_msa_destroy(msa);
free(features);
for (auto s : splits) corax_utree_split_destroy(s);
}
The paper explaining the details of Pythia is published in MBE: Haag, J., Höhler, D., Bettisworth, B., & Stamatakis, A. (2022). From Easy to Hopeless - Predicting the Difficulty of Phylogenetic Analyses. Molecular Biology and Evolution, 39(12). https://doi.org/10.1093/molbev/msac254