Skip to content

Commit

Permalink
BUG: fix duplicated genus in _combine_ncbi_trees
Browse files Browse the repository at this point in the history
This solves a specific issue in scikit-bio where TreeNode.find will exit the subtree defined by a node and find an unrelated, previously cached node.

Co-authored-by: ebolyen <[email protected]>
  • Loading branch information
cherman2 and ebolyen committed Nov 19, 2024
1 parent 6be63ce commit b264687
Show file tree
Hide file tree
Showing 5 changed files with 131 additions and 9 deletions.
32 changes: 23 additions & 9 deletions q2_moshpit/kraken2/select.py
Original file line number Diff line number Diff line change
Expand Up @@ -223,21 +223,35 @@ def _kraken_to_ncbi_tree(df):
def _combine_ncbi_trees(trees):
full_tree = trees[0]
for tree in trees[1:]:
tip_cache = {t.name: t for t in full_tree.tips()}
for tip in list(tree.tips()):
try:
# check if taxid is already in this tree
full_tree.find(tip.name)
# check if taxid is already in this tree
if tip.name in tip_cache:
continue # for clarity
except skbio.tree.MissingNodeError:
else:
parents = list(tip.ancestors())[:-1] # ignore unnamed root
matching = full_tree
while parents:
subtree_inserted = False
while parents and not subtree_inserted:
node = parents.pop()
try:
matching = matching.find(node.name)
except skbio.tree.MissingNodeError:
ancestor_found = False

for child in matching.children:
if child.name == node.name:
matching = child
ancestor_found = True
break
if not ancestor_found:
matching.append(node)
break
for t in node.tips():
tip_cache[t.name] = t
assert tip.name in tip_cache
subtree_inserted = True

if not subtree_inserted:
# should be impossible. Implies ancestor_found was always
# True but not in tip_cache
raise AssertionError(f"{tip.name} could not be inserted")
return full_tree


Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
100 9332144 0 R 1 root
96.08 8966446 0 R1 131567 cellular organisms
5.3 494861 0 D 2759 Eukaryota
4.81 448908 0 D1 33154 Opisthokonta
4.54 423718 0 K 33208 Metazoa
4.53 422551 0 K1 6072 Eumetazoa
4.5 419698 0 K2 33213 Bilateria
3.42 318923 0 K3 33317 Protostomia
3.1 289656 0 K4 1206794 Ecdysozoa
3.07 286854 0 K5 88770 Panarthropoda
3.07 286847 0 P 6656 Arthropoda
3.01 281163 0 P1 197563 Mandibulata
3.01 281054 0 P2 197562 Pancrustacea
2.94 274151 0 P3 6960 Hexapoda
2.93 273688 0 C 50557 Insecta
2.93 273688 0 C1 85512 Dicondylia
2.93 273688 0 C2 7496 Pterygota
2.92 272146 0 C3 33340 Neoptera
0.06 5786 0 C4 33342 Paraneoptera
0.06 5578 0 O 7524 Hemiptera
0.02 2099 0 O1 33343 Prosorrhyncha
0.02 2099 0 O2 33345 Heteroptera
0.02 2099 0 O3 33347 Euheteroptera
0.02 2099 0 O4 33349 Neoheteroptera
0.02 1673 0 O5 33351 Panheteroptera
0.01 962 0 O6 33354 Cimicomorpha
0.01 935 0 O7 33355 Cimicoidea
0.01 927 0 F 30083 Miridae
0 228 0 F1 236635 Phylinae
0 228 0 F2 236648 Pilophorini
0 228 0 G 237084 Pilophorus
0.27 25092 0 K 4751 Fungi
0.24 22558 0 K1 451864 Dikarya
0.22 20530 0 P 4890 Ascomycota
0.22 20419 0 P1 716545 saccharomyceta
0.2 18909 0 P2 147538 Pezizomycotina
0.2 18778 0 P3 716546 leotiomyceta
0 58 0 C 147547 Lecanoromycetes
0 58 0 C1 1520881 OSLEUM clade
0 58 0 C2 388435 Lecanoromycetidae
0 58 0 O 5197 Lecanorales
0 58 0 O1 157822 Lecanorineae
0 53 0 F 5198 Cladoniaceae
0 53 0 G 5199 Cladonia
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
100 7316706 0 R 1 root
95.72 7003211 0 R1 131567 cellular organisms
6.5 475365 0 D 2759 Eukaryota
5.81 425370 0 D1 33154 Opisthokonta
5.54 405469 0 K 33208 Metazoa
5.52 404192 0 K1 6072 Eumetazoa
5.47 400582 0 K2 33213 Bilateria
3.72 271980 0 K3 33317 Protostomia
3.32 242600 0 K4 1206794 Ecdysozoa
3.28 239851 0 K5 88770 Panarthropoda
3.28 239726 0 P 6656 Arthropoda
3.24 236861 0 P1 197563 Mandibulata
3.24 236717 0 P2 197562 Pancrustacea
3.18 232964 0 P3 6960 Hexapoda
3.18 232673 0 C 50557 Insecta
3.18 232673 0 C1 85512 Dicondylia
3.18 232673 0 C2 7496 Pterygota
3.16 230960 0 C3 33340 Neoptera
0.06 4692 0 C4 33342 Paraneoptera
0.06 4576 0 O 7524 Hemiptera
0.03 2165 0 O1 33343 Prosorrhyncha
0.03 2165 0 O2 33345 Heteroptera
0.03 2165 0 O3 33347 Euheteroptera
0.03 2165 0 O4 33349 Neoheteroptera
0.03 1877 0 O5 33351 Panheteroptera
0.01 874 0 O6 33354 Cimicomorpha
0.01 874 0 O7 33355 Cimicoidea
0 126 0 F1 236635 Phylinae
0 126 0 F2 236648 Pilophorini
0 126 0 G 237084 Pilophorus
0.27 19812 0 K 4751 Fungi
0.24 17222 0 K1 451864 Dikarya
0.2 14668 0 P 4890 Ascomycota
0.2 14565 0 P1 716545 saccharomyceta
0.14 10258 0 P2 147538 Pezizomycotina
0.14 10225 0 P3 716546 leotiomyceta
0 24 0 C 147547 Lecanoromycetes
0 24 0 C1 1520881 OSLEUM clade
0 24 0 C2 388435 Lecanoromycetidae
0 24 0 O 5197 Lecanorales
0 24 0 O1 157822 Lecanorineae
0 5 0 F 5198 Cladoniaceae
0 4 0 G 5199 Cladonia
0 1 0 G 51977 Pilophorus
19 changes: 19 additions & 0 deletions q2_moshpit/kraken2/tests/test_selection.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,25 @@ def setUp(self):
def tearDown(self):
    """Remove the directory tree at ``self.temp_dir`` after each test."""
    shutil.rmtree(self.temp_dir)

def test_kraken2_to_features_duplicated_genus(self):
    """Taxa that share a genus name must all survive feature selection."""
    report_dir = self.get_data_path("duplicated-genus/")
    reports = Kraken2ReportDirectoryFormat(report_dir, "r")
    obs_table, obs_taxonomy = kraken2_to_features(
        reports, coverage_threshold=0.0
    )

    # 51977 and 237084 are two different taxa that carry the same genus
    # name (Pilophorus). 5199 is the sister genus of 51977: it has to be
    # present in the first evaluated tree while 51977 is absent from it,
    # otherwise the duplicated genus name issue is not triggered when the
    # second tree is merged in.
    for taxon_id in ('51977', '237084', '5199'):
        assert taxon_id in obs_taxonomy.index
        assert taxon_id in obs_table.columns

def test_kraken2_to_features_coverage_threshold(self):
reports = Kraken2ReportDirectoryFormat(
self.get_data_path("kraken2-reports-select/samples"), "r"
Expand Down
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,7 @@
'data/bracken-report/*/*',
'data/bracken-report-with-unclassified/*',
'data/bracken-report-with-unclassified/*/*',
'data/duplicated-genus/*',
'data/kraken2-reports-select/*',
'data/kraken2-reports-select/*/*',
'data/kraken2-to-ncbi-tree/*',
Expand Down

0 comments on commit b264687

Please sign in to comment.