Skip to content

Commit

Permalink
FIX: correctly label 'Unclassified' MAGs in the respective taxonomy (#183
Browse files Browse the repository at this point in the history
)
  • Loading branch information
misialq authored Jun 21, 2024
1 parent 396c161 commit 3e7b75c
Show file tree
Hide file tree
Showing 3 changed files with 33 additions and 0 deletions.
7 changes: 7 additions & 0 deletions q2_moshpit/kraken2/select.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,12 @@
# One-letter rank codes; they match, in order, the letters that precede `__`
# in taxon strings such as 'd__Bacteria;k__Bacteria;p__...;s__...'.
# NOTE(review): how RANKS is consumed is not visible here - confirm at use site.
RANKS = 'dkpcofgs'


def _fill_unclassified(row):
if row.isnull().all():
row[0] = 'Unclassified'
return row


def _find_lcas(taxa_list: List[pd.DataFrame], mode: str):
"""Find the least common ancestor in every DataFrame of taxa.
Expand Down Expand Up @@ -54,6 +60,7 @@ def _find_lcas(taxa_list: List[pd.DataFrame], mode: str):
results[mag_id] = result

results = pd.DataFrame.from_dict(results, orient='index')
results = results.apply(_fill_unclassified, axis=1)
results = results.apply(lambda x: x.tolist(), axis=1).to_frame()
results.columns = ['Taxon']

Expand Down
5 changes: 5 additions & 0 deletions q2_moshpit/kraken2/tests/data/mag-taxa-5.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
Feature ID,Taxon,mag_id
k141_1566,d__Bacteria;k__Bacteria;p__Actinomycetota;c__Actinomycetes;o__Mycobacteriales;f__Mycobacteriaceae;g__Mycobacterium;s__Mycobacterium florentinum,fed92059-3222-4573-b0ec-726c49fbfabb
k141_1045,d__Bacteria;k__Bacteria;p__Actinomycetota;c__Actinomycetes;o__Mycobacteriales;f__Mycobacteriaceae;g__Mycobacterium;s__Mycobacterium florentinum,fed92059-3222-4573-b0ec-726c49fbfabb
k141_1566,d__Bacteria;k__Bacteria;p__Actinomycetota;c__Actinomycetes;o__Mycobacteriales;f__Mycobacteriaceae;g__Mycobacterium;s__Mycobacterium florentinum,fed92059-3222-4573-b0ec-726c49fbfabb
k141_1045,d__Eukaryota;k__Metazoa;p__Chordata;c__Mammalia;o__Primates;f__Hominidae;g__Homo;s__Homo sapiens,fed92059-3222-4573-b0ec-726c49fbfabb
21 changes: 21 additions & 0 deletions q2_moshpit/kraken2/tests/test_selection.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,9 @@ def setUp(self):
self.taxa_mag4 = pd.read_csv(
self.get_data_path('mag-taxa-4.csv'), index_col=0
)
self.taxa_mag5 = pd.read_csv(
self.get_data_path('mag-taxa-5.csv'), index_col=0
)

def tearDown(self):
shutil.rmtree(self.temp_dir)
Expand Down Expand Up @@ -160,6 +163,24 @@ def test_find_lcas_mode_lca(self):
exp.index.name = 'Feature ID'
pandas.testing.assert_frame_equal(obs, exp)

def test_find_lcas_mode_lca_unclassified(self):
    """LCA mode reports 'd__Unclassified' for a MAG with no shared rank.

    The mag-taxa-5 fixture mixes contigs assigned to Bacteria with one
    assigned to Eukaryota, so its lineages already differ at the first rank.
    """
    # Lineage expected for the two MAGs that still resolve to a genus.
    mycobacterium = (
        'd__Bacteria;k__Bacteria;p__Actinomycetota;c__Actinomycetes;'
        'o__Mycobacteriales;f__Mycobacteriaceae;g__Mycobacterium'
    )
    expected_by_mag = {
        '0e514d88-16c4-4273-a1df-1a360eb2c823': [mycobacterium],
        '3acec411-b0d0-4441-b936-5b8b571fa328': [mycobacterium],
        'fed92059-3222-4573-b0ec-726c49fbfabb': ['d__Unclassified'],
    }
    expected = pd.DataFrame.from_dict(expected_by_mag, orient='index')
    expected.index.name = 'Feature ID'
    expected.columns = ['Taxon']

    observed = _find_lcas(
        [self.taxa_mag1, self.taxa_mag2, self.taxa_mag5], mode='lca'
    )

    pandas.testing.assert_frame_equal(observed, expected)

# def test_find_lcas_mode_majority(self):
# taxa = [
# self.taxa_mag1, self.taxa_mag2, self.taxa_mag3, self.taxa_mag4
Expand Down

0 comments on commit 3e7b75c

Please sign in to comment.