Skip to content

Commit

Permalink
added test for utils
Browse files Browse the repository at this point in the history
  • Loading branch information
Sann5 committed May 7, 2024
1 parent eedf915 commit 1d1d6bb
Show file tree
Hide file tree
Showing 5 changed files with 159 additions and 19 deletions.
9 changes: 5 additions & 4 deletions q2_moshpit/busco/busco.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,8 @@

from q2_moshpit.busco.utils import (
_parse_busco_params, _collect_summaries, _rename_columns,
_parse_df_columns, _partition_dataframe, _calculate_summary_stats,
_parse_df_columns, _partition_dataframe_sample_data,
_calculate_summary_stats, _partition_dataframe_feature_data,
_get_feature_table, _cleanup_bootstrap, _get_mag_lengths
)
from q2_moshpit._utils import _process_common_input_params, run_command
Expand Down Expand Up @@ -155,10 +156,10 @@ def _visualize_busco(output_dir: str, busco_results: pd.DataFrame) -> None:
)
# Outputs different df for sample and feature data
busco_results = _parse_df_columns(busco_results)
n = 100
n = 100 # Max number of rows

if len(busco_results["sample_id"].unique()) >= 2:
dfs = _partition_dataframe(busco_results, max_rows=n)
dfs = _partition_dataframe_sample_data(busco_results, max_rows=n)
column_name = "sample_id"
assets_subdir = "sample_data"
tab_title = ["Sample details", "Feature details"]
Expand All @@ -169,7 +170,7 @@ def _visualize_busco(output_dir: str, busco_results: pd.DataFrame) -> None:
json.dumps(_draw_selectable_summary_histograms(busco_results))
}
else:
dfs = [busco_results[i:i+100] for i in range(0, len(busco_results), n)]
dfs = _partition_dataframe_feature_data(busco_results, n)
column_name = "mag_id"
tab_title = ["BUSCO Plots", "BUSCO Table"]
assets_subdir = "feature_data"
Expand Down
36 changes: 36 additions & 0 deletions q2_moshpit/busco/tests/data/feature_table_feature_data.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
{
"columns": [
"MAG",
"Dataset",
"% single",
"% duplicated",
"% fragmented",
"% missing",
"% complete",
"Total markers",
"N50 contigs",
"Percent gaps",
"Contigs",
"Length (bp)"
],
"index": [
0
],
"data": [
[
"mag1",
"dataset1",
1,
4,
7,
10,
13,
16,
19,
22,
25,
28
]
]
}

112 changes: 98 additions & 14 deletions q2_moshpit/busco/tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,11 @@

from q2_moshpit.busco.utils import (
_parse_busco_params, _collect_summaries, _parse_df_columns,
_partition_dataframe, _get_feature_table, _calculate_summary_stats,
_get_mag_lengths,
_partition_dataframe_sample_data, _partition_dataframe_feature_data,
_get_feature_table, _calculate_summary_stats, _get_mag_lengths,
)
from q2_types.per_sample_sequences._format import MultiMAGSequencesDirFmt
from q2_types.feature_data_mag import MAGSequencesDirFmt


class TestBUSCOUtils(TestPluginBase):
Expand All @@ -28,6 +29,10 @@ def setUp(self):
path=self.get_data_path('mags'),
mode="r",
)
self.feature_data_mags = MAGSequencesDirFmt(
path=self.get_data_path('mags/sample1'),
mode="r",
)
self.df1 = pd.DataFrame({
'sample_id': ['sample1'] * 6 + ['sample2'] * 4 + ['sample3'] * 5,
'mag_id': [f'mag{i}' for i in range(1, 16)],
Expand Down Expand Up @@ -116,50 +121,118 @@ def test_parse_df_columns(self):
exp = self.df5
pd.testing.assert_frame_equal(obs, exp)

def test_partition_dataframe_max_rows_5(self):
partitions = _partition_dataframe(self.df1, max_rows=5)
def test_partition_dataframe_sample_data_max_rows_5(self):
partitions = _partition_dataframe_sample_data(self.df1, max_rows=5)
self.assertEqual(len(partitions), 3)
obs_shapes = [p.shape for p in partitions]
exp_shapes = [(6, 3), (4, 3), (5, 3)]
self.assertListEqual(obs_shapes, exp_shapes)

partitions = _partition_dataframe(self.df2, max_rows=5)
partitions = _partition_dataframe_sample_data(self.df2, max_rows=5)
self.assertEqual(len(partitions), 3)
obs_shapes = [p.shape for p in partitions]
exp_shapes = [(6, 3), (6, 3), (3, 3)]
self.assertListEqual(obs_shapes, exp_shapes)

def test_partition_dataframe_max_rows_10(self):
partitions = _partition_dataframe(self.df1, max_rows=10)
def test_partition_dataframe_sample_data_max_rows_10(self):
partitions = _partition_dataframe_sample_data(self.df1, max_rows=10)
self.assertEqual(len(partitions), 2)
obs_shapes = [p.shape for p in partitions]
exp_shapes = [(10, 3), (5, 3)]
self.assertListEqual(obs_shapes, exp_shapes)

partitions = _partition_dataframe(self.df2, max_rows=10)
partitions = _partition_dataframe_sample_data(self.df2, max_rows=10)
self.assertEqual(len(partitions), 2)
obs_shapes = [p.shape for p in partitions]
exp_shapes = [(6, 3), (9, 3)]
self.assertListEqual(obs_shapes, exp_shapes)

def test_partition_dataframe_max_rows_15(self):
partitions = _partition_dataframe(self.df1, max_rows=15)
def test_partition_dataframe_sample_data_max_rows_15(self):
partitions = _partition_dataframe_sample_data(self.df1, max_rows=15)
self.assertEqual(len(partitions), 1)
obs_shapes = [p.shape for p in partitions]
exp_shapes = [(15, 3),]
self.assertListEqual(obs_shapes, exp_shapes)

partitions = _partition_dataframe(self.df2, max_rows=15)
partitions = _partition_dataframe_sample_data(self.df2, max_rows=15)
self.assertEqual(len(partitions), 1)
obs_shapes = [p.shape for p in partitions]
exp_shapes = [(15, 3), ]
self.assertListEqual(obs_shapes, exp_shapes)

def test_get_feature_table(self):
def test_partition_dataframe_feature_data_max_rows_5(self):
    """With max_rows=5, a 6-row frame splits into 5+1 rows and a
    3-row frame stays in a single chunk."""
    max_rows = 5

    sample1_rows = self.df1[self.df1["sample_id"] == "sample1"].copy()
    chunks = _partition_dataframe_feature_data(
        sample1_rows, max_rows=max_rows
    )
    self.assertEqual(len(chunks), 2)
    self.assertListEqual(
        [chunk.shape for chunk in chunks], [(5, 3), (1, 3)]
    )

    sample3_rows = self.df2[self.df2["sample_id"] == "sample3"].copy()
    chunks = _partition_dataframe_feature_data(
        sample3_rows, max_rows=max_rows
    )
    self.assertEqual(len(chunks), 1)
    self.assertListEqual([chunk.shape for chunk in chunks], [(3, 3)])

def test_partition_dataframe_feature_data_max_rows_10(self):
    """With max_rows=10, both 6-row inputs fit into one chunk each."""
    max_rows = 10

    sample1_rows = self.df1[self.df1["sample_id"] == "sample1"].copy()
    chunks = _partition_dataframe_feature_data(
        sample1_rows, max_rows=max_rows
    )
    self.assertEqual(len(chunks), 1)
    self.assertListEqual([chunk.shape for chunk in chunks], [(6, 3)])

    sample2_rows = self.df2[self.df2["sample_id"] == "sample2"].copy()
    chunks = _partition_dataframe_feature_data(
        sample2_rows, max_rows=max_rows
    )
    self.assertEqual(len(chunks), 1)
    self.assertListEqual([chunk.shape for chunk in chunks], [(6, 3)])

def test_partition_dataframe_feature_data_max_rows_15(self):
    """Partitioning with max_rows=15 keeps each (<=15-row) input whole.

    Fix: the method name and intent say max_rows=15, but the body
    previously reused ``n = 10`` from a copy-paste of the max_rows_10
    test, so 15 was never exercised. The expected shapes are unchanged
    because both filtered inputs have fewer than 15 rows.
    """
    n = 15
    df1 = self.df1.copy()
    df1 = df1.loc[df1["sample_id"] == "sample1"]
    partitions = _partition_dataframe_feature_data(df1, max_rows=n)
    self.assertEqual(len(partitions), 1)
    obs_shapes = [p.shape for p in partitions]
    exp_shapes = [(6, 3)]
    self.assertListEqual(obs_shapes, exp_shapes)

    df2 = self.df2.copy()
    df2 = df2.loc[df2["sample_id"] == "sample2"]
    partitions = _partition_dataframe_feature_data(df2, max_rows=n)
    self.assertEqual(len(partitions), 1)
    obs_shapes = [p.shape for p in partitions]
    exp_shapes = [(6, 3)]
    self.assertListEqual(obs_shapes, exp_shapes)

def test_get_feature_table_sample_data(self):
obs = json.loads(
_get_feature_table(self.df3)
)
with open(self.get_data_path('feature_table.json'), 'r') as f:
with open(
self.get_data_path('feature_table_sample_data.json'), 'r'
) as f:
exp = json.load(f)
self.assertDictEqual(obs, exp)

def test_get_feature_table_feature_data(self):
    """Feature-data table JSON matches the stored fixture for sample1."""
    single_sample = self.df3[self.df3["sample_id"] == "sample1"].copy()
    observed = json.loads(_get_feature_table(single_sample))

    fixture_fp = self.get_data_path('feature_table_feature_data.json')
    with open(fixture_fp, 'r') as fh:
        expected = json.load(fh)

    self.assertDictEqual(observed, expected)

Expand Down Expand Up @@ -205,7 +278,7 @@ def test_calculate_summary_stats(self):

self.assertEqual(obs, exp)

def test_get_mag_lengths(self):
def test_get_mag_lengths_sample_data(self):
obs = _get_mag_lengths(self.mags)
exp = pd.Series(
{
Expand All @@ -218,3 +291,14 @@ def test_get_mag_lengths(self):
}, name="length"
)
pd.testing.assert_series_equal(obs, exp)

def test_get_mag_lengths_feature_data(self):
    """Sequence lengths are computed per MAG ID for feature data."""
    expected_lengths = {
        '24dee6fe-9b84-45bb-8145-de7b092533a1': 1935,
        'ca7012fc-ba65-40c3-84f5-05aa478a7585': 3000,
        'fb0bc871-04f6-486b-a10e-8e0cb66f8de3': 2000,
    }
    observed = _get_mag_lengths(self.feature_data_mags)
    pd.testing.assert_series_equal(
        observed, pd.Series(expected_lengths, name="length")
    )
21 changes: 20 additions & 1 deletion q2_moshpit/busco/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ def _parse_busco_params(arg_key, arg_val) -> List[str]:
return [f"--{arg_key}", str(arg_val)]


def _partition_dataframe(df: pd.DataFrame, max_rows: int) -> list:
def _partition_dataframe_sample_data(df: pd.DataFrame, max_rows: int) -> list:
"""
Partitions a DataFrame into smaller DataFrames based on
a maximum row limit.
Expand Down Expand Up @@ -93,6 +93,25 @@ def _partition_dataframe(df: pd.DataFrame, max_rows: int) -> list:
return partitions


def _partition_dataframe_feature_data(df: pd.DataFrame, max_rows: int) -> list:
"""
Partitions a DataFrame into smaller DataFrames based on
a maximum row limit. Each partition will have a total
row count less than or equal to the `max_rows` parameter.
Args:
df (pd.DataFrame): The DataFrame to partition. It should have a
'sample_id' column.
max_rows (int): The maximum number of rows that each partitioned
DataFrame should have.
Returns:
list: A list of partitioned DataFrames. Each DataFrame in the
list is a partition of the original DataFrame.
"""
return [df[i:i+max_rows] for i in range(0, len(df), max_rows)]


def _collect_summaries(run_summaries_fp_map: dict) -> pd.DataFrame:
"""
Reads-in the sample-wise summaries and concatenates them in one
Expand Down

0 comments on commit 1d1d6bb

Please sign in to comment.