Skip to content

Commit

Permalink
added test for utils
Browse files Browse the repository at this point in the history
  • Loading branch information
Sann5 committed May 7, 2024
1 parent eedf915 commit 1d1d6bb
Show file tree
Hide file tree
Showing 5 changed files with 159 additions and 19 deletions.
9 changes: 5 additions & 4 deletions q2_moshpit/busco/busco.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,8 @@

from q2_moshpit.busco.utils import (
_parse_busco_params, _collect_summaries, _rename_columns,
_parse_df_columns, _partition_dataframe, _calculate_summary_stats,
_parse_df_columns, _partition_dataframe_sample_data,
_calculate_summary_stats, _partition_dataframe_feature_data,
_get_feature_table, _cleanup_bootstrap, _get_mag_lengths
)
from q2_moshpit._utils import _process_common_input_params, run_command
Expand Down Expand Up @@ -155,10 +156,10 @@ def _visualize_busco(output_dir: str, busco_results: pd.DataFrame) -> None:
)
# Outputs different df for sample and feature data
busco_results = _parse_df_columns(busco_results)
n = 100
n = 100 # Max number of rows

if len(busco_results["sample_id"].unique()) >= 2:
dfs = _partition_dataframe(busco_results, max_rows=n)
dfs = _partition_dataframe_sample_data(busco_results, max_rows=n)
column_name = "sample_id"
assets_subdir = "sample_data"
tab_title = ["Sample details", "Feature details"]
Expand All @@ -169,7 +170,7 @@ def _visualize_busco(output_dir: str, busco_results: pd.DataFrame) -> None:
json.dumps(_draw_selectable_summary_histograms(busco_results))
}
else:
dfs = [busco_results[i:i+100] for i in range(0, len(busco_results), n)]
dfs = _partition_dataframe_feature_data(busco_results, n)
column_name = "mag_id"
tab_title = ["BUSCO Plots", "BUSCO Table"]
assets_subdir = "feature_data"
Expand Down
36 changes: 36 additions & 0 deletions q2_moshpit/busco/tests/data/feature_table_feature_data.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
{
"columns": [
"MAG",
"Dataset",
"% single",
"% duplicated",
"% fragmented",
"% missing",
"% complete",
"Total markers",
"N50 contigs",
"Percent gaps",
"Contigs",
"Length (bp)"
],
"index": [
0
],
"data": [
[
"mag1",
"dataset1",
1,
4,
7,
10,
13,
16,
19,
22,
25,
28
]
]
}

112 changes: 98 additions & 14 deletions q2_moshpit/busco/tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,11 @@

from q2_moshpit.busco.utils import (
_parse_busco_params, _collect_summaries, _parse_df_columns,
_partition_dataframe, _get_feature_table, _calculate_summary_stats,
_get_mag_lengths,
_partition_dataframe_sample_data, _partition_dataframe_feature_data,
_get_feature_table, _calculate_summary_stats, _get_mag_lengths,
)
from q2_types.per_sample_sequences._format import MultiMAGSequencesDirFmt
from q2_types.feature_data_mag import MAGSequencesDirFmt


class TestBUSCOUtils(TestPluginBase):
Expand All @@ -28,6 +29,10 @@ def setUp(self):
path=self.get_data_path('mags'),
mode="r",
)
self.feature_data_mags = MAGSequencesDirFmt(
path=self.get_data_path('mags/sample1'),
mode="r",
)
self.df1 = pd.DataFrame({
'sample_id': ['sample1'] * 6 + ['sample2'] * 4 + ['sample3'] * 5,
'mag_id': [f'mag{i}' for i in range(1, 16)],
Expand Down Expand Up @@ -116,50 +121,118 @@ def test_parse_df_columns(self):
exp = self.df5
pd.testing.assert_frame_equal(obs, exp)

def test_partition_dataframe_max_rows_5(self):
partitions = _partition_dataframe(self.df1, max_rows=5)
def test_partition_dataframe_sample_data_max_rows_5(self):
partitions = _partition_dataframe_sample_data(self.df1, max_rows=5)
self.assertEqual(len(partitions), 3)
obs_shapes = [p.shape for p in partitions]
exp_shapes = [(6, 3), (4, 3), (5, 3)]
self.assertListEqual(obs_shapes, exp_shapes)

partitions = _partition_dataframe(self.df2, max_rows=5)
partitions = _partition_dataframe_sample_data(self.df2, max_rows=5)
self.assertEqual(len(partitions), 3)
obs_shapes = [p.shape for p in partitions]
exp_shapes = [(6, 3), (6, 3), (3, 3)]
self.assertListEqual(obs_shapes, exp_shapes)

def test_partition_dataframe_max_rows_10(self):
partitions = _partition_dataframe(self.df1, max_rows=10)
def test_partition_dataframe_sample_data_max_rows_10(self):
partitions = _partition_dataframe_sample_data(self.df1, max_rows=10)
self.assertEqual(len(partitions), 2)
obs_shapes = [p.shape for p in partitions]
exp_shapes = [(10, 3), (5, 3)]
self.assertListEqual(obs_shapes, exp_shapes)

partitions = _partition_dataframe(self.df2, max_rows=10)
partitions = _partition_dataframe_sample_data(self.df2, max_rows=10)
self.assertEqual(len(partitions), 2)
obs_shapes = [p.shape for p in partitions]
exp_shapes = [(6, 3), (9, 3)]
self.assertListEqual(obs_shapes, exp_shapes)

def test_partition_dataframe_max_rows_15(self):
partitions = _partition_dataframe(self.df1, max_rows=15)
def test_partition_dataframe_sample_data_max_rows_15(self):
partitions = _partition_dataframe_sample_data(self.df1, max_rows=15)
self.assertEqual(len(partitions), 1)
obs_shapes = [p.shape for p in partitions]
exp_shapes = [(15, 3),]
self.assertListEqual(obs_shapes, exp_shapes)

partitions = _partition_dataframe(self.df2, max_rows=15)
partitions = _partition_dataframe_sample_data(self.df2, max_rows=15)
self.assertEqual(len(partitions), 1)
obs_shapes = [p.shape for p in partitions]
exp_shapes = [(15, 3), ]
self.assertListEqual(obs_shapes, exp_shapes)

def test_get_feature_table(self):
def test_partition_dataframe_feature_data_max_rows_5(self):
    """With max_rows=5, a 6-row frame splits into 5+1 rows and a
    3-row frame stays in a single chunk."""
    max_rows = 5

    sample1_rows = self.df1[self.df1["sample_id"] == "sample1"].copy()
    chunks = _partition_dataframe_feature_data(
        sample1_rows, max_rows=max_rows
    )
    self.assertEqual(len(chunks), 2)
    self.assertListEqual(
        [chunk.shape for chunk in chunks], [(5, 3), (1, 3)]
    )

    sample3_rows = self.df2[self.df2["sample_id"] == "sample3"].copy()
    chunks = _partition_dataframe_feature_data(
        sample3_rows, max_rows=max_rows
    )
    self.assertEqual(len(chunks), 1)
    self.assertListEqual([chunk.shape for chunk in chunks], [(3, 3)])

def test_partition_dataframe_feature_data_max_rows_10(self):
    """With max_rows=10, both 6-row inputs fit into one chunk each."""
    max_rows = 10

    sample1_rows = self.df1[self.df1["sample_id"] == "sample1"].copy()
    chunks = _partition_dataframe_feature_data(
        sample1_rows, max_rows=max_rows
    )
    self.assertEqual(len(chunks), 1)
    self.assertListEqual([chunk.shape for chunk in chunks], [(6, 3)])

    sample2_rows = self.df2[self.df2["sample_id"] == "sample2"].copy()
    chunks = _partition_dataframe_feature_data(
        sample2_rows, max_rows=max_rows
    )
    self.assertEqual(len(chunks), 1)
    self.assertListEqual([chunk.shape for chunk in chunks], [(6, 3)])

def test_partition_dataframe_feature_data_max_rows_15(self):
    """Partitioning with max_rows=15 keeps each (<=15-row) input whole.

    Fix: the method name and intent say max_rows=15, but the body
    previously reused ``n = 10`` from a copy-paste of the max_rows_10
    test, so 15 was never exercised. The expected shapes are unchanged
    because both filtered inputs have fewer than 15 rows.
    """
    n = 15
    df1 = self.df1.copy()
    df1 = df1.loc[df1["sample_id"] == "sample1"]
    partitions = _partition_dataframe_feature_data(df1, max_rows=n)
    self.assertEqual(len(partitions), 1)
    obs_shapes = [p.shape for p in partitions]
    exp_shapes = [(6, 3)]
    self.assertListEqual(obs_shapes, exp_shapes)

    df2 = self.df2.copy()
    df2 = df2.loc[df2["sample_id"] == "sample2"]
    partitions = _partition_dataframe_feature_data(df2, max_rows=n)
    self.assertEqual(len(partitions), 1)
    obs_shapes = [p.shape for p in partitions]
    exp_shapes = [(6, 3)]
    self.assertListEqual(obs_shapes, exp_shapes)

def test_get_feature_table_sample_data(self):
obs = json.loads(
_get_feature_table(self.df3)
)
with open(self.get_data_path('feature_table.json'), 'r') as f:
with open(
self.get_data_path('feature_table_sample_data.json'), 'r'
) as f:
exp = json.load(f)
self.assertDictEqual(obs, exp)

def test_get_feature_table_feature_data(self):
    """Feature-data table JSON matches the stored fixture for sample1."""
    single_sample = self.df3[self.df3["sample_id"] == "sample1"].copy()
    observed = json.loads(_get_feature_table(single_sample))

    fixture_fp = self.get_data_path('feature_table_feature_data.json')
    with open(fixture_fp, 'r') as fh:
        expected = json.load(fh)

    self.assertDictEqual(observed, expected)

Expand Down Expand Up @@ -205,7 +278,7 @@ def test_calculate_summary_stats(self):

self.assertEqual(obs, exp)

def test_get_mag_lengths(self):
def test_get_mag_lengths_sample_data(self):
obs = _get_mag_lengths(self.mags)
exp = pd.Series(
{
Expand All @@ -218,3 +291,14 @@ def test_get_mag_lengths(self):
}, name="length"
)
pd.testing.assert_series_equal(obs, exp)

def test_get_mag_lengths_feature_data(self):
    """Sequence lengths are computed per MAG ID for feature data."""
    expected_lengths = {
        '24dee6fe-9b84-45bb-8145-de7b092533a1': 1935,
        'ca7012fc-ba65-40c3-84f5-05aa478a7585': 3000,
        'fb0bc871-04f6-486b-a10e-8e0cb66f8de3': 2000,
    }
    observed = _get_mag_lengths(self.feature_data_mags)
    pd.testing.assert_series_equal(
        observed, pd.Series(expected_lengths, name="length")
    )
21 changes: 20 additions & 1 deletion q2_moshpit/busco/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ def _parse_busco_params(arg_key, arg_val) -> List[str]:
return [f"--{arg_key}", str(arg_val)]


def _partition_dataframe(df: pd.DataFrame, max_rows: int) -> list:
def _partition_dataframe_sample_data(df: pd.DataFrame, max_rows: int) -> list:
"""
Partitions a DataFrame into smaller DataFrames based on
a maximum row limit.
Expand Down Expand Up @@ -93,6 +93,25 @@ def _partition_dataframe(df: pd.DataFrame, max_rows: int) -> list:
return partitions


def _partition_dataframe_feature_data(df: pd.DataFrame, max_rows: int) -> list:
"""
Partitions a DataFrame into smaller DataFrames based on
a maximum row limit. Each partition will have a total
row count less than or equal to the `max_rows` parameter.
Args:
df (pd.DataFrame): The DataFrame to partition. It should have a
'sample_id' column.
max_rows (int): The maximum number of rows that each partitioned
DataFrame should have.
Returns:
list: A list of partitioned DataFrames. Each DataFrame in the
list is a partition of the original DataFrame.
"""
return [df[i:i+max_rows] for i in range(0, len(df), max_rows)]


def _collect_summaries(run_summaries_fp_map: dict) -> pd.DataFrame:
"""
Reads-in the sample-wise summaries and concatenates them in one
Expand Down

0 comments on commit 1d1d6bb

Please sign in to comment.