From 2233cddd2da4aa3346cb3c506f63b398f0999dc3 Mon Sep 17 00:00:00 2001 From: "askerosted@gmail.com" Date: Wed, 23 Oct 2024 18:21:55 +0900 Subject: [PATCH 01/18] cluster and pad utility --- src/graphnet/models/graphs/utils.py | 236 +++++++++++++++++++++++++++- 1 file changed, 235 insertions(+), 1 deletion(-) diff --git a/src/graphnet/models/graphs/utils.py b/src/graphnet/models/graphs/utils.py index ea8445f90..4ba99d1a4 100644 --- a/src/graphnet/models/graphs/utils.py +++ b/src/graphnet/models/graphs/utils.py @@ -1,6 +1,6 @@ """Utility functions for construction of graphs.""" -from typing import List, Tuple +from typing import List, Tuple, Optional, Union import os import numpy as np import pandas as pd @@ -113,6 +113,7 @@ def identify_indices( return cluster_indices, summarization_indices, features_for_summarization +# TODO Remove this function as it is superseded by the class cluster_and_pad wich has the same functionality def cluster_summarize_with_percentiles( x: np.ndarray, summarization_indices: List[int], @@ -149,6 +150,9 @@ def cluster_summarize_with_percentiles( Returns: Percentile-summarized array """ + print( + "This function is deprecated and will be removed, use the class cluster_and_pad with add_percentile_summary instead for the same functionality" + ) pct_dict = {} for feature_idx in summarization_indices: summarized_array, column_offset, counts = gather_cluster_sequence( @@ -172,6 +176,236 @@ def cluster_summarize_with_percentiles( return array +class cluster_and_pad: + """cluster and pad the data for further summarization.""" + + def __init__(self, x: np.ndarray, cluster_columns: List[int]) -> None: + """Initialize the class with the data and cluster columns. + + Args: + x: Array to be clustered + cluster_columns: List of column indices on which the clusters + are constructed. + Returns: None + Adds: + clustered_x: Added to the class + _counts: Added to the class + _padded_x: Added to the class + """ + x = lex_sort(x=x, cluster_columns=cluster_columns) + + unique_sensors, self._counts = np.unique( + x[:, cluster_columns], axis=0, return_counts=True + ) + + contingency_table = np.concatenate( + [unique_sensors, self._counts.reshape(-1, 1)], axis=1 + ) + + contingency_table = lex_sort( + x=contingency_table, cluster_columns=cluster_columns + ) + + self.clustered_x = contingency_table[:, 0 : unique_sensors.shape[1]] + self._counts = ( + contingency_table[:, self.clustered_x.shape[1] :] + .flatten() + .astype(int) + ) + + self._padded_x = np.empty( + (len(self._counts), max(self._counts), x.shape[1]) + ) + self._padded_x.fill(np.nan) + + for i in range(len(self._counts)): + self._padded_x[i, : self._counts[i]] = x[: self._counts[i]] + x = x[self._counts[i] :] + + def _add_column( + self, column: np.ndarray, location: Optional[int] = None + ) -> None: + """Add a column to the clustered tensor. + + Args: + column: Column to be added to the tensor + location: Location to insert the column in the clustered tensor + Returns: + clustered_x: The clustered tensor with the column added + """ + if location is None: + self.clustered_x = np.column_stack([self.clustered_x, column]) + else: + self.clustered_x = np.insert( + self.clustered_x, location, column, axis=1 + ) + + def add_charge_threshold_summary( + self, + summarization_indices: List[int], + percentiles: List[int], + charge_index: int, + location: Optional[int] = None, + ) -> np.ndarray: + """Summarize features through percentiles on charge of sensor. 
+ + Args: + summarization_indices: List of column indices that defines features + that will be summarized with percentiles. + percentiles: percentiles used to summarize `x`. E.g. [10,50,90]. + charge_index: index of the charge column in the padded tensor + location: Location to insert the summarization indices in the clustered tensor defaults to adding at the end + Returns: + clustered_x: The clustered tensor with the summarization indices added + Adds: + _charge_sum: Added to the class + _charge_weights: Added to the class + Altered: + _padded_x: Charge is altered to be the cumulative sum + of the charge divided by the total charge + clustered_x: The summarization indices are added at the end of the tensor + """ + # convert the charge to the cumulative sum of the charge divided by the total charge + self._charge_weights = self._padded_x[:, :, charge_index] + + self._padded_x[:, :, charge_index] = self._padded_x[ + :, :, charge_index + ].cumsum(axis=1) + + # add the charge sum to the class if it does not already exist + if not hasattr(self, "_charge_sum"): + self._charge_sum = np.nanmax( + self._padded_x[:, :, charge_index], axis=1 + ) + + self._charge_weights = ( + self._charge_weights / self._charge_sum[:, np.newaxis] + ) + + self._padded_x[:, :, charge_index] = ( + self._padded_x[:, :, charge_index] + / self._charge_sum[:, np.newaxis] + ) + + # Summarize the charge at different percentiles + selections = np.argmax( + self._padded_x[:, :, charge_index][:, :, np.newaxis] + >= (np.array(percentiles) / 100), + axis=1, + ) + + selections += (np.arange(len(self._counts)) * self._padded_x.shape[1])[ + :, np.newaxis + ] + + selections = self._padded_x[:, :, summarization_indices].reshape( + -1, len(summarization_indices) + )[selections] + selections = selections.transpose(0, 2, 1).reshape( + len(self.clustered_x), -1 + ) + self._add_column(selections, location) + return self.clustered_x + + def add_percentile_summary( + self, + summarization_indices: List[int], + percentiles: List[int], + method: str = "linear", + location: Optional[int] = None, + ) -> np.ndarray: + """Summarize the features of the sensors using percentiles. + + Args: + summarization_indices: List of column indices that defines features + that will be summarized with percentiles. + percentiles: percentiles used to summarize `x`. E.g. [10,50,90]. + method: Method to summarize the features. E.g. 
"linear" + location: Location to insert the summarization indices in the clustered tensor defaults to adding at the end + Returns: + None + Adds: + None + Altered: + clustered_x: The summarization indices are added at the end of the tensor + """ + percentiles_x = np.nanpercentile( + self._padded_x[:, :, summarization_indices], + percentiles, + axis=1, + method=method, + ) + + percentiles_x = percentiles_x.transpose(1, 2, 0).reshape( + len(self.clustered_x), -1 + ) + self._add_column(percentiles_x, location) + return self.clustered_x + + def add_counts(self, location: int) -> np.ndarray: + """Add the counts of the sensor to the summarization features.""" + self._add_column(np.log10(self._counts), location) + return self.clustered_x + + def calculate_charge_sum(self, charge_index: int) -> np.ndarray: + """Calculate the sum of the charge.""" + assert not hasattr( + self, "_charge_sum" + ), "Charge sum has already been calculated, re-calculation is not allowed" + self._charge_sum = self._padded_x[:, :, charge_index].sum(axis=1) + return self._charge_sum + + def calculate_charge_weights(self, charge_index: int) -> np.ndarray: + """Calculate the weights of the charge.""" + assert not hasattr( + self, "_charge_weights" + ), "Charge weights have already been calculated, re-calculation is not allowed" + assert hasattr( + self, "_charge_sum" + ), "Charge sum has not been calculated, please run calculate_charge_sum" + self._charge_weights = ( + self._padded_x[:, :, charge_index] + / self._charge_sum[:, np.newaxis] + ) + return self._charge_weights + + def add_sum_charge(self, location: int) -> np.ndarray: + """Add the sum of the charge to the summarization features.""" + assert hasattr( + self, "_charge_sum" + ), "Charge sum has not been calculated, please run calculate_charge_sum" + self._add_column(self._charge_sum, location) + return self.clustered_x + + def add_std( + self, + column: int, + location: Optional[int] = None, + weights: Union[np.ndarray, int] = 1, + ) -> np.ndarray: + """Add the standard deviation of the column. 
+ + Args: + column: Index of the column in the padded tensor to calculate the standard deviation + location: Location to insert the standard deviation in the clustered tensor defaults to adding at the end + weights: Optional weights to be applied to the standard deviation + """ + self._add_column( + np.nanstd(self._padded_x[:, :, column] * weights, axis=1), location + ) + return self.clustered_x + + def add_mean( + self, column: int, location: int, weights: Union[np.ndarray, int] = 1 + ) -> np.ndarray: + """Add the mean of the column.""" + self._add_column( + np.nanmean(self._padded_x[:, :, column] * weights, axis=1), + location, + ) + return self.clustered_x + + def ice_transparency( z_offset: float = None, z_scaling: float = None ) -> Tuple[interp1d, interp1d]: From 19122e7bef2ed2a2cef0286ebac401265eef4fcc Mon Sep 17 00:00:00 2001 From: "askerosted@gmail.com" Date: Thu, 24 Oct 2024 16:59:02 +0900 Subject: [PATCH 02/18] Location default None --- src/graphnet/models/graphs/utils.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/src/graphnet/models/graphs/utils.py b/src/graphnet/models/graphs/utils.py index 4ba99d1a4..078aa04f2 100644 --- a/src/graphnet/models/graphs/utils.py +++ b/src/graphnet/models/graphs/utils.py @@ -342,11 +342,6 @@ def add_percentile_summary( self._add_column(percentiles_x, location) return self.clustered_x - def add_counts(self, location: int) -> np.ndarray: - """Add the counts of the sensor to the summarization features.""" - self._add_column(np.log10(self._counts), location) - return self.clustered_x - def calculate_charge_sum(self, charge_index: int) -> np.ndarray: """Calculate the sum of the charge.""" assert not hasattr( @@ -369,7 +364,12 @@ def calculate_charge_weights(self, charge_index: int) -> np.ndarray: ) return self._charge_weights - def add_sum_charge(self, location: int) -> np.ndarray: + def add_counts(self, location: Optional[int]) -> np.ndarray: + """Add the counts of the sensor to the summarization features.""" + self._add_column(np.log10(self._counts), location) + return self.clustered_x + + def add_sum_charge(self, location: Optional[int] = None) -> np.ndarray: """Add the sum of the charge to the summarization features.""" assert hasattr( self, "_charge_sum" @@ -396,7 +396,10 @@ def add_std( return self.clustered_x def add_mean( - self, column: int, location: int, weights: Union[np.ndarray, int] = 1 + self, + column: int, + location: Optional[int] = None, + weights: Union[np.ndarray, int] = 1, ) -> np.ndarray: """Add the mean of the column.""" self._add_column( From 1e9e45acbba757040fac5b30ff8363f04df38683 Mon Sep 17 00:00:00 2001 From: "askerosted@gmail.com" Date: Thu, 24 Oct 2024 17:01:06 +0900 Subject: [PATCH 03/18] more default none --- src/graphnet/models/graphs/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/graphnet/models/graphs/utils.py b/src/graphnet/models/graphs/utils.py index 078aa04f2..16f7d63ee 100644 --- a/src/graphnet/models/graphs/utils.py +++ b/src/graphnet/models/graphs/utils.py @@ -364,7 +364,7 @@ def calculate_charge_weights(self, charge_index: int) -> np.ndarray: ) return self._charge_weights - def add_counts(self, location: Optional[int]) -> np.ndarray: + def add_counts(self, location: Optional[int] = None) -> np.ndarray: """Add the counts of the sensor to the summarization features.""" self._add_column(np.log10(self._counts), location) return self.clustered_x From 81416b8d28159123eea7bb40372484cbf1ceab7f Mon Sep 17 00:00:00 2001 From: 
"askerosted@gmail.com" Date: Thu, 24 Oct 2024 17:01:42 +0900 Subject: [PATCH 04/18] Update PercentileCluster --- src/graphnet/models/graphs/nodes/nodes.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/graphnet/models/graphs/nodes/nodes.py b/src/graphnet/models/graphs/nodes/nodes.py index 4e094e6be..139d851a0 100644 --- a/src/graphnet/models/graphs/nodes/nodes.py +++ b/src/graphnet/models/graphs/nodes/nodes.py @@ -9,7 +9,7 @@ from graphnet.utilities.decorators import final from graphnet.models import Model from graphnet.models.graphs.utils import ( - cluster_summarize_with_percentiles, + cluster_and_pad, identify_indices, lex_sort, ice_transparency, @@ -198,13 +198,14 @@ def _construct_nodes(self, x: torch.Tensor) -> Data: x = x.numpy() # Construct clusters with percentile-summarized features if hasattr(self, "_summarization_indices"): - array = cluster_summarize_with_percentiles( - x=x, + cluster_class = cluster_and_pad( + x=x, cluster_columns=self._cluster_indices + ) + array = cluster_class.add_percentile_summary( summarization_indices=self._summarization_indices, - cluster_indices=self._cluster_indices, percentiles=self._percentiles, - add_counts=self._add_counts, ) + array = cluster_class.add_counts() else: self.error( f"""{self.__class__.__name__} was not instatiated with From 64b728c6737377ef63a88b8dfd0d5adb28dc8c0e Mon Sep 17 00:00:00 2001 From: "askerosted@gmail.com" Date: Wed, 23 Oct 2024 18:21:55 +0900 Subject: [PATCH 05/18] cluster and pad utility --- src/graphnet/models/graphs/utils.py | 236 +++++++++++++++++++++++++++- 1 file changed, 235 insertions(+), 1 deletion(-) diff --git a/src/graphnet/models/graphs/utils.py b/src/graphnet/models/graphs/utils.py index 77669eaeb..d068288ff 100644 --- a/src/graphnet/models/graphs/utils.py +++ b/src/graphnet/models/graphs/utils.py @@ -1,6 +1,6 @@ """Utility functions for construction of graphs.""" -from typing import List, Tuple, Optional +from typing import List, Tuple, Optional, Union import os import numpy as np import pandas as pd @@ -113,6 +113,7 @@ def identify_indices( return cluster_indices, summarization_indices, features_for_summarization +# TODO Remove this function as it is superseded by the class cluster_and_pad wich has the same functionality def cluster_summarize_with_percentiles( x: np.ndarray, summarization_indices: List[int], @@ -149,6 +150,9 @@ def cluster_summarize_with_percentiles( Returns: Percentile-summarized array """ + print( + "This function is deprecated and will be removed, use the class cluster_and_pad with add_percentile_summary instead for the same functionality" + ) pct_dict = {} for feature_idx in summarization_indices: summarized_array, column_offset, counts = gather_cluster_sequence( @@ -172,6 +176,236 @@ def cluster_summarize_with_percentiles( return array +class cluster_and_pad: + """cluster and pad the data for further summarization.""" + + def __init__(self, x: np.ndarray, cluster_columns: List[int]) -> None: + """Initialize the class with the data and cluster columns. + + Args: + x: Array to be clustered + cluster_columns: List of column indices on which the clusters + are constructed. 
+ Returns: None + Adds: + clustered_x: Added to the class + _counts: Added to the class + _padded_x: Added to the class + """ + x = lex_sort(x=x, cluster_columns=cluster_columns) + + unique_sensors, self._counts = np.unique( + x[:, cluster_columns], axis=0, return_counts=True + ) + + contingency_table = np.concatenate( + [unique_sensors, self._counts.reshape(-1, 1)], axis=1 + ) + + contingency_table = lex_sort( + x=contingency_table, cluster_columns=cluster_columns + ) + + self.clustered_x = contingency_table[:, 0 : unique_sensors.shape[1]] + self._counts = ( + contingency_table[:, self.clustered_x.shape[1] :] + .flatten() + .astype(int) + ) + + self._padded_x = np.empty( + (len(self._counts), max(self._counts), x.shape[1]) + ) + self._padded_x.fill(np.nan) + + for i in range(len(self._counts)): + self._padded_x[i, : self._counts[i]] = x[: self._counts[i]] + x = x[self._counts[i] :] + + def _add_column( + self, column: np.ndarray, location: Optional[int] = None + ) -> None: + """Add a column to the clustered tensor. + + Args: + column: Column to be added to the tensor + location: Location to insert the column in the clustered tensor + Returns: + clustered_x: The clustered tensor with the column added + """ + if location is None: + self.clustered_x = np.column_stack([self.clustered_x, column]) + else: + self.clustered_x = np.insert( + self.clustered_x, location, column, axis=1 + ) + + def add_charge_threshold_summary( + self, + summarization_indices: List[int], + percentiles: List[int], + charge_index: int, + location: Optional[int] = None, + ) -> np.ndarray: + """Summarize features through percentiles on charge of sensor. + + Args: + summarization_indices: List of column indices that defines features + that will be summarized with percentiles. + percentiles: percentiles used to summarize `x`. E.g. [10,50,90]. 
+ charge_index: index of the charge column in the padded tensor + location: Location to insert the summarization indices in the clustered tensor defaults to adding at the end + Returns: + clustered_x: The clustered tensor with the summarization indices added + Adds: + _charge_sum: Added to the class + _charge_weights: Added to the class + Altered: + _padded_x: Charge is altered to be the cumulative sum + of the charge divided by the total charge + clustered_x: The summarization indices are added at the end of the tensor + """ + # convert the charge to the cumulative sum of the charge divided by the total charge + self._charge_weights = self._padded_x[:, :, charge_index] + + self._padded_x[:, :, charge_index] = self._padded_x[ + :, :, charge_index + ].cumsum(axis=1) + + # add the charge sum to the class if it does not already exist + if not hasattr(self, "_charge_sum"): + self._charge_sum = np.nanmax( + self._padded_x[:, :, charge_index], axis=1 + ) + + self._charge_weights = ( + self._charge_weights / self._charge_sum[:, np.newaxis] + ) + + self._padded_x[:, :, charge_index] = ( + self._padded_x[:, :, charge_index] + / self._charge_sum[:, np.newaxis] + ) + + # Summarize the charge at different percentiles + selections = np.argmax( + self._padded_x[:, :, charge_index][:, :, np.newaxis] + >= (np.array(percentiles) / 100), + axis=1, + ) + + selections += (np.arange(len(self._counts)) * self._padded_x.shape[1])[ + :, np.newaxis + ] + + selections = self._padded_x[:, :, summarization_indices].reshape( + -1, len(summarization_indices) + )[selections] + selections = selections.transpose(0, 2, 1).reshape( + len(self.clustered_x), -1 + ) + self._add_column(selections, location) + return self.clustered_x + + def add_percentile_summary( + self, + summarization_indices: List[int], + percentiles: List[int], + method: str = "linear", + location: Optional[int] = None, + ) -> np.ndarray: + """Summarize the features of the sensors using percentiles. + + Args: + summarization_indices: List of column indices that defines features + that will be summarized with percentiles. + percentiles: percentiles used to summarize `x`. E.g. [10,50,90]. + method: Method to summarize the features. E.g. 
"linear" + location: Location to insert the summarization indices in the clustered tensor defaults to adding at the end + Returns: + None + Adds: + None + Altered: + clustered_x: The summarization indices are added at the end of the tensor + """ + percentiles_x = np.nanpercentile( + self._padded_x[:, :, summarization_indices], + percentiles, + axis=1, + method=method, + ) + + percentiles_x = percentiles_x.transpose(1, 2, 0).reshape( + len(self.clustered_x), -1 + ) + self._add_column(percentiles_x, location) + return self.clustered_x + + def add_counts(self, location: int) -> np.ndarray: + """Add the counts of the sensor to the summarization features.""" + self._add_column(np.log10(self._counts), location) + return self.clustered_x + + def calculate_charge_sum(self, charge_index: int) -> np.ndarray: + """Calculate the sum of the charge.""" + assert not hasattr( + self, "_charge_sum" + ), "Charge sum has already been calculated, re-calculation is not allowed" + self._charge_sum = self._padded_x[:, :, charge_index].sum(axis=1) + return self._charge_sum + + def calculate_charge_weights(self, charge_index: int) -> np.ndarray: + """Calculate the weights of the charge.""" + assert not hasattr( + self, "_charge_weights" + ), "Charge weights have already been calculated, re-calculation is not allowed" + assert hasattr( + self, "_charge_sum" + ), "Charge sum has not been calculated, please run calculate_charge_sum" + self._charge_weights = ( + self._padded_x[:, :, charge_index] + / self._charge_sum[:, np.newaxis] + ) + return self._charge_weights + + def add_sum_charge(self, location: int) -> np.ndarray: + """Add the sum of the charge to the summarization features.""" + assert hasattr( + self, "_charge_sum" + ), "Charge sum has not been calculated, please run calculate_charge_sum" + self._add_column(self._charge_sum, location) + return self.clustered_x + + def add_std( + self, + column: int, + location: Optional[int] = None, + weights: Union[np.ndarray, int] = 1, + ) -> np.ndarray: + """Add the standard deviation of the column. 
+ + Args: + column: Index of the column in the padded tensor to calculate the standard deviation + location: Location to insert the standard deviation in the clustered tensor defaults to adding at the end + weights: Optional weights to be applied to the standard deviation + """ + self._add_column( + np.nanstd(self._padded_x[:, :, column] * weights, axis=1), location + ) + return self.clustered_x + + def add_mean( + self, column: int, location: int, weights: Union[np.ndarray, int] = 1 + ) -> np.ndarray: + """Add the mean of the column.""" + self._add_column( + np.nanmean(self._padded_x[:, :, column] * weights, axis=1), + location, + ) + return self.clustered_x + + def ice_transparency( z_offset: Optional[float] = None, z_scaling: Optional[float] = None ) -> Tuple[interp1d, interp1d]: From b8654fa057bda1c8ac9b96f2fa080637e7021349 Mon Sep 17 00:00:00 2001 From: "askerosted@gmail.com" Date: Thu, 24 Oct 2024 16:59:02 +0900 Subject: [PATCH 06/18] Location default None --- src/graphnet/models/graphs/utils.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/src/graphnet/models/graphs/utils.py b/src/graphnet/models/graphs/utils.py index d068288ff..f65bc562d 100644 --- a/src/graphnet/models/graphs/utils.py +++ b/src/graphnet/models/graphs/utils.py @@ -342,11 +342,6 @@ def add_percentile_summary( self._add_column(percentiles_x, location) return self.clustered_x - def add_counts(self, location: int) -> np.ndarray: - """Add the counts of the sensor to the summarization features.""" - self._add_column(np.log10(self._counts), location) - return self.clustered_x - def calculate_charge_sum(self, charge_index: int) -> np.ndarray: """Calculate the sum of the charge.""" assert not hasattr( @@ -369,7 +364,12 @@ def calculate_charge_weights(self, charge_index: int) -> np.ndarray: ) return self._charge_weights - def add_sum_charge(self, location: int) -> np.ndarray: + def add_counts(self, location: Optional[int]) -> np.ndarray: + """Add the counts of the sensor to the summarization features.""" + self._add_column(np.log10(self._counts), location) + return self.clustered_x + + def add_sum_charge(self, location: Optional[int] = None) -> np.ndarray: """Add the sum of the charge to the summarization features.""" assert hasattr( self, "_charge_sum" @@ -396,7 +396,10 @@ def add_std( return self.clustered_x def add_mean( - self, column: int, location: int, weights: Union[np.ndarray, int] = 1 + self, + column: int, + location: Optional[int] = None, + weights: Union[np.ndarray, int] = 1, ) -> np.ndarray: """Add the mean of the column.""" self._add_column( From d964efcdf5fcebdfef71f92bb07d13f47b85fe2a Mon Sep 17 00:00:00 2001 From: "askerosted@gmail.com" Date: Thu, 24 Oct 2024 17:01:06 +0900 Subject: [PATCH 07/18] more default none --- src/graphnet/models/graphs/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/graphnet/models/graphs/utils.py b/src/graphnet/models/graphs/utils.py index f65bc562d..55c8fbd33 100644 --- a/src/graphnet/models/graphs/utils.py +++ b/src/graphnet/models/graphs/utils.py @@ -364,7 +364,7 @@ def calculate_charge_weights(self, charge_index: int) -> np.ndarray: ) return self._charge_weights - def add_counts(self, location: Optional[int]) -> np.ndarray: + def add_counts(self, location: Optional[int] = None) -> np.ndarray: """Add the counts of the sensor to the summarization features.""" self._add_column(np.log10(self._counts), location) return self.clustered_x From e6357b50a472d491b4652357676f1622f5068c85 Mon Sep 17 00:00:00 2001 From: 
"askerosted@gmail.com" Date: Thu, 24 Oct 2024 17:01:42 +0900 Subject: [PATCH 08/18] Update PercentileCluster --- src/graphnet/models/graphs/nodes/nodes.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/graphnet/models/graphs/nodes/nodes.py b/src/graphnet/models/graphs/nodes/nodes.py index 558ec96f4..59de864fd 100644 --- a/src/graphnet/models/graphs/nodes/nodes.py +++ b/src/graphnet/models/graphs/nodes/nodes.py @@ -9,7 +9,7 @@ from graphnet.utilities.decorators import final from graphnet.models import Model from graphnet.models.graphs.utils import ( - cluster_summarize_with_percentiles, + cluster_and_pad, identify_indices, lex_sort, ice_transparency, @@ -198,13 +198,14 @@ def _construct_nodes(self, x: torch.Tensor) -> Data: x = x.numpy() # Construct clusters with percentile-summarized features if hasattr(self, "_summarization_indices"): - array = cluster_summarize_with_percentiles( - x=x, + cluster_class = cluster_and_pad( + x=x, cluster_columns=self._cluster_indices + ) + array = cluster_class.add_percentile_summary( summarization_indices=self._summarization_indices, - cluster_indices=self._cluster_indices, percentiles=self._percentiles, - add_counts=self._add_counts, ) + array = cluster_class.add_counts() else: self.error( f"""{self.__class__.__name__} was not instatiated with From 7fcc7fe4ce793e8695ffbfc667ed92858a404930 Mon Sep 17 00:00:00 2001 From: "askerosted@gmail.com" Date: Fri, 15 Nov 2024 14:48:34 +0900 Subject: [PATCH 09/18] align with prehooks --- src/graphnet/models/graphs/utils.py | 49 +++++++++++++++++++---------- 1 file changed, 32 insertions(+), 17 deletions(-) diff --git a/src/graphnet/models/graphs/utils.py b/src/graphnet/models/graphs/utils.py index 55c8fbd33..d2ed6cce4 100644 --- a/src/graphnet/models/graphs/utils.py +++ b/src/graphnet/models/graphs/utils.py @@ -113,7 +113,8 @@ def identify_indices( return cluster_indices, summarization_indices, features_for_summarization -# TODO Remove this function as it is superseded by the class cluster_and_pad wich has the same functionality +# TODO Remove this function as it is superseded by +# cluster_and_pad wich has the same functionality def cluster_summarize_with_percentiles( x: np.ndarray, summarization_indices: List[int], @@ -151,7 +152,9 @@ def cluster_summarize_with_percentiles( Percentile-summarized array """ print( - "This function is deprecated and will be removed, use the class cluster_and_pad with add_percentile_summary instead for the same functionality" + "This function is deprecated and will be removed,", + "use the class cluster_and_pad with add_percentile_summary", + "instead for the same functionality", ) pct_dict = {} for feature_idx in summarization_indices: @@ -177,7 +180,7 @@ def cluster_summarize_with_percentiles( class cluster_and_pad: - """cluster and pad the data for further summarization.""" + """Cluster and pad the data for further summarization.""" def __init__(self, x: np.ndarray, cluster_columns: List[int]) -> None: """Initialize the class with the data and cluster columns. @@ -251,21 +254,25 @@ def add_charge_threshold_summary( Args: summarization_indices: List of column indices that defines features - that will be summarized with percentiles. + that will be summarized with percentiles. percentiles: percentiles used to summarize `x`. E.g. [10,50,90]. 
charge_index: index of the charge column in the padded tensor - location: Location to insert the summarization indices in the clustered tensor defaults to adding at the end + location: Location to insert the summarization indices in the + clustered tensor defaults to adding at the end Returns: - clustered_x: The clustered tensor with the summarization indices added + clustered_x: The clustered tensor with the summarization indices + added Adds: _charge_sum: Added to the class _charge_weights: Added to the class Altered: _padded_x: Charge is altered to be the cumulative sum - of the charge divided by the total charge - clustered_x: The summarization indices are added at the end of the tensor + of the charge divided by the total charge + clustered_x: The summarization indices are added at the end + of the tensor """ - # convert the charge to the cumulative sum of the charge divided by the total charge + # convert the charge to the cumulative sum of the charge divided + # by the total charge self._charge_weights = self._padded_x[:, :, charge_index] self._padded_x[:, :, charge_index] = self._padded_x[ @@ -321,13 +328,15 @@ def add_percentile_summary( that will be summarized with percentiles. percentiles: percentiles used to summarize `x`. E.g. [10,50,90]. method: Method to summarize the features. E.g. "linear" - location: Location to insert the summarization indices in the clustered tensor defaults to adding at the end + location: Location to insert the summarization indices in the + clustered tensor defaults to adding at the end Returns: None Adds: None Altered: - clustered_x: The summarization indices are added at the end of the tensor + clustered_x: The summarization indices are added at the end of + the tensor """ percentiles_x = np.nanpercentile( self._padded_x[:, :, summarization_indices], @@ -346,7 +355,8 @@ def calculate_charge_sum(self, charge_index: int) -> np.ndarray: """Calculate the sum of the charge.""" assert not hasattr( self, "_charge_sum" - ), "Charge sum has already been calculated, re-calculation is not allowed" + ), "Charge sum has already been calculated, \ + re-calculation is not allowed" self._charge_sum = self._padded_x[:, :, charge_index].sum(axis=1) return self._charge_sum @@ -354,10 +364,12 @@ def calculate_charge_weights(self, charge_index: int) -> np.ndarray: """Calculate the weights of the charge.""" assert not hasattr( self, "_charge_weights" - ), "Charge weights have already been calculated, re-calculation is not allowed" + ), "Charge weights have already been calculated, \ + re-calculation is not allowed" assert hasattr( self, "_charge_sum" - ), "Charge sum has not been calculated, please run calculate_charge_sum" + ), "Charge sum has not been calculated, \ + please run calculate_charge_sum" self._charge_weights = ( self._padded_x[:, :, charge_index] / self._charge_sum[:, np.newaxis] @@ -373,7 +385,8 @@ def add_sum_charge(self, location: Optional[int] = None) -> np.ndarray: """Add the sum of the charge to the summarization features.""" assert hasattr( self, "_charge_sum" - ), "Charge sum has not been calculated, please run calculate_charge_sum" + ), "Charge sum has not been calculated, \ + please run calculate_charge_sum" self._add_column(self._charge_sum, location) return self.clustered_x @@ -386,8 +399,10 @@ def add_std( """Add the standard deviation of the column. 
Args: - column: Index of the column in the padded tensor to calculate the standard deviation - location: Location to insert the standard deviation in the clustered tensor defaults to adding at the end + column: Index of the column in the padded tensor to + calculate the standard deviation + location: Location to insert the standard deviation in the + clustered tensor defaults to adding at the end weights: Optional weights to be applied to the standard deviation """ self._add_column( From 7f7000fb113c9f43412c528d398f8bca7f89991b Mon Sep 17 00:00:00 2001 From: "askerosted@gmail.com" Date: Fri, 6 Dec 2024 14:29:44 +0900 Subject: [PATCH 10/18] fix add_counts optional --- src/graphnet/models/graphs/nodes/nodes.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/src/graphnet/models/graphs/nodes/nodes.py b/src/graphnet/models/graphs/nodes/nodes.py index 59de864fd..36afb4e1d 100644 --- a/src/graphnet/models/graphs/nodes/nodes.py +++ b/src/graphnet/models/graphs/nodes/nodes.py @@ -169,9 +169,7 @@ def _define_output_feature_names( cluster_idx, summ_idx, new_feature_names, - ) = self._get_indices_and_feature_names( - input_feature_names, self._add_counts - ) + ) = self._get_indices_and_feature_names(input_feature_names) self._cluster_indices = cluster_idx self._summarization_indices = summ_idx return new_feature_names @@ -179,7 +177,6 @@ def _define_output_feature_names( def _get_indices_and_feature_names( self, feature_names: List[str], - add_counts: bool, ) -> Tuple[List[int], List[int], List[str]]: cluster_idx, summ_idx, summ_names = identify_indices( feature_names, self._cluster_on @@ -188,7 +185,7 @@ def _get_indices_and_feature_names( for feature in summ_names: for pct in self._percentiles: new_feature_names.append(f"{feature}_pct{pct}") - if add_counts: + if self._add_counts: # add "counts" as the last feature new_feature_names.append("counts") return cluster_idx, summ_idx, new_feature_names @@ -205,7 +202,8 @@ def _construct_nodes(self, x: torch.Tensor) -> Data: summarization_indices=self._summarization_indices, percentiles=self._percentiles, ) - array = cluster_class.add_counts() + if self._add_counts: + array = cluster_class.add_counts() else: self.error( f"""{self.__class__.__name__} was not instatiated with From 505a82eebcebd3adcd24a12c213c742539ff98e9 Mon Sep 17 00:00:00 2001 From: "askerosted@gmail.com" Date: Fri, 6 Dec 2024 15:10:34 +0900 Subject: [PATCH 11/18] update docstrings --- src/graphnet/models/graphs/utils.py | 29 ++++++++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/src/graphnet/models/graphs/utils.py b/src/graphnet/models/graphs/utils.py index d2ed6cce4..85ea94d9d 100644 --- a/src/graphnet/models/graphs/utils.py +++ b/src/graphnet/models/graphs/utils.py @@ -180,7 +180,30 @@ def cluster_summarize_with_percentiles( class cluster_and_pad: - """Cluster and pad the data for further summarization.""" + """Cluster and pad the data for further summarization. + + Clusters the inptut data according to the specified columns + and computes aggregate statistics on the clusters. + The clustering will happen only ones creating a cluster matrix + which will hold all the aggregated statistics and a padded matrix which + will hold the padded data for quick calculation of aggregate statistics. 
+ + Example: + clustered_x = cluster_and_pad(x = single_event_as_array, + cluster_columns = [0,1,2]) + # Creates a cluster matrix and a padded matrix, + # the cluster matrix will contain the unique values of the cluster columns, + # no additional aggregate statistics are added yet. + + clustered_x_with_percentiles = cluster_class.add_percentile_summary( + summarization_indices = [3,4,5], percentiles = [10,50,90]) + # Adds the 10th, 50th and 90th percentile of columns 3,4 + # and 5 in the input data to the cluster matrix. + + clustered_x_with_percentiles_and_std = cluster_class.add_std(column = 4) + # Adds the standard deviation of column 4 in the input data + # to the cluster matrix. + """ def __init__(self, x: np.ndarray, cluster_columns: List[int]) -> None: """Initialize the class with the data and cluster columns. @@ -269,7 +292,7 @@ def add_charge_threshold_summary( _padded_x: Charge is altered to be the cumulative sum of the charge divided by the total charge clustered_x: The summarization indices are added at the end - of the tensor + of the tensor or inserted at the specified location. """ # convert the charge to the cumulative sum of the charge divided # by the total charge @@ -336,7 +359,7 @@ def add_percentile_summary( None Altered: clustered_x: The summarization indices are added at the end of - the tensor + the tensor or inserted at the specified location """ percentiles_x = np.nanpercentile( self._padded_x[:, :, summarization_indices], From 4a4083b5b4092d26ae811f63a4578ff7a1554552 Mon Sep 17 00:00:00 2001 From: "askerosted@gmail.com" Date: Fri, 6 Dec 2024 15:17:44 +0900 Subject: [PATCH 12/18] move/use internal functions + output x --- src/graphnet/models/graphs/utils.py | 68 ++++++++++++----------------- 1 file changed, 28 insertions(+), 40 deletions(-) diff --git a/src/graphnet/models/graphs/utils.py b/src/graphnet/models/graphs/utils.py index 85ea94d9d..9587010df 100644 --- a/src/graphnet/models/graphs/utils.py +++ b/src/graphnet/models/graphs/utils.py @@ -247,6 +247,7 @@ def __init__(self, x: np.ndarray, cluster_columns: List[int]) -> None: for i in range(len(self._counts)): self._padded_x[i, : self._counts[i]] = x[: self._counts[i]] x = x[self._counts[i] :] + return self.clustered_x def _add_column( self, column: np.ndarray, location: Optional[int] = None @@ -266,6 +267,31 @@ def _add_column( self.clustered_x, location, column, axis=1 ) + def _calculate_charge_sum(self, charge_index: int) -> np.ndarray: + """Calculate the sum of the charge.""" + assert not hasattr( + self, "_charge_sum" + ), "Charge sum has already been calculated, \ + re-calculation is not allowed" + self._charge_sum = self._padded_x[:, :, charge_index].sum(axis=1) + return self._charge_sum + + def _calculate_charge_weights(self, charge_index: int) -> np.ndarray: + """Calculate the weights of the charge.""" + assert not hasattr( + self, "_charge_weights" + ), "Charge weights have already been calculated, \ + re-calculation is not allowed" + assert hasattr( + self, "_charge_sum" + ), "Charge sum has not been calculated, \ + please run calculate_charge_sum" + self._charge_weights = ( + self._padded_x[:, :, charge_index] + / self._charge_sum[:, np.newaxis] + ) + return self._charge_weights + def add_charge_threshold_summary( self, summarization_indices: List[int], @@ -296,21 +322,8 @@ def add_charge_threshold_summary( """ # convert the charge to the cumulative sum of the charge divided # by the total charge - self._charge_weights = self._padded_x[:, :, charge_index] - - self._padded_x[:, :, charge_index] = 
self._padded_x[ - :, :, charge_index - ].cumsum(axis=1) - - # add the charge sum to the class if it does not already exist - if not hasattr(self, "_charge_sum"): - self._charge_sum = np.nanmax( - self._padded_x[:, :, charge_index], axis=1 - ) - - self._charge_weights = ( - self._charge_weights / self._charge_sum[:, np.newaxis] - ) + self._calculate_charge_sum(charge_index) + self._calculate_charge_weights(charge_index) self._padded_x[:, :, charge_index] = ( self._padded_x[:, :, charge_index] @@ -374,31 +387,6 @@ def add_percentile_summary( self._add_column(percentiles_x, location) return self.clustered_x - def calculate_charge_sum(self, charge_index: int) -> np.ndarray: - """Calculate the sum of the charge.""" - assert not hasattr( - self, "_charge_sum" - ), "Charge sum has already been calculated, \ - re-calculation is not allowed" - self._charge_sum = self._padded_x[:, :, charge_index].sum(axis=1) - return self._charge_sum - - def calculate_charge_weights(self, charge_index: int) -> np.ndarray: - """Calculate the weights of the charge.""" - assert not hasattr( - self, "_charge_weights" - ), "Charge weights have already been calculated, \ - re-calculation is not allowed" - assert hasattr( - self, "_charge_sum" - ), "Charge sum has not been calculated, \ - please run calculate_charge_sum" - self._charge_weights = ( - self._padded_x[:, :, charge_index] - / self._charge_sum[:, np.newaxis] - ) - return self._charge_weights - def add_counts(self, location: Optional[int] = None) -> np.ndarray: """Add the counts of the sensor to the summarization features.""" self._add_column(np.log10(self._counts), location) From 87f41c6bc3254b9b1894e44b3ffe8792511fc2e2 Mon Sep 17 00:00:00 2001 From: "askerosted@gmail.com" Date: Fri, 6 Dec 2024 15:25:15 +0900 Subject: [PATCH 13/18] remove warning --- src/graphnet/models/graphs/utils.py | 66 ----------------------------- 1 file changed, 66 deletions(-) diff --git a/src/graphnet/models/graphs/utils.py b/src/graphnet/models/graphs/utils.py index 9587010df..0ac8aeac2 100644 --- a/src/graphnet/models/graphs/utils.py +++ b/src/graphnet/models/graphs/utils.py @@ -113,72 +113,6 @@ def identify_indices( return cluster_indices, summarization_indices, features_for_summarization -# TODO Remove this function as it is superseded by -# cluster_and_pad wich has the same functionality -def cluster_summarize_with_percentiles( - x: np.ndarray, - summarization_indices: List[int], - cluster_indices: List[int], - percentiles: List[int], - add_counts: bool, -) -> np.ndarray: - """Turn `x` into clusters with percentile summary. - - From variables specified by column indices `cluster_indices`, `x` is turned - into clusters. Information in columns of `x` specified by indices - `summarization_indices` with each cluster is summarized using percentiles. - It is assumed `x` represents a single event. - - **Example use-case**: - Suppose `x` contains raw pulses from a neutrino event where some DOMs have - multiple measurements of Cherenkov radiation. If `cluster_indices` is set - to the columns corresponding to the xyz-position of the DOMs, and the - features specified in `summarization_indices` correspond to time, charge, - then each row in the returned array will correspond to a DOM, - and the time and charge for each DOM will be summarized by percentiles. 
- Returned output array has dimensions - `[n_clusters, - len(percentiles)*len(summarization_indices) + len(cluster_indices)]` - - Args: - x: Array to be clustered - summarization_indices: List of column indices that defines features - that will be summarized with percentiles. - cluster_indices: List of column indices on which the clusters - are constructed. - percentiles: percentiles used to summarize `x`. E.g. [10,50,90]. - - Returns: - Percentile-summarized array - """ - print( - "This function is deprecated and will be removed,", - "use the class cluster_and_pad with add_percentile_summary", - "instead for the same functionality", - ) - pct_dict = {} - for feature_idx in summarization_indices: - summarized_array, column_offset, counts = gather_cluster_sequence( - x, feature_idx, cluster_indices - ) - pct_dict[feature_idx] = np.nanpercentile( - summarized_array[:, column_offset:], percentiles, axis=1 - ).T - - for i, key in enumerate(pct_dict.keys()): - if i == 0: - array = summarized_array[:, 0:column_offset] - - array = np.concatenate([array, pct_dict[key]], axis=1) - - if add_counts: - array = np.concatenate( - [array, np.log10(counts).reshape(-1, 1)], axis=1 - ) - - return array - - class cluster_and_pad: """Cluster and pad the data for further summarization. From 51da4b0610bd64c8aa2a66ee86eef176168c1b56 Mon Sep 17 00:00:00 2001 From: "askerosted@gmail.com" Date: Fri, 6 Dec 2024 15:45:03 +0900 Subject: [PATCH 14/18] re-add old function --- src/graphnet/models/graphs/utils.py | 61 +++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) diff --git a/src/graphnet/models/graphs/utils.py b/src/graphnet/models/graphs/utils.py index 0ac8aeac2..0fcd6c136 100644 --- a/src/graphnet/models/graphs/utils.py +++ b/src/graphnet/models/graphs/utils.py @@ -113,6 +113,67 @@ def identify_indices( return cluster_indices, summarization_indices, features_for_summarization +# TODO Remove this function as it is superseded by +# cluster_and_pad wich has the same functionality +def cluster_summarize_with_percentiles( + x: np.ndarray, + summarization_indices: List[int], + cluster_indices: List[int], + percentiles: List[int], + add_counts: bool, +) -> np.ndarray: + """Turn `x` into clusters with percentile summary. + + From variables specified by column indices `cluster_indices`, `x` is turned + into clusters. Information in columns of `x` specified by indices + `summarization_indices` with each cluster is summarized using percentiles. + It is assumed `x` represents a single event. + + **Example use-case**: + Suppose `x` contains raw pulses from a neutrino event where some DOMs have + multiple measurements of Cherenkov radiation. If `cluster_indices` is set + to the columns corresponding to the xyz-position of the DOMs, and the + features specified in `summarization_indices` correspond to time, charge, + then each row in the returned array will correspond to a DOM, + and the time and charge for each DOM will be summarized by percentiles. + Returned output array has dimensions + `[n_clusters, + len(percentiles)*len(summarization_indices) + len(cluster_indices)]` + + Args: + x: Array to be clustered + summarization_indices: List of column indices that defines features + that will be summarized with percentiles. + cluster_indices: List of column indices on which the clusters + are constructed. + percentiles: percentiles used to summarize `x`. E.g. [10,50,90]. 
+ + Returns: + Percentile-summarized array + """ + pct_dict = {} + for feature_idx in summarization_indices: + summarized_array, column_offset, counts = gather_cluster_sequence( + x, feature_idx, cluster_indices + ) + pct_dict[feature_idx] = np.nanpercentile( + summarized_array[:, column_offset:], percentiles, axis=1 + ).T + + for i, key in enumerate(pct_dict.keys()): + if i == 0: + array = summarized_array[:, 0:column_offset] + + array = np.concatenate([array, pct_dict[key]], axis=1) + + if add_counts: + array = np.concatenate( + [array, np.log10(counts).reshape(-1, 1)], axis=1 + ) + + return array + + class cluster_and_pad: """Cluster and pad the data for further summarization. From 3e21f7f6d251bd938505636efada00fa795a268f Mon Sep 17 00:00:00 2001 From: "askerosted@gmail.com" Date: Fri, 6 Dec 2024 15:51:10 +0900 Subject: [PATCH 15/18] remove returns --- src/graphnet/models/graphs/nodes/nodes.py | 5 +++-- src/graphnet/models/graphs/utils.py | 19 ++++++------------- 2 files changed, 9 insertions(+), 15 deletions(-) diff --git a/src/graphnet/models/graphs/nodes/nodes.py b/src/graphnet/models/graphs/nodes/nodes.py index 36afb4e1d..11c03ae84 100644 --- a/src/graphnet/models/graphs/nodes/nodes.py +++ b/src/graphnet/models/graphs/nodes/nodes.py @@ -198,12 +198,13 @@ def _construct_nodes(self, x: torch.Tensor) -> Data: cluster_class = cluster_and_pad( x=x, cluster_columns=self._cluster_indices ) - array = cluster_class.add_percentile_summary( + cluster_class.add_percentile_summary( summarization_indices=self._summarization_indices, percentiles=self._percentiles, ) if self._add_counts: - array = cluster_class.add_counts() + cluster_class.add_counts() + array = cluster_class.clustered_x else: self.error( f"""{self.__class__.__name__} was not instatiated with diff --git a/src/graphnet/models/graphs/utils.py b/src/graphnet/models/graphs/utils.py index 0fcd6c136..4093dc288 100644 --- a/src/graphnet/models/graphs/utils.py +++ b/src/graphnet/models/graphs/utils.py @@ -184,20 +184,22 @@ class cluster_and_pad: will hold the padded data for quick calculation of aggregate statistics. Example: - clustered_x = cluster_and_pad(x = single_event_as_array, + cluster_and_pad(x = single_event_as_array, cluster_columns = [0,1,2]) # Creates a cluster matrix and a padded matrix, # the cluster matrix will contain the unique values of the cluster columns, # no additional aggregate statistics are added yet. - clustered_x_with_percentiles = cluster_class.add_percentile_summary( - summarization_indices = [3,4,5], percentiles = [10,50,90]) + cluster_class.add_percentile_summary(summarization_indices = [3,4,5], + percentiles = [10,50,90]) # Adds the 10th, 50th and 90th percentile of columns 3,4 # and 5 in the input data to the cluster matrix. - clustered_x_with_percentiles_and_std = cluster_class.add_std(column = 4) + cluster_class.add_std(column = 4) # Adds the standard deviation of column 4 in the input data # to the cluster matrix. + x = cluster_class.clustered_x + # Gets the clustered matrix with all the aggregate statistics. 
""" def __init__(self, x: np.ndarray, cluster_columns: List[int]) -> None: @@ -242,7 +244,6 @@ def __init__(self, x: np.ndarray, cluster_columns: List[int]) -> None: for i in range(len(self._counts)): self._padded_x[i, : self._counts[i]] = x[: self._counts[i]] x = x[self._counts[i] :] - return self.clustered_x def _add_column( self, column: np.ndarray, location: Optional[int] = None @@ -269,7 +270,6 @@ def _calculate_charge_sum(self, charge_index: int) -> np.ndarray: ), "Charge sum has already been calculated, \ re-calculation is not allowed" self._charge_sum = self._padded_x[:, :, charge_index].sum(axis=1) - return self._charge_sum def _calculate_charge_weights(self, charge_index: int) -> np.ndarray: """Calculate the weights of the charge.""" @@ -285,7 +285,6 @@ def _calculate_charge_weights(self, charge_index: int) -> np.ndarray: self._padded_x[:, :, charge_index] / self._charge_sum[:, np.newaxis] ) - return self._charge_weights def add_charge_threshold_summary( self, @@ -343,7 +342,6 @@ def add_charge_threshold_summary( len(self.clustered_x), -1 ) self._add_column(selections, location) - return self.clustered_x def add_percentile_summary( self, @@ -380,12 +378,10 @@ def add_percentile_summary( len(self.clustered_x), -1 ) self._add_column(percentiles_x, location) - return self.clustered_x def add_counts(self, location: Optional[int] = None) -> np.ndarray: """Add the counts of the sensor to the summarization features.""" self._add_column(np.log10(self._counts), location) - return self.clustered_x def add_sum_charge(self, location: Optional[int] = None) -> np.ndarray: """Add the sum of the charge to the summarization features.""" @@ -394,7 +390,6 @@ def add_sum_charge(self, location: Optional[int] = None) -> np.ndarray: ), "Charge sum has not been calculated, \ please run calculate_charge_sum" self._add_column(self._charge_sum, location) - return self.clustered_x def add_std( self, @@ -414,7 +409,6 @@ def add_std( self._add_column( np.nanstd(self._padded_x[:, :, column] * weights, axis=1), location ) - return self.clustered_x def add_mean( self, @@ -427,7 +421,6 @@ def add_mean( np.nanmean(self._padded_x[:, :, column] * weights, axis=1), location, ) - return self.clustered_x def ice_transparency( From eaa12e584beb2ace28f9f3ac2c8cc29c2e5f98e0 Mon Sep 17 00:00:00 2001 From: "askerosted@gmail.com" Date: Fri, 6 Dec 2024 16:31:25 +0900 Subject: [PATCH 16/18] docstrings udpdates --- src/graphnet/models/graphs/utils.py | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/src/graphnet/models/graphs/utils.py b/src/graphnet/models/graphs/utils.py index 4093dc288..2d21dcd4f 100644 --- a/src/graphnet/models/graphs/utils.py +++ b/src/graphnet/models/graphs/utils.py @@ -209,7 +209,6 @@ def __init__(self, x: np.ndarray, cluster_columns: List[int]) -> None: x: Array to be clustered cluster_columns: List of column indices on which the clusters are constructed. - Returns: None Adds: clustered_x: Added to the class _counts: Added to the class @@ -252,9 +251,10 @@ def _add_column( Args: column: Column to be added to the tensor - location: Location to insert the column in the clustered tensor - Returns: - clustered_x: The clustered tensor with the column added + location: Location to insert the column in the clustered tensor. 
+ Altered: + clustered_x: The column is added at the end of the tenor or + inserted at the specified location """ if location is None: self.clustered_x = np.column_stack([self.clustered_x, column]) @@ -302,9 +302,6 @@ def add_charge_threshold_summary( charge_index: index of the charge column in the padded tensor location: Location to insert the summarization indices in the clustered tensor defaults to adding at the end - Returns: - clustered_x: The clustered tensor with the summarization indices - added Adds: _charge_sum: Added to the class _charge_weights: Added to the class @@ -359,10 +356,6 @@ def add_percentile_summary( method: Method to summarize the features. E.g. "linear" location: Location to insert the summarization indices in the clustered tensor defaults to adding at the end - Returns: - None - Adds: - None Altered: clustered_x: The summarization indices are added at the end of the tensor or inserted at the specified location From 165fedb5b83dae1878023b83cf9fa4d45863978b Mon Sep 17 00:00:00 2001 From: "askerosted@gmail.com" Date: Fri, 6 Dec 2024 17:37:51 +0900 Subject: [PATCH 17/18] automatic_name_generation --- src/graphnet/models/graphs/utils.py | 94 +++++++++++++++++++++++++---- 1 file changed, 82 insertions(+), 12 deletions(-) diff --git a/src/graphnet/models/graphs/utils.py b/src/graphnet/models/graphs/utils.py index 2d21dcd4f..9c9a76062 100644 --- a/src/graphnet/models/graphs/utils.py +++ b/src/graphnet/models/graphs/utils.py @@ -202,13 +202,20 @@ class cluster_and_pad: # Gets the clustered matrix with all the aggregate statistics. """ - def __init__(self, x: np.ndarray, cluster_columns: List[int]) -> None: + def __init__( + self, + x: np.ndarray, + cluster_columns: List[int], + input_names: Optional[List[str]] = None, + ) -> None: """Initialize the class with the data and cluster columns. Args: x: Array to be clustered cluster_columns: List of column indices on which the clusters are constructed. + input_names: Names of the columns in the input data for automatic + generation of names. Adds: clustered_x: Added to the class _counts: Added to the class @@ -244,6 +251,14 @@ def __init__(self, x: np.ndarray, cluster_columns: List[int]) -> None: self._padded_x[i, : self._counts[i]] = x[: self._counts[i]] x = x[self._counts[i] :] + self._input_names = input_names + if self._input_names is not None: + assert ( + len(self._input_names) == x.shape[1] + ), "The input names must have the same length as the input data" + + self._cluster_names = np.array(input_names)[cluster_columns] + def _add_column( self, column: np.ndarray, location: Optional[int] = None ) -> None: @@ -263,6 +278,25 @@ def _add_column( self.clustered_x, location, column, axis=1 ) + def _add_column_names( + self, names: List[str], location: Optional[int] = None + ) -> None: + """Add names to the columns of the clustered tensor. 
+ + Args: + names: Names to be added to the columns of the tensor + location: Location to insert the names in the clustered tensor + Altered: + _cluster_names: The names are added at the end of the tensor + or inserted at the specified location + """ + if location is None: + self._cluster_names = np.append(self._cluster_names, names) + else: + self._cluster_names = np.insert( + self._cluster_names, location, names + ) + def _calculate_charge_sum(self, charge_index: int) -> np.ndarray: """Calculate the sum of the charge.""" assert not hasattr( @@ -310,6 +344,8 @@ def add_charge_threshold_summary( of the charge divided by the total charge clustered_x: The summarization indices are added at the end of the tensor or inserted at the specified location. + _cluster_names: The names are added at the end of the tensor + or inserted at the specified location """ # convert the charge to the cumulative sum of the charge divided # by the total charge @@ -340,6 +376,15 @@ def add_charge_threshold_summary( ) self._add_column(selections, location) + # update the cluster names + if self._input_names is not None: + new_names = [ + self._input_names[i] + "_charge_threshold_" + str(p) + for i in summarization_indices + for p in percentiles + ] + self._add_column_names(new_names, location) + def add_percentile_summary( self, summarization_indices: List[int], @@ -359,6 +404,8 @@ def add_percentile_summary( Altered: clustered_x: The summarization indices are added at the end of the tensor or inserted at the specified location + _cluster_names: The names are added at the end of the tensor + or inserted at the specified location """ percentiles_x = np.nanpercentile( self._padded_x[:, :, summarization_indices], @@ -372,48 +419,71 @@ def add_percentile_summary( ) self._add_column(percentiles_x, location) + # update the cluster names + if self._input_names is not None: + new_names = [ + self._input_names[i] + "_percentile_" + str(p) + for i in summarization_indices + for p in percentiles + ] + self._add_column_names(new_names, location) + def add_counts(self, location: Optional[int] = None) -> np.ndarray: """Add the counts of the sensor to the summarization features.""" self._add_column(np.log10(self._counts), location) + new_name = ["counts"] + self._add_column_names(new_name, location) - def add_sum_charge(self, location: Optional[int] = None) -> np.ndarray: + def add_sum_charge( + self, charge_index: int, location: Optional[int] = None + ) -> np.ndarray: """Add the sum of the charge to the summarization features.""" - assert hasattr( - self, "_charge_sum" - ), "Charge sum has not been calculated, \ - please run calculate_charge_sum" + if not hasattr(self, "_charge_sum"): + self._calculate_charge_sum(charge_index) self._add_column(self._charge_sum, location) + # update the cluster names + if self._input_names is not None: + new_name = [self._input_names[charge_index] + "_sum"] + self._add_column_names(new_name, location) def add_std( self, - column: int, + columns: List[int], location: Optional[int] = None, weights: Union[np.ndarray, int] = 1, ) -> np.ndarray: """Add the standard deviation of the column. Args: - column: Index of the column in the padded tensor to - calculate the standard deviation + columns: Index of the columns from which to calculate the standard + deviation. 
location: Location to insert the standard deviation in the clustered tensor defaults to adding at the end weights: Optional weights to be applied to the standard deviation """ self._add_column( - np.nanstd(self._padded_x[:, :, column] * weights, axis=1), location + np.nanstd(self._padded_x[:, :, columns] * weights, axis=1), + location, ) + if self._input_names is not None: + new_names = [self._input_names[i] + "_std" for i in columns] + self._add_column_names(new_names, location) def add_mean( self, - column: int, + columns: List[int], location: Optional[int] = None, weights: Union[np.ndarray, int] = 1, ) -> np.ndarray: """Add the mean of the column.""" self._add_column( - np.nanmean(self._padded_x[:, :, column] * weights, axis=1), + np.nanmean(self._padded_x[:, :, columns] * weights, axis=1), location, ) + # update the cluster names + if self._input_names is not None: + new_names = [self._input_names[i] + "_mean" for i in columns] + self._add_column_names(new_names, location) def ice_transparency( From 75b3260f830feff65c40dbff425e126e35724257 Mon Sep 17 00:00:00 2001 From: "askerosted@gmail.com" Date: Fri, 6 Dec 2024 21:51:16 +0900 Subject: [PATCH 18/18] small_fix --- src/graphnet/models/graphs/utils.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/graphnet/models/graphs/utils.py b/src/graphnet/models/graphs/utils.py index 9c9a76062..7bb746508 100644 --- a/src/graphnet/models/graphs/utils.py +++ b/src/graphnet/models/graphs/utils.py @@ -431,8 +431,9 @@ def add_percentile_summary( def add_counts(self, location: Optional[int] = None) -> np.ndarray: """Add the counts of the sensor to the summarization features.""" self._add_column(np.log10(self._counts), location) - new_name = ["counts"] - self._add_column_names(new_name, location) + if self._input_names is not None: + new_name = ["counts"] + self._add_column_names(new_name, location) def add_sum_charge( self, charge_index: int, location: Optional[int] = None
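
For reference, a minimal usage sketch of the cluster_and_pad API in its end state after the final patch. The toy event array, the [dom_x, dom_y, dom_z, dom_time, charge] column layout and the feature names below are illustrative assumptions rather than part of the patches; the call pattern mirrors the updated PercentileClusters node in nodes.py.

    import numpy as np
    from graphnet.models.graphs.utils import cluster_and_pad

    # Toy single event with columns [dom_x, dom_y, dom_z, dom_time, charge].
    rng = np.random.default_rng(0)
    pulses = rng.normal(size=(12, 5))
    pulses[:, :3] = rng.integers(0, 3, size=(12, 3))  # a few repeated sensor positions

    clusterer = cluster_and_pad(
        x=pulses,
        cluster_columns=[0, 1, 2],
        input_names=["dom_x", "dom_y", "dom_z", "dom_time", "charge"],
    )

    # Summarize time and charge (columns 3 and 4) per sensor with percentiles,
    # then append log10 of the pulse count per sensor.
    clusterer.add_percentile_summary(
        summarization_indices=[3, 4], percentiles=[10, 50, 90]
    )
    clusterer.add_counts()

    array = clusterer.clustered_x      # [n_unique_sensors, n_summary_features]
    names = clusterer._cluster_names   # auto-generated column names (internal attribute)

Because the add_* methods mutate clustered_x in place rather than returning it (returns were dropped in patch 15), results are read back from the clustered_x attribute, which is also how the updated _construct_nodes in nodes.py consumes the class.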