From 51da4b0610bd64c8aa2a66ee86eef176168c1b56 Mon Sep 17 00:00:00 2001
From: "askerosted@gmail.com" <askerosted@gmail.com>
Date: Fri, 6 Dec 2024 15:45:03 +0900
Subject: [PATCH] re-add old function

---
 src/graphnet/models/graphs/utils.py | 61 +++++++++++++++++++++++++++++
 1 file changed, 61 insertions(+)

diff --git a/src/graphnet/models/graphs/utils.py b/src/graphnet/models/graphs/utils.py
index 0ac8aeac2..0fcd6c136 100644
--- a/src/graphnet/models/graphs/utils.py
+++ b/src/graphnet/models/graphs/utils.py
@@ -113,6 +113,67 @@ def identify_indices(
     return cluster_indices, summarization_indices, features_for_summarization
 
 
+# TODO Remove this function as it is superseded by
+# cluster_and_pad, which has the same functionality.
+def cluster_summarize_with_percentiles(
+    x: np.ndarray,
+    summarization_indices: List[int],
+    cluster_indices: List[int],
+    percentiles: List[int],
+    add_counts: bool,
+) -> np.ndarray:
+    """Turn `x` into clusters with percentile summary.
+
+    From variables specified by column indices `cluster_indices`, `x` is turned
+    into clusters. Information in columns of `x` specified by indices
+    `summarization_indices` within each cluster is summarized using
+    percentiles. It is assumed `x` represents a single event.
+
+    **Example use-case**:
+    Suppose `x` contains raw pulses from a neutrino event where some DOMs have
+    multiple measurements of Cherenkov radiation. If `cluster_indices` is set
+    to the columns corresponding to the xyz-position of the DOMs, and the
+    features specified in `summarization_indices` correspond to time, charge,
+    then each row in the returned array will correspond to a DOM,
+    and the time and charge for each DOM will be summarized by percentiles.
+    Returned output array has dimensions `[n_clusters,
+    len(percentiles)*len(summarization_indices) + len(cluster_indices)]`,
+    plus one trailing column when `add_counts` is True.
+
+    Args:
+        x: Array to be clustered.
+        summarization_indices: List of column indices that defines features
+            that will be summarized with percentiles. Must not be empty.
+        cluster_indices: List of column indices the clusters are built on.
+        percentiles: Percentiles used to summarize `x`. E.g. [10,50,90].
+        add_counts: If True, append a log10(counts) column at the end.
+
+    Returns:
+        Percentile-summarized array.
+    """
+    pct_dict = {}  # feature index -> percentile columns for that feature
+    for feature_idx in summarization_indices:
+        summarized_array, column_offset, counts = gather_cluster_sequence(
+            x, feature_idx, cluster_indices
+        )
+        pct_dict[feature_idx] = np.nanpercentile(
+            summarized_array[:, column_offset:], percentiles, axis=1
+        ).T  # transpose: one row per input row, one column per percentile
+
+    for i, key in enumerate(pct_dict.keys()):
+        if i == 0:  # seed output with the columns left of `column_offset`
+            array = summarized_array[:, 0:column_offset]
+
+        array = np.concatenate([array, pct_dict[key]], axis=1)
+
+    if add_counts:
+        array = np.concatenate(
+            [array, np.log10(counts).reshape(-1, 1)], axis=1
+        )
+
+    return array
+
+
 class cluster_and_pad:
     """Cluster and pad the data for further summarization.