From 2233cddd2da4aa3346cb3c506f63b398f0999dc3 Mon Sep 17 00:00:00 2001 From: "askerosted@gmail.com" Date: Wed, 23 Oct 2024 18:21:55 +0900 Subject: [PATCH 01/18] cluster and pad utility --- src/graphnet/models/graphs/utils.py | 236 +++++++++++++++++++++++++++- 1 file changed, 235 insertions(+), 1 deletion(-) diff --git a/src/graphnet/models/graphs/utils.py b/src/graphnet/models/graphs/utils.py index ea8445f90..4ba99d1a4 100644 --- a/src/graphnet/models/graphs/utils.py +++ b/src/graphnet/models/graphs/utils.py @@ -1,6 +1,6 @@ """Utility functions for construction of graphs.""" -from typing import List, Tuple +from typing import List, Tuple, Optional, Union import os import numpy as np import pandas as pd @@ -113,6 +113,7 @@ def identify_indices( return cluster_indices, summarization_indices, features_for_summarization +# TODO Remove this function as it is superseded by the class cluster_and_pad wich has the same functionality def cluster_summarize_with_percentiles( x: np.ndarray, summarization_indices: List[int], @@ -149,6 +150,9 @@ def cluster_summarize_with_percentiles( Returns: Percentile-summarized array """ + print( + "This function is deprecated and will be removed, use the class cluster_and_pad with add_percentile_summary instead for the same functionality" + ) pct_dict = {} for feature_idx in summarization_indices: summarized_array, column_offset, counts = gather_cluster_sequence( @@ -172,6 +176,236 @@ def cluster_summarize_with_percentiles( return array +class cluster_and_pad: + """cluster and pad the data for further summarization.""" + + def __init__(self, x: np.ndarray, cluster_columns: List[int]) -> None: + """Initialize the class with the data and cluster columns. + + Args: + x: Array to be clustered + cluster_columns: List of column indices on which the clusters + are constructed. + Returns: None + Adds: + clustered_x: Added to the class + _counts: Added to the class + _padded_x: Added to the class + """ + x = lex_sort(x=x, cluster_columns=cluster_columns) + + unique_sensors, self._counts = np.unique( + x[:, cluster_columns], axis=0, return_counts=True + ) + + contingency_table = np.concatenate( + [unique_sensors, self._counts.reshape(-1, 1)], axis=1 + ) + + contingency_table = lex_sort( + x=contingency_table, cluster_columns=cluster_columns + ) + + self.clustered_x = contingency_table[:, 0 : unique_sensors.shape[1]] + self._counts = ( + contingency_table[:, self.clustered_x.shape[1] :] + .flatten() + .astype(int) + ) + + self._padded_x = np.empty( + (len(self._counts), max(self._counts), x.shape[1]) + ) + self._padded_x.fill(np.nan) + + for i in range(len(self._counts)): + self._padded_x[i, : self._counts[i]] = x[: self._counts[i]] + x = x[self._counts[i] :] + + def _add_column( + self, column: np.ndarray, location: Optional[int] = None + ) -> None: + """Add a column to the clustered tensor. + + Args: + column: Column to be added to the tensor + location: Location to insert the column in the clustered tensor + Returns: + clustered_x: The clustered tensor with the column added + """ + if location is None: + self.clustered_x = np.column_stack([self.clustered_x, column]) + else: + self.clustered_x = np.insert( + self.clustered_x, location, column, axis=1 + ) + + def add_charge_threshold_summary( + self, + summarization_indices: List[int], + percentiles: List[int], + charge_index: int, + location: Optional[int] = None, + ) -> np.ndarray: + """Summarize features through percentiles on charge of sensor. 
+ + Args: + summarization_indices: List of column indices that defines features + that will be summarized with percentiles. + percentiles: percentiles used to summarize `x`. E.g. [10,50,90]. + charge_index: index of the charge column in the padded tensor + location: Location to insert the summarization indices in the clustered tensor defaults to adding at the end + Returns: + clustered_x: The clustered tensor with the summarization indices added + Adds: + _charge_sum: Added to the class + _charge_weights: Added to the class + Altered: + _padded_x: Charge is altered to be the cumulative sum + of the charge divided by the total charge + clustered_x: The summarization indices are added at the end of the tensor + """ + # convert the charge to the cumulative sum of the charge divided by the total charge + self._charge_weights = self._padded_x[:, :, charge_index] + + self._padded_x[:, :, charge_index] = self._padded_x[ + :, :, charge_index + ].cumsum(axis=1) + + # add the charge sum to the class if it does not already exist + if not hasattr(self, "_charge_sum"): + self._charge_sum = np.nanmax( + self._padded_x[:, :, charge_index], axis=1 + ) + + self._charge_weights = ( + self._charge_weights / self._charge_sum[:, np.newaxis] + ) + + self._padded_x[:, :, charge_index] = ( + self._padded_x[:, :, charge_index] + / self._charge_sum[:, np.newaxis] + ) + + # Summarize the charge at different percentiles + selections = np.argmax( + self._padded_x[:, :, charge_index][:, :, np.newaxis] + >= (np.array(percentiles) / 100), + axis=1, + ) + + selections += (np.arange(len(self._counts)) * self._padded_x.shape[1])[ + :, np.newaxis + ] + + selections = self._padded_x[:, :, summarization_indices].reshape( + -1, len(summarization_indices) + )[selections] + selections = selections.transpose(0, 2, 1).reshape( + len(self.clustered_x), -1 + ) + self._add_column(selections, location) + return self.clustered_x + + def add_percentile_summary( + self, + summarization_indices: List[int], + percentiles: List[int], + method: str = "linear", + location: Optional[int] = None, + ) -> np.ndarray: + """Summarize the features of the sensors using percentiles. + + Args: + summarization_indices: List of column indices that defines features + that will be summarized with percentiles. + percentiles: percentiles used to summarize `x`. E.g. [10,50,90]. + method: Method to summarize the features. E.g. 
"linear" + location: Location to insert the summarization indices in the clustered tensor defaults to adding at the end + Returns: + None + Adds: + None + Altered: + clustered_x: The summarization indices are added at the end of the tensor + """ + percentiles_x = np.nanpercentile( + self._padded_x[:, :, summarization_indices], + percentiles, + axis=1, + method=method, + ) + + percentiles_x = percentiles_x.transpose(1, 2, 0).reshape( + len(self.clustered_x), -1 + ) + self._add_column(percentiles_x, location) + return self.clustered_x + + def add_counts(self, location: int) -> np.ndarray: + """Add the counts of the sensor to the summarization features.""" + self._add_column(np.log10(self._counts), location) + return self.clustered_x + + def calculate_charge_sum(self, charge_index: int) -> np.ndarray: + """Calculate the sum of the charge.""" + assert not hasattr( + self, "_charge_sum" + ), "Charge sum has already been calculated, re-calculation is not allowed" + self._charge_sum = self._padded_x[:, :, charge_index].sum(axis=1) + return self._charge_sum + + def calculate_charge_weights(self, charge_index: int) -> np.ndarray: + """Calculate the weights of the charge.""" + assert not hasattr( + self, "_charge_weights" + ), "Charge weights have already been calculated, re-calculation is not allowed" + assert hasattr( + self, "_charge_sum" + ), "Charge sum has not been calculated, please run calculate_charge_sum" + self._charge_weights = ( + self._padded_x[:, :, charge_index] + / self._charge_sum[:, np.newaxis] + ) + return self._charge_weights + + def add_sum_charge(self, location: int) -> np.ndarray: + """Add the sum of the charge to the summarization features.""" + assert hasattr( + self, "_charge_sum" + ), "Charge sum has not been calculated, please run calculate_charge_sum" + self._add_column(self._charge_sum, location) + return self.clustered_x + + def add_std( + self, + column: int, + location: Optional[int] = None, + weights: Union[np.ndarray, int] = 1, + ) -> np.ndarray: + """Add the standard deviation of the column. 
+ + Args: + column: Index of the column in the padded tensor to calculate the standard deviation + location: Location to insert the standard deviation in the clustered tensor defaults to adding at the end + weights: Optional weights to be applied to the standard deviation + """ + self._add_column( + np.nanstd(self._padded_x[:, :, column] * weights, axis=1), location + ) + return self.clustered_x + + def add_mean( + self, column: int, location: int, weights: Union[np.ndarray, int] = 1 + ) -> np.ndarray: + """Add the mean of the column.""" + self._add_column( + np.nanmean(self._padded_x[:, :, column] * weights, axis=1), + location, + ) + return self.clustered_x + + def ice_transparency( z_offset: float = None, z_scaling: float = None ) -> Tuple[interp1d, interp1d]: From 19122e7bef2ed2a2cef0286ebac401265eef4fcc Mon Sep 17 00:00:00 2001 From: "askerosted@gmail.com" Date: Thu, 24 Oct 2024 16:59:02 +0900 Subject: [PATCH 02/18] Location default None --- src/graphnet/models/graphs/utils.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/src/graphnet/models/graphs/utils.py b/src/graphnet/models/graphs/utils.py index 4ba99d1a4..078aa04f2 100644 --- a/src/graphnet/models/graphs/utils.py +++ b/src/graphnet/models/graphs/utils.py @@ -342,11 +342,6 @@ def add_percentile_summary( self._add_column(percentiles_x, location) return self.clustered_x - def add_counts(self, location: int) -> np.ndarray: - """Add the counts of the sensor to the summarization features.""" - self._add_column(np.log10(self._counts), location) - return self.clustered_x - def calculate_charge_sum(self, charge_index: int) -> np.ndarray: """Calculate the sum of the charge.""" assert not hasattr( @@ -369,7 +364,12 @@ def calculate_charge_weights(self, charge_index: int) -> np.ndarray: ) return self._charge_weights - def add_sum_charge(self, location: int) -> np.ndarray: + def add_counts(self, location: Optional[int]) -> np.ndarray: + """Add the counts of the sensor to the summarization features.""" + self._add_column(np.log10(self._counts), location) + return self.clustered_x + + def add_sum_charge(self, location: Optional[int] = None) -> np.ndarray: """Add the sum of the charge to the summarization features.""" assert hasattr( self, "_charge_sum" @@ -396,7 +396,10 @@ def add_std( return self.clustered_x def add_mean( - self, column: int, location: int, weights: Union[np.ndarray, int] = 1 + self, + column: int, + location: Optional[int] = None, + weights: Union[np.ndarray, int] = 1, ) -> np.ndarray: """Add the mean of the column.""" self._add_column( From 1e9e45acbba757040fac5b30ff8363f04df38683 Mon Sep 17 00:00:00 2001 From: "askerosted@gmail.com" Date: Thu, 24 Oct 2024 17:01:06 +0900 Subject: [PATCH 03/18] more default none --- src/graphnet/models/graphs/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/graphnet/models/graphs/utils.py b/src/graphnet/models/graphs/utils.py index 078aa04f2..16f7d63ee 100644 --- a/src/graphnet/models/graphs/utils.py +++ b/src/graphnet/models/graphs/utils.py @@ -364,7 +364,7 @@ def calculate_charge_weights(self, charge_index: int) -> np.ndarray: ) return self._charge_weights - def add_counts(self, location: Optional[int]) -> np.ndarray: + def add_counts(self, location: Optional[int] = None) -> np.ndarray: """Add the counts of the sensor to the summarization features.""" self._add_column(np.log10(self._counts), location) return self.clustered_x From 81416b8d28159123eea7bb40372484cbf1ceab7f Mon Sep 17 00:00:00 2001 From: 
"askerosted@gmail.com" Date: Thu, 24 Oct 2024 17:01:42 +0900 Subject: [PATCH 04/18] Update PercentileCluster --- src/graphnet/models/graphs/nodes/nodes.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/graphnet/models/graphs/nodes/nodes.py b/src/graphnet/models/graphs/nodes/nodes.py index 4e094e6be..139d851a0 100644 --- a/src/graphnet/models/graphs/nodes/nodes.py +++ b/src/graphnet/models/graphs/nodes/nodes.py @@ -9,7 +9,7 @@ from graphnet.utilities.decorators import final from graphnet.models import Model from graphnet.models.graphs.utils import ( - cluster_summarize_with_percentiles, + cluster_and_pad, identify_indices, lex_sort, ice_transparency, @@ -198,13 +198,14 @@ def _construct_nodes(self, x: torch.Tensor) -> Data: x = x.numpy() # Construct clusters with percentile-summarized features if hasattr(self, "_summarization_indices"): - array = cluster_summarize_with_percentiles( - x=x, + cluster_class = cluster_and_pad( + x=x, cluster_columns=self._cluster_indices + ) + array = cluster_class.add_percentile_summary( summarization_indices=self._summarization_indices, - cluster_indices=self._cluster_indices, percentiles=self._percentiles, - add_counts=self._add_counts, ) + array = cluster_class.add_counts() else: self.error( f"""{self.__class__.__name__} was not instatiated with From 64b728c6737377ef63a88b8dfd0d5adb28dc8c0e Mon Sep 17 00:00:00 2001 From: "askerosted@gmail.com" Date: Wed, 23 Oct 2024 18:21:55 +0900 Subject: [PATCH 05/18] cluster and pad utility --- src/graphnet/models/graphs/utils.py | 236 +++++++++++++++++++++++++++- 1 file changed, 235 insertions(+), 1 deletion(-) diff --git a/src/graphnet/models/graphs/utils.py b/src/graphnet/models/graphs/utils.py index 77669eaeb..d068288ff 100644 --- a/src/graphnet/models/graphs/utils.py +++ b/src/graphnet/models/graphs/utils.py @@ -1,6 +1,6 @@ """Utility functions for construction of graphs.""" -from typing import List, Tuple, Optional +from typing import List, Tuple, Optional, Union import os import numpy as np import pandas as pd @@ -113,6 +113,7 @@ def identify_indices( return cluster_indices, summarization_indices, features_for_summarization +# TODO Remove this function as it is superseded by the class cluster_and_pad wich has the same functionality def cluster_summarize_with_percentiles( x: np.ndarray, summarization_indices: List[int], @@ -149,6 +150,9 @@ def cluster_summarize_with_percentiles( Returns: Percentile-summarized array """ + print( + "This function is deprecated and will be removed, use the class cluster_and_pad with add_percentile_summary instead for the same functionality" + ) pct_dict = {} for feature_idx in summarization_indices: summarized_array, column_offset, counts = gather_cluster_sequence( @@ -172,6 +176,236 @@ def cluster_summarize_with_percentiles( return array +class cluster_and_pad: + """cluster and pad the data for further summarization.""" + + def __init__(self, x: np.ndarray, cluster_columns: List[int]) -> None: + """Initialize the class with the data and cluster columns. + + Args: + x: Array to be clustered + cluster_columns: List of column indices on which the clusters + are constructed. 
+ Returns: None + Adds: + clustered_x: Added to the class + _counts: Added to the class + _padded_x: Added to the class + """ + x = lex_sort(x=x, cluster_columns=cluster_columns) + + unique_sensors, self._counts = np.unique( + x[:, cluster_columns], axis=0, return_counts=True + ) + + contingency_table = np.concatenate( + [unique_sensors, self._counts.reshape(-1, 1)], axis=1 + ) + + contingency_table = lex_sort( + x=contingency_table, cluster_columns=cluster_columns + ) + + self.clustered_x = contingency_table[:, 0 : unique_sensors.shape[1]] + self._counts = ( + contingency_table[:, self.clustered_x.shape[1] :] + .flatten() + .astype(int) + ) + + self._padded_x = np.empty( + (len(self._counts), max(self._counts), x.shape[1]) + ) + self._padded_x.fill(np.nan) + + for i in range(len(self._counts)): + self._padded_x[i, : self._counts[i]] = x[: self._counts[i]] + x = x[self._counts[i] :] + + def _add_column( + self, column: np.ndarray, location: Optional[int] = None + ) -> None: + """Add a column to the clustered tensor. + + Args: + column: Column to be added to the tensor + location: Location to insert the column in the clustered tensor + Returns: + clustered_x: The clustered tensor with the column added + """ + if location is None: + self.clustered_x = np.column_stack([self.clustered_x, column]) + else: + self.clustered_x = np.insert( + self.clustered_x, location, column, axis=1 + ) + + def add_charge_threshold_summary( + self, + summarization_indices: List[int], + percentiles: List[int], + charge_index: int, + location: Optional[int] = None, + ) -> np.ndarray: + """Summarize features through percentiles on charge of sensor. + + Args: + summarization_indices: List of column indices that defines features + that will be summarized with percentiles. + percentiles: percentiles used to summarize `x`. E.g. [10,50,90]. 
+ charge_index: index of the charge column in the padded tensor + location: Location to insert the summarization indices in the clustered tensor defaults to adding at the end + Returns: + clustered_x: The clustered tensor with the summarization indices added + Adds: + _charge_sum: Added to the class + _charge_weights: Added to the class + Altered: + _padded_x: Charge is altered to be the cumulative sum + of the charge divided by the total charge + clustered_x: The summarization indices are added at the end of the tensor + """ + # convert the charge to the cumulative sum of the charge divided by the total charge + self._charge_weights = self._padded_x[:, :, charge_index] + + self._padded_x[:, :, charge_index] = self._padded_x[ + :, :, charge_index + ].cumsum(axis=1) + + # add the charge sum to the class if it does not already exist + if not hasattr(self, "_charge_sum"): + self._charge_sum = np.nanmax( + self._padded_x[:, :, charge_index], axis=1 + ) + + self._charge_weights = ( + self._charge_weights / self._charge_sum[:, np.newaxis] + ) + + self._padded_x[:, :, charge_index] = ( + self._padded_x[:, :, charge_index] + / self._charge_sum[:, np.newaxis] + ) + + # Summarize the charge at different percentiles + selections = np.argmax( + self._padded_x[:, :, charge_index][:, :, np.newaxis] + >= (np.array(percentiles) / 100), + axis=1, + ) + + selections += (np.arange(len(self._counts)) * self._padded_x.shape[1])[ + :, np.newaxis + ] + + selections = self._padded_x[:, :, summarization_indices].reshape( + -1, len(summarization_indices) + )[selections] + selections = selections.transpose(0, 2, 1).reshape( + len(self.clustered_x), -1 + ) + self._add_column(selections, location) + return self.clustered_x + + def add_percentile_summary( + self, + summarization_indices: List[int], + percentiles: List[int], + method: str = "linear", + location: Optional[int] = None, + ) -> np.ndarray: + """Summarize the features of the sensors using percentiles. + + Args: + summarization_indices: List of column indices that defines features + that will be summarized with percentiles. + percentiles: percentiles used to summarize `x`. E.g. [10,50,90]. + method: Method to summarize the features. E.g. 
"linear" + location: Location to insert the summarization indices in the clustered tensor defaults to adding at the end + Returns: + None + Adds: + None + Altered: + clustered_x: The summarization indices are added at the end of the tensor + """ + percentiles_x = np.nanpercentile( + self._padded_x[:, :, summarization_indices], + percentiles, + axis=1, + method=method, + ) + + percentiles_x = percentiles_x.transpose(1, 2, 0).reshape( + len(self.clustered_x), -1 + ) + self._add_column(percentiles_x, location) + return self.clustered_x + + def add_counts(self, location: int) -> np.ndarray: + """Add the counts of the sensor to the summarization features.""" + self._add_column(np.log10(self._counts), location) + return self.clustered_x + + def calculate_charge_sum(self, charge_index: int) -> np.ndarray: + """Calculate the sum of the charge.""" + assert not hasattr( + self, "_charge_sum" + ), "Charge sum has already been calculated, re-calculation is not allowed" + self._charge_sum = self._padded_x[:, :, charge_index].sum(axis=1) + return self._charge_sum + + def calculate_charge_weights(self, charge_index: int) -> np.ndarray: + """Calculate the weights of the charge.""" + assert not hasattr( + self, "_charge_weights" + ), "Charge weights have already been calculated, re-calculation is not allowed" + assert hasattr( + self, "_charge_sum" + ), "Charge sum has not been calculated, please run calculate_charge_sum" + self._charge_weights = ( + self._padded_x[:, :, charge_index] + / self._charge_sum[:, np.newaxis] + ) + return self._charge_weights + + def add_sum_charge(self, location: int) -> np.ndarray: + """Add the sum of the charge to the summarization features.""" + assert hasattr( + self, "_charge_sum" + ), "Charge sum has not been calculated, please run calculate_charge_sum" + self._add_column(self._charge_sum, location) + return self.clustered_x + + def add_std( + self, + column: int, + location: Optional[int] = None, + weights: Union[np.ndarray, int] = 1, + ) -> np.ndarray: + """Add the standard deviation of the column. 
+ + Args: + column: Index of the column in the padded tensor to calculate the standard deviation + location: Location to insert the standard deviation in the clustered tensor defaults to adding at the end + weights: Optional weights to be applied to the standard deviation + """ + self._add_column( + np.nanstd(self._padded_x[:, :, column] * weights, axis=1), location + ) + return self.clustered_x + + def add_mean( + self, column: int, location: int, weights: Union[np.ndarray, int] = 1 + ) -> np.ndarray: + """Add the mean of the column.""" + self._add_column( + np.nanmean(self._padded_x[:, :, column] * weights, axis=1), + location, + ) + return self.clustered_x + + def ice_transparency( z_offset: Optional[float] = None, z_scaling: Optional[float] = None ) -> Tuple[interp1d, interp1d]: From b8654fa057bda1c8ac9b96f2fa080637e7021349 Mon Sep 17 00:00:00 2001 From: "askerosted@gmail.com" Date: Thu, 24 Oct 2024 16:59:02 +0900 Subject: [PATCH 06/18] Location default None --- src/graphnet/models/graphs/utils.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/src/graphnet/models/graphs/utils.py b/src/graphnet/models/graphs/utils.py index d068288ff..f65bc562d 100644 --- a/src/graphnet/models/graphs/utils.py +++ b/src/graphnet/models/graphs/utils.py @@ -342,11 +342,6 @@ def add_percentile_summary( self._add_column(percentiles_x, location) return self.clustered_x - def add_counts(self, location: int) -> np.ndarray: - """Add the counts of the sensor to the summarization features.""" - self._add_column(np.log10(self._counts), location) - return self.clustered_x - def calculate_charge_sum(self, charge_index: int) -> np.ndarray: """Calculate the sum of the charge.""" assert not hasattr( @@ -369,7 +364,12 @@ def calculate_charge_weights(self, charge_index: int) -> np.ndarray: ) return self._charge_weights - def add_sum_charge(self, location: int) -> np.ndarray: + def add_counts(self, location: Optional[int]) -> np.ndarray: + """Add the counts of the sensor to the summarization features.""" + self._add_column(np.log10(self._counts), location) + return self.clustered_x + + def add_sum_charge(self, location: Optional[int] = None) -> np.ndarray: """Add the sum of the charge to the summarization features.""" assert hasattr( self, "_charge_sum" @@ -396,7 +396,10 @@ def add_std( return self.clustered_x def add_mean( - self, column: int, location: int, weights: Union[np.ndarray, int] = 1 + self, + column: int, + location: Optional[int] = None, + weights: Union[np.ndarray, int] = 1, ) -> np.ndarray: """Add the mean of the column.""" self._add_column( From d964efcdf5fcebdfef71f92bb07d13f47b85fe2a Mon Sep 17 00:00:00 2001 From: "askerosted@gmail.com" Date: Thu, 24 Oct 2024 17:01:06 +0900 Subject: [PATCH 07/18] more default none --- src/graphnet/models/graphs/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/graphnet/models/graphs/utils.py b/src/graphnet/models/graphs/utils.py index f65bc562d..55c8fbd33 100644 --- a/src/graphnet/models/graphs/utils.py +++ b/src/graphnet/models/graphs/utils.py @@ -364,7 +364,7 @@ def calculate_charge_weights(self, charge_index: int) -> np.ndarray: ) return self._charge_weights - def add_counts(self, location: Optional[int]) -> np.ndarray: + def add_counts(self, location: Optional[int] = None) -> np.ndarray: """Add the counts of the sensor to the summarization features.""" self._add_column(np.log10(self._counts), location) return self.clustered_x From e6357b50a472d491b4652357676f1622f5068c85 Mon Sep 17 00:00:00 2001 From: 
"askerosted@gmail.com" Date: Thu, 24 Oct 2024 17:01:42 +0900 Subject: [PATCH 08/18] Update PercentileCluster --- src/graphnet/models/graphs/nodes/nodes.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/graphnet/models/graphs/nodes/nodes.py b/src/graphnet/models/graphs/nodes/nodes.py index 558ec96f4..59de864fd 100644 --- a/src/graphnet/models/graphs/nodes/nodes.py +++ b/src/graphnet/models/graphs/nodes/nodes.py @@ -9,7 +9,7 @@ from graphnet.utilities.decorators import final from graphnet.models import Model from graphnet.models.graphs.utils import ( - cluster_summarize_with_percentiles, + cluster_and_pad, identify_indices, lex_sort, ice_transparency, @@ -198,13 +198,14 @@ def _construct_nodes(self, x: torch.Tensor) -> Data: x = x.numpy() # Construct clusters with percentile-summarized features if hasattr(self, "_summarization_indices"): - array = cluster_summarize_with_percentiles( - x=x, + cluster_class = cluster_and_pad( + x=x, cluster_columns=self._cluster_indices + ) + array = cluster_class.add_percentile_summary( summarization_indices=self._summarization_indices, - cluster_indices=self._cluster_indices, percentiles=self._percentiles, - add_counts=self._add_counts, ) + array = cluster_class.add_counts() else: self.error( f"""{self.__class__.__name__} was not instatiated with From 7fcc7fe4ce793e8695ffbfc667ed92858a404930 Mon Sep 17 00:00:00 2001 From: "askerosted@gmail.com" Date: Fri, 15 Nov 2024 14:48:34 +0900 Subject: [PATCH 09/18] align with prehooks --- src/graphnet/models/graphs/utils.py | 49 +++++++++++++++++++---------- 1 file changed, 32 insertions(+), 17 deletions(-) diff --git a/src/graphnet/models/graphs/utils.py b/src/graphnet/models/graphs/utils.py index 55c8fbd33..d2ed6cce4 100644 --- a/src/graphnet/models/graphs/utils.py +++ b/src/graphnet/models/graphs/utils.py @@ -113,7 +113,8 @@ def identify_indices( return cluster_indices, summarization_indices, features_for_summarization -# TODO Remove this function as it is superseded by the class cluster_and_pad wich has the same functionality +# TODO Remove this function as it is superseded by +# cluster_and_pad wich has the same functionality def cluster_summarize_with_percentiles( x: np.ndarray, summarization_indices: List[int], @@ -151,7 +152,9 @@ def cluster_summarize_with_percentiles( Percentile-summarized array """ print( - "This function is deprecated and will be removed, use the class cluster_and_pad with add_percentile_summary instead for the same functionality" + "This function is deprecated and will be removed,", + "use the class cluster_and_pad with add_percentile_summary", + "instead for the same functionality", ) pct_dict = {} for feature_idx in summarization_indices: @@ -177,7 +180,7 @@ def cluster_summarize_with_percentiles( class cluster_and_pad: - """cluster and pad the data for further summarization.""" + """Cluster and pad the data for further summarization.""" def __init__(self, x: np.ndarray, cluster_columns: List[int]) -> None: """Initialize the class with the data and cluster columns. @@ -251,21 +254,25 @@ def add_charge_threshold_summary( Args: summarization_indices: List of column indices that defines features - that will be summarized with percentiles. + that will be summarized with percentiles. percentiles: percentiles used to summarize `x`. E.g. [10,50,90]. 
charge_index: index of the charge column in the padded tensor - location: Location to insert the summarization indices in the clustered tensor defaults to adding at the end + location: Location to insert the summarization indices in the + clustered tensor defaults to adding at the end Returns: - clustered_x: The clustered tensor with the summarization indices added + clustered_x: The clustered tensor with the summarization indices + added Adds: _charge_sum: Added to the class _charge_weights: Added to the class Altered: _padded_x: Charge is altered to be the cumulative sum - of the charge divided by the total charge - clustered_x: The summarization indices are added at the end of the tensor + of the charge divided by the total charge + clustered_x: The summarization indices are added at the end + of the tensor """ - # convert the charge to the cumulative sum of the charge divided by the total charge + # convert the charge to the cumulative sum of the charge divided + # by the total charge self._charge_weights = self._padded_x[:, :, charge_index] self._padded_x[:, :, charge_index] = self._padded_x[ @@ -321,13 +328,15 @@ def add_percentile_summary( that will be summarized with percentiles. percentiles: percentiles used to summarize `x`. E.g. [10,50,90]. method: Method to summarize the features. E.g. "linear" - location: Location to insert the summarization indices in the clustered tensor defaults to adding at the end + location: Location to insert the summarization indices in the + clustered tensor defaults to adding at the end Returns: None Adds: None Altered: - clustered_x: The summarization indices are added at the end of the tensor + clustered_x: The summarization indices are added at the end of + the tensor """ percentiles_x = np.nanpercentile( self._padded_x[:, :, summarization_indices], @@ -346,7 +355,8 @@ def calculate_charge_sum(self, charge_index: int) -> np.ndarray: """Calculate the sum of the charge.""" assert not hasattr( self, "_charge_sum" - ), "Charge sum has already been calculated, re-calculation is not allowed" + ), "Charge sum has already been calculated, \ + re-calculation is not allowed" self._charge_sum = self._padded_x[:, :, charge_index].sum(axis=1) return self._charge_sum @@ -354,10 +364,12 @@ def calculate_charge_weights(self, charge_index: int) -> np.ndarray: """Calculate the weights of the charge.""" assert not hasattr( self, "_charge_weights" - ), "Charge weights have already been calculated, re-calculation is not allowed" + ), "Charge weights have already been calculated, \ + re-calculation is not allowed" assert hasattr( self, "_charge_sum" - ), "Charge sum has not been calculated, please run calculate_charge_sum" + ), "Charge sum has not been calculated, \ + please run calculate_charge_sum" self._charge_weights = ( self._padded_x[:, :, charge_index] / self._charge_sum[:, np.newaxis] @@ -373,7 +385,8 @@ def add_sum_charge(self, location: Optional[int] = None) -> np.ndarray: """Add the sum of the charge to the summarization features.""" assert hasattr( self, "_charge_sum" - ), "Charge sum has not been calculated, please run calculate_charge_sum" + ), "Charge sum has not been calculated, \ + please run calculate_charge_sum" self._add_column(self._charge_sum, location) return self.clustered_x @@ -386,8 +399,10 @@ def add_std( """Add the standard deviation of the column. 
Args: - column: Index of the column in the padded tensor to calculate the standard deviation - location: Location to insert the standard deviation in the clustered tensor defaults to adding at the end + column: Index of the column in the padded tensor to + calculate the standard deviation + location: Location to insert the standard deviation in the + clustered tensor defaults to adding at the end weights: Optional weights to be applied to the standard deviation """ self._add_column( From 7f7000fb113c9f43412c528d398f8bca7f89991b Mon Sep 17 00:00:00 2001 From: "askerosted@gmail.com" Date: Fri, 6 Dec 2024 14:29:44 +0900 Subject: [PATCH 10/18] fix add_counts optional --- src/graphnet/models/graphs/nodes/nodes.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/src/graphnet/models/graphs/nodes/nodes.py b/src/graphnet/models/graphs/nodes/nodes.py index 59de864fd..36afb4e1d 100644 --- a/src/graphnet/models/graphs/nodes/nodes.py +++ b/src/graphnet/models/graphs/nodes/nodes.py @@ -169,9 +169,7 @@ def _define_output_feature_names( cluster_idx, summ_idx, new_feature_names, - ) = self._get_indices_and_feature_names( - input_feature_names, self._add_counts - ) + ) = self._get_indices_and_feature_names(input_feature_names) self._cluster_indices = cluster_idx self._summarization_indices = summ_idx return new_feature_names @@ -179,7 +177,6 @@ def _define_output_feature_names( def _get_indices_and_feature_names( self, feature_names: List[str], - add_counts: bool, ) -> Tuple[List[int], List[int], List[str]]: cluster_idx, summ_idx, summ_names = identify_indices( feature_names, self._cluster_on @@ -188,7 +185,7 @@ def _get_indices_and_feature_names( for feature in summ_names: for pct in self._percentiles: new_feature_names.append(f"{feature}_pct{pct}") - if add_counts: + if self._add_counts: # add "counts" as the last feature new_feature_names.append("counts") return cluster_idx, summ_idx, new_feature_names @@ -205,7 +202,8 @@ def _construct_nodes(self, x: torch.Tensor) -> Data: summarization_indices=self._summarization_indices, percentiles=self._percentiles, ) - array = cluster_class.add_counts() + if self._add_counts: + array = cluster_class.add_counts() else: self.error( f"""{self.__class__.__name__} was not instatiated with From 505a82eebcebd3adcd24a12c213c742539ff98e9 Mon Sep 17 00:00:00 2001 From: "askerosted@gmail.com" Date: Fri, 6 Dec 2024 15:10:34 +0900 Subject: [PATCH 11/18] update docstrings --- src/graphnet/models/graphs/utils.py | 29 ++++++++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/src/graphnet/models/graphs/utils.py b/src/graphnet/models/graphs/utils.py index d2ed6cce4..85ea94d9d 100644 --- a/src/graphnet/models/graphs/utils.py +++ b/src/graphnet/models/graphs/utils.py @@ -180,7 +180,30 @@ def cluster_summarize_with_percentiles( class cluster_and_pad: - """Cluster and pad the data for further summarization.""" + """Cluster and pad the data for further summarization. + + Clusters the inptut data according to the specified columns + and computes aggregate statistics on the clusters. + The clustering will happen only ones creating a cluster matrix + which will hold all the aggregated statistics and a padded matrix which + will hold the padded data for quick calculation of aggregate statistics. 
+ + Example: + clustered_x = cluster_and_pad(x = single_event_as_array, + cluster_columns = [0,1,2]) + # Creates a cluster matrix and a padded matrix, + # the cluster matrix will contain the unique values of the cluster columns, + # no additional aggregate statistics are added yet. + + clustered_x_with_percentiles = cluster_class.add_percentile_summary( + summarization_indices = [3,4,5], percentiles = [10,50,90]) + # Adds the 10th, 50th and 90th percentile of columns 3,4 + # and 5 in the input data to the cluster matrix. + + clustered_x_with_percentiles_and_std = cluster_class.add_std(column = 4) + # Adds the standard deviation of column 4 in the input data + # to the cluster matrix. + """ def __init__(self, x: np.ndarray, cluster_columns: List[int]) -> None: """Initialize the class with the data and cluster columns. @@ -269,7 +292,7 @@ def add_charge_threshold_summary( _padded_x: Charge is altered to be the cumulative sum of the charge divided by the total charge clustered_x: The summarization indices are added at the end - of the tensor + of the tensor or inserted at the specified location. """ # convert the charge to the cumulative sum of the charge divided # by the total charge @@ -336,7 +359,7 @@ def add_percentile_summary( None Altered: clustered_x: The summarization indices are added at the end of - the tensor + the tensor or inserted at the specified location """ percentiles_x = np.nanpercentile( self._padded_x[:, :, summarization_indices], From 4a4083b5b4092d26ae811f63a4578ff7a1554552 Mon Sep 17 00:00:00 2001 From: "askerosted@gmail.com" Date: Fri, 6 Dec 2024 15:17:44 +0900 Subject: [PATCH 12/18] move/use internal functions + output x --- src/graphnet/models/graphs/utils.py | 68 ++++++++++++----------------- 1 file changed, 28 insertions(+), 40 deletions(-) diff --git a/src/graphnet/models/graphs/utils.py b/src/graphnet/models/graphs/utils.py index 85ea94d9d..9587010df 100644 --- a/src/graphnet/models/graphs/utils.py +++ b/src/graphnet/models/graphs/utils.py @@ -247,6 +247,7 @@ def __init__(self, x: np.ndarray, cluster_columns: List[int]) -> None: for i in range(len(self._counts)): self._padded_x[i, : self._counts[i]] = x[: self._counts[i]] x = x[self._counts[i] :] + return self.clustered_x def _add_column( self, column: np.ndarray, location: Optional[int] = None @@ -266,6 +267,31 @@ def _add_column( self.clustered_x, location, column, axis=1 ) + def _calculate_charge_sum(self, charge_index: int) -> np.ndarray: + """Calculate the sum of the charge.""" + assert not hasattr( + self, "_charge_sum" + ), "Charge sum has already been calculated, \ + re-calculation is not allowed" + self._charge_sum = self._padded_x[:, :, charge_index].sum(axis=1) + return self._charge_sum + + def _calculate_charge_weights(self, charge_index: int) -> np.ndarray: + """Calculate the weights of the charge.""" + assert not hasattr( + self, "_charge_weights" + ), "Charge weights have already been calculated, \ + re-calculation is not allowed" + assert hasattr( + self, "_charge_sum" + ), "Charge sum has not been calculated, \ + please run calculate_charge_sum" + self._charge_weights = ( + self._padded_x[:, :, charge_index] + / self._charge_sum[:, np.newaxis] + ) + return self._charge_weights + def add_charge_threshold_summary( self, summarization_indices: List[int], @@ -296,21 +322,8 @@ def add_charge_threshold_summary( """ # convert the charge to the cumulative sum of the charge divided # by the total charge - self._charge_weights = self._padded_x[:, :, charge_index] - - self._padded_x[:, :, charge_index] = 
self._padded_x[ - :, :, charge_index - ].cumsum(axis=1) - - # add the charge sum to the class if it does not already exist - if not hasattr(self, "_charge_sum"): - self._charge_sum = np.nanmax( - self._padded_x[:, :, charge_index], axis=1 - ) - - self._charge_weights = ( - self._charge_weights / self._charge_sum[:, np.newaxis] - ) + self._calculate_charge_sum(charge_index) + self._calculate_charge_weights(charge_index) self._padded_x[:, :, charge_index] = ( self._padded_x[:, :, charge_index] @@ -374,31 +387,6 @@ def add_percentile_summary( self._add_column(percentiles_x, location) return self.clustered_x - def calculate_charge_sum(self, charge_index: int) -> np.ndarray: - """Calculate the sum of the charge.""" - assert not hasattr( - self, "_charge_sum" - ), "Charge sum has already been calculated, \ - re-calculation is not allowed" - self._charge_sum = self._padded_x[:, :, charge_index].sum(axis=1) - return self._charge_sum - - def calculate_charge_weights(self, charge_index: int) -> np.ndarray: - """Calculate the weights of the charge.""" - assert not hasattr( - self, "_charge_weights" - ), "Charge weights have already been calculated, \ - re-calculation is not allowed" - assert hasattr( - self, "_charge_sum" - ), "Charge sum has not been calculated, \ - please run calculate_charge_sum" - self._charge_weights = ( - self._padded_x[:, :, charge_index] - / self._charge_sum[:, np.newaxis] - ) - return self._charge_weights - def add_counts(self, location: Optional[int] = None) -> np.ndarray: """Add the counts of the sensor to the summarization features.""" self._add_column(np.log10(self._counts), location) From 87f41c6bc3254b9b1894e44b3ffe8792511fc2e2 Mon Sep 17 00:00:00 2001 From: "askerosted@gmail.com" Date: Fri, 6 Dec 2024 15:25:15 +0900 Subject: [PATCH 13/18] remove warning --- src/graphnet/models/graphs/utils.py | 66 ----------------------------- 1 file changed, 66 deletions(-) diff --git a/src/graphnet/models/graphs/utils.py b/src/graphnet/models/graphs/utils.py index 9587010df..0ac8aeac2 100644 --- a/src/graphnet/models/graphs/utils.py +++ b/src/graphnet/models/graphs/utils.py @@ -113,72 +113,6 @@ def identify_indices( return cluster_indices, summarization_indices, features_for_summarization -# TODO Remove this function as it is superseded by -# cluster_and_pad wich has the same functionality -def cluster_summarize_with_percentiles( - x: np.ndarray, - summarization_indices: List[int], - cluster_indices: List[int], - percentiles: List[int], - add_counts: bool, -) -> np.ndarray: - """Turn `x` into clusters with percentile summary. - - From variables specified by column indices `cluster_indices`, `x` is turned - into clusters. Information in columns of `x` specified by indices - `summarization_indices` with each cluster is summarized using percentiles. - It is assumed `x` represents a single event. - - **Example use-case**: - Suppose `x` contains raw pulses from a neutrino event where some DOMs have - multiple measurements of Cherenkov radiation. If `cluster_indices` is set - to the columns corresponding to the xyz-position of the DOMs, and the - features specified in `summarization_indices` correspond to time, charge, - then each row in the returned array will correspond to a DOM, - and the time and charge for each DOM will be summarized by percentiles. 
- Returned output array has dimensions - `[n_clusters, - len(percentiles)*len(summarization_indices) + len(cluster_indices)]` - - Args: - x: Array to be clustered - summarization_indices: List of column indices that defines features - that will be summarized with percentiles. - cluster_indices: List of column indices on which the clusters - are constructed. - percentiles: percentiles used to summarize `x`. E.g. [10,50,90]. - - Returns: - Percentile-summarized array - """ - print( - "This function is deprecated and will be removed,", - "use the class cluster_and_pad with add_percentile_summary", - "instead for the same functionality", - ) - pct_dict = {} - for feature_idx in summarization_indices: - summarized_array, column_offset, counts = gather_cluster_sequence( - x, feature_idx, cluster_indices - ) - pct_dict[feature_idx] = np.nanpercentile( - summarized_array[:, column_offset:], percentiles, axis=1 - ).T - - for i, key in enumerate(pct_dict.keys()): - if i == 0: - array = summarized_array[:, 0:column_offset] - - array = np.concatenate([array, pct_dict[key]], axis=1) - - if add_counts: - array = np.concatenate( - [array, np.log10(counts).reshape(-1, 1)], axis=1 - ) - - return array - - class cluster_and_pad: """Cluster and pad the data for further summarization. From 51da4b0610bd64c8aa2a66ee86eef176168c1b56 Mon Sep 17 00:00:00 2001 From: "askerosted@gmail.com" Date: Fri, 6 Dec 2024 15:45:03 +0900 Subject: [PATCH 14/18] re-add old function --- src/graphnet/models/graphs/utils.py | 61 +++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) diff --git a/src/graphnet/models/graphs/utils.py b/src/graphnet/models/graphs/utils.py index 0ac8aeac2..0fcd6c136 100644 --- a/src/graphnet/models/graphs/utils.py +++ b/src/graphnet/models/graphs/utils.py @@ -113,6 +113,67 @@ def identify_indices( return cluster_indices, summarization_indices, features_for_summarization +# TODO Remove this function as it is superseded by +# cluster_and_pad wich has the same functionality +def cluster_summarize_with_percentiles( + x: np.ndarray, + summarization_indices: List[int], + cluster_indices: List[int], + percentiles: List[int], + add_counts: bool, +) -> np.ndarray: + """Turn `x` into clusters with percentile summary. + + From variables specified by column indices `cluster_indices`, `x` is turned + into clusters. Information in columns of `x` specified by indices + `summarization_indices` with each cluster is summarized using percentiles. + It is assumed `x` represents a single event. + + **Example use-case**: + Suppose `x` contains raw pulses from a neutrino event where some DOMs have + multiple measurements of Cherenkov radiation. If `cluster_indices` is set + to the columns corresponding to the xyz-position of the DOMs, and the + features specified in `summarization_indices` correspond to time, charge, + then each row in the returned array will correspond to a DOM, + and the time and charge for each DOM will be summarized by percentiles. + Returned output array has dimensions + `[n_clusters, + len(percentiles)*len(summarization_indices) + len(cluster_indices)]` + + Args: + x: Array to be clustered + summarization_indices: List of column indices that defines features + that will be summarized with percentiles. + cluster_indices: List of column indices on which the clusters + are constructed. + percentiles: percentiles used to summarize `x`. E.g. [10,50,90]. 
+ + Returns: + Percentile-summarized array + """ + pct_dict = {} + for feature_idx in summarization_indices: + summarized_array, column_offset, counts = gather_cluster_sequence( + x, feature_idx, cluster_indices + ) + pct_dict[feature_idx] = np.nanpercentile( + summarized_array[:, column_offset:], percentiles, axis=1 + ).T + + for i, key in enumerate(pct_dict.keys()): + if i == 0: + array = summarized_array[:, 0:column_offset] + + array = np.concatenate([array, pct_dict[key]], axis=1) + + if add_counts: + array = np.concatenate( + [array, np.log10(counts).reshape(-1, 1)], axis=1 + ) + + return array + + class cluster_and_pad: """Cluster and pad the data for further summarization. From 3e21f7f6d251bd938505636efada00fa795a268f Mon Sep 17 00:00:00 2001 From: "askerosted@gmail.com" Date: Fri, 6 Dec 2024 15:51:10 +0900 Subject: [PATCH 15/18] remove returns --- src/graphnet/models/graphs/nodes/nodes.py | 5 +++-- src/graphnet/models/graphs/utils.py | 19 ++++++------------- 2 files changed, 9 insertions(+), 15 deletions(-) diff --git a/src/graphnet/models/graphs/nodes/nodes.py b/src/graphnet/models/graphs/nodes/nodes.py index 36afb4e1d..11c03ae84 100644 --- a/src/graphnet/models/graphs/nodes/nodes.py +++ b/src/graphnet/models/graphs/nodes/nodes.py @@ -198,12 +198,13 @@ def _construct_nodes(self, x: torch.Tensor) -> Data: cluster_class = cluster_and_pad( x=x, cluster_columns=self._cluster_indices ) - array = cluster_class.add_percentile_summary( + cluster_class.add_percentile_summary( summarization_indices=self._summarization_indices, percentiles=self._percentiles, ) if self._add_counts: - array = cluster_class.add_counts() + cluster_class.add_counts() + array = cluster_class.clustered_x else: self.error( f"""{self.__class__.__name__} was not instatiated with diff --git a/src/graphnet/models/graphs/utils.py b/src/graphnet/models/graphs/utils.py index 0fcd6c136..4093dc288 100644 --- a/src/graphnet/models/graphs/utils.py +++ b/src/graphnet/models/graphs/utils.py @@ -184,20 +184,22 @@ class cluster_and_pad: will hold the padded data for quick calculation of aggregate statistics. Example: - clustered_x = cluster_and_pad(x = single_event_as_array, + cluster_and_pad(x = single_event_as_array, cluster_columns = [0,1,2]) # Creates a cluster matrix and a padded matrix, # the cluster matrix will contain the unique values of the cluster columns, # no additional aggregate statistics are added yet. - clustered_x_with_percentiles = cluster_class.add_percentile_summary( - summarization_indices = [3,4,5], percentiles = [10,50,90]) + cluster_class.add_percentile_summary(summarization_indices = [3,4,5], + percentiles = [10,50,90]) # Adds the 10th, 50th and 90th percentile of columns 3,4 # and 5 in the input data to the cluster matrix. - clustered_x_with_percentiles_and_std = cluster_class.add_std(column = 4) + cluster_class.add_std(column = 4) # Adds the standard deviation of column 4 in the input data # to the cluster matrix. + x = cluster_class.clustered_x + # Gets the clustered matrix with all the aggregate statistics. 
""" def __init__(self, x: np.ndarray, cluster_columns: List[int]) -> None: @@ -242,7 +244,6 @@ def __init__(self, x: np.ndarray, cluster_columns: List[int]) -> None: for i in range(len(self._counts)): self._padded_x[i, : self._counts[i]] = x[: self._counts[i]] x = x[self._counts[i] :] - return self.clustered_x def _add_column( self, column: np.ndarray, location: Optional[int] = None @@ -269,7 +270,6 @@ def _calculate_charge_sum(self, charge_index: int) -> np.ndarray: ), "Charge sum has already been calculated, \ re-calculation is not allowed" self._charge_sum = self._padded_x[:, :, charge_index].sum(axis=1) - return self._charge_sum def _calculate_charge_weights(self, charge_index: int) -> np.ndarray: """Calculate the weights of the charge.""" @@ -285,7 +285,6 @@ def _calculate_charge_weights(self, charge_index: int) -> np.ndarray: self._padded_x[:, :, charge_index] / self._charge_sum[:, np.newaxis] ) - return self._charge_weights def add_charge_threshold_summary( self, @@ -343,7 +342,6 @@ def add_charge_threshold_summary( len(self.clustered_x), -1 ) self._add_column(selections, location) - return self.clustered_x def add_percentile_summary( self, @@ -380,12 +378,10 @@ def add_percentile_summary( len(self.clustered_x), -1 ) self._add_column(percentiles_x, location) - return self.clustered_x def add_counts(self, location: Optional[int] = None) -> np.ndarray: """Add the counts of the sensor to the summarization features.""" self._add_column(np.log10(self._counts), location) - return self.clustered_x def add_sum_charge(self, location: Optional[int] = None) -> np.ndarray: """Add the sum of the charge to the summarization features.""" @@ -394,7 +390,6 @@ def add_sum_charge(self, location: Optional[int] = None) -> np.ndarray: ), "Charge sum has not been calculated, \ please run calculate_charge_sum" self._add_column(self._charge_sum, location) - return self.clustered_x def add_std( self, @@ -414,7 +409,6 @@ def add_std( self._add_column( np.nanstd(self._padded_x[:, :, column] * weights, axis=1), location ) - return self.clustered_x def add_mean( self, @@ -427,7 +421,6 @@ def add_mean( np.nanmean(self._padded_x[:, :, column] * weights, axis=1), location, ) - return self.clustered_x def ice_transparency( From eaa12e584beb2ace28f9f3ac2c8cc29c2e5f98e0 Mon Sep 17 00:00:00 2001 From: "askerosted@gmail.com" Date: Fri, 6 Dec 2024 16:31:25 +0900 Subject: [PATCH 16/18] docstrings udpdates --- src/graphnet/models/graphs/utils.py | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/src/graphnet/models/graphs/utils.py b/src/graphnet/models/graphs/utils.py index 4093dc288..2d21dcd4f 100644 --- a/src/graphnet/models/graphs/utils.py +++ b/src/graphnet/models/graphs/utils.py @@ -209,7 +209,6 @@ def __init__(self, x: np.ndarray, cluster_columns: List[int]) -> None: x: Array to be clustered cluster_columns: List of column indices on which the clusters are constructed. - Returns: None Adds: clustered_x: Added to the class _counts: Added to the class @@ -252,9 +251,10 @@ def _add_column( Args: column: Column to be added to the tensor - location: Location to insert the column in the clustered tensor - Returns: - clustered_x: The clustered tensor with the column added + location: Location to insert the column in the clustered tensor. 
+ Altered: + clustered_x: The column is added at the end of the tenor or + inserted at the specified location """ if location is None: self.clustered_x = np.column_stack([self.clustered_x, column]) @@ -302,9 +302,6 @@ def add_charge_threshold_summary( charge_index: index of the charge column in the padded tensor location: Location to insert the summarization indices in the clustered tensor defaults to adding at the end - Returns: - clustered_x: The clustered tensor with the summarization indices - added Adds: _charge_sum: Added to the class _charge_weights: Added to the class @@ -359,10 +356,6 @@ def add_percentile_summary( method: Method to summarize the features. E.g. "linear" location: Location to insert the summarization indices in the clustered tensor defaults to adding at the end - Returns: - None - Adds: - None Altered: clustered_x: The summarization indices are added at the end of the tensor or inserted at the specified location From 165fedb5b83dae1878023b83cf9fa4d45863978b Mon Sep 17 00:00:00 2001 From: "askerosted@gmail.com" Date: Fri, 6 Dec 2024 17:37:51 +0900 Subject: [PATCH 17/18] automatic_name_generation --- src/graphnet/models/graphs/utils.py | 94 +++++++++++++++++++++++++---- 1 file changed, 82 insertions(+), 12 deletions(-) diff --git a/src/graphnet/models/graphs/utils.py b/src/graphnet/models/graphs/utils.py index 2d21dcd4f..9c9a76062 100644 --- a/src/graphnet/models/graphs/utils.py +++ b/src/graphnet/models/graphs/utils.py @@ -202,13 +202,20 @@ class cluster_and_pad: # Gets the clustered matrix with all the aggregate statistics. """ - def __init__(self, x: np.ndarray, cluster_columns: List[int]) -> None: + def __init__( + self, + x: np.ndarray, + cluster_columns: List[int], + input_names: Optional[List[str]] = None, + ) -> None: """Initialize the class with the data and cluster columns. Args: x: Array to be clustered cluster_columns: List of column indices on which the clusters are constructed. + input_names: Names of the columns in the input data for automatic + generation of names. Adds: clustered_x: Added to the class _counts: Added to the class @@ -244,6 +251,14 @@ def __init__(self, x: np.ndarray, cluster_columns: List[int]) -> None: self._padded_x[i, : self._counts[i]] = x[: self._counts[i]] x = x[self._counts[i] :] + self._input_names = input_names + if self._input_names is not None: + assert ( + len(self._input_names) == x.shape[1] + ), "The input names must have the same length as the input data" + + self._cluster_names = np.array(input_names)[cluster_columns] + def _add_column( self, column: np.ndarray, location: Optional[int] = None ) -> None: @@ -263,6 +278,25 @@ def _add_column( self.clustered_x, location, column, axis=1 ) + def _add_column_names( + self, names: List[str], location: Optional[int] = None + ) -> None: + """Add names to the columns of the clustered tensor. 
+ + Args: + names: Names to be added to the columns of the tensor + location: Location to insert the names in the clustered tensor + Altered: + _cluster_names: The names are added at the end of the tensor + or inserted at the specified location + """ + if location is None: + self._cluster_names = np.append(self._cluster_names, names) + else: + self._cluster_names = np.insert( + self._cluster_names, location, names + ) + def _calculate_charge_sum(self, charge_index: int) -> np.ndarray: """Calculate the sum of the charge.""" assert not hasattr( @@ -310,6 +344,8 @@ def add_charge_threshold_summary( of the charge divided by the total charge clustered_x: The summarization indices are added at the end of the tensor or inserted at the specified location. + _cluster_names: The names are added at the end of the tensor + or inserted at the specified location """ # convert the charge to the cumulative sum of the charge divided # by the total charge @@ -340,6 +376,15 @@ def add_charge_threshold_summary( ) self._add_column(selections, location) + # update the cluster names + if self._input_names is not None: + new_names = [ + self._input_names[i] + "_charge_threshold_" + str(p) + for i in summarization_indices + for p in percentiles + ] + self._add_column_names(new_names, location) + def add_percentile_summary( self, summarization_indices: List[int], @@ -359,6 +404,8 @@ def add_percentile_summary( Altered: clustered_x: The summarization indices are added at the end of the tensor or inserted at the specified location + _cluster_names: The names are added at the end of the tensor + or inserted at the specified location """ percentiles_x = np.nanpercentile( self._padded_x[:, :, summarization_indices], @@ -372,48 +419,71 @@ def add_percentile_summary( ) self._add_column(percentiles_x, location) + # update the cluster names + if self._input_names is not None: + new_names = [ + self._input_names[i] + "_percentile_" + str(p) + for i in summarization_indices + for p in percentiles + ] + self._add_column_names(new_names, location) + def add_counts(self, location: Optional[int] = None) -> np.ndarray: """Add the counts of the sensor to the summarization features.""" self._add_column(np.log10(self._counts), location) + new_name = ["counts"] + self._add_column_names(new_name, location) - def add_sum_charge(self, location: Optional[int] = None) -> np.ndarray: + def add_sum_charge( + self, charge_index: int, location: Optional[int] = None + ) -> np.ndarray: """Add the sum of the charge to the summarization features.""" - assert hasattr( - self, "_charge_sum" - ), "Charge sum has not been calculated, \ - please run calculate_charge_sum" + if not hasattr(self, "_charge_sum"): + self._calculate_charge_sum(charge_index) self._add_column(self._charge_sum, location) + # update the cluster names + if self._input_names is not None: + new_name = [self._input_names[charge_index] + "_sum"] + self._add_column_names(new_name, location) def add_std( self, - column: int, + columns: List[int], location: Optional[int] = None, weights: Union[np.ndarray, int] = 1, ) -> np.ndarray: """Add the standard deviation of the column. Args: - column: Index of the column in the padded tensor to - calculate the standard deviation + columns: Index of the columns from which to calculate the standard + deviation. 
location: Location to insert the standard deviation in the clustered tensor defaults to adding at the end weights: Optional weights to be applied to the standard deviation """ self._add_column( - np.nanstd(self._padded_x[:, :, column] * weights, axis=1), location + np.nanstd(self._padded_x[:, :, columns] * weights, axis=1), + location, ) + if self._input_names is not None: + new_names = [self._input_names[i] + "_std" for i in columns] + self._add_column_names(new_names, location) def add_mean( self, - column: int, + columns: List[int], location: Optional[int] = None, weights: Union[np.ndarray, int] = 1, ) -> np.ndarray: """Add the mean of the column.""" self._add_column( - np.nanmean(self._padded_x[:, :, column] * weights, axis=1), + np.nanmean(self._padded_x[:, :, columns] * weights, axis=1), location, ) + # update the cluster names + if self._input_names is not None: + new_names = [self._input_names[i] + "_mean" for i in columns] + self._add_column_names(new_names, location) def ice_transparency( From 75b3260f830feff65c40dbff425e126e35724257 Mon Sep 17 00:00:00 2001 From: "askerosted@gmail.com" Date: Fri, 6 Dec 2024 21:51:16 +0900 Subject: [PATCH 18/18] small_fix --- src/graphnet/models/graphs/utils.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/graphnet/models/graphs/utils.py b/src/graphnet/models/graphs/utils.py index 9c9a76062..7bb746508 100644 --- a/src/graphnet/models/graphs/utils.py +++ b/src/graphnet/models/graphs/utils.py @@ -431,8 +431,9 @@ def add_percentile_summary( def add_counts(self, location: Optional[int] = None) -> np.ndarray: """Add the counts of the sensor to the summarization features.""" self._add_column(np.log10(self._counts), location) - new_name = ["counts"] - self._add_column_names(new_name, location) + if self._input_names is not None: + new_name = ["counts"] + self._add_column_names(new_name, location) def add_sum_charge( self, charge_index: int, location: Optional[int] = None
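
For reference, a minimal usage sketch of the cluster_and_pad API in its end state after the final patch. The toy event array, the [dom_x, dom_y, dom_z, dom_time, charge] column layout and the feature names below are illustrative assumptions rather than part of the patches; the call pattern mirrors the updated PercentileClusters node in nodes.py.

    import numpy as np
    from graphnet.models.graphs.utils import cluster_and_pad

    # Toy single event with columns [dom_x, dom_y, dom_z, dom_time, charge].
    rng = np.random.default_rng(0)
    pulses = rng.normal(size=(12, 5))
    pulses[:, :3] = rng.integers(0, 3, size=(12, 3))  # a few repeated sensor positions

    clusterer = cluster_and_pad(
        x=pulses,
        cluster_columns=[0, 1, 2],
        input_names=["dom_x", "dom_y", "dom_z", "dom_time", "charge"],
    )

    # Summarize time and charge (columns 3 and 4) per sensor with percentiles,
    # then append log10 of the pulse count per sensor.
    clusterer.add_percentile_summary(
        summarization_indices=[3, 4], percentiles=[10, 50, 90]
    )
    clusterer.add_counts()

    array = clusterer.clustered_x      # [n_unique_sensors, n_summary_features]
    names = clusterer._cluster_names   # auto-generated column names (internal attribute)

Because the add_* methods mutate clustered_x in place rather than returning it (returns were dropped in patch 15), results are read back from the clustered_x attribute, which is also how the updated _construct_nodes in nodes.py consumes the class.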