From 146f66ed6c6507c39dde170c5285199dd16da01c Mon Sep 17 00:00:00 2001 From: samadpls Date: Sun, 3 Dec 2023 01:32:52 +0500 Subject: [PATCH 001/124] Added flexible DataLoader configurations to `GraphNeTDataModule` Signed-off-by: samadpls --- setup.py | 1 + src/graphnet/data/datamodule.py | 338 ++++++++++++++++++++++++++++++++ src/graphnet/training/utils.py | 13 ++ 3 files changed, 352 insertions(+) create mode 100644 src/graphnet/data/datamodule.py diff --git a/setup.py b/setup.py index 3b70233ab..1ec34a0f0 100644 --- a/setup.py +++ b/setup.py @@ -26,6 +26,7 @@ "timer>=0.2", "tqdm>=4.64", "wandb>=0.12", + "pytorch-lightning>=2.1.2", ] EXTRAS_REQUIRE = { diff --git a/src/graphnet/data/datamodule.py b/src/graphnet/data/datamodule.py new file mode 100644 index 000000000..1f8398c8b --- /dev/null +++ b/src/graphnet/data/datamodule.py @@ -0,0 +1,338 @@ +from typing import Dict, Any, Optional, List, Tuple, Union +import lightning as L +from torch.utils.data import DataLoader +from copy import deepcopy +from sklearn.model_selection import train_test_split +import pandas as pd + +from graphnet.data.dataset import ( + Dataset, + EnsembleDataset, + SQLiteDataset, + ParquetDataset, +) +from graphnet.utilities.logging import Logger +from graphnet.training.utils import save_selection + + +class GraphNeTDataModule(L.LightningDataModule, Logger): + """General Class for DataLoader Construction.""" + + def __init__( + self, + dataset_reference: Union[SQLiteDataset, ParquetDataset, Dataset], + selection: Optional[Union[List[int], List[List[int]]]], + test_selection: Optional[Union[List[int], List[List[int]]]], + dataset_args: Dict[str, Any], + train_dataloader_kwargs: Optional[Dict[str, Any]] = None, + validation_dataloader_kwargs: Optional[Dict[str, Any]] = None, + test_dataloader_kwargs: Optional[Dict[str, Any]] = None, + train_val_split: Optional[List[float, float]] = [0.9, 0.10], + split_seed: int = 42, + ) -> None: + """Create dataloaders from dataset. + + Args: + dataset_reference: A non-instantiated reference to the dataset class. + selection: (Optional) a list of event id's used for training and validation. + test_selection: (Optional) a list of event id's used for testing. + dataset_args: Arguments to instantiate graphnet.data.dataset.Dataset with. + train_dataloader_kwargs: Arguments for the training DataLoader. + validation_dataloader_kwargs: Arguments for the validation DataLoader. + test_dataloader_kwargs: Arguments for the test DataLoader. + split_seed: seed used for shuffling and splitting selections into train/validation. + """ + self._dataset = dataset_reference + self._selection = selection + self._train_val_split = train_val_split + self._test_selection = test_selection + self._dataset_args = dataset_args + self._rng = split_seed + + self._train_dataloader_kwargs = train_dataloader_kwargs or {} + self._validation_dataloader_kwargs = validation_dataloader_kwargs or {} + self._test_dataloader_kwargs = test_dataloader_kwargs or {} + + # If multiple dataset paths are given, we should use EnsembleDataset + self._use_ensemble_dataset = isinstance( + self._dataset_args["path"], list + ) + + def prepare_data(self) -> None: + """Prepare the dataset for training.""" + # Download method for curated datasets. Method for download is + # likely dataset-specific, so we can leave it as-is + pass + + def setup(self, stage: str) -> None: + """Prepare Datasets for DataLoaders. + + Args: + stage: lightning stage. 
Either "fit, validate, test, predict" + """ + # Sanity Checks + self._validate_dataset_class() + self._validate_dataset_args() + self._validate_dataloader_args() + + # Case-handling of selection arguments + self._resolve_selections() + + # Creation of Datasets + self._train_dataset = self._create_dataset(self._train_selection) + self._val_dataset = self._create_dataset(self._val_selection) + self._test_dataset = self._create_dataset(self._test_selection) + + return + + def train_dataloader(self) -> DataLoader: + """Prepare and return the training DataLoader. + + Returns: + DataLoader: The DataLoader configured for training. + """ + return self._create_dataloader(self._train_dataset) + + def val_dataloader(self) -> DataLoader: + """Prepare and return the validation DataLoader. + + Returns: + DataLoader: The DataLoader configured for validation. + """ + return self._create_dataloader(self._val_dataset) + + def test_dataloader(self) -> DataLoader: + """Prepare and return the test DataLoader. + + Returns: + DataLoader: The DataLoader configured for testing. + """ + return self._create_dataloader(self._test_dataset) + + def teardown(self) -> None: + """Perform any necessary cleanup or shutdown procedures. + + This method can be used for tasks such as closing SQLite connections + after training. Override this method as needed. + + Returns: + None + """ + pass + + def _create_dataloader( + self, dataset: Union[Dataset, EnsembleDataset] + ) -> DataLoader: + """Create a DataLoader for the given dataset. + + Args: + dataset (Union[Dataset, EnsembleDataset]): The dataset to create a DataLoader for. + + Returns: + DataLoader: The DataLoader configured for the given dataset. + """ + return DataLoader(dataset=dataset, **self._dataloader_args) + + def _validate_dataset_class(self) -> None: + """Sanity checks on the dataset reference (self._dataset). + + Is it a GraphNeT-compatible dataset? has the class already been + instantiated? Did they try to pass EnsembleDataset? + """ + if not isinstance( + self._dataset, (SQLiteDataset, ParquetDataset, Dataset) + ): + raise TypeError( + "dataset_reference must be an instance of SQLiteDataset, ParquetDataset, or Dataset." + ) + if isinstance(self._dataset, EnsembleDataset): + raise TypeError( + "EnsembleDataset is not allowed as dataset_reference." + ) + + def _validate_dataset_args(self) -> None: + """Sanity checks on the arguments for the dataset reference.""" + if isinstance(self._dataset_args["path"], list): + if self._selection is not None: + try: + # Check that the number of dataset paths is equal to the + # number of selections given as arg. + assert len(self._dataset_args["path"]) == len( + self._selection + ) + except AssertionError: + raise ValueError( + f"The number of dataset paths ({len(self._dataset_args['path'])}) does not match the number of selections ({len(self._selection)})." + ) + + if self._test_selection is not None: + try: + # Check that the number of dataset paths is equal to the + # number of test selections. + assert len(self._dataset_args["path"]) == len( + self._test_selection + ) + except AssertionError: + raise ValueError( + f"The number of dataset paths ({len(self._dataset_args['path'])}) does not match the number of test selections ({len(self._test_selection)}). If you'd like to test on only a subset of the {len(self._dataset_args['path'])} datasets, please provide empty test selections for the others." 
+ ) + + def _validate_dataloader_args(self) -> None: + """Sanity check on `dataloader_args`.""" + if "dataset" in self._dataloader_args: + raise ValueError("`dataloader_args` must not contain `dataset`") + + def _resolve_selections(self) -> None: + if self._test_selection is None: + self.warning_once( + f"{self.__class__.__name__} did not receive an argument for `test_selection` and will therefore not have a prediction dataloader available." + ) + if self._selection is not None: + # Split the selection into train/validation + if self._use_ensemble_dataset: + # Split every selection + self._train_selection = [] + self._val_selection = [] + for selection in self._selection: + train_selection, val_selection = self._split_selection( + selection + ) + self._train_selection.append(train_selection) + self._val_selection.append(val_selection) + + else: + # Split the only selection we got + ( + self._train_selection, + self._val_selection, + ) = self._split_selection(self._selection) + + if self._selection is None: + # If not provided, we infer it by grabbing all event ids in the dataset. + self.info( + f"{self.__class__.__name__} did not receive an argument for `selection`. Selection will automatically be created with a split of train: {self._train_val_split[0]} and validation: {self._train_val_split[1]}" + ) + ( + self._train_selection, + self._val_selection, + ) = self._infer_selections() + + def _split_selection( + self, selection: List[int] + ) -> Tuple[List[int], List[int]]: + """Split train selection into train/validation. + + Args: + selection: Training selection to be split + + Returns: + Training selection, Validation selection. + """ + train_selection, val_selection = train_test_split( + selection, + train_size=self._train_val_split[0], + test_size=self._train_val_split[1], + random_state=self._rng, + ) + return train_selection, val_selection + + def _infer_selections(self) -> Tuple[List[int], List[int]]: + """Automatically infer training and validation selections. + + Returns: + Training selection, Validation selection + """ + if self._use_ensemble_dataset: + # We must iterate through the dataset paths and infer a train/val + # selection for each. + self._train_selection = [] + self._val_selection = [] + for dataset_path in self._dataset_args["path"]: + ( + train_selection, + val_selection, + ) = self._infer_selections_on_single_dataset(dataset_path) + self._train_selection.append(train_selection) + self._val_selection.append(val_selection) + else: + # Infer selection on a single dataset + ( + self._train_selection, + self._val_selection, + ) = self._infer_selections_on_single_dataset( + self._dataset_args["path"] + ) + + def _infer_selections_on_single_dataset( + self, dataset_path: str + ) -> Tuple[List[int], List[int]]: + """Automatically infers training and validation selections for a single dataset. + + Args: + dataset_path (str): The path to the dataset. + + Returns: + Tuple[List[int], List[int]]: Training and validation selections. 
+ """ + tmp_args = deepcopy(self._dataset_args) + tmp_args["path"] = dataset_path + tmp_dataset = self._construct_dataset(tmp_args) + + all_events = tmp_dataset._get_all_indices() # unshuffled list + + # Multiple lines to avoid one large + all_events = pd.DataFrame(all_events).sample( + frac=1, replace=False, random_state=self._rng + ) + + all_events = all_events.values.tolist() # shuffled list + return self._split_selection(all_events) + + def _create_dataset( + self, selection: Union[List[int], List[List[int]]] + ) -> Union[EnsembleDataset, Dataset]: + """Instantiate `dataset_reference`. + + Args: + selection: The selected event id's. + + Returns: + A dataset, either an instance of `EnsembleDataset` or `Dataset`. + """ + if self._use_ensemble_dataset: + # Construct multiple datasets and pass to EnsembleDataset + # At this point, we have checked that len(selection) == len(dataset_args['path']) + datasets = [] + for dataset_idx in range(len(selection)): + datasets.append( + self._create_single_dataset( + selection=selection[dataset_idx], + path=self._dataset_args["path"][dataset_idx], + ) + ) + + dataset = EnsembleDataset(datasets) + + else: + # Construct single dataset + dataset = self._create_single_dataset( + selection=selection, path=self._dataset_args["path"] + ) + return dataset + + def _create_single_dataset( + self, selection: List[int], path: str + ) -> Dataset: + """Instantiate a single `Dataset`. + + Args: + selection: A selection for a single dataset. + path: Path to a single dataset + + Returns: + An instance of `Dataset`. + """ + tmp_args = deepcopy(self._dataset_args) + tmp_args["path"] = path + tmp_args["selection"] = selection + return self._dataset(**tmp_args) diff --git a/src/graphnet/training/utils.py b/src/graphnet/training/utils.py index df7c92e15..b33089ec9 100644 --- a/src/graphnet/training/utils.py +++ b/src/graphnet/training/utils.py @@ -317,3 +317,16 @@ def save_results( model.save_state_dict(path + "/" + tag + "_state_dict.pth") model.save(path + "/" + tag + "_model.pth") Logger().info("Results saved at: \n %s" % path) + + +def save_selection(selection: List[int], file_path: str) -> None: + """Save the list of event numbers to a CSV file. + + Args: + selection: List of event ids. + file_path: File path to save the selection. + """ + with open(file_path, "w") as file: + file.write("event_id\n") + for event_id in selection: + file.write(f"{event_id}\n") From 2fb0bef4718b13a887f1d8c77ef8af1c0b2ecb57 Mon Sep 17 00:00:00 2001 From: samadpls Date: Sun, 3 Dec 2023 15:11:16 +0500 Subject: [PATCH 002/124] refactored the coding style --- src/graphnet/data/datamodule.py | 41 +++++++++++++++++++++++---------- 1 file changed, 29 insertions(+), 12 deletions(-) diff --git a/src/graphnet/data/datamodule.py b/src/graphnet/data/datamodule.py index 1f8398c8b..605cbb429 100644 --- a/src/graphnet/data/datamodule.py +++ b/src/graphnet/data/datamodule.py @@ -1,3 +1,4 @@ +"""Base `Dataloader` class(es) used in `graphnet`.""" from typing import Dict, Any, Optional, List, Tuple, Union import lightning as L from torch.utils.data import DataLoader @@ -27,7 +28,7 @@ def __init__( train_dataloader_kwargs: Optional[Dict[str, Any]] = None, validation_dataloader_kwargs: Optional[Dict[str, Any]] = None, test_dataloader_kwargs: Optional[Dict[str, Any]] = None, - train_val_split: Optional[List[float, float]] = [0.9, 0.10], + train_val_split: Optional[List[float]] = [0.9, 0.10], split_seed: int = 42, ) -> None: """Create dataloaders from dataset. 
@@ -40,6 +41,7 @@ def __init__( train_dataloader_kwargs: Arguments for the training DataLoader. validation_dataloader_kwargs: Arguments for the validation DataLoader. test_dataloader_kwargs: Arguments for the test DataLoader. + train_val_split (Optional): Split ratio for training and validation sets. Default is [0.9, 0.10]. split_seed: seed used for shuffling and splitting selections into train/validation. """ self._dataset = dataset_reference @@ -81,7 +83,8 @@ def setup(self, stage: str) -> None: # Creation of Datasets self._train_dataset = self._create_dataset(self._train_selection) self._val_dataset = self._create_dataset(self._val_selection) - self._test_dataset = self._create_dataset(self._test_selection) + if self._test_selection is not None: + self._test_dataset = self._create_dataset(self._test_selection) return @@ -191,8 +194,8 @@ def _resolve_selections(self) -> None: # Split the selection into train/validation if self._use_ensemble_dataset: # Split every selection - self._train_selection = [] - self._val_selection = [] + self._train_selection: List[List[int]] = [] + self._val_selection: List[List[int]] = [] for selection in self._selection: train_selection, val_selection = self._split_selection( selection @@ -218,7 +221,7 @@ def _resolve_selections(self) -> None: ) = self._infer_selections() def _split_selection( - self, selection: List[int] + self, selection: Union[int, List[int], List[List[int]]] ) -> Tuple[List[int], List[int]]: """Split train selection into train/validation. @@ -228,12 +231,26 @@ def _split_selection( Returns: Training selection, Validation selection. """ - train_selection, val_selection = train_test_split( - selection, - train_size=self._train_val_split[0], - test_size=self._train_val_split[1], - random_state=self._rng, - ) + if isinstance(selection, int): + train_selection, val_selection = [selection], [] + elif isinstance(selection[0], list): + flat_selection = [ + item for sublist in selection for item in sublist + ] + train_selection, val_selection = train_test_split( + flat_selection, + train_size=self._train_val_split[0], + test_size=self._train_val_split[1], + random_state=self._rng, + ) + else: + train_selection, val_selection = train_test_split( + selection, + train_size=self._train_val_split[0], + test_size=self._train_val_split[1], + random_state=self._rng, + ) + return train_selection, val_selection def _infer_selections(self) -> Tuple[List[int], List[int]]: @@ -266,7 +283,7 @@ def _infer_selections(self) -> Tuple[List[int], List[int]]: def _infer_selections_on_single_dataset( self, dataset_path: str ) -> Tuple[List[int], List[int]]: - """Automatically infers training and validation selections for a single dataset. + """Automatically infers dataset train/val selections. Args: dataset_path (str): The path to the dataset. 
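# A quick illustration of the train/validation splitting introduced in the patches
# above: `_split_selection` defers to scikit-learn's `train_test_split`, so the
# default `train_val_split=[0.9, 0.10]` with `split_seed=42` behaves roughly as in
# this sketch. The event ids below are made-up placeholders, not values from the
# patch series itself.
from sklearn.model_selection import train_test_split

selection = list(range(100))  # hypothetical event ids
train_selection, val_selection = train_test_split(
    selection,
    train_size=0.9,   # train_val_split[0]
    test_size=0.1,    # train_val_split[1]
    random_state=42,  # split_seed
)
assert len(train_selection) == 90 and len(val_selection) == 10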
From f4fbd05c7ec1d1f9e7abe2ffb126bf0c186da892 Mon Sep 17 00:00:00 2001 From: samadpls Date: Mon, 4 Dec 2023 12:25:22 +0500 Subject: [PATCH 003/124] Refactored the coding style --- setup.py | 1 - src/graphnet/data/datamodule.py | 89 +++++++++++++++++++++------------ 2 files changed, 57 insertions(+), 33 deletions(-) diff --git a/setup.py b/setup.py index 1ec34a0f0..3b70233ab 100644 --- a/setup.py +++ b/setup.py @@ -26,7 +26,6 @@ "timer>=0.2", "tqdm>=4.64", "wandb>=0.12", - "pytorch-lightning>=2.1.2", ] EXTRAS_REQUIRE = { diff --git a/src/graphnet/data/datamodule.py b/src/graphnet/data/datamodule.py index 605cbb429..aec94a481 100644 --- a/src/graphnet/data/datamodule.py +++ b/src/graphnet/data/datamodule.py @@ -1,6 +1,6 @@ """Base `Dataloader` class(es) used in `graphnet`.""" from typing import Dict, Any, Optional, List, Tuple, Union -import lightning as L +import pytorch_lightning as pl from torch.utils.data import DataLoader from copy import deepcopy from sklearn.model_selection import train_test_split @@ -16,7 +16,7 @@ from graphnet.training.utils import save_selection -class GraphNeTDataModule(L.LightningDataModule, Logger): +class GraphNeTDataModule(pl.LightningDataModule, Logger): """General Class for DataLoader Construction.""" def __init__( @@ -45,9 +45,9 @@ def __init__( split_seed: seed used for shuffling and splitting selections into train/validation. """ self._dataset = dataset_reference - self._selection = selection - self._train_val_split = train_val_split - self._test_selection = test_selection + self._selection = selection or [0] + self._train_val_split = train_val_split or [0.0] + self._test_selection = test_selection or [0.0] self._dataset_args = dataset_args self._rng = split_seed @@ -83,8 +83,7 @@ def setup(self, stage: str) -> None: # Creation of Datasets self._train_dataset = self._create_dataset(self._train_selection) self._val_dataset = self._create_dataset(self._val_selection) - if self._test_selection is not None: - self._test_dataset = self._create_dataset(self._test_selection) + self._test_dataset = self._create_dataset(self._test_selection) return @@ -112,16 +111,16 @@ def test_dataloader(self) -> DataLoader: """ return self._create_dataloader(self._test_dataset) - def teardown(self) -> None: - """Perform any necessary cleanup or shutdown procedures. + # def teardown(self) -> None: + # """Perform any necessary cleanup or shutdown procedures. - This method can be used for tasks such as closing SQLite connections - after training. Override this method as needed. + # This method can be used for tasks such as closing SQLite connections + # after training. Override this method as needed. - Returns: - None - """ - pass + # Returns: + # None + # """ + # return None def _create_dataloader( self, dataset: Union[Dataset, EnsembleDataset] @@ -134,7 +133,18 @@ def _create_dataloader( Returns: DataLoader: The DataLoader configured for the given dataset. """ - return DataLoader(dataset=dataset, **self._dataloader_args) + if dataset == self._train_dataset: + dataloader_args = self._train_dataloader_kwargs + elif dataset == self._val_dataset: + dataloader_args = self._validation_dataloader_kwargs + elif dataset == self._test_dataset: + dataloader_args = self._test_dataloader_kwargs + else: + raise ValueError( + "Unknown dataset encountered during dataloader creation." + ) + + return DataLoader(dataset=dataset, **dataloader_args) def _validate_dataset_class(self) -> None: """Sanity checks on the dataset reference (self._dataset). 
@@ -182,8 +192,18 @@ def _validate_dataset_args(self) -> None: def _validate_dataloader_args(self) -> None: """Sanity check on `dataloader_args`.""" - if "dataset" in self._dataloader_args: - raise ValueError("`dataloader_args` must not contain `dataset`") + if "dataset" in self._train_dataloader_kwargs: + raise ValueError( + "`train_dataloader_kwargs` must not contain `dataset`" + ) + if "dataset" in self._validation_dataloader_kwargs: + raise ValueError( + "`validation_dataloader_kwargs` must not contain `dataset`" + ) + if "dataset" in self._test_dataloader_kwargs: + raise ValueError( + "`test_dataloader_kwargs` must not contain `dataset`" + ) def _resolve_selections(self) -> None: if self._test_selection is None: @@ -232,25 +252,20 @@ def _split_selection( Training selection, Validation selection. """ if isinstance(selection, int): - train_selection, val_selection = [selection], [] + flat_selection = [selection] elif isinstance(selection[0], list): flat_selection = [ item for sublist in selection for item in sublist ] - train_selection, val_selection = train_test_split( - flat_selection, - train_size=self._train_val_split[0], - test_size=self._train_val_split[1], - random_state=self._rng, - ) else: - train_selection, val_selection = train_test_split( - selection, - train_size=self._train_val_split[0], - test_size=self._train_val_split[1], - random_state=self._rng, - ) + flat_selection = selection + train_selection, val_selection = train_test_split( + flat_selection, + train_size=self._train_val_split[0], + test_size=self._train_val_split[1], + random_state=self._rng, + ) return train_selection, val_selection def _infer_selections(self) -> Tuple[List[int], List[int]]: @@ -280,6 +295,8 @@ def _infer_selections(self) -> Tuple[List[int], List[int]]: self._dataset_args["path"] ) + return (self._train_selection, self._val_selection) + def _infer_selections_on_single_dataset( self, dataset_path: str ) -> Tuple[List[int], List[int]]: @@ -305,8 +322,16 @@ def _infer_selections_on_single_dataset( all_events = all_events.values.tolist() # shuffled list return self._split_selection(all_events) + def _construct_dataset(self, tmp_args: Dict[str, Any]) -> Dict[str, Any]: + """Construct dataset.""" + return tmp_args + + def _get_all_indices(self): + """Shuffle the list.""" + return list + def _create_dataset( - self, selection: Union[List[int], List[List[int]]] + self, selection: Union[List[int], List[List[int]], List[float]] ) -> Union[EnsembleDataset, Dataset]: """Instantiate `dataset_reference`. From 302c808877e5bab01aa2890460bf794bef4a0c34 Mon Sep 17 00:00:00 2001 From: samadpls Date: Mon, 18 Dec 2023 23:04:06 +0500 Subject: [PATCH 004/124] updated `datamodule.py` file --- src/graphnet/data/datamodule.py | 64 ++++++++++++++++++++------------- 1 file changed, 40 insertions(+), 24 deletions(-) diff --git a/src/graphnet/data/datamodule.py b/src/graphnet/data/datamodule.py index aec94a481..c16bee977 100644 --- a/src/graphnet/data/datamodule.py +++ b/src/graphnet/data/datamodule.py @@ -111,16 +111,16 @@ def test_dataloader(self) -> DataLoader: """ return self._create_dataloader(self._test_dataset) - # def teardown(self) -> None: - # """Perform any necessary cleanup or shutdown procedures. + def teardown(self) -> None: # type: ignore[override] + """Perform any necessary cleanup or shutdown procedures. - # This method can be used for tasks such as closing SQLite connections - # after training. Override this method as needed. 
+ This method can be used for tasks such as closing SQLite connections + after training. Override this method as needed. - # Returns: - # None - # """ - # return None + Returns: + None + """ + return None def _create_dataloader( self, dataset: Union[Dataset, EnsembleDataset] @@ -214,8 +214,8 @@ def _resolve_selections(self) -> None: # Split the selection into train/validation if self._use_ensemble_dataset: # Split every selection - self._train_selection: List[List[int]] = [] - self._val_selection: List[List[int]] = [] + self._train_selection = [] + self._val_selection = [] for selection in self._selection: train_selection, val_selection = self._split_selection( selection @@ -225,10 +225,13 @@ def _resolve_selections(self) -> None: else: # Split the only selection we got + assert isinstance(self._selection, list) ( self._train_selection, self._val_selection, - ) = self._split_selection(self._selection) + ) = self._split_selection( # type: ignore + self._selection + ) if self._selection is None: # If not provided, we infer it by grabbing all event ids in the dataset. @@ -251,14 +254,16 @@ def _split_selection( Returns: Training selection, Validation selection. """ + assert isinstance(selection, (int, list)) if isinstance(selection, int): flat_selection = [selection] elif isinstance(selection[0], list): flat_selection = [ - item for sublist in selection for item in sublist + item for sublist in selection for item in sublist # type: ignore ] else: - flat_selection = selection + flat_selection = selection # type: ignore + assert isinstance(flat_selection, list) train_selection, val_selection = train_test_split( flat_selection, @@ -284,8 +289,8 @@ def _infer_selections(self) -> Tuple[List[int], List[int]]: train_selection, val_selection, ) = self._infer_selections_on_single_dataset(dataset_path) - self._train_selection.append(train_selection) - self._val_selection.append(val_selection) + self._train_selection.extend(train_selection) # type: ignore + self._val_selection.extend(val_selection) # type: ignore else: # Infer selection on a single dataset ( @@ -295,7 +300,7 @@ def _infer_selections(self) -> Tuple[List[int], List[int]]: self._dataset_args["path"] ) - return (self._train_selection, self._val_selection) + return (self._train_selection, self._val_selection) # type: ignore def _infer_selections_on_single_dataset( self, dataset_path: str @@ -322,14 +327,23 @@ def _infer_selections_on_single_dataset( all_events = all_events.values.tolist() # shuffled list return self._split_selection(all_events) - def _construct_dataset(self, tmp_args: Dict[str, Any]) -> Dict[str, Any]: - """Construct dataset.""" - return tmp_args - def _get_all_indices(self): - """Shuffle the list.""" + """Get all indices. + + Return: + List of indices in an unshuffled order. + """ return list + def _construct_dataset(self, tmp_args: Dict[str, Any]) -> Dict[str, Any]: + """Construct dataset. + + Return: + Dataset object constructed from input arguments. 
+ """ + # instance dataset class , that set of argunment , + return tmp_args + def _create_dataset( self, selection: Union[List[int], List[List[int]], List[float]] ) -> Union[EnsembleDataset, Dataset]: @@ -348,7 +362,7 @@ def _create_dataset( for dataset_idx in range(len(selection)): datasets.append( self._create_single_dataset( - selection=selection[dataset_idx], + selection=selection[dataset_idx], # type: ignore path=self._dataset_args["path"][dataset_idx], ) ) @@ -358,12 +372,14 @@ def _create_dataset( else: # Construct single dataset dataset = self._create_single_dataset( - selection=selection, path=self._dataset_args["path"] + selection=selection, path=self._dataset_args["path"] # type: ignore ) return dataset def _create_single_dataset( - self, selection: List[int], path: str + self, + selection: Union[List[int], List[List[int]], List[float]], + path: str, ) -> Dataset: """Instantiate a single `Dataset`. From 0c4a9c69c08f005966d94da9ce655036a818a9fd Mon Sep 17 00:00:00 2001 From: samadpls Date: Sat, 23 Dec 2023 21:47:23 +0500 Subject: [PATCH 005/124] Refactored `GraphNeTDataModule` class Signed-off-by: samadpls --- src/graphnet/data/datamodule.py | 55 ++++++++++++++++++++++++++------- 1 file changed, 43 insertions(+), 12 deletions(-) diff --git a/src/graphnet/data/datamodule.py b/src/graphnet/data/datamodule.py index c16bee977..1d51a1658 100644 --- a/src/graphnet/data/datamodule.py +++ b/src/graphnet/data/datamodule.py @@ -5,6 +5,7 @@ from copy import deepcopy from sklearn.model_selection import train_test_split import pandas as pd +import random from graphnet.data.dataset import ( Dataset, @@ -120,7 +121,22 @@ def teardown(self) -> None: # type: ignore[override] Returns: None """ - return None + if hasattr(self, "_train_dataset") and isinstance( + self._train_dataset, SQLiteDataset + ): + self._train_dataset._close_connection() + + if hasattr(self, "_val_dataset") and isinstance( + self._val_dataset, SQLiteDataset + ): + self._val_dataset._close_connection() + + if hasattr(self, "_test_dataset") and isinstance( + self._test_dataset, SQLiteDataset + ): + self._test_dataset._close_connection() + + return def _create_dataloader( self, dataset: Union[Dataset, EnsembleDataset] @@ -149,8 +165,9 @@ def _create_dataloader( def _validate_dataset_class(self) -> None: """Sanity checks on the dataset reference (self._dataset). - Is it a GraphNeT-compatible dataset? has the class already been - instantiated? Did they try to pass EnsembleDataset? + Checks whether the dataset is an instance of SQLiteDataset, + ParquetDataset, or Dataset. Raises a TypeError if an invalid dataset + type is detected, or if an EnsembleDataset is used. 
""" if not isinstance( self._dataset, (SQLiteDataset, ParquetDataset, Dataset) @@ -296,7 +313,7 @@ def _infer_selections(self) -> Tuple[List[int], List[int]]: ( self._train_selection, self._val_selection, - ) = self._infer_selections_on_single_dataset( + ) = self._infer_selections_on_single_dataset( # type: ignore self._dataset_args["path"] ) @@ -317,32 +334,46 @@ def _infer_selections_on_single_dataset( tmp_args["path"] = dataset_path tmp_dataset = self._construct_dataset(tmp_args) - all_events = tmp_dataset._get_all_indices() # unshuffled list + all_events = ( + tmp_dataset._get_all_indices() + ) # unshuffled list, # sequential index # Multiple lines to avoid one large all_events = pd.DataFrame(all_events).sample( frac=1, replace=False, random_state=self._rng ) - all_events = all_events.values.tolist() # shuffled list + all_events = random.sample( + all_events, len(all_events) + ) # shuffled list return self._split_selection(all_events) - def _get_all_indices(self): + def _get_all_indices(self) -> List[int]: """Get all indices. Return: List of indices in an unshuffled order. """ - return list + if self._use_ensemble_dataset: + all_indices = [] + for dataset_path in self._dataset_args["path"]: + tmp_args = deepcopy(self._dataset_args) + tmp_args["path"] = dataset_path + tmp_dataset = self._construct_dataset(tmp_args) + all_indices.extend(tmp_dataset._get_all_indices()) + else: + all_indices = self._dataset._get_all_indices() + + return all_indices - def _construct_dataset(self, tmp_args: Dict[str, Any]) -> Dict[str, Any]: + def _construct_dataset(self, tmp_args: Dict[str, Any]) -> Dataset: """Construct dataset. Return: Dataset object constructed from input arguments. """ - # instance dataset class , that set of argunment , - return tmp_args + dataset = self._dataset(**tmp_args) + return dataset def _create_dataset( self, selection: Union[List[int], List[List[int]], List[float]] @@ -393,4 +424,4 @@ def _create_single_dataset( tmp_args = deepcopy(self._dataset_args) tmp_args["path"] = path tmp_args["selection"] = selection - return self._dataset(**tmp_args) + return self._construct_dataset(tmp_args) From e0ea137e955eb5c6159e07066f383442a7e69212 Mon Sep 17 00:00:00 2001 From: samadpls Date: Sun, 24 Dec 2023 15:41:25 +0500 Subject: [PATCH 006/124] added `_construct_dataset` method Signed-off-by: samadpls --- src/graphnet/data/datamodule.py | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/src/graphnet/data/datamodule.py b/src/graphnet/data/datamodule.py index 1d51a1658..c8268c990 100644 --- a/src/graphnet/data/datamodule.py +++ b/src/graphnet/data/datamodule.py @@ -348,24 +348,6 @@ def _infer_selections_on_single_dataset( ) # shuffled list return self._split_selection(all_events) - def _get_all_indices(self) -> List[int]: - """Get all indices. - - Return: - List of indices in an unshuffled order. - """ - if self._use_ensemble_dataset: - all_indices = [] - for dataset_path in self._dataset_args["path"]: - tmp_args = deepcopy(self._dataset_args) - tmp_args["path"] = dataset_path - tmp_dataset = self._construct_dataset(tmp_args) - all_indices.extend(tmp_dataset._get_all_indices()) - else: - all_indices = self._dataset._get_all_indices() - - return all_indices - def _construct_dataset(self, tmp_args: Dict[str, Any]) -> Dataset: """Construct dataset. 
From ea6ebc8f040949cc1fca3f9f69dbcc3640d8be5d Mon Sep 17 00:00:00 2001 From: Rasmus Oersoe Date: Sun, 21 Jan 2024 17:10:42 +0100 Subject: [PATCH 007/124] rename lr variable --- src/graphnet/models/standard_model.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/graphnet/models/standard_model.py b/src/graphnet/models/standard_model.py index 3c2c78bb7..5e90368eb 100644 --- a/src/graphnet/models/standard_model.py +++ b/src/graphnet/models/standard_model.py @@ -273,8 +273,8 @@ def training_step( on_step=True, sync_dist=True, ) - cur_lr = self.trainer.optimizers[0].param_groups[0]["lr"] - self.log("lr", cur_lr, prog_bar=True, on_step=True) + current_lr = self.trainer.optimizers[0].param_groups[0]["lr"] + self.log("lr", current_lr, prog_bar=True, on_step=True) return loss def validation_step( From b7a0a1dfc5274e0ba0ed812360220639e50b76c5 Mon Sep 17 00:00:00 2001 From: "askerosted@gmail.com" Date: Mon, 22 Jan 2024 15:04:35 +0900 Subject: [PATCH 008/124] add NodeAsDomTimeSeries --- src/graphnet/models/graphs/nodes/nodes.py | 95 +++++++++++++++++++++++ 1 file changed, 95 insertions(+) diff --git a/src/graphnet/models/graphs/nodes/nodes.py b/src/graphnet/models/graphs/nodes/nodes.py index fa0400b97..d8761b887 100644 --- a/src/graphnet/models/graphs/nodes/nodes.py +++ b/src/graphnet/models/graphs/nodes/nodes.py @@ -11,9 +11,12 @@ from graphnet.models.graphs.utils import ( cluster_summarize_with_percentiles, identify_indices, + lex_sort, ) from copy import deepcopy +import numpy as np + class NodeDefinition(Model): # pylint: disable=too-few-public-methods """Base class for graph building.""" @@ -211,3 +214,95 @@ def _construct_nodes(self, x: torch.Tensor) -> Data: raise AttributeError return Data(x=torch.tensor(array)) + + +class NodeAsDOMTimeSeries: + """Represent each node as a DOM with time and charge time series data.""" + + def __init__( + self, + keys: List[str] = [ + "dom_x", + "dom_y", + "dom_z", + "dom_time", + "charge", + ], + id_columns: List[str] = ["dom_x", "dom_y", "dom_z"], + time_column: str = "dom_time", + charge_column: str = "charge", + max_activations: Optional[int] = None, + ) -> None: + """Construct `NodeAsDOMTimeSeries`. + + Args: + keys: Names of features in the data (in order). + id_columns: List of columns that uniquely identify a DOM. + time_column: Name of time column. + charge_column: Name of charge column. + max_activations: Maximum number of activations to include in the time series. 
+ """ + self._keys = keys + self._id_columns = [self._keys.index(key) for key in id_columns] + self._time_index = self._keys.index(time_column) + self._charge_index = self._keys.index(charge_column) + self._max_activations = max_activations + super().__init__() + + def _define_output_feature_names( + self, input_feature_names: List[str] + ) -> List[str]: + return input_feature_names + + def _construct_nodes(self, x: torch.Tensor) -> Data: + """Construct nodes from raw node features ´x´.""" + # Cast to Numpy + x = x.numpy() + # Sort by time + x = x[x[:, self._time_index].argsort()] + # Undo log10 scaling so we can sum charges + x[:, self._charge_index] = np.power(10, x[:, self._charge_index]) + # Shift time to start at 0 + x[:, self._time_index] -= np.min(x[:, self._time_index]) + # Group pulses on the same DOM + x = lex_sort(x, self._id_columns) + + unique_sensors, counts = np.unique( + x[:, self._id_columns], axis=0, return_counts=True + ) + # sort DOMs and pulse-counts + sort_this = np.concatenate( + [unique_sensors, counts.reshape(-1, 1)], axis=1 + ) + sort_this = lex_sort(x=sort_this, cluster_columns=self._id_columns) + unique_sensors = sort_this[:, 0 : unique_sensors.shape[1]] + counts = sort_this[:, unique_sensors.shape[1] :].flatten().astype(int) + + time_series = np.split( + x[:, [self._charge_index, self._time_index]], counts.cumsum()[:-1] + ) + + # add total charge to unique dom features and apply log10 scaling + time_charge = np.stack( + [ + (image[0, 1], np.arcsinh(5 * image[:, 0].sum()) / 5) + for image in time_series + ] + ) + x = np.column_stack([unique_sensors, time_charge]) + + if self._max_activations is not None: + counts[counts > self._max_activations] = self._max_activations + time_series = [ + image[: self._max_activations] for image in time_series + ] + time_series = np.concatenate(time_series) + # apply inverse hyperbolic sine to charge values (handles zeros unlike log scaling) + time_series[:, 0] = np.arcsinh(5 * time_series[:, 0]) / 5 + + return Data( + x=torch.tensor(x), + time_series=torch.tensor(time_series), + cutter=torch.tensor(counts), + n_doms=len(x), + ) From 492acef4e7db41645c29335ef6b6c0a572586852 Mon Sep 17 00:00:00 2001 From: "askerosted@gmail.com" Date: Mon, 22 Jan 2024 15:36:40 +0900 Subject: [PATCH 009/124] small change to description --- src/graphnet/models/graphs/nodes/nodes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/graphnet/models/graphs/nodes/nodes.py b/src/graphnet/models/graphs/nodes/nodes.py index d8761b887..4056f93cf 100644 --- a/src/graphnet/models/graphs/nodes/nodes.py +++ b/src/graphnet/models/graphs/nodes/nodes.py @@ -282,7 +282,7 @@ def _construct_nodes(self, x: torch.Tensor) -> Data: x[:, [self._charge_index, self._time_index]], counts.cumsum()[:-1] ) - # add total charge to unique dom features and apply log10 scaling + # add total charge to unique dom features and apply inverse hyperbolic sine scaling time_charge = np.stack( [ (image[0, 1], np.arcsinh(5 * image[:, 0].sum()) / 5) From 51a8817b706d75bee9ff460e90775630eb84e3c7 Mon Sep 17 00:00:00 2001 From: "askerosted@gmail.com" Date: Mon, 22 Jan 2024 15:38:17 +0900 Subject: [PATCH 010/124] add NodeAsDOMTimeSeries to init --- src/graphnet/models/graphs/nodes/__init__.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/graphnet/models/graphs/nodes/__init__.py b/src/graphnet/models/graphs/nodes/__init__.py index 0119d2b98..1fbafd43f 100644 --- a/src/graphnet/models/graphs/nodes/__init__.py +++ 
b/src/graphnet/models/graphs/nodes/__init__.py @@ -5,4 +5,9 @@ and their features. """ -from .nodes import NodeDefinition, NodesAsPulses, PercentileClusters +from .nodes import ( + NodeDefinition, + NodesAsPulses, + PercentileClusters, + NodeAsDOMTimeSeries, +) From 5695a5338ef26f183a6c8b28aaf27432ffc1474e Mon Sep 17 00:00:00 2001 From: "askerosted@gmail.com" Date: Mon, 22 Jan 2024 15:47:20 +0900 Subject: [PATCH 011/124] More adjustability to TITO --- .../models/gnn/dynedge_kaggle_tito.py | 35 ++++++++++++------- 1 file changed, 23 insertions(+), 12 deletions(-) diff --git a/src/graphnet/models/gnn/dynedge_kaggle_tito.py b/src/graphnet/models/gnn/dynedge_kaggle_tito.py index 78b5aebe5..c9fc41417 100644 --- a/src/graphnet/models/gnn/dynedge_kaggle_tito.py +++ b/src/graphnet/models/gnn/dynedge_kaggle_tito.py @@ -39,6 +39,10 @@ def __init__( global_pooling_schemes: List[str] = ["max"], use_global_features: bool = True, use_post_processing_layers: bool = True, + post_processing_layer_sizes: List[int] = None, + readout_layer_sizes: List[int] = None, + n_head: int = 8, + nb_neighbours: int = 8, ): """Construct `DynEdgeTITO`. @@ -53,8 +57,12 @@ def __init__( global_pooling_schemes: The list global pooling schemes to use. Options are: "min", "max", "mean", and "sum". use_global_features: Whether to use global features after pooling. - use_post_processing_layers: Whether to use post-processing layers - after the `DynTrans` layers. + use_post_processing_layers: Whether to use post-processing layers after the `DynTrans` layers. + post_processing_layer_sizes: (Optional) The layer sizes used in the post-processing layers. Defaults to [336, 256]. + readout_layer_sizes: (Optional) The layer sizes used in the readout layers. Defaults to [256, 128]. + n_head: The number of heads to use in the `DynTrans` layer. + nb_neighbours: The number of neighbours to use in the `DynTrans` + layer. 
""" # DynTrans layer sizes if dyntrans_layer_sizes is None: @@ -88,18 +96,20 @@ def __init__( self._dyntrans_layer_sizes = dyntrans_layer_sizes # Post-processing layer sizes - post_processing_layer_sizes = [ - 336, - 256, - ] + if post_processing_layer_sizes is None: + post_processing_layer_sizes = [ + 336, + 256, + ] self._post_processing_layer_sizes = post_processing_layer_sizes # Read-out layer sizes - readout_layer_sizes = [ - 256, - 128, - ] + if readout_layer_sizes is None: + readout_layer_sizes = [ + 256, + 128, + ] self._readout_layer_sizes = readout_layer_sizes @@ -129,10 +139,11 @@ def __init__( self._activation = torch.nn.LeakyReLU() self._nb_inputs = nb_inputs self._nb_global_variables = 5 + nb_inputs - self._nb_neighbours = 8 + self._nb_neighbours = nb_neighbours self._features_subset = features_subset or [0, 1, 2, 3] self._use_global_features = use_global_features self._use_post_processing_layers = use_post_processing_layers + self._n_head = n_head self._construct_layers() def _construct_layers(self) -> None: @@ -147,7 +158,7 @@ def _construct_layers(self) -> None: [nb_latent_features] + list(sizes), aggr="max", features_subset=self._features_subset, - n_head=8, + n_head=self._n_head, ) self._conv_layers.append(conv_layer) nb_latent_features = sizes[-1] From dd504bd4eb27529d7dc382b6817e1877a786f0c0 Mon Sep 17 00:00:00 2001 From: "askerosted@gmail.com" Date: Mon, 22 Jan 2024 16:44:09 +0900 Subject: [PATCH 012/124] add the RNN and RNN_TITO modules --- src/graphnet/models/gnn/RNN_tito.py | 129 ++++++++++++++++++++++++++++ src/graphnet/models/gnn/__init__.py | 1 + src/graphnet/models/rnn/__init__.py | 3 + src/graphnet/models/rnn/node_rnn.py | 85 ++++++++++++++++++ 4 files changed, 218 insertions(+) create mode 100644 src/graphnet/models/gnn/RNN_tito.py create mode 100644 src/graphnet/models/rnn/__init__.py create mode 100644 src/graphnet/models/rnn/node_rnn.py diff --git a/src/graphnet/models/gnn/RNN_tito.py b/src/graphnet/models/gnn/RNN_tito.py new file mode 100644 index 000000000..8facecd45 --- /dev/null +++ b/src/graphnet/models/gnn/RNN_tito.py @@ -0,0 +1,129 @@ +"""RNN_DynEdge model implementation.""" +from typing import List, Optional, Tuple, Union + +import torch +from graphnet.models.gnn.gnn import GNN +from graphnet.models.gnn.dynedge import DynEdge +from graphnet.models.gnn.dynedge_kaggle_tito import DynEdgeTITO +from graphnet.models.rnn.node_rnn import Node_RNN + +# from graphnet.models.rnn.dom_window_rnn import Dom_Window_RNN +from graphnet.models.rnn.node_transformer import Node_Transformer + +from graphnet.utilities.config import save_model_config +from torch_geometric.data import Data + + +class RNN_TITO(GNN): + """The RNN_DynEdge model class. + + Combines the Node_RNN and DynEdgeTITO models, intended for data with large + amount of DOM activations per event. This model works only with non- + standard dataset specific to the Node_RNN model see Node_RNN for more + details. 
+ """ + + @save_model_config + def __init__( + self, + nb_inputs: int, + *, + nb_neighbours: int = 8, + RNN_layers: int = 2, + RNN_hidden_size: int = 64, + RNN_dropout: float = 0.5, + features_subset: Optional[List[int]] = None, + dyntrans_layer_sizes: Optional[List[Tuple[int, ...]]] = None, + post_processing_layer_sizes: Optional[List[int]] = None, + readout_layer_sizes: Optional[List[int]] = None, + global_pooling_schemes: List[str] = ["max"], + embedding_dim: Optional[int] = None, + n_head: int = 16, + use_global_features: bool = True, + use_post_processing_layers: bool = True, + ): + """Initialize the RNN_DynEdge model. + + Args: + nb_inputs (int): Number of input features. + nb_neighbours (int, optional): Number of neighbours to consider. + Defaults to 8. + RNN_layers (int, optional): Number of RNN layers. + Defaults to 1. + RNN_hidden_size (int, optional): Size of the hidden state of the RNN. Also determines the size of the output of the RNN. + Defaults to 64. + RNN_dropout (float, optional): Dropout to use in the RNN. Defaults to 0.5. + features_subset (List[int], optional): The subset of latent + features on each node that are used as metric dimensions when performing the k-nearest neighbours clustering. Defaults to [0,1,2,3] + dyntrans_layer_sizes (List[Tuple[int, ...]], optional): List of tuples representing the sizes of the hidden layers of the DynTrans model. + post_processing_layer_sizes (List[int], optional): List of integers representing the sizes of the hidden layers of the post-processing model. + readout_layer_sizes (List[int], optional): List of integers representing the sizes of the hidden layers of the readout model. + global_pooling_schemes (Union[str, List[str]], optional): Pooling schemes to use. Defaults to None. + embedding_dim (int, optional): Embedding dimension of the RNN. Defaults to None ie. no embedding. + n_head (int, optional): Number of heads to use in the DynTrans model. Defaults to 16. + use_global_features (bool, optional): Whether to use global features after pooling. Defaults to True. + use_post_processing_layers (bool, optional): Whether to use post-processing layers after the DynTrans layers. Defaults to True. 
+ """ + self._nb_neighbours = nb_neighbours + self._nb_inputs = nb_inputs + self._RNN_layers = RNN_layers + self._RNN_hidden_size = RNN_hidden_size # RNN_hidden_size + self._RNN_dropout = RNN_dropout + self._embedding_dim = embedding_dim + self._n_head = n_head + self._use_global_features = use_global_features + self._use_post_processing_layers = use_post_processing_layers + + self._features_subset = features_subset + if dyntrans_layer_sizes is None: + dyntrans_layer_sizes = [ + (256, 256), + (256, 256), + (256, 256), + (256, 256), + ] + else: + dyntrans_layer_sizes = [ + tuple(layer_sizes) for layer_sizes in dyntrans_layer_sizes + ] + + self._dyntrans_layer_sizes = dyntrans_layer_sizes + self._post_processing_layer_sizes = post_processing_layer_sizes + self._global_pooling_schemes = global_pooling_schemes + if readout_layer_sizes is None: + readout_layer_sizes = [ + 256, + 128, + ] + self._readout_layer_sizes = readout_layer_sizes + + super().__init__(nb_inputs, self._readout_layer_sizes[-1]) + + self._rnn = Node_RNN( + num_layers=self._RNN_layers, + nb_inputs=2, + hidden_size=self._RNN_hidden_size, + RNN_dropout=self._RNN_dropout, + embedding_dim=self._embedding_dim, + ) + + self._dynedge_tito = DynEdgeTITO( + nb_inputs=self._RNN_hidden_size + 5, + dyntrans_layer_sizes=self._dyntrans_layer_sizes, + features_subset=self._features_subset, + global_pooling_schemes=self._global_pooling_schemes, + use_global_features=self._use_global_features, + use_post_processing_layers=self._use_post_processing_layers, + post_processing_layer_sizes=self._post_processing_layer_sizes, + readout_layer_sizes=self._readout_layer_sizes, + n_head=self._n_head, + nb_neighbours=self._nb_neighbours, + ) + + def forward(self, data: Data) -> torch.Tensor: + """Apply learnable forward pass of the RNN and tito model.""" + data = self._rnn(data) + # data = self._node_transformer(data) + readout = self._dynedge_tito(data) + + return readout diff --git a/src/graphnet/models/gnn/__init__.py b/src/graphnet/models/gnn/__init__.py index 2abe3d358..2d3ff7910 100644 --- a/src/graphnet/models/gnn/__init__.py +++ b/src/graphnet/models/gnn/__init__.py @@ -4,3 +4,4 @@ from .dynedge import DynEdge from .dynedge_jinst import DynEdgeJINST from .dynedge_kaggle_tito import DynEdgeTITO +from .RNN_tito import RNN_TITO diff --git a/src/graphnet/models/rnn/__init__.py b/src/graphnet/models/rnn/__init__.py new file mode 100644 index 000000000..21d29d7e7 --- /dev/null +++ b/src/graphnet/models/rnn/__init__.py @@ -0,0 +1,3 @@ +"""Recurrent neural network specific modules.""" + +from .node_rnn import Node_RNN diff --git a/src/graphnet/models/rnn/node_rnn.py b/src/graphnet/models/rnn/node_rnn.py new file mode 100644 index 000000000..a9855bce1 --- /dev/null +++ b/src/graphnet/models/rnn/node_rnn.py @@ -0,0 +1,85 @@ +"""Implementation of the NodeTimeRNN model. + +(cannot be used as a standalone model) +""" +import torch + +from graphnet.models.gnn.gnn import GNN +from graphnet.utilities.config import save_model_config +from torch_geometric.data import Data +from typing import Optional + +from graphnet.models.components.embedding import SinusoidalPosEmb + + +class Node_RNN(GNN): + """Implementation of the RNN model architecture. + + The model takes as input the typical DOM data format and transforms it into + a time series of DOM activations pr. DOM. before applying a RNN layer and + outputting the an RNN output for each DOM. This model is in it's current + state not intended to be used as a standalone model. 
Furthermore, it needs + to be used with a time-series dataset and a "cutter" (see + NodeAsDOMTimeSeries), which is not standard in the graphnet framework. + """ + + @save_model_config + def __init__( + self, + nb_inputs: int, + hidden_size: int, + num_layers: int, + RNN_dropout: float = 0.5, + embedding_dim: int = 0, + ) -> None: + """Construct `NodeTimeRNN`. + + Args: + nb_inputs: Number of features in the input data. + hidden_size: Number of features for the RNN output and hidden layers. + num_layers: Number of layers in the RNN. + nb_neighbours: Number of neighbours to use when reconstructing the graph representation. + RNN_dropout: Dropout fractio to use in the RNN. Defaults to 0.5. + embedding_dim: Embedding dimension of the RNN. Defaults to no embedding. + """ + self._num_layers = num_layers + self._hidden_size = hidden_size + self._embedding_dim = embedding_dim + self._nb_inputs = nb_inputs + + super().__init__(nb_inputs, hidden_size + 5) + + if self._embedding_dim != 0: + self._nb_inputs = self._embedding_dim * 2 * nb_inputs + + self._rnn = torch.nn.GRU( + num_layers=self._num_layers, + input_size=self._nb_inputs, + hidden_size=self._hidden_size, + batch_first=True, + dropout=RNN_dropout, + ) + self._emb = SinusoidalPosEmb(dim=self._embedding_dim) + + def forward(self, data: Data) -> torch.Tensor: + """Apply learnable forward pass to the GNN.""" + cutter = data.cutter.cumsum(0)[:-1] + # Optional embedding of the time and charge time series data. + if self._embedding_dim != 0: + time_series = self._emb(data.time_series * 4096).reshape( + ( + data.time_series.shape[0], + self._embedding_dim * 2 * data.time_series.shape[-1], + ) + ) + else: + time_series = data.time_series + + time_series = torch.nn.utils.rnn.pack_sequence( + time_series.tensor_split(cutter.cpu()), enforce_sorted=False + ) + # apply RNN per DOM irrespective of batch and return the final state. + rnn_out = self._rnn(time_series)[-1][0] + # combine the RNN output with the DOM summary features + data.x = torch.hstack([data.x, rnn_out]) + return data From e30b52ab3d1db283cb0e02eee18ec3ddde4f59ae Mon Sep 17 00:00:00 2001 From: "askerosted@gmail.com" Date: Tue, 23 Jan 2024 11:28:51 +0900 Subject: [PATCH 013/124] add inusoidal positional embedding --- src/graphnet/models/components/embedding.py | 27 +++++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100644 src/graphnet/models/components/embedding.py diff --git a/src/graphnet/models/components/embedding.py b/src/graphnet/models/components/embedding.py new file mode 100644 index 000000000..c29f5562f --- /dev/null +++ b/src/graphnet/models/components/embedding.py @@ -0,0 +1,27 @@ +"""Classes for performing embedding of input data.""" +import torch + + +class SinusoidalPosEmb(torch.nn.Module): + """Sinusoidal positional embedding layer.""" + + def __init__(self, dim: int = 16, M: int = 10000) -> None: + """Construct `SinusoidalPosEmb`. + + Args: + dim: Embedding dimension. + M: Number of frequencies. + """ + super().__init__() + self.dim = dim + self.M = M + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Apply learnable forward pass to the layer.""" + device = x.device + half_dim = self.dim + emb = torch.log(torch.tensor(self.M, device=device)) / half_dim + emb = torch.exp(torch.arange(half_dim, device=device) * (-emb)) + emb = x[..., None] * emb[None, ...] 
+ emb = torch.cat((emb.sin(), emb.cos()), dim=-1) + return emb From bed86f21a0b5b183ab3a6f4103cbc58b84fce6b4 Mon Sep 17 00:00:00 2001 From: "askerosted@gmail.com" Date: Tue, 23 Jan 2024 14:46:32 +0900 Subject: [PATCH 014/124] add example script --- examples/04_training/05_train_RNN_TITO.py | 266 ++++++++++++++++++++++ src/graphnet/models/graphs/nodes/nodes.py | 29 ++- 2 files changed, 288 insertions(+), 7 deletions(-) create mode 100644 examples/04_training/05_train_RNN_TITO.py diff --git a/examples/04_training/05_train_RNN_TITO.py b/examples/04_training/05_train_RNN_TITO.py new file mode 100644 index 000000000..d3832bafb --- /dev/null +++ b/examples/04_training/05_train_RNN_TITO.py @@ -0,0 +1,266 @@ +"""Example of training RNN-TITO model. + +with time-series data. +""" + +import os +from typing import Any, Dict, List, Optional + +from pytorch_lightning.loggers import WandbLogger +from torch.optim.adam import Adam +from torch.optim.lr_scheduler import ReduceLROnPlateau + +from graphnet.constants import EXAMPLE_DATA_DIR, EXAMPLE_OUTPUT_DIR +from graphnet.data.constants import FEATURES, TRUTH +from graphnet.models import StandardModel +from graphnet.models.detector.prometheus import Prometheus +from graphnet.models.gnn import RNN_TITO +from graphnet.models.graphs import KNNGraph +from graphnet.models.graphs.nodes import NodeAsDOMTimeSeries +from graphnet.models.task.reconstruction import ( + DirectionReconstructionWithKappa, +) +from graphnet.training.labels import Direction +from graphnet.training.loss_functions import VonMisesFisher3DLoss +from graphnet.training.utils import make_train_validation_dataloader +from graphnet.utilities.argparse import ArgumentParser +from graphnet.utilities.logging import Logger + +# Constants +features = FEATURES.PROMETHEUS +truth = TRUTH.PROMETHEUS + + +def main( + path: str, + pulsemap: str, + target: str, + truth_table: str, + gpus: Optional[List[int]], + max_epochs: int, + early_stopping_patience: int, + batch_size: int, + num_workers: int, + wandb: bool = False, +) -> None: + """Run example.""" + # Construct Logger + logger = Logger() + + # Initialise Weights & Biases (W&B) run + if wandb: + # Make sure W&B output directory exists + wandb_dir = "./wandb/" + os.makedirs(wandb_dir, exist_ok=True) + wandb_logger = WandbLogger( + project="example-script", + entity="graphnet-team", + save_dir=wandb_dir, + log_model=True, + ) + + logger.info(f"features: {features}") + logger.info(f"truth: {truth}") + + # Configuration + config: Dict[str, Any] = { + "path": path, + "pulsemap": pulsemap, + "batch_size": batch_size, + "num_workers": num_workers, + "target": target, + "early_stopping_patience": early_stopping_patience, + "fit": { + "gpus": gpus, + "max_epochs": max_epochs, + }, + } + + graph_definition = KNNGraph( + detector=Prometheus(), + node_definition=NodeAsDOMTimeSeries( + keys=features, + id_columns=features[0:3], + time_column=features[-1], + charge_column="None", + ), + ) + archive = os.path.join(EXAMPLE_OUTPUT_DIR, "train_RNN_TITO_model") + run_name = "RNN_TITO_{}_example".format(config["target"]) + if wandb: + # Log configuration to W&B + wandb_logger.experiment.config.update(config) + + ( + training_dataloader, + validation_dataloader, + ) = make_train_validation_dataloader( + db=config["path"], + graph_definition=graph_definition, + selection=None, + pulsemaps=config["pulsemap"], + features=features, + truth=truth, + batch_size=config["batch_size"], + num_workers=config["num_workers"], + truth_table=truth_table, + index_column="event_no", + labels={ + 
"direction": Direction( + azimuth_key="injection_azimuth", zenith_key="injection_zenith" + ) + }, + ) + + # Building model + backbone = RNN_TITO( + nb_inputs=graph_definition.nb_outputs, + nb_neighbours=8, + RNN_layers=2, + RNN_hidden_size=64, + RNN_dropout=0.5, + features_subset=[0, 1, 2, 3], + dyntrans_layer_sizes=[(256, 256), (256, 256), (256, 256), (256, 256)], + post_processing_layer_sizes=[336, 256], + readout_layer_sizes=[256, 128], + global_pooling_schemes=["max"], + embedding_dim=0, + n_head=16, + use_global_features=True, + use_post_processing_layers=True, + ) + + task = DirectionReconstructionWithKappa( + hidden_size=backbone.nb_outputs, + target_labels=config["target"], + loss_function=VonMisesFisher3DLoss(), + ) + model = StandardModel( + graph_definition=graph_definition, + backbone=backbone, + tasks=[task], + optimizer_class=Adam, + optimizer_kwargs={"lr": 1e-03, "eps": 1e-03}, + scheduler_class=ReduceLROnPlateau, + scheduler_kwargs={ + "patience": config["early_stopping_patience"], + }, + scheduler_config={ + "frequency": 1, + "monitor": "val_loss", + }, + ) + + # Training model + + model.fit( + training_dataloader, + validation_dataloader, + early_stopping_patience=config["early_stopping_patience"], + logger=wandb_logger if wandb else None, + **config["fit"], + ) + + # Get predictions + additional_attributes = [ + "injection_zenith", + "injection_azimuth", + "event_no", + ] + prediction_columns = [ + config["target"][0] + "_x_pred", + config["target"][0] + "_y_pred", + config["target"][0] + "_z_pred", + config["target"][0] + "_kappa_pred", + ] + + assert isinstance(additional_attributes, list) # mypy + + results = model.predict_as_dataframe( + validation_dataloader, + additional_attributes=additional_attributes, + prediction_columns=prediction_columns, + gpus=config["fit"]["gpus"], + ) + + # Save predictions and model to file + db_name = path.split("/")[-1].split(".")[0] + path = os.path.join(archive, db_name, run_name) + logger.info(f"Writing results to {path}") + os.makedirs(path, exist_ok=True) + + # Save results as .csv + results.to_csv(f"{path}/results.csv") + + # Save full model (including weights) to .pth file - Not version proof + model.save(f"{path}/model.pth") + + # Save model config and state dict - Version safe save method. + model.save_state_dict(f"{path}/state_dict.pth") + model.save_config(f"{path}/model_config.yml") + + +if __name__ == "__main__": + + # Parse command-line arguments + parser = ArgumentParser( + description=""" +Train GNN model without the use of config files. 
+""" + ) + + parser.add_argument( + "--path", + help="Path to dataset file (default: %(default)s)", + default=f"{EXAMPLE_DATA_DIR}/sqlite/prometheus/prometheus-events.db", + ) + + parser.add_argument( + "--pulsemap", + help="Name of pulsemap to use (default: %(default)s)", + default="total", + ) + + parser.add_argument( + "--target", + help=( + "Name of feature to use as regression target (default: " + "%(default)s)" + ), + default="direction", + ) + + parser.add_argument( + "--truth-table", + help="Name of truth table to be used (default: %(default)s)", + default="mc_truth", + ) + + parser.with_standard_arguments( + "gpus", + ("max-epochs", 1), + ("early-stopping-patience", 2), + ("batch-size", 16), + "num-workers", + ) + + parser.add_argument( + "--wandb", + action="store_true", + help="If True, Weights & Biases are used to track the experiment.", + ) + + args, unknown = parser.parse_known_args() + + main( + args.path, + args.pulsemap, + args.target, + args.truth_table, + args.gpus, + args.max_epochs, + args.early_stopping_patience, + args.batch_size, + args.num_workers, + args.wandb, + ) diff --git a/src/graphnet/models/graphs/nodes/nodes.py b/src/graphnet/models/graphs/nodes/nodes.py index 4056f93cf..f81213a68 100644 --- a/src/graphnet/models/graphs/nodes/nodes.py +++ b/src/graphnet/models/graphs/nodes/nodes.py @@ -1,6 +1,6 @@ """Class(es) for building/connecting graphs.""" -from typing import List, Tuple, Optional +from typing import List, Tuple, Optional, Union from abc import abstractmethod import torch @@ -216,7 +216,7 @@ def _construct_nodes(self, x: torch.Tensor) -> Data: return Data(x=torch.tensor(array)) -class NodeAsDOMTimeSeries: +class NodeAsDOMTimeSeries(NodeDefinition): """Represent each node as a DOM with time and charge time series data.""" def __init__( @@ -243,11 +243,21 @@ def __init__( max_activations: Maximum number of activations to include in the time series. """ self._keys = keys + super().__init__(input_feature_names=self._keys) self._id_columns = [self._keys.index(key) for key in id_columns] self._time_index = self._keys.index(time_column) - self._charge_index = self._keys.index(charge_column) + try: + self._charge_index: Optional[int] = self._keys.index(charge_column) + except ValueError: + self.warning( + "Charge column with name {} not found. 
Running without.".format( + charge_column + ) + ) + + self._charge_index = None + self._max_activations = max_activations - super().__init__() def _define_output_feature_names( self, input_feature_names: List[str] @@ -258,10 +268,15 @@ def _construct_nodes(self, x: torch.Tensor) -> Data: """Construct nodes from raw node features ´x´.""" # Cast to Numpy x = x.numpy() + # if there is no charge column add a dummy column of zeros with the same shape as the time column + if self._charge_index is None: + charge_index: int = len(self._keys) + x = np.insert(x, charge_index, np.zeros(x.shape[0]), axis=1) + # Sort by time x = x[x[:, self._time_index].argsort()] # Undo log10 scaling so we can sum charges - x[:, self._charge_index] = np.power(10, x[:, self._charge_index]) + x[:, charge_index] = np.power(10, x[:, charge_index]) # Shift time to start at 0 x[:, self._time_index] -= np.min(x[:, self._time_index]) # Group pulses on the same DOM @@ -279,10 +294,10 @@ def _construct_nodes(self, x: torch.Tensor) -> Data: counts = sort_this[:, unique_sensors.shape[1] :].flatten().astype(int) time_series = np.split( - x[:, [self._charge_index, self._time_index]], counts.cumsum()[:-1] + x[:, [charge_index, self._time_index]], counts.cumsum()[:-1] ) - # add total charge to unique dom features and apply inverse hyperbolic sine scaling + # add first time and total charge to unique dom features and apply inverse hyperbolic sine scaling time_charge = np.stack( [ (image[0, 1], np.arcsinh(5 * image[:, 0].sum()) / 5) From dd8727aaa70b98005f9f54c1cda13d95971b7196 Mon Sep 17 00:00:00 2001 From: "askerosted@gmail.com" Date: Tue, 23 Jan 2024 14:47:12 +0900 Subject: [PATCH 015/124] clean up --- src/graphnet/models/gnn/RNN_tito.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/graphnet/models/gnn/RNN_tito.py b/src/graphnet/models/gnn/RNN_tito.py index 8facecd45..2496fa891 100644 --- a/src/graphnet/models/gnn/RNN_tito.py +++ b/src/graphnet/models/gnn/RNN_tito.py @@ -7,9 +7,6 @@ from graphnet.models.gnn.dynedge_kaggle_tito import DynEdgeTITO from graphnet.models.rnn.node_rnn import Node_RNN -# from graphnet.models.rnn.dom_window_rnn import Dom_Window_RNN -from graphnet.models.rnn.node_transformer import Node_Transformer - from graphnet.utilities.config import save_model_config from torch_geometric.data import Data @@ -123,7 +120,6 @@ def __init__( def forward(self, data: Data) -> torch.Tensor: """Apply learnable forward pass of the RNN and tito model.""" data = self._rnn(data) - # data = self._node_transformer(data) readout = self._dynedge_tito(data) return readout From 45a3cc1f0b48491c41732cb8c8c076863f9d3ff1 Mon Sep 17 00:00:00 2001 From: ArturoLlorente Date: Fri, 26 Jan 2024 01:15:28 +0100 Subject: [PATCH 016/124] first attemp. worked in node definition. New detector definition. 
--- src/graphnet/models/components/layers.py | 488 +++++++++++++++++++++- src/graphnet/models/detector/icecube.py | 41 ++ src/graphnet/models/gnn/icemix.py | 250 +++++++++++ src/graphnet/models/graphs/nodes/nodes.py | 101 +++++ src/graphnet/models/graphs/utils.py | 38 ++ 5 files changed, 912 insertions(+), 6 deletions(-) create mode 100644 src/graphnet/models/gnn/icemix.py diff --git a/src/graphnet/models/components/layers.py b/src/graphnet/models/components/layers.py index 53f970286..9c80fb062 100644 --- a/src/graphnet/models/components/layers.py +++ b/src/graphnet/models/components/layers.py @@ -1,3 +1,4 @@ + """Class(es) implementing layers to be used in `graphnet` models.""" from typing import Any, Callable, Optional, Sequence, Union, List, Tuple @@ -9,11 +10,15 @@ from torch_geometric.typing import Adj, PairTensor from torch_geometric.nn.conv import MessagePassing from torch_geometric.nn.inits import reset +import torch.nn as nn +from torch.nn.functional import linear from torch.nn.modules import TransformerEncoder, TransformerEncoderLayer -from torch.nn.modules.normalization import LayerNorm from torch_geometric.utils import to_dense_batch from pytorch_lightning import LightningModule +from timm.models.layers import drop_path +import math +from torch.fft import fft class DynEdgeConv(EdgeConv, LightningModule): """Dynamical edge convolution layer.""" @@ -129,7 +134,6 @@ def __init__( """Construct `DynTrans`. Args: - nn: The MLP/torch.Module to be used within the `DynTrans`. layer_sizes: List of layer sizes to be used in `DynTrans`. aggr: Aggregation method to be used with `DynTrans`. features_subset: Subset of features in `Data.x` that should be used @@ -151,17 +155,17 @@ def __init__( ): if ix == 0: nb_in *= 3 # edgeConv1 - layers.append(torch.nn.Linear(nb_in, nb_out)) - layers.append(torch.nn.LeakyReLU()) + layers.append(nn.Linear(nb_in, nb_out)) + layers.append(nn.LeakyReLU()) d_model = nb_out # Base class constructor - super().__init__(nn=torch.nn.Sequential(*layers), aggr=aggr, **kwargs) + super().__init__(nn=nn.Sequential(*layers), aggr=aggr, **kwargs) # Additional member variables self.features_subset = features_subset - self.norm1 = LayerNorm(d_model, eps=1e-5) # lNorm + self.norm1 = nn.LayerNorm(d_model, eps=1e-5) # lNorm # Transformer layer(s) encoder_layer = TransformerEncoderLayer( @@ -193,3 +197,475 @@ def forward( x = x[mask] return x + + + +class DropPath(nn.Module): + """DropPath regularization module for neural networks.""" + def __init__( + self, + drop_prob: Optional[float] = None, + ): + """ + Construct `DropPath`. + + Args: + drop_prob: Probability of dropping a path during training. + If None, no paths are dropped. Defaults to None. + """ + super(DropPath, self).__init__() + self.drop_prob = drop_prob + + def forward(self, x: Tensor) -> Tensor: + """Forward pass.""" + return drop_path(x, self.drop_prob, self.training) + + def extra_repr(self) -> str: + """Return extra representation of the module.""" + return "p={}".format(self.drop_prob) + + +class Mlp(nn.Module): + """ + Multi-Layer Perceptron (MLP) module. + """ + def __init__( + self, + in_features: int = 768, + hidden_features: Optional[int] = None, + out_features: Optional[int] = None, + activation: Optional[nn.Module] = nn.GELU, + dropout_prob: Optional[float] = 0.0, + ): + """ + Construct `Mlp`. + + Args: + in_features: Number of input features. + hidden_features: Number of hidden features. Defaults to None. + If None, it is set to the value of `in_features`. + out_features: Number of output features. 
Defaults to None. + If None, it is set to the value of `in_features`. + activation: Activation layer. Defaults to `nn.GELU`. + dropout_prob: Dropout probability. Defaults to 0.0. + """ + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.input_projection = nn.Linear(in_features, hidden_features) + self.activation = activation() + self.output_projection = nn.Linear(hidden_features, out_features) + self.dropout = nn.Dropout(dropout_prob) + + def forward(self, x: Tensor) -> Tensor: + """Forward pass.""" + x = self.input_projection(x) + x = self.activation(x) + x = self.output_projection(x) + x = self.dropout(x) + return x + +class SinusoidalPosEmb(nn.Module): + def __init__( + self, + emb_dim: int = 16, + max_sequence_length: int = 10000, + ): + """ + Construct `SinusoidalPosEmb`. + + This module generates sinusoidal positional embeddings to be added to input sequences. + + Args: + emb_dim: Dimensionality of the positional embeddings. + max_sequence_length: Maximum sequence length, used to scale the frequency of sinusoidal embeddings. + """ + super().__init__() + self.embe_dim = emb_dim + self.max_sequence_length = max_sequence_length + + def forward(self, x: Tensor) -> Tensor: + """Forward pass.""" + device = x.device + half_dim = self.emb_dim // 2 + emb1 = math.log(self.max_sequence_length) / half_dim + emb2 = torch.log(self.max_sequence_length) / half_dim + if emb1 == emb2: + emb = emb1 + else: + raise ValueError("emb1 != emb2") + emb = torch.exp(torch.arange(half_dim, device=device) * (-emb)) + emb = x[..., None] * emb[None, ...] + emb = torch.cat((emb.sin(), emb.cos()), dim=-1) + return emb + +class Extractor(nn.Module): + def __init__( + self, + base_dim: int = 128, + output_dim: int = 384, + ): + """ + Construct `Extractor`. + + This module incorporates sinusoidal positional embeddings and auxiliary embeddings + to process input sequences and produce meaningful representations. + + Args: + base_dim: Dimensionality of the base sinusoidal positional embeddings. + output_dim: Output dimensionality of the final projection. + """ + super().__init__() + self.sin_emb = SinusoidalPosEmb(emb_dim=base_dim) + self.aux_emb = nn.Embedding(2, base_dim // 2) + self.sin_emb2 = SinusoidalPosEmb(emb_dim=base_dim // 2) + self.projection = nn.Sequential( + nn.Linear(6 * base_dim, 6 * base_dim), + nn.LayerNorm(6 * base_dim), + nn.GELU(), + nn.Linear(6 * base_dim, output_dim), + ) + + def forward( + self, + x: Tensor, + Lmax: Optional[int] = None + ) -> Tensor: + """Forward pass.""" + pos = x.pos if Lmax is None else x.pos[:, :Lmax] + charge = x.charge if Lmax is None else x.charge[:, :Lmax] + time = x.time if Lmax is None else x.time[:, :Lmax] + auxiliary = x.auxiliary if Lmax is None else x.auxiliary[:, :Lmax] + length = torch.log10(x.n_pulses.to(dtype=pos.dtype)) + + x = torch.cat( + [ + self.sin_emb(4096 * pos).flatten(-2), + self.sin_emb(1024 * charge), + self.sin_emb(4096 * time), + self.aux_emb(auxiliary), + self.sin_emb2(length).unsqueeze(1).expand(-1, pos.shape[1], -1), + ], + -1, + ) + x = self.projection(x) + return x + + +class Spacetime_encoder(nn.Module): + def __init__( + self, + base_dim: int = 32, + ): + """ + Construct `Spacetime_encoder`. + + This module calculates space-time interval between each pair of events and + generates sinusoidal positional embeddings to be added to input sequences. + + Args: + base_dim: Dimensionality of the sinusoidal positional embeddings. 
+ """ + super().__init__() + self.sin_emb = SinusoidalPosEmb(emb_dim=base_dim) + self.projection = nn.Linear(base_dim, base_dim) + + def forward( + self, + x: Tensor, + Lmax: Optional[int] = None, + ) -> Tensor: + """Forward pass.""" + pos = x.pos if Lmax is None else x.pos[:, :Lmax] + time = x.time if Lmax is None else x.time[:, :Lmax] + spacetime_interval = (pos[:, :, None] - pos[:, None, :]).pow(2).sum(-1) - ( + (time[:, :, None] - time[:, None, :]) * (3e4 / 500 * 3e-1) + ).pow(2) + four_distance = torch.sign(spacetime_interval) * torch.sqrt(torch.abs(spacetime_interval)) + sin_emb = self.sin_emb(1024 * four_distance.clip(-4, 4)) + rel_attn = self.projection(sin_emb) + return rel_attn, sin_emb + +# BEiTv2 block +class Block_rel(nn.Module): + def __init__( + self, + dim, + num_heads, + mlp_ratio=4.0, + qkv_bias=False, + qk_scale=None, + drop=0.0, + attn_drop=0.0, + drop_path=0.0, + init_values=None, + act_layer=nn.GELU, + norm_layer=nn.LayerNorm, + window_size=None, + attn_head_dim=None, + **kwargs, + ): + super().__init__() + self.norm1 = norm_layer(dim) + self.attn = Attention_rel( + dim, num_heads, attn_drop=attn_drop, qkv_bias=qkv_bias + ) + self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp( + in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop, + ) + + if init_values is not None: + self.gamma_1 = nn.Parameter( + init_values * torch.ones((dim)), requires_grad=True + ) + self.gamma_2 = nn.Parameter( + init_values * torch.ones((dim)), requires_grad=True + ) + else: + self.gamma_1, self.gamma_2 = None, None + + def forward(self, x, key_padding_mask=None, rel_pos_bias=None, kv=None): + """Forward pass.""" + if self.gamma_1 is None: + xn = self.norm1(x) + kv = xn if kv is None else self.norm1(kv) + x = x + self.drop_path( + self.attn( + xn, + kv, + kv, + rel_pos_bias=rel_pos_bias, + key_padding_mask=key_padding_mask, + ) + ) + x = x + self.drop_path(self.mlp(self.norm2(x))) + else: + xn = self.norm1(x) + kv = xn if kv is None else self.norm1(kv) + x = x + self.drop_path( + self.gamma_1 + * self.drop_path( + self.attn( + xn, + kv, + kv, + rel_pos_bias=rel_pos_bias, + key_padding_mask=key_padding_mask, + ) + ) + ) + x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x))) + return x + +class Attention_rel(nn.Module): + def __init__( + self, + dim, + num_heads=8, + qkv_bias=False, + qk_scale=None, + attn_drop=0.0, + proj_drop=0.0, + attn_head_dim=None, + ): + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + if attn_head_dim is not None: + head_dim = attn_head_dim + all_head_dim = head_dim * self.num_heads + self.scale = qk_scale or head_dim**-0.5 + + self.proj_q = nn.Linear(dim, all_head_dim, bias=False) + self.proj_k = nn.Linear(dim, all_head_dim, bias=False) + self.proj_v = nn.Linear(dim, all_head_dim, bias=False) + if qkv_bias: + self.q_bias = nn.Parameter(torch.zeros(all_head_dim)) + self.v_bias = nn.Parameter(torch.zeros(all_head_dim)) + else: + self.q_bias = None + self.v_bias = None + + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(all_head_dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, q, k, v, rel_pos_bias=None, key_padding_mask=None): + """Forward pass.""" + B, N, C = q.shape + + q = linear(input=q, weight=self.proj_q.weight, bias=self.q_bias) + q = q.reshape(B, N, self.num_heads, -1).permute(0, 2, 1, 3) + k = linear(input=k, weight=self.proj_k.weight, 
bias=None) + k = k.reshape(B, k.shape[1], self.num_heads, -1).permute(0, 2, 1, 3) + v = linear(input=v, weight=self.proj_v.weight, bias=self.v_bias) + v = v.reshape(B, v.shape[1], self.num_heads, -1).permute(0, 2, 1, 3) + + q = q * self.scale + attn = q @ k.transpose(-2, -1) + if rel_pos_bias is not None: + bias = torch.einsum("bhic,bijc->bhij", q, rel_pos_bias) + attn = attn + bias + if key_padding_mask is not None: + assert ( + key_padding_mask.dtype == torch.float32 + or key_padding_mask.dtype == torch.float16 + ), "incorrect mask dtype" + bias = torch.min(key_padding_mask[:, None, :], key_padding_mask[:, :, None]) + bias[ + torch.max(key_padding_mask[:, None, :], key_padding_mask[:, :, None]) + < 0 + ] = 0 + # print(bias.shape,bias.min(),bias.max()) + attn = attn + bias.unsqueeze(1) + + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2) + if rel_pos_bias is not None: + x = x + torch.einsum("bhij,bijc->bihc", attn, rel_pos_bias) + x = x.reshape(B, N, -1) + x = self.proj(x) + x = self.proj_drop(x) + return x + +class Block(nn.Module): + def __init__( + self, + dim, + num_heads, + mlp_ratio=4.0, + qkv_bias=False, + qk_scale=None, + drop=0.0, + attn_drop=0.0, + drop_path=0.0, + init_values=None, + act_layer=nn.GELU, + norm_layer=nn.LayerNorm, + window_size=None, + attn_head_dim=None, + **kwargs, + ): + super().__init__() + self.norm1 = norm_layer(dim) + self.attn = nn.MultiheadAttention( + dim, num_heads, dropout=drop, batch_first=True + ) + self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp( + in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop, + ) + + if init_values is not None: + self.gamma_1 = nn.Parameter( + init_values * torch.ones((dim)), requires_grad=True + ) + self.gamma_2 = nn.Parameter( + init_values * torch.ones((dim)), requires_grad=True + ) + else: + self.gamma_1, self.gamma_2 = None, None + + def forward(self, x, attn_mask=None, key_padding_mask=None): + """Forward pass.""" + if self.gamma_1 is None: + xn = self.norm1(x) + x = x + self.drop_path( + self.attn( + xn, + xn, + xn, + attn_mask=attn_mask, + key_padding_mask=key_padding_mask, + need_weights=False, + )[0] + ) + x = x + self.drop_path(self.mlp(self.norm2(x))) + else: + xn = self.norm1(x) + x = x + self.drop_path( + self.gamma_1 + * self.attn( + xn, + xn, + xn, + attn_mask=attn_mask, + key_padding_mask=key_padding_mask, + need_weights=False, + )[0] + ) + x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x))) + return x + +class ScaledSinusoidalEmbedding(nn.Module): + def __init__(self, dim=32, M=10000): + super().__init__() + assert (dim % 2) == 0 + self.scale = nn.Parameter(torch.ones(1) * dim**-0.5) + self.dim = dim + self.M = M + + def forward(self, x): + """Forward pass.""" + device = x.device + half_dim = self.dim // 2 + emb1 = math.log(self.M) / half_dim + emb2 = torch.log(self.M) / half_dim + if emb1 == emb2: + emb = emb1 + else: + raise ValueError("emb1 != emb2") + emb = torch.exp(torch.arange(half_dim, device=device) * (-emb)) + emb = x[..., None] * emb[None, ...] 
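+        # Concatenate the sine and cosine halves (yielding `dim` features)
+        # and apply the learnable scale (initialised to dim**-0.5).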
+ emb = torch.cat((emb.sin(), emb.cos()), dim=-1) + return emb * self.scale + + +class ExtractorV11Scaled(nn.Module): + def __init__(self, dim_base=128, dim=384): + super().__init__() + self.pos = ScaledSinusoidalEmbedding(dim=dim_base) + self.emb_charge = ScaledSinusoidalEmbedding(dim=dim_base) + self.time = ScaledSinusoidalEmbedding(dim=dim_base) + self.aux_emb = nn.Embedding(2, dim_base // 2) + self.emb2 = ScaledSinusoidalEmbedding(dim=dim_base // 2) + self.proj = nn.Sequential( + nn.Linear(6 * dim_base, 6 * dim_base), + nn.LayerNorm(6 * dim_base), + nn.GELU(), + nn.Linear(6 * dim_base, dim), + ) + + def forward(self, x, Lmax=None): + """Forward pass.""" + pos = x.pos if Lmax is None else x.pos[:, :Lmax] + charge = x.charge if Lmax is None else x.charge[:, :Lmax] + time = x.time if Lmax is None else x.time[:, :Lmax] + auxiliary = x.auxiliary if Lmax is None else x.auxiliary[:, :Lmax] + length = torch.log10(x.n_pulses.to(dtype=pos.dtype)) + + x = torch.cat( + [ + self.pos(4096 * pos).flatten(-2), + self.emb_charge(1024 * charge), + self.time(4096 * time), + self.aux_emb(auxiliary), + self.emb2(length).unsqueeze(1).expand(-1, pos.shape[1], -1), + ], + -1, + ) + x = self.proj(x) + return x \ No newline at end of file diff --git a/src/graphnet/models/detector/icecube.py b/src/graphnet/models/detector/icecube.py index 691b94fc7..c99706149 100644 --- a/src/graphnet/models/detector/icecube.py +++ b/src/graphnet/models/detector/icecube.py @@ -158,3 +158,44 @@ def _dom_xyz(self, x: torch.tensor) -> torch.tensor: def _pmt_area(self, x: torch.tensor) -> torch.tensor: return x / 0.05 + + +class IceMixDetector(Detector): + """`Detector` class for IceCube-86.""" + + geometry_table_path = os.path.join( + ICECUBE_GEOMETRY_TABLE_DIR, "icecube86.parquet" + ) + + xyz = ["dom_x", "dom_y", "dom_z"] + string_id_column = "string" + sensor_id_column = "sensor_id" + + def feature_map(self) -> Dict[str, Callable]: + """Map standardization functions to each dimension of input data.""" + feature_map = { + "dom_x": self._dom_xyz, + "dom_y": self._dom_xyz, + "dom_z": self._dom_xyz, + "dom_time": self._dom_time, + "charge": self._charge, + "rde": self._rde, + "pmt_area": self._pmt_area, + "hlc": self._identity, + } + return feature_map + + def _dom_xyz(self, x: torch.tensor) -> torch.tensor: + return x / 500.0 + + def _dom_time(self, x: torch.tensor) -> torch.tensor: + return (x - 1.0e04) / 3.0e4 + + def _charge(self, x: torch.tensor) -> torch.tensor: + return torch.log10(x) / 3.0 + + def _rde(self, x: torch.tensor) -> torch.tensor: + return (x - 1.0) / 0.35 + + def _pmt_area(self, x: torch.tensor) -> torch.tensor: + return x / 0.05 diff --git a/src/graphnet/models/gnn/icemix.py b/src/graphnet/models/gnn/icemix.py new file mode 100644 index 000000000..7198efab1 --- /dev/null +++ b/src/graphnet/models/gnn/icemix.py @@ -0,0 +1,250 @@ +import torch +import torch.nn as nn +import torch.utils.checkpoint as checkpoint +import math + +from graphnet.models.components.layers import Extractor, Spacetime_encoder, Block_rel, Block, ExtractorV11Scaled +from graphnet.models.gnn.dynedge import DynEdge +from graphnet.models.gnn.gnn import GNN + +from timm.models.layers import trunc_normal_ + +from torch_geometric.nn.pool import knn_graph +from torch_geometric.utils import to_dense_batch + + +class DeepIceModel(nn.Module): + def __init__( + self, + dim=384, + dim_base=128, + depth=12, + use_checkpoint=False, + head_size=32, + depth_rel=4, + n_rel=1, + **kwargs, + ): + super().__init__() + self.extractor = Extractor(dim_base, dim) + 
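+        # Relative attention bias encoding the pairwise space-time interval
+        # between pulses, consumed by the `Block_rel` layers below.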
self.rel_pos = Spacetime_encoder(head_size) + self.sandwich = nn.ModuleList( + [Block_rel(dim=dim, num_heads=dim // head_size) for i in range(depth_rel)] + ) + self.cls_token = nn.Linear(dim, 1, bias=False) + self.blocks = nn.ModuleList( + [ + Block( + dim=dim, + num_heads=dim // head_size, + mlp_ratio=4, + drop_path=0.0 * (i / (depth - 1)), + init_values=1, + ) + for i in range(depth) + ] + ) + #self.proj_out = nn.Linear(dim, 3) + self.use_checkpoint = use_checkpoint + self.apply(self._init_weights) + trunc_normal_(self.cls_token.weight, std=0.02) + self.n_rel = n_rel + + def fix_init_weight(self): + def rescale(param, layer_id): + param.div_(math.sqrt(2.0 * layer_id)) + + for layer_id, layer in enumerate(self.blocks): + rescale(layer.attn.proj.weight.data, layer_id + 1) + rescale(layer.mlp.fc2.weight.data, layer_id + 1) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=0.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + + def init_weights(self, pretrained=None): + def _init_weights(m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=0.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + + self.apply(_init_weights) + + @torch.jit.ignore + def no_weight_decay(self): + return {"cls_token"} + + def forward(self, x0): + mask = x0.mask + Lmax = mask.sum(-1).max() + x = self.extractor(x0, Lmax) + rel_pos_bias, rel_enc = self.rel_pos(x0, Lmax) + # nbs = get_nbs(x0, Lmax) + mask = mask[:, :Lmax] + B, _ = mask.shape + attn_mask = torch.zeros(mask.shape, device=mask.device) + attn_mask[~mask] = -torch.inf + + for i, blk in enumerate(self.sandwich): + x = blk(x, attn_mask, rel_pos_bias) + if i + 1 == self.n_rel: + rel_pos_bias = None + + mask = torch.cat( + [torch.ones(B, 1, dtype=mask.dtype, device=mask.device), mask], 1 + ) + attn_mask = torch.zeros(mask.shape, device=mask.device) + attn_mask[~mask] = -torch.inf + cls_token = self.cls_token.weight.unsqueeze(0).expand(B, -1, -1) + x = torch.cat([cls_token, x], 1) + + for blk in self.blocks: + if self.use_checkpoint: + x = checkpoint.checkpoint(blk, x, None, attn_mask) + else: + x = blk(x, None, attn_mask) + + #x = self.proj_out(x[:, 0]) # cls token + return x[:, 0] + + +class EncoderWithDirectionReconstruction(nn.Module): + def __init__( + self, + dim=384, + dim_base=128, + depth=8, + use_checkpoint=False, + head_size=64, + knn_features=3, + **kwargs, + ): + super().__init__() + self.knn_features = knn_features + self.extractor = ExtractorV11Scaled(dim_base, dim // 2) + self.rel_pos = Spacetime_encoder(head_size) + self.sandwich = nn.ModuleList( + [ + Block_rel(dim=dim, num_heads=dim // head_size), + Block_rel(dim=dim, num_heads=dim // head_size), + Block_rel(dim=dim, num_heads=dim // head_size), + Block_rel(dim=dim, num_heads=dim // head_size), + ] + ) + self.cls_token = nn.Linear(dim, 1, bias=False) + self.blocks = nn.ModuleList( + [ + Block( + dim=dim, + num_heads=dim // head_size, + mlp_ratio=4, + drop_path=0.0 * (i / (depth - 1)), + init_values=1, + ) + for i in range(depth) + ] + ) + #self.proj_out = nn.Linear(dim, 3) + self.use_checkpoint = use_checkpoint + self.local_root = DynEdge( + 9, + post_processing_layer_sizes=[336, dim // 2], + dynedge_layer_sizes=[(128, 256), (336, 256), (336, 256), (336, 256)], + ) 
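+        # DynEdge acts as a local (kNN-graph) feature extractor; its per-pulse
+        # output of dim // 2 features is concatenated with the dim // 2
+        # transformer embedding in `forward`, giving inputs of size `dim`.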
+ self.apply(self._init_weights) + trunc_normal_(self.cls_token.weight, std=0.02) + + def fix_init_weight(self): + def rescale(param, layer_id): + param.div_(math.sqrt(2.0 * layer_id)) + + for layer_id, layer in enumerate(self.blocks): + rescale(layer.attn.proj.weight.data, layer_id + 1) + rescale(layer.mlp.fc2.weight.data, layer_id + 1) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=0.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + + def init_weights(self, pretrained=None): + def _init_weights(m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=0.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + + self.apply(_init_weights) + + @torch.jit.ignore + def no_weight_decay(self): + return {"cls_token"} + + def forward(self, x0): + mask = x0.mask + graph_feature = torch.concat( + [ + x0.pos[mask], + x0.time[mask].view(-1, 1), + x0.auxiliary[mask].view(-1, 1), + x0.qe[mask].view(-1, 1), + x0.charge[mask].view(-1, 1), + x0.ice_properties[mask], + ], + dim=1, + ) + Lmax = mask.sum(-1).max() + x = self.extractor(x0, Lmax) + rel_pos_bias, rel_enc = self.rel_pos(x0, Lmax) + # nbs = get_nbs(x0, Lmax) + mask = mask[:, :Lmax] + batch_index = mask.nonzero()[:, 0] + edge_index = knn_graph(x=graph_feature[:, :self.knn_features], k=8, batch=batch_index).to( + mask.device + ) + graph_feature = self.local_root( + graph_feature, edge_index, batch_index, x0.n_pulses + ) + graph_feature, _ = to_dense_batch(graph_feature, batch_index) + + B, _ = mask.shape + attn_mask = torch.zeros(mask.shape, device=mask.device) + attn_mask[~mask] = -torch.inf + x = torch.cat([x, graph_feature], 2) + + for blk in self.sandwich: + x = blk(x, attn_mask, rel_pos_bias) + if self.knn_features == 3: + rel_pos_bias = None + mask = torch.cat( + [torch.ones(B, 1, dtype=mask.dtype, device=mask.device), mask], 1 + ) + attn_mask = torch.zeros(mask.shape, device=mask.device) + attn_mask[~mask] = -torch.inf + cls_token = self.cls_token.weight.unsqueeze(0).expand(B, -1, -1) + x = torch.cat([cls_token, x], 1) + + for blk in self.blocks: + if self.use_checkpoint: + x = checkpoint.checkpoint(blk, x, None, attn_mask) + else: + x = blk(x, None, attn_mask) + + #x = self.proj_out(x[:, 0]) # cls token + return x[:, 0] \ No newline at end of file diff --git a/src/graphnet/models/graphs/nodes/nodes.py b/src/graphnet/models/graphs/nodes/nodes.py index fa0400b97..6e38fa145 100644 --- a/src/graphnet/models/graphs/nodes/nodes.py +++ b/src/graphnet/models/graphs/nodes/nodes.py @@ -2,6 +2,7 @@ from typing import List, Tuple, Optional from abc import abstractmethod +import numpy as np import torch from torch_geometric.data import Data @@ -11,6 +12,7 @@ from graphnet.models.graphs.utils import ( cluster_summarize_with_percentiles, identify_indices, + ice_transparency, ) from copy import deepcopy @@ -211,3 +213,102 @@ def _construct_nodes(self, x: torch.Tensor) -> Data: raise AttributeError return Data(x=torch.tensor(array)) + +class IceMixNodes(NodeDefinition): + + def __init__( + self, + input_feature_names: Optional[List[str]] = None, + max_pulses: int = 384, + ) -> None: + + super().__init__(input_feature_names=input_feature_names) + + if input_feature_names is None: + input_feature_names = ["dom_x", + "dom_y", + 
"dom_z", + "dom_time", + "charge", + "hlc", + "rde"] + + self.all_features = ["dom_x", + "dom_y", + "dom_z", + "dom_time", + "charge", + "hlc", + "rde", + "ice_properties", + "mask"] + + missing_features = set(self.all_features) - set(input_feature_names) + if any(feat in missing_features for feat in self.all_features[:7]): + raise ValueError("Features dom_x, dom_y, dom_z, dom_time, charge, hlc, rde are required for IceMixNodes") + + self.feature_indexes = {feat: self.all_features.index(feat) for feat in input_feature_names} + + + + self.max_length = max_pulses + + + + def _define_output_feature_names( + self, + input_feature_names: List[str] + ) -> List[str]: + return self.all_features + + def _calculate_ice_transparency(x: torch.Tensor) -> torch.Tensor: + f_s, f_a = ice_transparency(x) + x = f_s / (f_s + f_a) + return x + + def _calculate_new_features(x: torch.Tensor) -> torch.Tensor: + x = _calculate_ice_transparency(x) + return x + + def _construct_nodes(self, x: torch.Tensor) -> Tuple[Data, List[str]]: + + graph = torch.zeros(self.max_length, len(self.all_features)) + + n_pulses = x.shape[0] + event_length = n_pulses + #qe = torch.zeros(self.max_length) + hlc = x[:, self.feature_indexes["hlc"]] + + if event_length < self.max_length: + graph[:,3] = np.pad(x[:, self.feature_indexes["dom_time"]], (0, max(0, self.max_length - event_length))) # dom-time + graph[:,4] = np.pad(x[:, self.feature_indexes["charge"]], (0, max(0, self.max_length - event_length))) # charge + graph[:,5] = np.pad(torch.logical_not(x[:, self.feature_indexes["hlc"]]), (0, max(0, self.max_length - event_length))) # hlc + + + #random_sampling = False + else: + ids = torch.randperm(event_length) + hlc = x[:, self.feature_indexes["hlc"]] + auxiliary_n = torch.nonzero(hlc == 1).squeeze(1) + auxiliary_p = torch.nonzero(hlc == 0).squeeze(1) + ids_n = ids[auxiliary_n][: min(self.max_length, len(auxiliary_n))] + ids_p = ids[auxiliary_p][: min(self.max_length - len(ids_n), len(auxiliary_p))] + ids = np.concatenate([ids_n, ids_p]) + ids.sort() + graph[:,3] = x[:, self.feature_indexes["dom_time"]][ids] # dom_time + graph[:,4] = x[:, self.feature_indexes["charge"]][ids] # charge + graph[:,5] = torch.logical_not(hlc)[ids] # hlc + event_length = len(ids) + + #qe[:n_pulses] = np.pad(qe, (0, max(0, self.max_length - n_pulses))) + + #random_sampling = True + + graph[:event_length,8] = torch.ones_like(event_length) # mask + graph[:event_length,:3] = x[:, [self.feature_indexes["dom_x"], + self.feature_indexes["dom_y"], + self.feature_indexes["dom_z"]]] + + + #x = _calculate_new_features(x) + return Data(x=graph) \ No newline at end of file diff --git a/src/graphnet/models/graphs/utils.py b/src/graphnet/models/graphs/utils.py index ccd861783..c1f143d33 100644 --- a/src/graphnet/models/graphs/utils.py +++ b/src/graphnet/models/graphs/utils.py @@ -1,7 +1,12 @@ """Utility functions for construction of graphs.""" from typing import List, Tuple +import os import numpy as np +import pandas as pd +from scipy.interpolate import interp1d +from sklearn.preprocessing import RobustScaler +from graphnet.constants import DATA_DIR def lex_sort(x: np.array, cluster_columns: List[int]) -> np.ndarray: @@ -158,3 +163,36 @@ def cluster_summarize_with_percentiles( ) return array + + +def ice_transparency(datum: int = 1950): + """Calculate the normalized scattering and absorption lengths + of ice as a function of depth. + + Args: + datum: The datum depth in meters. + Default to 1950. 
+ + Returns: + f_scattering: Function that takes a normalized depth + and returns the corresponding normalized + scattering length. + f_absorption: Function that takes a normalized depth + and returns the corresponding normalized + absorption length. + """ + # Data from page 31 of https://arxiv.org/pdf/1301.5361.pdf + # Datum is from footnote 8 of page 29 + df = pd.read_parquet( + os.path.join(DATA_DIR, "ice_properties/ice_transparency.txt"), + delim_whitespace=True + ) + df["z"] = df["depth"] - datum + df["z_norm"] = df["z"] / 500 + df[["scattering_len_norm", "absorption_len_norm"]] = RobustScaler().fit_transform( + df[["scattering_len", "absorption_len"]] + ) + + f_scattering = interp1d(df["z_norm"], df["scattering_len_norm"]) + f_absorption = interp1d(df["z_norm"], df["absorption_len_norm"]) + return f_scattering, f_absorption From 15b64a2e399e238706e5e4f8d9e07e1aebc30351 Mon Sep 17 00:00:00 2001 From: "askerosted@gmail.com" Date: Fri, 26 Jan 2024 14:07:28 +0900 Subject: [PATCH 017/124] embedding small change + docstring --- src/graphnet/models/components/embedding.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/src/graphnet/models/components/embedding.py b/src/graphnet/models/components/embedding.py index c29f5562f..de29a6fe5 100644 --- a/src/graphnet/models/components/embedding.py +++ b/src/graphnet/models/components/embedding.py @@ -3,24 +3,30 @@ class SinusoidalPosEmb(torch.nn.Module): - """Sinusoidal positional embedding layer.""" + """Sinusoidal positional embedding layer. - def __init__(self, dim: int = 16, M: int = 10000) -> None: + This module is from the kaggle competition 2nd place solution (see + arXiv:2310.15674): It performs what is called Fourier encoding or it's used + in the Attention is all you need arXiv:1706.03762. It can be seen as a soft + digitization of the input data + """ + + def __init__(self, dim: int = 16, m: int = 10000) -> None: """Construct `SinusoidalPosEmb`. Args: dim: Embedding dimension. - M: Number of frequencies. + m: Number of frequencies. """ super().__init__() self.dim = dim - self.M = M + self.m = m def forward(self, x: torch.Tensor) -> torch.Tensor: """Apply learnable forward pass to the layer.""" device = x.device half_dim = self.dim - emb = torch.log(torch.tensor(self.M, device=device)) / half_dim + emb = torch.log(torch.tensor(self.m, device=device)) / half_dim emb = torch.exp(torch.arange(half_dim, device=device) * (-emb)) emb = x[..., None] * emb[None, ...] emb = torch.cat((emb.sin(), emb.cos()), dim=-1) From 5d79ce7749a72df8b8979e7a5d5e9bc2366fc69d Mon Sep 17 00:00:00 2001 From: "askerosted@gmail.com" Date: Fri, 26 Jan 2024 14:07:58 +0900 Subject: [PATCH 018/124] embedding small change + docstring --- src/graphnet/models/components/embedding.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/graphnet/models/components/embedding.py b/src/graphnet/models/components/embedding.py index de29a6fe5..da39716a5 100644 --- a/src/graphnet/models/components/embedding.py +++ b/src/graphnet/models/components/embedding.py @@ -25,7 +25,7 @@ def __init__(self, dim: int = 16, m: int = 10000) -> None: def forward(self, x: torch.Tensor) -> torch.Tensor: """Apply learnable forward pass to the layer.""" device = x.device - half_dim = self.dim + half_dim = self.dim // 2 emb = torch.log(torch.tensor(self.m, device=device)) / half_dim emb = torch.exp(torch.arange(half_dim, device=device) * (-emb)) emb = x[..., None] * emb[None, ...] 
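With the `half_dim = self.dim // 2` change above, the sine and cosine halves
together span exactly `dim` features. A minimal sketch of the intended shape
contract (illustration only, assuming the module is importable as shown):

    import torch
    from graphnet.models.components.embedding import SinusoidalPosEmb

    emb = SinusoidalPosEmb(dim=16, m=10000)
    x = torch.rand(2, 128)   # e.g. (batch, pulses) of scaled input values
    out = emb(x)             # sin/cos halves concatenated along the last axis
    assert out.shape == (2, 128, 16)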
From f2fbfd0bde95a0dc264bb1d4352e633b61be0e3c Mon Sep 17 00:00:00 2001 From: ArturoLlorente Date: Fri, 26 Jan 2024 11:40:51 +0100 Subject: [PATCH 019/124] node feature calculation implemented --- src/graphnet/models/graphs/nodes/nodes.py | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/src/graphnet/models/graphs/nodes/nodes.py b/src/graphnet/models/graphs/nodes/nodes.py index 6e38fa145..3fca41de5 100644 --- a/src/graphnet/models/graphs/nodes/nodes.py +++ b/src/graphnet/models/graphs/nodes/nodes.py @@ -265,14 +265,10 @@ def _calculate_ice_transparency(x: torch.Tensor) -> torch.Tensor: f_s, f_a = ice_transparency(x) x = f_s / (f_s + f_a) return x - - def _calculate_new_features(x: torch.Tensor) -> torch.Tensor: - x = _calculate_ice_transparency(x) - return x def _construct_nodes(self, x: torch.Tensor) -> Tuple[Data, List[str]]: - graph = torch.zeros(self.max_length, len(self.all_features)) + graph = torch.zeros([self.max_length, len(self.all_features)]) n_pulses = x.shape[0] event_length = n_pulses @@ -283,7 +279,10 @@ def _construct_nodes(self, x: torch.Tensor) -> Tuple[Data, List[str]]: graph[:,3] = np.pad(x[:, self.feature_indexes["dom_time"]], (0, max(0, self.max_length - event_length))) # dom-time graph[:,4] = np.pad(x[:, self.feature_indexes["charge"]], (0, max(0, self.max_length - event_length))) # charge graph[:,5] = np.pad(torch.logical_not(x[:, self.feature_indexes["hlc"]]), (0, max(0, self.max_length - event_length))) # hlc - + graph[:event_length,:3] = x[:, [self.feature_indexes["dom_x"], + self.feature_indexes["dom_y"], + self.feature_indexes["dom_z"]]] + graph[:,6] = x[:, self.feature_indexes["rde"]] #random_sampling = False else: @@ -295,9 +294,13 @@ def _construct_nodes(self, x: torch.Tensor) -> Tuple[Data, List[str]]: ids_p = ids[auxiliary_p][: min(self.max_length - len(ids_n), len(auxiliary_p))] ids = np.concatenate([ids_n, ids_p]) ids.sort() - graph[:,3] = x[:, self.feature_indexes["dom_time"]][ids] # dom_time - graph[:,4] = x[:, self.feature_indexes["charge"]][ids] # charge + graph[:,3] = x[ids, self.feature_indexes["dom_time"]] # dom_time + graph[:,4] = x[ids, self.feature_indexes["charge"]] # charge graph[:,5] = torch.logical_not(hlc)[ids] # hlc + graph[:,:3] = x[ids, [self.feature_indexes["dom_x"], + self.feature_indexes["dom_y"], + self.feature_indexes["dom_z"]]] + graph[ids,6] = x[ids, self.feature_indexes["rde"]] event_length = len(ids) #qe[:n_pulses] = np.pad(qe, (0, max(0, self.max_length - n_pulses))) @@ -305,9 +308,7 @@ def _construct_nodes(self, x: torch.Tensor) -> Tuple[Data, List[str]]: #random_sampling = True graph[:event_length,8] = torch.ones_like(event_length) # mask - graph[:event_length,:3] = x[:, [self.feature_indexes["dom_x"], - self.feature_indexes["dom_y"], - self.feature_indexes["dom_z"]]] + #x = _calculate_new_features(x) From 9eee9eeb6657db635b13616627d4e729c1355e69 Mon Sep 17 00:00:00 2001 From: ArturoLlorente Date: Fri, 26 Jan 2024 14:55:41 +0100 Subject: [PATCH 020/124] node definition finished. 
researchign weigt initialization process --- src/graphnet/models/gnn/__init__.py | 1 + src/graphnet/models/gnn/icemix.py | 4 ++ src/graphnet/models/graphs/nodes/nodes.py | 62 ++++++++--------------- 3 files changed, 27 insertions(+), 40 deletions(-) diff --git a/src/graphnet/models/gnn/__init__.py b/src/graphnet/models/gnn/__init__.py index 2abe3d358..60b0aee95 100644 --- a/src/graphnet/models/gnn/__init__.py +++ b/src/graphnet/models/gnn/__init__.py @@ -4,3 +4,4 @@ from .dynedge import DynEdge from .dynedge_jinst import DynEdgeJINST from .dynedge_kaggle_tito import DynEdgeTITO +from .icemix import DeepIceModel, EncoderWithDirectionReconstruction diff --git a/src/graphnet/models/gnn/icemix.py b/src/graphnet/models/gnn/icemix.py index 7198efab1..44c27884f 100644 --- a/src/graphnet/models/gnn/icemix.py +++ b/src/graphnet/models/gnn/icemix.py @@ -49,6 +49,8 @@ def __init__( self.apply(self._init_weights) trunc_normal_(self.cls_token.weight, std=0.02) self.n_rel = n_rel + + super().__init__(dim_base, dim) def fix_init_weight(self): def rescale(param, layer_id): @@ -162,6 +164,8 @@ def __init__( ) self.apply(self._init_weights) trunc_normal_(self.cls_token.weight, std=0.02) + + super().__init__(dim_base, dim) def fix_init_weight(self): def rescale(param, layer_id): diff --git a/src/graphnet/models/graphs/nodes/nodes.py b/src/graphnet/models/graphs/nodes/nodes.py index 3fca41de5..78f5faa24 100644 --- a/src/graphnet/models/graphs/nodes/nodes.py +++ b/src/graphnet/models/graphs/nodes/nodes.py @@ -240,7 +240,8 @@ def __init__( "charge", "hlc", "rde", - "ice_properties", + "scatt_lenght", + "abs_lenght" "mask"] missing_features = set(self.all_features) - set(input_feature_names) @@ -248,23 +249,24 @@ def __init__( raise ValueError("Features dom_x, dom_y, dom_z, dom_time, charge, hlc, rde are required for IceMixNodes") self.feature_indexes = {feat: self.all_features.index(feat) for feat in input_feature_names} - - - + self.input_feature_names = input_feature_names self.max_length = max_pulses - - def _define_output_feature_names( self, input_feature_names: List[str] ) -> List[str]: return self.all_features - def _calculate_ice_transparency(x: torch.Tensor) -> torch.Tensor: - f_s, f_a = ice_transparency(x) - x = f_s / (f_s + f_a) - return x + def _add_ice_properties(self, + graph: torch.Tensor, + x: torch.Tensor, + ids: List[int]) -> torch.Tensor: + + f_scattering, f_absoprtion = ice_transparency() + graph[:len(ids),7] = f_scattering(x[ids, self.feature_indexes["dom_z"]]) + graph[:len(ids),8] = f_absoprtion(x[ids, self.feature_indexes["dom_z"]]) + return graph def _construct_nodes(self, x: torch.Tensor) -> Tuple[Data, List[str]]: @@ -272,44 +274,24 @@ def _construct_nodes(self, x: torch.Tensor) -> Tuple[Data, List[str]]: n_pulses = x.shape[0] event_length = n_pulses - #qe = torch.zeros(self.max_length) - hlc = x[:, self.feature_indexes["hlc"]] + x[:, self.feature_indexes["hlc"]] = torch.logical_not(x[:, self.feature_indexes["hlc"]]) if event_length < self.max_length: - graph[:,3] = np.pad(x[:, self.feature_indexes["dom_time"]], (0, max(0, self.max_length - event_length))) # dom-time - graph[:,4] = np.pad(x[:, self.feature_indexes["charge"]], (0, max(0, self.max_length - event_length))) # charge - graph[:,5] = np.pad(torch.logical_not(x[:, self.feature_indexes["hlc"]]), (0, max(0, self.max_length - event_length))) # hlc - graph[:event_length,:3] = x[:, [self.feature_indexes["dom_x"], - self.feature_indexes["dom_y"], - self.feature_indexes["dom_z"]]] - graph[:,6] = x[:, self.feature_indexes["rde"]] - - 
#random_sampling = False + ids = torch.arange(event_length) else: ids = torch.randperm(event_length) - hlc = x[:, self.feature_indexes["hlc"]] - auxiliary_n = torch.nonzero(hlc == 1).squeeze(1) - auxiliary_p = torch.nonzero(hlc == 0).squeeze(1) + auxiliary_n = torch.nonzero(x[:, self.feature_indexes["hlc"]] == 0).squeeze(1) + auxiliary_p = torch.nonzero(x[:, self.feature_indexes["hlc"]] == 1).squeeze(1) ids_n = ids[auxiliary_n][: min(self.max_length, len(auxiliary_n))] ids_p = ids[auxiliary_p][: min(self.max_length - len(ids_n), len(auxiliary_p))] - ids = np.concatenate([ids_n, ids_p]) - ids.sort() - graph[:,3] = x[ids, self.feature_indexes["dom_time"]] # dom_time - graph[:,4] = x[ids, self.feature_indexes["charge"]] # charge - graph[:,5] = torch.logical_not(hlc)[ids] # hlc - graph[:,:3] = x[ids, [self.feature_indexes["dom_x"], - self.feature_indexes["dom_y"], - self.feature_indexes["dom_z"]]] - graph[ids,6] = x[ids, self.feature_indexes["rde"]] + ids = np.concatenate([ids_n, ids_p]).sort() + #ids.sort() event_length = len(ids) - #qe[:n_pulses] = np.pad(qe, (0, max(0, self.max_length - n_pulses))) - - #random_sampling = True - - graph[:event_length,8] = torch.ones_like(event_length) # mask + for idx, feature in enumerate(self.all_features[:7]): + graph[:event_length, idx] = x[ids, self.feature_indexes[feature]] + graph = self._add_ice_properties(graph, x, ids) #ice properties + graph[:event_length,9] = torch.ones_like(event_length) # mask - - #x = _calculate_new_features(x) return Data(x=graph) \ No newline at end of file From ce9302b59bcbb542706f00803601c69780b55ecf Mon Sep 17 00:00:00 2001 From: Rasmus Oersoe Date: Sun, 28 Jan 2024 15:09:39 +0100 Subject: [PATCH 021/124] restructure --- src/graphnet/data/dataclasses.py | 10 + src/graphnet/data/dataconverter_new.py | 254 +++++++++++++++++++++ src/graphnet/data/extractors/__init__.py | 1 + src/graphnet/data/extractors/extractor.py | 107 +++++++++ src/graphnet/data/readers.py | 265 ++++++++++++++++++++++ src/graphnet/data/writers.py | 59 +++++ 6 files changed, 696 insertions(+) create mode 100644 src/graphnet/data/dataclasses.py create mode 100644 src/graphnet/data/dataconverter_new.py create mode 100644 src/graphnet/data/extractors/extractor.py create mode 100644 src/graphnet/data/readers.py create mode 100644 src/graphnet/data/writers.py diff --git a/src/graphnet/data/dataclasses.py b/src/graphnet/data/dataclasses.py new file mode 100644 index 000000000..98b837693 --- /dev/null +++ b/src/graphnet/data/dataclasses.py @@ -0,0 +1,10 @@ +"""Module containing experiment-specific dataclasses.""" + + +from dataclasses import dataclass + + +@dataclass +class I3FileSet: # noqa: D101 + i3_file: str + gcd_file: str diff --git a/src/graphnet/data/dataconverter_new.py b/src/graphnet/data/dataconverter_new.py new file mode 100644 index 000000000..0456d26c8 --- /dev/null +++ b/src/graphnet/data/dataconverter_new.py @@ -0,0 +1,254 @@ +"""Contains `DataConverter`.""" +from typing import List, Union, OrderedDict, Dict, Tuple, Any, Optional, Type +from abc import abstractmethod, ABC + +from tqdm import tqdm +import numpy as np +from multiprocessing import Manager, Pool, Value +import multiprocessing.pool +from multiprocessing.sharedctypes import Synchronized +import pandas as pd +import os + +from graphnet.utilities.decorators import final +from graphnet.utilities.logging import Logger +from .readers import GraphNeTFileReader +from .writers import GraphNeTFileSaveMethod +from .extractors import Extractor +from .dataclasses import I3FileSet + + +def 
init_global_index(index: Synchronized, output_files: List[str]) -> None: + """Make `global_index` available to pool workers.""" + global global_index, global_output_files # type: ignore[name-defined] + global_index, global_output_files = (index, output_files) # type: ignore[name-defined] + + +class DataConverter(ABC, Logger): + """A finalized data conversion class in GraphNeT. + + `DataConverter` provides parallel processing of file conversion and + extraction from experiment-specific file formats to graphnet-supported data + formats. This class also assigns event id's to training examples. + """ + + def __init__( + self, + file_reader: Type[GraphNeTFileReader], + save_method: Type[GraphNeTFileSaveMethod], + extractors: Union[Type[Extractor], List[Type[Extractor]]], + index_column: str = "event_no", + num_workers: int = 1, + ) -> None: + """Initialize `DataConverter`. + + Args: + file_reader: The method used for reading and applying `Extractors`. + save_method: The method used to save the interim data format to + a graphnet supported file format. + extractors: The `Extractor`(s) that will be applied to the input + files. + index_column: Name of the event id column added to the events. + Defaults to "event_no". + num_workers: The number of CPUs used for parallel processing. + Defaults to 1 (no multiprocessing). + """ + # Member Variable Assignment + self._file_reader = file_reader + self._save_method = save_method + self._num_workers = num_workers + self._index_column = index_column + self._index = 0 + self._output_files: List[str] = [] + + # Set Extractors. Will throw error if extractors are incompatible + # with reader. + self._file_reader.set_extractors(extractors) + + @final + def __call__( + self, input_dir: Union[str, List[str]], output_dir: str + ) -> None: + """Extract data from files in `input_dir` and save to disk. + + Args: + input_dir: A directory that contains the input files. + The directory will be searched recursively for files + matching the file extension. + output_dir: The directory to save the files to. Input folder + structure is not respected. + """ + # Get the file reader to produce a list of input files + # in the directory + input_files = self._file_reader.find_files(path=input_dir) # type: ignore + self._launch_jobs(input_files=input_files, output_dir=output_dir) + + @final + def _launch_jobs( + self, input_files: Union[List[str], List[I3FileSet]] + ) -> None: + """Multi Processing Logic. + + Spawns worker pool, + distributes the input files evenly across workers. + declare event_no as globally accessible variable across workers. + starts jobs. + + Will call process_file in parallel. + """ + # Get appropriate mapping function + map_fn, pool = self.get_map_function(nb_files=len(input_files)) + + # Iterate over files + for _ in map_fn( + self._process_file, + tqdm(input_files, unit="file(s)", colour="green"), + ): + self.debug("processing file.") + + self._update_shared_variables(pool) + + @final + def _process_file(self, file_path: str) -> None: + """Process a single file. + + Calls file reader to recieve extracted output, event ids + is assigned to the extracted data and is handed to save method. + + This function is called in parallel. 
+ """ + # Read and apply extractors + data = self._file_reader(file_path=file_path) + + # Assign event_no's to each event in data + data = self._assign_event_no(data=data) + + # Create output file name + output_file_name = self._create_file_name(input_file_path=file_path) + + # Apply save method + self._save_method(data=data, file_name=output_file_name) + + @final + def _create_file_name(self, input_file_path: str) -> str: + """Convert input file path to an output file name.""" + path_without_extension = os.path.splitext(input_file_path)[0] + base_file_name = path_without_extension.split("/")[-1] + return base_file_name + self._save_method.file_extension() # type: ignore + + @final + def _assign_event_no( + self, data: List[OrderedDict[str, Any]] + ) -> Dict[str, pd.DataFrame]: + + # Request event_no's for the entire file + event_nos = self._request_event_nos(n_ids=len(data)) + + # Dict holding pd.DataFrame's + dataframe_dict: Dict = {} + # Loop through events (again..) to assign event_nos + for k in range(len(data)): + for extractor_name in data[k].keys(): + n_rows = self._count_rows( + event_dict=data[k], extractor_name=extractor_name + ) + + data[k][extractor_name][self._index_column] = np.repeat( + event_nos[k], n_rows + ).tolist() + df = pd.DataFrame( + data[k][extractor_name], index=[0] if n_rows == 1 else None + ) + if extractor_name in dataframe_dict.keys(): + dataframe_dict[extractor_name].append(df) + else: + dataframe_dict[extractor_name] = [df] + + return dataframe_dict + + @final + def _count_rows( + self, event_dict: OrderedDict[str, Any], extractor_name: str + ) -> int: + """Count number of rows that features from `extractor_name` have.""" + try: + extractor_dict = event_dict[extractor_name] + # If all features in extractor_name have the same length + # this line of code will execute without error and result + # in an array with shape [num_features, n_rows_in_feature] + n_rows = np.asarray(list(extractor_dict.values())).shape[1] + except ValueError as e: + self.error( + f"Features from {extractor_name} ({extractor_dict.keys()}) have different lengths." + ) + raise e + + return n_rows + + def _request_event_nos(self, n_ids: int) -> List[int]: + + # Get new, unique index and increment value + if self._num_workers > 1: + with global_index.get_lock(): # type: ignore[name-defined] + starting_index = global_index.value # type: ignore[name-defined] + event_nos = np.arange( + starting_index, starting_index + n_ids, 1 + ).tolist() + global_index.value += n_ids # type: ignore[name-defined] + else: + starting_index = self._index + event_nos = np.arange( + starting_index, starting_index + n_ids, 1 + ).tolist() + self._index += n_ids + + return event_nos + + @final + def get_map_function( + self, nb_files: int, unit: str = "file(s)" + ) -> Tuple[Any, Optional[multiprocessing.pool.Pool]]: + """Identify map function to use (pure python or multiprocess).""" + # Choose relevant map-function given the requested number of workers. 
+ n_workers = min(self._num_workers, nb_files) + if n_workers > 1: + self.info( + f"Starting pool of {n_workers} workers to process {nb_files} {unit}" + ) + + manager = Manager() + index = Value("i", 0) + output_files = manager.list() + + pool = Pool( + processes=n_workers, + initializer=init_global_index, + initargs=(index, output_files), + ) + map_fn = pool.imap + + else: + self.info( + f"Processing {nb_files} {unit} in main thread (not multiprocessing)" + ) + map_fn = map # type: ignore + pool = None + + return map_fn, pool + + @final + def _update_shared_variables( + self, pool: Optional[multiprocessing.pool.Pool] + ) -> None: + """Update `self._index` and `self._output_files`. + + If `pool` is set, it means that multiprocessing was used. In this case, + the worker processes will not have been able to write directly to + `self._index` and `self._output_files`, and we need to get them synced + up. + """ + if pool: + # Extract information from shared variables to member variables. + index, output_files = pool._initargs # type: ignore + self._index += index.value + self._output_files.extend(list(sorted(output_files[:]))) diff --git a/src/graphnet/data/extractors/__init__.py b/src/graphnet/data/extractors/__init__.py index e1d4895bf..ec0ecfe5e 100644 --- a/src/graphnet/data/extractors/__init__.py +++ b/src/graphnet/data/extractors/__init__.py @@ -18,3 +18,4 @@ from .i3pisaextractor import I3PISAExtractor from .i3ntmuonlabelsextractor import I3NTMuonLabelExtractor from .i3quesoextractor import I3QUESOExtractor +from .extractor import Extractor diff --git a/src/graphnet/data/extractors/extractor.py b/src/graphnet/data/extractors/extractor.py new file mode 100644 index 000000000..795d05cf1 --- /dev/null +++ b/src/graphnet/data/extractors/extractor.py @@ -0,0 +1,107 @@ +"""Base I3Extractor class(es).""" + +from abc import ABC, abstractmethod +from typing import TYPE_CHECKING, Any, Dict, List, Optional + +from graphnet.utilities.imports import has_icecube_package +from graphnet.utilities.logging import Logger + +if has_icecube_package() or TYPE_CHECKING: + from icecube import icetray, dataio # pyright: reportMissingImports=false + + +class Extractor(ABC, Logger): + """Base class for extracting information from data files. + + All classes inheriting from `Extractor` should implement the `__call__` + method, and should return a pure python dictionary on the form + + output = [{'var1: .., + ... , + 'var_n': ..}] + + Variables can be scalar or array-like of shape [n, 1], where n denotes the + number of elements in the array, and 1 the number of columns. + + An extractor is used in conjunction with a specific `FileReader`. + """ + + def __init__(self, extractor_name: str): + """Construct Extractor. + + Args: + extractor_name: Name of the `Extractor` instance. Used to keep track of the + provenance of different data, and to name tables to which this + data is saved. E.g. "mc_truth". + """ + # Member variable(s) + self._extractor_name: str = extractor_name + + # Base class constructor + super().__init__(name=__name__, class_name=self.__class__.__name__) + + @abstractmethod + def __call__(self, frame: "icetray.I3Frame") -> dict: + """Extract information from frame.""" + pass + + @property + def name(self) -> str: + """Get the name of the `I3Extractor` instance.""" + return self._extractor_name + + +class I3Extractor(Extractor): + """Base class for extracting information from physics I3-frames. + + Contains functionality required to extract data from i3 files, used by + the IceCube Neutrino Observatory. 
+ + All classes inheriting from `I3Extractor` should implement the `__call__` + method. + """ + + def __init__(self, extractor_name: str): + """Construct I3Extractor. + + Args: + extractor_name: Name of the `I3Extractor` instance. Used to keep track of the + provenance of different data, and to name tables to which this + data is saved. + """ + # Member variable(s) + self._i3_file: str = "" + self._gcd_file: str = "" + self._gcd_dict: Dict[int, Any] = {} + self._calibration: Optional["icetray.I3Frame.Calibration"] = None + + # Base class constructor + super().__init__(extractor_name=extractor_name) + + def set_gcd(self, gcd_file: str, i3_file: str) -> None: + """Load the geospatial information contained in the GCD-file.""" + # If no GCD file is provided, search the I3 file for frames containing + # geometry (G) and calibration (C) information. + gcd = dataio.I3File(gcd_file or i3_file) + + try: + g_frame = gcd.pop_frame(icetray.I3Frame.Geometry) + except RuntimeError: + self.error( + "No GCD file was provided and no G-frame was found. Exiting." + ) + raise + else: + self._gcd_dict = g_frame["I3Geometry"].omgeo + + try: + c_frame = gcd.pop_frame(icetray.I3Frame.Calibration) + except RuntimeError: + self.warning("No GCD file was provided and no C-frame was found.") + else: + self._calibration = c_frame["I3Calibration"] + + @abstractmethod + def __call__(self, frame: "icetray.I3Frame") -> dict: + """Extract information from frame.""" + pass diff --git a/src/graphnet/data/readers.py b/src/graphnet/data/readers.py new file mode 100644 index 000000000..487ca99f7 --- /dev/null +++ b/src/graphnet/data/readers.py @@ -0,0 +1,265 @@ +"""Module containing different FileReader classes in GraphNeT. + +These methods are used to open and apply `Extractors` to experiment-specific +file formats. +""" + +from typing import List, Union, OrderedDict, Type +from abc import abstractmethod, ABC +import glob +import os + +from graphnet.utilities.decorators import final +from graphnet.utilities.logging import Logger +from graphnet.utilities.imports import has_icecube_package +from graphnet.data.filters import I3Filter, NullSplitI3Filter + +from .dataclasses import I3FileSet + +from .extractors.extractor import ( + Extractor, + I3Extractor, +) # , I3GenericExtractor +from graphnet.utilities.filesys import find_i3_files + +if has_icecube_package(): + from icecube import icetray, dataio # pyright: reportMissingImports=false + + +class GraphNeTFileReader(Logger, ABC): + """A generic base class for FileReaders in GraphNeT. + + Classes inheriting from `GraphNeTFileReader` must implement a + `__call__` method that opens a file, applies `Extractor`(s) and returns + a list of ordered dictionaries. + + In addition, Classes inheriting from `GraphNeTFileReader` must set + class properties `accepted_file_extensions` and `accepted_extractors`. + """ + + @abstractmethod + def __call__(self, file_path: str) -> List[OrderedDict]: + """Open and apply extractors to a single file. + + The `output` must be a list of dictionaries, where the number of events + in the file `n_events` satisfies `len(output) = n_events`. I.e each + element in the list is a dictionary, and each field in the dictionary + is the output of a single extractor. 
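+
+        A hypothetical example with two extractors ("mc_truth" and
+        "pulse_map") applied to a file containing two events could look like:
+
+            [{"mc_truth": {...}, "pulse_map": {...}},
+             {"mc_truth": {...}, "pulse_map": {...}}]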
+ """ + + @property + def accepted_file_extensions(self) -> List[str]: + """Return list of accepted file extensions.""" + return self._accepted_file_extensions # type: ignore + + @property + def accepted_extractors(self) -> List[Extractor]: + """Return list of compatible `Extractor`(s).""" + return self._accepted_extractors # type: ignore + + @property + def extracor_names(self) -> List[str]: + """Return list of table names produced by extractors.""" + return [extractor.name for extractor in self._extractors] # type: ignore + + def find_files( + self, path: Union[str, List[str]] + ) -> Union[List[str], List[I3FileSet]]: + """Search directory for input files recursively. + + This method may be overwritten by custom implementations. + + Args: + path: path to directory. + + Returns: + List of files matching accepted file extensions. + """ + if isinstance(path, str): + path = [path] + files = [] + for dir in path: + for accepted_file_extension in self.accepted_file_extensions: + files.extend(glob.glob(dir + f"/*{accepted_file_extension}")) + + # Check that files are OK. + self.validate_files(files) + return files + + @final + def set_extractors(self, extractors: List[Extractor]) -> None: + """Set `Extractor`(s) as member variable. + + Args: + extractors: A list of `Extractor`(s) to set as member variable. + """ + self._validate_extractors(extractors) + self._extractors = extractors + + @final + def _validate_extractors(self, extractors: List[Extractor]) -> None: + for extractor in extractors: + try: + assert isinstance(extractor, tuple(self.accepted_extractors)) # type: ignore + except AssertionError as e: + self.error( + f"{extractor.__class__.__name__} is not supported by {self.__class__.__name__}" + ) + raise e + + @final + def validate_files( + self, input_files: Union[List[str], List[I3FileSet]] + ) -> None: + """Check that the input files are accepted by the reader. + + Args: + input_files: Path(s) to input file(s). + """ + for input_file in input_files: + # Handle filepath vs. FileSet cases + if isinstance(input_file, I3FileSet): + self._validate_file(input_file.i3_file) + self._validate_file(input_file.gcd_file) + else: + self._validate_file(input_file) + + @final + def _validate_file(self, file: str) -> None: + """Validate a single file path. + + Args: + file: path to file. + + Returns: + None + """ + try: + assert file.lower().endswith(tuple(self.accepted_file_extensions)) + except AssertionError: + self.error( + f'{self.__class__.__name__} accepts {self.accepted_file_extensions} but {file.split("/")[-1]} has extension {os.path.splitext(file)[1]}.' + ) + + +class I3Reader(GraphNeTFileReader): + """A class for reading .i3 files from the IceCube Neutrino Observatory. + + Note that this class relies on IceCube-specific software, and therefore + must be run in a software environment that contains IceTray. + """ + + def __init__( + self, + gcd_rescue: str, + i3_filters: Union[ + Type[I3Filter], List[Type[I3Filter]] + ] = NullSplitI3Filter, + icetray_verbose: int = 0, + ): + """Initialize `I3Reader`. + + Args: + gcd_rescue: Path to a GCD file that will be used if no GCD file is + found in subfolder. `I3Reader` will recursively search + the input directory for I3-GCD file pairs. By IceCube + convention, a folder containing i3 files will have an + accompanying GCD file. However, in some cases, this + convention is broken. In cases where a folder contains + i3 files but no GCD file, the `gcd_rescue` is used + instead. + i3_filters: Instances of `I3Filter` to filter PFrames. 
Defaults to + `NullSplitI3Filter`. + icetray_verbose: Set the level of verbosity of icetray. + Defaults to 0. + """ + # Set verbosity + if icetray_verbose == 0: + icetray.I3Logger.global_logger = icetray.I3NullLogger() + + # Set Member Variables + self._accepted_file_extensions = [".bz2", ".zst", ".gz"] + self._accepted_extractors = [I3Extractor] + self._gcd_rescue = gcd_rescue + self._i3filters = ( + i3_filters if isinstance(i3_filters, list) else [i3_filters] + ) + + # Base class constructor + super().__init__(name=__name__, class_name=self.__class__.__name__) + + def __call__(self, file_path: I3FileSet) -> List[OrderedDict]: # type: ignore + """Extract data from single I3 file. + + Args: + fileset: Path to I3 file and corresponding GCD file. + + Returns: + Extracted data. + """ + # Set I3-GCD file pair in extractor + for extractor in self._extractors: + extractor.set_files(file_path.i3_file, file_path.gcd_file) # type: ignore + + # Open I3 file + i3_file_io = dataio.I3File(file_path.i3_file, "r") + data = list() + while i3_file_io.more(): + try: + frame = i3_file_io.pop_physics() + except Exception as e: + if "I3" in str(e): + continue + # check if frame should be skipped + if self._skip_frame(frame): + continue + + # Try to extract data from I3Frame + results = [extractor(frame) for extractor in self._extractors] + + data_dict = OrderedDict(zip(self.extracor_names, results)) + + # If an I3GenericExtractor is used, we want each automatically + # parsed key to be stored as a separate table. + # for extractor in self._extractors: + # if isinstance(extractor, I3GenericExtractor): + # data_dict.update(data_dict.pop(extractor._name)) + + data.append(data_dict) + return data + + def find_files(self, path: Union[str, List[str]]) -> List[I3FileSet]: + """Recursively search directory for I3 and GCD file pairs. + + Args: + path: directory to search recursively. + + Returns: + List I3 and GCD file pairs as I3FileSets + """ + # Find all I3 and GCD files in the specified directories. + i3_files, gcd_files = find_i3_files( + path, + self._gcd_rescue, + ) + + # Pack as I3FileSets + filesets = [ + I3FileSet(i3_file, gcd_file) + for i3_file, gcd_file in zip(i3_files, gcd_files) + ] + return filesets + + def _skip_frame(self, frame: "icetray.I3Frame") -> bool: + """Check the user defined filters. + + Returns: + bool: True if frame should be skipped, False otherwise. + """ + if self._i3filters is None: + return False # No filters defined, so we keep the frame + + for filter in self._i3filters: + if not filter(frame): + return True # keep_frame call false, skip the frame. + return False # All filter keep_frame calls true, keep the frame. diff --git a/src/graphnet/data/writers.py b/src/graphnet/data/writers.py new file mode 100644 index 000000000..d02eef2b4 --- /dev/null +++ b/src/graphnet/data/writers.py @@ -0,0 +1,59 @@ +"""Module containing `GraphNeTFileSaveMethod`(s). + +These modules are used to save the interim data format from `DataConverter` to +a deep-learning friendly file format. +""" + +import os +from typing import List, Union, OrderedDict, Any +from abc import abstractmethod, ABC + +from graphnet.utilities.decorators import final +from graphnet.utilities.logging import Logger + + +class GraphNeTFileSaveMethod(Logger, ABC): + """Generic base class for saving interim data format in `DataConverter`. + + Classes inheriting from `GraphNeTFileSaveMethod` must implement the + `save_file` method, which recieves the interim data format from + from a single file. 
+ + In addition, classes inheriting from `GraphNeTFileSaveMethod` must + set the `file_extension` property. + """ + + @abstractmethod + def _save_file( + self, data: OrderedDict[str, Any], output_file_path: str + ) -> None: + """Save the interim data format from a single input file. + + Args: + data: the interim data from a single input file. + output_file_path: output file path. + """ + return + + @final + def __call__( + self, data: OrderedDict[str, Any], file_name: str, out_dir: str + ) -> None: + """Save data. + + Args: + data: data to be saved. + file_name: name of input file. Will be used to generate output + file name. + out_dir: directory to save data to. + """ + output_file_path = os.path.join( + out_dir, file_name, self.file_extension + ) + self._save_file(data=data, output_file_path=output_file_path) + return + + @property + def file_extension(self) -> str: + """Return file extension used to store the data.""" + return self._file_extension # type: ignore From 9f966604f33b09a029761fc67c42dcbfecb6a6a3 Mon Sep 17 00:00:00 2001 From: "askerosted@gmail.com" Date: Fri, 2 Feb 2024 17:21:29 +0900 Subject: [PATCH 022/124] snake_case --- src/graphnet/models/gnn/RNN_tito.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/src/graphnet/models/gnn/RNN_tito.py b/src/graphnet/models/gnn/RNN_tito.py index 2496fa891..2cfe1dec0 100644 --- a/src/graphnet/models/gnn/RNN_tito.py +++ b/src/graphnet/models/gnn/RNN_tito.py @@ -26,9 +26,9 @@ def __init__( nb_inputs: int, *, nb_neighbours: int = 8, - RNN_layers: int = 2, - RNN_hidden_size: int = 64, - RNN_dropout: float = 0.5, + rnn_layers: int = 2, + rnn_hidden_size: int = 64, + rnn_dropout: float = 0.5, features_subset: Optional[List[int]] = None, dyntrans_layer_sizes: Optional[List[Tuple[int, ...]]] = None, post_processing_layer_sizes: Optional[List[int]] = None, @@ -45,11 +45,11 @@ def __init__( nb_inputs (int): Number of input features. nb_neighbours (int, optional): Number of neighbours to consider. Defaults to 8. - RNN_layers (int, optional): Number of RNN layers. + rnn_layers (int, optional): Number of RNN layers. Defaults to 1. - RNN_hidden_size (int, optional): Size of the hidden state of the RNN. Also determines the size of the output of the RNN. + rnn_hidden_size (int, optional): Size of the hidden state of the RNN. Also determines the size of the output of the RNN. Defaults to 64. - RNN_dropout (float, optional): Dropout to use in the RNN. Defaults to 0.5. + rnn_dropout (float, optional): Dropout to use in the RNN. Defaults to 0.5. features_subset (List[int], optional): The subset of latent features on each node that are used as metric dimensions when performing the k-nearest neighbours clustering. Defaults to [0,1,2,3] dyntrans_layer_sizes (List[Tuple[int, ...]], optional): List of tuples representing the sizes of the hidden layers of the DynTrans model. 
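# A minimal sketch (not part of the diff above) of constructing the model with
# the renamed snake_case keywords, assuming `RNN_TITO` is importable from
# `graphnet.models.gnn.RNN_tito`; `nb_inputs` is a placeholder value, and note
# that a later commit in this series adds a required `time_series_columns`
# argument on top of these keywords.
from graphnet.models.gnn.RNN_tito import RNN_TITO

backbone = RNN_TITO(
    nb_inputs=7,  # placeholder: number of input node features
    nb_neighbours=8,
    rnn_layers=2,
    rnn_hidden_size=64,
    rnn_dropout=0.5,
    features_subset=[0, 1, 2, 3],
    dyntrans_layer_sizes=[(256, 256), (256, 256)],
)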
@@ -63,9 +63,9 @@ def __init__( """ self._nb_neighbours = nb_neighbours self._nb_inputs = nb_inputs - self._RNN_layers = RNN_layers - self._RNN_hidden_size = RNN_hidden_size # RNN_hidden_size - self._RNN_dropout = RNN_dropout + self._rnn_layers = rnn_layers + self._rnn_hidden_size = rnn_hidden_size + self._rnn_dropout = rnn_dropout self._embedding_dim = embedding_dim self._n_head = n_head self._use_global_features = use_global_features @@ -97,15 +97,15 @@ def __init__( super().__init__(nb_inputs, self._readout_layer_sizes[-1]) self._rnn = Node_RNN( - num_layers=self._RNN_layers, + num_layers=self._rnn_layers, nb_inputs=2, - hidden_size=self._RNN_hidden_size, - RNN_dropout=self._RNN_dropout, + hidden_size=self._rnn_hidden_size, + rnn_dropout=self._rnn_dropout, embedding_dim=self._embedding_dim, ) self._dynedge_tito = DynEdgeTITO( - nb_inputs=self._RNN_hidden_size + 5, + nb_inputs=self._rnn_hidden_size + 5, dyntrans_layer_sizes=self._dyntrans_layer_sizes, features_subset=self._features_subset, global_pooling_schemes=self._global_pooling_schemes, From efdadb12da548cac3fbeb12546526396a3f59248 Mon Sep 17 00:00:00 2001 From: "askerosted@gmail.com" Date: Sun, 4 Feb 2024 16:03:13 +0900 Subject: [PATCH 023/124] Major refactoring --- src/graphnet/models/gnn/RNN_tito.py | 8 ++- src/graphnet/models/graphs/nodes/nodes.py | 35 +++------- src/graphnet/models/rnn/node_rnn.py | 80 ++++++++++++++++++----- 3 files changed, 77 insertions(+), 46 deletions(-) diff --git a/src/graphnet/models/gnn/RNN_tito.py b/src/graphnet/models/gnn/RNN_tito.py index 2cfe1dec0..cc6aa48aa 100644 --- a/src/graphnet/models/gnn/RNN_tito.py +++ b/src/graphnet/models/gnn/RNN_tito.py @@ -12,7 +12,7 @@ class RNN_TITO(GNN): - """The RNN_DynEdge model class. + """The RNN_TITO model class. Combines the Node_RNN and DynEdgeTITO models, intended for data with large amount of DOM activations per event. 
This model works only with non- @@ -97,10 +97,12 @@ def __init__( super().__init__(nb_inputs, self._readout_layer_sizes[-1]) self._rnn = Node_RNN( - num_layers=self._rnn_layers, nb_inputs=2, hidden_size=self._rnn_hidden_size, - rnn_dropout=self._rnn_dropout, + num_layers=self._rnn_layers, + nb_neighbours=self._nb_neighbours, + features_subset=self._features_subset, + dropout=self._rnn_dropout, embedding_dim=self._embedding_dim, ) diff --git a/src/graphnet/models/graphs/nodes/nodes.py b/src/graphnet/models/graphs/nodes/nodes.py index f81213a68..a2fb47b3f 100644 --- a/src/graphnet/models/graphs/nodes/nodes.py +++ b/src/graphnet/models/graphs/nodes/nodes.py @@ -262,7 +262,7 @@ def __init__( def _define_output_feature_names( self, input_feature_names: List[str] ) -> List[str]: - return input_feature_names + return input_feature_names + ["new_node_col"] def _construct_nodes(self, x: torch.Tensor) -> Data: """Construct nodes from raw node features ´x´.""" @@ -272,6 +272,8 @@ def _construct_nodes(self, x: torch.Tensor) -> Data: if self._charge_index is None: charge_index: int = len(self._keys) x = np.insert(x, charge_index, np.zeros(x.shape[0]), axis=1) + else: + charge_index = self._charge_index # Sort by time x = x[x[:, self._time_index].argsort()] @@ -285,7 +287,7 @@ def _construct_nodes(self, x: torch.Tensor) -> Data: unique_sensors, counts = np.unique( x[:, self._id_columns], axis=0, return_counts=True ) - # sort DOMs and pulse-counts + sort_this = np.concatenate( [unique_sensors, counts.reshape(-1, 1)], axis=1 ) @@ -293,31 +295,12 @@ def _construct_nodes(self, x: torch.Tensor) -> Data: unique_sensors = sort_this[:, 0 : unique_sensors.shape[1]] counts = sort_this[:, unique_sensors.shape[1] :].flatten().astype(int) - time_series = np.split( - x[:, [charge_index, self._time_index]], counts.cumsum()[:-1] - ) - - # add first time and total charge to unique dom features and apply inverse hyperbolic sine scaling - time_charge = np.stack( - [ - (image[0, 1], np.arcsinh(5 * image[:, 0].sum()) / 5) - for image in time_series - ] - ) - x = np.column_stack([unique_sensors, time_charge]) - - if self._max_activations is not None: - counts[counts > self._max_activations] = self._max_activations - time_series = [ - image[: self._max_activations] for image in time_series - ] - time_series = np.concatenate(time_series) - # apply inverse hyperbolic sine to charge values (handles zeros unlike log scaling) - time_series[:, 0] = np.arcsinh(5 * time_series[:, 0]) / 5 + new_node_col = np.zeros(x.shape[0]) + new_node_col[counts.cumsum()[:-1]] = 1 + new_node_col[0] = 1 + x = np.column_stack([x, new_node_col]) return Data( x=torch.tensor(x), - time_series=torch.tensor(time_series), - cutter=torch.tensor(counts), - n_doms=len(x), + time_series_index=[charge_index, self._time_index], ) diff --git a/src/graphnet/models/rnn/node_rnn.py b/src/graphnet/models/rnn/node_rnn.py index a9855bce1..e8a6430f2 100644 --- a/src/graphnet/models/rnn/node_rnn.py +++ b/src/graphnet/models/rnn/node_rnn.py @@ -7,7 +7,9 @@ from graphnet.models.gnn.gnn import GNN from graphnet.utilities.config import save_model_config from torch_geometric.data import Data -from typing import Optional +from torch_geometric.nn.pool import knn_graph +from typing import List, Optional + from graphnet.models.components.embedding import SinusoidalPosEmb @@ -29,7 +31,9 @@ def __init__( nb_inputs: int, hidden_size: int, num_layers: int, - RNN_dropout: float = 0.5, + nb_neighbours: int = 8, + features_subset: Optional[List[int]] = None, + dropout: float = 0.5, 
embedding_dim: int = 0, ) -> None: """Construct `NodeTimeRNN`. @@ -38,48 +42,90 @@ def __init__( nb_inputs: Number of features in the input data. hidden_size: Number of features for the RNN output and hidden layers. num_layers: Number of layers in the RNN. - nb_neighbours: Number of neighbours to use when reconstructing the graph representation. - RNN_dropout: Dropout fractio to use in the RNN. Defaults to 0.5. + nb_neighbours: Number of neighbours to use when reconstructing the graph representation. Defaults to 8. + features_subset: The subset of latent features on each node that are used as metric dimensions when performing the k-nearest neighbours clustering. Defaults to [0,1,2,3] + dropout: Dropout fraction to use in the RNN. Defaults to 0.5. embedding_dim: Embedding dimension of the RNN. Defaults to no embedding. """ - self._num_layers = num_layers self._hidden_size = hidden_size + self._num_layers = num_layers + self._nb_neighbors = nb_neighbours + self._features_subset = features_subset self._embedding_dim = embedding_dim self._nb_inputs = nb_inputs super().__init__(nb_inputs, hidden_size + 5) if self._embedding_dim != 0: - self._nb_inputs = self._embedding_dim * 2 * nb_inputs + self._nb_inputs = self._embedding_dim * nb_inputs self._rnn = torch.nn.GRU( num_layers=self._num_layers, input_size=self._nb_inputs, hidden_size=self._hidden_size, batch_first=True, - dropout=RNN_dropout, + dropout=dropout, ) self._emb = SinusoidalPosEmb(dim=self._embedding_dim) + def clean_up_data_object(self, data: Data) -> Data: + """Update the feature names of the data object. + + Args: + data: The input data object. + """ + # old features removing the new_node column + old_features = data.features[0][:-1] + new_features = old_features + [ + "rnn_out_" + str(i) for i in range(self._hidden_size) + ] + data.features = [new_features] * len(data.features) + for i, name in enumerate(old_features): + data[name] = data.x[i] + return data + def forward(self, data: Data) -> torch.Tensor: """Apply learnable forward pass to the GNN.""" - cutter = data.cutter.cumsum(0)[:-1] + # cutter = data.cutter.cumsum(0)[:-1] # Optional embedding of the time and charge time series data. + x = data.x + time_series = x[:, data.time_series_index[0]] if self._embedding_dim != 0: - time_series = self._emb(data.time_series * 4096).reshape( + time_series = self._emb(time_series * 4096).reshape( ( - data.time_series.shape[0], - self._embedding_dim * 2 * data.time_series.shape[-1], + time_series.shape[0], + self._embedding_dim * time_series.shape[-1], ) ) - else: - time_series = data.time_series - + splitter = x[:, -1].argwhere()[1:].flatten().cpu() + time_series = time_series.tensor_split(splitter) + # apply RNN per DOM irrespective of batch and return the final state. time_series = torch.nn.utils.rnn.pack_sequence( - time_series.tensor_split(cutter.cpu()), enforce_sorted=False + time_series, enforce_sorted=False ) - # apply RNN per DOM irrespective of batch and return the final state. 
rnn_out = self._rnn(time_series)[-1][0] + # prepare node level features + charge = data.charge.tensor_split(splitter) + charge = torch.tensor( + [ + torch.asinh(5 * torch.sum(node_charges) / 5) + for node_charges in charge + ] + ) + batch = data.batch[x[:, -1].bool()] + x = x[x[:, -1].bool()][:, :-1] + x[:, data.features[0].index("charge")] = charge + # combine the RNN output with the DOM summary features - data.x = torch.hstack([data.x, rnn_out]) + data.x = torch.hstack([x, rnn_out]) + # correct the batches + data.batch = batch + data = self.clean_up_data_object(data) + # Recompute adjacency + data.edge_index = knn_graph( + x=x[:, self._features_subset], + k=self._nb_neighbors, + batch=batch, + ).to(self.device) + return data From db2fe8f04171d1510c729f5d5ef54da57206e8f4 Mon Sep 17 00:00:00 2001 From: "askerosted@gmail.com" Date: Sun, 4 Feb 2024 16:56:02 +0900 Subject: [PATCH 024/124] small fixes --- examples/04_training/05_train_RNN_TITO.py | 6 +++--- src/graphnet/models/rnn/node_rnn.py | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/examples/04_training/05_train_RNN_TITO.py b/examples/04_training/05_train_RNN_TITO.py index d3832bafb..4554f557e 100644 --- a/examples/04_training/05_train_RNN_TITO.py +++ b/examples/04_training/05_train_RNN_TITO.py @@ -116,9 +116,9 @@ def main( backbone = RNN_TITO( nb_inputs=graph_definition.nb_outputs, nb_neighbours=8, - RNN_layers=2, - RNN_hidden_size=64, - RNN_dropout=0.5, + rnn_layers=2, + rnn_hidden_size=64, + rnn_dropout=0.5, features_subset=[0, 1, 2, 3], dyntrans_layer_sizes=[(256, 256), (256, 256), (256, 256), (256, 256)], post_processing_layer_sizes=[336, 256], diff --git a/src/graphnet/models/rnn/node_rnn.py b/src/graphnet/models/rnn/node_rnn.py index e8a6430f2..390199c41 100644 --- a/src/graphnet/models/rnn/node_rnn.py +++ b/src/graphnet/models/rnn/node_rnn.py @@ -105,7 +105,7 @@ def forward(self, data: Data) -> torch.Tensor: ) rnn_out = self._rnn(time_series)[-1][0] # prepare node level features - charge = data.charge.tensor_split(splitter) + charge = data.x[:, data.time_series_index[0][0]].tensor_split(splitter) charge = torch.tensor( [ torch.asinh(5 * torch.sum(node_charges) / 5) @@ -114,7 +114,7 @@ def forward(self, data: Data) -> torch.Tensor: ) batch = data.batch[x[:, -1].bool()] x = x[x[:, -1].bool()][:, :-1] - x[:, data.features[0].index("charge")] = charge + x[:, data.time_series_index[0][0]] = charge # combine the RNN output with the DOM summary features data.x = torch.hstack([x, rnn_out]) From d80e3bf14245f0ea4b753768872eca1c76733ddd Mon Sep 17 00:00:00 2001 From: "askerosted@gmail.com" Date: Mon, 5 Feb 2024 10:35:30 +0900 Subject: [PATCH 025/124] small refactor --- examples/04_training/05_train_RNN_TITO.py | 1 + src/graphnet/models/gnn/RNN_tito.py | 3 +++ src/graphnet/models/graphs/nodes/nodes.py | 5 +---- src/graphnet/models/rnn/node_rnn.py | 9 ++++++--- 4 files changed, 11 insertions(+), 7 deletions(-) diff --git a/examples/04_training/05_train_RNN_TITO.py b/examples/04_training/05_train_RNN_TITO.py index 4554f557e..26979fd5d 100644 --- a/examples/04_training/05_train_RNN_TITO.py +++ b/examples/04_training/05_train_RNN_TITO.py @@ -116,6 +116,7 @@ def main( backbone = RNN_TITO( nb_inputs=graph_definition.nb_outputs, nb_neighbours=8, + time_series_columns=[4, 3], rnn_layers=2, rnn_hidden_size=64, rnn_dropout=0.5, diff --git a/src/graphnet/models/gnn/RNN_tito.py b/src/graphnet/models/gnn/RNN_tito.py index cc6aa48aa..75f3a04fc 100644 --- a/src/graphnet/models/gnn/RNN_tito.py +++ 
b/src/graphnet/models/gnn/RNN_tito.py @@ -24,6 +24,7 @@ class RNN_TITO(GNN): def __init__( self, nb_inputs: int, + time_series_columns: List[int], *, nb_neighbours: int = 8, rnn_layers: int = 2, @@ -43,6 +44,7 @@ def __init__( Args: nb_inputs (int): Number of input features. + time_series_columns (List[int]): The indices of the input data that should be treated as time series data. The first index should be the charge column. nb_neighbours (int, optional): Number of neighbours to consider. Defaults to 8. rnn_layers (int, optional): Number of RNN layers. @@ -100,6 +102,7 @@ def __init__( nb_inputs=2, hidden_size=self._rnn_hidden_size, num_layers=self._rnn_layers, + time_series_columns=time_series_columns, nb_neighbours=self._nb_neighbours, features_subset=self._features_subset, dropout=self._rnn_dropout, diff --git a/src/graphnet/models/graphs/nodes/nodes.py b/src/graphnet/models/graphs/nodes/nodes.py index a2fb47b3f..a71ef1c59 100644 --- a/src/graphnet/models/graphs/nodes/nodes.py +++ b/src/graphnet/models/graphs/nodes/nodes.py @@ -300,7 +300,4 @@ def _construct_nodes(self, x: torch.Tensor) -> Data: new_node_col[0] = 1 x = np.column_stack([x, new_node_col]) - return Data( - x=torch.tensor(x), - time_series_index=[charge_index, self._time_index], - ) + return Data(x=torch.tensor(x)) diff --git a/src/graphnet/models/rnn/node_rnn.py b/src/graphnet/models/rnn/node_rnn.py index 390199c41..8410e9fec 100644 --- a/src/graphnet/models/rnn/node_rnn.py +++ b/src/graphnet/models/rnn/node_rnn.py @@ -31,6 +31,7 @@ def __init__( nb_inputs: int, hidden_size: int, num_layers: int, + time_series_columns: List[int], nb_neighbours: int = 8, features_subset: Optional[List[int]] = None, dropout: float = 0.5, @@ -42,6 +43,7 @@ def __init__( nb_inputs: Number of features in the input data. hidden_size: Number of features for the RNN output and hidden layers. num_layers: Number of layers in the RNN. + time_series_columns: The indices of the input data that should be treated as time series data. The first index should be the charge column. nb_neighbours: Number of neighbours to use when reconstructing the graph representation. Defaults to 8. features_subset: The subset of latent features on each node that are used as metric dimensions when performing the k-nearest neighbours clustering. Defaults to [0,1,2,3] dropout: Dropout fraction to use in the RNN. Defaults to 0.5. @@ -49,6 +51,7 @@ def __init__( """ self._hidden_size = hidden_size self._num_layers = num_layers + self._time_series_columns = time_series_columns self._nb_neighbors = nb_neighbours self._features_subset = features_subset self._embedding_dim = embedding_dim @@ -89,7 +92,7 @@ def forward(self, data: Data) -> torch.Tensor: # cutter = data.cutter.cumsum(0)[:-1] # Optional embedding of the time and charge time series data. 
x = data.x - time_series = x[:, data.time_series_index[0]] + time_series = x[:, self._time_series_columns] if self._embedding_dim != 0: time_series = self._emb(time_series * 4096).reshape( ( @@ -105,7 +108,7 @@ def forward(self, data: Data) -> torch.Tensor: ) rnn_out = self._rnn(time_series)[-1][0] # prepare node level features - charge = data.x[:, data.time_series_index[0][0]].tensor_split(splitter) + charge = data.x[:, self._time_series_columns[0]].tensor_split(splitter) charge = torch.tensor( [ torch.asinh(5 * torch.sum(node_charges) / 5) @@ -114,7 +117,7 @@ def forward(self, data: Data) -> torch.Tensor: ) batch = data.batch[x[:, -1].bool()] x = x[x[:, -1].bool()][:, :-1] - x[:, data.time_series_index[0][0]] = charge + x[:, self._time_series_columns[0]] = charge # combine the RNN output with the DOM summary features data.x = torch.hstack([x, rnn_out]) From e57934b27bb0bf71654a1147ebde11381f92e25b Mon Sep 17 00:00:00 2001 From: ArturoLlorente Date: Mon, 5 Feb 2024 09:58:44 +0100 Subject: [PATCH 026/124] backup commit --- src/graphnet/models/components/layers.py | 14 ++-- src/graphnet/models/gnn/icemix.py | 74 ++------------------ src/graphnet/models/graphs/nodes/__init__.py | 2 +- 3 files changed, 13 insertions(+), 77 deletions(-) diff --git a/src/graphnet/models/components/layers.py b/src/graphnet/models/components/layers.py index 9c80fb062..130b31cef 100644 --- a/src/graphnet/models/components/layers.py +++ b/src/graphnet/models/components/layers.py @@ -395,7 +395,7 @@ def __init__( mlp_ratio=4.0, qkv_bias=False, qk_scale=None, - drop=0.0, + dropout=0.0, attn_drop=0.0, drop_path=0.0, init_values=None, @@ -416,8 +416,8 @@ def __init__( self.mlp = Mlp( in_features=dim, hidden_features=mlp_hidden_dim, - act_layer=act_layer, - drop=drop, + activation=act_layer, + dropout_prob=dropout, ) if init_values is not None: @@ -544,7 +544,7 @@ def __init__( mlp_ratio=4.0, qkv_bias=False, qk_scale=None, - drop=0.0, + dropout=0.0, attn_drop=0.0, drop_path=0.0, init_values=None, @@ -557,7 +557,7 @@ def __init__( super().__init__() self.norm1 = norm_layer(dim) self.attn = nn.MultiheadAttention( - dim, num_heads, dropout=drop, batch_first=True + dim, num_heads, dropout=dropout, batch_first=True ) self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() self.norm2 = norm_layer(dim) @@ -565,8 +565,8 @@ def __init__( self.mlp = Mlp( in_features=dim, hidden_features=mlp_hidden_dim, - act_layer=act_layer, - drop=drop, + activation=act_layer, + dropout_prob=dropout, ) if init_values is not None: diff --git a/src/graphnet/models/gnn/icemix.py b/src/graphnet/models/gnn/icemix.py index 44c27884f..ec5037676 100644 --- a/src/graphnet/models/gnn/icemix.py +++ b/src/graphnet/models/gnn/icemix.py @@ -13,7 +13,7 @@ from torch_geometric.utils import to_dense_batch -class DeepIceModel(nn.Module): +class DeepIceModel(GNN): def __init__( self, dim=384, @@ -25,7 +25,7 @@ def __init__( n_rel=1, **kwargs, ): - super().__init__() + super().__init__(dim_base, dim) self.extractor = Extractor(dim_base, dim) self.rel_pos = Spacetime_encoder(head_size) self.sandwich = nn.ModuleList( @@ -46,40 +46,9 @@ def __init__( ) #self.proj_out = nn.Linear(dim, 3) self.use_checkpoint = use_checkpoint - self.apply(self._init_weights) - trunc_normal_(self.cls_token.weight, std=0.02) self.n_rel = n_rel - super().__init__(dim_base, dim) - - def fix_init_weight(self): - def rescale(param, layer_id): - param.div_(math.sqrt(2.0 * layer_id)) - - for layer_id, layer in enumerate(self.blocks): - rescale(layer.attn.proj.weight.data, 
layer_id + 1) - rescale(layer.mlp.fc2.weight.data, layer_id + 1) - - def _init_weights(self, m): - if isinstance(m, nn.Linear): - trunc_normal_(m.weight, std=0.02) - if isinstance(m, nn.Linear) and m.bias is not None: - nn.init.constant_(m.bias, 0) - elif isinstance(m, nn.LayerNorm): - nn.init.constant_(m.bias, 0) - nn.init.constant_(m.weight, 1.0) - - def init_weights(self, pretrained=None): - def _init_weights(m): - if isinstance(m, nn.Linear): - trunc_normal_(m.weight, std=0.02) - if isinstance(m, nn.Linear) and m.bias is not None: - nn.init.constant_(m.bias, 0) - elif isinstance(m, nn.LayerNorm): - nn.init.constant_(m.bias, 0) - nn.init.constant_(m.weight, 1.0) - - self.apply(_init_weights) + @torch.jit.ignore def no_weight_decay(self): @@ -119,7 +88,7 @@ def forward(self, x0): return x[:, 0] -class EncoderWithDirectionReconstruction(nn.Module): +class EncoderWithDirectionReconstruction(GNN): def __init__( self, dim=384, @@ -130,7 +99,7 @@ def __init__( knn_features=3, **kwargs, ): - super().__init__() + super().__init__(dim_base, dim) self.knn_features = knn_features self.extractor = ExtractorV11Scaled(dim_base, dim // 2) self.rel_pos = Spacetime_encoder(head_size) @@ -162,40 +131,7 @@ def __init__( post_processing_layer_sizes=[336, dim // 2], dynedge_layer_sizes=[(128, 256), (336, 256), (336, 256), (336, 256)], ) - self.apply(self._init_weights) - trunc_normal_(self.cls_token.weight, std=0.02) - super().__init__(dim_base, dim) - - def fix_init_weight(self): - def rescale(param, layer_id): - param.div_(math.sqrt(2.0 * layer_id)) - - for layer_id, layer in enumerate(self.blocks): - rescale(layer.attn.proj.weight.data, layer_id + 1) - rescale(layer.mlp.fc2.weight.data, layer_id + 1) - - def _init_weights(self, m): - if isinstance(m, nn.Linear): - trunc_normal_(m.weight, std=0.02) - if isinstance(m, nn.Linear) and m.bias is not None: - nn.init.constant_(m.bias, 0) - elif isinstance(m, nn.LayerNorm): - nn.init.constant_(m.bias, 0) - nn.init.constant_(m.weight, 1.0) - - def init_weights(self, pretrained=None): - def _init_weights(m): - if isinstance(m, nn.Linear): - trunc_normal_(m.weight, std=0.02) - if isinstance(m, nn.Linear) and m.bias is not None: - nn.init.constant_(m.bias, 0) - elif isinstance(m, nn.LayerNorm): - nn.init.constant_(m.bias, 0) - nn.init.constant_(m.weight, 1.0) - - self.apply(_init_weights) - @torch.jit.ignore def no_weight_decay(self): return {"cls_token"} diff --git a/src/graphnet/models/graphs/nodes/__init__.py b/src/graphnet/models/graphs/nodes/__init__.py index 0119d2b98..dbcf3e477 100644 --- a/src/graphnet/models/graphs/nodes/__init__.py +++ b/src/graphnet/models/graphs/nodes/__init__.py @@ -5,4 +5,4 @@ and their features. 
""" -from .nodes import NodeDefinition, NodesAsPulses, PercentileClusters +from .nodes import NodeDefinition, NodesAsPulses, PercentileClusters, IceMixNodes From e255a51a83489ff8fb82de0fe3932b67f9a13b0f Mon Sep 17 00:00:00 2001 From: samadpls Date: Tue, 6 Feb 2024 18:58:45 +0500 Subject: [PATCH 027/124] intial draft of extra_repr Signed-off-by: samadpls --- .../models/graphs/graph_definition.py | 28 +++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/src/graphnet/models/graphs/graph_definition.py b/src/graphnet/models/graphs/graph_definition.py index 6366fc390..368fe4c8e 100644 --- a/src/graphnet/models/graphs/graph_definition.py +++ b/src/graphnet/models/graphs/graph_definition.py @@ -447,3 +447,31 @@ def _add_custom_labels( for key, fn in custom_label_functions.items(): graph[key] = fn(graph) return graph + + def extra_repr(self) -> str: + """Provide a more detailed description for the object's string representation. + + Returns: + str: A string representation containing detailed information about the object. + """ + full_str = f"{self.__class__.__name__}(\n" + for item, value in self._config.__dict__.items(): + if isinstance(value, Model): + full_str += self._predindent_args(value) + else: + full_str += f" {item}={value}\n" + full_str += ")" + return full_str + + def _predindent_args(self, model: Model) -> str: + """Indent nested model arguments. + + Args: + model (Model): The nested model. + + Returns: + str: Indented string representation of the nested model's arguments. + """ + indented_str = model.extra_repr().replace("\n", "\n ") + return f" {indented_str}\n" + From bc24b64befdb2999be7b0c23ae3283a18eebbe2b Mon Sep 17 00:00:00 2001 From: samadpls Date: Tue, 6 Feb 2024 19:18:41 +0500 Subject: [PATCH 028/124] added extra_repr method Signed-off-by: samadpls --- src/graphnet/models/graphs/graph_definition.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/graphnet/models/graphs/graph_definition.py b/src/graphnet/models/graphs/graph_definition.py index 368fe4c8e..bbfc44c78 100644 --- a/src/graphnet/models/graphs/graph_definition.py +++ b/src/graphnet/models/graphs/graph_definition.py @@ -447,9 +447,9 @@ def _add_custom_labels( for key, fn in custom_label_functions.items(): graph[key] = fn(graph) return graph - + def extra_repr(self) -> str: - """Provide a more detailed description for the object's string representation. + """Provide a more detailed description of the object print. Returns: str: A string representation containing detailed information about the object. 
@@ -474,4 +474,3 @@ def _predindent_args(self, model: Model) -> str: """ indented_str = model.extra_repr().replace("\n", "\n ") return f" {indented_str}\n" - From 1b3e2011c597f198b5a0464a29212a1efd63bf17 Mon Sep 17 00:00:00 2001 From: ArturoLlorente Date: Tue, 6 Feb 2024 15:23:43 +0100 Subject: [PATCH 029/124] docstring definition and variable refactoring --- src/graphnet/models/components/layers.py | 274 +++++++++++----------- src/graphnet/models/gnn/__init__.py | 2 +- src/graphnet/models/gnn/icemix.py | 23 +- src/graphnet/models/graphs/nodes/nodes.py | 11 +- src/graphnet/models/graphs/utils.py | 3 +- 5 files changed, 160 insertions(+), 153 deletions(-) diff --git a/src/graphnet/models/components/layers.py b/src/graphnet/models/components/layers.py index 130b31cef..47ca2300e 100644 --- a/src/graphnet/models/components/layers.py +++ b/src/graphnet/models/components/layers.py @@ -18,8 +18,6 @@ from timm.models.layers import drop_path import math -from torch.fft import fft - class DynEdgeConv(EdgeConv, LightningModule): """Dynamical edge convolution layer.""" @@ -200,7 +198,7 @@ def forward( -class DropPath(nn.Module): +class DropPath(LightningModule): """DropPath regularization module for neural networks.""" def __init__( self, @@ -225,13 +223,13 @@ def extra_repr(self) -> str: return "p={}".format(self.drop_prob) -class Mlp(nn.Module): +class Mlp(LightningModule): """ Multi-Layer Perceptron (MLP) module. """ def __init__( self, - in_features: int = 768, + in_features: int = None, hidden_features: Optional[int] = None, out_features: Optional[int] = None, activation: Optional[nn.Module] = nn.GELU, @@ -249,6 +247,11 @@ def __init__( activation: Activation layer. Defaults to `nn.GELU`. dropout_prob: Dropout probability. Defaults to 0.0. """ + + if in_features <= 0: + raise ValueError( + f"in_features must be greater than 0, got in_features={in_features} instead" + ) super().__init__() out_features = out_features or in_features hidden_features = hidden_features or in_features @@ -265,31 +268,37 @@ def forward(self, x: Tensor) -> Tensor: x = self.dropout(x) return x -class SinusoidalPosEmb(nn.Module): +class SinusoidalPosEmb(LightningModule): def __init__( self, - emb_dim: int = 16, - max_sequence_length: int = 10000, + dim: int = 16, + n_freq: int = 10000, + scaled: bool = False, ): """ Construct `SinusoidalPosEmb`. - This module generates sinusoidal positional embeddings to be added to input sequences. + This module generates sinusoidal positional embeddings to be + added to input sequences. Args: - emb_dim: Dimensionality of the positional embeddings. - max_sequence_length: Maximum sequence length, used to scale the frequency of sinusoidal embeddings. + dim: Embedding dimension. + n_freq: Number of frequencies. + scaled: Whether or not to scale the embeddings. 
""" super().__init__() - self.embe_dim = emb_dim - self.max_sequence_length = max_sequence_length + if dim % 2 != 0: + raise ValueError("dim must be even") + self.scale = nn.Parameter(torch.ones(1) * dim**-0.5) if scaled else 1.0 + self.dim = dim + self.n_freq = n_freq def forward(self, x: Tensor) -> Tensor: """Forward pass.""" device = x.device - half_dim = self.emb_dim // 2 - emb1 = math.log(self.max_sequence_length) / half_dim - emb2 = torch.log(self.max_sequence_length) / half_dim + half_dim = self.dim // 2 + emb1 = math.log(self.n_freq) / half_dim + emb2 = torch.log(self.n_freq) / half_dim if emb1 == emb2: emb = emb1 else: @@ -297,16 +306,17 @@ def forward(self, x: Tensor) -> Tensor: emb = torch.exp(torch.arange(half_dim, device=device) * (-emb)) emb = x[..., None] * emb[None, ...] emb = torch.cat((emb.sin(), emb.cos()), dim=-1) - return emb + return emb * self.scale -class Extractor(nn.Module): +class FourierEncoder(LightningModule): def __init__( self, base_dim: int = 128, output_dim: int = 384, + scaled: bool = False, ): """ - Construct `Extractor`. + Construct `FourierEncoder`. This module incorporates sinusoidal positional embeddings and auxiliary embeddings to process input sequences and produce meaningful representations. @@ -314,11 +324,12 @@ def __init__( Args: base_dim: Dimensionality of the base sinusoidal positional embeddings. output_dim: Output dimensionality of the final projection. + scaled: Whether or not to scale the embeddings. """ super().__init__() - self.sin_emb = SinusoidalPosEmb(emb_dim=base_dim) + self.sin_emb = SinusoidalPosEmb(dim=base_dim, scaled=scaled) self.aux_emb = nn.Embedding(2, base_dim // 2) - self.sin_emb2 = SinusoidalPosEmb(emb_dim=base_dim // 2) + self.sin_emb2 = SinusoidalPosEmb(dim=base_dim // 2, scaled=scaled) self.projection = nn.Sequential( nn.Linear(6 * base_dim, 6 * base_dim), nn.LayerNorm(6 * base_dim), @@ -352,13 +363,13 @@ def forward( return x -class Spacetime_encoder(nn.Module): +class SpacetimeEncoder(LightningModule): def __init__( self, base_dim: int = 32, ): """ - Construct `Spacetime_encoder`. + Construct `SpacetimeEncoder`. This module calculates space-time interval between each pair of events and generates sinusoidal positional embeddings to be added to input sequences. @@ -367,7 +378,7 @@ def __init__( base_dim: Dimensionality of the sinusoidal positional embeddings. """ super().__init__() - self.sin_emb = SinusoidalPosEmb(emb_dim=base_dim) + self.sin_emb = SinusoidalPosEmb(dim=base_dim) self.projection = nn.Linear(base_dim, base_dim) def forward( @@ -387,28 +398,52 @@ def forward( return rel_attn, sin_emb # BEiTv2 block -class Block_rel(nn.Module): +class Block_rel(LightningModule): + """Implementation of BEiTv2 Block. + """ def __init__( self, - dim, - num_heads, - mlp_ratio=4.0, - qkv_bias=False, - qk_scale=None, - dropout=0.0, - attn_drop=0.0, - drop_path=0.0, - init_values=None, - act_layer=nn.GELU, - norm_layer=nn.LayerNorm, - window_size=None, - attn_head_dim=None, - **kwargs, - ): + dim: int = None, + num_heads: int = None, + mlp_ratio: float = 4.0, + qkv_bias: bool = False, + qk_scale: float = None, + dropout: float = 0.0, + attn_drop: float = 0.0, + drop_path: float = 0.0, + init_values: float = None, + activation: nn.Module = nn.GELU, + norm_layer: nn.Module = nn.LayerNorm, + attn_head_dim: int = None, + ): + """ + Construct 'Block_rel'. + + Args: + dim: Dimension of the input tensor. + num_heads: Number of attention heads to use in the `Attention_rel` layer. 
+ mlp_ratio: Ratio of the hidden size of the feedforward network to the + input size in the `Mlp` layer. + qkv_bias: Whether or not to include bias terms in the query, key, and + value matrices in the `Attention_rel` layer. + qk_scale: Scaling factor for the dot product of the query and key matrices + in the `Attention_rel` layer. + dropout: Dropout probability to use in the `Mlp` layer. + attn_dropt: Dropout probability to use in the `Attention_rel` layer. + drop_path: Probability of applying drop path regularization to the output + of the layer. + init_values: Initial value to use for the `gamma_1` and `gamma_2` + parameters if not `None`. + act_layer: Activation function to use in the `Mlp` layer. + norm_layer: Normalization layer to use. + attn_head_dim: Dimension of the attention head outputs in the + `Attention_rel` layer. + """ + super().__init__() self.norm1 = norm_layer(dim) self.attn = Attention_rel( - dim, num_heads, attn_drop=attn_drop, qkv_bias=qkv_bias + dim, num_heads, attn_drop=attn_drop, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_head_dim=attn_head_dim ) self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() self.norm2 = norm_layer(dim) @@ -416,7 +451,7 @@ def __init__( self.mlp = Mlp( in_features=dim, hidden_features=mlp_hidden_dim, - activation=act_layer, + activation=activation, dropout_prob=dropout, ) @@ -430,7 +465,7 @@ def __init__( else: self.gamma_1, self.gamma_2 = None, None - def forward(self, x, key_padding_mask=None, rel_pos_bias=None, kv=None): + def forward(self, x: Tensor, key_padding_mask=None, rel_pos_bias=None, kv=None): """Forward pass.""" if self.gamma_1 is None: xn = self.norm1(x) @@ -463,22 +498,42 @@ def forward(self, x, key_padding_mask=None, rel_pos_bias=None, kv=None): x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x))) return x -class Attention_rel(nn.Module): +class Attention_rel(LightningModule): def __init__( self, - dim, - num_heads=8, - qkv_bias=False, - qk_scale=None, - attn_drop=0.0, - proj_drop=0.0, - attn_head_dim=None, + dim: int, + num_heads: int = 8, + qkv_bias: bool = False, + qk_scale: float = None, + attn_drop: float = 0.0, + proj_drop: float = 0.0, + attn_head_dim: int = None, ): + """ + Args: + dim: Dimension of the input tensor. + num_heads: the number of attention heads to use (default: 8) + qkv_bias: whether to add bias to the query, key, and value + projections. Defaults to False. + qk_scale: a scaling factor that multiplies the dot product of query + and key vectors. Defaults to None. If None, computed as + :math: `\sqrt{1/head_dim}` + attn_drop: the dropout probability for the attention weights. + Defaults to 0.0. + proj_drop: the dropout probability for the output of the attention + module. Defaults to 0.0. + attn_head_dim: the feature dimensionality of each attention head. + Defaults to None. If None, computed as `dim // num_heads`. 
+ """ + if dim <= 0 or num_heads <= 0: + raise ValueError( + f"dim and num_heads must be greater than 0," + f" got dim={dim} and num_heads={num_heads} instead" + ) + super().__init__() self.num_heads = num_heads - head_dim = dim // num_heads - if attn_head_dim is not None: - head_dim = attn_head_dim + head_dim = attn_head_dim or dim // num_heads all_head_dim = head_dim * self.num_heads self.scale = qk_scale or head_dim**-0.5 @@ -536,28 +591,43 @@ def forward(self, q, k, v, rel_pos_bias=None, key_padding_mask=None): x = self.proj_drop(x) return x -class Block(nn.Module): +class Block(LightningModule): def __init__( self, - dim, - num_heads, - mlp_ratio=4.0, - qkv_bias=False, - qk_scale=None, - dropout=0.0, - attn_drop=0.0, - drop_path=0.0, - init_values=None, - act_layer=nn.GELU, - norm_layer=nn.LayerNorm, - window_size=None, - attn_head_dim=None, - **kwargs, + dim: int = None, + num_heads: int = None, + mlp_ratio: float = 4.0, + dropout: float = 0.0, + attn_drop: float = 0.0, + drop_path: float = 0.0, + init_values: float = None, + activation: nn.Module = nn.GELU, + norm_layer: nn.Module = nn.LayerNorm, ): + """ + Construct 'Block'. + + Args: + dim: Dimension of the input tensor. + num_heads: Number of attention heads to use in the `MultiheadAttention` + layer. + mlp_ratio: Ratio of the hidden size of the feedforward network to the + input size in the `Mlp` layer. + dropout: Dropout probability to use in the `Mlp` layer. + attn_dropt: Dropout probability to use in the `MultiheadAttention` layer. + drop_path: Probability of applying drop path regularization to the output + of the layer. + init_values: Initial value to use for the `gamma_1` and `gamma_2` + parameters if not `None`. + act_layer: Activation function to use in the `Mlp` layer. + norm_layer: Normalization layer to use. + attn_head_dim: Dimension of the attention head outputs in the + `MultiheadAttention` layer. + """ super().__init__() self.norm1 = norm_layer(dim) self.attn = nn.MultiheadAttention( - dim, num_heads, dropout=dropout, batch_first=True + dim, num_heads, dropout=attn_drop, batch_first=True ) self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() self.norm2 = norm_layer(dim) @@ -565,7 +635,7 @@ def __init__( self.mlp = Mlp( in_features=dim, hidden_features=mlp_hidden_dim, - activation=act_layer, + activation=activation, dropout_prob=dropout, ) @@ -608,64 +678,4 @@ def forward(self, x, attn_mask=None, key_padding_mask=None): )[0] ) x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x))) - return x - -class ScaledSinusoidalEmbedding(nn.Module): - def __init__(self, dim=32, M=10000): - super().__init__() - assert (dim % 2) == 0 - self.scale = nn.Parameter(torch.ones(1) * dim**-0.5) - self.dim = dim - self.M = M - - def forward(self, x): - """Forward pass.""" - device = x.device - half_dim = self.dim // 2 - emb1 = math.log(self.M) / half_dim - emb2 = torch.log(self.M) / half_dim - if emb1 == emb2: - emb = emb1 - else: - raise ValueError("emb1 != emb2") - emb = torch.exp(torch.arange(half_dim, device=device) * (-emb)) - emb = x[..., None] * emb[None, ...] 
- emb = torch.cat((emb.sin(), emb.cos()), dim=-1) - return emb * self.scale - - -class ExtractorV11Scaled(nn.Module): - def __init__(self, dim_base=128, dim=384): - super().__init__() - self.pos = ScaledSinusoidalEmbedding(dim=dim_base) - self.emb_charge = ScaledSinusoidalEmbedding(dim=dim_base) - self.time = ScaledSinusoidalEmbedding(dim=dim_base) - self.aux_emb = nn.Embedding(2, dim_base // 2) - self.emb2 = ScaledSinusoidalEmbedding(dim=dim_base // 2) - self.proj = nn.Sequential( - nn.Linear(6 * dim_base, 6 * dim_base), - nn.LayerNorm(6 * dim_base), - nn.GELU(), - nn.Linear(6 * dim_base, dim), - ) - - def forward(self, x, Lmax=None): - """Forward pass.""" - pos = x.pos if Lmax is None else x.pos[:, :Lmax] - charge = x.charge if Lmax is None else x.charge[:, :Lmax] - time = x.time if Lmax is None else x.time[:, :Lmax] - auxiliary = x.auxiliary if Lmax is None else x.auxiliary[:, :Lmax] - length = torch.log10(x.n_pulses.to(dtype=pos.dtype)) - - x = torch.cat( - [ - self.pos(4096 * pos).flatten(-2), - self.emb_charge(1024 * charge), - self.time(4096 * time), - self.aux_emb(auxiliary), - self.emb2(length).unsqueeze(1).expand(-1, pos.shape[1], -1), - ], - -1, - ) - x = self.proj(x) - return x \ No newline at end of file + return x \ No newline at end of file diff --git a/src/graphnet/models/gnn/__init__.py b/src/graphnet/models/gnn/__init__.py index 60b0aee95..420916382 100644 --- a/src/graphnet/models/gnn/__init__.py +++ b/src/graphnet/models/gnn/__init__.py @@ -4,4 +4,4 @@ from .dynedge import DynEdge from .dynedge_jinst import DynEdgeJINST from .dynedge_kaggle_tito import DynEdgeTITO -from .icemix import DeepIceModel, EncoderWithDirectionReconstruction +from .icemix import DeepIce, DeepIceWithDynEdge diff --git a/src/graphnet/models/gnn/icemix.py b/src/graphnet/models/gnn/icemix.py index ec5037676..f365af75d 100644 --- a/src/graphnet/models/gnn/icemix.py +++ b/src/graphnet/models/gnn/icemix.py @@ -3,7 +3,7 @@ import torch.utils.checkpoint as checkpoint import math -from graphnet.models.components.layers import Extractor, Spacetime_encoder, Block_rel, Block, ExtractorV11Scaled +from graphnet.models.components.layers import FourierEncoder, SpacetimeEncoder, Block_rel, Block from graphnet.models.gnn.dynedge import DynEdge from graphnet.models.gnn.gnn import GNN @@ -13,7 +13,7 @@ from torch_geometric.utils import to_dense_batch -class DeepIceModel(GNN): +class DeepIce(GNN): def __init__( self, dim=384, @@ -26,8 +26,8 @@ def __init__( **kwargs, ): super().__init__(dim_base, dim) - self.extractor = Extractor(dim_base, dim) - self.rel_pos = Spacetime_encoder(head_size) + self.fourier_ext = FourierEncoder(dim_base, dim) + self.rel_pos = SpacetimeEncoder(head_size) self.sandwich = nn.ModuleList( [Block_rel(dim=dim, num_heads=dim // head_size) for i in range(depth_rel)] ) @@ -57,7 +57,7 @@ def no_weight_decay(self): def forward(self, x0): mask = x0.mask Lmax = mask.sum(-1).max() - x = self.extractor(x0, Lmax) + x = self.fourier_ext(x0, Lmax) rel_pos_bias, rel_enc = self.rel_pos(x0, Lmax) # nbs = get_nbs(x0, Lmax) mask = mask[:, :Lmax] @@ -88,7 +88,7 @@ def forward(self, x0): return x[:, 0] -class EncoderWithDirectionReconstruction(GNN): +class DeepIceWithDynEdge(GNN): def __init__( self, dim=384, @@ -101,8 +101,8 @@ def __init__( ): super().__init__(dim_base, dim) self.knn_features = knn_features - self.extractor = ExtractorV11Scaled(dim_base, dim // 2) - self.rel_pos = Spacetime_encoder(head_size) + self.fourier_ext = FourierEncoder(dim_base, dim // 2, scaled=True) + self.rel_pos = 
SpacetimeEncoder(head_size) self.sandwich = nn.ModuleList( [ Block_rel(dim=dim, num_heads=dim // head_size), @@ -126,7 +126,7 @@ def __init__( ) #self.proj_out = nn.Linear(dim, 3) self.use_checkpoint = use_checkpoint - self.local_root = DynEdge( + self.dyn_edge = DynEdge( 9, post_processing_layer_sizes=[336, dim // 2], dynedge_layer_sizes=[(128, 256), (336, 256), (336, 256), (336, 256)], @@ -150,15 +150,14 @@ def forward(self, x0): dim=1, ) Lmax = mask.sum(-1).max() - x = self.extractor(x0, Lmax) + x = self.fourier_ext(x0, Lmax) rel_pos_bias, rel_enc = self.rel_pos(x0, Lmax) - # nbs = get_nbs(x0, Lmax) mask = mask[:, :Lmax] batch_index = mask.nonzero()[:, 0] edge_index = knn_graph(x=graph_feature[:, :self.knn_features], k=8, batch=batch_index).to( mask.device ) - graph_feature = self.local_root( + graph_feature = self.dyn_edge( graph_feature, edge_index, batch_index, x0.n_pulses ) graph_feature, _ = to_dense_batch(graph_feature, batch_index) diff --git a/src/graphnet/models/graphs/nodes/nodes.py b/src/graphnet/models/graphs/nodes/nodes.py index 78f5faa24..fbedd406e 100644 --- a/src/graphnet/models/graphs/nodes/nodes.py +++ b/src/graphnet/models/graphs/nodes/nodes.py @@ -2,7 +2,6 @@ from typing import List, Tuple, Optional from abc import abstractmethod -import numpy as np import torch from torch_geometric.data import Data @@ -241,7 +240,7 @@ def __init__( "hlc", "rde", "scatt_lenght", - "abs_lenght" + "abs_lenght", "mask"] missing_features = set(self.all_features) - set(input_feature_names) @@ -264,8 +263,8 @@ def _add_ice_properties(self, ids: List[int]) -> torch.Tensor: f_scattering, f_absoprtion = ice_transparency() - graph[:len(ids),7] = f_scattering(x[ids, self.feature_indexes["dom_z"]]) - graph[:len(ids),8] = f_absoprtion(x[ids, self.feature_indexes["dom_z"]]) + graph[:len(ids),7] = torch.tensor(f_scattering(x[ids, self.feature_indexes["dom_z"]])) + graph[:len(ids),8] = torch.tensor(f_absoprtion(x[ids, self.feature_indexes["dom_z"]])) return graph def _construct_nodes(self, x: torch.Tensor) -> Tuple[Data, List[str]]: @@ -284,7 +283,7 @@ def _construct_nodes(self, x: torch.Tensor) -> Tuple[Data, List[str]]: auxiliary_p = torch.nonzero(x[:, self.feature_indexes["hlc"]] == 1).squeeze(1) ids_n = ids[auxiliary_n][: min(self.max_length, len(auxiliary_n))] ids_p = ids[auxiliary_p][: min(self.max_length - len(ids_n), len(auxiliary_p))] - ids = np.concatenate([ids_n, ids_p]).sort() + ids = torch.cat([ids_n, ids_p]).sort().values #ids.sort() event_length = len(ids) @@ -292,6 +291,6 @@ def _construct_nodes(self, x: torch.Tensor) -> Tuple[Data, List[str]]: graph[:event_length, idx] = x[ids, self.feature_indexes[feature]] graph = self._add_ice_properties(graph, x, ids) #ice properties - graph[:event_length,9] = torch.ones_like(event_length) # mask + graph[:event_length,9] = torch.ones(event_length) # mask return Data(x=graph) \ No newline at end of file diff --git a/src/graphnet/models/graphs/utils.py b/src/graphnet/models/graphs/utils.py index c1f143d33..2d5ff5196 100644 --- a/src/graphnet/models/graphs/utils.py +++ b/src/graphnet/models/graphs/utils.py @@ -184,8 +184,7 @@ def ice_transparency(datum: int = 1950): # Data from page 31 of https://arxiv.org/pdf/1301.5361.pdf # Datum is from footnote 8 of page 29 df = pd.read_parquet( - os.path.join(DATA_DIR, "ice_properties/ice_transparency.txt"), - delim_whitespace=True + os.path.join(DATA_DIR, "ice_properties/ice_transparency.parquet"), ) df["z"] = df["depth"] - datum df["z_norm"] = df["z"] / 500 From f33ae78c79b202ffd7ab7c94c8345a5c3be60f75 Mon 
Sep 17 00:00:00 2001 From: Rasmus Oersoe Date: Tue, 6 Feb 2024 19:26:45 +0100 Subject: [PATCH 030/124] first test --- src/graphnet/data/dataconverter_new.py | 69 +++++++++++----- .../data/extractors/i3featureextractor.py | 2 +- .../data/extractors/i3truthextractor.py | 2 +- src/graphnet/data/readers.py | 10 ++- src/graphnet/data/writers.py | 81 +++++++++++++++++-- 5 files changed, 129 insertions(+), 35 deletions(-) diff --git a/src/graphnet/data/dataconverter_new.py b/src/graphnet/data/dataconverter_new.py index 0456d26c8..eb51495d1 100644 --- a/src/graphnet/data/dataconverter_new.py +++ b/src/graphnet/data/dataconverter_new.py @@ -65,6 +65,9 @@ def __init__( # with reader. self._file_reader.set_extractors(extractors) + # Base class constructor + super().__init__(name=__name__, class_name=self.__class__.__name__) + @final def __call__( self, input_dir: Union[str, List[str]], output_dir: str @@ -78,14 +81,17 @@ def __call__( output_dir: The directory to save the files to. Input folder structure is not respected. """ + # Set outdir + self._output_dir = output_dir # Get the file reader to produce a list of input files # in the directory input_files = self._file_reader.find_files(path=input_dir) # type: ignore - self._launch_jobs(input_files=input_files, output_dir=output_dir) + self._launch_jobs(input_files=input_files) @final def _launch_jobs( - self, input_files: Union[List[str], List[I3FileSet]] + self, + input_files: Union[List[str], List[I3FileSet]], ) -> None: """Multi Processing Logic. @@ -109,7 +115,7 @@ def _launch_jobs( self._update_shared_variables(pool) @final - def _process_file(self, file_path: str) -> None: + def _process_file(self, file_path: Union[str, I3FileSet]) -> None: """Process a single file. Calls file reader to recieve extracted output, event ids @@ -119,22 +125,30 @@ def _process_file(self, file_path: str) -> None: """ # Read and apply extractors data = self._file_reader(file_path=file_path) + n_events = len(data) # type: ignore - # Assign event_no's to each event in data + # Assign event_no's to each event in data and transform to pd.DataFrame data = self._assign_event_no(data=data) # Create output file name output_file_name = self._create_file_name(input_file_path=file_path) # Apply save method - self._save_method(data=data, file_name=output_file_name) + self._save_method( + data=data, + file_name=output_file_name, + n_events=n_events, + output_dir=self._output_dir, + ) @final - def _create_file_name(self, input_file_path: str) -> str: + def _create_file_name(self, input_file_path: Union[str, I3FileSet]) -> str: """Convert input file path to an output file name.""" + if isinstance(input_file_path, I3FileSet): + input_file_path = input_file_path.i3_file path_without_extension = os.path.splitext(input_file_path)[0] base_file_name = path_without_extension.split("/")[-1] - return base_file_name + self._save_method.file_extension() # type: ignore + return base_file_name # type: ignore @final def _assign_event_no( @@ -152,18 +166,23 @@ def _assign_event_no( n_rows = self._count_rows( event_dict=data[k], extractor_name=extractor_name ) - - data[k][extractor_name][self._index_column] = np.repeat( - event_nos[k], n_rows - ).tolist() - df = pd.DataFrame( - data[k][extractor_name], index=[0] if n_rows == 1 else None - ) - if extractor_name in dataframe_dict.keys(): - dataframe_dict[extractor_name].append(df) - else: - dataframe_dict[extractor_name] = [df] - + if n_rows > 0: + data[k][extractor_name][self._index_column] = np.repeat( + event_nos[k], n_rows + ).tolist() + df = 
pd.DataFrame( + data[k][extractor_name], + index=[0] if n_rows == 1 else None, + ) + if extractor_name in dataframe_dict.keys(): + dataframe_dict[extractor_name].append(df) + else: + dataframe_dict[extractor_name] = [df] + # Merge each list of dataframes + for key in dataframe_dict.keys(): + dataframe_dict[key] = pd.concat( + dataframe_dict[key], axis=0 + ).reset_index(drop=True) return dataframe_dict @final @@ -171,18 +190,24 @@ def _count_rows( self, event_dict: OrderedDict[str, Any], extractor_name: str ) -> int: """Count number of rows that features from `extractor_name` have.""" + extractor_dict = event_dict[extractor_name] + try: - extractor_dict = event_dict[extractor_name] # If all features in extractor_name have the same length # this line of code will execute without error and result # in an array with shape [num_features, n_rows_in_feature] - n_rows = np.asarray(list(extractor_dict.values())).shape[1] + # unless the list is empty! + + shape = np.asarray(list(extractor_dict.values())).shape + if len(shape) > 1: + n_rows = shape[1] + else: + n_rows = 1 except ValueError as e: self.error( f"Features from {extractor_name} ({extractor_dict.keys()}) have different lengths." ) raise e - return n_rows def _request_event_nos(self, n_ids: int) -> List[int]: diff --git a/src/graphnet/data/extractors/i3featureextractor.py b/src/graphnet/data/extractors/i3featureextractor.py index f1f578453..f351f0f3a 100644 --- a/src/graphnet/data/extractors/i3featureextractor.py +++ b/src/graphnet/data/extractors/i3featureextractor.py @@ -1,7 +1,7 @@ """I3Extractor class(es) for extracting specific, reconstructed features.""" from typing import TYPE_CHECKING, Any, Dict, List -from graphnet.data.extractors.i3extractor import I3Extractor +from graphnet.data.extractors.extractor import I3Extractor from graphnet.data.extractors.utilities.frames import ( get_om_keys_and_pulseseries, ) diff --git a/src/graphnet/data/extractors/i3truthextractor.py b/src/graphnet/data/extractors/i3truthextractor.py index bcfe694c3..d04be69b2 100644 --- a/src/graphnet/data/extractors/i3truthextractor.py +++ b/src/graphnet/data/extractors/i3truthextractor.py @@ -4,7 +4,7 @@ import matplotlib.path as mpath from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple -from graphnet.data.extractors.i3extractor import I3Extractor +from graphnet.data.extractors.extractor import I3Extractor from graphnet.data.extractors.utilities.frames import ( frame_is_montecarlo, frame_is_noise, diff --git a/src/graphnet/data/readers.py b/src/graphnet/data/readers.py index 487ca99f7..6dd9bd63d 100644 --- a/src/graphnet/data/readers.py +++ b/src/graphnet/data/readers.py @@ -103,7 +103,8 @@ def _validate_extractors(self, extractors: List[Extractor]) -> None: assert isinstance(extractor, tuple(self.accepted_extractors)) # type: ignore except AssertionError as e: self.error( - f"{extractor.__class__.__name__} is not supported by {self.__class__.__name__}" + f"{extractor.__class__.__name__}" + f" is not supported by {self.__class__.__name__}" ) raise e @@ -154,7 +155,7 @@ def __init__( gcd_rescue: str, i3_filters: Union[ Type[I3Filter], List[Type[I3Filter]] - ] = NullSplitI3Filter, + ] = NullSplitI3Filter(), # type: ignore icetray_verbose: int = 0, ): """Initialize `I3Reader`. 
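# With the change above the default filter becomes an instance rather than a
# class. A rough sketch of passing filters explicitly, assuming `I3Reader`
# lives in `graphnet.data.readers` as introduced earlier in this series; the
# GCD path is a placeholder, and an IceTray environment is required, as noted
# in the class docstring.
from graphnet.data.readers import I3Reader
from graphnet.data.filters import NullSplitI3Filter

reader = I3Reader(
    gcd_rescue="/path/to/fallback_gcd.i3.gz",  # placeholder path
    i3_filters=[NullSplitI3Filter()],  # instances, matching the new default
)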
@@ -199,7 +200,10 @@ def __call__(self, file_path: I3FileSet) -> List[OrderedDict]: # type: ignore """ # Set I3-GCD file pair in extractor for extractor in self._extractors: - extractor.set_files(file_path.i3_file, file_path.gcd_file) # type: ignore + assert isinstance(extractor, I3Extractor) + extractor.set_gcd( + i3_file=file_path.i3_file, gcd_file=file_path.gcd_file + ) # type: ignore # Open I3 file i3_file_io = dataio.I3File(file_path.i3_file, "r") diff --git a/src/graphnet/data/writers.py b/src/graphnet/data/writers.py index d02eef2b4..d23b21ac8 100644 --- a/src/graphnet/data/writers.py +++ b/src/graphnet/data/writers.py @@ -5,11 +5,17 @@ """ import os -from typing import List, Union, OrderedDict, Any +from typing import List, Union, Dict, Any, OrderedDict from abc import abstractmethod, ABC from graphnet.utilities.decorators import final from graphnet.utilities.logging import Logger +from graphnet.data.sqlite.sqlite_utilities import ( + create_table, + create_table_and_save_to_sql, +) + +import pandas as pd class GraphNeTFileSaveMethod(Logger, ABC): @@ -25,19 +31,26 @@ class GraphNeTFileSaveMethod(Logger, ABC): @abstractmethod def _save_file( - self, data: OrderedDict[str, Any], output_file_path: str + self, + data: Dict[str, pd.DataFrame], + output_file_path: str, + n_events: int, ) -> None: """Save the interim data format from a single input file. Args: data: the interim data from a single input file. output_file_path: output file path. + n_events: Number of events container in `data`. """ - return @final def __call__( - self, data: OrderedDict[str, Any], file_name: str, out_dir: str + self, + data: Dict[str, pd.DataFrame], + file_name: str, + output_dir: str, + n_events: int, ) -> None: """Save data. @@ -45,15 +58,67 @@ def __call__( data: data to be saved. file_name: name of input file. Will be used to generate output file name. - out_dir: directory to save data to. + output_dir: directory to save data to. + n_events: Number of events in `data`. """ - output_file_path = os.path.join( - out_dir, file_name, self.file_extension + # make dir + os.makedirs(output_dir, exist_ok=True) + output_file_path = ( + os.path.join(output_dir, file_name) + self.file_extension + ) + + self._save_file( + data=data, output_file_path=output_file_path, n_events=n_events ) - self._save_file(data=data, output_file_path=output_file_path) return @property def file_extension(self) -> str: """Return file extension used to store the data.""" return self._file_extension # type: ignore + + +class SQLiteSaveMethod(GraphNeTFileSaveMethod): + """A method for saving GraphNeT's interim dataformat to SQLite.""" + + _file_extension = ".db" + + def _save_file( + self, + data: Dict[str, pd.DataFrame], + output_file_path: str, + n_events: int, + ) -> None: + """Save data to SQLite database.""" + # Check(s) + if os.path.exists(output_file_path): + self.warning( + f"Output file {output_file_path} already exists. Appending." + ) + + # Concatenate data + if len(data) == 0: + self.warning( + "No data was extracted from the processed I3 file(s). 
" + f"No data saved to {output_file_path}" + ) + return + + saved_any = False + # Save each dataframe to SQLite database + self.debug(f"Saving to {output_file_path}") + for table, df in data.items(): + if len(df) > 0: + create_table_and_save_to_sql( + df, + table, + output_file_path, + default_type="FLOAT", + integer_primary_key=len(df) <= n_events, + ) + saved_any = True + + if saved_any: + self.debug("- Done saving") + else: + self.warning(f"No data saved to {output_file_path}") From 58d2f2740bda1150591457f7e9b252088bc39ad6 Mon Sep 17 00:00:00 2001 From: samadpls Date: Wed, 7 Feb 2024 01:03:03 +0500 Subject: [PATCH 031/124] Refactor GraphNeTDataModule and add unit test for save_selection function --- src/graphnet/data/datamodule.py | 51 +++++++++++++++++++-------------- src/graphnet/training/utils.py | 10 ++++--- tests/data/test_datamodule.py | 33 +++++++++++++++++++++ 3 files changed, 68 insertions(+), 26 deletions(-) create mode 100644 tests/data/test_datamodule.py diff --git a/src/graphnet/data/datamodule.py b/src/graphnet/data/datamodule.py index c8268c990..f4db88e88 100644 --- a/src/graphnet/data/datamodule.py +++ b/src/graphnet/data/datamodule.py @@ -23,9 +23,9 @@ class GraphNeTDataModule(pl.LightningDataModule, Logger): def __init__( self, dataset_reference: Union[SQLiteDataset, ParquetDataset, Dataset], - selection: Optional[Union[List[int], List[List[int]]]], - test_selection: Optional[Union[List[int], List[List[int]]]], dataset_args: Dict[str, Any], + selection: Optional[Union[List[int], List[List[int]]]] = None, + test_selection: Optional[Union[List[int], List[List[int]]]] = None, train_dataloader_kwargs: Optional[Dict[str, Any]] = None, validation_dataloader_kwargs: Optional[Dict[str, Any]] = None, test_dataloader_kwargs: Optional[Dict[str, Any]] = None, @@ -36,20 +36,22 @@ def __init__( Args: dataset_reference: A non-instantiated reference to the dataset class. + dataset_args: Arguments to instantiate graphnet.data.dataset.Dataset with. selection: (Optional) a list of event id's used for training and validation. test_selection: (Optional) a list of event id's used for testing. - dataset_args: Arguments to instantiate graphnet.data.dataset.Dataset with. train_dataloader_kwargs: Arguments for the training DataLoader. validation_dataloader_kwargs: Arguments for the validation DataLoader. test_dataloader_kwargs: Arguments for the test DataLoader. train_val_split (Optional): Split ratio for training and validation sets. Default is [0.9, 0.10]. split_seed: seed used for shuffling and splitting selections into train/validation. """ + Logger.__init__(self) + self._make_sure_root_logger_is_configured() self._dataset = dataset_reference - self._selection = selection or [0] - self._train_val_split = train_val_split or [0.0] - self._test_selection = test_selection or [0.0] self._dataset_args = dataset_args + self._selection = selection + self._test_selection = test_selection + self._train_val_split = train_val_split or [0.0] self._rng = split_seed self._train_dataloader_kwargs = train_dataloader_kwargs or {} @@ -61,6 +63,8 @@ def __init__( self._dataset_args["path"], list ) + self.setup("") + def prepare_data(self) -> None: """Prepare the dataset for training.""" # Download method for curated datasets. Method for download is @@ -82,9 +86,10 @@ def setup(self, stage: str) -> None: self._resolve_selections() # Creation of Datasets + # self._dataset = self._create_dataset(self.) 
self._train_dataset = self._create_dataset(self._train_selection) self._val_dataset = self._create_dataset(self._val_selection) - self._test_dataset = self._create_dataset(self._test_selection) + self._test_dataset = self._create_dataset(self._test_selection) # type: ignore return @@ -169,12 +174,14 @@ def _validate_dataset_class(self) -> None: ParquetDataset, or Dataset. Raises a TypeError if an invalid dataset type is detected, or if an EnsembleDataset is used. """ - if not isinstance( - self._dataset, (SQLiteDataset, ParquetDataset, Dataset) - ): - raise TypeError( - "dataset_reference must be an instance of SQLiteDataset, ParquetDataset, or Dataset." - ) + print(self._dataset, "Dataset\n") + print( + f"Type of self._dataset before validation check: {type(self._dataset)}" + ) + # if type(self._dataset) not in [SQLiteDataset, ParquetDataset, Dataset]: + # raise TypeError( + # "dataset_reference must be an instance of SQLiteDataset, ParquetDataset, or Dataset." + # ) if isinstance(self._dataset, EnsembleDataset): raise TypeError( "EnsembleDataset is not allowed as dataset_reference." @@ -250,7 +257,7 @@ def _resolve_selections(self) -> None: self._selection ) - if self._selection is None: + else: # selection is None # If not provided, we infer it by grabbing all event ids in the dataset. self.info( f"{self.__class__.__name__} did not receive an argument for `selection`. Selection will automatically be created with a split of train: {self._train_val_split[0]} and validation: {self._train_val_split[1]}" @@ -258,7 +265,7 @@ def _resolve_selections(self) -> None: ( self._train_selection, self._val_selection, - ) = self._infer_selections() + ) = self._infer_selections() # type: ignore def _split_selection( self, selection: Union[int, List[int], List[List[int]]] @@ -336,16 +343,15 @@ def _infer_selections_on_single_dataset( all_events = ( tmp_dataset._get_all_indices() - ) # unshuffled list, # sequential index + ) # unshuffled list, sequential index # Multiple lines to avoid one large - all_events = pd.DataFrame(all_events).sample( - frac=1, replace=False, random_state=self._rng - ) - - all_events = random.sample( - all_events, len(all_events) + all_events = ( + pd.DataFrame(all_events) + .sample(frac=1, replace=False, random_state=self._rng) + .values.tolist() ) # shuffled list + return self._split_selection(all_events) def _construct_dataset(self, tmp_args: Dict[str, Any]) -> Dataset: @@ -354,6 +360,7 @@ def _construct_dataset(self, tmp_args: Dict[str, Any]) -> Dataset: Return: Dataset object constructed from input arguments. """ + print(tmp_args, "temp argument") dataset = self._dataset(**tmp_args) return dataset diff --git a/src/graphnet/training/utils.py b/src/graphnet/training/utils.py index b33089ec9..fca4a21e0 100644 --- a/src/graphnet/training/utils.py +++ b/src/graphnet/training/utils.py @@ -326,7 +326,9 @@ def save_selection(selection: List[int], file_path: str) -> None: selection: List of event ids. file_path: File path to save the selection. """ - with open(file_path, "w") as file: - file.write("event_id\n") - for event_id in selection: - file.write(f"{event_id}\n") + assert isinstance( + selection, list + ), "Selection should be a list of integers." 
+ with open(file_path, "w") as f: + f.write(",".join(map(str, selection))) + f.write("\n") diff --git a/tests/data/test_datamodule.py b/tests/data/test_datamodule.py new file mode 100644 index 000000000..e826bf189 --- /dev/null +++ b/tests/data/test_datamodule.py @@ -0,0 +1,33 @@ +"""Unit tests for DataModule.""" + +from typing import Union, Dict, Any, List + +import os +import pandas as pd +import pytest +from graphnet.data.constants import FEATURES, TRUTH + +from graphnet.training.utils import save_selection + + +@pytest.fixture +def selection() -> List[int]: + """Return a selection.""" + return [1, 2, 3, 4, 5] + + +@pytest.fixture +def file_path(tmpdir: str) -> str: + """Return a file path.""" + return os.path.join(tmpdir, "selection.csv") + + +def test_save_selection(selection: List[int], file_path: str) -> None: + """Test `save_selection` function.""" + save_selection(selection, file_path) + + assert os.path.exists(file_path) + + with open(file_path, "r") as f: + content = f.read() + assert content.strip() == "1,2,3,4,5" From 31732dfe30d12c1e121069ca145d6c182c0349c7 Mon Sep 17 00:00:00 2001 From: ArturoLlorente Date: Wed, 7 Feb 2024 07:51:35 +0100 Subject: [PATCH 032/124] start working in the changes of data inside GNN definition --- src/graphnet/models/gnn/dynedge.py | 42 ++++++++++++++++-------------- src/graphnet/models/gnn/icemix.py | 36 +++++++++++++------------ 2 files changed, 42 insertions(+), 36 deletions(-) diff --git a/src/graphnet/models/gnn/dynedge.py b/src/graphnet/models/gnn/dynedge.py index 9ea93f9ce..cb65df9ab 100644 --- a/src/graphnet/models/gnn/dynedge.py +++ b/src/graphnet/models/gnn/dynedge.py @@ -17,7 +17,6 @@ "mean": scatter_mean, } - class DynEdge(GNN): """DynEdge (dynamical edge convolutional) model.""" @@ -32,9 +31,9 @@ def __init__( readout_layer_sizes: Optional[List[int]] = None, global_pooling_schemes: Optional[Union[str, List[str]]] = None, add_global_variables_after_pooling: bool = False, + icemix_encoder: bool = False, ): """Construct `DynEdge`. - Args: nb_inputs: Number of input features on each node. nb_neighbours: Number of neighbours to used in the k-nearest @@ -65,6 +64,9 @@ def __init__( after global pooling. The alternative is to added (distribute) them to the individual nodes before any convolutional operations. + icemix_encoder: Whether to use the IceCubeMix encoder. If `True`, + the activation function is GELU, and layer normalization is + applied after each linear layer. Defaults to `False`. """ # Latent feature subset for computing nearest neighbours in DynEdge. if features_subset is None: @@ -95,9 +97,7 @@ def __init__( assert len(dynedge_layer_sizes) assert all(isinstance(sizes, tuple) for sizes in dynedge_layer_sizes) assert all(len(sizes) > 0 for sizes in dynedge_layer_sizes) - assert all( - all(size > 0 for size in sizes) for sizes in dynedge_layer_sizes - ) + assert all(all(size > 0 for size in sizes) for sizes in dynedge_layer_sizes) self._dynedge_layer_sizes = dynedge_layer_sizes @@ -145,19 +145,21 @@ def __init__( "No global pooling schemes were request, so cannot add global" " variables after pooling." 
) - self._add_global_variables_after_pooling = ( - add_global_variables_after_pooling - ) + self._add_global_variables_after_pooling = add_global_variables_after_pooling # Base class constructor super().__init__(nb_inputs, self._readout_layer_sizes[-1]) # Remaining member variables() - self._activation = torch.nn.LeakyReLU() + if icemix_encoder: + self._activation = torch.nn.GELU() + else: + self._activation = torch.nn.LeakyReLU() self._nb_inputs = nb_inputs self._nb_global_variables = 5 + nb_inputs self._nb_neighbours = nb_neighbours self._features_subset = features_subset + self._icemix_encoder = icemix_encoder self._construct_layers() @@ -179,6 +181,8 @@ def _construct_layers(self) -> None: if ix == 0: nb_in *= 2 layers.append(torch.nn.Linear(nb_in, nb_out)) + if self._icemix_encoder: + layers.append(torch.nn.LayerNorm(nb_out)) layers.append(self._activation) conv_layer = DynEdgeConv( @@ -193,25 +197,22 @@ def _construct_layers(self) -> None: # Post-processing operations nb_latent_features = ( - sum(sizes[-1] for sizes in self._dynedge_layer_sizes) - + nb_input_features + sum(sizes[-1] for sizes in self._dynedge_layer_sizes) + nb_input_features ) post_processing_layers = [] - layer_sizes = [nb_latent_features] + list( - self._post_processing_layer_sizes - ) + layer_sizes = [nb_latent_features] + list(self._post_processing_layer_sizes) for nb_in, nb_out in zip(layer_sizes[:-1], layer_sizes[1:]): post_processing_layers.append(torch.nn.Linear(nb_in, nb_out)) + if self._icemix_encoder: + post_processing_layers.append(torch.nn.LayerNorm(nb_out)) post_processing_layers.append(self._activation) self._post_processing = torch.nn.Sequential(*post_processing_layers) # Read-out operations nb_poolings = ( - len(self._global_pooling_schemes) - if self._global_pooling_schemes - else 1 + len(self._global_pooling_schemes) if self._global_pooling_schemes else 1 ) nb_latent_features = nb_out * nb_poolings if self._add_global_variables_after_pooling: @@ -221,6 +222,8 @@ def _construct_layers(self) -> None: layer_sizes = [nb_latent_features] + list(self._readout_layer_sizes) for nb_in, nb_out in zip(layer_sizes[:-1], layer_sizes[1:]): readout_layers.append(torch.nn.Linear(nb_in, nb_out)) + if self._icemix_encoder: + readout_layers.append(torch.nn.LayerNorm(nb_out)) readout_layers.append(self._activation) self._readout = torch.nn.Sequential(*readout_layers) @@ -288,8 +291,7 @@ def forward(self, data: Data) -> Tensor: ).type(torch.float) global_variables_distributed = torch.sum( - distribute.unsqueeze(dim=2) - * global_variables.unsqueeze(dim=0), + distribute.unsqueeze(dim=2) * global_variables.unsqueeze(dim=0), dim=1, ) @@ -322,4 +324,4 @@ def forward(self, data: Data) -> Tensor: # Read-out x = self._readout(x) - return x + return x \ No newline at end of file diff --git a/src/graphnet/models/gnn/icemix.py b/src/graphnet/models/gnn/icemix.py index f365af75d..a35f00f3f 100644 --- a/src/graphnet/models/gnn/icemix.py +++ b/src/graphnet/models/gnn/icemix.py @@ -11,6 +11,9 @@ from torch_geometric.nn.pool import knn_graph from torch_geometric.utils import to_dense_batch +from torch_geometric.data import Data +from torch import Tensor + class DeepIce(GNN): @@ -54,11 +57,11 @@ def __init__( def no_weight_decay(self): return {"cls_token"} - def forward(self, x0): - mask = x0.mask - Lmax = mask.sum(-1).max() - x = self.fourier_ext(x0, Lmax) - rel_pos_bias, rel_enc = self.rel_pos(x0, Lmax) + def forward(self, data: Data) -> Tensor: + mask = data.mask + Lmax = data.n_pulses.sum(-1) + x = self.fourier_ext(data, Lmax) + 
rel_pos_bias, rel_enc = self.rel_pos(data, Lmax) # nbs = get_nbs(x0, Lmax) mask = mask[:, :Lmax] B, _ = mask.shape @@ -130,35 +133,36 @@ def __init__( 9, post_processing_layer_sizes=[336, dim // 2], dynedge_layer_sizes=[(128, 256), (336, 256), (336, 256), (336, 256)], + global_pooling_schemes=None ) @torch.jit.ignore def no_weight_decay(self): return {"cls_token"} - def forward(self, x0): - mask = x0.mask + def forward(self, data: Data) -> Tensor: + mask = data.mask graph_feature = torch.concat( [ - x0.pos[mask], - x0.time[mask].view(-1, 1), - x0.auxiliary[mask].view(-1, 1), - x0.qe[mask].view(-1, 1), - x0.charge[mask].view(-1, 1), - x0.ice_properties[mask], + data.pos[mask], + data.time[mask].view(-1, 1), + data.auxiliary[mask].view(-1, 1), + data.qe[mask].view(-1, 1), + data.charge[mask].view(-1, 1), + data.ice_properties[mask], ], dim=1, ) Lmax = mask.sum(-1).max() - x = self.fourier_ext(x0, Lmax) - rel_pos_bias, rel_enc = self.rel_pos(x0, Lmax) + x = self.fourier_ext(data, Lmax) + rel_pos_bias, rel_enc = self.rel_pos(data, Lmax) mask = mask[:, :Lmax] batch_index = mask.nonzero()[:, 0] edge_index = knn_graph(x=graph_feature[:, :self.knn_features], k=8, batch=batch_index).to( mask.device ) graph_feature = self.dyn_edge( - graph_feature, edge_index, batch_index, x0.n_pulses + graph_feature, edge_index, batch_index, data.n_pulses ) graph_feature, _ = to_dense_batch(graph_feature, batch_index) From 71a798b2063a178e978b5477d02be24baef0ef3d Mon Sep 17 00:00:00 2001 From: samadpls Date: Wed, 7 Feb 2024 17:09:27 +0500 Subject: [PATCH 033/124] Refactored `GraphNeTDataModule` and add test cases for `without_selection` Signed-off-by: samadpls --- src/graphnet/data/datamodule.py | 28 ++++++---- tests/data/test_datamodule.py | 98 +++++++++++++++++++++++++++++++-- 2 files changed, 110 insertions(+), 16 deletions(-) diff --git a/src/graphnet/data/datamodule.py b/src/graphnet/data/datamodule.py index f4db88e88..e35cfba2e 100644 --- a/src/graphnet/data/datamodule.py +++ b/src/graphnet/data/datamodule.py @@ -37,13 +37,13 @@ def __init__( Args: dataset_reference: A non-instantiated reference to the dataset class. dataset_args: Arguments to instantiate graphnet.data.dataset.Dataset with. - selection: (Optional) a list of event id's used for training and validation. - test_selection: (Optional) a list of event id's used for testing. - train_dataloader_kwargs: Arguments for the training DataLoader. - validation_dataloader_kwargs: Arguments for the validation DataLoader. - test_dataloader_kwargs: Arguments for the test DataLoader. + selection: (Optional) a list of event id's used for training and validation, Default None. + test_selection: (Optional) a list of event id's used for testing, Default None. + train_dataloader_kwargs: Arguments for the training DataLoader, Default None. + validation_dataloader_kwargs: Arguments for the validation DataLoader, Default None. + test_dataloader_kwargs: Arguments for the test DataLoader, Default None. train_val_split (Optional): Split ratio for training and validation sets. Default is [0.9, 0.10]. - split_seed: seed used for shuffling and splitting selections into train/validation. + split_seed: seed used for shuffling and splitting selections into train/validation, Default 42. 
""" Logger.__init__(self) self._make_sure_root_logger_is_configured() @@ -63,7 +63,7 @@ def __init__( self._dataset_args["path"], list ) - self.setup("") + self.setup("fit") def prepare_data(self) -> None: """Prepare the dataset for training.""" @@ -86,10 +86,11 @@ def setup(self, stage: str) -> None: self._resolve_selections() # Creation of Datasets - # self._dataset = self._create_dataset(self.) - self._train_dataset = self._create_dataset(self._train_selection) - self._val_dataset = self._create_dataset(self._val_selection) - self._test_dataset = self._create_dataset(self._test_selection) # type: ignore + if stage == "fit" or stage == "validate": + self._train_dataset = self._create_dataset(self._train_selection) + self._val_dataset = self._create_dataset(self._val_selection) + elif stage == "test": + self._test_dataset = self._create_dataset(self._test_selection) # type: ignore return @@ -165,6 +166,9 @@ def _create_dataloader( "Unknown dataset encountered during dataloader creation." ) + if dataloader_args is None: + raise AttributeError("Dataloader arguments not provided.") + return DataLoader(dataset=dataset, **dataloader_args) def _validate_dataset_class(self) -> None: @@ -361,7 +365,7 @@ def _construct_dataset(self, tmp_args: Dict[str, Any]) -> Dataset: Dataset object constructed from input arguments. """ print(tmp_args, "temp argument") - dataset = self._dataset(**tmp_args) + dataset = self._dataset(**tmp_args) # type: ignore return dataset def _create_dataset( diff --git a/tests/data/test_datamodule.py b/tests/data/test_datamodule.py index e826bf189..946dd8906 100644 --- a/tests/data/test_datamodule.py +++ b/tests/data/test_datamodule.py @@ -1,15 +1,66 @@ """Unit tests for DataModule.""" -from typing import Union, Dict, Any, List - import os -import pandas as pd +from typing import List, Any + import pytest -from graphnet.data.constants import FEATURES, TRUTH +from torch.utils.data import SequentialSampler +from graphnet.constants import EXAMPLE_DATA_DIR +from graphnet.data.constants import FEATURES, TRUTH +from graphnet.data.dataset import SQLiteDataset, ParquetDataset +from graphnet.data.datamodule import GraphNeTDataModule +from graphnet.models.detector import IceCubeDeepCore +from graphnet.models.graphs import KNNGraph +from graphnet.models.graphs.nodes import NodesAsPulses from graphnet.training.utils import save_selection +@pytest.fixture +def dataset_ref(request: pytest.FixtureRequest) -> pytest.FixtureRequest: + """Return the dataset reference.""" + return request.param + + +@pytest.fixture +def dataset_setup(dataset_ref: pytest.FixtureRequest) -> tuple: + """Set up the dataset for testing. + + Args: + dataset_ref: The dataset reference. + + Returns: + A tuple with the dataset reference, dataset kwargs, and dataloader kwargs. 
+ """ + # Grab public dataset paths + data_path = ( + f"{EXAMPLE_DATA_DIR}/sqlite/prometheus/prometheus-events.db" + if dataset_ref is SQLiteDataset + else f"{EXAMPLE_DATA_DIR}/parquet/prometheus/prometheus-events.parquet" + ) + + # Setup basic inputs; can be altered by individual tests + graph_definition = KNNGraph( + detector=IceCubeDeepCore(), + node_definition=NodesAsPulses(), + nb_nearest_neighbours=8, + input_feature_names=FEATURES.DEEPCORE, + ) + + dataset_kwargs = { + "truth_table": "mc_truth", + "pulsemaps": "total", + "truth": TRUTH.PROMETHEUS, + "features": FEATURES.PROMETHEUS, + "path": data_path, + "graph_definition": graph_definition, + } + + dataloader_kwargs = {"batch_size": 2, "num_workers": 1} + + return dataset_ref, dataset_kwargs, dataloader_kwargs + + @pytest.fixture def selection() -> List[int]: """Return a selection.""" @@ -31,3 +82,42 @@ def test_save_selection(selection: List[int], file_path: str) -> None: with open(file_path, "r") as f: content = f.read() assert content.strip() == "1,2,3,4,5" + + +@pytest.mark.parametrize( + "dataset_ref", [SQLiteDataset, ParquetDataset], indirect=True +) +def test_single_dataset_without_selections( + dataset_setup: tuple[Any, dict[str, Any], dict[str, int]] +) -> None: + """Verify GraphNeTDataModule behavior when no test selection is provided. + + Args: + dataset_setup: Tuple with dataset reference, dataset arguments, and dataloader arguments. + + Raises: + Exception: If the test dataloader is accessed without providing a test selection. + """ + dataset_ref, dataset_kwargs, dataloader_kwargs = dataset_setup + + # Only training_dataloader args + # Default values should be assigned to validation dataloader + dm = GraphNeTDataModule( + dataset_reference=dataset_ref, + dataset_args=dataset_kwargs, + train_dataloader_kwargs=dataloader_kwargs, + ) + + train_dataloader = dm.train_dataloader() + val_dataloader = dm.val_dataloader() + print(dm.test_dataloader, "here") + + with pytest.raises(Exception): + # should fail because we provided no test selection + test_dataloader = dm.test_dataloader() # noqa + # validation loader should have shuffle = False by default + assert isinstance(val_dataloader.sampler, SequentialSampler) + # Should have identical batch_size + assert val_dataloader.batch_size != train_dataloader.batch_size + # Training dataloader should contain more batches + assert len(train_dataloader) > len(val_dataloader) From 9c58cad42bdde8dba544ae5d5cd2dc23ae7ff374 Mon Sep 17 00:00:00 2001 From: ArturoLlorente Date: Wed, 7 Feb 2024 13:13:23 +0100 Subject: [PATCH 034/124] backup. 
Padding not done in the node definition --- src/graphnet/models/gnn/icemix.py | 3 +-- src/graphnet/models/graphs/nodes/nodes.py | 13 +++++-------- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/src/graphnet/models/gnn/icemix.py b/src/graphnet/models/gnn/icemix.py index a35f00f3f..2d46704af 100644 --- a/src/graphnet/models/gnn/icemix.py +++ b/src/graphnet/models/gnn/icemix.py @@ -59,10 +59,9 @@ def no_weight_decay(self): def forward(self, data: Data) -> Tensor: mask = data.mask - Lmax = data.n_pulses.sum(-1) + Lmax = max(data.n_pulses) x = self.fourier_ext(data, Lmax) rel_pos_bias, rel_enc = self.rel_pos(data, Lmax) - # nbs = get_nbs(x0, Lmax) mask = mask[:, :Lmax] B, _ = mask.shape attn_mask = torch.zeros(mask.shape, device=mask.device) diff --git a/src/graphnet/models/graphs/nodes/nodes.py b/src/graphnet/models/graphs/nodes/nodes.py index fbedd406e..3757b211e 100644 --- a/src/graphnet/models/graphs/nodes/nodes.py +++ b/src/graphnet/models/graphs/nodes/nodes.py @@ -240,12 +240,12 @@ def __init__( "hlc", "rde", "scatt_lenght", - "abs_lenght", - "mask"] + "abs_lenght"] missing_features = set(self.all_features) - set(input_feature_names) if any(feat in missing_features for feat in self.all_features[:7]): - raise ValueError("Features dom_x, dom_y, dom_z, dom_time, charge, hlc, rde are required for IceMixNodes") + raise ValueError(f"Features dom_x, dom_y, dom_z, dom_time, charge, hlc, rde" + f" are required for IceMixNodes") self.feature_indexes = {feat: self.all_features.index(feat) for feat in input_feature_names} self.input_feature_names = input_feature_names @@ -269,9 +269,9 @@ def _add_ice_properties(self, def _construct_nodes(self, x: torch.Tensor) -> Tuple[Data, List[str]]: - graph = torch.zeros([self.max_length, len(self.all_features)]) - n_pulses = x.shape[0] + graph = torch.zeros([n_pulses, len(self.all_features)]) + event_length = n_pulses x[:, self.feature_indexes["hlc"]] = torch.logical_not(x[:, self.feature_indexes["hlc"]]) @@ -284,13 +284,10 @@ def _construct_nodes(self, x: torch.Tensor) -> Tuple[Data, List[str]]: ids_n = ids[auxiliary_n][: min(self.max_length, len(auxiliary_n))] ids_p = ids[auxiliary_p][: min(self.max_length - len(ids_n), len(auxiliary_p))] ids = torch.cat([ids_n, ids_p]).sort().values - #ids.sort() event_length = len(ids) for idx, feature in enumerate(self.all_features[:7]): graph[:event_length, idx] = x[ids, self.feature_indexes[feature]] graph = self._add_ice_properties(graph, x, ids) #ice properties - graph[:event_length,9] = torch.ones(event_length) # mask - return Data(x=graph) \ No newline at end of file From 1d257d75825b4fb1791aa0c9fc9e3195c996385d Mon Sep 17 00:00:00 2001 From: samadpls Date: Wed, 7 Feb 2024 17:34:22 +0500 Subject: [PATCH 035/124] used typing notations Signed-off-by: samadpls --- tests/data/test_datamodule.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/data/test_datamodule.py b/tests/data/test_datamodule.py index 946dd8906..9c291929f 100644 --- a/tests/data/test_datamodule.py +++ b/tests/data/test_datamodule.py @@ -1,7 +1,7 @@ """Unit tests for DataModule.""" import os -from typing import List, Any +from typing import List, Any, Dict, Tuple import pytest from torch.utils.data import SequentialSampler @@ -88,7 +88,7 @@ def test_save_selection(selection: List[int], file_path: str) -> None: "dataset_ref", [SQLiteDataset, ParquetDataset], indirect=True ) def test_single_dataset_without_selections( - dataset_setup: tuple[Any, dict[str, Any], dict[str, int]] + dataset_setup: Tuple[Any, Dict[str, 
Any], Dict[str, int]] ) -> None: """Verify GraphNeTDataModule behavior when no test selection is provided. From af7dd4658732069ac948e2f8d0ad943be523ae80 Mon Sep 17 00:00:00 2001 From: samadpls Date: Wed, 7 Feb 2024 18:10:19 +0500 Subject: [PATCH 036/124] added unit test for `with_selection` use case Signed-off-by: samadpls --- src/graphnet/data/datamodule.py | 12 ++++-- tests/data/test_datamodule.py | 71 ++++++++++++++++++++++++++++++++- 2 files changed, 78 insertions(+), 5 deletions(-) diff --git a/src/graphnet/data/datamodule.py b/src/graphnet/data/datamodule.py index e35cfba2e..2a1c25d35 100644 --- a/src/graphnet/data/datamodule.py +++ b/src/graphnet/data/datamodule.py @@ -86,11 +86,15 @@ def setup(self, stage: str) -> None: self._resolve_selections() # Creation of Datasets - if stage == "fit" or stage == "validate": - self._train_dataset = self._create_dataset(self._train_selection) - self._val_dataset = self._create_dataset(self._val_selection) - elif stage == "test": + if self._test_selection is not None: self._test_dataset = self._create_dataset(self._test_selection) # type: ignore + if stage == "fit" or stage == "validate": + if self._train_selection is not None: + self._train_dataset = self._create_dataset( + self._train_selection + ) + if self._val_selection is not None: + self._val_dataset = self._create_dataset(self._val_selection) return diff --git a/tests/data/test_datamodule.py b/tests/data/test_datamodule.py index 9c291929f..f1418bd4b 100644 --- a/tests/data/test_datamodule.py +++ b/tests/data/test_datamodule.py @@ -2,7 +2,8 @@ import os from typing import List, Any, Dict, Tuple - +import pandas as pd +import sqlite3 import pytest from torch.utils.data import SequentialSampler @@ -121,3 +122,71 @@ def test_single_dataset_without_selections( assert val_dataloader.batch_size != train_dataloader.batch_size # Training dataloader should contain more batches assert len(train_dataloader) > len(val_dataloader) + + +def extract_all_events_ids( + file_path: str, dataset_kwargs: Dict[str, Any] +) -> List[int]: + """Extract all available event ids.""" + if file_path.endswith(".parquet"): + selection = pd.read_parquet(file_path)["event_id"].to_numpy().tolist() + elif file_path.endswith(".db"): + with sqlite3.connect(file_path) as conn: + query = f'SELECT event_no FROM {dataset_kwargs["truth_table"]}' + selection = ( + pd.read_sql(query, conn)["event_no"].to_numpy().tolist() + ) + else: + raise AssertionError( + f"File extension not accepted: {file_path.split('.')[-1]}" + ) + return selection + + +@pytest.mark.parametrize( + "dataset_ref", [SQLiteDataset, ParquetDataset], indirect=True +) +def test_single_dataset_with_selections( + dataset_setup: Tuple[Any, Dict[str, Any], Dict[str, int]] +) -> None: + """Test that selection functionality of DataModule behaves as expected. + + Args: + dataset_setup (Tuple[Any, Dict[str, Any], Dict[str, int]]): A tuple containing the dataset reference, + dataset arguments, and dataloader arguments. 
+ + Returns: + None + """ + # extract all events + dataset_ref, dataset_kwargs, dataloader_kwargs = dataset_setup + file_path = dataset_kwargs["path"] + selection = extract_all_events_ids( + file_path=file_path, dataset_kwargs=dataset_kwargs + ) + + test_selection = selection[0:10] + train_val_selection = selection[10:] + + # Only training_dataloader args + # Default values should be assigned to validation dataloader + dm = GraphNeTDataModule( + dataset_reference=dataset_ref, + dataset_args=dataset_kwargs, + train_dataloader_kwargs=dataloader_kwargs, + selection=train_val_selection, + test_selection=test_selection, + ) + + train_dataloader = dm.train_dataloader() + val_dataloader = dm.val_dataloader() + test_dataloader = dm.test_dataloader() + + # Check that the training and validation dataloader contains + # the same number of events as was given in the selection. + assert len(train_dataloader.dataset) + len(val_dataloader.dataset) == len(train_val_selection) # type: ignore + # Check that the number of events in the test dataset is equal to the + # number of events given in the selection. + assert len(test_dataloader.dataset) == len(test_selection) # type: ignore + # Training dataloader should have more batches + assert len(train_dataloader) > len(val_dataloader) From d8cb6a9387985c3033d9883a366e47e8730d8e36 Mon Sep 17 00:00:00 2001 From: ArturoLlorente Date: Wed, 7 Feb 2024 17:35:04 +0100 Subject: [PATCH 037/124] implemented mask calculation and padding in GNN --- src/graphnet/models/components/layers.py | 12 +++++++----- src/graphnet/models/gnn/icemix.py | 19 +++++++++++++------ 2 files changed, 20 insertions(+), 11 deletions(-) diff --git a/src/graphnet/models/components/layers.py b/src/graphnet/models/components/layers.py index 47ca2300e..38cae1d1f 100644 --- a/src/graphnet/models/components/layers.py +++ b/src/graphnet/models/components/layers.py @@ -10,6 +10,7 @@ from torch_geometric.typing import Adj, PairTensor from torch_geometric.nn.conv import MessagePassing from torch_geometric.nn.inits import reset +from torch_geometric.data import Data import torch.nn as nn from torch.nn.functional import linear from torch.nn.modules import TransformerEncoder, TransformerEncoderLayer @@ -340,14 +341,15 @@ def __init__( def forward( self, x: Tensor, + n_pulses: Tensor, Lmax: Optional[int] = None ) -> Tensor: """Forward pass.""" - pos = x.pos if Lmax is None else x.pos[:, :Lmax] - charge = x.charge if Lmax is None else x.charge[:, :Lmax] - time = x.time if Lmax is None else x.time[:, :Lmax] - auxiliary = x.auxiliary if Lmax is None else x.auxiliary[:, :Lmax] - length = torch.log10(x.n_pulses.to(dtype=pos.dtype)) + pos = x[:,:,:3] if Lmax is None else x[:,:Lmax,:3] + charge = x[:,:,4] if Lmax is None else x[:,:Lmax,4] + time = x[:,:,3] if Lmax is None else x[:,:Lmax,3] + auxiliary = x[:,:,5] if Lmax is None else x[:,:Lmax,5] + length = torch.log10(n_pulses.to(dtype=pos.dtype)) x = torch.cat( [ diff --git a/src/graphnet/models/gnn/icemix.py b/src/graphnet/models/gnn/icemix.py index 2d46704af..39864efda 100644 --- a/src/graphnet/models/gnn/icemix.py +++ b/src/graphnet/models/gnn/icemix.py @@ -14,8 +14,6 @@ from torch_geometric.data import Data from torch import Tensor - - class DeepIce(GNN): def __init__( self, @@ -56,12 +54,21 @@ def __init__( @torch.jit.ignore def no_weight_decay(self): return {"cls_token"} + + def _convert_data(self, data: Data): + """Convert the input data to a tensor of shape (B, L, D)""" + x_list = torch.split(data.x, data.n_pulses.tolist()) + x = 
torch.nn.utils.rnn.pad_sequence(x_list, batch_first=True, padding_value=torch.inf) + mask = torch.ne(x, torch.inf) + x[~mask] = 0 + return x, mask def forward(self, data: Data) -> Tensor: - mask = data.mask - Lmax = max(data.n_pulses) - x = self.fourier_ext(data, Lmax) - rel_pos_bias, rel_enc = self.rel_pos(data, Lmax) + x0, mask = self._convert_data(data) + n_pulses = data.n_pulses + Lmax = max(n_pulses) + x = self.fourier_ext(x0, n_pulses, Lmax) + rel_pos_bias, rel_enc = self.rel_pos(x0, Lmax) mask = mask[:, :Lmax] B, _ = mask.shape attn_mask = torch.zeros(mask.shape, device=mask.device) From 49e72ac7cd3573d41d7142555b2fa19c4bebd74f Mon Sep 17 00:00:00 2001 From: samadpls Date: Wed, 7 Feb 2024 22:51:51 +0500 Subject: [PATCH 038/124] Refactored `dataloader` arguments Signed-off-by: samadpls --- src/graphnet/data/datamodule.py | 17 +++--- tests/data/test_datamodule.py | 94 +++++++++++++++++++++++---------- 2 files changed, 78 insertions(+), 33 deletions(-) diff --git a/src/graphnet/data/datamodule.py b/src/graphnet/data/datamodule.py index 2a1c25d35..92751b092 100644 --- a/src/graphnet/data/datamodule.py +++ b/src/graphnet/data/datamodule.py @@ -5,7 +5,6 @@ from copy import deepcopy from sklearn.model_selection import train_test_split import pandas as pd -import random from graphnet.data.dataset import ( Dataset, @@ -14,7 +13,6 @@ ParquetDataset, ) from graphnet.utilities.logging import Logger -from graphnet.training.utils import save_selection class GraphNeTDataModule(pl.LightningDataModule, Logger): @@ -86,7 +84,10 @@ def setup(self, stage: str) -> None: self._resolve_selections() # Creation of Datasets - if self._test_selection is not None: + if ( + self._test_selection is not None + or len(self._test_dataloader_kwargs) > 0 + ): self._test_dataset = self._create_dataset(self._test_selection) # type: ignore if stage == "fit" or stage == "validate": if self._train_selection is not None: @@ -98,7 +99,8 @@ def setup(self, stage: str) -> None: return - def train_dataloader(self) -> DataLoader: + @property + def train_dataloader(self) -> DataLoader: # type: ignore[override] """Prepare and return the training DataLoader. Returns: @@ -106,7 +108,8 @@ def train_dataloader(self) -> DataLoader: """ return self._create_dataloader(self._train_dataset) - def val_dataloader(self) -> DataLoader: + @property + def val_dataloader(self) -> DataLoader: # type: ignore[override] """Prepare and return the validation DataLoader. Returns: @@ -114,7 +117,8 @@ def val_dataloader(self) -> DataLoader: """ return self._create_dataloader(self._val_dataset) - def test_dataloader(self) -> DataLoader: + @property + def test_dataloader(self) -> DataLoader: # type: ignore[override] """Prepare and return the test DataLoader. Returns: @@ -369,6 +373,7 @@ def _construct_dataset(self, tmp_args: Dict[str, Any]) -> Dataset: Dataset object constructed from input arguments. 
""" print(tmp_args, "temp argument") + print(self._dataset, "<-dataset") dataset = self._dataset(**tmp_args) # type: ignore return dataset diff --git a/tests/data/test_datamodule.py b/tests/data/test_datamodule.py index f1418bd4b..9dab2b1d1 100644 --- a/tests/data/test_datamodule.py +++ b/tests/data/test_datamodule.py @@ -1,5 +1,6 @@ """Unit tests for DataModule.""" +from copy import deepcopy import os from typing import List, Any, Dict, Tuple import pandas as pd @@ -17,6 +18,25 @@ from graphnet.training.utils import save_selection +def extract_all_events_ids( + file_path: str, dataset_kwargs: Dict[str, Any] +) -> List[int]: + """Extract all available event ids.""" + if file_path.endswith(".parquet"): + selection = pd.read_parquet(file_path)["event_id"].to_numpy().tolist() + elif file_path.endswith(".db"): + with sqlite3.connect(file_path) as conn: + query = f'SELECT event_no FROM {dataset_kwargs["truth_table"]}' + selection = ( + pd.read_sql(query, conn)["event_no"].to_numpy().tolist() + ) + else: + raise AssertionError( + f"File extension not accepted: {file_path.split('.')[-1]}" + ) + return selection + + @pytest.fixture def dataset_ref(request: pytest.FixtureRequest) -> pytest.FixtureRequest: """Return the dataset reference.""" @@ -109,13 +129,12 @@ def test_single_dataset_without_selections( train_dataloader_kwargs=dataloader_kwargs, ) - train_dataloader = dm.train_dataloader() - val_dataloader = dm.val_dataloader() - print(dm.test_dataloader, "here") + train_dataloader = dm.train_dataloader + val_dataloader = dm.val_dataloader with pytest.raises(Exception): # should fail because we provided no test selection - test_dataloader = dm.test_dataloader() # noqa + test_dataloader = dm.test_dataloader # noqa # validation loader should have shuffle = False by default assert isinstance(val_dataloader.sampler, SequentialSampler) # Should have identical batch_size @@ -124,25 +143,6 @@ def test_single_dataset_without_selections( assert len(train_dataloader) > len(val_dataloader) -def extract_all_events_ids( - file_path: str, dataset_kwargs: Dict[str, Any] -) -> List[int]: - """Extract all available event ids.""" - if file_path.endswith(".parquet"): - selection = pd.read_parquet(file_path)["event_id"].to_numpy().tolist() - elif file_path.endswith(".db"): - with sqlite3.connect(file_path) as conn: - query = f'SELECT event_no FROM {dataset_kwargs["truth_table"]}' - selection = ( - pd.read_sql(query, conn)["event_no"].to_numpy().tolist() - ) - else: - raise AssertionError( - f"File extension not accepted: {file_path.split('.')[-1]}" - ) - return selection - - @pytest.mark.parametrize( "dataset_ref", [SQLiteDataset, ParquetDataset], indirect=True ) @@ -158,8 +158,8 @@ def test_single_dataset_with_selections( Returns: None """ - # extract all events dataset_ref, dataset_kwargs, dataloader_kwargs = dataset_setup + # extract all events file_path = dataset_kwargs["path"] selection = extract_all_events_ids( file_path=file_path, dataset_kwargs=dataset_kwargs @@ -178,9 +178,9 @@ def test_single_dataset_with_selections( test_selection=test_selection, ) - train_dataloader = dm.train_dataloader() - val_dataloader = dm.val_dataloader() - test_dataloader = dm.test_dataloader() + train_dataloader = dm.train_dataloader + val_dataloader = dm.val_dataloader + test_dataloader = dm.test_dataloader # Check that the training and validation dataloader contains # the same number of events as was given in the selection. 
@@ -190,3 +190,43 @@ def test_single_dataset_with_selections( assert len(test_dataloader.dataset) == len(test_selection) # type: ignore # Training dataloader should have more batches assert len(train_dataloader) > len(val_dataloader) + + +@pytest.mark.parametrize( + "dataset_ref", [SQLiteDataset, ParquetDataset], indirect=True +) +def test_dataloader_args( + dataset_setup: Tuple[Any, Dict[str, Any], Dict[str, int]] +) -> None: + """Test that arguments to dataloaders are propagated correctly. + + Args: + dataset_setup (Tuple[Any, Dict[str, Any], Dict[str, int]]): A tuple containing the dataset reference, + dataset keyword arguments, and dataloader keyword arguments. + + Returns: + None + """ + dataset_ref, dataset_kwargs, dataloader_kwargs = dataset_setup + val_dataloader_kwargs = deepcopy(dataloader_kwargs) + test_dataloader_kwargs = deepcopy(dataloader_kwargs) + + # Setting batch sizes to different values + val_dataloader_kwargs["batch_size"] = 1 + test_dataloader_kwargs["batch_size"] = 2 + dataloader_kwargs["batch_size"] = 3 + + dm = GraphNeTDataModule( + dataset_reference=dataset_ref, + dataset_args=dataset_kwargs, + train_dataloader_kwargs=dataloader_kwargs, + validation_dataloader_kwargs=val_dataloader_kwargs, + test_dataloader_kwargs=test_dataloader_kwargs, + ) + + # Check that the resulting dataloaders have the right batch sizes + assert dm.train_dataloader.batch_size == dataloader_kwargs["batch_size"] + assert dm.val_dataloader.batch_size == val_dataloader_kwargs["batch_size"] + assert ( + dm.test_dataloader.batch_size == test_dataloader_kwargs["batch_size"] + ) From 3b5a225dd65a9949a6292b0148f5f9cedfc2ea7a Mon Sep 17 00:00:00 2001 From: ArturoLlorente Date: Thu, 8 Feb 2024 00:47:09 +0100 Subject: [PATCH 039/124] mask definition, error in Block layer --- src/graphnet/models/components/layers.py | 36 +++++++++--------------- src/graphnet/models/gnn/icemix.py | 12 ++------ 2 files changed, 16 insertions(+), 32 deletions(-) diff --git a/src/graphnet/models/components/layers.py b/src/graphnet/models/components/layers.py index 38cae1d1f..a1832161c 100644 --- a/src/graphnet/models/components/layers.py +++ b/src/graphnet/models/components/layers.py @@ -298,12 +298,7 @@ def forward(self, x: Tensor) -> Tensor: """Forward pass.""" device = x.device half_dim = self.dim // 2 - emb1 = math.log(self.n_freq) / half_dim - emb2 = torch.log(self.n_freq) / half_dim - if emb1 == emb2: - emb = emb1 - else: - raise ValueError("emb1 != emb2") + emb = math.log(self.n_freq) / half_dim emb = torch.exp(torch.arange(half_dim, device=device) * (-emb)) emb = x[..., None] * emb[None, ...] 
emb = torch.cat((emb.sin(), emb.cos()), dim=-1) @@ -342,22 +337,17 @@ def forward( self, x: Tensor, n_pulses: Tensor, - Lmax: Optional[int] = None + #Lmax: Optional[int] = None ) -> Tensor: """Forward pass.""" - pos = x[:,:,:3] if Lmax is None else x[:,:Lmax,:3] - charge = x[:,:,4] if Lmax is None else x[:,:Lmax,4] - time = x[:,:,3] if Lmax is None else x[:,:Lmax,3] - auxiliary = x[:,:,5] if Lmax is None else x[:,:Lmax,5] - length = torch.log10(n_pulses.to(dtype=pos.dtype)) - + length = torch.log10(n_pulses.to(dtype=x.dtype)) x = torch.cat( [ - self.sin_emb(4096 * pos).flatten(-2), - self.sin_emb(1024 * charge), - self.sin_emb(4096 * time), - self.aux_emb(auxiliary), - self.sin_emb2(length).unsqueeze(1).expand(-1, pos.shape[1], -1), + self.sin_emb(4096 * x[:,:,:3]).flatten(-2), #pos + self.sin_emb(1024 * x[:,:,4]), #charge + self.sin_emb(4096 * x[:,:,3]), #time + self.aux_emb(x[:,:,5].long()), #auxiliary + self.sin_emb2(length).unsqueeze(1).expand(-1, max(n_pulses), -1), ], -1, ) @@ -386,13 +376,13 @@ def __init__( def forward( self, x: Tensor, - Lmax: Optional[int] = None, + #Lmax: Optional[int] = None, ) -> Tensor: """Forward pass.""" - pos = x.pos if Lmax is None else x.pos[:, :Lmax] - time = x.time if Lmax is None else x.time[:, :Lmax] - spacetime_interval = (pos[:, :, None] - pos[:, None, :]).pow(2).sum(-1) - ( - (time[:, :, None] - time[:, None, :]) * (3e4 / 500 * 3e-1) + #pos = x[:,:,:3] + #time = x[:,:,3] + spacetime_interval = (x[:, :, :3, None] - x[:, :, None, :3]).pow(2).sum(-1) - ( + (x[:, :, 3, None] - x[:, :, None, 3]) * (3e4 / 500 * 3e-1) ).pow(2) four_distance = torch.sign(spacetime_interval) * torch.sqrt(torch.abs(spacetime_interval)) sin_emb = self.sin_emb(1024 * four_distance.clip(-4, 4)) diff --git a/src/graphnet/models/gnn/icemix.py b/src/graphnet/models/gnn/icemix.py index 39864efda..212e4284a 100644 --- a/src/graphnet/models/gnn/icemix.py +++ b/src/graphnet/models/gnn/icemix.py @@ -45,7 +45,6 @@ def __init__( for i in range(depth) ] ) - #self.proj_out = nn.Linear(dim, 3) self.use_checkpoint = use_checkpoint self.n_rel = n_rel @@ -59,17 +58,15 @@ def _convert_data(self, data: Data): """Convert the input data to a tensor of shape (B, L, D)""" x_list = torch.split(data.x, data.n_pulses.tolist()) x = torch.nn.utils.rnn.pad_sequence(x_list, batch_first=True, padding_value=torch.inf) - mask = torch.ne(x, torch.inf) + mask = torch.ne(x[:,:,1], torch.inf) x[~mask] = 0 return x, mask def forward(self, data: Data) -> Tensor: x0, mask = self._convert_data(data) n_pulses = data.n_pulses - Lmax = max(n_pulses) - x = self.fourier_ext(x0, n_pulses, Lmax) - rel_pos_bias, rel_enc = self.rel_pos(x0, Lmax) - mask = mask[:, :Lmax] + x = self.fourier_ext(x0, n_pulses) + rel_pos_bias, rel_enc = self.rel_pos(x0) B, _ = mask.shape attn_mask = torch.zeros(mask.shape, device=mask.device) attn_mask[~mask] = -torch.inf @@ -93,7 +90,6 @@ def forward(self, data: Data) -> Tensor: else: x = blk(x, None, attn_mask) - #x = self.proj_out(x[:, 0]) # cls token return x[:, 0] @@ -133,7 +129,6 @@ def __init__( for i in range(depth) ] ) - #self.proj_out = nn.Linear(dim, 3) self.use_checkpoint = use_checkpoint self.dyn_edge = DynEdge( 9, @@ -195,5 +190,4 @@ def forward(self, data: Data) -> Tensor: else: x = blk(x, None, attn_mask) - #x = self.proj_out(x[:, 0]) # cls token return x[:, 0] \ No newline at end of file From c207ec6ea53b25d3d0d6b19c6dd7a01faaa4874d Mon Sep 17 00:00:00 2001 From: Rasmus Oersoe Date: Thu, 8 Feb 2024 15:00:45 +0100 Subject: [PATCH 040/124] restructure --- 
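[Note on the `SinusoidalPosEmb` simplification in the previous patch: each scalar input is mapped to `dim` channels via a geometric ladder of frequencies followed by concatenated sin/cos terms. A self-contained sketch of the same computation, assuming the illustrative values `dim=16` and `n_freq=10000` (the actual defaults are not shown in this patch), is:

import math
import torch

def sinusoidal_emb(x: torch.Tensor, dim: int = 16, n_freq: float = 10000.0) -> torch.Tensor:
    # Geometric frequency ladder, outer product with the input,
    # then concatenated sin/cos channels, as in the forward pass above.
    half_dim = dim // 2
    scale = math.log(n_freq) / half_dim
    freqs = torch.exp(torch.arange(half_dim) * (-scale))
    out = x[..., None] * freqs[None, ...]
    return torch.cat((out.sin(), out.cos()), dim=-1)

positions = torch.rand(2, 5, 3)            # e.g. scaled DOM positions
print(sinusoidal_emb(positions).shape)     # torch.Size([2, 5, 3, 16])
]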
examples/01_icetray/01_convert_i3_files.py | 2 +- .../02_compare_sqlite_and_parquet.py | 2 +- src/graphnet/data/__init__.py | 5 +- src/graphnet/data/dataconverter.py | 716 ++++++------------ src/graphnet/data/dataconverter_new.py | 279 ------- src/graphnet/data/extractors/__init__.py | 21 +- src/graphnet/data/extractors/extractor.py | 56 -- src/graphnet/data/extractors/i3extractor.py | 106 --- .../data/extractors/i3particleextractor.py | 43 -- .../data/extractors/icecube/__init__.py | 20 + .../data/extractors/icecube/i3extractor.py | 66 ++ .../{ => icecube}/i3featureextractor.py | 9 +- .../{ => icecube}/i3genericextractor.py | 6 +- .../{ => icecube}/i3hybridrecoextractor.py | 2 +- .../{ => icecube}/i3ntmuonlabelsextractor.py | 2 +- .../extractors/icecube/i3particleextractor.py | 44 ++ .../{ => icecube}/i3pisaextractor.py | 2 +- .../{ => icecube}/i3quesoextractor.py | 2 +- .../{ => icecube}/i3retroextractor.py | 4 +- .../{ => icecube}/i3splinempeextractor.py | 2 +- .../{ => icecube}/i3truthextractor.py | 4 +- .../{ => icecube}/i3tumextractor.py | 2 +- .../{ => icecube}/utilities/__init__.py | 0 .../{ => icecube}/utilities/collections.py | 0 .../{ => icecube}/utilities/frames.py | 0 .../icecube/utilities/i3_filters.py} | 0 .../{ => icecube}/utilities/types.py | 4 +- src/graphnet/data/parquet/__init__.py | 2 - .../data/parquet/parquet_dataconverter.py | 52 -- src/graphnet/data/pipeline.py | 4 +- src/graphnet/data/readers/__init__.py | 3 + .../data/readers/graphnet_file_reader.py | 132 ++++ .../data/{readers.py => readers/i3reader.py} | 144 +--- src/graphnet/data/sqlite/__init__.py | 4 - .../data/sqlite/sqlite_dataconverter.py | 349 --------- src/graphnet/data/utilities/__init__.py | 3 + .../data/utilities/parquet_to_sqlite.py | 4 +- .../{sqlite => utilities}/sqlite_utilities.py | 54 +- src/graphnet/data/writers/__init__.py | 4 + .../graphnet_writer.py} | 69 +- src/graphnet/data/writers/parquet_writer.py | 34 + src/graphnet/data/writers/sqlite_writer.py | 224 ++++++ .../deployment/i3modules/graphnet_module.py | 4 +- src/graphnet/models/graphs/edges/minkowski.py | 11 +- src/graphnet/training/weight_fitting.py | 4 +- .../data/test_dataconverters_and_datasets.py | 2 +- tests/data/test_i3extractor.py | 2 +- tests/data/test_i3genericextractor.py | 8 +- 48 files changed, 879 insertions(+), 1633 deletions(-) delete mode 100644 src/graphnet/data/dataconverter_new.py delete mode 100644 src/graphnet/data/extractors/i3extractor.py delete mode 100644 src/graphnet/data/extractors/i3particleextractor.py create mode 100644 src/graphnet/data/extractors/icecube/__init__.py create mode 100644 src/graphnet/data/extractors/icecube/i3extractor.py rename src/graphnet/data/extractors/{ => icecube}/i3featureextractor.py (97%) rename src/graphnet/data/extractors/{ => icecube}/i3genericextractor.py (98%) rename src/graphnet/data/extractors/{ => icecube}/i3hybridrecoextractor.py (96%) rename src/graphnet/data/extractors/{ => icecube}/i3ntmuonlabelsextractor.py (96%) create mode 100644 src/graphnet/data/extractors/icecube/i3particleextractor.py rename src/graphnet/data/extractors/{ => icecube}/i3pisaextractor.py (94%) rename src/graphnet/data/extractors/{ => icecube}/i3quesoextractor.py (94%) rename src/graphnet/data/extractors/{ => icecube}/i3retroextractor.py (97%) rename src/graphnet/data/extractors/{ => icecube}/i3splinempeextractor.py (93%) rename src/graphnet/data/extractors/{ => icecube}/i3truthextractor.py (99%) rename src/graphnet/data/extractors/{ => icecube}/i3tumextractor.py (94%) rename 
src/graphnet/data/extractors/{ => icecube}/utilities/__init__.py (100%) rename src/graphnet/data/extractors/{ => icecube}/utilities/collections.py (100%) rename src/graphnet/data/extractors/{ => icecube}/utilities/frames.py (100%) rename src/graphnet/data/{filters.py => extractors/icecube/utilities/i3_filters.py} (100%) rename src/graphnet/data/extractors/{ => icecube}/utilities/types.py (98%) delete mode 100644 src/graphnet/data/parquet/__init__.py delete mode 100644 src/graphnet/data/parquet/parquet_dataconverter.py create mode 100644 src/graphnet/data/readers/__init__.py create mode 100644 src/graphnet/data/readers/graphnet_file_reader.py rename src/graphnet/data/{readers.py => readers/i3reader.py} (51%) delete mode 100644 src/graphnet/data/sqlite/__init__.py delete mode 100644 src/graphnet/data/sqlite/sqlite_dataconverter.py rename src/graphnet/data/{sqlite => utilities}/sqlite_utilities.py (72%) create mode 100644 src/graphnet/data/writers/__init__.py rename src/graphnet/data/{writers.py => writers/graphnet_writer.py} (57%) create mode 100644 src/graphnet/data/writers/parquet_writer.py create mode 100644 src/graphnet/data/writers/sqlite_writer.py diff --git a/examples/01_icetray/01_convert_i3_files.py b/examples/01_icetray/01_convert_i3_files.py index 88dcf714a..9a39f95e7 100644 --- a/examples/01_icetray/01_convert_i3_files.py +++ b/examples/01_icetray/01_convert_i3_files.py @@ -3,7 +3,7 @@ import os from graphnet.constants import EXAMPLE_OUTPUT_DIR, TEST_DATA_DIR -from graphnet.data.extractors import ( +from graphnet.data.extractors.icecube import ( I3FeatureExtractorIceCubeUpgrade, I3FeatureExtractorIceCube86, I3RetroExtractor, diff --git a/examples/01_icetray/02_compare_sqlite_and_parquet.py b/examples/01_icetray/02_compare_sqlite_and_parquet.py index 99250d4b0..d3874c5f2 100644 --- a/examples/01_icetray/02_compare_sqlite_and_parquet.py +++ b/examples/01_icetray/02_compare_sqlite_and_parquet.py @@ -7,7 +7,7 @@ from graphnet.data.sqlite import SQLiteDataConverter from graphnet.data.parquet import ParquetDataConverter from graphnet.data.dataset import SQLiteDataset, ParquetDataset -from graphnet.data.extractors import ( +from graphnet.data.extractors.icecube import ( I3FeatureExtractorIceCube86, I3TruthExtractor, I3RetroExtractor, diff --git a/src/graphnet/data/__init__.py b/src/graphnet/data/__init__.py index fbb1ee095..e7eb84ca4 100644 --- a/src/graphnet/data/__init__.py +++ b/src/graphnet/data/__init__.py @@ -1,6 +1,7 @@ """Modules for converting and ingesting data. `graphnet.data` enables converting domain-specific data to industry-standard, -intermediate file formats and reading this data. +intermediate file formats and reading this data. """ -from .filters import I3Filter, I3FilterMask +from .extractors.icecube.utilities.i3_filters import I3Filter, I3FilterMask +from .dataconverter import DataConverter diff --git a/src/graphnet/data/dataconverter.py b/src/graphnet/data/dataconverter.py index 2a67ddce9..efae14a2f 100644 --- a/src/graphnet/data/dataconverter.py +++ b/src/graphnet/data/dataconverter.py @@ -1,496 +1,251 @@ -"""Base `DataConverter` class(es) used in GraphNeT.""" -# type: ignore[name-defined] # Due to use of `init_global_index`. 
- -from abc import ABC, abstractmethod -from collections import OrderedDict -from dataclasses import dataclass -from functools import wraps -import itertools +"""Contains `DataConverter`.""" +from typing import List, Union, OrderedDict, Dict, Tuple, Any, Optional, Type +from abc import abstractmethod, ABC + +from tqdm import tqdm +import numpy as np from multiprocessing import Manager, Pool, Value import multiprocessing.pool from multiprocessing.sharedctypes import Synchronized +import pandas as pd import os -import re -from typing import ( - Any, - Callable, - Dict, - List, - Optional, - Tuple, - TypeVar, - Union, - cast, -) +from glob import glob -import numpy as np -import pandas as pd -from tqdm import tqdm -from graphnet.data.utilities.random import pairwise_shuffle -from graphnet.data.extractors import ( - I3Extractor, - I3ExtractorCollection, - I3FeatureExtractor, - I3TruthExtractor, - I3GenericExtractor, -) from graphnet.utilities.decorators import final -from graphnet.utilities.filesys import find_i3_files -from graphnet.utilities.imports import has_icecube_package from graphnet.utilities.logging import Logger -from graphnet.data.filters import I3Filter, NullSplitI3Filter - -if has_icecube_package(): - from icecube import icetray, dataio # pyright: reportMissingImports=false +from .readers.graphnet_file_reader import GraphNeTFileReader +from .writers.graphnet_writer import GraphNeTWriter +from .extractors import Extractor +from .dataclasses import I3FileSet -SAVE_STRATEGIES = [ - "1:1", - "sequential_batched", - "pattern_batched", -] - - -# Utility classes -@dataclass -class FileSet: # noqa: D101 - i3_file: str - gcd_file: str - - -# Utility method(s) def init_global_index(index: Synchronized, output_files: List[str]) -> None: """Make `global_index` available to pool workers.""" global global_index, global_output_files # type: ignore[name-defined] global_index, global_output_files = (index, output_files) # type: ignore[name-defined] -F = TypeVar("F", bound=Callable[..., Any]) - - -def cache_output_files(process_method: F) -> F: - """Decorate `process_method` to cache output file names.""" - - @wraps(process_method) - def wrapper(self: Any, *args: Any) -> Any: - try: - # Using multiprocessing - output_files = global_output_files # type: ignore[name-defined] - except NameError: # `global_output_files` not set - # Running on main process - output_files = self._output_files - - output_file = process_method(self, *args) - output_files.append(output_file) - return output_file - - return cast(F, wrapper) - - class DataConverter(ABC, Logger): - """Base class for converting I3-files to intermediate file format.""" + """A finalized data conversion class in GraphNeT. - @property - @abstractmethod - def file_suffix(self) -> str: - """Suffix to use on output files.""" + `DataConverter` provides parallel processing of file conversion and + extraction from experiment-specific file formats to graphnet-supported data + formats. This class also assigns event id's to training examples. 
+ """ def __init__( self, - extractors: List[I3Extractor], - outdir: str, - gcd_rescue: Optional[str] = None, - *, - nb_files_to_batch: Optional[int] = None, - sequential_batch_pattern: Optional[str] = None, - input_file_batch_pattern: Optional[str] = None, - workers: int = 1, + file_reader: Type[GraphNeTFileReader], + save_method: Type[GraphNeTWriter], + extractors: Union[Type[Extractor], List[Type[Extractor]]], index_column: str = "event_no", - icetray_verbose: int = 0, - i3_filters: List[I3Filter] = [], - ): - """Construct DataConverter. - - When using `input_file_batch_pattern`, regular expressions are used to - group files according to their names. All files that match a certain - pattern up to wildcards are grouped into the same output file. This - output file has the same name as the input files that are group into it, - with wildcards replaced with "x". Periods (.) and wildcards (*) have a - special meaning: Periods are interpreted as literal periods, and not as - matching any character (as in standard regex); and wildcards are - interpreted as ".*" in standard regex. - - For instance, the pattern "[A-Z]{1}_[0-9]{5}*.i3.zst" will find all I3 - files whose names contain: - - one capital letter, followed by - - an underscore, followed by - - five numbers, followed by - - any string of characters ending in ".i3.zst" - - This means that, e.g., the files: - - upgrade_genie_step4_141020_A_000000.i3.zst - - upgrade_genie_step4_141020_A_000001.i3.zst - - ... - - upgrade_genie_step4_141020_A_000008.i3.zst - - upgrade_genie_step4_141020_A_000009.i3.zst - would be grouped into the output file named - "upgrade_genie_step4_141020_A_00000x." but the file - - upgrade_genie_step4_141020_A_000010.i3.zst - would end up in a separate group, named - "upgrade_genie_step4_141020_A_00001x.". - """ - # Check(s) - if not isinstance(extractors, (list, tuple)): - extractors = [extractors] - - assert ( - len(extractors) > 0 - ), "Please specify at least one argument of type I3Extractor" - - for extractor in extractors: - assert isinstance( - extractor, I3Extractor - ), f"{type(extractor)} is not a subclass of I3Extractor" - - # Infer saving strategy - save_strategy = self._infer_save_strategy( - nb_files_to_batch, - sequential_batch_pattern, - input_file_batch_pattern, - ) + num_workers: int = 1, + ) -> None: + """Initialize `DataConverter`. - # Member variables - self._outdir = outdir - self._gcd_rescue = gcd_rescue - self._save_strategy = save_strategy - self._nb_files_to_batch = nb_files_to_batch - self._sequential_batch_pattern = sequential_batch_pattern - self._input_file_batch_pattern = input_file_batch_pattern - self._workers = workers - - # I3Filters (NullSplitI3Filter is always included) - self._i3filters = [NullSplitI3Filter()] + i3_filters - - for filter in self._i3filters: - assert isinstance( - filter, I3Filter - ), f"{type(filter)} is not a subclass of I3Filter" - - # Create I3Extractors - self._extractors = I3ExtractorCollection(*extractors) - - # Create shorthand of names of all pulsemaps queried - self._table_names = [extractor.name for extractor in self._extractors] - self._pulsemaps = [ - extractor.name - for extractor in self._extractors - if isinstance(extractor, I3FeatureExtractor) - ] - - # Placeholders for keeping track of sequential event indices and output files + Args: + file_reader: The method used for reading and applying `Extractors`. + save_method: The method used to save the interim data format to + a graphnet supported file format. 
+ extractors: The `Extractor`(s) that will be applied to the input + files. + index_column: Name of the event id column added to the events. + Defaults to "event_no". + num_workers: The number of CPUs used for parallel processing. + Defaults to 1 (no multiprocessing). + """ + # Member Variable Assignment + self._file_reader = file_reader + self._save_method = save_method + self._num_workers = num_workers self._index_column = index_column self._index = 0 self._output_files: List[str] = [] - # Set verbosity - if icetray_verbose == 0: - icetray.I3Logger.global_logger = icetray.I3NullLogger() + # Set Extractors. Will throw error if extractors are incompatible + # with reader. + self._file_reader.set_extractors(extractors) # Base class constructor super().__init__(name=__name__, class_name=self.__class__.__name__) @final def __call__( - self, - directories: Union[str, List[str]], - recursive: Optional[bool] = True, + self, input_dir: Union[str, List[str]], output_dir: str ) -> None: - """Convert I3-files in `directories. + """Extract data from files in `input_dir` and save to disk. Args: - directories: One or more directories, the I3 files within which - should be converted to an intermediate file format. - recursive: Whether or not to search the directories recursively. + input_dir: A directory that contains the input files. + The directory will be searched recursively for files + matching the file extension. + output_dir: The directory to save the files to. Input folder + structure is not respected. """ - # Find all I3 and GCD files in the specified directories. - i3_files, gcd_files = find_i3_files( - directories, self._gcd_rescue, recursive + # Set outdir + self._output_dir = output_dir + # Get the file reader to produce a list of input files + # in the directory + input_files = self._file_reader.find_files(path=input_dir) # type: ignore + self._launch_jobs(input_files=input_files) + self._output_files = glob( + os.path.join( + self._output_dir, f"*{self._save_method.file_extension}" + ) ) - if len(i3_files) == 0: - self.error(f"No files found in {directories}.") - return - - # Save a record of the found I3 files in the output directory. - self._save_filenames(i3_files) - - # Shuffle I3 files to get a more uniform load on worker nodes. - i3_files, gcd_files = pairwise_shuffle(i3_files, gcd_files) - - # Process the files - filesets = [ - FileSet(i3_file, gcd_file) - for i3_file, gcd_file in zip(i3_files, gcd_files) - ] - self.execute(filesets) @final - def execute(self, filesets: List[FileSet]) -> None: - """General method for processing a set of I3 files. - - The files are converted individually according to the inheriting class/ - intermediate file format. - - Args: - filesets: List of paths to I3 and corresponding GCD files. - """ - # Make sure output directory exists. - self.info(f"Saving results to {self._outdir}") - os.makedirs(self._outdir, exist_ok=True) - - # Iterate over batches of files. - try: - if self._save_strategy == "sequential_batched": - # Define batches - assert self._nb_files_to_batch is not None - assert self._sequential_batch_pattern is not None - batches = np.array_split( - np.asarray(filesets), - int(np.ceil(len(filesets) / self._nb_files_to_batch)), - ) - batches = [ - ( - group.tolist(), - self._sequential_batch_pattern.format(ix_batch), - ) - for ix_batch, group in enumerate(batches) - ] - self.info( - f"Will batch {len(filesets)} input files into {len(batches)} groups." 
- ) - - # Iterate over batches - pool = self._iterate_over_batches_of_files(batches) - - elif self._save_strategy == "pattern_batched": - # Define batches - groups: Dict[str, List[FileSet]] = OrderedDict() - for fileset in sorted(filesets, key=lambda f: f.i3_file): - group = re.sub( - self._sub_from, - self._sub_to, - os.path.basename(fileset.i3_file), - ) - if group not in groups: - groups[group] = list() - groups[group].append(fileset) - - self.info( - f"Will batch {len(filesets)} input files into {len(groups)} groups" - ) - if len(groups) <= 20: - for group, group_filesets in groups.items(): - self.info( - f"> {group}: {len(group_filesets):3d} file(s)" - ) - - batches = [ - (list(group_filesets), group) - for group, group_filesets in groups.items() - ] - - # Iterate over batches - pool = self._iterate_over_batches_of_files(batches) - - elif self._save_strategy == "1:1": - pool = self._iterate_over_individual_files(filesets) - - else: - assert False, "Shouldn't reach here." - - self._update_shared_variables(pool) - - except KeyboardInterrupt: - self.warning("[ctrl+c] Exciting gracefully.") - - @abstractmethod - def save_data(self, data: List[OrderedDict], output_file: str) -> None: - """Implementation-specific method for saving data to file. - - Args: - data: List of extracted features. - output_file: Name of output file. - """ - - @abstractmethod - def merge_files( - self, output_file: str, input_files: Optional[List[str]] = None + def _launch_jobs( + self, + input_files: Union[List[str], List[I3FileSet]], ) -> None: - """Implementation-specific method for merging output files. + """Multi Processing Logic. - Args: - output_file: Name of the output file containing the merged results. - input_files: Intermediate files to be merged, according to the - specific implementation. Default to None, meaning that all - files output by the current instance are merged. - - Raises: - NotImplementedError: If the method has not been implemented for the - backend in question. - """ + Spawns worker pool, + distributes the input files evenly across workers. + declare event_no as globally accessible variable across workers. + starts jobs. - # Internal methods - def _iterate_over_individual_files( - self, args: List[FileSet] - ) -> Optional[multiprocessing.pool.Pool]: + Will call process_file in parallel. + """ # Get appropriate mapping function - map_fn, pool = self.get_map_function(len(args)) + map_fn, pool = self.get_map_function(nb_files=len(input_files)) # Iterate over files for _ in map_fn( - self._process_file, tqdm(args, unit="file(s)", colour="green") + self._process_file, + tqdm(input_files, unit="file(s)", colour="green"), ): - self.debug( - "Saving with 1:1 strategy on the individual worker processes" - ) + self.debug("processing file.") - return pool + self._update_shared_variables(pool) - def _iterate_over_batches_of_files( - self, args: List[Tuple[List[FileSet], str]] - ) -> Optional[multiprocessing.pool.Pool]: - """Iterate over a batch of files and save results on worker process.""" - # Get appropriate mapping function - map_fn, pool = self.get_map_function(len(args), unit="batch(es)") - - # Iterate over batches of files - for _ in map_fn( - self._process_batch, tqdm(args, unit="batch(es)", colour="green") - ): - self.debug("Saving with batched strategy") - - return pool + @final + def _process_file(self, file_path: Union[str, I3FileSet]) -> None: + """Process a single file. 
- def _update_shared_variables( - self, pool: Optional[multiprocessing.pool.Pool] - ) -> None: - """Update `self._index` and `self._output_files`. + Calls file reader to recieve extracted output, event ids + is assigned to the extracted data and is handed to save method. - If `pool` is set, it means that multiprocessing was used. In this case, - the worker processes will not have been able to write directly to - `self._index` and `self._output_files`, and we need to get them synced - up. + This function is called in parallel. """ - if pool: - # Extract information from shared variables to member variables. - index, output_files = pool._initargs # type: ignore - self._index += index.value - self._output_files.extend(list(sorted(output_files[:]))) - - @cache_output_files - def _process_file( - self, - fileset: FileSet, - ) -> str: - - # Process individual files - data = self._extract_data(fileset) - - # Save data - output_file = self._get_output_file(fileset.i3_file) - self.save_data(data, output_file) - - return output_file - - @cache_output_files - def _process_batch(self, args: Tuple[List[FileSet], str]) -> str: - # Unpack arguments - filesets, output_file_name = args - - # Process individual files - data = list( - itertools.chain.from_iterable(map(self._extract_data, filesets)) + # Read and apply extractors + data = self._file_reader(file_path=file_path) + n_events = len(data) # type: ignore + + # Assign event_no's to each event in data and transform to pd.DataFrame + data = self._assign_event_no(data=data) + + # Create output file name + output_file_name = self._create_file_name(input_file_path=file_path) + + # Apply save method + self._save_method( + data=data, + file_name=output_file_name, + n_events=n_events, + output_dir=self._output_dir, ) - # Save batched data - output_file = self._get_output_file(output_file_name) - self.save_data(data, output_file) - - return output_file - - def _extract_data(self, fileset: FileSet) -> List[OrderedDict]: - """Extract data from single I3 file. - - If the saving strategy is 1:1 (i.e., each I3 file is converted to a - corresponding intermediate file) the data is saved to such a file, and - no data is return from the method. + @final + def _create_file_name(self, input_file_path: Union[str, I3FileSet]) -> str: + """Convert input file path to an output file name.""" + if isinstance(input_file_path, I3FileSet): + input_file_path = input_file_path.i3_file + path_without_extension = os.path.splitext(input_file_path)[0] + base_file_name = path_without_extension.split("/")[-1] + return base_file_name # type: ignore - The above distincting is to allow worker processes to save files rather - than sending it back to the main process. + @final + def _assign_event_no( + self, data: List[OrderedDict[str, Any]] + ) -> Dict[str, pd.DataFrame]: + + # Request event_no's for the entire file + event_nos = self._request_event_nos(n_ids=len(data)) + + # Dict holding pd.DataFrame's + dataframe_dict: Dict = {} + # Loop through events (again..) 
to assign event_nos + for k in range(len(data)): + for extractor_name in data[k].keys(): + n_rows = self._count_rows( + event_dict=data[k], extractor_name=extractor_name + ) + if n_rows > 0: + data[k][extractor_name][self._index_column] = np.repeat( + event_nos[k], n_rows + ).tolist() + df = pd.DataFrame( + data[k][extractor_name], + index=[0] if n_rows == 1 else None, + ) + if extractor_name in dataframe_dict.keys(): + dataframe_dict[extractor_name].append(df) + else: + dataframe_dict[extractor_name] = [df] + # Merge each list of dataframes + for key in dataframe_dict.keys(): + dataframe_dict[key] = pd.concat( + dataframe_dict[key], axis=0 + ).reset_index(drop=True) + return dataframe_dict - Args: - fileset: Path to I3 file and corresponding GCD file. + @final + def _count_rows( + self, event_dict: OrderedDict[str, Any], extractor_name: str + ) -> int: + """Count number of rows that features from `extractor_name` have.""" + extractor_dict = event_dict[extractor_name] - Returns: - Extracted data. - """ - # Infer whether method is being run using multiprocessing try: - global_index # type: ignore[name-defined] - multi_processing = True - except NameError: - multi_processing = False - - self._extractors.set_files(fileset.i3_file, fileset.gcd_file) - i3_file_io = dataio.I3File(fileset.i3_file, "r") - data = list() - while i3_file_io.more(): - try: - frame = i3_file_io.pop_physics() - except Exception as e: - if "I3" in str(e): - continue - # check if frame should be skipped - if self._skip_frame(frame): - continue - - # Try to extract data from I3Frame - results = self._extractors(frame) - - data_dict = OrderedDict(zip(self._table_names, results)) - - # If an I3GenericExtractor is used, we want each automatically - # parsed key to be stored as a separate table. - for extractor in self._extractors: - if isinstance(extractor, I3GenericExtractor): - data_dict.update(data_dict.pop(extractor._name)) - - # Get new, unique index and increment value - if multi_processing: - with global_index.get_lock(): # type: ignore[name-defined] - index = global_index.value # type: ignore[name-defined] - global_index.value += 1 # type: ignore[name-defined] + # If all features in extractor_name have the same length + # this line of code will execute without error and result + # in an array with shape [num_features, n_rows_in_feature] + # unless the list is empty! + + shape = np.asarray(list(extractor_dict.values())).shape + if len(shape) > 1: + n_rows = shape[1] else: - index = self._index - self._index += 1 - - # Attach index to all tables - for table in data_dict.keys(): - data_dict[table][self._index_column] = index - - data.append(data_dict) + n_rows = 1 + except ValueError as e: + self.error( + f"Features from {extractor_name} ({extractor_dict.keys()}) have different lengths." 
+ ) + raise e + return n_rows + + def _request_event_nos(self, n_ids: int) -> List[int]: + + # Get new, unique index and increment value + if self._num_workers > 1: + with global_index.get_lock(): # type: ignore[name-defined] + starting_index = global_index.value # type: ignore[name-defined] + event_nos = np.arange( + starting_index, starting_index + n_ids, 1 + ).tolist() + global_index.value += n_ids # type: ignore[name-defined] + else: + starting_index = self._index + event_nos = np.arange( + starting_index, starting_index + n_ids, 1 + ).tolist() + self._index += n_ids - return data + return event_nos + @final def get_map_function( - self, nb_files: int, unit: str = "I3 file(s)" + self, nb_files: int, unit: str = "file(s)" ) -> Tuple[Any, Optional[multiprocessing.pool.Pool]]: """Identify map function to use (pure python or multiprocess).""" # Choose relevant map-function given the requested number of workers. - workers = min(self._workers, nb_files) - if workers > 1: + n_workers = min(self._num_workers, nb_files) + if n_workers > 1: self.info( - f"Starting pool of {workers} workers to process {nb_files} {unit}" + f"Starting pool of {n_workers} workers to process {nb_files} {unit}" ) manager = Manager() @@ -498,7 +253,7 @@ def get_map_function( output_files = manager.list() pool = Pool( - processes=workers, + processes=n_workers, initializer=init_global_index, initargs=(index, output_files), ) @@ -513,75 +268,50 @@ def get_map_function( return map_fn, pool - def _infer_save_strategy( - self, - nb_files_to_batch: Optional[int] = None, - sequential_batch_pattern: Optional[str] = None, - input_file_batch_pattern: Optional[str] = None, - ) -> str: - if input_file_batch_pattern is not None: - save_strategy = "pattern_batched" - - assert ( - "*" in input_file_batch_pattern - ), "Argument `input_file_batch_pattern` should contain at least one wildcard (*)" - - fields = [ - "(" + field + ")" - for field in input_file_batch_pattern.replace( - ".", r"\." - ).split("*") - ] - nb_fields = len(fields) - self._sub_from = ".*".join(fields) - self._sub_to = "x".join([f"\\{ix + 1}" for ix in range(nb_fields)]) - - if sequential_batch_pattern is not None: - self.warning("Argument `sequential_batch_pattern` ignored.") - if nb_files_to_batch is not None: - self.warning("Argument `nb_files_to_batch` ignored.") - - elif (nb_files_to_batch is not None) or ( - sequential_batch_pattern is not None - ): - save_strategy = "sequential_batched" + @final + def _update_shared_variables( + self, pool: Optional[multiprocessing.pool.Pool] + ) -> None: + """Update `self._index` and `self._output_files`. - assert (nb_files_to_batch is not None) and ( - sequential_batch_pattern is not None - ), "Please specify both `nb_files_to_batch` and `sequential_batch_pattern` for sequential batching." + If `pool` is set, it means that multiprocessing was used. In this case, + the worker processes will not have been able to write directly to + `self._index` and `self._output_files`, and we need to get them synced + up. + """ + if pool: + # Extract information from shared variables to member variables. 
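To make the event-numbering step concrete, consider a hypothetical two-event file. The extractor and feature names below are invented for illustration; only the shapes matter.

from collections import OrderedDict

# Hypothetical reader output for a two-event file:
data = [
    OrderedDict(
        truth={"energy": 1.2, "zenith": 0.7},                          # 1 row
        pulses={"dom_x": [0.1, 0.2, 0.3], "charge": [1.0, 0.5, 2.0]},  # 3 rows
    ),
    OrderedDict(
        truth={"energy": 4.5, "zenith": 1.1},
        pulses={"dom_x": [0.4], "charge": [0.9]},
    ),
]

# With event_nos [0, 1], `_assign_event_no(data)` returns one DataFrame per
# extractor, with `event_no` repeated once per row, e.g.:
#
#   result["pulses"]                      result["truth"]
#      dom_x  charge  event_no              energy  zenith  event_no
#   0    0.1     1.0         0           0     1.2     0.7         0
#   1    0.2     0.5         0           1     4.5     1.1         1
#   2    0.3     2.0         0
#   3    0.4     0.9         1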
+ index, output_files = pool._initargs # type: ignore + self._index += index.value + self._output_files.extend(list(sorted(output_files[:]))) - else: - save_strategy = "1:1" - - return save_strategy - - def _save_filenames(self, i3_files: List[str]) -> None: - """Save I3 file names in CSV format.""" - self.debug("Saving input file names to config CSV.") - config_dir = os.path.join(self._outdir, "config") - os.makedirs(config_dir, exist_ok=True) - df_i3_files = pd.DataFrame(data=i3_files, columns=["filename"]) - df_i3_files.to_csv(os.path.join(config_dir, "i3files.csv")) - - def _get_output_file(self, input_file: str) -> str: - assert isinstance(input_file, str) - basename = os.path.basename(input_file) - output_file = os.path.join( - self._outdir, - re.sub(r"\.i3\..*", "", basename) + "." + self.file_suffix, - ) - return output_file + @final + def merge_files(self, files: Optional[List[str]] = None) -> None: + """Merge converted files. - def _skip_frame(self, frame: "icetray.I3Frame") -> bool: - """Check the user defined filters. + `DataConverter` will call the `.merge_files` method in the + `GraphNeTWriter` module that it was instantiated with. - Returns: - bool: True if frame should be skipped, False otherwise. + Args: + files: Intermediate files to be merged. """ - if self._i3filters is None: - return False # No filters defined, so we keep the frame + if (files is None) & (len(self._output_files) > 0): + # If no input files are given, but output files from conversion + # is available. + files_to_merge = self._output_files + elif files is not None: + # Proceed to merge specified by user. + files_to_merge = files + else: + # Raise error + self.error( + "This DataConverter does not have output files set," + "and you must therefore specify argument `files`." + ) + assert files is not None - for filter in self._i3filters: - if not filter(frame): - return True # keep_frame call false, skip the frame. - return False # All filter keep_frame calls true, keep the frame. + # Merge files + self._save_method.merge_files( # type:ignore + files=files_to_merge, + output_dir=os.path.join(self._output_dir, "merged"), + ) diff --git a/src/graphnet/data/dataconverter_new.py b/src/graphnet/data/dataconverter_new.py deleted file mode 100644 index eb51495d1..000000000 --- a/src/graphnet/data/dataconverter_new.py +++ /dev/null @@ -1,279 +0,0 @@ -"""Contains `DataConverter`.""" -from typing import List, Union, OrderedDict, Dict, Tuple, Any, Optional, Type -from abc import abstractmethod, ABC - -from tqdm import tqdm -import numpy as np -from multiprocessing import Manager, Pool, Value -import multiprocessing.pool -from multiprocessing.sharedctypes import Synchronized -import pandas as pd -import os - -from graphnet.utilities.decorators import final -from graphnet.utilities.logging import Logger -from .readers import GraphNeTFileReader -from .writers import GraphNeTFileSaveMethod -from .extractors import Extractor -from .dataclasses import I3FileSet - - -def init_global_index(index: Synchronized, output_files: List[str]) -> None: - """Make `global_index` available to pool workers.""" - global global_index, global_output_files # type: ignore[name-defined] - global_index, global_output_files = (index, output_files) # type: ignore[name-defined] - - -class DataConverter(ABC, Logger): - """A finalized data conversion class in GraphNeT. - - `DataConverter` provides parallel processing of file conversion and - extraction from experiment-specific file formats to graphnet-supported data - formats. 
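Continuing the usage sketch above: `merge_files` can also be pointed at an explicit subset of the files produced by a conversion run, rather than at everything the instance wrote (the path pattern and the `.parquet` extension are illustrative):

from glob import glob

# Merge only part of the output produced by the conversion call above;
# the merged result is written to "<output_dir>/merged".
converter.merge_files(files=glob("/data/converted/subset_*.parquet"))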
This class also assigns event id's to training examples. - """ - - def __init__( - self, - file_reader: Type[GraphNeTFileReader], - save_method: Type[GraphNeTFileSaveMethod], - extractors: Union[Type[Extractor], List[Type[Extractor]]], - index_column: str = "event_no", - num_workers: int = 1, - ) -> None: - """Initialize `DataConverter`. - - Args: - file_reader: The method used for reading and applying `Extractors`. - save_method: The method used to save the interim data format to - a graphnet supported file format. - extractors: The `Extractor`(s) that will be applied to the input - files. - index_column: Name of the event id column added to the events. - Defaults to "event_no". - num_workers: The number of CPUs used for parallel processing. - Defaults to 1 (no multiprocessing). - """ - # Member Variable Assignment - self._file_reader = file_reader - self._save_method = save_method - self._num_workers = num_workers - self._index_column = index_column - self._index = 0 - self._output_files: List[str] = [] - - # Set Extractors. Will throw error if extractors are incompatible - # with reader. - self._file_reader.set_extractors(extractors) - - # Base class constructor - super().__init__(name=__name__, class_name=self.__class__.__name__) - - @final - def __call__( - self, input_dir: Union[str, List[str]], output_dir: str - ) -> None: - """Extract data from files in `input_dir` and save to disk. - - Args: - input_dir: A directory that contains the input files. - The directory will be searched recursively for files - matching the file extension. - output_dir: The directory to save the files to. Input folder - structure is not respected. - """ - # Set outdir - self._output_dir = output_dir - # Get the file reader to produce a list of input files - # in the directory - input_files = self._file_reader.find_files(path=input_dir) # type: ignore - self._launch_jobs(input_files=input_files) - - @final - def _launch_jobs( - self, - input_files: Union[List[str], List[I3FileSet]], - ) -> None: - """Multi Processing Logic. - - Spawns worker pool, - distributes the input files evenly across workers. - declare event_no as globally accessible variable across workers. - starts jobs. - - Will call process_file in parallel. - """ - # Get appropriate mapping function - map_fn, pool = self.get_map_function(nb_files=len(input_files)) - - # Iterate over files - for _ in map_fn( - self._process_file, - tqdm(input_files, unit="file(s)", colour="green"), - ): - self.debug("processing file.") - - self._update_shared_variables(pool) - - @final - def _process_file(self, file_path: Union[str, I3FileSet]) -> None: - """Process a single file. - - Calls file reader to recieve extracted output, event ids - is assigned to the extracted data and is handed to save method. - - This function is called in parallel. 
- """ - # Read and apply extractors - data = self._file_reader(file_path=file_path) - n_events = len(data) # type: ignore - - # Assign event_no's to each event in data and transform to pd.DataFrame - data = self._assign_event_no(data=data) - - # Create output file name - output_file_name = self._create_file_name(input_file_path=file_path) - - # Apply save method - self._save_method( - data=data, - file_name=output_file_name, - n_events=n_events, - output_dir=self._output_dir, - ) - - @final - def _create_file_name(self, input_file_path: Union[str, I3FileSet]) -> str: - """Convert input file path to an output file name.""" - if isinstance(input_file_path, I3FileSet): - input_file_path = input_file_path.i3_file - path_without_extension = os.path.splitext(input_file_path)[0] - base_file_name = path_without_extension.split("/")[-1] - return base_file_name # type: ignore - - @final - def _assign_event_no( - self, data: List[OrderedDict[str, Any]] - ) -> Dict[str, pd.DataFrame]: - - # Request event_no's for the entire file - event_nos = self._request_event_nos(n_ids=len(data)) - - # Dict holding pd.DataFrame's - dataframe_dict: Dict = {} - # Loop through events (again..) to assign event_nos - for k in range(len(data)): - for extractor_name in data[k].keys(): - n_rows = self._count_rows( - event_dict=data[k], extractor_name=extractor_name - ) - if n_rows > 0: - data[k][extractor_name][self._index_column] = np.repeat( - event_nos[k], n_rows - ).tolist() - df = pd.DataFrame( - data[k][extractor_name], - index=[0] if n_rows == 1 else None, - ) - if extractor_name in dataframe_dict.keys(): - dataframe_dict[extractor_name].append(df) - else: - dataframe_dict[extractor_name] = [df] - # Merge each list of dataframes - for key in dataframe_dict.keys(): - dataframe_dict[key] = pd.concat( - dataframe_dict[key], axis=0 - ).reset_index(drop=True) - return dataframe_dict - - @final - def _count_rows( - self, event_dict: OrderedDict[str, Any], extractor_name: str - ) -> int: - """Count number of rows that features from `extractor_name` have.""" - extractor_dict = event_dict[extractor_name] - - try: - # If all features in extractor_name have the same length - # this line of code will execute without error and result - # in an array with shape [num_features, n_rows_in_feature] - # unless the list is empty! - - shape = np.asarray(list(extractor_dict.values())).shape - if len(shape) > 1: - n_rows = shape[1] - else: - n_rows = 1 - except ValueError as e: - self.error( - f"Features from {extractor_name} ({extractor_dict.keys()}) have different lengths." - ) - raise e - return n_rows - - def _request_event_nos(self, n_ids: int) -> List[int]: - - # Get new, unique index and increment value - if self._num_workers > 1: - with global_index.get_lock(): # type: ignore[name-defined] - starting_index = global_index.value # type: ignore[name-defined] - event_nos = np.arange( - starting_index, starting_index + n_ids, 1 - ).tolist() - global_index.value += n_ids # type: ignore[name-defined] - else: - starting_index = self._index - event_nos = np.arange( - starting_index, starting_index + n_ids, 1 - ).tolist() - self._index += n_ids - - return event_nos - - @final - def get_map_function( - self, nb_files: int, unit: str = "file(s)" - ) -> Tuple[Any, Optional[multiprocessing.pool.Pool]]: - """Identify map function to use (pure python or multiprocess).""" - # Choose relevant map-function given the requested number of workers. 
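The uniqueness of event ids across worker processes rests on a single shared counter: the parent creates a `multiprocessing.Value`, the pool initializer publishes it as a module-level global in every worker, and each worker atomically reserves a contiguous block of ids per file. A stripped-down sketch of that pattern, independent of GraphNeT (all names invented):

from multiprocessing import Pool, Value

import numpy as np


def _init_worker(counter) -> None:
    """Publish the shared counter as a module-level global in each worker."""
    global _global_counter
    _global_counter = counter


def _reserve_ids(n_ids: int) -> list:
    """Atomically reserve `n_ids` consecutive ids from the shared counter."""
    with _global_counter.get_lock():
        start = _global_counter.value
        _global_counter.value += n_ids
    return np.arange(start, start + n_ids).tolist()


def _process(n_events: int) -> list:
    # Stand-in for `_process_file`: each "file" reserves one id per event.
    return _reserve_ids(n_events)


if __name__ == "__main__":
    counter = Value("i", 0)
    with Pool(2, initializer=_init_worker, initargs=(counter,)) as pool:
        blocks = list(pool.imap(_process, [2, 3, 1]))  # three "files"
    # e.g. [[0, 1], [2, 3, 4], [5]]; which ids land in which block depends on
    # scheduling, but no id is ever handed out twice.
    print(blocks)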
- n_workers = min(self._num_workers, nb_files) - if n_workers > 1: - self.info( - f"Starting pool of {n_workers} workers to process {nb_files} {unit}" - ) - - manager = Manager() - index = Value("i", 0) - output_files = manager.list() - - pool = Pool( - processes=n_workers, - initializer=init_global_index, - initargs=(index, output_files), - ) - map_fn = pool.imap - - else: - self.info( - f"Processing {nb_files} {unit} in main thread (not multiprocessing)" - ) - map_fn = map # type: ignore - pool = None - - return map_fn, pool - - @final - def _update_shared_variables( - self, pool: Optional[multiprocessing.pool.Pool] - ) -> None: - """Update `self._index` and `self._output_files`. - - If `pool` is set, it means that multiprocessing was used. In this case, - the worker processes will not have been able to write directly to - `self._index` and `self._output_files`, and we need to get them synced - up. - """ - if pool: - # Extract information from shared variables to member variables. - index, output_files = pool._initargs # type: ignore - self._index += index.value - self._output_files.extend(list(sorted(output_files[:]))) diff --git a/src/graphnet/data/extractors/__init__.py b/src/graphnet/data/extractors/__init__.py index ec0ecfe5e..c6f4f325e 100644 --- a/src/graphnet/data/extractors/__init__.py +++ b/src/graphnet/data/extractors/__init__.py @@ -1,21 +1,2 @@ -"""Collection of I3Extractors, extracting pure-python data from I3Frames.""" - -from .i3extractor import I3Extractor, I3ExtractorCollection -from .i3featureextractor import ( - I3FeatureExtractor, - I3FeatureExtractorIceCube86, - I3FeatureExtractorIceCubeDeepCore, - I3FeatureExtractorIceCubeUpgrade, - I3PulseNoiseTruthFlagIceCubeUpgrade, -) -from .i3truthextractor import I3TruthExtractor -from .i3retroextractor import I3RetroExtractor -from .i3splinempeextractor import I3SplineMPEICExtractor -from .i3particleextractor import I3ParticleExtractor -from .i3tumextractor import I3TUMExtractor -from .i3hybridrecoextractor import I3GalacticPlaneHybridRecoExtractor -from .i3genericextractor import I3GenericExtractor -from .i3pisaextractor import I3PISAExtractor -from .i3ntmuonlabelsextractor import I3NTMuonLabelExtractor -from .i3quesoextractor import I3QUESOExtractor +"""Module containing data-specific extractor modules.""" from .extractor import Extractor diff --git a/src/graphnet/data/extractors/extractor.py b/src/graphnet/data/extractors/extractor.py index 795d05cf1..b5e5ed37c 100644 --- a/src/graphnet/data/extractors/extractor.py +++ b/src/graphnet/data/extractors/extractor.py @@ -49,59 +49,3 @@ def __call__(self, frame: "icetray.I3Frame") -> dict: def name(self) -> str: """Get the name of the `I3Extractor` instance.""" return self._extractor_name - - -class I3Extractor(Extractor): - """Base class for extracting information from physics I3-frames. - - Contains functionality required to extract data from i3 files, used by - the IceCube Neutrino Observatory. - - All classes inheriting from `I3Extractor` should implement the `__call__` - method. - """ - - def __init__(self, extractor_name: str): - """Construct I3Extractor. - - Args: - extractor_name: Name of the `I3Extractor` instance. Used to keep track of the - provenance of different data, and to name tables to which this - data is saved. 
- """ - # Member variable(s) - self._i3_file: str = "" - self._gcd_file: str = "" - self._gcd_dict: Dict[int, Any] = {} - self._calibration: Optional["icetray.I3Frame.Calibration"] = None - - # Base class constructor - super().__init__(extractor_name=extractor_name) - - def set_gcd(self, gcd_file: str, i3_file: str) -> None: - """Load the geospatial information contained in the GCD-file.""" - # If no GCD file is provided, search the I3 file for frames containing - # geometry (G) and calibration (C) information. - gcd = dataio.I3File(gcd_file or i3_file) - - try: - g_frame = gcd.pop_frame(icetray.I3Frame.Geometry) - except RuntimeError: - self.error( - "No GCD file was provided and no G-frame was found. Exiting." - ) - raise - else: - self._gcd_dict = g_frame["I3Geometry"].omgeo - - try: - c_frame = gcd.pop_frame(icetray.I3Frame.Calibration) - except RuntimeError: - self.warning("No GCD file was provided and no C-frame was found.") - else: - self._calibration = c_frame["I3Calibration"] - - @abstractmethod - def __call__(self, frame: "icetray.I3Frame") -> dict: - """Extract information from frame.""" - pass diff --git a/src/graphnet/data/extractors/i3extractor.py b/src/graphnet/data/extractors/i3extractor.py deleted file mode 100644 index 90a982387..000000000 --- a/src/graphnet/data/extractors/i3extractor.py +++ /dev/null @@ -1,106 +0,0 @@ -"""Base I3Extractor class(es).""" - -from abc import ABC, abstractmethod -from typing import TYPE_CHECKING, Any, Dict, List, Optional - -from graphnet.utilities.imports import has_icecube_package -from graphnet.utilities.logging import Logger - -if has_icecube_package() or TYPE_CHECKING: - from icecube import icetray, dataio # pyright: reportMissingImports=false - - -class I3Extractor(ABC, Logger): - """Base class for extracting information from physics I3-frames. - - All classes inheriting from `I3Extractor` should implement the `__call__` - method, and can be applied directly on `icetray.I3Frame` objects to return - extracted, pure-python data. - """ - - def __init__(self, name: str): - """Construct I3Extractor. - - Args: - name: Name of the `I3Extractor` instance. Used to keep track of the - provenance of different data, and to name tables to which this - data is saved. - """ - # Member variable(s) - self._i3_file: str = "" - self._gcd_file: str = "" - self._gcd_dict: Dict[int, Any] = {} - self._calibration: Optional["icetray.I3Frame.Calibration"] = None - self._name: str = name - - # Base class constructor - super().__init__(name=__name__, class_name=self.__class__.__name__) - - def set_files(self, i3_file: str, gcd_file: str) -> None: - """Store references to the I3- and GCD-files being processed.""" - # @TODO: Is it necessary to set the `i3_file`? It is only used in one - # place in `I3TruthExtractor`, and there only in a way that might - # be solved another way. - self._i3_file = i3_file - self._gcd_file = gcd_file - self._load_gcd_data() - - def _load_gcd_data(self) -> None: - """Load the geospatial information contained in the GCD-file.""" - # If no GCD file is provided, search the I3 file for frames containing - # geometry (G) and calibration (C) information. - gcd_file = dataio.I3File(self._gcd_file or self._i3_file) - - try: - g_frame = gcd_file.pop_frame(icetray.I3Frame.Geometry) - except RuntimeError: - self.error( - "No GCD file was provided and no G-frame was found. Exiting." 
- ) - raise - else: - self._gcd_dict = g_frame["I3Geometry"].omgeo - - try: - c_frame = gcd_file.pop_frame(icetray.I3Frame.Calibration) - except RuntimeError: - self.warning("No GCD file was provided and no C-frame was found.") - else: - self._calibration = c_frame["I3Calibration"] - - @abstractmethod - def __call__(self, frame: "icetray.I3Frame") -> dict: - """Extract information from frame.""" - pass - - @property - def name(self) -> str: - """Get the name of the `I3Extractor` instance.""" - return self._name - - -class I3ExtractorCollection(list): - """Class to manage multiple I3Extractors.""" - - def __init__(self, *extractors: I3Extractor): - """Construct I3ExtractorCollection. - - Args: - *extractors: List of `I3Extractor`s to be treated as a single - collection. - """ - # Check(s) - for extractor in extractors: - assert isinstance(extractor, I3Extractor) - - # Base class constructor - super().__init__(extractors) - - def set_files(self, i3_file: str, gcd_file: str) -> None: - """Store references to the I3- and GCD-files being processed.""" - for extractor in self: - extractor.set_files(i3_file, gcd_file) - - def __call__(self, frame: "icetray.I3Frame") -> List[dict]: - """Extract information from frame for each member `I3Extractor`.""" - return [extractor(frame) for extractor in self] diff --git a/src/graphnet/data/extractors/i3particleextractor.py b/src/graphnet/data/extractors/i3particleextractor.py deleted file mode 100644 index bd37424d2..000000000 --- a/src/graphnet/data/extractors/i3particleextractor.py +++ /dev/null @@ -1,43 +0,0 @@ -"""I3Extractor class(es) for extracting I3Particle properties.""" - -from typing import TYPE_CHECKING, Dict - -from graphnet.data.extractors.i3extractor import I3Extractor - -if TYPE_CHECKING: - from icecube import icetray # pyright: reportMissingImports=false - - -class I3ParticleExtractor(I3Extractor): - """Class for extracting I3Particle properties. - - Can be used to extract predictions from other algorithms for comparisons - with GraphNeT. 
- """ - - def __init__(self, name: str): - """Construct I3ParticleExtractor.""" - # Base class constructor - super().__init__(name) - - def __call__(self, frame: "icetray.I3Frame") -> Dict[str, float]: - """Extract I3Particle properties from I3Particle in frame.""" - output = {} - if self._name in frame: - output.update( - { - "zenith_" + self._name: frame[self._name].dir.zenith, - "azimuth_" + self._name: frame[self._name].dir.azimuth, - "dir_x_" + self._name: frame[self._name].dir.x, - "dir_y_" + self._name: frame[self._name].dir.y, - "dir_z_" + self._name: frame[self._name].dir.z, - "pos_x_" + self._name: frame[self._name].pos.x, - "pos_y_" + self._name: frame[self._name].pos.y, - "pos_z_" + self._name: frame[self._name].pos.z, - "time_" + self._name: frame[self._name].time, - "speed_" + self._name: frame[self._name].speed, - "energy_" + self._name: frame[self._name].energy, - } - ) - - return output diff --git a/src/graphnet/data/extractors/icecube/__init__.py b/src/graphnet/data/extractors/icecube/__init__.py new file mode 100644 index 000000000..11befe581 --- /dev/null +++ b/src/graphnet/data/extractors/icecube/__init__.py @@ -0,0 +1,20 @@ +"""Collection of I3Extractors, extracting pure-python data from I3Frames.""" + +from .i3extractor import I3Extractor +from .i3featureextractor import ( + I3FeatureExtractor, + I3FeatureExtractorIceCube86, + I3FeatureExtractorIceCubeDeepCore, + I3FeatureExtractorIceCubeUpgrade, + I3PulseNoiseTruthFlagIceCubeUpgrade, +) +from .i3truthextractor import I3TruthExtractor +from .i3retroextractor import I3RetroExtractor +from .i3splinempeextractor import I3SplineMPEICExtractor +from .i3particleextractor import I3ParticleExtractor +from .i3tumextractor import I3TUMExtractor +from .i3hybridrecoextractor import I3GalacticPlaneHybridRecoExtractor +from .i3genericextractor import I3GenericExtractor +from .i3pisaextractor import I3PISAExtractor +from .i3ntmuonlabelsextractor import I3NTMuonLabelExtractor +from .i3quesoextractor import I3QUESOExtractor diff --git a/src/graphnet/data/extractors/icecube/i3extractor.py b/src/graphnet/data/extractors/icecube/i3extractor.py new file mode 100644 index 000000000..d997efcc4 --- /dev/null +++ b/src/graphnet/data/extractors/icecube/i3extractor.py @@ -0,0 +1,66 @@ +"""Base I3Extractor class(es).""" + +from abc import abstractmethod +from typing import TYPE_CHECKING, Any, Dict, Optional + +from graphnet.utilities.imports import has_icecube_package +from graphnet.data.extractors import Extractor + +if has_icecube_package() or TYPE_CHECKING: + from icecube import icetray, dataio # pyright: reportMissingImports=false + + +class I3Extractor(Extractor): + """Base class for extracting information from physics I3-frames. + + Contains functionality required to extract data from i3 files, used by + the IceCube Neutrino Observatory. + + All classes inheriting from `I3Extractor` should implement the `__call__` + method. + """ + + def __init__(self, extractor_name: str): + """Construct I3Extractor. + + Args: + extractor_name: Name of the `I3Extractor` instance. Used to keep track of the + provenance of different data, and to name tables to which this + data is saved. 
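With the relocated base class, writing a new IceCube extractor only requires subclassing `I3Extractor` and implementing `__call__`. A minimal, hedged sketch (the frame key "MyReco" and the extractor name are invented):

from typing import TYPE_CHECKING, Dict

from graphnet.data.extractors.icecube import I3Extractor

if TYPE_CHECKING:
    from icecube import icetray  # pyright: reportMissingImports=false


class MyRecoExtractor(I3Extractor):
    """Toy extractor reading a single reconstructed quantity per frame."""

    def __init__(self, extractor_name: str = "my_reco"):
        # `extractor_name` also names the table the extracted data is saved to.
        super().__init__(extractor_name=extractor_name)

    def __call__(self, frame: "icetray.I3Frame") -> Dict[str, float]:
        """Extract the energy of the invented "MyReco" particle, if present."""
        output: Dict[str, float] = {}
        if "MyReco" in frame:
            output["my_reco_energy"] = frame["MyReco"].energy
        return output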
+ """ + # Member variable(s) + self._i3_file: str = "" + self._gcd_file: str = "" + self._gcd_dict: Dict[int, Any] = {} + self._calibration: Optional["icetray.I3Frame.Calibration"] = None + + # Base class constructor + super().__init__(extractor_name=extractor_name) + + def set_gcd(self, gcd_file: str, i3_file: str) -> None: + """Load the geospatial information contained in the GCD-file.""" + # If no GCD file is provided, search the I3 file for frames containing + # geometry (G) and calibration (C) information. + gcd = dataio.I3File(gcd_file or i3_file) + + try: + g_frame = gcd.pop_frame(icetray.I3Frame.Geometry) + except RuntimeError: + self.error( + "No GCD file was provided and no G-frame was found. Exiting." + ) + raise + else: + self._gcd_dict = g_frame["I3Geometry"].omgeo + + try: + c_frame = gcd.pop_frame(icetray.I3Frame.Calibration) + except RuntimeError: + self.warning("No GCD file was provided and no C-frame was found.") + else: + self._calibration = c_frame["I3Calibration"] + + @abstractmethod + def __call__(self, frame: "icetray.I3Frame") -> dict: + """Extract information from frame.""" + pass diff --git a/src/graphnet/data/extractors/i3featureextractor.py b/src/graphnet/data/extractors/icecube/i3featureextractor.py similarity index 97% rename from src/graphnet/data/extractors/i3featureextractor.py rename to src/graphnet/data/extractors/icecube/i3featureextractor.py index f351f0f3a..258bb368c 100644 --- a/src/graphnet/data/extractors/i3featureextractor.py +++ b/src/graphnet/data/extractors/icecube/i3featureextractor.py @@ -1,17 +1,14 @@ """I3Extractor class(es) for extracting specific, reconstructed features.""" from typing import TYPE_CHECKING, Any, Dict, List -from graphnet.data.extractors.extractor import I3Extractor -from graphnet.data.extractors.utilities.frames import ( +from .i3extractor import I3Extractor +from graphnet.data.extractors.icecube.utilities.frames import ( get_om_keys_and_pulseseries, ) from graphnet.utilities.imports import has_icecube_package if has_icecube_package() or TYPE_CHECKING: - from icecube import ( - icetray, - dataclasses, - ) # pyright: reportMissingImports=false + from icecube import icetray # pyright: reportMissingImports=false class I3FeatureExtractor(I3Extractor): diff --git a/src/graphnet/data/extractors/i3genericextractor.py b/src/graphnet/data/extractors/icecube/i3genericextractor.py similarity index 98% rename from src/graphnet/data/extractors/i3genericextractor.py rename to src/graphnet/data/extractors/icecube/i3genericextractor.py index 6a86303e7..e907181d0 100644 --- a/src/graphnet/data/extractors/i3genericextractor.py +++ b/src/graphnet/data/extractors/icecube/i3genericextractor.py @@ -2,12 +2,12 @@ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union -from graphnet.data.extractors.i3extractor import I3Extractor -from graphnet.data.extractors.utilities.types import ( +from graphnet.data.extractors.icecube import I3Extractor +from graphnet.data.extractors.icecube.utilities.types import ( cast_object_to_pure_python, cast_pulse_series_to_pure_python, ) -from graphnet.data.extractors.utilities.collections import ( +from graphnet.data.extractors.icecube.utilities.collections import ( transpose_list_of_dicts, serialise, flatten_nested_dictionary, diff --git a/src/graphnet/data/extractors/i3hybridrecoextractor.py b/src/graphnet/data/extractors/icecube/i3hybridrecoextractor.py similarity index 96% rename from src/graphnet/data/extractors/i3hybridrecoextractor.py rename to 
src/graphnet/data/extractors/icecube/i3hybridrecoextractor.py index 74f445120..90525bcab 100644 --- a/src/graphnet/data/extractors/i3hybridrecoextractor.py +++ b/src/graphnet/data/extractors/icecube/i3hybridrecoextractor.py @@ -2,7 +2,7 @@ from typing import TYPE_CHECKING, Any, Dict -from graphnet.data.extractors.i3extractor import I3Extractor +from graphnet.data.extractors.icecube import I3Extractor if TYPE_CHECKING: from icecube import icetray # pyright: reportMissingImports=false diff --git a/src/graphnet/data/extractors/i3ntmuonlabelsextractor.py b/src/graphnet/data/extractors/icecube/i3ntmuonlabelsextractor.py similarity index 96% rename from src/graphnet/data/extractors/i3ntmuonlabelsextractor.py rename to src/graphnet/data/extractors/icecube/i3ntmuonlabelsextractor.py index 1ca3e8bcb..039b13cfe 100644 --- a/src/graphnet/data/extractors/i3ntmuonlabelsextractor.py +++ b/src/graphnet/data/extractors/icecube/i3ntmuonlabelsextractor.py @@ -2,7 +2,7 @@ from typing import TYPE_CHECKING, Dict -from graphnet.data.extractors.i3extractor import I3Extractor +from graphnet.data.extractors.icecube.i3extractor import I3Extractor if TYPE_CHECKING: from icecube import icetray # pyright: reportMissingImports=false diff --git a/src/graphnet/data/extractors/icecube/i3particleextractor.py b/src/graphnet/data/extractors/icecube/i3particleextractor.py new file mode 100644 index 000000000..a50c11d21 --- /dev/null +++ b/src/graphnet/data/extractors/icecube/i3particleextractor.py @@ -0,0 +1,44 @@ +"""I3Extractor class(es) for extracting I3Particle properties.""" + +from typing import TYPE_CHECKING, Dict + +from graphnet.data.extractors.icecube import I3Extractor + +if TYPE_CHECKING: + from icecube import icetray # pyright: reportMissingImports=false + + +class I3ParticleExtractor(I3Extractor): + """Class for extracting I3Particle properties. + + Can be used to extract predictions from other algorithms for comparisons + with GraphNeT. 
+ """ + + def __init__(self, extractor_name: str): + """Construct I3ParticleExtractor.""" + # Base class constructor + super().__init__(extractor_name=extractor_name) + + def __call__(self, frame: "icetray.I3Frame") -> Dict[str, float]: + """Extract I3Particle properties from I3Particle in frame.""" + output = {} + name = self._extractor_name + if name in frame: + output.update( + { + "zenith_" + name: frame[name].dir.zenith, + "azimuth_" + name: frame[name].dir.azimuth, + "dir_x_" + name: frame[name].dir.x, + "dir_y_" + name: frame[name].dir.y, + "dir_z_" + name: frame[name].dir.z, + "pos_x_" + name: frame[name].pos.x, + "pos_y_" + name: frame[name].pos.y, + "pos_z_" + name: frame[name].pos.z, + "time_" + name: frame[name].time, + "speed_" + name: frame[name].speed, + "energy_" + name: frame[name].energy, + } + ) + + return output diff --git a/src/graphnet/data/extractors/i3pisaextractor.py b/src/graphnet/data/extractors/icecube/i3pisaextractor.py similarity index 94% rename from src/graphnet/data/extractors/i3pisaextractor.py rename to src/graphnet/data/extractors/icecube/i3pisaextractor.py index fd5a09583..f14a8046a 100644 --- a/src/graphnet/data/extractors/i3pisaextractor.py +++ b/src/graphnet/data/extractors/icecube/i3pisaextractor.py @@ -2,7 +2,7 @@ from typing import TYPE_CHECKING, Any, Dict -from graphnet.data.extractors.i3extractor import I3Extractor +from graphnet.data.extractors.icecube.i3extractor import I3Extractor if TYPE_CHECKING: from icecube import icetray # pyright: reportMissingImports=false diff --git a/src/graphnet/data/extractors/i3quesoextractor.py b/src/graphnet/data/extractors/icecube/i3quesoextractor.py similarity index 94% rename from src/graphnet/data/extractors/i3quesoextractor.py rename to src/graphnet/data/extractors/icecube/i3quesoextractor.py index b72b20046..e29c72a41 100644 --- a/src/graphnet/data/extractors/i3quesoextractor.py +++ b/src/graphnet/data/extractors/icecube/i3quesoextractor.py @@ -2,7 +2,7 @@ from typing import TYPE_CHECKING, Dict -from graphnet.data.extractors.i3extractor import I3Extractor +from graphnet.data.extractors.icecube.i3extractor import I3Extractor if TYPE_CHECKING: from icecube import icetray # pyright: reportMissingImports=false diff --git a/src/graphnet/data/extractors/i3retroextractor.py b/src/graphnet/data/extractors/icecube/i3retroextractor.py similarity index 97% rename from src/graphnet/data/extractors/i3retroextractor.py rename to src/graphnet/data/extractors/icecube/i3retroextractor.py index cd55d01f4..aaeb773b4 100644 --- a/src/graphnet/data/extractors/i3retroextractor.py +++ b/src/graphnet/data/extractors/icecube/i3retroextractor.py @@ -2,8 +2,8 @@ from typing import TYPE_CHECKING, Any, Dict -from graphnet.data.extractors.i3extractor import I3Extractor -from graphnet.data.extractors.utilities.frames import ( +from graphnet.data.extractors.icecube import I3Extractor +from graphnet.data.extractors.icecube.utilities.frames import ( frame_is_montecarlo, frame_is_noise, ) diff --git a/src/graphnet/data/extractors/i3splinempeextractor.py b/src/graphnet/data/extractors/icecube/i3splinempeextractor.py similarity index 93% rename from src/graphnet/data/extractors/i3splinempeextractor.py rename to src/graphnet/data/extractors/icecube/i3splinempeextractor.py index e47b2e71d..1439ada51 100644 --- a/src/graphnet/data/extractors/i3splinempeextractor.py +++ b/src/graphnet/data/extractors/icecube/i3splinempeextractor.py @@ -2,7 +2,7 @@ from typing import TYPE_CHECKING, Dict -from graphnet.data.extractors.i3extractor import I3Extractor 
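As used above, the extractor name doubles as the frame key to read from and as the suffix on every output column. A hedged illustration (the reconstruction name "OnlineL2_SplineMPE" is a placeholder for any I3Particle-valued key present in the frames):

from graphnet.data.extractors.icecube import I3ParticleExtractor

# Extract an existing I3Particle-valued reconstruction by its frame key.
extractor = I3ParticleExtractor(extractor_name="OnlineL2_SplineMPE")

# Applied to a physics frame containing that key, the call returns a flat
# dictionary along the lines of:
# {
#     "zenith_OnlineL2_SplineMPE": 1.23,
#     "azimuth_OnlineL2_SplineMPE": 4.56,
#     ...
#     "energy_OnlineL2_SplineMPE": 1.0e4,
# }
# i.e. one column per property, suffixed with the extractor name.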
+from graphnet.data.extractors.icecube import I3Extractor if TYPE_CHECKING: from icecube import icetray # pyright: reportMissingImports=false diff --git a/src/graphnet/data/extractors/i3truthextractor.py b/src/graphnet/data/extractors/icecube/i3truthextractor.py similarity index 99% rename from src/graphnet/data/extractors/i3truthextractor.py rename to src/graphnet/data/extractors/icecube/i3truthextractor.py index d04be69b2..b715e57ab 100644 --- a/src/graphnet/data/extractors/i3truthextractor.py +++ b/src/graphnet/data/extractors/icecube/i3truthextractor.py @@ -4,8 +4,8 @@ import matplotlib.path as mpath from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple -from graphnet.data.extractors.extractor import I3Extractor -from graphnet.data.extractors.utilities.frames import ( +from .i3extractor import I3Extractor +from .utilities.frames import ( frame_is_montecarlo, frame_is_noise, ) diff --git a/src/graphnet/data/extractors/i3tumextractor.py b/src/graphnet/data/extractors/icecube/i3tumextractor.py similarity index 94% rename from src/graphnet/data/extractors/i3tumextractor.py rename to src/graphnet/data/extractors/icecube/i3tumextractor.py index 38cbca146..685b0a78e 100644 --- a/src/graphnet/data/extractors/i3tumextractor.py +++ b/src/graphnet/data/extractors/icecube/i3tumextractor.py @@ -2,7 +2,7 @@ from typing import TYPE_CHECKING, Dict -from graphnet.data.extractors.i3extractor import I3Extractor +from graphnet.data.extractors.icecube import I3Extractor if TYPE_CHECKING: from icecube import icetray # pyright: reportMissingImports=false diff --git a/src/graphnet/data/extractors/utilities/__init__.py b/src/graphnet/data/extractors/icecube/utilities/__init__.py similarity index 100% rename from src/graphnet/data/extractors/utilities/__init__.py rename to src/graphnet/data/extractors/icecube/utilities/__init__.py diff --git a/src/graphnet/data/extractors/utilities/collections.py b/src/graphnet/data/extractors/icecube/utilities/collections.py similarity index 100% rename from src/graphnet/data/extractors/utilities/collections.py rename to src/graphnet/data/extractors/icecube/utilities/collections.py diff --git a/src/graphnet/data/extractors/utilities/frames.py b/src/graphnet/data/extractors/icecube/utilities/frames.py similarity index 100% rename from src/graphnet/data/extractors/utilities/frames.py rename to src/graphnet/data/extractors/icecube/utilities/frames.py diff --git a/src/graphnet/data/filters.py b/src/graphnet/data/extractors/icecube/utilities/i3_filters.py similarity index 100% rename from src/graphnet/data/filters.py rename to src/graphnet/data/extractors/icecube/utilities/i3_filters.py diff --git a/src/graphnet/data/extractors/utilities/types.py b/src/graphnet/data/extractors/icecube/utilities/types.py similarity index 98% rename from src/graphnet/data/extractors/utilities/types.py rename to src/graphnet/data/extractors/icecube/utilities/types.py index cf58e8357..32ecae0ff 100644 --- a/src/graphnet/data/extractors/utilities/types.py +++ b/src/graphnet/data/extractors/icecube/utilities/types.py @@ -4,11 +4,11 @@ import inspect from typing import Any, Callable, Dict, List, Optional, Tuple, Union -from graphnet.data.extractors.utilities.collections import ( +from graphnet.data.extractors.icecube.utilities.collections import ( transpose_list_of_dicts, flatten_nested_dictionary, ) -from graphnet.data.extractors.utilities.frames import ( +from graphnet.data.extractors.icecube.utilities.frames import ( get_om_keys_and_pulseseries, ) from graphnet.utilities.imports import 
has_icecube_package diff --git a/src/graphnet/data/parquet/__init__.py b/src/graphnet/data/parquet/__init__.py deleted file mode 100644 index 616d89c16..000000000 --- a/src/graphnet/data/parquet/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -"""Parquet-specific implementation of data classes.""" -from .parquet_dataconverter import ParquetDataConverter diff --git a/src/graphnet/data/parquet/parquet_dataconverter.py b/src/graphnet/data/parquet/parquet_dataconverter.py deleted file mode 100644 index 68531c8e2..000000000 --- a/src/graphnet/data/parquet/parquet_dataconverter.py +++ /dev/null @@ -1,52 +0,0 @@ -"""DataConverter for the Parquet backend.""" - -from collections import OrderedDict -import os -from typing import List, Optional - -import awkward - -from graphnet.data.dataconverter import DataConverter # type: ignore[attr-defined] - - -class ParquetDataConverter(DataConverter): - """Class for converting I3-files to Parquet format.""" - - # Class variables - file_suffix: str = "parquet" - - # Abstract method implementation(s) - def save_data(self, data: List[OrderedDict], output_file: str) -> None: - """Save data to parquet file.""" - # Check(s) - if os.path.exists(output_file): - self.warning( - f"Output file {output_file} already exists. Overwriting." - ) - - self.debug(f"Saving to {output_file}") - self.debug( - f"- Data has {len(data)} events and {len(data[0])} tables for each" - ) - - awkward.to_parquet(awkward.from_iter(data), output_file) - - self.debug("- Done saving") - self._output_files.append(output_file) - - def merge_files( - self, output_file: str, input_files: Optional[List[str]] = None - ) -> None: - """Parquet-specific method for merging output files. - - Args: - output_file: Name of the output file containing the merged results. - input_files: Intermediate files to be merged, according to the - specific implementation. Default to None, meaning that all - files output by the current instance are merged. - - Raises: - NotImplementedError: If the method has not been implemented for the - Parquet backend. - """ - raise NotImplementedError() diff --git a/src/graphnet/data/pipeline.py b/src/graphnet/data/pipeline.py index d97415bb0..9973c763f 100644 --- a/src/graphnet/data/pipeline.py +++ b/src/graphnet/data/pipeline.py @@ -13,7 +13,9 @@ import torch from torch.utils.data import DataLoader -from graphnet.data.sqlite.sqlite_utilities import create_table_and_save_to_sql +from graphnet.data.utilities.sqlite_utilities import ( + create_table_and_save_to_sql, +) from graphnet.training.utils import get_predictions, make_dataloader from graphnet.models.graphs import GraphDefinition diff --git a/src/graphnet/data/readers/__init__.py b/src/graphnet/data/readers/__init__.py new file mode 100644 index 000000000..0755bd35a --- /dev/null +++ b/src/graphnet/data/readers/__init__.py @@ -0,0 +1,3 @@ +"""Modules for reading experiment-specific data and applying Extractors.""" +from .graphnet_file_reader import GraphNeTFileReader +from .i3reader import I3Reader diff --git a/src/graphnet/data/readers/graphnet_file_reader.py b/src/graphnet/data/readers/graphnet_file_reader.py new file mode 100644 index 000000000..ab6464e13 --- /dev/null +++ b/src/graphnet/data/readers/graphnet_file_reader.py @@ -0,0 +1,132 @@ +"""Module containing different FileReader classes in GraphNeT. + +These methods are used to open and apply `Extractors` to experiment-specific +file formats. 
+""" + +from typing import List, Union, OrderedDict +from abc import abstractmethod, ABC +import glob +import os + +from graphnet.utilities.decorators import final +from graphnet.utilities.logging import Logger +from graphnet.data.dataclasses import I3FileSet +from graphnet.data.extractors.extractor import Extractor + + +class GraphNeTFileReader(Logger, ABC): + """A generic base class for FileReaders in GraphNeT. + + Classes inheriting from `GraphNeTFileReader` must implement a + `__call__` method that opens a file, applies `Extractor`(s) and returns + a list of ordered dictionaries. + + In addition, Classes inheriting from `GraphNeTFileReader` must set + class properties `accepted_file_extensions` and `accepted_extractors`. + """ + + @abstractmethod + def __call__(self, file_path: str) -> List[OrderedDict]: + """Open and apply extractors to a single file. + + The `output` must be a list of dictionaries, where the number of events + in the file `n_events` satisfies `len(output) = n_events`. I.e each + element in the list is a dictionary, and each field in the dictionary + is the output of a single extractor. + """ + + @property + def accepted_file_extensions(self) -> List[str]: + """Return list of accepted file extensions.""" + return self._accepted_file_extensions # type: ignore + + @property + def accepted_extractors(self) -> List[Extractor]: + """Return list of compatible `Extractor`(s).""" + return self._accepted_extractors # type: ignore + + @property + def extracor_names(self) -> List[str]: + """Return list of table names produced by extractors.""" + return [extractor.name for extractor in self._extractors] # type: ignore + + def find_files( + self, path: Union[str, List[str]] + ) -> Union[List[str], List[I3FileSet]]: + """Search directory for input files recursively. + + This method may be overwritten by custom implementations. + + Args: + path: path to directory. + + Returns: + List of files matching accepted file extensions. + """ + if isinstance(path, str): + path = [path] + files = [] + for dir in path: + for accepted_file_extension in self.accepted_file_extensions: + files.extend(glob.glob(dir + f"/*{accepted_file_extension}")) + + # Check that files are OK. + self.validate_files(files) + return files + + @final + def set_extractors(self, extractors: List[Extractor]) -> None: + """Set `Extractor`(s) as member variable. + + Args: + extractors: A list of `Extractor`(s) to set as member variable. + """ + self._validate_extractors(extractors) + self._extractors = extractors + + @final + def _validate_extractors(self, extractors: List[Extractor]) -> None: + for extractor in extractors: + try: + assert isinstance(extractor, tuple(self.accepted_extractors)) # type: ignore + except AssertionError as e: + self.error( + f"{extractor.__class__.__name__}" + f" is not supported by {self.__class__.__name__}" + ) + raise e + + @final + def validate_files( + self, input_files: Union[List[str], List[I3FileSet]] + ) -> None: + """Check that the input files are accepted by the reader. + + Args: + input_files: Path(s) to input file(s). + """ + for input_file in input_files: + # Handle filepath vs. FileSet cases + if isinstance(input_file, I3FileSet): + self._validate_file(input_file.i3_file) + self._validate_file(input_file.gcd_file) + else: + self._validate_file(input_file) + + @final + def _validate_file(self, file: str) -> None: + """Validate a single file path. + + Args: + file: path to file. 
+ + Returns: + None + """ + try: + assert file.lower().endswith(tuple(self.accepted_file_extensions)) + except AssertionError: + self.error( + f'{self.__class__.__name__} accepts {self.accepted_file_extensions} but {file.split("/")[-1]} has extension {os.path.splitext(file)[1]}.' + ) diff --git a/src/graphnet/data/readers.py b/src/graphnet/data/readers/i3reader.py similarity index 51% rename from src/graphnet/data/readers.py rename to src/graphnet/data/readers/i3reader.py index 6dd9bd63d..926c2395a 100644 --- a/src/graphnet/data/readers.py +++ b/src/graphnet/data/readers/i3reader.py @@ -1,148 +1,22 @@ -"""Module containing different FileReader classes in GraphNeT. - -These methods are used to open and apply `Extractors` to experiment-specific -file formats. -""" +"""Module containing different I3Reader.""" from typing import List, Union, OrderedDict, Type -from abc import abstractmethod, ABC -import glob -import os -from graphnet.utilities.decorators import final -from graphnet.utilities.logging import Logger from graphnet.utilities.imports import has_icecube_package -from graphnet.data.filters import I3Filter, NullSplitI3Filter - -from .dataclasses import I3FileSet - -from .extractors.extractor import ( - Extractor, - I3Extractor, -) # , I3GenericExtractor +from graphnet.data.extractors.icecube.utilities.i3_filters import ( + I3Filter, + NullSplitI3Filter, +) +from graphnet.data.extractors.icecube import I3Extractor +from graphnet.data.dataclasses import I3FileSet from graphnet.utilities.filesys import find_i3_files +from .graphnet_file_reader import GraphNeTFileReader + if has_icecube_package(): from icecube import icetray, dataio # pyright: reportMissingImports=false -class GraphNeTFileReader(Logger, ABC): - """A generic base class for FileReaders in GraphNeT. - - Classes inheriting from `GraphNeTFileReader` must implement a - `__call__` method that opens a file, applies `Extractor`(s) and returns - a list of ordered dictionaries. - - In addition, Classes inheriting from `GraphNeTFileReader` must set - class properties `accepted_file_extensions` and `accepted_extractors`. - """ - - @abstractmethod - def __call__(self, file_path: str) -> List[OrderedDict]: - """Open and apply extractors to a single file. - - The `output` must be a list of dictionaries, where the number of events - in the file `n_events` satisfies `len(output) = n_events`. I.e each - element in the list is a dictionary, and each field in the dictionary - is the output of a single extractor. - """ - - @property - def accepted_file_extensions(self) -> List[str]: - """Return list of accepted file extensions.""" - return self._accepted_file_extensions # type: ignore - - @property - def accepted_extractors(self) -> List[Extractor]: - """Return list of compatible `Extractor`(s).""" - return self._accepted_extractors # type: ignore - - @property - def extracor_names(self) -> List[str]: - """Return list of table names produced by extractors.""" - return [extractor.name for extractor in self._extractors] # type: ignore - - def find_files( - self, path: Union[str, List[str]] - ) -> Union[List[str], List[I3FileSet]]: - """Search directory for input files recursively. - - This method may be overwritten by custom implementations. - - Args: - path: path to directory. - - Returns: - List of files matching accepted file extensions. 
- """ - if isinstance(path, str): - path = [path] - files = [] - for dir in path: - for accepted_file_extension in self.accepted_file_extensions: - files.extend(glob.glob(dir + f"/*{accepted_file_extension}")) - - # Check that files are OK. - self.validate_files(files) - return files - - @final - def set_extractors(self, extractors: List[Extractor]) -> None: - """Set `Extractor`(s) as member variable. - - Args: - extractors: A list of `Extractor`(s) to set as member variable. - """ - self._validate_extractors(extractors) - self._extractors = extractors - - @final - def _validate_extractors(self, extractors: List[Extractor]) -> None: - for extractor in extractors: - try: - assert isinstance(extractor, tuple(self.accepted_extractors)) # type: ignore - except AssertionError as e: - self.error( - f"{extractor.__class__.__name__}" - f" is not supported by {self.__class__.__name__}" - ) - raise e - - @final - def validate_files( - self, input_files: Union[List[str], List[I3FileSet]] - ) -> None: - """Check that the input files are accepted by the reader. - - Args: - input_files: Path(s) to input file(s). - """ - for input_file in input_files: - # Handle filepath vs. FileSet cases - if isinstance(input_file, I3FileSet): - self._validate_file(input_file.i3_file) - self._validate_file(input_file.gcd_file) - else: - self._validate_file(input_file) - - @final - def _validate_file(self, file: str) -> None: - """Validate a single file path. - - Args: - file: path to file. - - Returns: - None - """ - try: - assert file.lower().endswith(tuple(self.accepted_file_extensions)) - except AssertionError: - self.error( - f'{self.__class__.__name__} accepts {self.accepted_file_extensions} but {file.split("/")[-1]} has extension {os.path.splitext(file)[1]}.' - ) - - class I3Reader(GraphNeTFileReader): """A class for reading .i3 files from the IceCube Neutrino Observatory. diff --git a/src/graphnet/data/sqlite/__init__.py b/src/graphnet/data/sqlite/__init__.py deleted file mode 100644 index e4ac554a7..000000000 --- a/src/graphnet/data/sqlite/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -"""SQLite-specific implementation of data classes.""" -from .sqlite_dataconverter import SQLiteDataConverter -from .sqlite_utilities import create_table_and_save_to_sql -from .sqlite_utilities import run_sql_code, save_to_sql diff --git a/src/graphnet/data/sqlite/sqlite_dataconverter.py b/src/graphnet/data/sqlite/sqlite_dataconverter.py deleted file mode 100644 index 1750b7a33..000000000 --- a/src/graphnet/data/sqlite/sqlite_dataconverter.py +++ /dev/null @@ -1,349 +0,0 @@ -"""DataConverter for the SQLite backend.""" - -from collections import OrderedDict -import os -from typing import Any, Dict, List, Optional, Tuple, Union - -import pandas as pd -import sqlalchemy -import sqlite3 -from tqdm import tqdm - -from graphnet.data.dataconverter import DataConverter # type: ignore[attr-defined] -from graphnet.data.sqlite.sqlite_utilities import ( - create_table, - create_table_and_save_to_sql, -) - - -class SQLiteDataConverter(DataConverter): - """Class for converting I3-file(s) to SQLite format.""" - - # Class variables - file_suffix = "db" - - # Abstract method implementation(s) - def save_data(self, data: List[OrderedDict], output_file: str) -> None: - """Save data to SQLite database.""" - # Check(s) - if os.path.exists(output_file): - self.warning( - f"Output file {output_file} already exists. Appending." - ) - - # Concatenate data - if len(data) == 0: - self.warning( - "No data was extracted from the processed I3 file(s). 
" - f"No data saved to {output_file}" - ) - return - - saved_any = False - dataframe_list: OrderedDict = OrderedDict( - [(key, []) for key in data[0]] - ) - for data_dict in data: - for key, data_values in data_dict.items(): - df = construct_dataframe(data_values) - - if self.any_pulsemap_is_non_empty(data_dict) and len(df) > 0: - # only include data_dict in temp. databases if at least one pulsemap is non-empty, - # and the current extractor (df) is also non-empty (also since truth is always non-empty) - dataframe_list[key].append(df) - - dataframe = OrderedDict( - [ - ( - key, - pd.concat(dfs, ignore_index=True, sort=True) - if dfs - else pd.DataFrame(), - ) - for key, dfs in dataframe_list.items() - ] - ) - # Can delete dataframe_list here to free up memory. - - # Save each dataframe to SQLite database - self.debug(f"Saving to {output_file}") - for table, df in dataframe.items(): - if len(df) > 0: - create_table_and_save_to_sql( - df, - table, - output_file, - default_type="FLOAT", - integer_primary_key=not ( - is_pulse_map(table) or is_mc_tree(table) - ), - ) - saved_any = True - - if saved_any: - self.debug("- Done saving") - else: - self.warning(f"No data saved to {output_file}") - - def merge_files( - self, - output_file: str, - input_files: Optional[List[str]] = None, - max_table_size: Optional[int] = None, - ) -> None: - """SQLite-specific method for merging output files/databases. - - Args: - output_file: Name of the output file containing the merged results. - input_files: Intermediate files/databases to be merged, according - to the specific implementation. Default to None, meaning that - all files/databases output by the current instance are merged. - max_table_size: The maximum number of rows in any given table. - If any one table exceed this limit, a new database will be - created. - """ - if max_table_size: - self.warning( - f"Merging got max_table_size of {max_table_size}. Will attempt to create databases with a maximum row count of this size." - ) - self.max_table_size = max_table_size - self._partition_count = 1 - - if input_files is None: - self.info("Merging files output by current instance.") - self._input_files = self._output_files - else: - self._input_files = input_files - - if not output_file.endswith("." + self.file_suffix): - output_file = ".".join([output_file, self.file_suffix]) - - if os.path.exists(output_file): - self.warning( - f"Target path for merged database, {output_file}, already exists." 
- ) - - if len(self._input_files) > 0: - self.info(f"Merging {len(self._input_files)} database files") - # Create one empty database table for each extraction - self._merged_table_names = self._extract_table_names( - self._input_files - ) - if self.max_table_size: - output_file = self._adjust_output_file_name(output_file) - self._create_empty_tables(output_file) - self._row_counts = self._initialize_row_counts() - # Merge temporary databases into newly created one - self._merge_temporary_databases(output_file, self._input_files) - else: - self.warning("No temporary database files found!") - - # Internal methods - def _adjust_output_file_name(self, output_file: str) -> str: - if "_part_" in output_file: - root = ( - output_file.split("_part_")[0] - + output_file.split("_part_")[1][1:] - ) - else: - root = output_file - str_list = root.split(".db") - return str_list[0] + f"_part_{self._partition_count}" + ".db" - - def _update_row_counts( - self, results: "OrderedDict[str, pd.DataFrame]" - ) -> None: - for table_name, data in results.items(): - self._row_counts[table_name] += len(data) - return - - def _initialize_row_counts(self) -> Dict[str, int]: - """Build dictionary with row counts. Initialized with 0. - - Returns: - Dictionary where every field is a table name that contains - corresponding row counts. - """ - row_counts = {} - for table_name in self._merged_table_names: - row_counts[table_name] = 0 - return row_counts - - def _create_empty_tables(self, output_file: str) -> None: - """Create tables for output database. - - Args: - output_file: Path to database. - """ - for table_name in self._merged_table_names: - column_names = self._extract_column_names( - self._input_files, table_name - ) - if len(column_names) > 1: - create_table( - column_names, - table_name, - output_file, - default_type="FLOAT", - integer_primary_key=not ( - is_pulse_map(table_name) or is_mc_tree(table_name) - ), - ) - - def _get_tables_in_database(self, db: str) -> Tuple[str, ...]: - with sqlite3.connect(db) as conn: - table_names = tuple( - [ - p[0] - for p in ( - conn.execute( - "SELECT name FROM sqlite_master WHERE type='table';" - ).fetchall() - ) - ] - ) - return table_names - - def _extract_table_names( - self, db: Union[str, List[str]] - ) -> Tuple[str, ...]: - """Get the names of all tables in database `db`.""" - if isinstance(db, str): - db = [db] - results = [self._get_tables_in_database(path) for path in db] - # @TODO: Check... - if all([results[0] == r for r in results]): - return results[0] - else: - unique_tables = [] - for tables in results: - for table in tables: - if table not in unique_tables: - unique_tables.append(table) - return tuple(unique_tables) - - def _extract_column_names( - self, db_paths: List[str], table_name: str - ) -> List[str]: - for db_path in db_paths: - tables_in_database = self._get_tables_in_database(db_path) - if table_name in tables_in_database: - with sqlite3.connect(db_path) as con: - query = f"select * from {table_name} limit 1" - columns = pd.read_sql(query, con).columns - if len(columns): - return columns - return [] - - def any_pulsemap_is_non_empty(self, data_dict: Dict[str, Dict]) -> bool: - """Check whether there are non-empty pulsemaps extracted from P frame. - - Takes in the data extracted from the P frame, then retrieves the - values, if there are any, from the pulsemap key(s) (e.g - SplitInIcePulses). If at least one of the pulsemaps is non-empty then - return true. If no pulsemaps exist, i.e., if no `I3FeatureExtractor` is - called e.g. 
because `I3GenericExtractor` is used instead, always return - True. - """ - if len(self._pulsemaps) == 0: - return True - - pulsemap_dicts = [data_dict[pulsemap] for pulsemap in self._pulsemaps] - return any(d["dom_x"] for d in pulsemap_dicts) - - def _submit_to_database( - self, database: str, key: str, data: pd.DataFrame - ) -> None: - """Submit data to the database with specified key.""" - if len(data) == 0: - self.info(f"No data provided for {key}.") - return - engine = sqlalchemy.create_engine("sqlite:///" + database) - data.to_sql(key, engine, index=False, if_exists="append") - engine.dispose() - - def _extract_everything(self, db: str) -> "OrderedDict[str, pd.DataFrame]": - """Extract everything from the temporary database `db`. - - Args: - db: Path to temporary database. - - Returns: - Dictionary containing the data for each extracted table. - """ - results = OrderedDict() - table_names = self._extract_table_names(db) - with sqlite3.connect(db) as conn: - for table_name in table_names: - query = f"select * from {table_name}" - try: - data = pd.read_sql(query, conn) - except: # noqa: E722 - data = [] - results[table_name] = data - return results - - def _merge_temporary_databases( - self, - output_file: str, - input_files: List[str], - ) -> None: - """Merge the temporary databases. - - Args: - output_file: path to the final database - input_files: list of names of temporary databases - """ - file_count = 0 - for input_file in tqdm(input_files, colour="green"): - results = self._extract_everything(input_file) - for table_name, data in results.items(): - self._submit_to_database(output_file, table_name, data) - file_count += 1 - if (self.max_table_size is not None) & ( - file_count < len(input_files) - ): - self._update_row_counts(results) - maximum_row_count_reached = False - for table in self._row_counts.keys(): - assert self.max_table_size is not None - if self._row_counts[table] >= self.max_table_size: - maximum_row_count_reached = True - if maximum_row_count_reached: - self._partition_count += 1 - output_file = self._adjust_output_file_name(output_file) - self.info( - f"Maximum row count reached. Creating new partition at {output_file}" - ) - self._create_empty_tables(output_file) - self._row_counts = self._initialize_row_counts() - - -# Implementation-specific utility function(s) -def construct_dataframe(extraction: Dict[str, Any]) -> pd.DataFrame: - """Convert extraction to pandas.DataFrame. - - Args: - extraction: Dictionary with the extracted data. - - Returns: - Extraction as pandas.DataFrame. 
- """ - all_scalars = True - for value in extraction.values(): - if isinstance(value, (list, tuple, dict)): - all_scalars = False - break - - out = pd.DataFrame(extraction, index=[0] if all_scalars else None) - return out - - -def is_pulse_map(table_name: str) -> bool: - """Check whether `table_name` corresponds to a pulse map.""" - return "pulse" in table_name.lower() or "series" in table_name.lower() - - -def is_mc_tree(table_name: str) -> bool: - """Check whether `table_name` corresponds to an MC tree.""" - return "I3MCTree" in table_name diff --git a/src/graphnet/data/utilities/__init__.py b/src/graphnet/data/utilities/__init__.py index 0dd9e0600..ad4f0c7db 100644 --- a/src/graphnet/data/utilities/__init__.py +++ b/src/graphnet/data/utilities/__init__.py @@ -1 +1,4 @@ """Utilities for use across `graphnet.data`.""" +from .sqlite_utilities import create_table_and_save_to_sql +from .sqlite_utilities import get_primary_keys +from .sqlite_utilities import query_database diff --git a/src/graphnet/data/utilities/parquet_to_sqlite.py b/src/graphnet/data/utilities/parquet_to_sqlite.py index 146e69ce8..11114698e 100644 --- a/src/graphnet/data/utilities/parquet_to_sqlite.py +++ b/src/graphnet/data/utilities/parquet_to_sqlite.py @@ -9,7 +9,9 @@ import pandas as pd from tqdm.auto import trange -from graphnet.data.sqlite.sqlite_utilities import create_table_and_save_to_sql +from graphnet.data.utilities.sqlite_utilities import ( + create_table_and_save_to_sql, +) from graphnet.utilities.logging import Logger diff --git a/src/graphnet/data/sqlite/sqlite_utilities.py b/src/graphnet/data/utilities/sqlite_utilities.py similarity index 72% rename from src/graphnet/data/sqlite/sqlite_utilities.py rename to src/graphnet/data/utilities/sqlite_utilities.py index 23bae802d..cfa308ba2 100644 --- a/src/graphnet/data/sqlite/sqlite_utilities.py +++ b/src/graphnet/data/utilities/sqlite_utilities.py @@ -1,7 +1,7 @@ """SQLite-specific utility functions for use in `graphnet.data`.""" import os.path -from typing import List +from typing import List, Dict, Tuple import pandas as pd import sqlalchemy @@ -16,6 +16,58 @@ def database_exists(database_path: str) -> bool: return os.path.exists(database_path) +def query_database(database: str, query: str) -> pd.DataFrame: + """Execute query on database, and return result. + + Args: + database: path to database. + query: query to be executed. + + Returns: + DataFrame containing the result of the query. + """ + with sqlite3.connect(database) as conn: + return pd.read_sql(query, conn) + + +def get_primary_keys(database: str) -> Tuple[Dict[str, str], str]: + """Get name of primary key column for each table in database. + + Args: + database: path to database. + + Returns: + A dictionary containing the names of primary keys in each table of + `database`. E.g. {'truth': "event_no", + 'SplitInIcePulses': None} + Name of the primary key. 
+ """ + with sqlite3.connect(database) as conn: + query = 'SELECT name FROM sqlite_master WHERE type == "table"' + table_names = [table[0] for table in conn.execute(query).fetchall()] + + integer_primary_key = {} + for table in table_names: + query = f"SELECT l.name FROM pragma_table_info('{table}') as l WHERE l.pk = 1;" + first_primary_key = [ + key[0] for key in conn.execute(query).fetchall() + ] + integer_primary_key[table] = ( + first_primary_key[0] if len(first_primary_key) else None + ) + + # Get the primary key column name + primary_key_candidates = [] + for val in set(integer_primary_key.values()): + if val is not None: + primary_key_candidates.append(val) + + # There should only be one primary key: + assert len(primary_key_candidates) == 1 + + return integer_primary_key, primary_key_candidates[0] + + def database_table_exists(database_path: str, table_name: str) -> bool: """Check whether `table_name` exists in database at `database_path`.""" if not database_exists(database_path): diff --git a/src/graphnet/data/writers/__init__.py b/src/graphnet/data/writers/__init__.py new file mode 100644 index 000000000..ad3e2748e --- /dev/null +++ b/src/graphnet/data/writers/__init__.py @@ -0,0 +1,4 @@ +"""Modules for saving interim dataformat to various data backends.""" +from .graphnet_writer import GraphNeTWriter +from .parquet_writer import ParquetWriter +from .sqlite_writer import SQLiteWriter diff --git a/src/graphnet/data/writers.py b/src/graphnet/data/writers/graphnet_writer.py similarity index 57% rename from src/graphnet/data/writers.py rename to src/graphnet/data/writers/graphnet_writer.py index d23b21ac8..04ee079f4 100644 --- a/src/graphnet/data/writers.py +++ b/src/graphnet/data/writers/graphnet_writer.py @@ -5,20 +5,16 @@ """ import os -from typing import List, Union, Dict, Any, OrderedDict +from typing import Dict, List from abc import abstractmethod, ABC from graphnet.utilities.decorators import final from graphnet.utilities.logging import Logger -from graphnet.data.sqlite.sqlite_utilities import ( - create_table, - create_table_and_save_to_sql, -) import pandas as pd -class GraphNeTFileSaveMethod(Logger, ABC): +class GraphNeTWriter(Logger, ABC): """Generic base class for saving interim data format in `DataConverter`. Classes inheriting from `GraphNeTFileSaveMethod` must implement the @@ -43,6 +39,21 @@ def _save_file( output_file_path: output file path. n_events: Number of events container in `data`. """ + raise NotImplementedError + + @abstractmethod + def merge_files( + self, + files: List[str], + output_dir: str, + ) -> None: + """Merge smaller files. + + Args: + files: Files to be merged. + output_dir: The directory to store the merged files in. + """ + raise NotImplementedError @final def __call__( @@ -76,49 +87,3 @@ def __call__( def file_extension(self) -> str: """Return file extension used to store the data.""" return self._file_extension # type: ignore - - -class SQLiteSaveMethod(GraphNeTFileSaveMethod): - """A method for saving GraphNeT's interim dataformat to SQLite.""" - - _file_extension = ".db" - - def _save_file( - self, - data: Dict[str, pd.DataFrame], - output_file_path: str, - n_events: int, - ) -> None: - """Save data to SQLite database.""" - # Check(s) - if os.path.exists(output_file_path): - self.warning( - f"Output file {output_file_path} already exists. Appending." - ) - - # Concatenate data - if len(data) == 0: - self.warning( - "No data was extracted from the processed I3 file(s). 
" - f"No data saved to {output_file_path}" - ) - return - - saved_any = False - # Save each dataframe to SQLite database - self.debug(f"Saving to {output_file_path}") - for table, df in data.items(): - if len(df) > 0: - create_table_and_save_to_sql( - df, - table, - output_file_path, - default_type="FLOAT", - integer_primary_key=len(df) <= n_events, - ) - saved_any = True - - if saved_any: - self.debug("- Done saving") - else: - self.warning(f"No data saved to {output_file_path}") diff --git a/src/graphnet/data/writers/parquet_writer.py b/src/graphnet/data/writers/parquet_writer.py new file mode 100644 index 000000000..a8e74f11f --- /dev/null +++ b/src/graphnet/data/writers/parquet_writer.py @@ -0,0 +1,34 @@ +"""DataConverter for the Parquet backend.""" + +import os +from typing import List, Optional, Dict + +import awkward +import pandas as pd + +from .graphnet_writer import GraphNeTWriter + + +class ParquetWriter(GraphNeTWriter): + """Class for writing interim data format to Parquet.""" + + # Class variables + file_suffix: str = ".parquet" + + # Abstract method implementation(s) + def _save_file( + self, + data: Dict[str, pd.DataFrame], + output_file_path: str, + n_events: int, + ) -> None: + """Save data to parquet file.""" + # Check(s) + if os.path.exists(output_file_path): + self.warning( + f"Output file {output_file_path} already exists. Overwriting." + ) + + self.debug(f"Saving to {output_file_path}") + awkward.to_parquet(awkward.from_iter(data), output_file_path) + self.debug("- Done saving") diff --git a/src/graphnet/data/writers/sqlite_writer.py b/src/graphnet/data/writers/sqlite_writer.py new file mode 100644 index 000000000..e9f400c53 --- /dev/null +++ b/src/graphnet/data/writers/sqlite_writer.py @@ -0,0 +1,224 @@ +"""Module containing `GraphNeTFileSaveMethod`(s). + +These modules are used to save the interim data format from `DataConverter` to +a deep-learning friendly file format. +""" + +import os +from tqdm import tqdm +from typing import List, Dict, Optional + +from graphnet.data.utilities import ( + create_table_and_save_to_sql, + get_primary_keys, + query_database, +) +import pandas as pd +from .graphnet_writer import GraphNeTWriter + + +class SQLiteWriter(GraphNeTWriter): + """A method for saving GraphNeT's interim dataformat to SQLite.""" + + def __init__( + self, + merged_database_name: str = "merged.db", + max_table_size: Optional[int] = None, + ) -> None: + """Initialize `SQLiteWriter`. + + Args: + merged_database_name: name of the database, not path, that files + will be merged into. Defaults to "merged.db". + max_table_size: The maximum number of rows in any given table. + If given, the merging proceedure splits the databases into + partitions each with a maximum table size of max_table_size. + Note that the size is approximate. This feature is useful if + you have many events, as tables exceeding + 400 million rows tend to be noticably slower to query. + Defaults to None (All events are put into a single database). 
+ """ + # Member Variables + self._file_extension = ".db" + self._max_table_size = max_table_size + self._database_name = merged_database_name + + # Add file extension to database name if forgotten + if not self._database_name.endswith(self._file_extension): + self._database_name = self._database_name + self._file_extension + + # Base class constructor + super().__init__(name=__name__, class_name=self.__class__.__name__) + + def _save_file( + self, + data: Dict[str, pd.DataFrame], + output_file_path: str, + n_events: int, + ) -> None: + """Save data to SQLite database.""" + # Check(s) + if os.path.exists(output_file_path): + self.warning( + f"Output file {output_file_path} already exists. Appending." + ) + + # Concatenate data + if len(data) == 0: + self.warning( + "No data was extracted from the processed I3 file(s). " + f"No data saved to {output_file_path}" + ) + return + + saved_any = False + # Save each dataframe to SQLite database + self.debug(f"Saving to {output_file_path}") + for table, df in data.items(): + if len(df) > 0: + create_table_and_save_to_sql( + df, + table, + output_file_path, + default_type="FLOAT", + integer_primary_key=len(df) <= n_events, + ) + saved_any = True + + if saved_any: + self.debug("- Done saving") + else: + self.warning(f"No data saved to {output_file_path}") + + def merge_files( + self, + files: List[str], + output_dir: str, + ) -> None: + """SQLite-specific method for merging output files/databases. + + Args: + files: paths to SQLite databases that needs to be merged. + output_dir: path to store the merged database(s) in. + database_name: name, not path, of database. E.g. "my_database". + max_table_size: The maximum number of rows in any given table. + If given, the merging proceedure splits the databases into + partitions each with a maximum table size of max_table_size. + Note that the size is approximate. This feature is useful if + you have many events, as tables exceeding + 400 million rows tend to be noticably slower to query. + Defaults to None (All events are put into a single database.) + """ + # Warnings + if self._max_table_size: + self.warning( + f"Merging got max_table_size of {self._max_table_size}." + " Will attempt to create databases with a maximum row count of" + " this size." + ) + + # Set variables + self._partition_count = 1 + + # Construct full database path + database_path = os.path.join(output_dir, self._database_name) + print(database_path) + # Start merging if files are given + if len(files) > 0: + os.makedirs(output_dir, exist_ok=True) + self.info(f"Merging {len(files)} database files") + self._merge_databases(files=files, database_path=database_path) + else: + self.warning("No database files given! Exiting.") + + def _merge_databases( + self, + files: List[str], + database_path: str, + ) -> None: + """Merge the temporary databases. + + Args: + files: List of files to be merged. + database_path: Path to a database, can be an empty path, where the + databases listed in `files` will be merged into. If no database + exists at the given path, one will be created. 
+ """ + if os.path.exists(database_path): + self.warning( + "Target path for merged database", + f"{database_path}, already exists.", + ) + + if self._max_table_size is not None: + database_path = self._adjust_output_path(database_path) + self._row_counts: Dict[str, int] = {} + self._largest_table = 0 + + # Merge temporary databases into newly created one + for file_count, input_file in tqdm(enumerate(files), colour="green"): + + # Extract table names and index column name in database + tables, primary_key = get_primary_keys(database=input_file) + + for table_name in tables.keys(): + # Extract all data in the table from the given database + df = query_database( + database=input_file, query=f"SELECT * FROM {table_name}" + ) + + # Infer whether the table was previously indexed with + # A primary key or not. len(tables[table]) = 0 if not. + integer_primary_key = ( + True if tables[table_name] is not None else False + ) + + # Submit to new database + create_table_and_save_to_sql( + df=df, + table_name=table_name, + database_path=database_path, + index_column=primary_key, + integer_primary_key=integer_primary_key, + ) + + # Update row counts if needed + if self._max_table_size is not None: + self._update_row_counts(df=df, table_name=table_name) + + if (self._max_table_size is not None) & (file_count < len(files)): + assert self._max_table_size is not None # mypy... + if self._largest_table >= self._max_table_size: + # Increment partition, reset counts, adjust output path + self._partition_count += 1 + self._row_counts = {} + self._largest_table = 0 + database_path = self._adjust_output_path(database_path) + self.info( + "Maximum row count reached." + f" Creating new partition at {database_path}" + ) + + # Internal methods + + def _adjust_output_path(self, output_file: str) -> str: + """Adjust the file path to reflect that it is a partition.""" + path_without_extension, extension = os.path.splitext(output_file) + if "_part_" in path_without_extension: + # if true, this is already a partition. 
+ database_name = path_without_extension.split("_part_")[:-1][0] + else: + database_name = path_without_extension + # split into multiple lines to avoid one long + database_name = database_name + f"_part_{self._partition_count}" + database_name = database_name + extension + return database_name + + def _update_row_counts(self, df: pd.DataFrame, table_name: str) -> None: + if table_name in self._row_counts.keys(): + self._row_counts[table_name] += len(df) + else: + self._row_counts[table_name] = len(df) + + self._largest_table = max(self._row_counts.values()) + return diff --git a/src/graphnet/deployment/i3modules/graphnet_module.py b/src/graphnet/deployment/i3modules/graphnet_module.py index d3aa878e0..a385413b3 100644 --- a/src/graphnet/deployment/i3modules/graphnet_module.py +++ b/src/graphnet/deployment/i3modules/graphnet_module.py @@ -7,7 +7,7 @@ import torch from torch_geometric.data import Data, Batch -from graphnet.data.extractors import ( +from graphnet.data.extractors.icecube import ( I3FeatureExtractor, I3FeatureExtractorIceCubeUpgrade, ) @@ -70,7 +70,7 @@ def __init__( self._i3_extractors = [pulsemap_extractor] for i3_extractor in self._i3_extractors: - i3_extractor.set_files(i3_file="", gcd_file=self._gcd_file) + i3_extractor.set_gcd(i3_file="", gcd_file=self._gcd_file) @abstractmethod def __call__(self, frame: I3Frame) -> bool: diff --git a/src/graphnet/models/graphs/edges/minkowski.py b/src/graphnet/models/graphs/edges/minkowski.py index 5d1134ec5..2526de1cb 100644 --- a/src/graphnet/models/graphs/edges/minkowski.py +++ b/src/graphnet/models/graphs/edges/minkowski.py @@ -69,12 +69,13 @@ def _construct_edges(self, graph: Data) -> Data: row = [] col = [] for batch in range(x.shape[0]): + x_masked = x[batch][mask[batch]] distance_mat = compute_minkowski_distance_mat( - x_masked := x[batch][mask[batch]], - x_masked, - self.c, - self.space_coords, - self.time_coord, + x=x_masked, + y=x_masked, + c=self.c, + space_coords=self.space_coords, + time_coord=self.time_coord, ) num_points = x_masked.shape[0] num_edges = min(self.nb_nearest_neighbours, num_points) diff --git a/src/graphnet/training/weight_fitting.py b/src/graphnet/training/weight_fitting.py index a52c91b29..97411bbe5 100644 --- a/src/graphnet/training/weight_fitting.py +++ b/src/graphnet/training/weight_fitting.py @@ -7,7 +7,9 @@ import pandas as pd import sqlite3 -from graphnet.data.sqlite.sqlite_utilities import create_table_and_save_to_sql +from graphnet.data.utilities.sqlite_utilities import ( + create_table_and_save_to_sql, +) from graphnet.utilities.logging import Logger diff --git a/tests/data/test_dataconverters_and_datasets.py b/tests/data/test_dataconverters_and_datasets.py index 480f11d4d..53a73a5f4 100644 --- a/tests/data/test_dataconverters_and_datasets.py +++ b/tests/data/test_dataconverters_and_datasets.py @@ -11,7 +11,7 @@ from graphnet.constants import TEST_OUTPUT_DIR from graphnet.data.constants import FEATURES, TRUTH from graphnet.data.dataconverter import DataConverter -from graphnet.data.extractors import ( +from graphnet.data.extractors.icecube import ( I3FeatureExtractorIceCube86, I3TruthExtractor, I3RetroExtractor, diff --git a/tests/data/test_i3extractor.py b/tests/data/test_i3extractor.py index 3fa19f078..f1c8c3ff7 100644 --- a/tests/data/test_i3extractor.py +++ b/tests/data/test_i3extractor.py @@ -1,6 +1,6 @@ """Unit tests for I3Extractor class.""" -from graphnet.data.extractors import ( +from graphnet.data.extractors.icecube import ( I3FeatureExtractorIceCube86, I3TruthExtractor, 
I3RetroExtractor, diff --git a/tests/data/test_i3genericextractor.py b/tests/data/test_i3genericextractor.py index 314fa5f44..e77727eaf 100644 --- a/tests/data/test_i3genericextractor.py +++ b/tests/data/test_i3genericextractor.py @@ -5,7 +5,7 @@ import numpy as np import graphnet.constants -from graphnet.data.extractors import ( +from graphnet.data.extractors.icecube import ( I3FeatureExtractorIceCube86, I3TruthExtractor, I3GenericExtractor, @@ -40,9 +40,9 @@ def test_i3genericextractor(test_data_dir: str = TEST_DATA_DIR) -> None: i3_file = os.path.join(test_data_dir, FILE_NAME) + ".i3.gz" gcd_file = os.path.join(test_data_dir, GCD_FILE) - generic_extractor.set_files(i3_file, gcd_file) - truth_extractor.set_files(i3_file, gcd_file) - feature_extractor.set_files(i3_file, gcd_file) + generic_extractor.set_gcd(i3_file, gcd_file) + truth_extractor.set_gcd(i3_file, gcd_file) + feature_extractor.set_gcd(i3_file, gcd_file) i3_file_io = dataio.I3File(i3_file, "r") ix_test = 5 From 5a23529f224497e1d3c4f3e91deabf7924b88f60 Mon Sep 17 00:00:00 2001 From: ArturoLlorente Date: Thu, 8 Feb 2024 19:20:31 +0100 Subject: [PATCH 041/124] non scaled model working as expected. Still some work needed in the scaled one --- src/graphnet/models/components/layers.py | 8 +-- src/graphnet/models/gnn/icemix.py | 74 ++++++++++-------------- 2 files changed, 35 insertions(+), 47 deletions(-) diff --git a/src/graphnet/models/components/layers.py b/src/graphnet/models/components/layers.py index a1832161c..2ded64a97 100644 --- a/src/graphnet/models/components/layers.py +++ b/src/graphnet/models/components/layers.py @@ -379,10 +379,10 @@ def forward( #Lmax: Optional[int] = None, ) -> Tensor: """Forward pass.""" - #pos = x[:,:,:3] - #time = x[:,:,3] - spacetime_interval = (x[:, :, :3, None] - x[:, :, None, :3]).pow(2).sum(-1) - ( - (x[:, :, 3, None] - x[:, :, None, 3]) * (3e4 / 500 * 3e-1) + pos = x[:,:,:3] + time = x[:,:,3] + spacetime_interval = (pos[:, :, None] - pos[:, None, :]).pow(2).sum(-1) - ( + (time[:, :, None] - time[:, None, :]) * (3e4 / 500 * 3e-1) ).pow(2) four_distance = torch.sign(spacetime_interval) * torch.sqrt(torch.abs(spacetime_interval)) sin_emb = self.sin_emb(1024 * four_distance.clip(-4, 4)) diff --git a/src/graphnet/models/gnn/icemix.py b/src/graphnet/models/gnn/icemix.py index 212e4284a..a594d4dd9 100644 --- a/src/graphnet/models/gnn/icemix.py +++ b/src/graphnet/models/gnn/icemix.py @@ -1,7 +1,7 @@ import torch import torch.nn as nn import torch.utils.checkpoint as checkpoint -import math +from typing import List from graphnet.models.components.layers import FourierEncoder, SpacetimeEncoder, Block_rel, Block from graphnet.models.gnn.dynedge import DynEdge @@ -14,17 +14,24 @@ from torch_geometric.data import Data from torch import Tensor +def convert_data(data: Data): + """Convert the input data to a tensor of shape (B, L, D)""" + x_list = torch.split(data.x, data.n_pulses.tolist()) + x = torch.nn.utils.rnn.pad_sequence(x_list, batch_first=True, padding_value=torch.inf) + mask = torch.ne(x[:,:,1], torch.inf) + x[~mask] = 0 + return x, mask + class DeepIce(GNN): def __init__( self, - dim=384, - dim_base=128, - depth=12, - use_checkpoint=False, - head_size=32, - depth_rel=4, - n_rel=1, - **kwargs, + dim: int = 384, + dim_base: int = 128, + depth: int = 12, + head_size: int = 32, + depth_rel: int = 4, + n_rel: int = 1, + max_pulses: int = 768, ): super().__init__(dim_base, dim) self.fourier_ext = FourierEncoder(dim_base, dim) @@ -45,29 +52,18 @@ def __init__( for i in range(depth) ] ) - 
self.use_checkpoint = use_checkpoint self.n_rel = n_rel - - @torch.jit.ignore def no_weight_decay(self): return {"cls_token"} - - def _convert_data(self, data: Data): - """Convert the input data to a tensor of shape (B, L, D)""" - x_list = torch.split(data.x, data.n_pulses.tolist()) - x = torch.nn.utils.rnn.pad_sequence(x_list, batch_first=True, padding_value=torch.inf) - mask = torch.ne(x[:,:,1], torch.inf) - x[~mask] = 0 - return x, mask def forward(self, data: Data) -> Tensor: - x0, mask = self._convert_data(data) + x0, mask = convert_data(data) n_pulses = data.n_pulses x = self.fourier_ext(x0, n_pulses) rel_pos_bias, rel_enc = self.rel_pos(x0) - B, _ = mask.shape + batch_size = mask.shape[0] attn_mask = torch.zeros(mask.shape, device=mask.device) attn_mask[~mask] = -torch.inf @@ -77,18 +73,15 @@ def forward(self, data: Data) -> Tensor: rel_pos_bias = None mask = torch.cat( - [torch.ones(B, 1, dtype=mask.dtype, device=mask.device), mask], 1 + [torch.ones(batch_size, 1, dtype=mask.dtype, device=mask.device), mask], 1 ) attn_mask = torch.zeros(mask.shape, device=mask.device) attn_mask[~mask] = -torch.inf - cls_token = self.cls_token.weight.unsqueeze(0).expand(B, -1, -1) + cls_token = self.cls_token.weight.unsqueeze(0).expand(batch_size, -1, -1) x = torch.cat([cls_token, x], 1) for blk in self.blocks: - if self.use_checkpoint: - x = checkpoint.checkpoint(blk, x, None, attn_mask) - else: - x = blk(x, None, attn_mask) + x = blk(x, None, attn_mask) return x[:, 0] @@ -96,16 +89,15 @@ def forward(self, data: Data) -> Tensor: class DeepIceWithDynEdge(GNN): def __init__( self, - dim=384, - dim_base=128, - depth=8, - use_checkpoint=False, - head_size=64, - knn_features=3, - **kwargs, + dim: int = 384, + dim_base: int = 128, + depth: int = 8, + head_size: int = 64, + features_subset: List[int] = [0, 1, 2], + max_pulses: int = 768, ): super().__init__(dim_base, dim) - self.knn_features = knn_features + self.features_subset = features_subset self.fourier_ext = FourierEncoder(dim_base, dim // 2, scaled=True) self.rel_pos = SpacetimeEncoder(head_size) self.sandwich = nn.ModuleList( @@ -129,7 +121,6 @@ def __init__( for i in range(depth) ] ) - self.use_checkpoint = use_checkpoint self.dyn_edge = DynEdge( 9, post_processing_layer_sizes=[336, dim // 2], @@ -142,7 +133,7 @@ def no_weight_decay(self): return {"cls_token"} def forward(self, data: Data) -> Tensor: - mask = data.mask + x0, mask = convert_data(data) graph_feature = torch.concat( [ data.pos[mask], @@ -159,7 +150,7 @@ def forward(self, data: Data) -> Tensor: rel_pos_bias, rel_enc = self.rel_pos(data, Lmax) mask = mask[:, :Lmax] batch_index = mask.nonzero()[:, 0] - edge_index = knn_graph(x=graph_feature[:, :self.knn_features], k=8, batch=batch_index).to( + edge_index = knn_graph(x=graph_feature[:, self.features_subset], k=8, batch=batch_index).to( mask.device ) graph_feature = self.dyn_edge( @@ -185,9 +176,6 @@ def forward(self, data: Data) -> Tensor: x = torch.cat([cls_token, x], 1) for blk in self.blocks: - if self.use_checkpoint: - x = checkpoint.checkpoint(blk, x, None, attn_mask) - else: - x = blk(x, None, attn_mask) + x = blk(x, None, attn_mask) return x[:, 0] \ No newline at end of file From 5bfe19af63b50fcb1f960e16ee98905a0d26c771 Mon Sep 17 00:00:00 2001 From: ArturoLlorente Date: Fri, 9 Feb 2024 04:09:05 +0100 Subject: [PATCH 042/124] Working version. changes missing in DynEdge. 
Docstrings generated --- src/graphnet/models/components/layers.py | 2 +- src/graphnet/models/gnn/dynedge.py | 29 ++++---- src/graphnet/models/gnn/icemix.py | 87 ++++++++++++++---------- 3 files changed, 68 insertions(+), 50 deletions(-) diff --git a/src/graphnet/models/components/layers.py b/src/graphnet/models/components/layers.py index 2ded64a97..4f650fbf0 100644 --- a/src/graphnet/models/components/layers.py +++ b/src/graphnet/models/components/layers.py @@ -387,7 +387,7 @@ def forward( four_distance = torch.sign(spacetime_interval) * torch.sqrt(torch.abs(spacetime_interval)) sin_emb = self.sin_emb(1024 * four_distance.clip(-4, 4)) rel_attn = self.projection(sin_emb) - return rel_attn, sin_emb + return rel_attn # BEiTv2 block class Block_rel(LightningModule): diff --git a/src/graphnet/models/gnn/dynedge.py b/src/graphnet/models/gnn/dynedge.py index cb65df9ab..117411b89 100644 --- a/src/graphnet/models/gnn/dynedge.py +++ b/src/graphnet/models/gnn/dynedge.py @@ -309,19 +309,20 @@ def forward(self, data: Data) -> Tensor: # Post-processing x = self._post_processing(x) - # (Optional) Global pooling - if self._global_pooling_schemes: - x = self._global_pooling(x, batch=batch) - if self._add_global_variables_after_pooling: - x = torch.cat( - [ - x, - global_variables, - ], - dim=1, - ) - - # Read-out - x = self._readout(x) + if not self._icemix_encoder: + # (Optional) Global pooling + if self._global_pooling_schemes: + x = self._global_pooling(x, batch=batch) + if self._add_global_variables_after_pooling: + x = torch.cat( + [ + x, + global_variables, + ], + dim=1, + ) + + # Read-out + x = self._readout(x) return x \ No newline at end of file diff --git a/src/graphnet/models/gnn/icemix.py b/src/graphnet/models/gnn/icemix.py index a594d4dd9..88e6ceb4a 100644 --- a/src/graphnet/models/gnn/icemix.py +++ b/src/graphnet/models/gnn/icemix.py @@ -1,14 +1,20 @@ +"""Implementation of IceMix architecture used in. + + IceCube - Neutrinos in Deep Ice +Reconstruct the direction of neutrinos from the Universe to the South Pole + +Kaggle competition. + +Solution by DrHB: https://github.com/DrHB/icecube-2nd-place +""" import torch import torch.nn as nn -import torch.utils.checkpoint as checkpoint from typing import List from graphnet.models.components.layers import FourierEncoder, SpacetimeEncoder, Block_rel, Block from graphnet.models.gnn.dynedge import DynEdge from graphnet.models.gnn.gnn import GNN -from timm.models.layers import trunc_normal_ - from torch_geometric.nn.pool import knn_graph from torch_geometric.utils import to_dense_batch from torch_geometric.data import Data @@ -23,6 +29,7 @@ def convert_data(data: Data): return x, mask class DeepIce(GNN): + """DeepIce model.""" def __init__( self, dim: int = 384, @@ -31,8 +38,17 @@ def __init__( head_size: int = 32, depth_rel: int = 4, n_rel: int = 1, - max_pulses: int = 768, ): + """Construct `DeepIce`. + + Args: + dim: The latent feature dimension. + dim_base: The base feature dimension. + depth: The depth of the transformer. + head_size: The size of the attention heads. + depth_rel: The depth of the relative transformer. + n_rel: The number of relative transformer layers to use. 
+ """ super().__init__(dim_base, dim) self.fourier_ext = FourierEncoder(dim_base, dim) self.rel_pos = SpacetimeEncoder(head_size) @@ -59,10 +75,11 @@ def no_weight_decay(self): return {"cls_token"} def forward(self, data: Data) -> Tensor: + """Apply learnable forward pass.""" x0, mask = convert_data(data) n_pulses = data.n_pulses x = self.fourier_ext(x0, n_pulses) - rel_pos_bias, rel_enc = self.rel_pos(x0) + rel_pos_bias = self.rel_pos(x0) batch_size = mask.shape[0] attn_mask = torch.zeros(mask.shape, device=mask.device) attn_mask[~mask] = -torch.inf @@ -87,6 +104,7 @@ def forward(self, data: Data) -> Tensor: class DeepIceWithDynEdge(GNN): + """DeepIce model with DynEdge.""" def __init__( self, dim: int = 384, @@ -94,8 +112,17 @@ def __init__( depth: int = 8, head_size: int = 64, features_subset: List[int] = [0, 1, 2], - max_pulses: int = 768, ): + """Construct `DeepIceWithDynEdge`. + + Args: + dim: The latent feature dimension. + dim_base: The base feature dimension. + depth: The depth of the transformer. + head_size: The size of the attention heads. + features_subset: The subset of features to + use for the edge construction. + """ super().__init__(dim_base, dim) self.features_subset = features_subset self.fourier_ext = FourierEncoder(dim_base, dim // 2, scaled=True) @@ -125,7 +152,8 @@ def __init__( 9, post_processing_layer_sizes=[336, dim // 2], dynedge_layer_sizes=[(128, 256), (336, 256), (336, 256), (336, 256)], - global_pooling_schemes=None + global_pooling_schemes=None, + icemix_encoder=True, ) @torch.jit.ignore @@ -133,46 +161,35 @@ def no_weight_decay(self): return {"cls_token"} def forward(self, data: Data) -> Tensor: + """Apply learnable forward pass.""" x0, mask = convert_data(data) - graph_feature = torch.concat( - [ - data.pos[mask], - data.time[mask].view(-1, 1), - data.auxiliary[mask].view(-1, 1), - data.qe[mask].view(-1, 1), - data.charge[mask].view(-1, 1), - data.ice_properties[mask], - ], - dim=1, - ) - Lmax = mask.sum(-1).max() - x = self.fourier_ext(data, Lmax) - rel_pos_bias, rel_enc = self.rel_pos(data, Lmax) - mask = mask[:, :Lmax] - batch_index = mask.nonzero()[:, 0] - edge_index = knn_graph(x=graph_feature[:, self.features_subset], k=8, batch=batch_index).to( - mask.device - ) - graph_feature = self.dyn_edge( - graph_feature, edge_index, batch_index, data.n_pulses - ) - graph_feature, _ = to_dense_batch(graph_feature, batch_index) + n_pulses = data.n_pulses + for i in range(3, 7): + data.x[:, i] = torch.squeeze(data.x[:, i].view(-1, 1)) + + x = self.fourier_ext(x0, n_pulses) + rel_pos_bias = self.rel_pos(x0) + #edge_index = knn_graph(x=graph_feature[:, self.features_subset], k=8, batch=batch).to( + # mask.device + #) + graph = self.dyn_edge(data) + graph, _ = to_dense_batch(graph, data.batch) - B, _ = mask.shape + batch_size = mask.shape[0] attn_mask = torch.zeros(mask.shape, device=mask.device) attn_mask[~mask] = -torch.inf - x = torch.cat([x, graph_feature], 2) + x = torch.cat([x, graph], 2) for blk in self.sandwich: x = blk(x, attn_mask, rel_pos_bias) - if self.knn_features == 3: + if len(self.features_subset) == 3: rel_pos_bias = None mask = torch.cat( - [torch.ones(B, 1, dtype=mask.dtype, device=mask.device), mask], 1 + [torch.ones(batch_size, 1, dtype=mask.dtype, device=mask.device), mask], 1 ) attn_mask = torch.zeros(mask.shape, device=mask.device) attn_mask[~mask] = -torch.inf - cls_token = self.cls_token.weight.unsqueeze(0).expand(B, -1, -1) + cls_token = self.cls_token.weight.unsqueeze(0).expand(batch_size, -1, -1) x = torch.cat([cls_token, x], 1) for 
blk in self.blocks: From e27c2214fe15485b589164395a274f61b6a44182 Mon Sep 17 00:00:00 2001 From: ArturoLlorente Date: Fri, 9 Feb 2024 12:29:05 +0100 Subject: [PATCH 043/124] error in attn_mask.dtype. Everything else already workin --- src/graphnet/models/detector/icecube.py | 42 +------------------------ src/graphnet/models/gnn/dynedge.py | 5 ++- src/graphnet/models/gnn/icemix.py | 3 -- 3 files changed, 3 insertions(+), 47 deletions(-) diff --git a/src/graphnet/models/detector/icecube.py b/src/graphnet/models/detector/icecube.py index c99706149..3838eae1f 100644 --- a/src/graphnet/models/detector/icecube.py +++ b/src/graphnet/models/detector/icecube.py @@ -28,6 +28,7 @@ def feature_map(self) -> Dict[str, Callable]: "charge": self._charge, "rde": self._rde, "pmt_area": self._pmt_area, + "hlc": self._identity, } return feature_map @@ -158,44 +159,3 @@ def _dom_xyz(self, x: torch.tensor) -> torch.tensor: def _pmt_area(self, x: torch.tensor) -> torch.tensor: return x / 0.05 - - -class IceMixDetector(Detector): - """`Detector` class for IceCube-86.""" - - geometry_table_path = os.path.join( - ICECUBE_GEOMETRY_TABLE_DIR, "icecube86.parquet" - ) - - xyz = ["dom_x", "dom_y", "dom_z"] - string_id_column = "string" - sensor_id_column = "sensor_id" - - def feature_map(self) -> Dict[str, Callable]: - """Map standardization functions to each dimension of input data.""" - feature_map = { - "dom_x": self._dom_xyz, - "dom_y": self._dom_xyz, - "dom_z": self._dom_xyz, - "dom_time": self._dom_time, - "charge": self._charge, - "rde": self._rde, - "pmt_area": self._pmt_area, - "hlc": self._identity, - } - return feature_map - - def _dom_xyz(self, x: torch.tensor) -> torch.tensor: - return x / 500.0 - - def _dom_time(self, x: torch.tensor) -> torch.tensor: - return (x - 1.0e04) / 3.0e4 - - def _charge(self, x: torch.tensor) -> torch.tensor: - return torch.log10(x) / 3.0 - - def _rde(self, x: torch.tensor) -> torch.tensor: - return (x - 1.0) / 0.35 - - def _pmt_area(self, x: torch.tensor) -> torch.tensor: - return x / 0.05 diff --git a/src/graphnet/models/gnn/dynedge.py b/src/graphnet/models/gnn/dynedge.py index 117411b89..5ab46e413 100644 --- a/src/graphnet/models/gnn/dynedge.py +++ b/src/graphnet/models/gnn/dynedge.py @@ -66,7 +66,8 @@ def __init__( operations. icemix_encoder: Whether to use the IceCubeMix encoder. If `True`, the activation function is GELU, and layer normalization is - applied after each linear layer. Defaults to `False`. + applied after each linear layer. Additionally global pooling + and readout layer skipped. Defaults to `False`. """ # Latent feature subset for computing nearest neighbours in DynEdge. 
if features_subset is None: @@ -222,8 +223,6 @@ def _construct_layers(self) -> None: layer_sizes = [nb_latent_features] + list(self._readout_layer_sizes) for nb_in, nb_out in zip(layer_sizes[:-1], layer_sizes[1:]): readout_layers.append(torch.nn.Linear(nb_in, nb_out)) - if self._icemix_encoder: - readout_layers.append(torch.nn.LayerNorm(nb_out)) readout_layers.append(self._activation) self._readout = torch.nn.Sequential(*readout_layers) diff --git a/src/graphnet/models/gnn/icemix.py b/src/graphnet/models/gnn/icemix.py index 88e6ceb4a..d562bd5d4 100644 --- a/src/graphnet/models/gnn/icemix.py +++ b/src/graphnet/models/gnn/icemix.py @@ -169,9 +169,6 @@ def forward(self, data: Data) -> Tensor: x = self.fourier_ext(x0, n_pulses) rel_pos_bias = self.rel_pos(x0) - #edge_index = knn_graph(x=graph_feature[:, self.features_subset], k=8, batch=batch).to( - # mask.device - #) graph = self.dyn_edge(data) graph, _ = to_dense_batch(graph, data.batch) From 31d040f0e32bb6a9580e3307e1bf30baf0ecce55 Mon Sep 17 00:00:00 2001 From: Rasmus Oersoe Date: Fri, 9 Feb 2024 14:55:43 +0100 Subject: [PATCH 044/124] add NotImplementedError in parquet_writer --- src/graphnet/data/dataconverter.py | 4 ++- src/graphnet/data/writers/parquet_writer.py | 27 ++++++++++++--------- 2 files changed, 19 insertions(+), 12 deletions(-) diff --git a/src/graphnet/data/dataconverter.py b/src/graphnet/data/dataconverter.py index efae14a2f..76796d844 100644 --- a/src/graphnet/data/dataconverter.py +++ b/src/graphnet/data/dataconverter.py @@ -311,7 +311,9 @@ def merge_files(self, files: Optional[List[str]] = None) -> None: assert files is not None # Merge files + merge_path = os.path.join(self._output_dir, "merged") + self.info(f"Merging files to {merge_path}") self._save_method.merge_files( # type:ignore files=files_to_merge, - output_dir=os.path.join(self._output_dir, "merged"), + output_dir=merge_path, ) diff --git a/src/graphnet/data/writers/parquet_writer.py b/src/graphnet/data/writers/parquet_writer.py index a8e74f11f..fa07d266d 100644 --- a/src/graphnet/data/writers/parquet_writer.py +++ b/src/graphnet/data/writers/parquet_writer.py @@ -13,7 +13,7 @@ class ParquetWriter(GraphNeTWriter): """Class for writing interim data format to Parquet.""" # Class variables - file_suffix: str = ".parquet" + _file_extension = ".parquet" # Abstract method implementation(s) def _save_file( @@ -22,13 +22,18 @@ def _save_file( output_file_path: str, n_events: int, ) -> None: - """Save data to parquet file.""" - # Check(s) - if os.path.exists(output_file_path): - self.warning( - f"Output file {output_file_path} already exists. Overwriting." - ) - - self.debug(f"Saving to {output_file_path}") - awkward.to_parquet(awkward.from_iter(data), output_file_path) - self.debug("- Done saving") + """Save data to parquet.""" + raise NotImplementedError + + def merge_files(self, files: List[str], output_dir: str) -> None: + """Merge parquet files. + + Args: + files: input files for merging. + output_dir: directory to store merged file(s) in. 
+ + Raises: + NotImplementedError: _description_ + """ + self.error(f"{self.__class__.__name__} does not have a merge method.") + raise NotImplementedError From b787a0783ca3ab2a14419318300a369d7a2f4684 Mon Sep 17 00:00:00 2001 From: Rasmus Oersoe Date: Fri, 9 Feb 2024 14:55:55 +0100 Subject: [PATCH 045/124] docstring --- src/graphnet/data/writers/parquet_writer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/graphnet/data/writers/parquet_writer.py b/src/graphnet/data/writers/parquet_writer.py index fa07d266d..df64dc91a 100644 --- a/src/graphnet/data/writers/parquet_writer.py +++ b/src/graphnet/data/writers/parquet_writer.py @@ -33,7 +33,7 @@ def merge_files(self, files: List[str], output_dir: str) -> None: output_dir: directory to store merged file(s) in. Raises: - NotImplementedError: _description_ + NotImplementedError """ self.error(f"{self.__class__.__name__} does not have a merge method.") raise NotImplementedError From e36e29cc5fd6dad04ebb4c954f134f2f265909d3 Mon Sep 17 00:00:00 2001 From: ArturoLlorente Date: Fri, 9 Feb 2024 16:44:10 +0100 Subject: [PATCH 046/124] bug with n_pulses solved. Implementation working --- src/graphnet/models/components/layers.py | 6 +++--- src/graphnet/models/gnn/icemix.py | 19 +++++++++---------- src/graphnet/models/graphs/nodes/nodes.py | 8 +++----- 3 files changed, 15 insertions(+), 18 deletions(-) diff --git a/src/graphnet/models/components/layers.py b/src/graphnet/models/components/layers.py index 4f650fbf0..44980e79c 100644 --- a/src/graphnet/models/components/layers.py +++ b/src/graphnet/models/components/layers.py @@ -336,18 +336,18 @@ def __init__( def forward( self, x: Tensor, - n_pulses: Tensor, + seq_length: Tensor, #Lmax: Optional[int] = None ) -> Tensor: """Forward pass.""" - length = torch.log10(n_pulses.to(dtype=x.dtype)) + length = torch.log10(seq_length.to(dtype=x.dtype)) x = torch.cat( [ self.sin_emb(4096 * x[:,:,:3]).flatten(-2), #pos self.sin_emb(1024 * x[:,:,4]), #charge self.sin_emb(4096 * x[:,:,3]), #time self.aux_emb(x[:,:,5].long()), #auxiliary - self.sin_emb2(length).unsqueeze(1).expand(-1, max(n_pulses), -1), + self.sin_emb2(length).unsqueeze(1).expand(-1, max(seq_length), -1), ], -1, ) diff --git a/src/graphnet/models/gnn/icemix.py b/src/graphnet/models/gnn/icemix.py index d562bd5d4..87a5cb49a 100644 --- a/src/graphnet/models/gnn/icemix.py +++ b/src/graphnet/models/gnn/icemix.py @@ -22,11 +22,12 @@ def convert_data(data: Data): """Convert the input data to a tensor of shape (B, L, D)""" - x_list = torch.split(data.x, data.n_pulses.tolist()) + _, seq_length = torch.unique(data.batch, return_counts=True) + x_list = torch.split(data.x, seq_length.tolist()) x = torch.nn.utils.rnn.pad_sequence(x_list, batch_first=True, padding_value=torch.inf) mask = torch.ne(x[:,:,1], torch.inf) x[~mask] = 0 - return x, mask + return x, mask, seq_length class DeepIce(GNN): """DeepIce model.""" @@ -76,9 +77,8 @@ def no_weight_decay(self): def forward(self, data: Data) -> Tensor: """Apply learnable forward pass.""" - x0, mask = convert_data(data) - n_pulses = data.n_pulses - x = self.fourier_ext(x0, n_pulses) + x0, mask, seq_length = convert_data(data) + x = self.fourier_ext(x0, seq_length) rel_pos_bias = self.rel_pos(x0) batch_size = mask.shape[0] attn_mask = torch.zeros(mask.shape, device=mask.device) @@ -162,12 +162,11 @@ def no_weight_decay(self): def forward(self, data: Data) -> Tensor: """Apply learnable forward pass.""" - x0, mask = convert_data(data) - n_pulses = data.n_pulses - for i in range(3, 7): - data.x[:, i] = 
torch.squeeze(data.x[:, i].view(-1, 1)) + x0, mask, seq_length = convert_data(data) + #for i in range(3, 7): + # data.x[:, i] = torch.squeeze(data.x[:, i].view(-1, 1)) - x = self.fourier_ext(x0, n_pulses) + x = self.fourier_ext(x0, seq_length) rel_pos_bias = self.rel_pos(x0) graph = self.dyn_edge(data) graph, _ = to_dense_batch(graph, data.batch) diff --git a/src/graphnet/models/graphs/nodes/nodes.py b/src/graphnet/models/graphs/nodes/nodes.py index 3757b211e..a09e20a56 100644 --- a/src/graphnet/models/graphs/nodes/nodes.py +++ b/src/graphnet/models/graphs/nodes/nodes.py @@ -269,10 +269,7 @@ def _add_ice_properties(self, def _construct_nodes(self, x: torch.Tensor) -> Tuple[Data, List[str]]: - n_pulses = x.shape[0] - graph = torch.zeros([n_pulses, len(self.all_features)]) - - event_length = n_pulses + event_length = x.shape[0] x[:, self.feature_indexes["hlc"]] = torch.logical_not(x[:, self.feature_indexes["hlc"]]) if event_length < self.max_length: @@ -284,8 +281,9 @@ def _construct_nodes(self, x: torch.Tensor) -> Tuple[Data, List[str]]: ids_n = ids[auxiliary_n][: min(self.max_length, len(auxiliary_n))] ids_p = ids[auxiliary_p][: min(self.max_length - len(ids_n), len(auxiliary_p))] ids = torch.cat([ids_n, ids_p]).sort().values - event_length = len(ids) + event_length = self.max_length + graph = torch.zeros([event_length, len(self.all_features)]) for idx, feature in enumerate(self.all_features[:7]): graph[:event_length, idx] = x[ids, self.feature_indexes[feature]] From 57f49fe4191067d5ea4fae64a37c8c171fb46457 Mon Sep 17 00:00:00 2001 From: samadpls Date: Fri, 9 Feb 2024 20:46:20 +0500 Subject: [PATCH 047/124] Fix ensemble dataset functionality and added unit tests Signed-off-by: samadpls --- src/graphnet/data/datamodule.py | 4 +- tests/data/test_datamodule.py | 102 ++++++++++++++++++++++++++++++++ 2 files changed, 104 insertions(+), 2 deletions(-) diff --git a/src/graphnet/data/datamodule.py b/src/graphnet/data/datamodule.py index 92751b092..8c5aa7aeb 100644 --- a/src/graphnet/data/datamodule.py +++ b/src/graphnet/data/datamodule.py @@ -325,8 +325,8 @@ def _infer_selections(self) -> Tuple[List[int], List[int]]: train_selection, val_selection, ) = self._infer_selections_on_single_dataset(dataset_path) - self._train_selection.extend(train_selection) # type: ignore - self._val_selection.extend(val_selection) # type: ignore + self._train_selection.append(train_selection) # type: ignore + self._val_selection.append(val_selection) # type: ignore else: # Infer selection on a single dataset ( diff --git a/tests/data/test_datamodule.py b/tests/data/test_datamodule.py index 9dab2b1d1..9f8a1b745 100644 --- a/tests/data/test_datamodule.py +++ b/tests/data/test_datamodule.py @@ -230,3 +230,105 @@ def test_dataloader_args( assert ( dm.test_dataloader.batch_size == test_dataloader_kwargs["batch_size"] ) + + +@pytest.mark.parametrize( + "dataset_ref", [SQLiteDataset, ParquetDataset], indirect=True +) +def test_ensemble_dataset_without_selections( + dataset_setup: Tuple[Any, Dict[str, Any], Dict[str, int]] +) -> None: + """Test ensemble dataset functionality without selections. + + Args: + dataset_setup (Tuple[Any, Dict[str, Any], Dict[str, int]]): A tuple containing the dataset reference, + dataset keyword arguments, and dataloader keyword arguments. 
+ + Returns: + None + """ + # Make dataloaders from single dataset + dataset_ref, dataset_kwargs, dataloader_kwargs = dataset_setup + dm_single = GraphNeTDataModule( + dataset_reference=dataset_ref, + dataset_args=deepcopy(dataset_kwargs), + train_dataloader_kwargs=dataloader_kwargs, + ) + + # Copy dataset path twice; mimic ensemble dataset behavior + ensemble_dataset_kwargs = deepcopy(dataset_kwargs) + dataset_path = ensemble_dataset_kwargs["path"] + ensemble_dataset_kwargs["path"] = [dataset_path, dataset_path] + + # Create dataloaders from multiple datasets + dm_ensemble = GraphNeTDataModule( + dataset_reference=dataset_ref, + dataset_args=ensemble_dataset_kwargs, + train_dataloader_kwargs=dataloader_kwargs, + ) + + # Test that the ensemble dataloaders contain more batches + assert len(dm_single.train_dataloader) < len(dm_ensemble.train_dataloader) + assert len(dm_single.val_dataloader) < len(dm_ensemble.val_dataloader) + + +@pytest.mark.parametrize("dataset_ref", [SQLiteDataset, ParquetDataset]) +def test_ensemble_dataset_with_selections( + dataset_setup: Tuple[Any, Dict[str, Any], Dict[str, int]] +) -> None: + """Test ensemble dataset functionality with selections. + + Args: + dataset_setup (Tuple[Any, Dict[str, Any], Dict[str, int]]): A tuple containing the dataset reference, + dataset keyword arguments, and dataloader keyword arguments. + + Returns: + None + """ + # extract all events + dataset_ref, dataset_kwargs, dataloader_kwargs = dataset_setup + file_path = dataset_kwargs["path"] + selection = extract_all_events_ids( + file_path=file_path, dataset_kwargs=dataset_kwargs + ) + + # Copy dataset path twice; mimic ensemble dataset behavior + ensemble_dataset_kwargs = deepcopy(dataset_kwargs) + dataset_path = ensemble_dataset_kwargs["path"] + ensemble_dataset_kwargs["path"] = [dataset_path, dataset_path] + + # pass two datasets but only one selection; should fail: + with pytest.raises(Exception): + _ = GraphNeTDataModule( + dataset_reference=dataset_ref, + dataset_args=ensemble_dataset_kwargs, + train_dataloader_kwargs=dataloader_kwargs, + selection=selection, + ) + + # Pass two datasets and two selections; should work: + selection_1 = selection[0:20] + selection_2 = selection[0:10] + dm = GraphNeTDataModule( + dataset_reference=dataset_ref, + dataset_args=ensemble_dataset_kwargs, + train_dataloader_kwargs=dataloader_kwargs, + selection=[selection_1, selection_2], + ) + n_events_in_dataloaders = len(dm.train_dataloader.dataset) + len(dm.val_dataloader.dataset) # type: ignore + + # Check that the number of events in train/val match + assert n_events_in_dataloaders == len(selection_1) + len(selection_2) + + # Pass two datasets, two selections and two test selections; should work + dm2 = GraphNeTDataModule( + dataset_reference=dataset_ref, + dataset_args=ensemble_dataset_kwargs, + train_dataloader_kwargs=dataloader_kwargs, + selection=[selection, selection], + test_selection=[selection_1, selection_2], + ) + + # Check that the number of events in test dataloaders are correct. 
+ n_events_in_test_dataloaders = len(dm2.test_dataloader.dataset) # type: ignore + assert n_events_in_test_dataloaders == len(selection_1) + len(selection_2) From 56ccf038bf281d265203b5e4775b73de4dbe43ac Mon Sep 17 00:00:00 2001 From: Rasmus Oersoe Date: Fri, 9 Feb 2024 23:24:45 +0100 Subject: [PATCH 048/124] deprecation warnings --- src/graphnet/data/__init__.py | 2 + src/graphnet/data/dataconverter.py | 11 +- src/graphnet/data/parquet/__init__.py | 2 + .../data/parquet/deprecated_methods.py | 63 +++++++++++ src/graphnet/data/pre_configured/__init__.py | 2 + .../data/pre_configured/dataconverters.py | 106 ++++++++++++++++++ src/graphnet/data/sqlite/__init__.py | 2 + .../data/sqlite/deprecated_methods.py | 64 +++++++++++ tests/deployment/queso_test.py | 2 +- 9 files changed, 246 insertions(+), 8 deletions(-) create mode 100644 src/graphnet/data/parquet/__init__.py create mode 100644 src/graphnet/data/parquet/deprecated_methods.py create mode 100644 src/graphnet/data/pre_configured/__init__.py create mode 100644 src/graphnet/data/pre_configured/dataconverters.py create mode 100644 src/graphnet/data/sqlite/__init__.py create mode 100644 src/graphnet/data/sqlite/deprecated_methods.py diff --git a/src/graphnet/data/__init__.py b/src/graphnet/data/__init__.py index e7eb84ca4..77cbc1af8 100644 --- a/src/graphnet/data/__init__.py +++ b/src/graphnet/data/__init__.py @@ -5,3 +5,5 @@ """ from .extractors.icecube.utilities.i3_filters import I3Filter, I3FilterMask from .dataconverter import DataConverter +from .pre_configured import I3ToParquetConverter +from .pre_configured import I3ToSQLiteConverter diff --git a/src/graphnet/data/dataconverter.py b/src/graphnet/data/dataconverter.py index 76796d844..bdba2a733 100644 --- a/src/graphnet/data/dataconverter.py +++ b/src/graphnet/data/dataconverter.py @@ -38,6 +38,7 @@ def __init__( self, file_reader: Type[GraphNeTFileReader], save_method: Type[GraphNeTWriter], + outdir: str, extractors: Union[Type[Extractor], List[Type[Extractor]]], index_column: str = "event_no", num_workers: int = 1, @@ -48,6 +49,7 @@ def __init__( file_reader: The method used for reading and applying `Extractors`. save_method: The method used to save the interim data format to a graphnet supported file format. + outdir: The directory to save the files in. extractors: The `Extractor`(s) that will be applied to the input files. index_column: Name of the event id column added to the events. @@ -61,6 +63,7 @@ def __init__( self._num_workers = num_workers self._index_column = index_column self._index = 0 + self._output_dir = outdir self._output_files: List[str] = [] # Set Extractors. Will throw error if extractors are incompatible @@ -71,20 +74,14 @@ def __init__( super().__init__(name=__name__, class_name=self.__class__.__name__) @final - def __call__( - self, input_dir: Union[str, List[str]], output_dir: str - ) -> None: + def __call__(self, input_dir: Union[str, List[str]]) -> None: """Extract data from files in `input_dir` and save to disk. Args: input_dir: A directory that contains the input files. The directory will be searched recursively for files matching the file extension. - output_dir: The directory to save the files to. Input folder - structure is not respected. 
""" - # Set outdir - self._output_dir = output_dir # Get the file reader to produce a list of input files # in the directory input_files = self._file_reader.find_files(path=input_dir) # type: ignore diff --git a/src/graphnet/data/parquet/__init__.py b/src/graphnet/data/parquet/__init__.py new file mode 100644 index 000000000..2c41ca75d --- /dev/null +++ b/src/graphnet/data/parquet/__init__.py @@ -0,0 +1,2 @@ +"""Module for deprecated parquet methods.""" +from .deprecated_methods import ParquetDataConverter diff --git a/src/graphnet/data/parquet/deprecated_methods.py b/src/graphnet/data/parquet/deprecated_methods.py new file mode 100644 index 000000000..299cdbae8 --- /dev/null +++ b/src/graphnet/data/parquet/deprecated_methods.py @@ -0,0 +1,63 @@ +"""Module containing deprecated data conversion code. + +This code will be removed in GraphNeT 2.0. +""" +from typing import List, Union, Type + +from graphnet.data.extractors.icecube import I3Extractor +from graphnet.data.extractors.icecube.utilities.i3_filters import ( + I3Filter, + NullSplitI3Filter, +) +from graphnet.data import I3ToParquetConverter + + +class ParquetDataConverter(I3ToParquetConverter): + """Method for converting i3 files to parquet files.""" + + def __init__( + self, + gcd_rescue: str, + extractors: Union[Type[I3Extractor], List[Type[I3Extractor]]], + outdir: str, + index_column: str = "event_no", + num_workers: int = 1, + i3_filters: Union[ + Type[I3Filter], List[Type[I3Filter]] + ] = NullSplitI3Filter(), # type: ignore + ): + """Convert I3 files to Parquet. + + Args: + gcd_rescue: gcd_rescue: Path to a GCD file that will be used if no GCD file is + found in subfolder. `I3Reader` will recursively search + the input directory for I3-GCD file pairs. By IceCube + convention, a folder containing i3 files will have an + accompanying GCD file. However, in some cases, this + convention is broken. In cases where a folder contains + i3 files but no GCD file, the `gcd_rescue` is used + instead. + extractors: The `Extractor`(s) that will be applied to the input + files. + outdir: The directory to save the files in. + icetray_verbose: Set the level of verbosity of icetray. + Defaults to 0. + index_column: Name of the event id column added to the events. + Defaults to "event_no". + num_workers: The number of CPUs used for parallel processing. + Defaults to 1 (no multiprocessing). + i3_filters: Instances of `I3Filter` to filter PFrames. Defaults to + `NullSplitI3Filter`. + """ + self.warning( + f"{self.__class__.__name__} will be deprecated in " + "GraphNeT 2.0. Please use I3ToParquetConverter instead." 
+ ) + super().__init__( + extractors=extractors, + num_workers=num_workers, + index_column=index_column, + i3_filters=i3_filters, + outdir=outdir, + gcd_rescue=gcd_rescue, + ) diff --git a/src/graphnet/data/pre_configured/__init__.py b/src/graphnet/data/pre_configured/__init__.py new file mode 100644 index 000000000..f56f0de18 --- /dev/null +++ b/src/graphnet/data/pre_configured/__init__.py @@ -0,0 +1,2 @@ +"""Module for pre-configured converter modules.""" +from .dataconverters import I3ToParquetConverter, I3ToSQLiteConverter diff --git a/src/graphnet/data/pre_configured/dataconverters.py b/src/graphnet/data/pre_configured/dataconverters.py new file mode 100644 index 000000000..fcd26fd49 --- /dev/null +++ b/src/graphnet/data/pre_configured/dataconverters.py @@ -0,0 +1,106 @@ +"""Pre-configured combinations of writers and readers.""" + +from typing import List, Union, Type + +from graphnet.data import DataConverter +from graphnet.data.readers import I3Reader +from graphnet.data.writers import ParquetWriter, SQLiteWriter +from graphnet.data.extractors.icecube import I3Extractor +from graphnet.data.extractors.icecube.utilities.i3_filters import ( + I3Filter, + NullSplitI3Filter, +) + + +class I3ToParquetConverter(DataConverter): + """Preconfigured DataConverter for converting i3 files to parquet files.""" + + def __init__( + self, + gcd_rescue: str, + extractors: Union[Type[I3Extractor], List[Type[I3Extractor]]], + outdir: str, + index_column: str = "event_no", + num_workers: int = 1, + i3_filters: Union[ + Type[I3Filter], List[Type[I3Filter]] + ] = NullSplitI3Filter(), # type: ignore + ): + """Convert I3 files to Parquet. + + Args: + gcd_rescue: gcd_rescue: Path to a GCD file that will be used if no GCD file is + found in subfolder. `I3Reader` will recursively search + the input directory for I3-GCD file pairs. By IceCube + convention, a folder containing i3 files will have an + accompanying GCD file. However, in some cases, this + convention is broken. In cases where a folder contains + i3 files but no GCD file, the `gcd_rescue` is used + instead. + extractors: The `Extractor`(s) that will be applied to the input + files. + outdir: The directory to save the files in. + icetray_verbose: Set the level of verbosity of icetray. + Defaults to 0. + index_column: Name of the event id column added to the events. + Defaults to "event_no". + num_workers: The number of CPUs used for parallel processing. + Defaults to 1 (no multiprocessing). + i3_filters: Instances of `I3Filter` to filter PFrames. Defaults to + `NullSplitI3Filter`. + """ + super().__init__( + file_reader=I3Reader(gcd_rescue=gcd_rescue, i3_filters=i3_filters), # type: ignore + save_method=ParquetWriter(), # type: ignore + extractors=extractors, # type: ignore + num_workers=num_workers, + index_column=index_column, + outdir=outdir, + ) + + +class I3ToSQLiteConverter(DataConverter): + """Preconfigured DataConverter for converting i3 files to SQLite files.""" + + def __init__( + self, + gcd_rescue: str, + extractors: Union[Type[I3Extractor], List[Type[I3Extractor]]], + outdir: str, + index_column: str = "event_no", + num_workers: int = 1, + i3_filters: Union[ + Type[I3Filter], List[Type[I3Filter]] + ] = NullSplitI3Filter(), # type: ignore + ): + """Convert I3 files to Parquet. + + Args: + gcd_rescue: gcd_rescue: Path to a GCD file that will be used if no GCD file is + found in subfolder. `I3Reader` will recursively search + the input directory for I3-GCD file pairs. 
By IceCube + convention, a folder containing i3 files will have an + accompanying GCD file. However, in some cases, this + convention is broken. In cases where a folder contains + i3 files but no GCD file, the `gcd_rescue` is used + instead. + extractors: The `Extractor`(s) that will be applied to the input + files. + outdir: The directory to save the files in. + icetray_verbose: Set the level of verbosity of icetray. + Defaults to 0. + index_column: Name of the event id column added to the events. + Defaults to "event_no". + num_workers: The number of CPUs used for parallel processing. + Defaults to 1 (no multiprocessing). + i3_filters: Instances of `I3Filter` to filter PFrames. Defaults to + `NullSplitI3Filter`. + """ + super().__init__( + file_reader=I3Reader(gcd_rescue=gcd_rescue, i3_filters=i3_filters), # type: ignore + save_method=SQLiteWriter(), # type: ignore + extractors=extractors, # type: ignore + num_workers=num_workers, + index_column=index_column, + outdir=outdir, + ) diff --git a/src/graphnet/data/sqlite/__init__.py b/src/graphnet/data/sqlite/__init__.py new file mode 100644 index 000000000..436a86f2d --- /dev/null +++ b/src/graphnet/data/sqlite/__init__.py @@ -0,0 +1,2 @@ +"""Module for deprecated methods using sqlite.""" +from .deprecated_methods import SQLiteDataConverter diff --git a/src/graphnet/data/sqlite/deprecated_methods.py b/src/graphnet/data/sqlite/deprecated_methods.py new file mode 100644 index 000000000..3dd1e04b5 --- /dev/null +++ b/src/graphnet/data/sqlite/deprecated_methods.py @@ -0,0 +1,64 @@ +"""Module containing deprecated data conversion code. + +This code will be removed in GraphNeT 2.0. +""" + +from typing import List, Union, Type + +from graphnet.data.extractors.icecube import I3Extractor +from graphnet.data.extractors.icecube.utilities.i3_filters import ( + I3Filter, + NullSplitI3Filter, +) +from graphnet.data import I3ToSQLiteConverter + + +class SQLiteDataConverter(I3ToSQLiteConverter): + """Method for converting i3 files to SQLite files.""" + + def __init__( + self, + gcd_rescue: str, + extractors: Union[Type[I3Extractor], List[Type[I3Extractor]]], + outdir: str, + index_column: str = "event_no", + num_workers: int = 1, + i3_filters: Union[ + Type[I3Filter], List[Type[I3Filter]] + ] = NullSplitI3Filter(), # type: ignore + ): + """Convert I3 files to Parquet. + + Args: + gcd_rescue: gcd_rescue: Path to a GCD file that will be used if no GCD file is + found in subfolder. `I3Reader` will recursively search + the input directory for I3-GCD file pairs. By IceCube + convention, a folder containing i3 files will have an + accompanying GCD file. However, in some cases, this + convention is broken. In cases where a folder contains + i3 files but no GCD file, the `gcd_rescue` is used + instead. + extractors: The `Extractor`(s) that will be applied to the input + files. + outdir: The directory to save the files in. + icetray_verbose: Set the level of verbosity of icetray. + Defaults to 0. + index_column: Name of the event id column added to the events. + Defaults to "event_no". + num_workers: The number of CPUs used for parallel processing. + Defaults to 1 (no multiprocessing). + i3_filters: Instances of `I3Filter` to filter PFrames. Defaults to + `NullSplitI3Filter`. + """ + self.warning( + f"{self.__class__.__name__} will be deprecated in " + "GraphNeT 2.0. Please use I3ToSQLiteConverter instead." 
+ ) + super().__init__( + extractors=extractors, + num_workers=num_workers, + index_column=index_column, + i3_filters=i3_filters, + outdir=outdir, + gcd_rescue=gcd_rescue, + ) diff --git a/tests/deployment/queso_test.py b/tests/deployment/queso_test.py index d1258ed89..5c0088f5d 100644 --- a/tests/deployment/queso_test.py +++ b/tests/deployment/queso_test.py @@ -8,7 +8,7 @@ import pytest from graphnet.data.constants import FEATURES -from graphnet.data.extractors.i3featureextractor import ( +from graphnet.data.extractors.icecube import ( I3FeatureExtractorIceCubeUpgrade, ) from graphnet.constants import ( From 92834439402a95b7b7c42c4792a21d10c8d316b3 Mon Sep 17 00:00:00 2001 From: Rasmus Oersoe Date: Sun, 11 Feb 2024 11:39:38 +0100 Subject: [PATCH 049/124] add legacy parquet writer --- src/graphnet/data/dataconverter.py | 12 +++++++----- src/graphnet/data/writers/graphnet_writer.py | 5 +++++ src/graphnet/data/writers/parquet_writer.py | 14 +++++++++++++- src/graphnet/data/writers/sqlite_writer.py | 1 + 4 files changed, 26 insertions(+), 6 deletions(-) diff --git a/src/graphnet/data/dataconverter.py b/src/graphnet/data/dataconverter.py index bdba2a733..7160a0c2e 100644 --- a/src/graphnet/data/dataconverter.py +++ b/src/graphnet/data/dataconverter.py @@ -182,11 +182,13 @@ def _assign_event_no( dataframe_dict[extractor_name].append(df) else: dataframe_dict[extractor_name] = [df] - # Merge each list of dataframes - for key in dataframe_dict.keys(): - dataframe_dict[key] = pd.concat( - dataframe_dict[key], axis=0 - ).reset_index(drop=True) + + # Merge each list of dataframes if wanted by writer + if self._save_method.expects_merged_dataframes: + for key in dataframe_dict.keys(): + dataframe_dict[key] = pd.concat( + dataframe_dict[key], axis=0 + ).reset_index(drop=True) return dataframe_dict @final diff --git a/src/graphnet/data/writers/graphnet_writer.py b/src/graphnet/data/writers/graphnet_writer.py index 04ee079f4..330a3d868 100644 --- a/src/graphnet/data/writers/graphnet_writer.py +++ b/src/graphnet/data/writers/graphnet_writer.py @@ -87,3 +87,8 @@ def __call__( def file_extension(self) -> str: """Return file extension used to store the data.""" return self._file_extension # type: ignore + + @property + def expects_merged_dataframes(self) -> bool: + """Return if writer expects input to be merged dataframes or not.""" + return self._merge_dataframes # type: ignore diff --git a/src/graphnet/data/writers/parquet_writer.py b/src/graphnet/data/writers/parquet_writer.py index df64dc91a..755a829c1 100644 --- a/src/graphnet/data/writers/parquet_writer.py +++ b/src/graphnet/data/writers/parquet_writer.py @@ -14,6 +14,7 @@ class ParquetWriter(GraphNeTWriter): # Class variables _file_extension = ".parquet" + _merge_dataframes = False # Abstract method implementation(s) def _save_file( @@ -23,7 +24,18 @@ def _save_file( n_events: int, ) -> None: """Save data to parquet.""" - raise NotImplementedError + # Check(s) + + if n_events > 0: + events = [] + for k in range(n_events): + event = {} + for table in data.keys(): + event[table] = data[table][k].to_dict(orient="list") + + events.append(event) + + awkward.to_parquet(awkward.from_iter(events), output_file_path) def merge_files(self, files: List[str], output_dir: str) -> None: """Merge parquet files. 
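The `_save_file` body added to `ParquetWriter` above writes one record per event, with a field for each extractor table and each table stored as a dict of column lists. A minimal sketch of how such a file could be read back with awkward, assuming a file produced by this writer; the output path and the "truth"/"energy" names are placeholders for illustration only:

    import awkward as ak

    # Read a file produced by the event-wise ParquetWriter sketched above.
    events = ak.from_parquet("outputs/example_file.parquet")  # hypothetical path

    print(len(events))           # number of events stored in the file
    first_event = events[0]
    print(first_event.fields)    # names of the extractor tables, e.g. ["truth", ...]

    # Access a single column from one extractor table (names are illustrative).
    print(first_event["truth"]["energy"])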
diff --git a/src/graphnet/data/writers/sqlite_writer.py b/src/graphnet/data/writers/sqlite_writer.py index e9f400c53..d7cc48297 100644 --- a/src/graphnet/data/writers/sqlite_writer.py +++ b/src/graphnet/data/writers/sqlite_writer.py @@ -40,6 +40,7 @@ def __init__( """ # Member Variables self._file_extension = ".db" + self._merge_dataframes = True self._max_table_size = max_table_size self._database_name = merged_database_name From 198dce4f1204a4ec1189684990f18253db9d151c Mon Sep 17 00:00:00 2001 From: Rasmus Oersoe Date: Sun, 11 Feb 2024 11:42:09 +0100 Subject: [PATCH 050/124] remove is_pulse_map unit test --- tests/data/test_dataconverters_and_datasets.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/tests/data/test_dataconverters_and_datasets.py b/tests/data/test_dataconverters_and_datasets.py index 53a73a5f4..e1d9e773b 100644 --- a/tests/data/test_dataconverters_and_datasets.py +++ b/tests/data/test_dataconverters_and_datasets.py @@ -19,7 +19,6 @@ from graphnet.data.parquet import ParquetDataConverter from graphnet.data.dataset import ParquetDataset, SQLiteDataset from graphnet.data.sqlite import SQLiteDataConverter -from graphnet.data.sqlite.sqlite_dataconverter import is_pulse_map from graphnet.data.utilities.parquet_to_sqlite import ParquetToSQLiteConverter from graphnet.utilities.imports import has_icecube_package from graphnet.models.graphs import KNNGraph @@ -52,17 +51,6 @@ def get_file_path(backend: str) -> str: return path -# Unit test(s) -def test_is_pulsemap_check() -> None: - """Test behaviour of `is_pulsemap_check`.""" - assert is_pulse_map("SplitInIcePulses") is True - assert is_pulse_map("SRTInIcePulses") is True - assert is_pulse_map("InIceDSTPulses") is True - assert is_pulse_map("RTTWOfflinePulses") is True - assert is_pulse_map("truth") is False - assert is_pulse_map("retro") is False - - @pytest.mark.order(1) @pytest.mark.parametrize("backend", ["sqlite", "parquet"]) def test_dataconverter( From 9cf58916d96b97a0353a2b12deef7f1710cd670c Mon Sep 17 00:00:00 2001 From: Rasmus Oersoe Date: Sun, 11 Feb 2024 12:12:32 +0100 Subject: [PATCH 051/124] change num_workers -> workers in deprecated methods. 
Adjust output file name in dataconverter --- src/graphnet/data/dataconverter.py | 7 ++++--- src/graphnet/data/parquet/deprecated_methods.py | 14 +++++++------- src/graphnet/data/sqlite/deprecated_methods.py | 14 +++++++------- 3 files changed, 18 insertions(+), 17 deletions(-) diff --git a/src/graphnet/data/dataconverter.py b/src/graphnet/data/dataconverter.py index 7160a0c2e..f50912e4e 100644 --- a/src/graphnet/data/dataconverter.py +++ b/src/graphnet/data/dataconverter.py @@ -150,9 +150,10 @@ def _create_file_name(self, input_file_path: Union[str, I3FileSet]) -> str: """Convert input file path to an output file name.""" if isinstance(input_file_path, I3FileSet): input_file_path = input_file_path.i3_file - path_without_extension = os.path.splitext(input_file_path)[0] - base_file_name = path_without_extension.split("/")[-1] - return base_file_name # type: ignore + file_name = os.path.basename(input_file_path) + index_of_dot = file_name.index(".") + file_name_without_extension = file_name[:index_of_dot] + return file_name_without_extension # type: ignore @final def _assign_event_no( diff --git a/src/graphnet/data/parquet/deprecated_methods.py b/src/graphnet/data/parquet/deprecated_methods.py index 299cdbae8..717e798bb 100644 --- a/src/graphnet/data/parquet/deprecated_methods.py +++ b/src/graphnet/data/parquet/deprecated_methods.py @@ -21,7 +21,7 @@ def __init__( extractors: Union[Type[I3Extractor], List[Type[I3Extractor]]], outdir: str, index_column: str = "event_no", - num_workers: int = 1, + workers: int = 1, i3_filters: Union[ Type[I3Filter], List[Type[I3Filter]] ] = NullSplitI3Filter(), # type: ignore @@ -44,20 +44,20 @@ def __init__( Defaults to 0. index_column: Name of the event id column added to the events. Defaults to "event_no". - num_workers: The number of CPUs used for parallel processing. + workers: The number of CPUs used for parallel processing. Defaults to 1 (no multiprocessing). i3_filters: Instances of `I3Filter` to filter PFrames. Defaults to `NullSplitI3Filter`. """ - self.warning( - f"{self.__class__.__name__} will be deprecated in " - "GraphNeT 2.0. Please use I3ToParquetConverter instead." - ) super().__init__( extractors=extractors, - num_workers=num_workers, + num_workers=workers, index_column=index_column, i3_filters=i3_filters, outdir=outdir, gcd_rescue=gcd_rescue, ) + self.warning( + f"{self.__class__.__name__} will be deprecated in " + "GraphNeT 2.0. Please use I3ToParquetConverter instead." + ) diff --git a/src/graphnet/data/sqlite/deprecated_methods.py b/src/graphnet/data/sqlite/deprecated_methods.py index 3dd1e04b5..f3da0d10f 100644 --- a/src/graphnet/data/sqlite/deprecated_methods.py +++ b/src/graphnet/data/sqlite/deprecated_methods.py @@ -22,7 +22,7 @@ def __init__( extractors: Union[Type[I3Extractor], List[Type[I3Extractor]]], outdir: str, index_column: str = "event_no", - num_workers: int = 1, + workers: int = 1, i3_filters: Union[ Type[I3Filter], List[Type[I3Filter]] ] = NullSplitI3Filter(), # type: ignore @@ -45,20 +45,20 @@ def __init__( Defaults to 0. index_column: Name of the event id column added to the events. Defaults to "event_no". - num_workers: The number of CPUs used for parallel processing. + workers: The number of CPUs used for parallel processing. Defaults to 1 (no multiprocessing). i3_filters: Instances of `I3Filter` to filter PFrames. Defaults to `NullSplitI3Filter`. """ - self.warning( - f"{self.__class__.__name__} will be deprecated in " - "GraphNeT 2.0. Please use I3ToSQLiteConverter instead." 
- ) super().__init__( extractors=extractors, - num_workers=num_workers, + num_workers=workers, index_column=index_column, i3_filters=i3_filters, outdir=outdir, gcd_rescue=gcd_rescue, ) + self.warning( + f"{self.__class__.__name__} will be deprecated in " + "GraphNeT 2.0. Please use I3ToSQLiteConverter instead." + ) From 6cd49626d8859b379c8b5fc861455837fea9b194 Mon Sep 17 00:00:00 2001 From: ArturoLlorente Date: Sun, 11 Feb 2024 13:05:34 +0100 Subject: [PATCH 052/124] Solve pre-commit issues --- src/graphnet/models/components/layers.py | 232 +++++++++++-------- src/graphnet/models/gnn/dynedge.py | 26 ++- src/graphnet/models/gnn/icemix.py | 102 +++++--- src/graphnet/models/graphs/nodes/__init__.py | 7 +- src/graphnet/models/graphs/nodes/nodes.py | 151 +++++++----- src/graphnet/models/graphs/utils.py | 21 +- 6 files changed, 345 insertions(+), 194 deletions(-) diff --git a/src/graphnet/models/components/layers.py b/src/graphnet/models/components/layers.py index 44980e79c..9043e4796 100644 --- a/src/graphnet/models/components/layers.py +++ b/src/graphnet/models/components/layers.py @@ -1,9 +1,9 @@ - """Class(es) implementing layers to be used in `graphnet` models.""" from typing import Any, Callable, Optional, Sequence, Union, List, Tuple import torch +import math from torch.functional import Tensor from torch_geometric.nn import EdgeConv from torch_geometric.nn.pool import knn_graph @@ -17,7 +17,7 @@ from torch_geometric.utils import to_dense_batch from pytorch_lightning import LightningModule from timm.models.layers import drop_path -import math + class DynEdgeConv(EdgeConv, LightningModule): """Dynamical edge convolution layer.""" @@ -198,15 +198,14 @@ def forward( return x - class DropPath(LightningModule): """DropPath regularization module for neural networks.""" + def __init__( - self, + self, drop_prob: Optional[float] = None, ): - """ - Construct `DropPath`. + """Construct `DropPath`. Args: drop_prob: Probability of dropping a path during training. @@ -225,19 +224,17 @@ def extra_repr(self) -> str: class Mlp(LightningModule): - """ - Multi-Layer Perceptron (MLP) module. - """ + """Multi-Layer Perceptron (MLP) module.""" + def __init__( self, - in_features: int = None, + in_features: int, hidden_features: Optional[int] = None, out_features: Optional[int] = None, - activation: Optional[nn.Module] = nn.GELU, + activation: nn.Module = nn.GELU, dropout_prob: Optional[float] = 0.0, ): - """ - Construct `Mlp`. + """Construct `Mlp`. Args: in_features: Number of input features. @@ -248,12 +245,11 @@ def __init__( activation: Activation layer. Defaults to `nn.GELU`. dropout_prob: Dropout probability. Defaults to 0.0. """ - + super().__init__() if in_features <= 0: raise ValueError( f"in_features must be greater than 0, got in_features={in_features} instead" ) - super().__init__() out_features = out_features or in_features hidden_features = hidden_features or in_features self.input_projection = nn.Linear(in_features, hidden_features) @@ -269,17 +265,19 @@ def forward(self, x: Tensor) -> Tensor: x = self.dropout(x) return x + class SinusoidalPosEmb(LightningModule): + """Sinusoidal positional embeddings module.""" + def __init__( - self, - dim: int = 16, + self, + dim: int = 16, n_freq: int = 10000, scaled: bool = False, ): - """ - Construct `SinusoidalPosEmb`. + """Construct `SinusoidalPosEmb`. - This module generates sinusoidal positional embeddings to be + This module generates sinusoidal positional embeddings to be added to input sequences. 
Args: @@ -290,29 +288,33 @@ def __init__( super().__init__() if dim % 2 != 0: raise ValueError("dim must be even") - self.scale = nn.Parameter(torch.ones(1) * dim**-0.5) if scaled else 1.0 + self.scale = ( + nn.Parameter(torch.ones(1) * dim**-0.5) if scaled else 1.0 + ) self.dim = dim - self.n_freq = n_freq + self.n_freq = torch.Tensor([n_freq]) def forward(self, x: Tensor) -> Tensor: """Forward pass.""" device = x.device half_dim = self.dim // 2 - emb = math.log(self.n_freq) / half_dim + emb = torch.log(self.n_freq.to(device=device)) / half_dim emb = torch.exp(torch.arange(half_dim, device=device) * (-emb)) - emb = x[..., None] * emb[None, ...] - emb = torch.cat((emb.sin(), emb.cos()), dim=-1) + emb = x.unsqueeze(-1) * emb.unsqueeze(0) + emb = torch.cat((torch.sin(emb), torch.cos(emb)), dim=-1) return emb * self.scale + class FourierEncoder(LightningModule): + """Fourier encoder module.""" + def __init__( - self, - base_dim: int = 128, + self, + base_dim: int = 128, output_dim: int = 384, scaled: bool = False, ): - """ - Construct `FourierEncoder`. + """Construct `FourierEncoder`. This module incorporates sinusoidal positional embeddings and auxiliary embeddings to process input sequences and produce meaningful representations. @@ -334,20 +336,21 @@ def __init__( ) def forward( - self, - x: Tensor, + self, + x: Tensor, seq_length: Tensor, - #Lmax: Optional[int] = None ) -> Tensor: """Forward pass.""" length = torch.log10(seq_length.to(dtype=x.dtype)) x = torch.cat( [ - self.sin_emb(4096 * x[:,:,:3]).flatten(-2), #pos - self.sin_emb(1024 * x[:,:,4]), #charge - self.sin_emb(4096 * x[:,:,3]), #time - self.aux_emb(x[:,:,5].long()), #auxiliary - self.sin_emb2(length).unsqueeze(1).expand(-1, max(seq_length), -1), + self.sin_emb(4096 * x[:, :, :3]).flatten(-2), # pos + self.sin_emb(1024 * x[:, :, 4]), # charge + self.sin_emb(4096 * x[:, :, 3]), # time + self.aux_emb(x[:, :, 5].long()), # auxiliary + self.sin_emb2(length) + .unsqueeze(1) + .expand(-1, max(seq_length), -1), ], -1, ) @@ -356,12 +359,13 @@ def forward( class SpacetimeEncoder(LightningModule): + """Spacetime encoder module.""" + def __init__( - self, + self, base_dim: int = 32, ): - """ - Construct `SpacetimeEncoder`. + """Construct `SpacetimeEncoder`. This module calculates space-time interval between each pair of events and generates sinusoidal positional embeddings to be added to input sequences. @@ -376,30 +380,32 @@ def __init__( def forward( self, x: Tensor, - #Lmax: Optional[int] = None, + # Lmax: Optional[int] = None, ) -> Tensor: """Forward pass.""" - pos = x[:,:,:3] - time = x[:,:,3] - spacetime_interval = (pos[:, :, None] - pos[:, None, :]).pow(2).sum(-1) - ( - (time[:, :, None] - time[:, None, :]) * (3e4 / 500 * 3e-1) - ).pow(2) - four_distance = torch.sign(spacetime_interval) * torch.sqrt(torch.abs(spacetime_interval)) + pos = x[:, :, :3] + time = x[:, :, 3] + spacetime_interval = (pos[:, :, None] - pos[:, None, :]).pow(2).sum( + -1 + ) - ((time[:, :, None] - time[:, None, :]) * (3e4 / 500 * 3e-1)).pow(2) + four_distance = torch.sign(spacetime_interval) * torch.sqrt( + torch.abs(spacetime_interval) + ) sin_emb = self.sin_emb(1024 * four_distance.clip(-4, 4)) rel_attn = self.projection(sin_emb) return rel_attn -# BEiTv2 block + class Block_rel(LightningModule): - """Implementation of BEiTv2 Block. 
- """ + """Implementation of BEiTv2 Block.""" + def __init__( self, - dim: int = None, - num_heads: int = None, + dim: int, + num_heads: int, mlp_ratio: float = 4.0, qkv_bias: bool = False, - qk_scale: float = None, + qk_scale: Optional[float] = None, dropout: float = 0.0, attn_drop: float = 0.0, drop_path: float = 0.0, @@ -407,37 +413,42 @@ def __init__( activation: nn.Module = nn.GELU, norm_layer: nn.Module = nn.LayerNorm, attn_head_dim: int = None, - ): - """ - Construct 'Block_rel'. + ): + """Construct 'Block_rel'. Args: dim: Dimension of the input tensor. num_heads: Number of attention heads to use in the `Attention_rel` layer. mlp_ratio: Ratio of the hidden size of the feedforward network to the input size in the `Mlp` layer. - qkv_bias: Whether or not to include bias terms in the query, key, and + qkv_bias: Whether or not to include bias terms in the query, key, and value matrices in the `Attention_rel` layer. qk_scale: Scaling factor for the dot product of the query and key matrices in the `Attention_rel` layer. dropout: Dropout probability to use in the `Mlp` layer. - attn_dropt: Dropout probability to use in the `Attention_rel` layer. - drop_path: Probability of applying drop path regularization to the output + attn_drop: Dropout probability to use in the `Attention_rel` layer. + drop_path: Probability of applying drop path regularization to the output of the layer. - init_values: Initial value to use for the `gamma_1` and `gamma_2` + init_values: Initial value to use for the `gamma_1` and `gamma_2` parameters if not `None`. - act_layer: Activation function to use in the `Mlp` layer. + activation: Activation function to use in the `Mlp` layer. norm_layer: Normalization layer to use. - attn_head_dim: Dimension of the attention head outputs in the - `Attention_rel` layer. + attn_head_dim: Dimension of the attention head outputs in the + `Attention_rel` layer. """ - super().__init__() self.norm1 = norm_layer(dim) self.attn = Attention_rel( - dim, num_heads, attn_drop=attn_drop, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_head_dim=attn_head_dim + dim, + num_heads, + attn_drop=attn_drop, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_head_dim=attn_head_dim, + ) + self.drop_path = ( + DropPath(drop_path) if drop_path > 0.0 else nn.Identity() ) - self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() self.norm2 = norm_layer(dim) mlp_hidden_dim = int(dim * mlp_ratio) self.mlp = Mlp( @@ -457,7 +468,13 @@ def __init__( else: self.gamma_1, self.gamma_2 = None, None - def forward(self, x: Tensor, key_padding_mask=None, rel_pos_bias=None, kv=None): + def forward( + self, + x: Tensor, + key_padding_mask: Optional[Tensor] = None, + rel_pos_bias: Optional[Tensor] = None, + kv: Tensor = None, + ) -> Tensor: """Forward pass.""" if self.gamma_1 is None: xn = self.norm1(x) @@ -490,7 +507,10 @@ def forward(self, x: Tensor, key_padding_mask=None, rel_pos_bias=None, kv=None): x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x))) return x + class Attention_rel(LightningModule): + """Attention mechanism with relative position bias.""" + def __init__( self, dim: int, @@ -501,15 +521,16 @@ def __init__( proj_drop: float = 0.0, attn_head_dim: int = None, ): - """ + """Construct 'Attention_rel'. + Args: dim: Dimension of the input tensor. num_heads: the number of attention heads to use (default: 8) qkv_bias: whether to add bias to the query, key, and value projections. Defaults to False. 
- qk_scale: a scaling factor that multiplies the dot product of query + qk_scale: a scaling factor that multiplies the dot product of query and key vectors. Defaults to None. If None, computed as - :math: `\sqrt{1/head_dim}` + :math: `head_dim^(-1/2)`. attn_drop: the dropout probability for the attention weights. Defaults to 0.0. proj_drop: the dropout probability for the output of the attention @@ -522,7 +543,7 @@ def __init__( f"dim and num_heads must be greater than 0," f" got dim={dim} and num_heads={num_heads} instead" ) - + super().__init__() self.num_heads = num_heads head_dim = attn_head_dim or dim // num_heads @@ -543,16 +564,29 @@ def __init__( self.proj = nn.Linear(all_head_dim, dim) self.proj_drop = nn.Dropout(proj_drop) - def forward(self, q, k, v, rel_pos_bias=None, key_padding_mask=None): + def forward( + self, + q: Tensor, + k: Tensor, + v: Tensor, + rel_pos_bias: Optional[Tensor] = None, + key_padding_mask: Optional[Tensor] = None, + ) -> Tensor: """Forward pass.""" - B, N, C = q.shape + batch_size, event_length, _ = q.shape q = linear(input=q, weight=self.proj_q.weight, bias=self.q_bias) - q = q.reshape(B, N, self.num_heads, -1).permute(0, 2, 1, 3) + q = q.reshape(batch_size, event_length, self.num_heads, -1).permute( + 0, 2, 1, 3 + ) k = linear(input=k, weight=self.proj_k.weight, bias=None) - k = k.reshape(B, k.shape[1], self.num_heads, -1).permute(0, 2, 1, 3) + k = k.reshape(batch_size, k.shape[1], self.num_heads, -1).permute( + 0, 2, 1, 3 + ) v = linear(input=v, weight=self.proj_v.weight, bias=self.v_bias) - v = v.reshape(B, v.shape[1], self.num_heads, -1).permute(0, 2, 1, 3) + v = v.reshape(batch_size, v.shape[1], self.num_heads, -1).permute( + 0, 2, 1, 3 + ) q = q * self.scale attn = q @ k.transpose(-2, -1) @@ -564,12 +598,15 @@ def forward(self, q, k, v, rel_pos_bias=None, key_padding_mask=None): key_padding_mask.dtype == torch.float32 or key_padding_mask.dtype == torch.float16 ), "incorrect mask dtype" - bias = torch.min(key_padding_mask[:, None, :], key_padding_mask[:, :, None]) + bias = torch.min( + key_padding_mask[:, None, :], key_padding_mask[:, :, None] + ) bias[ - torch.max(key_padding_mask[:, None, :], key_padding_mask[:, :, None]) + torch.max( + key_padding_mask[:, None, :], key_padding_mask[:, :, None] + ) < 0 ] = 0 - # print(bias.shape,bias.min(),bias.max()) attn = attn + bias.unsqueeze(1) attn = attn.softmax(dim=-1) @@ -578,16 +615,19 @@ def forward(self, q, k, v, rel_pos_bias=None, key_padding_mask=None): x = (attn @ v).transpose(1, 2) if rel_pos_bias is not None: x = x + torch.einsum("bhij,bijc->bihc", attn, rel_pos_bias) - x = x.reshape(B, N, -1) + x = x.reshape(batch_size, event_length, -1) x = self.proj(x) x = self.proj_drop(x) return x + class Block(LightningModule): + """Transformer block.""" + def __init__( self, - dim: int = None, - num_heads: int = None, + dim: int, + num_heads: int, mlp_ratio: float = 4.0, dropout: float = 0.0, attn_drop: float = 0.0, @@ -596,8 +636,7 @@ def __init__( activation: nn.Module = nn.GELU, norm_layer: nn.Module = nn.LayerNorm, ): - """ - Construct 'Block'. + """Construct 'Block'. Args: dim: Dimension of the input tensor. @@ -606,22 +645,22 @@ def __init__( mlp_ratio: Ratio of the hidden size of the feedforward network to the input size in the `Mlp` layer. dropout: Dropout probability to use in the `Mlp` layer. - attn_dropt: Dropout probability to use in the `MultiheadAttention` layer. 
- drop_path: Probability of applying drop path regularization to the output + attn_drop: Dropout probability to use in the `MultiheadAttention` layer. + drop_path: Probability of applying drop path regularization to the output of the layer. - init_values: Initial value to use for the `gamma_1` and `gamma_2` + init_values: Initial value to use for the `gamma_1` and `gamma_2` parameters if not `None`. - act_layer: Activation function to use in the `Mlp` layer. + activation: Activation function to use in the `Mlp` layer. norm_layer: Normalization layer to use. - attn_head_dim: Dimension of the attention head outputs in the - `MultiheadAttention` layer. - """ + """ super().__init__() self.norm1 = norm_layer(dim) self.attn = nn.MultiheadAttention( dim, num_heads, dropout=attn_drop, batch_first=True ) - self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() + self.drop_path = ( + DropPath(drop_path) if drop_path > 0.0 else nn.Identity() + ) self.norm2 = norm_layer(dim) mlp_hidden_dim = int(dim * mlp_ratio) self.mlp = Mlp( @@ -641,7 +680,12 @@ def __init__( else: self.gamma_1, self.gamma_2 = None, None - def forward(self, x, attn_mask=None, key_padding_mask=None): + def forward( + self, + x: Tensor, + attn_mask: Optional[Tensor] = None, + key_padding_mask: Optional[Tensor] = None, + ) -> Tensor: """Forward pass.""" if self.gamma_1 is None: xn = self.norm1(x) @@ -670,4 +714,4 @@ def forward(self, x, attn_mask=None, key_padding_mask=None): )[0] ) x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x))) - return x \ No newline at end of file + return x diff --git a/src/graphnet/models/gnn/dynedge.py b/src/graphnet/models/gnn/dynedge.py index 5ab46e413..ae0888147 100644 --- a/src/graphnet/models/gnn/dynedge.py +++ b/src/graphnet/models/gnn/dynedge.py @@ -17,6 +17,7 @@ "mean": scatter_mean, } + class DynEdge(GNN): """DynEdge (dynamical edge convolutional) model.""" @@ -34,6 +35,7 @@ def __init__( icemix_encoder: bool = False, ): """Construct `DynEdge`. + Args: nb_inputs: Number of input features on each node. nb_neighbours: Number of neighbours to used in the k-nearest @@ -98,7 +100,9 @@ def __init__( assert len(dynedge_layer_sizes) assert all(isinstance(sizes, tuple) for sizes in dynedge_layer_sizes) assert all(len(sizes) > 0 for sizes in dynedge_layer_sizes) - assert all(all(size > 0 for size in sizes) for sizes in dynedge_layer_sizes) + assert all( + all(size > 0 for size in sizes) for sizes in dynedge_layer_sizes + ) self._dynedge_layer_sizes = dynedge_layer_sizes @@ -146,7 +150,9 @@ def __init__( "No global pooling schemes were request, so cannot add global" " variables after pooling." 
) - self._add_global_variables_after_pooling = add_global_variables_after_pooling + self._add_global_variables_after_pooling = ( + add_global_variables_after_pooling + ) # Base class constructor super().__init__(nb_inputs, self._readout_layer_sizes[-1]) @@ -198,11 +204,14 @@ def _construct_layers(self) -> None: # Post-processing operations nb_latent_features = ( - sum(sizes[-1] for sizes in self._dynedge_layer_sizes) + nb_input_features + sum(sizes[-1] for sizes in self._dynedge_layer_sizes) + + nb_input_features ) post_processing_layers = [] - layer_sizes = [nb_latent_features] + list(self._post_processing_layer_sizes) + layer_sizes = [nb_latent_features] + list( + self._post_processing_layer_sizes + ) for nb_in, nb_out in zip(layer_sizes[:-1], layer_sizes[1:]): post_processing_layers.append(torch.nn.Linear(nb_in, nb_out)) if self._icemix_encoder: @@ -213,7 +222,9 @@ def _construct_layers(self) -> None: # Read-out operations nb_poolings = ( - len(self._global_pooling_schemes) if self._global_pooling_schemes else 1 + len(self._global_pooling_schemes) + if self._global_pooling_schemes + else 1 ) nb_latent_features = nb_out * nb_poolings if self._add_global_variables_after_pooling: @@ -290,7 +301,8 @@ def forward(self, data: Data) -> Tensor: ).type(torch.float) global_variables_distributed = torch.sum( - distribute.unsqueeze(dim=2) * global_variables.unsqueeze(dim=0), + distribute.unsqueeze(dim=2) + * global_variables.unsqueeze(dim=0), dim=1, ) @@ -324,4 +336,4 @@ def forward(self, data: Data) -> Tensor: # Read-out x = self._readout(x) - return x \ No newline at end of file + return x diff --git a/src/graphnet/models/gnn/icemix.py b/src/graphnet/models/gnn/icemix.py index 87a5cb49a..c3dfe52ee 100644 --- a/src/graphnet/models/gnn/icemix.py +++ b/src/graphnet/models/gnn/icemix.py @@ -9,28 +9,47 @@ """ import torch import torch.nn as nn -from typing import List - -from graphnet.models.components.layers import FourierEncoder, SpacetimeEncoder, Block_rel, Block +from typing import List, Set + +from graphnet.models.components.layers import ( + FourierEncoder, + SpacetimeEncoder, + Block_rel, + Block, +) from graphnet.models.gnn.dynedge import DynEdge from graphnet.models.gnn.gnn import GNN -from torch_geometric.nn.pool import knn_graph from torch_geometric.utils import to_dense_batch from torch_geometric.data import Data from torch import Tensor -def convert_data(data: Data): - """Convert the input data to a tensor of shape (B, L, D)""" + +def convert_data(data: Data) -> tuple: + """Convert the input data to a tensor of shape (B, L, D). + + Args: + data: The input data. + + Returns: + x: Reshaped input tensor [B x L x D]. + mask: The mask tensor. + seq_length: The actual number of pulses tensor + (after sub-sampling). 
+ """ _, seq_length = torch.unique(data.batch, return_counts=True) x_list = torch.split(data.x, seq_length.tolist()) - x = torch.nn.utils.rnn.pad_sequence(x_list, batch_first=True, padding_value=torch.inf) - mask = torch.ne(x[:,:,1], torch.inf) + x = torch.nn.utils.rnn.pad_sequence( + x_list, batch_first=True, padding_value=torch.inf + ) + mask = torch.ne(x[:, :, 1], torch.inf) x[~mask] = 0 return x, mask, seq_length + class DeepIce(GNN): """DeepIce model.""" + def __init__( self, dim: int = 384, @@ -54,7 +73,10 @@ def __init__( self.fourier_ext = FourierEncoder(dim_base, dim) self.rel_pos = SpacetimeEncoder(head_size) self.sandwich = nn.ModuleList( - [Block_rel(dim=dim, num_heads=dim // head_size) for i in range(depth_rel)] + [ + Block_rel(dim=dim, num_heads=dim // head_size) + for i in range(depth_rel) + ] ) self.cls_token = nn.Linear(dim, 1, bias=False) self.blocks = nn.ModuleList( @@ -70,9 +92,10 @@ def __init__( ] ) self.n_rel = n_rel - + @torch.jit.ignore - def no_weight_decay(self): + def no_weight_decay(self) -> Set: + """cls_tocken should not be subject to weight decay during training.""" return {"cls_token"} def forward(self, data: Data) -> Tensor: @@ -90,37 +113,48 @@ def forward(self, data: Data) -> Tensor: rel_pos_bias = None mask = torch.cat( - [torch.ones(batch_size, 1, dtype=mask.dtype, device=mask.device), mask], 1 + [ + torch.ones( + batch_size, 1, dtype=mask.dtype, device=mask.device + ), + mask, + ], + 1, ) attn_mask = torch.zeros(mask.shape, device=mask.device) attn_mask[~mask] = -torch.inf - cls_token = self.cls_token.weight.unsqueeze(0).expand(batch_size, -1, -1) + cls_token = self.cls_token.weight.unsqueeze(0).expand( + batch_size, -1, -1 + ) x = torch.cat([cls_token, x], 1) for blk in self.blocks: x = blk(x, None, attn_mask) return x[:, 0] - - + + class DeepIceWithDynEdge(GNN): - """DeepIce model with DynEdge.""" + """DeepIce model with DynEdge.""" + def __init__( self, dim: int = 384, dim_base: int = 128, depth: int = 8, head_size: int = 64, + nb_neighbours: int = 9, features_subset: List[int] = [0, 1, 2], ): """Construct `DeepIceWithDynEdge`. - + Args: dim: The latent feature dimension. dim_base: The base feature dimension. depth: The depth of the transformer. head_size: The size of the attention heads. - features_subset: The subset of features to + nb_neighbours: Number of neighbours to used in the knn graph. + features_subset: The subset of features to use for the edge construction. 
""" super().__init__(dim_base, dim) @@ -149,23 +183,27 @@ def __init__( ] ) self.dyn_edge = DynEdge( - 9, + nb_inputs=9, # 9 features, TBD by user + nb_neighbours=nb_neighbours, post_processing_layer_sizes=[336, dim // 2], - dynedge_layer_sizes=[(128, 256), (336, 256), (336, 256), (336, 256)], + dynedge_layer_sizes=[ + (128, 256), + (336, 256), + (336, 256), + (336, 256), + ], global_pooling_schemes=None, icemix_encoder=True, ) - + @torch.jit.ignore - def no_weight_decay(self): + def no_weight_decay(self) -> Set: + """cls_tocken should not be subject to weight decay during training.""" return {"cls_token"} def forward(self, data: Data) -> Tensor: """Apply learnable forward pass.""" x0, mask, seq_length = convert_data(data) - #for i in range(3, 7): - # data.x[:, i] = torch.squeeze(data.x[:, i].view(-1, 1)) - x = self.fourier_ext(x0, seq_length) rel_pos_bias = self.rel_pos(x0) graph = self.dyn_edge(data) @@ -181,14 +219,22 @@ def forward(self, data: Data) -> Tensor: if len(self.features_subset) == 3: rel_pos_bias = None mask = torch.cat( - [torch.ones(batch_size, 1, dtype=mask.dtype, device=mask.device), mask], 1 + [ + torch.ones( + batch_size, 1, dtype=mask.dtype, device=mask.device + ), + mask, + ], + 1, ) attn_mask = torch.zeros(mask.shape, device=mask.device) attn_mask[~mask] = -torch.inf - cls_token = self.cls_token.weight.unsqueeze(0).expand(batch_size, -1, -1) + cls_token = self.cls_token.weight.unsqueeze(0).expand( + batch_size, -1, -1 + ) x = torch.cat([cls_token, x], 1) for blk in self.blocks: x = blk(x, None, attn_mask) - return x[:, 0] \ No newline at end of file + return x[:, 0] diff --git a/src/graphnet/models/graphs/nodes/__init__.py b/src/graphnet/models/graphs/nodes/__init__.py index dbcf3e477..f5182a60d 100644 --- a/src/graphnet/models/graphs/nodes/__init__.py +++ b/src/graphnet/models/graphs/nodes/__init__.py @@ -5,4 +5,9 @@ and their features. """ -from .nodes import NodeDefinition, NodesAsPulses, PercentileClusters, IceMixNodes +from .nodes import ( + NodeDefinition, + NodesAsPulses, + PercentileClusters, + IceMixNodes, +) diff --git a/src/graphnet/models/graphs/nodes/nodes.py b/src/graphnet/models/graphs/nodes/nodes.py index a09e20a56..073a74bbc 100644 --- a/src/graphnet/models/graphs/nodes/nodes.py +++ b/src/graphnet/models/graphs/nodes/nodes.py @@ -213,79 +213,122 @@ def _construct_nodes(self, x: torch.Tensor) -> Data: return Data(x=torch.tensor(array)) + class IceMixNodes(NodeDefinition): - + """Calculate ice properties and perform random sampling. + + Ice properties are calculated based on the z-coordinate of the pulse. For + each event, a random sampling is performed to keep the number of pulses + below a maximum number of pulses if n_pulses is over the limit. + """ + def __init__( - self, + self, input_feature_names: Optional[List[str]] = None, - max_pulses: int = 384, + max_pulses: int = 768, + z_name: str = "dom_z", + hlc_name: str = "hlc", ) -> None: - + """Construct `IceMixNodes`. + + Args: + input_feature_names: Column names for input features. Minimum + required features are z coordinate and hlc column names. + max_pulses: Maximum number of pulses to keep in the event. + z_name: Name of the z-coordinate column. + hlc_name: Name of the `Hard Local Coincidence Check` column. 
+ """ super().__init__(input_feature_names=input_feature_names) - + if input_feature_names is None: - input_feature_names = ["dom_x", - "dom_y", - "dom_z", - "dom_time", - "charge", - "hlc", - "rde"] - - self.all_features = ["dom_x", - "dom_y", - "dom_z", - "dom_time", - "charge", - "hlc", - "rde", - "scatt_lenght", - "abs_lenght"] - - missing_features = set(self.all_features) - set(input_feature_names) - if any(feat in missing_features for feat in self.all_features[:7]): - raise ValueError(f"Features dom_x, dom_y, dom_z, dom_time, charge, hlc, rde" - f" are required for IceMixNodes") - - self.feature_indexes = {feat: self.all_features.index(feat) for feat in input_feature_names} - self.input_feature_names = input_feature_names + input_feature_names = [ + "dom_x", + "dom_y", + "dom_z", + "dom_time", + "charge", + "hlc", + "rde", + ] + + if z_name not in input_feature_names: + raise ValueError( + f"z name {z_name} not in input_feature_names, {input_feature_names}" + ) + if hlc_name not in input_feature_names: + raise ValueError( + f"hlc name {hlc_name} not in input_feature_names, {input_feature_names}" + ) + + self.all_features = input_feature_names + [ + "scatt_lenght", + "abs_lenght", + ] + + self.feature_indexes = { + feat: self.all_features.index(feat) for feat in input_feature_names + } + + self.f_scattering, self.f_absoprtion = ice_transparency() + + self.input_feature_names = input_feature_names + self.n_features = len(self.all_features) self.max_length = max_pulses + self.z_name = z_name + self.hlc_name = hlc_name def _define_output_feature_names( - self, - input_feature_names: List[str] + self, input_feature_names: List[str] ) -> List[str]: return self.all_features - - def _add_ice_properties(self, - graph: torch.Tensor, - x: torch.Tensor, - ids: List[int]) -> torch.Tensor: - - f_scattering, f_absoprtion = ice_transparency() - graph[:len(ids),7] = torch.tensor(f_scattering(x[ids, self.feature_indexes["dom_z"]])) - graph[:len(ids),8] = torch.tensor(f_absoprtion(x[ids, self.feature_indexes["dom_z"]])) + + def _add_ice_properties( + self, graph: torch.Tensor, x: torch.Tensor, ids: List[int] + ) -> torch.Tensor: + + graph[: len(ids), -2] = torch.tensor( + self.f_scattering(x[ids, self.feature_indexes[self.z_name]]) + ) + graph[: len(ids), -1] = torch.tensor( + self.f_absoprtion(x[ids, self.feature_indexes[self.z_name]]) + ) return graph - def _construct_nodes(self, x: torch.Tensor) -> Tuple[Data, List[str]]: - - event_length = x.shape[0] - x[:, self.feature_indexes["hlc"]] = torch.logical_not(x[:, self.feature_indexes["hlc"]]) + def _pulse_sampler( + self, x: torch.Tensor, event_length: int + ) -> torch.Tensor: if event_length < self.max_length: ids = torch.arange(event_length) else: ids = torch.randperm(event_length) - auxiliary_n = torch.nonzero(x[:, self.feature_indexes["hlc"]] == 0).squeeze(1) - auxiliary_p = torch.nonzero(x[:, self.feature_indexes["hlc"]] == 1).squeeze(1) + auxiliary_n = torch.nonzero( + x[:, self.feature_indexes[self.hlc_name]] == 0 + ).squeeze(1) + auxiliary_p = torch.nonzero( + x[:, self.feature_indexes[self.hlc_name]] == 1 + ).squeeze(1) ids_n = ids[auxiliary_n][: min(self.max_length, len(auxiliary_n))] - ids_p = ids[auxiliary_p][: min(self.max_length - len(ids_n), len(auxiliary_p))] + ids_p = ids[auxiliary_p][ + : min(self.max_length - len(ids_n), len(auxiliary_p)) + ] ids = torch.cat([ids_n, ids_p]).sort().values - event_length = self.max_length - - graph = torch.zeros([event_length, len(self.all_features)]) - for idx, feature in 
enumerate(self.all_features[:7]): + return ids + + def _construct_nodes(self, x: torch.Tensor) -> Tuple[Data, List[str]]: + + event_length = x.shape[0] + x[:, self.feature_indexes[self.hlc_name]] = torch.logical_not( + x[:, self.feature_indexes[self.hlc_name]] + ) # hlc in kaggle was flipped + ids = self._pulse_sampler(x, event_length) + event_length = min(self.max_length, event_length) + + graph = torch.zeros([event_length, self.n_features]) + for idx, feature in enumerate( + self.all_features[: self.n_features - 2] + ): graph[:event_length, idx] = x[ids, self.feature_indexes[feature]] - graph = self._add_ice_properties(graph, x, ids) #ice properties - return Data(x=graph) \ No newline at end of file + graph = self._add_ice_properties(graph, x, ids) # ice properties + return Data(x=graph) diff --git a/src/graphnet/models/graphs/utils.py b/src/graphnet/models/graphs/utils.py index 2d5ff5196..e97130cb6 100644 --- a/src/graphnet/models/graphs/utils.py +++ b/src/graphnet/models/graphs/utils.py @@ -165,17 +165,18 @@ def cluster_summarize_with_percentiles( return array -def ice_transparency(datum: int = 1950): - """Calculate the normalized scattering and absorption lengths - of ice as a function of depth. +def ice_transparency(datum: int = 1950) -> Tuple[interp1d, interp1d]: + """Calculate the normalized scattering and absorption lengths. + + Values are calculated for iceCube ice as a function of depth. Args: datum: The datum depth in meters. Default to 1950. Returns: - f_scattering: Function that takes a normalized depth - and returns the corresponding normalized + f_scattering: Function that takes a normalized depth + and returns the corresponding normalized scattering length. f_absorption: Function that takes a normalized depth and returns the corresponding normalized @@ -184,13 +185,13 @@ def ice_transparency(datum: int = 1950): # Data from page 31 of https://arxiv.org/pdf/1301.5361.pdf # Datum is from footnote 8 of page 29 df = pd.read_parquet( - os.path.join(DATA_DIR, "ice_properties/ice_transparency.parquet"), - ) + os.path.join(DATA_DIR, "ice_properties/ice_transparency.parquet"), + ) df["z"] = df["depth"] - datum df["z_norm"] = df["z"] / 500 - df[["scattering_len_norm", "absorption_len_norm"]] = RobustScaler().fit_transform( - df[["scattering_len", "absorption_len"]] - ) + df[ + ["scattering_len_norm", "absorption_len_norm"] + ] = RobustScaler().fit_transform(df[["scattering_len", "absorption_len"]]) f_scattering = interp1d(df["z_norm"], df["scattering_len_norm"]) f_absorption = interp1d(df["z_norm"], df["absorption_len_norm"]) From e38ded5cb763988d8b7e6fb9e09f87db3e96bbc5 Mon Sep 17 00:00:00 2001 From: Rasmus Oersoe Date: Sun, 11 Feb 2024 14:14:48 +0100 Subject: [PATCH 053/124] fix examples --- examples/01_icetray/01_convert_i3_files.py | 3 --- examples/01_icetray/03_i3_deployer_example.py | 2 +- examples/01_icetray/04_i3_module_in_native_icetray_example.py | 2 +- 3 files changed, 2 insertions(+), 5 deletions(-) diff --git a/examples/01_icetray/01_convert_i3_files.py b/examples/01_icetray/01_convert_i3_files.py index 9a39f95e7..279adf6e0 100644 --- a/examples/01_icetray/01_convert_i3_files.py +++ b/examples/01_icetray/01_convert_i3_files.py @@ -72,9 +72,6 @@ def main_icecube_upgrade(backend: str) -> None: ], outdir, workers=workers, - # nb_files_to_batch=10, - # sequential_batch_pattern="temp_{:03d}", - # input_file_batch_pattern="[A-Z]{1}_[0-9]{5}*.i3.zst", icetray_verbose=1, ) converter(inputs) diff --git a/examples/01_icetray/03_i3_deployer_example.py 
b/examples/01_icetray/03_i3_deployer_example.py index f55aa769c..28d73c00d 100644 --- a/examples/01_icetray/03_i3_deployer_example.py +++ b/examples/01_icetray/03_i3_deployer_example.py @@ -10,7 +10,7 @@ PRETRAINED_MODEL_DIR, ) from graphnet.data.constants import FEATURES, TRUTH -from graphnet.data.extractors.i3featureextractor import ( +from graphnet.data.extractors.icecube import ( I3FeatureExtractorIceCubeUpgrade, ) from graphnet.utilities.argparse import ArgumentParser diff --git a/examples/01_icetray/04_i3_module_in_native_icetray_example.py b/examples/01_icetray/04_i3_module_in_native_icetray_example.py index 74da5e499..ab8c1b58c 100644 --- a/examples/01_icetray/04_i3_module_in_native_icetray_example.py +++ b/examples/01_icetray/04_i3_module_in_native_icetray_example.py @@ -5,7 +5,7 @@ from typing import TYPE_CHECKING, List, Sequence from graphnet.data.constants import FEATURES -from graphnet.data.extractors.i3featureextractor import ( +from graphnet.data.extractors.icecube import ( I3FeatureExtractorIceCubeUpgrade, ) from graphnet.constants import ( From 19ce2392f017b59c15ba2f052658f84cb1cd5d59 Mon Sep 17 00:00:00 2001 From: Rasmus Oersoe Date: Sun, 11 Feb 2024 14:17:28 +0100 Subject: [PATCH 054/124] remove unused imports in extractor.py --- src/graphnet/data/extractors/extractor.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/graphnet/data/extractors/extractor.py b/src/graphnet/data/extractors/extractor.py index b5e5ed37c..3e9a8f715 100644 --- a/src/graphnet/data/extractors/extractor.py +++ b/src/graphnet/data/extractors/extractor.py @@ -1,13 +1,12 @@ """Base I3Extractor class(es).""" - +from typing import TYPE_CHECKING from abc import ABC, abstractmethod -from typing import TYPE_CHECKING, Any, Dict, List, Optional from graphnet.utilities.imports import has_icecube_package from graphnet.utilities.logging import Logger if has_icecube_package() or TYPE_CHECKING: - from icecube import icetray, dataio # pyright: reportMissingImports=false + from icecube import icetray # pyright: reportMissingImports=false class Extractor(ABC, Logger): From b6bf74a62829b28db67bfc6ed3930fd090398405 Mon Sep 17 00:00:00 2001 From: Rasmus Oersoe Date: Sun, 11 Feb 2024 14:40:29 +0100 Subject: [PATCH 055/124] Type hint test --- src/graphnet/data/dataconverter.py | 8 ++++---- src/graphnet/data/readers/graphnet_file_reader.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/graphnet/data/dataconverter.py b/src/graphnet/data/dataconverter.py index f50912e4e..12926e510 100644 --- a/src/graphnet/data/dataconverter.py +++ b/src/graphnet/data/dataconverter.py @@ -36,10 +36,10 @@ class DataConverter(ABC, Logger): def __init__( self, - file_reader: Type[GraphNeTFileReader], - save_method: Type[GraphNeTWriter], + file_reader: GraphNeTFileReader, + save_method: GraphNeTWriter, outdir: str, - extractors: Union[Type[Extractor], List[Type[Extractor]]], + extractors: Union[Extractor, List[Extractor]], index_column: str = "event_no", num_workers: int = 1, ) -> None: @@ -68,7 +68,7 @@ def __init__( # Set Extractors. Will throw error if extractors are incompatible # with reader. 
- self._file_reader.set_extractors(extractors) + self._file_reader.set_extractors(extractors=extractors) # Base class constructor super().__init__(name=__name__, class_name=self.__class__.__name__) diff --git a/src/graphnet/data/readers/graphnet_file_reader.py b/src/graphnet/data/readers/graphnet_file_reader.py index ab6464e13..c9d859335 100644 --- a/src/graphnet/data/readers/graphnet_file_reader.py +++ b/src/graphnet/data/readers/graphnet_file_reader.py @@ -27,7 +27,7 @@ class properties `accepted_file_extensions` and `accepted_extractors`. """ @abstractmethod - def __call__(self, file_path: str) -> List[OrderedDict]: + def __call__(self, file_path: Union[str, I3FileSet]) -> List[OrderedDict]: """Open and apply extractors to a single file. The `output` must be a list of dictionaries, where the number of events From a10b8334fc12f105d0cea20673e0a60b65245630 Mon Sep 17 00:00:00 2001 From: Rasmus Oersoe Date: Sun, 11 Feb 2024 14:41:31 +0100 Subject: [PATCH 056/124] type hint test --- src/graphnet/data/dataconverter.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/graphnet/data/dataconverter.py b/src/graphnet/data/dataconverter.py index 12926e510..77c4839c1 100644 --- a/src/graphnet/data/dataconverter.py +++ b/src/graphnet/data/dataconverter.py @@ -84,7 +84,7 @@ def __call__(self, input_dir: Union[str, List[str]]) -> None: """ # Get the file reader to produce a list of input files # in the directory - input_files = self._file_reader.find_files(path=input_dir) # type: ignore + input_files = self._file_reader.find_files(path=input_dir) self._launch_jobs(input_files=input_files) self._output_files = glob( os.path.join( @@ -129,7 +129,7 @@ def _process_file(self, file_path: Union[str, I3FileSet]) -> None: """ # Read and apply extractors data = self._file_reader(file_path=file_path) - n_events = len(data) # type: ignore + n_events = len(data) # Assign event_no's to each event in data and transform to pd.DataFrame data = self._assign_event_no(data=data) @@ -153,7 +153,7 @@ def _create_file_name(self, input_file_path: Union[str, I3FileSet]) -> str: file_name = os.path.basename(input_file_path) index_of_dot = file_name.index(".") file_name_without_extension = file_name[:index_of_dot] - return file_name_without_extension # type: ignore + return file_name_without_extension @final def _assign_event_no( @@ -313,7 +313,7 @@ def merge_files(self, files: Optional[List[str]] = None) -> None: # Merge files merge_path = os.path.join(self._output_dir, "merged") self.info(f"Merging files to {merge_path}") - self._save_method.merge_files( # type:ignore + self._save_method.merge_files( files=files_to_merge, output_dir=merge_path, ) From 9a3288c989c4d016218b91ae27031b013cd6c402 Mon Sep 17 00:00:00 2001 From: Rasmus Oersoe Date: Sun, 11 Feb 2024 14:54:37 +0100 Subject: [PATCH 057/124] type hints --- src/graphnet/data/readers/graphnet_file_reader.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/graphnet/data/readers/graphnet_file_reader.py b/src/graphnet/data/readers/graphnet_file_reader.py index c9d859335..c24ab12a4 100644 --- a/src/graphnet/data/readers/graphnet_file_reader.py +++ b/src/graphnet/data/readers/graphnet_file_reader.py @@ -26,6 +26,9 @@ class GraphNeTFileReader(Logger, ABC): class properties `accepted_file_extensions` and `accepted_extractors`. 
""" + _accepted_file_extensions: List[str] = [] + _accepted_extractors: List[Extractor] = [] + @abstractmethod def __call__(self, file_path: Union[str, I3FileSet]) -> List[OrderedDict]: """Open and apply extractors to a single file. @@ -39,17 +42,17 @@ def __call__(self, file_path: Union[str, I3FileSet]) -> List[OrderedDict]: @property def accepted_file_extensions(self) -> List[str]: """Return list of accepted file extensions.""" - return self._accepted_file_extensions # type: ignore + return self._accepted_file_extensions @property def accepted_extractors(self) -> List[Extractor]: """Return list of compatible `Extractor`(s).""" - return self._accepted_extractors # type: ignore + return self._accepted_extractors @property def extracor_names(self) -> List[str]: """Return list of table names produced by extractors.""" - return [extractor.name for extractor in self._extractors] # type: ignore + return [extractor.name for extractor in self._extractors] def find_files( self, path: Union[str, List[str]] From b619017fa151a4b7e408224bbf953ac7b300e2dc Mon Sep 17 00:00:00 2001 From: Rasmus Oersoe Date: Sun, 11 Feb 2024 14:56:05 +0100 Subject: [PATCH 058/124] delete I3GenericExtractor unit test --- tests/data/test_i3genericextractor.py | 97 --------------------------- 1 file changed, 97 deletions(-) delete mode 100644 tests/data/test_i3genericextractor.py diff --git a/tests/data/test_i3genericextractor.py b/tests/data/test_i3genericextractor.py deleted file mode 100644 index e77727eaf..000000000 --- a/tests/data/test_i3genericextractor.py +++ /dev/null @@ -1,97 +0,0 @@ -"""Unit tests for I3GenericExtractor class.""" - -import os - -import numpy as np - -import graphnet.constants -from graphnet.data.extractors.icecube import ( - I3FeatureExtractorIceCube86, - I3TruthExtractor, - I3GenericExtractor, -) -from graphnet.utilities.imports import has_icecube_package - -if has_icecube_package(): - from icecube import dataio # pyright: reportMissingImports=false - -# Global variable(s) -TEST_DATA_DIR = os.path.join( - graphnet.constants.TEST_DATA_DIR, "i3", "oscNext_genie_level7_v02" -) -FILE_NAME = "oscNext_genie_level7_v02_first_5_frames" -GCD_FILE = ( - "GeoCalibDetectorStatus_AVG_55697-57531_PASS2_SPE_withScaledNoise.i3.gz" -) - - -# Unit test(s) -def test_i3genericextractor(test_data_dir: str = TEST_DATA_DIR) -> None: - """Test the implementation of `I3GenericExtractor`.""" - # Constants(s) - mc_tree = "I3MCTree" - pulse_series = "SRTInIcePulses" - - # Constructor I3Extractor instance(s) - generic_extractor = I3GenericExtractor(keys=[mc_tree, pulse_series]) - truth_extractor = I3TruthExtractor() - feature_extractor = I3FeatureExtractorIceCube86(pulse_series) - - i3_file = os.path.join(test_data_dir, FILE_NAME) + ".i3.gz" - gcd_file = os.path.join(test_data_dir, GCD_FILE) - - generic_extractor.set_gcd(i3_file, gcd_file) - truth_extractor.set_gcd(i3_file, gcd_file) - feature_extractor.set_gcd(i3_file, gcd_file) - - i3_file_io = dataio.I3File(i3_file, "r") - ix_test = 5 - while i3_file_io.more(): - try: - frame = i3_file_io.pop_physics() - except: # noqa: E722 - continue - - generic_data = generic_extractor(frame) - truth_data = truth_extractor(frame) - feature_data = feature_extractor(frame) - - if ix_test == 5: - print(list(generic_data[pulse_series].keys())) - print(list(truth_data.keys())) - print(list(feature_data.keys())) - - # Truth vs. 
generic - key_pairs = [ - ("energy", "energy"), - ("zenith", "dir__zenith"), - ("azimuth", "dir__azimuth"), - ("pid", "pdg_encoding"), - ] - - for truth_key, generic_key in key_pairs: - assert ( - truth_data[truth_key] - == generic_data[f"{mc_tree}__primaries"][generic_key][0] - ) - - # Reco vs. generic - key_pairs = [ - ("charge", "charge"), - ("dom_time", "time"), - ("dom_x", "position__x"), - ("dom_y", "position__y"), - ("dom_z", "position__z"), - ("width", "width"), - ("pmt_area", "area"), - ("rde", "relative_dom_eff"), - ] - - for reco_key, generic_key in key_pairs: - assert np.allclose( - feature_data[reco_key], generic_data[pulse_series][generic_key] - ) - - ix_test -= 1 - if ix_test == 0: - break From 636e116db6ea2a5bf1481e19536684dcdd4df5aa Mon Sep 17 00:00:00 2001 From: Rasmus Oersoe Date: Sun, 11 Feb 2024 20:40:38 +0100 Subject: [PATCH 059/124] test --- tests/data/test_i3extractor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/data/test_i3extractor.py b/tests/data/test_i3extractor.py index f1c8c3ff7..ce40626c0 100644 --- a/tests/data/test_i3extractor.py +++ b/tests/data/test_i3extractor.py @@ -1,4 +1,4 @@ -"""Unit tests for I3Extractor class.""" +"""Unit tests for I3Extractor.""" from graphnet.data.extractors.icecube import ( I3FeatureExtractorIceCube86, From fca802650d7fc670f1a9b5ea1c6dd87c02971179 Mon Sep 17 00:00:00 2001 From: ArturoLlorente Date: Mon, 12 Feb 2024 07:04:35 +0100 Subject: [PATCH 060/124] Remove timm package dependency --- src/graphnet/models/components/layers.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/src/graphnet/models/components/layers.py b/src/graphnet/models/components/layers.py index 9043e4796..097ca2a23 100644 --- a/src/graphnet/models/components/layers.py +++ b/src/graphnet/models/components/layers.py @@ -16,7 +16,6 @@ from torch.nn.modules import TransformerEncoder, TransformerEncoderLayer from torch_geometric.utils import to_dense_batch from pytorch_lightning import LightningModule -from timm.models.layers import drop_path class DynEdgeConv(EdgeConv, LightningModule): @@ -199,24 +198,31 @@ def forward( class DropPath(LightningModule): - """DropPath regularization module for neural networks.""" + """Drop paths (Stochastic Depth) per sample.""" def __init__( self, - drop_prob: Optional[float] = None, + drop_prob: float = 0.0, ): """Construct `DropPath`. Args: drop_prob: Probability of dropping a path during training. - If None, no paths are dropped. Defaults to None. + If 0.0, no paths are dropped. Defaults to None. 
""" super(DropPath, self).__init__() self.drop_prob = drop_prob def forward(self, x: Tensor) -> Tensor: """Forward pass.""" - return drop_path(x, self.drop_prob, self.training) + if self.drop_prob == 0.0 or not self.training: + return x + keep_prob = 1 - self.drop_prob + shape = (x.shape[0],) + (1,) * (x.ndim - 1) + random_tensor = x.new_empty(shape).bernoulli_(keep_prob) + if keep_prob > 0.0: + random_tensor.div_(keep_prob) + return x * random_tensor def extra_repr(self) -> str: """Return extra representation of the module.""" From 31a448659611db1e2b0d7c4704783fc35b6f7e29 Mon Sep 17 00:00:00 2001 From: Rasmus Oersoe Date: Mon, 12 Feb 2024 08:36:49 +0100 Subject: [PATCH 061/124] test --- src/graphnet/data/dataconverter.py | 4 ++-- src/graphnet/data/readers/graphnet_file_reader.py | 10 +++++++--- src/graphnet/data/readers/i3reader.py | 12 +++--------- 3 files changed, 12 insertions(+), 14 deletions(-) diff --git a/src/graphnet/data/dataconverter.py b/src/graphnet/data/dataconverter.py index 77c4839c1..aabf465ae 100644 --- a/src/graphnet/data/dataconverter.py +++ b/src/graphnet/data/dataconverter.py @@ -157,8 +157,8 @@ def _create_file_name(self, input_file_path: Union[str, I3FileSet]) -> str: @final def _assign_event_no( - self, data: List[OrderedDict[str, Any]] - ) -> Dict[str, pd.DataFrame]: + self, data: List[OrderedDict] + ) -> Union[Dict[str, pd.DataFrame], Dict[str, List[pd.DataFrame]]]: # Request event_no's for the entire file event_nos = self._request_event_nos(n_ids=len(data)) diff --git a/src/graphnet/data/readers/graphnet_file_reader.py b/src/graphnet/data/readers/graphnet_file_reader.py index c24ab12a4..87f829fac 100644 --- a/src/graphnet/data/readers/graphnet_file_reader.py +++ b/src/graphnet/data/readers/graphnet_file_reader.py @@ -4,7 +4,7 @@ file formats. """ -from typing import List, Union, OrderedDict +from typing import List, Union, OrderedDict, Any from abc import abstractmethod, ABC import glob import os @@ -27,7 +27,7 @@ class properties `accepted_file_extensions` and `accepted_extractors`. """ _accepted_file_extensions: List[str] = [] - _accepted_extractors: List[Extractor] = [] + _accepted_extractors: List[Any] = [] @abstractmethod def __call__(self, file_path: Union[str, I3FileSet]) -> List[OrderedDict]: @@ -79,13 +79,17 @@ def find_files( return files @final - def set_extractors(self, extractors: List[Extractor]) -> None: + def set_extractors( + self, extractors: Union[Extractor, List[Extractor]] + ) -> None: """Set `Extractor`(s) as member variable. Args: extractors: A list of `Extractor`(s) to set as member variable. """ self._validate_extractors(extractors) + if not isinstance(extractors, list): + extractors = [extractors] self._extractors = extractors @final diff --git a/src/graphnet/data/readers/i3reader.py b/src/graphnet/data/readers/i3reader.py index 926c2395a..523367943 100644 --- a/src/graphnet/data/readers/i3reader.py +++ b/src/graphnet/data/readers/i3reader.py @@ -27,9 +27,7 @@ class I3Reader(GraphNeTFileReader): def __init__( self, gcd_rescue: str, - i3_filters: Union[ - Type[I3Filter], List[Type[I3Filter]] - ] = NullSplitI3Filter(), # type: ignore + i3_filters: Union[I3Filter, List[I3Filter]] = None, icetray_verbose: int = 0, ): """Initialize `I3Reader`. 
@@ -52,6 +50,8 @@ def __init__( if icetray_verbose == 0: icetray.I3Logger.global_logger = icetray.I3NullLogger() + if i3_filters is None: + i3_filters = [NullSplitI3Filter()] # Set Member Variables self._accepted_file_extensions = [".bz2", ".zst", ".gz"] self._accepted_extractors = [I3Extractor] @@ -97,12 +97,6 @@ def __call__(self, file_path: I3FileSet) -> List[OrderedDict]: # type: ignore data_dict = OrderedDict(zip(self.extracor_names, results)) - # If an I3GenericExtractor is used, we want each automatically - # parsed key to be stored as a separate table. - # for extractor in self._extractors: - # if isinstance(extractor, I3GenericExtractor): - # data_dict.update(data_dict.pop(extractor._name)) - data.append(data_dict) return data From dec554cc7535e949ad7be1201ea7e892e78695af Mon Sep 17 00:00:00 2001 From: Rasmus Oersoe Date: Mon, 12 Feb 2024 08:37:47 +0100 Subject: [PATCH 062/124] update import in graphnet.pisa --- src/graphnet/pisa/fitting.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/graphnet/pisa/fitting.py b/src/graphnet/pisa/fitting.py index dfcc20a37..5408f9bfc 100644 --- a/src/graphnet/pisa/fitting.py +++ b/src/graphnet/pisa/fitting.py @@ -23,7 +23,7 @@ from pisa.analysis.analysis import Analysis from pisa import ureg -from graphnet.data.sqlite import create_table_and_save_to_sql +from graphnet.data.utilities import create_table_and_save_to_sql mpl.use("pdf") plt.rc("font", family="serif") From 4c428140d36cdcc64b7103bbd7806487bc8dac8f Mon Sep 17 00:00:00 2001 From: Rasmus Oersoe Date: Mon, 12 Feb 2024 08:40:35 +0100 Subject: [PATCH 063/124] polish 01-01 --- examples/01_icetray/01_convert_i3_files.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/01_icetray/01_convert_i3_files.py b/examples/01_icetray/01_convert_i3_files.py index 279adf6e0..6a9d010ec 100644 --- a/examples/01_icetray/01_convert_i3_files.py +++ b/examples/01_icetray/01_convert_i3_files.py @@ -42,12 +42,12 @@ def main_icecube86(backend: str) -> None: inputs = [f"{TEST_DATA_DIR}/i3/oscNext_genie_level7_v02"] outdir = f"{EXAMPLE_OUTPUT_DIR}/convert_i3_files/ic86" - converter: DataConverter = CONVERTER_CLASS[backend]( - [ + converter = CONVERTER_CLASS[backend]( + extractors=[ I3FeatureExtractorIceCube86("SRTInIcePulses"), I3TruthExtractor(), ], - outdir, + outdir=outdir, ) converter(inputs) if backend == "sqlite": From 10f4b36c6976078193ff667bdb2461b643b3fc2d Mon Sep 17 00:00:00 2001 From: Rasmus Oersoe Date: Mon, 12 Feb 2024 08:47:31 +0100 Subject: [PATCH 064/124] mypy --- src/graphnet/data/readers/graphnet_file_reader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/graphnet/data/readers/graphnet_file_reader.py b/src/graphnet/data/readers/graphnet_file_reader.py index 87f829fac..d168ba09d 100644 --- a/src/graphnet/data/readers/graphnet_file_reader.py +++ b/src/graphnet/data/readers/graphnet_file_reader.py @@ -87,9 +87,9 @@ def set_extractors( Args: extractors: A list of `Extractor`(s) to set as member variable. 
""" - self._validate_extractors(extractors) if not isinstance(extractors, list): extractors = [extractors] + self._validate_extractors(extractors) self._extractors = extractors @final From 4c22db923be333f78504324031ebc428555ad61d Mon Sep 17 00:00:00 2001 From: Rasmus Oersoe Date: Mon, 12 Feb 2024 09:18:37 +0100 Subject: [PATCH 065/124] polish 01-01 --- examples/01_icetray/01_convert_i3_files.py | 15 ++++++++-- src/graphnet/data/dataconverter.py | 7 +++-- .../data/parquet/deprecated_methods.py | 7 ++--- .../data/pre_configured/dataconverters.py | 29 +++++++------------ .../data/sqlite/deprecated_methods.py | 6 ++-- 5 files changed, 32 insertions(+), 32 deletions(-) diff --git a/examples/01_icetray/01_convert_i3_files.py b/examples/01_icetray/01_convert_i3_files.py index 6a9d010ec..870fd09f4 100644 --- a/examples/01_icetray/01_convert_i3_files.py +++ b/examples/01_icetray/01_convert_i3_files.py @@ -1,6 +1,7 @@ """Example of converting I3-files to SQLite and Parquet.""" import os +from glob import glob from graphnet.constants import EXAMPLE_OUTPUT_DIR, TEST_DATA_DIR from graphnet.data.extractors.icecube import ( @@ -41,6 +42,9 @@ def main_icecube86(backend: str) -> None: inputs = [f"{TEST_DATA_DIR}/i3/oscNext_genie_level7_v02"] outdir = f"{EXAMPLE_OUTPUT_DIR}/convert_i3_files/ic86" + gcd_rescue = glob( + "{TEST_DATA_DIR}/i3/oscNext_genie_level7_v02/*GeoCalib*" + )[0] converter = CONVERTER_CLASS[backend]( extractors=[ @@ -48,6 +52,8 @@ def main_icecube86(backend: str) -> None: I3TruthExtractor(), ], outdir=outdir, + gcd_rescue=gcd_rescue, + workers=1, ) converter(inputs) if backend == "sqlite": @@ -61,18 +67,21 @@ def main_icecube_upgrade(backend: str) -> None: inputs = [f"{TEST_DATA_DIR}/i3/upgrade_genie_step4_140028_000998"] outdir = f"{EXAMPLE_OUTPUT_DIR}/convert_i3_files/upgrade" + gcd_rescue = glob( + "{TEST_DATA_DIR}/i3/upgrade_genie_step4_140028_000998/*GeoCalib*" + )[0] workers = 1 converter: DataConverter = CONVERTER_CLASS[backend]( - [ + extractors=[ I3TruthExtractor(), I3RetroExtractor(), I3FeatureExtractorIceCubeUpgrade("I3RecoPulseSeriesMap_mDOM"), I3FeatureExtractorIceCubeUpgrade("I3RecoPulseSeriesMap_DEgg"), ], - outdir, + outdir=outdir, workers=workers, - icetray_verbose=1, + gcd_rescue=gcd_rescue, ) converter(inputs) if backend == "sqlite": diff --git a/src/graphnet/data/dataconverter.py b/src/graphnet/data/dataconverter.py index aabf465ae..828efa199 100644 --- a/src/graphnet/data/dataconverter.py +++ b/src/graphnet/data/dataconverter.py @@ -17,6 +17,7 @@ from .readers.graphnet_file_reader import GraphNeTFileReader from .writers.graphnet_writer import GraphNeTWriter from .extractors import Extractor +from .extractors.icecube import I3Extractor from .dataclasses import I3FileSet @@ -39,7 +40,7 @@ def __init__( file_reader: GraphNeTFileReader, save_method: GraphNeTWriter, outdir: str, - extractors: Union[Extractor, List[Extractor]], + extractors: Union[List[Extractor], List[I3Extractor]], index_column: str = "event_no", num_workers: int = 1, ) -> None: @@ -68,6 +69,8 @@ def __init__( # Set Extractors. Will throw error if extractors are incompatible # with reader. 
+ if not isinstance(extractors, list): + extractors = [extractors] self._file_reader.set_extractors(extractors=extractors) # Base class constructor @@ -132,7 +135,7 @@ def _process_file(self, file_path: Union[str, I3FileSet]) -> None: n_events = len(data) # Assign event_no's to each event in data and transform to pd.DataFrame - data = self._assign_event_no(data=data) + data = self._assign_event_no(data=data) # type: ignore # Create output file name output_file_name = self._create_file_name(input_file_path=file_path) diff --git a/src/graphnet/data/parquet/deprecated_methods.py b/src/graphnet/data/parquet/deprecated_methods.py index 717e798bb..423e1aa00 100644 --- a/src/graphnet/data/parquet/deprecated_methods.py +++ b/src/graphnet/data/parquet/deprecated_methods.py @@ -7,7 +7,6 @@ from graphnet.data.extractors.icecube import I3Extractor from graphnet.data.extractors.icecube.utilities.i3_filters import ( I3Filter, - NullSplitI3Filter, ) from graphnet.data import I3ToParquetConverter @@ -18,13 +17,11 @@ class ParquetDataConverter(I3ToParquetConverter): def __init__( self, gcd_rescue: str, - extractors: Union[Type[I3Extractor], List[Type[I3Extractor]]], + extractors: List[I3Extractor], outdir: str, index_column: str = "event_no", workers: int = 1, - i3_filters: Union[ - Type[I3Filter], List[Type[I3Filter]] - ] = NullSplitI3Filter(), # type: ignore + i3_filters: Union[I3Filter, List[I3Filter]] = None, # type: ignore ): """Convert I3 files to Parquet. diff --git a/src/graphnet/data/pre_configured/dataconverters.py b/src/graphnet/data/pre_configured/dataconverters.py index fcd26fd49..63d8e61ab 100644 --- a/src/graphnet/data/pre_configured/dataconverters.py +++ b/src/graphnet/data/pre_configured/dataconverters.py @@ -6,10 +6,7 @@ from graphnet.data.readers import I3Reader from graphnet.data.writers import ParquetWriter, SQLiteWriter from graphnet.data.extractors.icecube import I3Extractor -from graphnet.data.extractors.icecube.utilities.i3_filters import ( - I3Filter, - NullSplitI3Filter, -) +from graphnet.data.extractors.icecube.utilities.i3_filters import I3Filter class I3ToParquetConverter(DataConverter): @@ -18,13 +15,11 @@ class I3ToParquetConverter(DataConverter): def __init__( self, gcd_rescue: str, - extractors: Union[Type[I3Extractor], List[Type[I3Extractor]]], + extractors: List[I3Extractor], outdir: str, index_column: str = "event_no", num_workers: int = 1, - i3_filters: Union[ - Type[I3Filter], List[Type[I3Filter]] - ] = NullSplitI3Filter(), # type: ignore + i3_filters: Union[I3Filter, List[I3Filter]] = None, # type: ignore ): """Convert I3 files to Parquet. @@ -50,9 +45,9 @@ def __init__( `NullSplitI3Filter`. """ super().__init__( - file_reader=I3Reader(gcd_rescue=gcd_rescue, i3_filters=i3_filters), # type: ignore - save_method=ParquetWriter(), # type: ignore - extractors=extractors, # type: ignore + file_reader=I3Reader(gcd_rescue=gcd_rescue, i3_filters=i3_filters), + save_method=ParquetWriter(), + extractors=extractors, num_workers=num_workers, index_column=index_column, outdir=outdir, @@ -65,13 +60,11 @@ class I3ToSQLiteConverter(DataConverter): def __init__( self, gcd_rescue: str, - extractors: Union[Type[I3Extractor], List[Type[I3Extractor]]], + extractors: List[I3Extractor], outdir: str, index_column: str = "event_no", num_workers: int = 1, - i3_filters: Union[ - Type[I3Filter], List[Type[I3Filter]] - ] = NullSplitI3Filter(), # type: ignore + i3_filters: Union[I3Filter, List[I3Filter]] = None, # type: ignore ): """Convert I3 files to Parquet. 
@@ -97,9 +90,9 @@ def __init__( `NullSplitI3Filter`. """ super().__init__( - file_reader=I3Reader(gcd_rescue=gcd_rescue, i3_filters=i3_filters), # type: ignore - save_method=SQLiteWriter(), # type: ignore - extractors=extractors, # type: ignore + file_reader=I3Reader(gcd_rescue=gcd_rescue, i3_filters=i3_filters), + save_method=SQLiteWriter(), + extractors=extractors, num_workers=num_workers, index_column=index_column, outdir=outdir, diff --git a/src/graphnet/data/sqlite/deprecated_methods.py b/src/graphnet/data/sqlite/deprecated_methods.py index f3da0d10f..30b563c59 100644 --- a/src/graphnet/data/sqlite/deprecated_methods.py +++ b/src/graphnet/data/sqlite/deprecated_methods.py @@ -19,13 +19,11 @@ class SQLiteDataConverter(I3ToSQLiteConverter): def __init__( self, gcd_rescue: str, - extractors: Union[Type[I3Extractor], List[Type[I3Extractor]]], + extractors: List[I3Extractor], outdir: str, index_column: str = "event_no", workers: int = 1, - i3_filters: Union[ - Type[I3Filter], List[Type[I3Filter]] - ] = NullSplitI3Filter(), # type: ignore + i3_filters: Union[I3Filter, List[I3Filter]] = None, # type: ignore ): """Convert I3 files to Parquet. From 664189bf97bda05d359658388d86b83fc32384be Mon Sep 17 00:00:00 2001 From: Rasmus Oersoe Date: Mon, 12 Feb 2024 10:22:15 +0100 Subject: [PATCH 066/124] mypy.. --- src/graphnet/data/readers/graphnet_file_reader.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/graphnet/data/readers/graphnet_file_reader.py b/src/graphnet/data/readers/graphnet_file_reader.py index d168ba09d..13a01faf9 100644 --- a/src/graphnet/data/readers/graphnet_file_reader.py +++ b/src/graphnet/data/readers/graphnet_file_reader.py @@ -13,6 +13,7 @@ from graphnet.utilities.logging import Logger from graphnet.data.dataclasses import I3FileSet from graphnet.data.extractors.extractor import Extractor +from graphnet.data.extractors.icecube import I3Extractor class GraphNeTFileReader(Logger, ABC): @@ -80,7 +81,7 @@ def find_files( @final def set_extractors( - self, extractors: Union[Extractor, List[Extractor]] + self, extractors: Union[List[Extractor], List[I3Extractor]] ) -> None: """Set `Extractor`(s) as member variable. 
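Taken together, the converter refactor in the patches above means I3 conversion is now driven by a pre-configured `DataConverter` that wires an `I3Reader` to a writer and a list of extractors. Below is a minimal usage sketch based only on the signatures shown in these diffs; the paths are placeholders, and it assumes `I3ToSQLiteConverter` is exposed from `graphnet.data` in the same way as the `I3ToParquetConverter` import used in `deprecated_methods.py`.

from graphnet.data import I3ToSQLiteConverter  # assumed export, mirroring I3ToParquetConverter
from graphnet.data.extractors.icecube import (
    I3FeatureExtractorIceCube86,
    I3TruthExtractor,
)

converter = I3ToSQLiteConverter(
    gcd_rescue="/path/to/fallback_gcd.i3.gz",  # placeholder: GCD file to fall back on
    extractors=[
        I3FeatureExtractorIceCube86("SRTInIcePulses"),
        I3TruthExtractor(),
    ],
    outdir="/path/to/output",  # placeholder output directory
    num_workers=1,
)

converter("/path/to/i3_files")  # apply reader + extractors; one output file per input file
converter.merge_files()  # optionally merge the per-file outputs under <outdir>/merged

The same pattern applies to `I3ToParquetConverter`, which only swaps `SQLiteWriter` for `ParquetWriter` internally.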
From 83a011c91b9cf89cbca5e20a4baa4c825b33503b Mon Sep 17 00:00:00 2001 From: Rasmus Oersoe Date: Mon, 12 Feb 2024 10:45:29 +0100 Subject: [PATCH 067/124] mypy --- src/graphnet/data/writers/graphnet_writer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/graphnet/data/writers/graphnet_writer.py b/src/graphnet/data/writers/graphnet_writer.py index 330a3d868..518823b5a 100644 --- a/src/graphnet/data/writers/graphnet_writer.py +++ b/src/graphnet/data/writers/graphnet_writer.py @@ -5,7 +5,7 @@ """ import os -from typing import Dict, List +from typing import Dict, List, Union from abc import abstractmethod, ABC from graphnet.utilities.decorators import final @@ -58,7 +58,7 @@ def merge_files( @final def __call__( self, - data: Dict[str, pd.DataFrame], + data: Union[Dict[str, pd.DataFrame], Dict[str, List[pd.DataFrame]]], file_name: str, output_dir: str, n_events: int, From 6aca5d8013e6c0128eb69e8138fc3e82f6631a14 Mon Sep 17 00:00:00 2001 From: Rasmus Oersoe Date: Mon, 12 Feb 2024 10:50:42 +0100 Subject: [PATCH 068/124] polish 01-01 --- examples/01_icetray/01_convert_i3_files.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/01_icetray/01_convert_i3_files.py b/examples/01_icetray/01_convert_i3_files.py index 870fd09f4..9f0795cb1 100644 --- a/examples/01_icetray/01_convert_i3_files.py +++ b/examples/01_icetray/01_convert_i3_files.py @@ -57,7 +57,7 @@ def main_icecube86(backend: str) -> None: ) converter(inputs) if backend == "sqlite": - converter.merge_files(os.path.join(outdir, "merged")) + converter.merge_files() def main_icecube_upgrade(backend: str) -> None: @@ -85,7 +85,7 @@ def main_icecube_upgrade(backend: str) -> None: ) converter(inputs) if backend == "sqlite": - converter.merge_files(os.path.join(outdir, "merged")) + converter.merge_files() if __name__ == "__main__": From 398c27ac9c1f3a85c4e309274d3781efa648d604 Mon Sep 17 00:00:00 2001 From: Rasmus Oersoe Date: Mon, 12 Feb 2024 11:03:35 +0100 Subject: [PATCH 069/124] mypy --- src/graphnet/data/readers/graphnet_file_reader.py | 4 +++- src/graphnet/data/writers/graphnet_writer.py | 2 +- src/graphnet/data/writers/parquet_writer.py | 2 +- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/src/graphnet/data/readers/graphnet_file_reader.py b/src/graphnet/data/readers/graphnet_file_reader.py index 13a01faf9..c590c6424 100644 --- a/src/graphnet/data/readers/graphnet_file_reader.py +++ b/src/graphnet/data/readers/graphnet_file_reader.py @@ -94,7 +94,9 @@ def set_extractors( self._extractors = extractors @final - def _validate_extractors(self, extractors: List[Extractor]) -> None: + def _validate_extractors( + self, extractors: Union[List[Extractor], List[I3Extractor]] + ) -> None: for extractor in extractors: try: assert isinstance(extractor, tuple(self.accepted_extractors)) # type: ignore diff --git a/src/graphnet/data/writers/graphnet_writer.py b/src/graphnet/data/writers/graphnet_writer.py index 518823b5a..f6ec03029 100644 --- a/src/graphnet/data/writers/graphnet_writer.py +++ b/src/graphnet/data/writers/graphnet_writer.py @@ -28,7 +28,7 @@ class GraphNeTWriter(Logger, ABC): @abstractmethod def _save_file( self, - data: Dict[str, pd.DataFrame], + data: Union[Dict[str, pd.DataFrame], Dict[str, List[pd.DataFrame]]], output_file_path: str, n_events: int, ) -> None: diff --git a/src/graphnet/data/writers/parquet_writer.py b/src/graphnet/data/writers/parquet_writer.py index 755a829c1..18e524ca9 100644 --- a/src/graphnet/data/writers/parquet_writer.py +++ 
b/src/graphnet/data/writers/parquet_writer.py @@ -19,7 +19,7 @@ class ParquetWriter(GraphNeTWriter): # Abstract method implementation(s) def _save_file( self, - data: Dict[str, pd.DataFrame], + data: Dict[str, List[pd.DataFrame]], output_file_path: str, n_events: int, ) -> None: From bd627a3e794dd7d3e5e55208f570c79650107918 Mon Sep 17 00:00:00 2001 From: Rasmus Oersoe Date: Mon, 12 Feb 2024 11:15:47 +0100 Subject: [PATCH 070/124] mypy please --- src/graphnet/data/dataconverter.py | 4 ++-- src/graphnet/data/readers/i3reader.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/graphnet/data/dataconverter.py b/src/graphnet/data/dataconverter.py index 828efa199..70d5ae89e 100644 --- a/src/graphnet/data/dataconverter.py +++ b/src/graphnet/data/dataconverter.py @@ -131,11 +131,11 @@ def _process_file(self, file_path: Union[str, I3FileSet]) -> None: This function is called in parallel. """ # Read and apply extractors - data = self._file_reader(file_path=file_path) + data: List[OrderedDict] = self._file_reader(file_path=file_path) n_events = len(data) # Assign event_no's to each event in data and transform to pd.DataFrame - data = self._assign_event_no(data=data) # type: ignore + data: Union[Dict[str, pd.DataFrame], Dict[str, List[pd.DataFrame]]] = self._assign_event_no(data=data) # type: ignore # Create output file name output_file_name = self._create_file_name(input_file_path=file_path) diff --git a/src/graphnet/data/readers/i3reader.py b/src/graphnet/data/readers/i3reader.py index 523367943..ed5fd7c1f 100644 --- a/src/graphnet/data/readers/i3reader.py +++ b/src/graphnet/data/readers/i3reader.py @@ -77,7 +77,7 @@ def __call__(self, file_path: I3FileSet) -> List[OrderedDict]: # type: ignore assert isinstance(extractor, I3Extractor) extractor.set_gcd( i3_file=file_path.i3_file, gcd_file=file_path.gcd_file - ) # type: ignore + ) # Open I3 file i3_file_io = dataio.I3File(file_path.i3_file, "r") From e0b4ba4e3bf719bb599fe6545b30022ce84ff0c5 Mon Sep 17 00:00:00 2001 From: Rasmus Oersoe Date: Mon, 12 Feb 2024 11:21:02 +0100 Subject: [PATCH 071/124] mypy... 
--- src/graphnet/data/dataconverter.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/graphnet/data/dataconverter.py b/src/graphnet/data/dataconverter.py index 70d5ae89e..43929cedd 100644 --- a/src/graphnet/data/dataconverter.py +++ b/src/graphnet/data/dataconverter.py @@ -135,14 +135,17 @@ def _process_file(self, file_path: Union[str, I3FileSet]) -> None: n_events = len(data) # Assign event_no's to each event in data and transform to pd.DataFrame - data: Union[Dict[str, pd.DataFrame], Dict[str, List[pd.DataFrame]]] = self._assign_event_no(data=data) # type: ignore + dataframes = self._assign_event_no(data=data) + + # Delete `data` to save memory + del data # Create output file name output_file_name = self._create_file_name(input_file_path=file_path) # Apply save method self._save_method( - data=data, + data=dataframes, file_name=output_file_name, n_events=n_events, output_dir=self._output_dir, From 23f0a9c945b2b8b24b5e9ea7ccfe3cf0fc8eea41 Mon Sep 17 00:00:00 2001 From: Rasmus Oersoe Date: Mon, 12 Feb 2024 11:28:40 +0100 Subject: [PATCH 072/124] add comment in dataconverter --- src/graphnet/data/dataconverter.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/graphnet/data/dataconverter.py b/src/graphnet/data/dataconverter.py index 43929cedd..69d13be50 100644 --- a/src/graphnet/data/dataconverter.py +++ b/src/graphnet/data/dataconverter.py @@ -132,6 +132,8 @@ def _process_file(self, file_path: Union[str, I3FileSet]) -> None: """ # Read and apply extractors data: List[OrderedDict] = self._file_reader(file_path=file_path) + + # Count number of events n_events = len(data) # Assign event_no's to each event in data and transform to pd.DataFrame From f3720caaafb8c0f05794f1b6b401910285d43f14 Mon Sep 17 00:00:00 2001 From: Rasmus Oersoe Date: Mon, 12 Feb 2024 12:13:22 +0100 Subject: [PATCH 073/124] polish extractor.py --- src/graphnet/data/extractors/extractor.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/src/graphnet/data/extractors/extractor.py b/src/graphnet/data/extractors/extractor.py index 3e9a8f715..d03419870 100644 --- a/src/graphnet/data/extractors/extractor.py +++ b/src/graphnet/data/extractors/extractor.py @@ -1,13 +1,9 @@ """Base I3Extractor class(es).""" -from typing import TYPE_CHECKING +from typing import Any from abc import ABC, abstractmethod -from graphnet.utilities.imports import has_icecube_package from graphnet.utilities.logging import Logger -if has_icecube_package() or TYPE_CHECKING: - from icecube import icetray # pyright: reportMissingImports=false - class Extractor(ABC, Logger): """Base class for extracting information from data files. 
@@ -40,11 +36,11 @@ def __init__(self, extractor_name: str): super().__init__(name=__name__, class_name=self.__class__.__name__) @abstractmethod - def __call__(self, frame: "icetray.I3Frame") -> dict: - """Extract information from frame.""" + def __call__(self, data: Any) -> dict: + """Extract information from data.""" pass @property def name(self) -> str: - """Get the name of the `I3Extractor` instance.""" + """Get the name of the `Extractor` instance.""" return self._extractor_name From 9f665874b9ffc02c67097e321a6d3185d9150fa7 Mon Sep 17 00:00:00 2001 From: samadpls Date: Mon, 12 Feb 2024 19:54:00 +0500 Subject: [PATCH 074/124] Refactored `GraphNeTDataModule` class Signed-off-by: samadpls --- src/graphnet/data/datamodule.py | 76 +++++++++++++++++++++------------ 1 file changed, 49 insertions(+), 27 deletions(-) diff --git a/src/graphnet/data/datamodule.py b/src/graphnet/data/datamodule.py index 8c5aa7aeb..6c85fb061 100644 --- a/src/graphnet/data/datamodule.py +++ b/src/graphnet/data/datamodule.py @@ -34,14 +34,22 @@ def __init__( Args: dataset_reference: A non-instantiated reference to the dataset class. - dataset_args: Arguments to instantiate graphnet.data.dataset.Dataset with. - selection: (Optional) a list of event id's used for training and validation, Default None. - test_selection: (Optional) a list of event id's used for testing, Default None. - train_dataloader_kwargs: Arguments for the training DataLoader, Default None. - validation_dataloader_kwargs: Arguments for the validation DataLoader, Default None. - test_dataloader_kwargs: Arguments for the test DataLoader, Default None. - train_val_split (Optional): Split ratio for training and validation sets. Default is [0.9, 0.10]. - split_seed: seed used for shuffling and splitting selections into train/validation, Default 42. + dataset_args: Arguments to instantiate + graphnet.data.dataset.Dataset with. + selection: (Optional) a list of event id's used for training + and validation, Default None. + test_selection: (Optional) a list of event id's used for testing, + Default None. + train_dataloader_kwargs: Arguments for the training DataLoader, + Default None. + validation_dataloader_kwargs: Arguments for the validation + DataLoader, Default None. + test_dataloader_kwargs: Arguments for the test DataLoader, + Default None. + train_val_split (Optional): Split ratio for training and + validation sets. Default is [0.9, 0.10]. + split_seed: seed used for shuffling and splitting selections into + train/validation, Default 42. """ Logger.__init__(self) self._make_sure_root_logger_is_configured() @@ -158,7 +166,8 @@ def _create_dataloader( """Create a DataLoader for the given dataset. Args: - dataset (Union[Dataset, EnsembleDataset]): The dataset to create a DataLoader for. + dataset (Union[Dataset, EnsembleDataset]): + The dataset to create a DataLoader for. Returns: DataLoader: The DataLoader configured for the given dataset. @@ -186,15 +195,13 @@ def _validate_dataset_class(self) -> None: ParquetDataset, or Dataset. Raises a TypeError if an invalid dataset type is detected, or if an EnsembleDataset is used. """ - print(self._dataset, "Dataset\n") - print( - f"Type of self._dataset before validation check: {type(self._dataset)}" - ) - # if type(self._dataset) not in [SQLiteDataset, ParquetDataset, Dataset]: - # raise TypeError( - # "dataset_reference must be an instance of SQLiteDataset, ParquetDataset, or Dataset." 
-        )
-        if isinstance(self._dataset, EnsembleDataset):
+        allowed_types = (SQLiteDataset, ParquetDataset, Dataset)
+        if self._dataset not in allowed_types:
+            raise TypeError(
+                "dataset_reference must be an instance "
+                "of SQLiteDataset, ParquetDataset, or Dataset."
+            )
+        if self._dataset is EnsembleDataset:
             raise TypeError(
                 "EnsembleDataset is not allowed as dataset_reference."
             )
@@ -211,7 +218,10 @@ def _validate_dataset_args(self) -> None:
                     )
                 except AssertionError:
                     raise ValueError(
-                        f"The number of dataset paths ({len(self._dataset_args['path'])}) does not match the number of selections ({len(self._selection)})."
+                        "The number of dataset paths"
+                        f" ({len(self._dataset_args['path'])})"
+                        " does not match the number of"
+                        f" selections ({len(self._selection)})."
                     )

             if self._test_selection is not None:
@@ -223,7 +233,13 @@ def _validate_dataset_args(self) -> None:
                     )
                 except AssertionError:
                     raise ValueError(
-                        f"The number of dataset paths ({len(self._dataset_args['path'])}) does not match the number of test selections ({len(self._test_selection)}). If you'd like to test on only a subset of the {len(self._dataset_args['path'])} datasets, please provide empty test selections for the others."
+                        "The number of dataset paths "
+                        f" ({len(self._dataset_args['path'])}) does not match "
+                        "the number of test selections "
+                        f"({len(self._test_selection)}). If you'd like to test "
+                        "on only a subset of the "
+                        f"{len(self._dataset_args['path'])} datasets, "
+                        "please provide empty test selections for the others."
                     )

     def _validate_dataloader_args(self) -> None:
@@ -244,7 +260,9 @@ def _validate_dataloader_args(self) -> None:
     def _resolve_selections(self) -> None:
         if self._test_selection is None:
             self.warning_once(
-                f"{self.__class__.__name__} did not receive an argument for `test_selection` and will therefore not have a prediction dataloader available."
+                f"{self.__class__.__name__} did not receive an"
+                " argument for `test_selection` and will "
+                "therefore not have a prediction dataloader available."
             )
         if self._selection is not None:
             # Split the selection into train/validation
@@ -270,9 +288,14 @@ def _resolve_selections(self) -> None:
             )

         else:  # selection is None
-            # If not provided, we infer it by grabbing all event ids in the dataset.
+            # If not provided, we infer it by grabbing
+            # all event ids in the dataset.
             self.info(
-                f"{self.__class__.__name__} did not receive an argument for `selection`. Selection will automatically be created with a split of train: {self._train_val_split[0]} and validation: {self._train_val_split[1]}"
+                f"{self.__class__.__name__} did not receive an"
+                " argument for `selection`. Selection "
+                "will automatically be created with a split of "
+                f"train: {self._train_val_split[0]} and "
+                f"validation: {self._train_val_split[1]}"
             )
             (
                 self._train_selection,
@@ -372,8 +395,6 @@ def _construct_dataset(self, tmp_args: Dict[str, Any]) -> Dataset:

         Return:
             Dataset object constructed from input arguments.
""" - print(tmp_args, "temp argument") - print(self._dataset, "<-dataset") dataset = self._dataset(**tmp_args) # type: ignore return dataset @@ -390,7 +411,7 @@ def _create_dataset( """ if self._use_ensemble_dataset: # Construct multiple datasets and pass to EnsembleDataset - # At this point, we have checked that len(selection) == len(dataset_args['path']) + # len(selection) == len(dataset_args['path']) datasets = [] for dataset_idx in range(len(selection)): datasets.append( @@ -405,7 +426,8 @@ def _create_dataset( else: # Construct single dataset dataset = self._create_single_dataset( - selection=selection, path=self._dataset_args["path"] # type: ignore + selection=selection, + path=self._dataset_args["path"], # type:ignore ) return dataset From 33fc7d02707e78af43c67932eddc121997313da4 Mon Sep 17 00:00:00 2001 From: samadpls Date: Mon, 12 Feb 2024 20:02:06 +0500 Subject: [PATCH 075/124] Refactored `dataset_reference` argument in GraphNeTDataModule Signed-off-by: samadpls --- src/graphnet/data/datamodule.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/src/graphnet/data/datamodule.py b/src/graphnet/data/datamodule.py index 6c85fb061..e629ce4a0 100644 --- a/src/graphnet/data/datamodule.py +++ b/src/graphnet/data/datamodule.py @@ -33,7 +33,8 @@ def __init__( """Create dataloaders from dataset. Args: - dataset_reference: A non-instantiated reference to the dataset class. + dataset_reference: A non-instantiated reference + to the dataset class. dataset_args: Arguments to instantiate graphnet.data.dataset.Dataset with. selection: (Optional) a list of event id's used for training @@ -96,7 +97,9 @@ def setup(self, stage: str) -> None: self._test_selection is not None or len(self._test_dataloader_kwargs) > 0 ): - self._test_dataset = self._create_dataset(self._test_selection) # type: ignore + self._test_dataset = self._create_dataset( + self._test_selection # type: ignore + ) if stage == "fit" or stage == "validate": if self._train_selection is not None: self._train_dataset = self._create_dataset( @@ -318,7 +321,9 @@ def _split_selection( flat_selection = [selection] elif isinstance(selection[0], list): flat_selection = [ - item for sublist in selection for item in sublist # type: ignore + item + for sublist in selection + for item in sublist # type: ignore ] else: flat_selection = selection # type: ignore From ed6d4c0afcc1ff8d6b1809e80002f95388f206d9 Mon Sep 17 00:00:00 2001 From: ArturoLlorente Date: Tue, 13 Feb 2024 14:33:09 +0100 Subject: [PATCH 076/124] added 'hlc' field to test files --- ...xt_genie_level7_v02_first_5_frames.parquet | Bin 31908 -> 24965 bytes ...oscNext_genie_level7_v02_first_5_frames.db | Bin 24576 -> 24576 bytes ...enie_step4_140028_000998_first_5_frames.db | Bin 98304 -> 102400 bytes 3 files changed, 0 insertions(+), 0 deletions(-) diff --git a/data/tests/parquet/oscNext_genie_level7_v02/oscNext_genie_level7_v02_first_5_frames.parquet b/data/tests/parquet/oscNext_genie_level7_v02/oscNext_genie_level7_v02_first_5_frames.parquet index d9dfb6e9831e1cf4efc0703e291aa1628cb35471..f633a10c3419000685fe75b9cda1bb668d97e97d 100644 GIT binary patch literal 24965 zcmdU23wRqaMwY2;N{Lk*i;oF-t|juQ(UtdV3(cKnd#m+WOXku3QY`GF)W+6*b;By0oY+%!rSJ^DPXd%#iltMNQkFu1S@N7uIqyf^vLeIG~qsKfV z^TB6j1=7r%x%Zs&pV!=b=Ng=xt4OcWPp9d(q}A%n^~^$snPJNqhMC&2m$D*Z8ad%g|*#l4oYLyO}TeW?#Y$kn$Pd>^K~t@0)z<_xXF% z*}5fP@e1F6iCNF+hxMZ?^h5d_eY!q9&w#3|?}9t@40ydzk;@b)P@$b^Ium{~zWUP_ zPc3A%UtKp3u7+6cg8VI9X3_kmkGVH|^!tKKcKyT4wr0a;#dqEGX3kBYoOH0Uoi$3bbt!WIyvbqNjnA!O{=ILiLbWBZl 
z4$}o+SB6eASqYw63nz>dPO>wMn#r+@9L?mlaOFdC6P#R_t_C2vNNvTl1=&VsBak=1cQ-VthhrUl55hNyt?Dz3y2-((Gp3kh z896#;_*rd^hIuTbKr?v*Gznu9FPwBEvjCj@9a7wI@*|`S;?-z|ZpCEafEj=@%wHWq ze^0Xd&8)=Z40H5Z-s21iGz%V=P=SMarb5Z(=?l#HSqGek`P&N4`S-E~i>jL}E!(g5 z?0EcY>-&GbzTh@r?>#+x_GbNY|Hm2sLT=Ai^}NnE*uO5++b{L~?{vA7>169Ubbw|vVmiQiJQXf47*etg zyWrHC>6a?LtRS1uQw6Ra>-B|tBoRNu=AL57iwnqY3q;fjBBQ7isEi_NaC*2FG5}$l zv1~;yTxvJN5o%5?t6Rh@yXprK$ODbJ0xC)V-tW1vgeh{7w z8JF1Wqj|Z<=aFrT$mxZGy%H5rQEPVgn!b04ivWZvyQc$9!%R29A%D{qXecO{PHM06 zda&2}Cw|A7Y16mP$ApvHi?*N`YeLapCk*8E?+GOp6;G;` zqP-#-{Xr(q0P}Pd-581vQ0Ox&>D2p;kz<)y2g`3 ztU;H*>1#OOJ9sq&AMXaViLh$ft)!@L~AsUu5KKmie^#S(A&t8ek1s)8U+u zR;Rgl66)QIR;P=6dP-}AXXJz9Y(6;T;xOENqFB3CH~HaP@NAu2^YjD=?J7q4PZw(Q zb>_EDfMV;rwvB=U(_tGlowAQ;3v^RMBif#<$)}$kVe_GPDUc=?-4BIyP3~9(fKBb_ z4PqWB6;f0#LQM$pHj18=P|-MND5RRkMVqe9bL?7C`*!}86K3b4%EPvnS2kfez5DZ@ zcnq6&72Le5;5Ksq#b@?+SmN(D)8QdXcIA`(dAz?RY6)Q|p(@vt9Q~qN{XFVzT^`_x z-1(dPv3g#nFP78>dWTwso(V!ERN(MEw^D6jhZyqNQdOL~q5jZ1&#>PkfqI0^eS;;> zFCjaZ@K7c4RJ2eGZRrC8D(0s+eu58uV3h0463Da#g-qKB!X#9 zvcK@p3hc~ERj@e#Hs}eB40}R;b;t)w$n7P6R*+vQQ-zH2>>mce9#t3bSxru_`qp?8 ziq_$KUT!DOwBJT86w>u3E7&{iFVY zkkat}IGcNtB|EPmuU`(pvSR^#nR4#fu-@-&#RlxDqhS1*)i_fH~?+hM&Q+tbs)o^ay$@J4~FE?6cQdOUX{dDK#FNR+V{SinER>>7qe?&wrQvxLcDndXf78ae}RfWaEhqhZoq z-Y7VwHRc@2NLjpN&@B<#pjG5lxcSXS$~*0PS zP|-`lTN;>zB@Ue^G%%lh=tS=%My6l~CPoE{`lI!DjVjPg96Ak*^n3lo60rQ>DN;?g zRs&eMLvJy&DTGN2gYTMXX<_~yg62mfwSrsh5oHH>b z9jNjFwz-MC;sU60c`8~cLUroPYxS8lyb2g5n`KythItSTu&hTv8sP}hkf}d+1^kFB zo#LHXjH6&V6&Pr9($cn6G2~Wv6l{TH)l$bl-UbF?wK6jo%KJ1+ZfqkDwE{#rPZdcj z5SybA?>g{q{)=muovOIP*Aj4NY0cMuSoGIxw=?F=&8L5V^7e)1CqMbe_D|mV0_S29 zxMB-~PUL|~$fHY!#aQB3W5!R=Uc90Y8fKkp!g?)02=1p%; z{O3FGS6mf&@Rz^1(q+D0^XjRLgH7gxzx?^+QyW{&%!vz}Uz^w9+KL3D*utO_d7u(< zY8=Lg*GVyE4h;r-L%o9o>&D}7?)fp<|0BRDXQm<|InK*&sr8(HtR-@A;l?AaEiI9~ zjgHm22OA?*5zU_Kceo-u7i@NnUDk+e4HBH<9fMBfflA1!aX7E znW>0Kjx+O_4{ti0eQo6A7lnIj?hHkCAKG^KqO6gK;cqLqKYC6$^6oub-hH8cOaZ6Z z!=Mv+pb~Ow9L~dC>Nvx3{(Rm~UiSe`IWrXz$#E9EY;QC!`km676zTz28GF~z$8t_re%@It7A_NzLcg7_4=r*o@8?mu;j}@^5h`Ekn>RCtO&!Y zcSN5ROCJ6Yw*1$V*RyNifLB5P`f`S@{N-19%8n!aQC?ZrzQNxy=)^WCUQPvI;zRNA zkQ7?nR8I`X<1E=5B+m~6LOC-P5sC;G>T|`F$BcX}sL>4p;g@=76;q(4Hw^rN_d_N_ zwvOfLv-rg(;gO~b_QFdi7^P5E!@1Y9Br-~F8_kpPP}E0ZI8T=aB>@8pz-OjH9BjnD zM~bIHBM2B!^yq{%^mBR#LjGY-Cw(+U6@G}#eUv3{j+4FPU|T9>HezOtuv=mm^nnt|C5)hjq1z!0H3ozuoR3s?EaO%y`Vx*ae^S29e{s!~4a(o4e zpDXzTf|Cq!#5)EZ??8TKMot9=Vj()XSq5)@B`!J%E`mY%J1mK8V#zxz zPi++vUn$H4fN?W+6e6brg-`q=H_1?Bf|KHb@UeDi3rnW9AP_k>6&Z?dN&q4*qs#L} zyc1h$MwNxB&&0Zbf1q1RH;tXY zDR{xp{`<3vLm}%QK3>9@U%M{Se%_rQRjl@!_GniuG=KWzdoF!x#pen`WwC`pC-%Y6 zkW(QH#kz26JAf578wn5nEPfulj=E1Aqrhx-<_?xj-hp7{9985fhF}8NL+8Ab`QR6i zR6O)sdtleyZ&&2};)c50ra!5O{JebT*(>%}>O-zk?;O--&SLJXItmhr9lV74Nr??WyQ7hJSj^;eV@G(6LPZnWq7rgy9Nc~P$lzx64oF59JlNfLv*d}p z5UiY|iX26-3H4H3xtG^V@lI@YABD)N(R%sZ-^+6HJe%J?WE~v#SM>&9?6s5oTK+b^}iS4$a$(r zno~G8?v}yHu7gskgk|ypmK?hO?8S1|gEB1VH4FqGacYZ2JgehE?aniiv`dF`JR z(5zHLb7UV&?%Q|vqB;4f0-91aG&7H|B=X4Fi)QN+3TRfTp}F;+S@Oo?XD^zEexrb9 zwHlh;`&n}8*WVt^+`h9p`8yd#rFS*Qf6J2Fe~aM0)!UlIUtU$p7}A5CJ^p@=Uc!iu7hnq-} zv!uH=6q_;emJqf%C^k$qS!|oYEE>n&ZYOUXeLI|_sUG!-W-O)zzkaU2lFt<3l_Y-= z_wG(gn2-{}JVAVI!e0sR7vRrCKq@ZL^Jj6MV)R+4khp5Rl&QrHurz69E3|^h*mQ*F z#xEQ4=g|c?{!(1962!(9et3ey0t$a4VIe{w#ZTyYO%kTPJV`P+6xzj4p?GfcsR&P( z4e5Du;?P)NAsb@ybm>q{q$Un|L~7#DL8K-=f<h@0TERzh{BW z^}Yd;lDGRL74P85oWCh2DS4kwQt;N9q~P5!Nx>Ujl7jcFBn59n$y~mj%zd~gHTcjTt`WZYS<7)_oTZzq;<#?k+D&N|3&+*r znU&+J@d(%A5w7smRt9~L4jgRbg@t8!R^NxeHI#KFo;t?hrc}wd1n)_Q<`l?L5-eOabt7}fgXSIcA zg|AAEuTX-o&FM_Aw+gH0Wt74K&+o z?4Gu9V~4w@tFE~dTaG32v(-uXN$pin`MFz1y&F1)ybf2OGf*~OhyGaGSDX+JT(dFJ 
zUNp2~@j#Wd*xKFoeeG?ofU7?c^x0b{+~LYF%HSHB6ZuuzlJG;;mdcK{*0IK7(?my$ zL_e#oBtbt%Smb9@j#mU_8S?hG20=%or>%a-=b+~0e51|0|GKSXCU7OH%Z<}tsl6JI z3V9Rg?Dv(ri|czJ_Io<);JmfNg8XW()G6@ela?<+@$rsK?5!iJ#j8~QXqjg1J1q#e9I;H?AR&J-WIe)C13BFS{k{q zr2kr{Hz9u&>t$7aQTsqmk#|Eth0ce~ z&&MxSI=J6BTezzrn zigLBXg8tPIKcYRrSnyG_hogST3$YvRZ*9hW>^3FVFGYWL@V@K<9YP%-VYzc4!JnhgODf-C385XM z@@?T&5N|8R{fY(B8yF<8@Z? z#kCX)_QaF~{9|JOr2ZA>4<)W4ZyR)8i}IIui~WW1Q)2Giu6a80waB(RiH|ewze8HzxKM@Z&=_KEGp5um{5KlKLK> zqZRcCbX1u(c#C=ed7Ut|DDI>BK=;|j8%{^!$46+7_Wmju z1M;>Aqx`aH|Jli}k;r#O`wQ}^k>lbo(H}528tN#{RbkVXr zo>;xZLO+I8-oe*n+J?_Rd$X92N%Ntz-xc6$Q2z2*{ZJkcP&_gDmgR!JFrJF$|C#Jh zhti$)BEVfW(b2}6XSEZC2Fs)URmKz4iLrW9KP=#@=Bk{K3TR$S3(ZO}27IXfsp*7K zJvM-tE9&y2RV$kzl}ubTpJ#=an-JPtIeQrT(|RY^m-;u(-&8&o-C;L{+$A`7vUXzs zeoR*ZZVXB6oLcE+^edAUM?X8>x=7jO~RnBJ`PrQAFx(cGMMtxUB`_B$< z4UFNbylT!U!Aph2#S<+eI7Wj>r^gO)T#RAYh;@(pAAW#9Qx`&!#s$3zjqoGr6BjR( zj82O_Pa!0_alMWY4LXopvxYI`4|#^K9`T1R?Hn4q)H6IhIF>)^9}dEQ<0!wV_%h>V XM%V@myO__;f0@=IhH=9GfZ_iEc}6ct literal 31908 zcmdsg3w%`7wf8wB6Ha0xF=f(0(WVx+QIJeRB8eh1XJ(R2Uc==5sJF>XX7V7DWJo5j z5$*VZqUEZkUMp6)DS9oXe%8+_*IKGm#Fko$%GE1ZxEK+ywor*&M5)N_ce|zG-4D0rfe$o8V;m<5a2C7`R2-hIFoyJmgmonhip-jCOqo}Z$Ry3nHiQO-LS0GzKL0@Hm~mESO6DY+d5T4&i2EYK z=Pd$-5{fXFrU*u)4v@ z9KY24x%CFC8llKx;zSu3p_(c|xYM9Sm=f$C><)GJ_qC2nP;O5|yAuGXT9{4~)8kzH z{YuZpf2oh|%2@q+Lw$YpiE8Ik?&sCfl4#=A+cwrjADpw+IkLDqs>Ugnm^e`eRzgjc z;JhzcAE(rt>pAq$Y{01&rqjgqI8$Cca_8Q(TcRgE%h_7~Kq$K9)eU>UnK~3T{bTX{ zPhSv@zPD-Jd%K%QG;oSFOq?hKE1{-Ja2`(8#~GIR^S}k@XbRv|3)5+0dYsvBlvbPP zeqbqoxUcfP++=H8aKW7~WFN6CtDQ6NdpVibM_(}>dFIS1jh-oTm^e`eRzgjc;Cwn= ziF39m+}#U3R08txIcWQZ08%YUNB;C6KYuNG|8L8atQXyzGk(j(taZn+Z&n>}8m#|h zt8KmPxlb%broxQXdsvOmDb_S`A}>}#O_hK=^ED;Nq_DrQTZZvu23nf|7}bJw#A{;A zGA^jZc?nW%`mzf_1^hYwIsHHfOfsfOQ>n;d;zSa$Xd}E?4ktPJY zeZ*DU+030Rx;qo?F#-&=3?0duFx*CGY@}H5)PJzceigZmtvHa(CI0b^WNz6TZ&J#} z*Xg%9vs4_QB_>Yf!Og3w08AVsc3rH5mf|1w_k~(R-MxM>7>}|jIu9K%0YbGfohCF9 zW*9RojkAcOlCun;MkfS>U+JN1nQQ|oFwrmShZF{FzI47ZRptk7u)DW4G!~?)hBI$t z(cbxJeExiu3{8E~jdMkPxcSfwd?s-ZR|Je$V&cRr3|hRNq5=g11{6K<4Q1%3clU+- z1D#O_HTQy~5eYShW&O95SW^4}&tRy# zy*ot59*Sb!eoA5&bm)h$SXYyuyu$*HhFOfCsNq0^n+gyaNgh_h+O3 zx)Ly`1?h;_#NaksV~Hqn7L&{}mnF|mWU|t@WrY+Fvt0NF*C-~cVu^{P&SMzdw3-S6 z#3ZzLkrLbNVw{8o_rOf`FpFMZh(5d;ps6M4$k&9X(c`t^OF+kny@Y~LQ-MNz{JtC| zgcNX(6a@{k*B7Hl7sq-DGkpWoqY%cq&(*y|EP?S=_+V3s*HhFOioHDrJ7*aF77@4M0FiyrvUvea9& z)vzeT`tf%*UGvJKPc-JmB8Q0+>)@bJQz81qL~wYSa;!LyZQrn?EUFW z8c0P>$QA-GRzgjcAbr-VLfYM@7+2{N`jZwET@Faq;&d9(L@LXBEGU9M>Wm?lm^kqY zHzHn7QK2%;d%Nt)*u2Q$_Xq9$1OAfk08DvWbC*a0bf*p7X9F;5i8}2}4~8Jo1S6K5 za}qyPq6Cv}_YAgse41De7NaMM0n0fz6IU#smMO8MRY9`Qa>_|3`knK<1+u8f zYCFqU2xWV%8cMz%lxJ(ux|;JA%H9Swlma~{d+O1q`tugb2hD0I*Xltz+=QNPI&Yzz zxls*esUDP1zl~o0_Lqj@HZF@j_%KWw%dlx2GYcMj5`-k5D156Qd&H8d@c@r#YAOg2 zXFJh0B|wg@z4Y~qGo1+*J==;Vz5~eAqI4S29A@0cni#Ss&>?bx|6>aV^GW+0D}1#* zzu&-e=IG8f7vbCS6`L|NRsqEl0+NWpI;g2Y5kcPNlT365{GoyVG-0W;mE4I)PqOH? 
[binary patch data omitted]
diff --git a/data/tests/sqlite/oscNext_genie_level7_v02/oscNext_genie_level7_v02_first_5_frames.db b/data/tests/sqlite/oscNext_genie_level7_v02/oscNext_genie_level7_v02_first_5_frames.db
index befb894e7f6b7f128773512772a07fa326d0c786..67d19013216b7d068fe411bac962bb4fd4020e17 100644
GIT binary patch
[binary patch for updated SQLite test database omitted]