From df224d7233a38732ebd3365ad4995dc7a95a0235 Mon Sep 17 00:00:00 2001 From: Neuberger Date: Thu, 2 Nov 2023 16:37:02 +0100 Subject: [PATCH 01/47] Attempt to implement Issue #4 - Add LGDO format conversion utilities The idea is to add a `convert` function to each LGDO datatype that converts the underlying data to a third-party datatype. These are `pandas.DataFrame`, `numpy.ndarray` and `awkward.Array`. Additionally, you have the option to control whether `convert` copies data or not. At the moment, these issues are still open: [ ] How to use `to_aoesa` to convert VectorOfVectors to `numpy.ndarray`? [ ] How to implement the conversion of structures/tables to `numpy.ndarray`? [ ] How to implement the `convert' function for WaveformTable and encoded data? [ ] Find out how to implement units with pint. Is it possible for awkward arrays? [ ] Write many, many tests. --- src/lgdo/types/array.py | 21 +++++++++++++++++ src/lgdo/types/arrayofequalsizedarrays.py | 21 +++++++++++++++++ src/lgdo/types/encoded.py | 16 +++++++++++++ src/lgdo/types/fixedsizearray.py | 27 +++++++++++++++++++--- src/lgdo/types/lgdo.py | 11 +++++++++ src/lgdo/types/scalar.py | 22 ++++++++++++++++++ src/lgdo/types/struct.py | 28 +++++++++++++++++++++++ src/lgdo/types/table.py | 28 +++++++++++++++++++++++ src/lgdo/types/vectorofvectors.py | 22 ++++++++++++++++++ src/lgdo/types/waveform_table.py | 7 ++++++ 10 files changed, 200 insertions(+), 3 deletions(-) diff --git a/src/lgdo/types/array.py b/src/lgdo/types/array.py index 30a47bd2..7e2a8ad1 100644 --- a/src/lgdo/types/array.py +++ b/src/lgdo/types/array.py @@ -8,7 +8,9 @@ from collections.abc import Iterator from typing import Any +import awkward as ak import numpy as np +import pandas as pd from .. 
import lgdo_utils as utils from .lgdo import LGDO @@ -138,3 +140,22 @@ def __repr__(self) -> str: ) + f", attrs={repr(self.attrs)})" ) + + def convert( + self, fmt: str = "pandas.DataFrame", copy: bool = False + ) -> pd.DataFrame | np.NDArray | ak.Array: + """ + Convert the data of the Array object to a third-party format. + Supported options are: + - "pandas.DataFrame" + - "numpy.ndarray" + - "awkward.Array" + """ + if fmt == "pandas.DataFrame": + return pd.DataFrame(self.nda, copy=copy) + elif fmt == "numpy.ndarray": + return self.nda + elif fmt == "awkward.Array": + return ak.Array(self.nda) + else: + raise TypeError(f"{fmt} is not a supported third-party format.") diff --git a/src/lgdo/types/arrayofequalsizedarrays.py b/src/lgdo/types/arrayofequalsizedarrays.py index 95884bc9..70a5ddc6 100644 --- a/src/lgdo/types/arrayofequalsizedarrays.py +++ b/src/lgdo/types/arrayofequalsizedarrays.py @@ -7,7 +7,9 @@ from collections.abc import Iterator from typing import Any +import awkward as ak import numpy as np +import pandas as pd from .. import lgdo_utils as utils from . import vectorofvectors as vov @@ -131,3 +133,22 @@ def to_vov(self, cumulative_length: np.ndarray = None) -> vov.VectorOfVectors: cumulative_length=cumulative_length, attrs=attrs, ) + + def convert( + self, fmt: str = "pandas.DataFrame", copy: bool = False + ) -> pd.DataFrame | np.NDArray | ak.Array: + """ + Convert the data of the ArrayOfEqualSizedArrays object to a third-party format. 
+ Supported options are: + - "pandas.DataFrame" + - "numpy.ndarray" + - "awkward.Array" + """ + if fmt == "pandas.DataFrame": + return pd.DataFrame(self.nda, copy=copy) + elif fmt == "numpy.ndarray": + return self.nda + elif fmt == "awkward.Array": + return ak.Array(self.nda) + else: + raise TypeError(f"{fmt} is not a supported third-party format.") diff --git a/src/lgdo/types/encoded.py b/src/lgdo/types/encoded.py index 68886273..c03e28e9 100644 --- a/src/lgdo/types/encoded.py +++ b/src/lgdo/types/encoded.py @@ -3,7 +3,9 @@ from collections.abc import Iterator from typing import Any +import awkward as ak import numpy as np +import pandas as pd from numpy.typing import NDArray from .. import lgdo_utils as utils @@ -225,6 +227,13 @@ def __repr__(self) -> str: np.set_printoptions(**npopt) return out + def convert( + self, fmt: str = "pandas.DataFrame" + ) -> pd.DataFrame | np.NDArray | ak.Array: + raise NotImplementedError( + "'convert' not yet implemented for VectorOfEncodedVectors." + ) + class ArrayOfEncodedEqualSizedArrays(LGDO): """An array of encoded arrays with equal decoded size. @@ -388,3 +397,10 @@ def __repr__(self) -> str: ) np.set_printoptions(**npopt) return out + + def convert( + self, fmt: str = "pandas.DataFrame" + ) -> pd.DataFrame | np.NDArray | ak.Array: + raise NotImplementedError( + "'convert' not yet implemented for ArrayOfEncodedEqualSizedArrays." + ) diff --git a/src/lgdo/types/fixedsizearray.py b/src/lgdo/types/fixedsizearray.py index 89eae3e5..12b0c1ca 100644 --- a/src/lgdo/types/fixedsizearray.py +++ b/src/lgdo/types/fixedsizearray.py @@ -6,7 +6,9 @@ from typing import Any -import numpy +import awkward as ak +import numpy as np +import pandas as pd from .array import Array @@ -24,9 +26,9 @@ class FixedSizeArray(Array): def __init__( self, - nda: numpy.ndarray = None, + nda: np.ndarray = None, shape: tuple[int, ...] 
= (), - dtype: numpy.dtype = None, + dtype: np.dtype = None, fill_val: int | float = None, attrs: dict[str, Any] = None, ) -> None: @@ -41,3 +43,22 @@ def __init__( def datatype_name(self) -> str: return "fixedsize_array" + + def convert( + self, fmt: str = "pandas.DataFrame", copy: bool = False + ) -> pd.DataFrame | np.NDArray | ak.Array: + """ + Convert the data of the FixedSizeArray object to a third-party format. + Supported options are: + - "pandas.DataFrame" + - "numpy.ndarray" + - "awkward.Array" + """ + if fmt == "pandas.DataFrame": + return pd.DataFrame(self.nda, copy=copy) + elif fmt == "numpy.ndarray": + return self.nda + elif fmt == "awkward.Array": + return ak.Array(self.nda) + else: + raise TypeError(f"{fmt} is not a supported third-party format.") diff --git a/src/lgdo/types/lgdo.py b/src/lgdo/types/lgdo.py index df9a0eed..4468c54f 100644 --- a/src/lgdo/types/lgdo.py +++ b/src/lgdo/types/lgdo.py @@ -3,6 +3,10 @@ from abc import ABC, abstractmethod from typing import Any +import awkward as ak +import numpy as np +import pandas as pd + class LGDO(ABC): """Abstract base class representing a LEGEND Data Object (LGDO).""" @@ -30,6 +34,13 @@ def form_datatype(self) -> str: """Return this LGDO's datatype attribute string.""" pass + @abstractmethod + def convert( + self, fmt: str = "pandas.DataFrame", copy: bool = False + ) -> pd.DataFrame | np.NDArray | ak.Array: + """Convert the data of the LGDO object to a third-party format.""" + pass + def getattrs(self, datatype: bool = False) -> dict: """Return a copy of the LGDO attributes dictionary. diff --git a/src/lgdo/types/scalar.py b/src/lgdo/types/scalar.py index 86630899..1f2dfb5a 100644 --- a/src/lgdo/types/scalar.py +++ b/src/lgdo/types/scalar.py @@ -5,7 +5,9 @@ import logging from typing import Any +import awkward as ak import numpy as np +import pandas as pd from .. 
import lgdo_utils as utils from .lgdo import LGDO @@ -57,3 +59,23 @@ def __repr__(self) -> str: self.__class__.__name__ + f"(value={repr(self.value)}, attrs={repr(self.attrs)})" ) + + def convert( + self, fmt: str = "pandas.DataFrame", copy: bool = False + ) -> pd.DataFrame | np.NDArray | ak.Array: + """ + Convert the data of the Scalar object to a third-party format. + Supported options are: + - "pandas.DataFrame" + - "numpy.ndarray" + - "awkward.Array" + Not sure why you would need it though ... + """ + if fmt == "pandas.DataFrame": + return pd.DataFrame([self.value], copy=copy) + elif fmt == "numpy.ndarray": + return np.array([self.value], copy=copy) + elif fmt == "awkward.Array": + return ak.Array([self.value]) + else: + raise TypeError(f"{fmt} is not a supported third-party format.") diff --git a/src/lgdo/types/struct.py b/src/lgdo/types/struct.py index c3f32711..a850e9fa 100644 --- a/src/lgdo/types/struct.py +++ b/src/lgdo/types/struct.py @@ -7,7 +7,9 @@ import logging from typing import Any +import awkward as ak import numpy as np +import pandas as pd from .lgdo import LGDO @@ -106,3 +108,29 @@ def __repr__(self) -> str: ) np.set_printoptions(**npopt) return " ".join(out.replace("\n", " ").split()) + + def convert( + self, fmt: str = "pandas.DataFrame", copy: bool = False + ) -> pd.DataFrame | np.NDArray | ak.Array: + """ + Convert the data of the Struct object to a third-party format. 
+ Supported options are: + - "pandas.DataFrame" + - "numpy.ndarray" + - "awkward.Array" + + Note: + - conversion to ndarray only works when the values are of the equal length, returns a dict containing "keys" and "values" keys for the corresponding NDArray + - conversion to awkward array only works when the key is a string and values are of equal length + """ + if fmt == "pandas.DataFrame": + return pd.DataFrame(self, copy=copy) + elif fmt == "numpy.ndarray": + return { + "keys": np.array(list(self.keys()), copy=copy), + "values": np.array(list(self.values()), copy=copy), + } + elif fmt == "awkward.Array": + return ak.Array(self) + else: + raise TypeError(f"{fmt} is not a supported third-party format.") diff --git a/src/lgdo/types/table.py b/src/lgdo/types/table.py index c321bfcc..d238cf7d 100644 --- a/src/lgdo/types/table.py +++ b/src/lgdo/types/table.py @@ -8,6 +8,7 @@ import re from typing import Any +import awkward as ak import numexpr as ne import numpy as np import pandas as pd @@ -347,3 +348,30 @@ def __str__(self): string += f"\nwith attrs={attrs}" return string + + def convert( + self, fmt: str = "pandas.DataFrame", copy: bool = False + ) -> pd.DataFrame | np.NDArray | ak.Array: + """ + Convert the data of the Table object to a third-party format. 
+ Supported options are: + - "pandas.DataFrame" + - "numpy.ndarray" + - "awkward.Array" + + Note: + - conversion to ndarray only works when the values are of the equal length, returns a dict containing "keys" and "values" keys for the corresponding NDArray + - conversion to awkward array only works when the key is a string and values are of equal length + + """ + if fmt == "pandas.DataFrame": + return pd.DataFrame(self, copy=copy) + elif fmt == "numpy.ndarray": + return { + "keys": np.array(list(self.keys()), copy=copy), + "values": np.array(list(self.values()), copy=copy), + } + elif fmt == "awkward.Array": + return ak.Array(self) + else: + raise TypeError(f"{fmt} is not a supported third-party format.") diff --git a/src/lgdo/types/vectorofvectors.py b/src/lgdo/types/vectorofvectors.py index 7d227a52..16545d01 100644 --- a/src/lgdo/types/vectorofvectors.py +++ b/src/lgdo/types/vectorofvectors.py @@ -9,8 +9,10 @@ from collections.abc import Iterator from typing import Any +import awkward as ak import numba import numpy as np +import pandas as pd from numpy.typing import DTypeLike, NDArray from .. import lgdo_utils as utils @@ -419,6 +421,26 @@ def to_aoesa(self, preserve_dtype: bool = False) -> aoesa.ArrayOfEqualSizedArray return aoesa.ArrayOfEqualSizedArrays(nda=nda, attrs=self.getattrs()) + def convert( + self, fmt: str = "pandas.DataFrame", copy: bool = False + ) -> pd.DataFrame | np.NDArray | ak.Array: + """ + Convert the data of the Table object to a third-party format. 
+ Supported options are: + - "pandas.DataFrame" + - "numpy.ndarray" + - "awkward.Array" + """ + if fmt == "pandas.DataFrame": + return self.to_aoesa().convert("pandas.DataFrame", copy) + elif fmt == "numpy.ndarray": + return self.to_aoesa().convert("numpy.ndarray", copy) + elif fmt == "awkward.Array": + lengths_of_individual_vectors = np.diff(self.cumulative_length, prepend=[0]) + return ak.unflatten(self.flattened_data, lengths_of_individual_vectors) + else: + raise TypeError(f"{fmt} is not a supported third-party format.") + def build_cl( sorted_array_in: NDArray, cumulative_length_out: NDArray = None diff --git a/src/lgdo/types/waveform_table.py b/src/lgdo/types/waveform_table.py index f444c727..131e25d9 100644 --- a/src/lgdo/types/waveform_table.py +++ b/src/lgdo/types/waveform_table.py @@ -8,7 +8,9 @@ import logging from typing import Any +import awkward as ak import numpy as np +import pandas as pd from .array import Array from .arrayofequalsizedarrays import ArrayOfEqualSizedArrays @@ -262,3 +264,8 @@ def __str__(self): np.set_printoptions(**npopt) return string + + def convert( + self, fmt: str = "pandas.DataFrame" + ) -> pd.DataFrame | np.NDArray | ak.Array: + raise NotImplementedError("'convert' not yet implemented for WaveformTable.") From f9182e263208d352821af9906252b7aa5ef7e33f Mon Sep 17 00:00:00 2001 From: Neuberger Date: Thu, 2 Nov 2023 16:52:27 +0100 Subject: [PATCH 02/47] forgot to add awkward to `setup.cfg` --- setup.cfg | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.cfg b/setup.cfg index 050d0a30..ba37b8e9 100644 --- a/setup.cfg +++ b/setup.cfg @@ -30,6 +30,7 @@ classifiers = [options] packages = find: install_requires = + awkward colorlog h5py>=3.2 hdf5plugin From 51f9cf7f55b4a9d7a673897fae7c400c58f287ec Mon Sep 17 00:00:00 2001 From: Neuberger Date: Fri, 3 Nov 2023 08:33:35 +0100 Subject: [PATCH 03/47] fixed docstrings --- src/lgdo/types/array.py | 3 +-- src/lgdo/types/arrayofequalsizedarrays.py | 3 +-- 
src/lgdo/types/fixedsizearray.py | 3 +-- src/lgdo/types/scalar.py | 3 +-- src/lgdo/types/struct.py | 3 +-- src/lgdo/types/table.py | 3 +-- src/lgdo/types/vectorofvectors.py | 3 +-- 7 files changed, 7 insertions(+), 14 deletions(-) diff --git a/src/lgdo/types/array.py b/src/lgdo/types/array.py index 7e2a8ad1..dab66c91 100644 --- a/src/lgdo/types/array.py +++ b/src/lgdo/types/array.py @@ -144,8 +144,7 @@ def __repr__(self) -> str: def convert( self, fmt: str = "pandas.DataFrame", copy: bool = False ) -> pd.DataFrame | np.NDArray | ak.Array: - """ - Convert the data of the Array object to a third-party format. + """Convert the data of the Array object to a third-party format. Supported options are: - "pandas.DataFrame" - "numpy.ndarray" diff --git a/src/lgdo/types/arrayofequalsizedarrays.py b/src/lgdo/types/arrayofequalsizedarrays.py index 70a5ddc6..221776c0 100644 --- a/src/lgdo/types/arrayofequalsizedarrays.py +++ b/src/lgdo/types/arrayofequalsizedarrays.py @@ -137,8 +137,7 @@ def to_vov(self, cumulative_length: np.ndarray = None) -> vov.VectorOfVectors: def convert( self, fmt: str = "pandas.DataFrame", copy: bool = False ) -> pd.DataFrame | np.NDArray | ak.Array: - """ - Convert the data of the ArrayOfEqualSizedArrays object to a third-party format. + """Convert the data of the ArrayOfEqualSizedArrays object to a third-party format. Supported options are: - "pandas.DataFrame" - "numpy.ndarray" diff --git a/src/lgdo/types/fixedsizearray.py b/src/lgdo/types/fixedsizearray.py index 12b0c1ca..191ed633 100644 --- a/src/lgdo/types/fixedsizearray.py +++ b/src/lgdo/types/fixedsizearray.py @@ -47,8 +47,7 @@ def datatype_name(self) -> str: def convert( self, fmt: str = "pandas.DataFrame", copy: bool = False ) -> pd.DataFrame | np.NDArray | ak.Array: - """ - Convert the data of the FixedSizeArray object to a third-party format. + """Convert the data of the FixedSizeArray object to a third-party format. 
Supported options are: - "pandas.DataFrame" - "numpy.ndarray" diff --git a/src/lgdo/types/scalar.py b/src/lgdo/types/scalar.py index 1f2dfb5a..7ee12e5c 100644 --- a/src/lgdo/types/scalar.py +++ b/src/lgdo/types/scalar.py @@ -63,8 +63,7 @@ def __repr__(self) -> str: def convert( self, fmt: str = "pandas.DataFrame", copy: bool = False ) -> pd.DataFrame | np.NDArray | ak.Array: - """ - Convert the data of the Scalar object to a third-party format. + """Convert the data of the Scalar object to a third-party format. Supported options are: - "pandas.DataFrame" - "numpy.ndarray" diff --git a/src/lgdo/types/struct.py b/src/lgdo/types/struct.py index a850e9fa..05cbd2d9 100644 --- a/src/lgdo/types/struct.py +++ b/src/lgdo/types/struct.py @@ -112,8 +112,7 @@ def __repr__(self) -> str: def convert( self, fmt: str = "pandas.DataFrame", copy: bool = False ) -> pd.DataFrame | np.NDArray | ak.Array: - """ - Convert the data of the Struct object to a third-party format. + """Convert the data of the Struct object to a third-party format. Supported options are: - "pandas.DataFrame" - "numpy.ndarray" diff --git a/src/lgdo/types/table.py b/src/lgdo/types/table.py index d238cf7d..3d0fc75e 100644 --- a/src/lgdo/types/table.py +++ b/src/lgdo/types/table.py @@ -352,8 +352,7 @@ def __str__(self): def convert( self, fmt: str = "pandas.DataFrame", copy: bool = False ) -> pd.DataFrame | np.NDArray | ak.Array: - """ - Convert the data of the Table object to a third-party format. + """Convert the data of the Table object to a third-party format. 
Supported options are: - "pandas.DataFrame" - "numpy.ndarray" diff --git a/src/lgdo/types/vectorofvectors.py b/src/lgdo/types/vectorofvectors.py index 16545d01..a4fc0db0 100644 --- a/src/lgdo/types/vectorofvectors.py +++ b/src/lgdo/types/vectorofvectors.py @@ -424,8 +424,7 @@ def to_aoesa(self, preserve_dtype: bool = False) -> aoesa.ArrayOfEqualSizedArray def convert( self, fmt: str = "pandas.DataFrame", copy: bool = False ) -> pd.DataFrame | np.NDArray | ak.Array: - """ - Convert the data of the Table object to a third-party format. + """Convert the data of the Table object to a third-party format. Supported options are: - "pandas.DataFrame" - "numpy.ndarray" From ee3da976699be6e2ca36ea9bf08d1140103337fd Mon Sep 17 00:00:00 2001 From: Neuberger Date: Fri, 3 Nov 2023 08:38:29 +0100 Subject: [PATCH 04/47] maybe now ... --- src/lgdo/types/array.py | 6 +++--- src/lgdo/types/arrayofequalsizedarrays.py | 6 +++--- src/lgdo/types/fixedsizearray.py | 6 +++--- src/lgdo/types/scalar.py | 6 +++--- src/lgdo/types/struct.py | 6 +++--- src/lgdo/types/table.py | 6 +++--- src/lgdo/types/vectorofvectors.py | 6 +++--- 7 files changed, 21 insertions(+), 21 deletions(-) diff --git a/src/lgdo/types/array.py b/src/lgdo/types/array.py index dab66c91..d8ad9901 100644 --- a/src/lgdo/types/array.py +++ b/src/lgdo/types/array.py @@ -146,9 +146,9 @@ def convert( ) -> pd.DataFrame | np.NDArray | ak.Array: """Convert the data of the Array object to a third-party format. 
Supported options are: - - "pandas.DataFrame" - - "numpy.ndarray" - - "awkward.Array" + "pandas.DataFrame" + "numpy.ndarray" + "awkward.Array" """ if fmt == "pandas.DataFrame": return pd.DataFrame(self.nda, copy=copy) diff --git a/src/lgdo/types/arrayofequalsizedarrays.py b/src/lgdo/types/arrayofequalsizedarrays.py index 221776c0..cb867f62 100644 --- a/src/lgdo/types/arrayofequalsizedarrays.py +++ b/src/lgdo/types/arrayofequalsizedarrays.py @@ -139,9 +139,9 @@ def convert( ) -> pd.DataFrame | np.NDArray | ak.Array: """Convert the data of the ArrayOfEqualSizedArrays object to a third-party format. Supported options are: - - "pandas.DataFrame" - - "numpy.ndarray" - - "awkward.Array" + "pandas.DataFrame" + "numpy.ndarray" + "awkward.Array" """ if fmt == "pandas.DataFrame": return pd.DataFrame(self.nda, copy=copy) diff --git a/src/lgdo/types/fixedsizearray.py b/src/lgdo/types/fixedsizearray.py index 191ed633..1f0494ea 100644 --- a/src/lgdo/types/fixedsizearray.py +++ b/src/lgdo/types/fixedsizearray.py @@ -49,9 +49,9 @@ def convert( ) -> pd.DataFrame | np.NDArray | ak.Array: """Convert the data of the FixedSizeArray object to a third-party format. Supported options are: - - "pandas.DataFrame" - - "numpy.ndarray" - - "awkward.Array" + "pandas.DataFrame" + "numpy.ndarray" + "awkward.Array" """ if fmt == "pandas.DataFrame": return pd.DataFrame(self.nda, copy=copy) diff --git a/src/lgdo/types/scalar.py b/src/lgdo/types/scalar.py index 7ee12e5c..d45569f3 100644 --- a/src/lgdo/types/scalar.py +++ b/src/lgdo/types/scalar.py @@ -65,9 +65,9 @@ def convert( ) -> pd.DataFrame | np.NDArray | ak.Array: """Convert the data of the Scalar object to a third-party format. Supported options are: - - "pandas.DataFrame" - - "numpy.ndarray" - - "awkward.Array" + "pandas.DataFrame" + "numpy.ndarray" + "awkward.Array" Not sure why you would need it though ... 
""" if fmt == "pandas.DataFrame": diff --git a/src/lgdo/types/struct.py b/src/lgdo/types/struct.py index 05cbd2d9..f1b417d3 100644 --- a/src/lgdo/types/struct.py +++ b/src/lgdo/types/struct.py @@ -114,9 +114,9 @@ def convert( ) -> pd.DataFrame | np.NDArray | ak.Array: """Convert the data of the Struct object to a third-party format. Supported options are: - - "pandas.DataFrame" - - "numpy.ndarray" - - "awkward.Array" + "pandas.DataFrame" + "numpy.ndarray" + "awkward.Array" Note: - conversion to ndarray only works when the values are of the equal length, returns a dict containing "keys" and "values" keys for the corresponding NDArray diff --git a/src/lgdo/types/table.py b/src/lgdo/types/table.py index 3d0fc75e..e5074128 100644 --- a/src/lgdo/types/table.py +++ b/src/lgdo/types/table.py @@ -354,9 +354,9 @@ def convert( ) -> pd.DataFrame | np.NDArray | ak.Array: """Convert the data of the Table object to a third-party format. Supported options are: - - "pandas.DataFrame" - - "numpy.ndarray" - - "awkward.Array" + "pandas.DataFrame" + "numpy.ndarray" + "awkward.Array" Note: - conversion to ndarray only works when the values are of the equal length, returns a dict containing "keys" and "values" keys for the corresponding NDArray diff --git a/src/lgdo/types/vectorofvectors.py b/src/lgdo/types/vectorofvectors.py index a4fc0db0..08a27cb2 100644 --- a/src/lgdo/types/vectorofvectors.py +++ b/src/lgdo/types/vectorofvectors.py @@ -426,9 +426,9 @@ def convert( ) -> pd.DataFrame | np.NDArray | ak.Array: """Convert the data of the Table object to a third-party format. 
Supported options are: - - "pandas.DataFrame" - - "numpy.ndarray" - - "awkward.Array" + "pandas.DataFrame" + "numpy.ndarray" + "awkward.Array" """ if fmt == "pandas.DataFrame": return self.to_aoesa().convert("pandas.DataFrame", copy) From 52c51ea9c7fb478906f217fa238157de56416ddd Mon Sep 17 00:00:00 2001 From: Neuberger Date: Fri, 24 Nov 2023 14:43:56 +0100 Subject: [PATCH 05/47] Removed ability to control copy and added option to control whether to convert it with or without units. --- src/lgdo/types/array.py | 4 ++-- src/lgdo/types/arrayofequalsizedarrays.py | 4 ++-- src/lgdo/types/encoded.py | 4 ++-- src/lgdo/types/fixedsizearray.py | 4 ++-- src/lgdo/types/lgdo.py | 2 +- src/lgdo/types/scalar.py | 6 +++--- src/lgdo/types/struct.py | 8 ++++---- src/lgdo/types/table.py | 8 ++++---- src/lgdo/types/vectorofvectors.py | 6 +++--- 9 files changed, 23 insertions(+), 23 deletions(-) diff --git a/src/lgdo/types/array.py b/src/lgdo/types/array.py index 1ecd0b16..cae36076 100644 --- a/src/lgdo/types/array.py +++ b/src/lgdo/types/array.py @@ -142,7 +142,7 @@ def __repr__(self) -> str: ) def convert( - self, fmt: str = "pandas.DataFrame", copy: bool = False + self, fmt: str = "pandas.DataFrame", with_units: bool = True ) -> pd.DataFrame | np.NDArray | ak.Array: """Convert the data of the Array object to a third-party format. 
Supported options are: @@ -151,7 +151,7 @@ def convert( "awkward.Array" """ if fmt == "pandas.DataFrame": - return pd.DataFrame(self.nda, copy=copy) + return pd.DataFrame(self.nda) elif fmt == "numpy.ndarray": return self.nda elif fmt == "awkward.Array": diff --git a/src/lgdo/types/arrayofequalsizedarrays.py b/src/lgdo/types/arrayofequalsizedarrays.py index 502904f4..9257cf6b 100644 --- a/src/lgdo/types/arrayofequalsizedarrays.py +++ b/src/lgdo/types/arrayofequalsizedarrays.py @@ -135,7 +135,7 @@ def to_vov(self, cumulative_length: np.ndarray = None) -> vov.VectorOfVectors: ) def convert( - self, fmt: str = "pandas.DataFrame", copy: bool = False + self, fmt: str = "pandas.DataFrame", with_units: bool = True ) -> pd.DataFrame | np.NDArray | ak.Array: """Convert the data of the ArrayOfEqualSizedArrays object to a third-party format. Supported options are: @@ -144,7 +144,7 @@ def convert( "awkward.Array" """ if fmt == "pandas.DataFrame": - return pd.DataFrame(self.nda, copy=copy) + return pd.DataFrame(self.nda) elif fmt == "numpy.ndarray": return self.nda elif fmt == "awkward.Array": diff --git a/src/lgdo/types/encoded.py b/src/lgdo/types/encoded.py index 26f667eb..6a2d05e0 100644 --- a/src/lgdo/types/encoded.py +++ b/src/lgdo/types/encoded.py @@ -228,7 +228,7 @@ def __repr__(self) -> str: return out def convert( - self, fmt: str = "pandas.DataFrame" + self, fmt: str = "pandas.DataFrame", with_units: bool = True ) -> pd.DataFrame | np.NDArray | ak.Array: raise NotImplementedError( "'convert' not yet implemented for VectorOfEncodedVectors." @@ -399,7 +399,7 @@ def __repr__(self) -> str: return out def convert( - self, fmt: str = "pandas.DataFrame" + self, fmt: str = "pandas.DataFrame", with_units: bool = True ) -> pd.DataFrame | np.NDArray | ak.Array: raise NotImplementedError( "'convert' not yet implemented for ArrayOfEncodedEqualSizedArrays." 
diff --git a/src/lgdo/types/fixedsizearray.py b/src/lgdo/types/fixedsizearray.py index 1f0494ea..87016729 100644 --- a/src/lgdo/types/fixedsizearray.py +++ b/src/lgdo/types/fixedsizearray.py @@ -45,7 +45,7 @@ def datatype_name(self) -> str: return "fixedsize_array" def convert( - self, fmt: str = "pandas.DataFrame", copy: bool = False + self, fmt: str = "pandas.DataFrame", with_units: bool = True ) -> pd.DataFrame | np.NDArray | ak.Array: """Convert the data of the FixedSizeArray object to a third-party format. Supported options are: @@ -54,7 +54,7 @@ def convert( "awkward.Array" """ if fmt == "pandas.DataFrame": - return pd.DataFrame(self.nda, copy=copy) + return pd.DataFrame(self.nda) elif fmt == "numpy.ndarray": return self.nda elif fmt == "awkward.Array": diff --git a/src/lgdo/types/lgdo.py b/src/lgdo/types/lgdo.py index 4468c54f..a8b10238 100644 --- a/src/lgdo/types/lgdo.py +++ b/src/lgdo/types/lgdo.py @@ -36,7 +36,7 @@ def form_datatype(self) -> str: @abstractmethod def convert( - self, fmt: str = "pandas.DataFrame", copy: bool = False + self, fmt: str = "pandas.DataFrame", with_units: bool = True ) -> pd.DataFrame | np.NDArray | ak.Array: """Convert the data of the LGDO object to a third-party format.""" pass diff --git a/src/lgdo/types/scalar.py b/src/lgdo/types/scalar.py index 236856b4..9db1da76 100644 --- a/src/lgdo/types/scalar.py +++ b/src/lgdo/types/scalar.py @@ -61,7 +61,7 @@ def __repr__(self) -> str: ) def convert( - self, fmt: str = "pandas.DataFrame", copy: bool = False + self, fmt: str = "pandas.DataFrame", with_units: bool = True ) -> pd.DataFrame | np.NDArray | ak.Array: """Convert the data of the Scalar object to a third-party format. Supported options are: @@ -71,9 +71,9 @@ def convert( Not sure why you would need it though ... 
""" if fmt == "pandas.DataFrame": - return pd.DataFrame([self.value], copy=copy) + return pd.DataFrame([self.value]) elif fmt == "numpy.ndarray": - return np.array([self.value], copy=copy) + return np.array([self.value]) elif fmt == "awkward.Array": return ak.Array([self.value]) else: diff --git a/src/lgdo/types/struct.py b/src/lgdo/types/struct.py index f1b417d3..070c97d0 100644 --- a/src/lgdo/types/struct.py +++ b/src/lgdo/types/struct.py @@ -110,7 +110,7 @@ def __repr__(self) -> str: return " ".join(out.replace("\n", " ").split()) def convert( - self, fmt: str = "pandas.DataFrame", copy: bool = False + self, fmt: str = "pandas.DataFrame", with_units: bool = True ) -> pd.DataFrame | np.NDArray | ak.Array: """Convert the data of the Struct object to a third-party format. Supported options are: @@ -123,11 +123,11 @@ def convert( - conversion to awkward array only works when the key is a string and values are of equal length """ if fmt == "pandas.DataFrame": - return pd.DataFrame(self, copy=copy) + return pd.DataFrame(self) elif fmt == "numpy.ndarray": return { - "keys": np.array(list(self.keys()), copy=copy), - "values": np.array(list(self.values()), copy=copy), + "keys": np.array(list(self.keys())), + "values": np.array(list(self.values())), } elif fmt == "awkward.Array": return ak.Array(self) diff --git a/src/lgdo/types/table.py b/src/lgdo/types/table.py index 8f5e98ae..32b8f07c 100644 --- a/src/lgdo/types/table.py +++ b/src/lgdo/types/table.py @@ -353,7 +353,7 @@ def __str__(self): return string def convert( - self, fmt: str = "pandas.DataFrame", copy: bool = False + self, fmt: str = "pandas.DataFrame", with_units: bool = True ) -> pd.DataFrame | np.NDArray | ak.Array: """Convert the data of the Table object to a third-party format. 
Supported options are: @@ -367,11 +367,11 @@ def convert( """ if fmt == "pandas.DataFrame": - return pd.DataFrame(self, copy=copy) + return pd.DataFrame(self) elif fmt == "numpy.ndarray": return { - "keys": np.array(list(self.keys()), copy=copy), - "values": np.array(list(self.values()), copy=copy), + "keys": np.array(list(self.keys())), + "values": np.array(list(self.values())), } elif fmt == "awkward.Array": return ak.Array(self) diff --git a/src/lgdo/types/vectorofvectors.py b/src/lgdo/types/vectorofvectors.py index 7be6c582..da4c372a 100644 --- a/src/lgdo/types/vectorofvectors.py +++ b/src/lgdo/types/vectorofvectors.py @@ -422,7 +422,7 @@ def to_aoesa(self, preserve_dtype: bool = False) -> aoesa.ArrayOfEqualSizedArray return aoesa.ArrayOfEqualSizedArrays(nda=nda, attrs=self.getattrs()) def convert( - self, fmt: str = "pandas.DataFrame", copy: bool = False + self, fmt: str = "pandas.DataFrame", with_units: bool = True ) -> pd.DataFrame | np.NDArray | ak.Array: """Convert the data of the Table object to a third-party format. 
Supported options are: @@ -431,9 +431,9 @@ def convert( "awkward.Array" """ if fmt == "pandas.DataFrame": - return self.to_aoesa().convert("pandas.DataFrame", copy) + return self.to_aoesa().convert("pandas.DataFrame") elif fmt == "numpy.ndarray": - return self.to_aoesa().convert("numpy.ndarray", copy) + return self.to_aoesa().convert("numpy.ndarray") elif fmt == "awkward.Array": lengths_of_individual_vectors = np.diff(self.cumulative_length, prepend=[0]) return ak.unflatten(self.flattened_data, lengths_of_individual_vectors) From 5544e45daff6f2a95886eff7ac37afdabb69ab4c Mon Sep 17 00:00:00 2001 From: Luigi Pertoldi Date: Fri, 24 Nov 2023 15:20:49 +0100 Subject: [PATCH 06/47] Rename convert -> view_as and other fixes (also docs) --- docs/source/conf.py | 1 + setup.cfg | 2 +- src/lgdo/types/array.py | 10 ++-------- src/lgdo/types/arrayofequalsizedarrays.py | 19 +++--------------- src/lgdo/types/encoded.py | 12 ++++++------ src/lgdo/types/fixedsizearray.py | 21 ++------------------ src/lgdo/types/lgdo.py | 24 ++++++++++++++++++++--- src/lgdo/types/scalar.py | 24 +++-------------------- src/lgdo/types/struct.py | 21 ++++++++++---------- src/lgdo/types/table.py | 22 ++++++++++----------- src/lgdo/types/vectorofvectors.py | 13 +++++------- src/lgdo/types/waveform_table.py | 4 ++-- 12 files changed, 68 insertions(+), 105 deletions(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index b9980d88..ab8317e2 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -51,6 +51,7 @@ intersphinx_mapping = { "python": ("https://docs.python.org/3", None), "numpy": ("https://numpy.org/doc/stable", None), + "awkward": ("https://awkward-array.org/doc/stable", None), "numba": ("https://numba.readthedocs.io/en/stable", None), "pandas": ("https://pandas.pydata.org/docs", None), "h5py": ("https://docs.h5py.org/en/stable", None), diff --git a/setup.cfg b/setup.cfg index ba37b8e9..db6a843d 100644 --- a/setup.cfg +++ b/setup.cfg @@ -30,7 +30,7 @@ classifiers = [options] packages 
= find: install_requires = - awkward + awkward>=2 colorlog h5py>=3.2 hdf5plugin diff --git a/src/lgdo/types/array.py b/src/lgdo/types/array.py index cae36076..39c1f9db 100644 --- a/src/lgdo/types/array.py +++ b/src/lgdo/types/array.py @@ -141,15 +141,9 @@ def __repr__(self) -> str: + f", attrs={repr(self.attrs)})" ) - def convert( - self, fmt: str = "pandas.DataFrame", with_units: bool = True + def view_as( + self, fmt: str, with_units: bool = True ) -> pd.DataFrame | np.NDArray | ak.Array: - """Convert the data of the Array object to a third-party format. - Supported options are: - "pandas.DataFrame" - "numpy.ndarray" - "awkward.Array" - """ if fmt == "pandas.DataFrame": return pd.DataFrame(self.nda) elif fmt == "numpy.ndarray": diff --git a/src/lgdo/types/arrayofequalsizedarrays.py b/src/lgdo/types/arrayofequalsizedarrays.py index 9257cf6b..155c6b51 100644 --- a/src/lgdo/types/arrayofequalsizedarrays.py +++ b/src/lgdo/types/arrayofequalsizedarrays.py @@ -134,20 +134,7 @@ def to_vov(self, cumulative_length: np.ndarray = None) -> vov.VectorOfVectors: attrs=attrs, ) - def convert( - self, fmt: str = "pandas.DataFrame", with_units: bool = True + def view_as( + self, fmt: str, with_units: bool = True ) -> pd.DataFrame | np.NDArray | ak.Array: - """Convert the data of the ArrayOfEqualSizedArrays object to a third-party format. 
- Supported options are: - "pandas.DataFrame" - "numpy.ndarray" - "awkward.Array" - """ - if fmt == "pandas.DataFrame": - return pd.DataFrame(self.nda) - elif fmt == "numpy.ndarray": - return self.nda - elif fmt == "awkward.Array": - return ak.Array(self.nda) - else: - raise TypeError(f"{fmt} is not a supported third-party format.") + return super().view_as(fmt, with_units) diff --git a/src/lgdo/types/encoded.py b/src/lgdo/types/encoded.py index 6a2d05e0..1ad3e35d 100644 --- a/src/lgdo/types/encoded.py +++ b/src/lgdo/types/encoded.py @@ -227,11 +227,11 @@ def __repr__(self) -> str: np.set_printoptions(**npopt) return out - def convert( - self, fmt: str = "pandas.DataFrame", with_units: bool = True + def view_as( + self, fmt: str, with_units: bool = True ) -> pd.DataFrame | np.NDArray | ak.Array: raise NotImplementedError( - "'convert' not yet implemented for VectorOfEncodedVectors." + "'view_as' not yet implemented for VectorOfEncodedVectors." ) @@ -398,9 +398,9 @@ def __repr__(self) -> str: np.set_printoptions(**npopt) return out - def convert( - self, fmt: str = "pandas.DataFrame", with_units: bool = True + def view_as( + self, fmt: str, with_units: bool = True ) -> pd.DataFrame | np.NDArray | ak.Array: raise NotImplementedError( - "'convert' not yet implemented for ArrayOfEncodedEqualSizedArrays." + "'view_as' not yet implemented for ArrayOfEncodedEqualSizedArrays." ) diff --git a/src/lgdo/types/fixedsizearray.py b/src/lgdo/types/fixedsizearray.py index 87016729..30de9790 100644 --- a/src/lgdo/types/fixedsizearray.py +++ b/src/lgdo/types/fixedsizearray.py @@ -6,9 +6,7 @@ from typing import Any -import awkward as ak import numpy as np -import pandas as pd from .array import Array @@ -44,20 +42,5 @@ def __init__( def datatype_name(self) -> str: return "fixedsize_array" - def convert( - self, fmt: str = "pandas.DataFrame", with_units: bool = True - ) -> pd.DataFrame | np.NDArray | ak.Array: - """Convert the data of the FixedSizeArray object to a third-party format. 
- Supported options are: - "pandas.DataFrame" - "numpy.ndarray" - "awkward.Array" - """ - if fmt == "pandas.DataFrame": - return pd.DataFrame(self.nda) - elif fmt == "numpy.ndarray": - return self.nda - elif fmt == "awkward.Array": - return ak.Array(self.nda) - else: - raise TypeError(f"{fmt} is not a supported third-party format.") + def view_as(self, fmt: str, with_units: bool = True): + return super.view_as(fmt, with_units) diff --git a/src/lgdo/types/lgdo.py b/src/lgdo/types/lgdo.py index a8b10238..5227b5ec 100644 --- a/src/lgdo/types/lgdo.py +++ b/src/lgdo/types/lgdo.py @@ -35,10 +35,28 @@ def form_datatype(self) -> str: pass @abstractmethod - def convert( - self, fmt: str = "pandas.DataFrame", with_units: bool = True + def view_as( + self, library: str, with_units: bool = True ) -> pd.DataFrame | np.NDArray | ak.Array: - """Convert the data of the LGDO object to a third-party format.""" + """View the LGDO data object as a third-party format data structure. + + This is typically a zero-copy or nearly zero-copy operation unless + explicitly stated in the concrete LGDO documentation. + + Typical supported third-party formats are: + + - ``pd``: :mod:`pandas` + - ``np``: :mod:`numpy` + - ``ak``: :mod:`awkward` + + But the actual supported formats may vary depending on the concrete + LGDO class. + + Parameters + ---------- + library + format of the returned data view. + """ pass def getattrs(self, datatype: bool = False) -> dict: diff --git a/src/lgdo/types/scalar.py b/src/lgdo/types/scalar.py index 9db1da76..eb7aebff 100644 --- a/src/lgdo/types/scalar.py +++ b/src/lgdo/types/scalar.py @@ -5,9 +5,7 @@ import logging from typing import Any -import awkward as ak import numpy as np -import pandas as pd from .. 
import utils as utils from .lgdo import LGDO @@ -44,6 +42,9 @@ def datatype_name(self) -> str: def form_datatype(self) -> str: return self.datatype_name() + def view_as(self, fmt: str, with_units: bool = True): + return self.value + def __eq__(self, other: Scalar) -> bool: if isinstance(other, Scalar): return self.value == other.value and self.attrs == self.attrs @@ -59,22 +60,3 @@ def __repr__(self) -> str: self.__class__.__name__ + f"(value={repr(self.value)}, attrs={repr(self.attrs)})" ) - - def convert( - self, fmt: str = "pandas.DataFrame", with_units: bool = True - ) -> pd.DataFrame | np.NDArray | ak.Array: - """Convert the data of the Scalar object to a third-party format. - Supported options are: - "pandas.DataFrame" - "numpy.ndarray" - "awkward.Array" - Not sure why you would need it though ... - """ - if fmt == "pandas.DataFrame": - return pd.DataFrame([self.value]) - elif fmt == "numpy.ndarray": - return np.array([self.value]) - elif fmt == "awkward.Array": - return ak.Array([self.value]) - else: - raise TypeError(f"{fmt} is not a supported third-party format.") diff --git a/src/lgdo/types/struct.py b/src/lgdo/types/struct.py index 070c97d0..e73d126a 100644 --- a/src/lgdo/types/struct.py +++ b/src/lgdo/types/struct.py @@ -109,18 +109,19 @@ def __repr__(self) -> str: np.set_printoptions(**npopt) return " ".join(out.replace("\n", " ").split()) - def convert( - self, fmt: str = "pandas.DataFrame", with_units: bool = True + def view_as( + self, fmt: str, with_units: bool = True ) -> pd.DataFrame | np.NDArray | ak.Array: """Convert the data of the Struct object to a third-party format. - Supported options are: - "pandas.DataFrame" - "numpy.ndarray" - "awkward.Array" - - Note: - - conversion to ndarray only works when the values are of the equal length, returns a dict containing "keys" and "values" keys for the corresponding NDArray - - conversion to awkward array only works when the key is a string and values are of equal length + Supported options are ... 
+ + Note + ---- + - conversion to ndarray only works when the values are of the equal + length, returns a dict containing "keys" and "values" keys for + the corresponding NDArray + - conversion to awkward array only works when the key is a string + and values are of equal length """ if fmt == "pandas.DataFrame": return pd.DataFrame(self) diff --git a/src/lgdo/types/table.py b/src/lgdo/types/table.py index 32b8f07c..c4001cd1 100644 --- a/src/lgdo/types/table.py +++ b/src/lgdo/types/table.py @@ -352,19 +352,19 @@ def __str__(self): return string - def convert( - self, fmt: str = "pandas.DataFrame", with_units: bool = True + def view_as( + self, fmt: str, with_units: bool = True ) -> pd.DataFrame | np.NDArray | ak.Array: """Convert the data of the Table object to a third-party format. - Supported options are: - "pandas.DataFrame" - "numpy.ndarray" - "awkward.Array" - - Note: - - conversion to ndarray only works when the values are of the equal length, returns a dict containing "keys" and "values" keys for the corresponding NDArray - - conversion to awkward array only works when the key is a string and values are of equal length - + Supported options are ... 
+ + Note + ---- + - conversion to ndarray only works when the values are of the equal + length, returns a dict containing "keys" and "values" keys for + the corresponding NDArray + - conversion to awkward array only works when the key is a string + and values are of equal length """ if fmt == "pandas.DataFrame": return pd.DataFrame(self) diff --git a/src/lgdo/types/vectorofvectors.py b/src/lgdo/types/vectorofvectors.py index da4c372a..a0cf4658 100644 --- a/src/lgdo/types/vectorofvectors.py +++ b/src/lgdo/types/vectorofvectors.py @@ -421,19 +421,16 @@ def to_aoesa(self, preserve_dtype: bool = False) -> aoesa.ArrayOfEqualSizedArray return aoesa.ArrayOfEqualSizedArrays(nda=nda, attrs=self.getattrs()) - def convert( - self, fmt: str = "pandas.DataFrame", with_units: bool = True + def view_as( + self, fmt: str, with_units: bool = True ) -> pd.DataFrame | np.NDArray | ak.Array: """Convert the data of the Table object to a third-party format. - Supported options are: - "pandas.DataFrame" - "numpy.ndarray" - "awkward.Array" + Supported options are ... 
""" if fmt == "pandas.DataFrame": - return self.to_aoesa().convert("pandas.DataFrame") + return self.to_aoesa().view_as("pandas.DataFrame") elif fmt == "numpy.ndarray": - return self.to_aoesa().convert("numpy.ndarray") + return self.to_aoesa().view_as("numpy.ndarray") elif fmt == "awkward.Array": lengths_of_individual_vectors = np.diff(self.cumulative_length, prepend=[0]) return ak.unflatten(self.flattened_data, lengths_of_individual_vectors) diff --git a/src/lgdo/types/waveform_table.py b/src/lgdo/types/waveform_table.py index 131e25d9..9be7db3a 100644 --- a/src/lgdo/types/waveform_table.py +++ b/src/lgdo/types/waveform_table.py @@ -265,7 +265,7 @@ def __str__(self): np.set_printoptions(**npopt) return string - def convert( - self, fmt: str = "pandas.DataFrame" + def view_as( + self, fmt: str, with_units: bool = True ) -> pd.DataFrame | np.NDArray | ak.Array: raise NotImplementedError("'convert' not yet implemented for WaveformTable.") From 00d464e749785ade8daf320e8bf6f1a837fee2d7 Mon Sep 17 00:00:00 2001 From: Neuberger Date: Fri, 24 Nov 2023 15:23:15 +0100 Subject: [PATCH 07/47] working on Struct and Table --- src/lgdo/types/struct.py | 11 +++-------- src/lgdo/types/table.py | 9 ++------- 2 files changed, 5 insertions(+), 15 deletions(-) diff --git a/src/lgdo/types/struct.py b/src/lgdo/types/struct.py index 070c97d0..2c5fd525 100644 --- a/src/lgdo/types/struct.py +++ b/src/lgdo/types/struct.py @@ -115,20 +115,15 @@ def convert( """Convert the data of the Struct object to a third-party format. 
Supported options are: "pandas.DataFrame" - "numpy.ndarray" "awkward.Array" Note: - - conversion to ndarray only works when the values are of the equal length, returns a dict containing "keys" and "values" keys for the corresponding NDArray - - conversion to awkward array only works when the key is a string and values are of equal length + - conversion to ndarray is not supported at the moment as there is no clear way how to wrap the column names and the data into one array. """ if fmt == "pandas.DataFrame": - return pd.DataFrame(self) + return pd.DataFrame(self, copy=False) elif fmt == "numpy.ndarray": - return { - "keys": np.array(list(self.keys())), - "values": np.array(list(self.values())), - } + raise TypeError(f"Format {fmt} is not a supported for Structs.") elif fmt == "awkward.Array": return ak.Array(self) else: diff --git a/src/lgdo/types/table.py b/src/lgdo/types/table.py index 32b8f07c..3c9f51aa 100644 --- a/src/lgdo/types/table.py +++ b/src/lgdo/types/table.py @@ -358,21 +358,16 @@ def convert( """Convert the data of the Table object to a third-party format. Supported options are: "pandas.DataFrame" - "numpy.ndarray" "awkward.Array" Note: - - conversion to ndarray only works when the values are of the equal length, returns a dict containing "keys" and "values" keys for the corresponding NDArray - - conversion to awkward array only works when the key is a string and values are of equal length + - conversion to ndarray is not supported at the moment as there is no clear way how to wrap the column names and the data into one array. 
""" if fmt == "pandas.DataFrame": return pd.DataFrame(self) elif fmt == "numpy.ndarray": - return { - "keys": np.array(list(self.keys())), - "values": np.array(list(self.values())), - } + raise TypeError(f"Format {fmt} is not a supported for Tables.") elif fmt == "awkward.Array": return ak.Array(self) else: From f310f9bce0ad23c8f8cd3f1d8b66328bf07e5366 Mon Sep 17 00:00:00 2001 From: Neuberger Date: Fri, 24 Nov 2023 15:26:08 +0100 Subject: [PATCH 08/47] Merge --- src/lgdo/types/struct.py | 2 +- src/lgdo/types/table.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/lgdo/types/struct.py b/src/lgdo/types/struct.py index d340b36b..efb35d4b 100644 --- a/src/lgdo/types/struct.py +++ b/src/lgdo/types/struct.py @@ -118,7 +118,7 @@ def view_as( Note ---- - - conversion to ndarray is not supported at the moment as there is + - conversion to ndarray is not supported at the moment as there is no clear way how to wrap the column names and the data into one array. - conversion to awkward array only works when the key is a string and values are of equal length diff --git a/src/lgdo/types/table.py b/src/lgdo/types/table.py index 5bb65171..36073cb9 100644 --- a/src/lgdo/types/table.py +++ b/src/lgdo/types/table.py @@ -361,7 +361,7 @@ def view_as( Note ---- - - conversion to ndarray is not supported at the moment as there is + - conversion to ndarray is not supported at the moment as there is no clear way how to wrap the column names and the data into one array. 
- conversion to awkward array only works when the key is a string and values are of equal length From 64cd21326fe9114796a92714e5770b91dce2e03b Mon Sep 17 00:00:00 2001 From: Luigi Pertoldi Date: Fri, 24 Nov 2023 15:31:48 +0100 Subject: [PATCH 09/47] Rename waveform_table module to waveformtable --- src/lgdo/lh5/store.py | 2 +- src/lgdo/types/__init__.py | 2 +- src/lgdo/types/{waveform_table.py => waveformtable.py} | 0 3 files changed, 2 insertions(+), 2 deletions(-) rename src/lgdo/types/{waveform_table.py => waveformtable.py} (100%) diff --git a/src/lgdo/lh5/store.py b/src/lgdo/lh5/store.py index 3c2aa696..8474414f 100644 --- a/src/lgdo/lh5/store.py +++ b/src/lgdo/lh5/store.py @@ -53,7 +53,7 @@ class LH5Store: >>> store = LH5Store() >>> obj, _ = store.read("/geds/waveform", "file.lh5") >>> type(obj) - lgdo.waveform_table.WaveformTable + lgdo.waveformtable.WaveformTable """ def __init__(self, base_path: str = "", keep_open: bool = False) -> None: diff --git a/src/lgdo/types/__init__.py b/src/lgdo/types/__init__.py index 57a18e0a..8b71e19c 100644 --- a/src/lgdo/types/__init__.py +++ b/src/lgdo/types/__init__.py @@ -9,7 +9,7 @@ from .struct import Struct from .table import Table from .vectorofvectors import VectorOfVectors -from .waveform_table import WaveformTable +from .waveformtable import WaveformTable __all__ = [ "Array", diff --git a/src/lgdo/types/waveform_table.py b/src/lgdo/types/waveformtable.py similarity index 100% rename from src/lgdo/types/waveform_table.py rename to src/lgdo/types/waveformtable.py From 2bf36dcb4f09c9f13a601d750f94704c8d889c8f Mon Sep 17 00:00:00 2001 From: Neuberger Date: Fri, 24 Nov 2023 15:43:15 +0100 Subject: [PATCH 10/47] added `view_as` for Struct, Table and WaveformTable --- src/lgdo/types/array.py | 8 ++++---- src/lgdo/types/struct.py | 8 +++++--- src/lgdo/types/table.py | 12 ++++++++---- src/lgdo/types/waveform_table.py | 17 ++++++++++++++++- 4 files changed, 33 insertions(+), 12 deletions(-) diff --git 
a/src/lgdo/types/array.py b/src/lgdo/types/array.py index 39c1f9db..339ce1e4 100644 --- a/src/lgdo/types/array.py +++ b/src/lgdo/types/array.py @@ -144,11 +144,11 @@ def __repr__(self) -> str: def view_as( self, fmt: str, with_units: bool = True ) -> pd.DataFrame | np.NDArray | ak.Array: - if fmt == "pandas.DataFrame": - return pd.DataFrame(self.nda) - elif fmt == "numpy.ndarray": + if fmt == "pd": + return pd.DataFrame(self.nda, copy=False) + elif fmt == "np": return self.nda - elif fmt == "awkward.Array": + elif fmt == "ak": return ak.Array(self.nda) else: raise TypeError(f"{fmt} is not a supported third-party format.") diff --git a/src/lgdo/types/struct.py b/src/lgdo/types/struct.py index efb35d4b..7fed7fb0 100644 --- a/src/lgdo/types/struct.py +++ b/src/lgdo/types/struct.py @@ -115,6 +115,8 @@ def view_as( """Convert the data of the Struct object to a third-party format. Supported options are ... + - ``pd``: :mod:`pandas` + - ``ak``: :mod:`awkward` Note ---- @@ -123,11 +125,11 @@ def view_as( - conversion to awkward array only works when the key is a string and values are of equal length """ - if fmt == "pandas.DataFrame": + if fmt == "pd": return pd.DataFrame(self, copy=False) - elif fmt == "numpy.ndarray": + elif fmt == "np": raise TypeError(f"Format {fmt} is not a supported for Structs.") - elif fmt == "awkward.Array": + elif fmt == "ak": return ak.Array(self) else: raise TypeError(f"{fmt} is not a supported third-party format.") diff --git a/src/lgdo/types/table.py b/src/lgdo/types/table.py index 36073cb9..7d436fc9 100644 --- a/src/lgdo/types/table.py +++ b/src/lgdo/types/table.py @@ -359,6 +359,10 @@ def view_as( Supported options are ... + Supported options are ... 
+ - ``pd``: :mod:`pandas` + - ``ak``: :mod:`awkward` + Note ---- - conversion to ndarray is not supported at the moment as there is @@ -366,11 +370,11 @@ def view_as( - conversion to awkward array only works when the key is a string and values are of equal length """ - if fmt == "pandas.DataFrame": - return pd.DataFrame(self) - elif fmt == "numpy.ndarray": + if fmt == "pd": + return pd.DataFrame(self, copy=False) + elif fmt == "np": raise TypeError(f"Format {fmt} is not a supported for Tables.") - elif fmt == "awkward.Array": + elif fmt == "ak": return ak.Array(self) else: raise TypeError(f"{fmt} is not a supported third-party format.") diff --git a/src/lgdo/types/waveform_table.py b/src/lgdo/types/waveform_table.py index 9be7db3a..4f5147b6 100644 --- a/src/lgdo/types/waveform_table.py +++ b/src/lgdo/types/waveform_table.py @@ -268,4 +268,19 @@ def __str__(self): def view_as( self, fmt: str, with_units: bool = True ) -> pd.DataFrame | np.NDArray | ak.Array: - raise NotImplementedError("'convert' not yet implemented for WaveformTable.") + """Convert the data of the WaveformTable object to a third-party format. + + Supported options are ... + + Supported options are ... + - ``pd``: :mod:`pandas` + - ``ak``: :mod:`awkward` + + Note + ---- + - The implementation of `view_as' for WaveformTable is just a wrapper + for the implementation in Table. 
+ """ + return super().view_as(fmt, with_units) + + # raise NotImplementedError("'convert' not yet implemented for WaveformTable.") From fbdd04b09b122d93819dfd3ba061238a7caecba8 Mon Sep 17 00:00:00 2001 From: Luigi Pertoldi Date: Fri, 24 Nov 2023 18:41:16 +0100 Subject: [PATCH 11/47] Add awkward-pandas and pint-pandas to requirements --- setup.cfg | 2 ++ tests/types/{test_waveform_table.py => test_waveformtable.py} | 0 2 files changed, 2 insertions(+) rename tests/types/{test_waveform_table.py => test_waveformtable.py} (100%) diff --git a/setup.cfg b/setup.cfg index db6a843d..638ca2ef 100644 --- a/setup.cfg +++ b/setup.cfg @@ -31,6 +31,7 @@ classifiers = packages = find: install_requires = awkward>=2 + awkward-pandas colorlog h5py>=3.2 hdf5plugin @@ -40,6 +41,7 @@ install_requires = pandas>=1.4.4 parse pint + pint-pandas python_requires = >=3.9 include_package_data = True package_dir = diff --git a/tests/types/test_waveform_table.py b/tests/types/test_waveformtable.py similarity index 100% rename from tests/types/test_waveform_table.py rename to tests/types/test_waveformtable.py From 4940ecc26be8afef6a79ec963f5d6db7c2e64f7b Mon Sep 17 00:00:00 2001 From: Luigi Pertoldi Date: Fri, 24 Nov 2023 18:42:08 +0100 Subject: [PATCH 12/47] Add tiny module for physical units --- src/lgdo/units.py | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 src/lgdo/units.py diff --git a/src/lgdo/units.py b/src/lgdo/units.py new file mode 100644 index 00000000..7fba10af --- /dev/null +++ b/src/lgdo/units.py @@ -0,0 +1,6 @@ +from __future__ import annotations + +import pint + +default_units_registry = pint.get_application_registry() +default_units_registry.default_format = "~P" From 9358f7e16f3422a7f272a7f964b917536e51ae50 Mon Sep 17 00:00:00 2001 From: Luigi Pertoldi Date: Fri, 24 Nov 2023 18:44:24 +0100 Subject: [PATCH 13/47] Implement view_as for array types Need tests for AoESA --- src/lgdo/types/array.py | 41 ++++++++++++++++++----- 
src/lgdo/types/arrayofequalsizedarrays.py | 4 +-- src/lgdo/types/fixedsizearray.py | 4 +-- tests/types/test_array.py | 29 ++++++++++++++++ 4 files changed, 66 insertions(+), 12 deletions(-) diff --git a/src/lgdo/types/array.py b/src/lgdo/types/array.py index 339ce1e4..7f7a26d2 100644 --- a/src/lgdo/types/array.py +++ b/src/lgdo/types/array.py @@ -11,8 +11,10 @@ import awkward as ak import numpy as np import pandas as pd +import pint_pandas # noqa: F401 from .. import utils as utils +from ..units import default_units_registry as u from .lgdo import LGDO log = logging.getLogger(__name__) @@ -142,13 +144,36 @@ def __repr__(self) -> str: ) def view_as( - self, fmt: str, with_units: bool = True + self, library: str, with_units: bool = True ) -> pd.DataFrame | np.NDArray | ak.Array: - if fmt == "pd": - return pd.DataFrame(self.nda, copy=False) - elif fmt == "np": - return self.nda - elif fmt == "ak": - return ak.Array(self.nda) + """View the Array data as a third-party format data structure. + + Parameters + ---------- + library + format of the returned data view. + """ + # TODO: does attaching units imply a copy? 
+ attach_units = with_units and "units" in self.attrs + + if library == "pd": + if attach_units: + return pd.Series( + self.nda, dtype=f"pint[{self.attrs['units']}]", copy=False + ) + else: + return pd.Series(self.nda, copy=False) + elif library == "np": + if attach_units: + return self.nda * u(self.attrs["units"]) + else: + return self.nda + elif library == "ak": + if attach_units: + raise ValueError( + "Pint does not support Awkward yet, you must view the data with_units=False" + ) + else: + return ak.Array(self.nda) else: - raise TypeError(f"{fmt} is not a supported third-party format.") + raise ValueError(f"{library} is not a supported third-party format.") diff --git a/src/lgdo/types/arrayofequalsizedarrays.py b/src/lgdo/types/arrayofequalsizedarrays.py index 155c6b51..42796c20 100644 --- a/src/lgdo/types/arrayofequalsizedarrays.py +++ b/src/lgdo/types/arrayofequalsizedarrays.py @@ -135,6 +135,6 @@ def to_vov(self, cumulative_length: np.ndarray = None) -> vov.VectorOfVectors: ) def view_as( - self, fmt: str, with_units: bool = True + self, library: str, with_units: bool = True ) -> pd.DataFrame | np.NDArray | ak.Array: - return super().view_as(fmt, with_units) + return super().view_as(library, with_units=with_units) diff --git a/src/lgdo/types/fixedsizearray.py b/src/lgdo/types/fixedsizearray.py index 30de9790..0b1d7265 100644 --- a/src/lgdo/types/fixedsizearray.py +++ b/src/lgdo/types/fixedsizearray.py @@ -42,5 +42,5 @@ def __init__( def datatype_name(self) -> str: return "fixedsize_array" - def view_as(self, fmt: str, with_units: bool = True): - return super.view_as(fmt, with_units) + def view_as(self, library: str, with_units: bool = True): + return super.view_as(library, with_units=with_units) diff --git a/tests/types/test_array.py b/tests/types/test_array.py index df1bcd3c..b4972d99 100644 --- a/tests/types/test_array.py +++ b/tests/types/test_array.py @@ -1,4 +1,8 @@ +import awkward as ak import numpy as np +import pandas as pd +import pint +import 
pytest import lgdo.utils as utils from lgdo import Array @@ -37,3 +41,28 @@ def test_insert(): a = Array(np.array([1, 2, 3, 4])) a.insert(2, [-1, -1]) assert a == Array([1, 2, -1, -1, 3, 4]) + + +def test_view(): + a = Array(np.array([1, 2, 3, 4]), attrs={"units": "m"}) + + v = a.view_as("np", with_units=True) + assert isinstance(v, pint.Quantity) + assert v.u == "meter" + assert np.array_equal(v.m, a.nda) + + v = a.view_as("np", with_units=False) + assert isinstance(v, np.ndarray) + + v = a.view_as("pd", with_units=True) + assert isinstance(v, pd.Series) + assert v.dtype == "meter" + + v = a.view_as("pd", with_units=False) + assert v.dtype == "int64" + + v = a.view_as("ak", with_units=False) + assert isinstance(v, ak.Array) + + with pytest.raises(ValueError): + a.view_as("ak", with_units=True) From 0f3ce519f1118be4b7d410261c9d41c767aa48be Mon Sep 17 00:00:00 2001 From: Luigi Pertoldi Date: Fri, 24 Nov 2023 18:45:17 +0100 Subject: [PATCH 14/47] Implement view_as() for VoVs --- src/lgdo/types/vectorofvectors.py | 61 +++++++++++++++++++++++------ tests/types/test_vectorofvectors.py | 39 ++++++++++++++---- 2 files changed, 81 insertions(+), 19 deletions(-) diff --git a/src/lgdo/types/vectorofvectors.py b/src/lgdo/types/vectorofvectors.py index a0cf4658..3d595475 100644 --- a/src/lgdo/types/vectorofvectors.py +++ b/src/lgdo/types/vectorofvectors.py @@ -10,6 +10,7 @@ from typing import Any import awkward as ak +import awkward_pandas as akpd import numba import numpy as np import pandas as pd @@ -422,20 +423,58 @@ def to_aoesa(self, preserve_dtype: bool = False) -> aoesa.ArrayOfEqualSizedArray return aoesa.ArrayOfEqualSizedArrays(nda=nda, attrs=self.getattrs()) def view_as( - self, fmt: str, with_units: bool = True + self, library: str, with_units: bool = True, preserve_dtype: bool = False ) -> pd.DataFrame | np.NDArray | ak.Array: - """Convert the data of the Table object to a third-party format. - Supported options are ... 
+ """View the data as third-party format structure. + + Note + ---- + Awkward array views partially involve memory re-allocation (the + `cumulative_length`s). + + Parameters + ---------- + library + either ``pd``, ``np`` or `ak`. + with_units + forward physical units to the output data. """ - if fmt == "pandas.DataFrame": - return self.to_aoesa().view_as("pandas.DataFrame") - elif fmt == "numpy.ndarray": - return self.to_aoesa().view_as("numpy.ndarray") - elif fmt == "awkward.Array": - lengths_of_individual_vectors = np.diff(self.cumulative_length, prepend=[0]) - return ak.unflatten(self.flattened_data, lengths_of_individual_vectors) + attach_units = with_units and "units" in self.attrs + + if library == "ak": + if attach_units: + raise ValueError( + "Pint does not support Awkward yet, you must view the data with_units=False" + ) + + # cannot avoid making a copy here. we should add the leading 0 to + # cumulative_length inside VectorOfVectors at some point in the + # future + offsets = np.empty( + len(self.cumulative_length) + 1, dtype=self.cumulative_length.dtype + ) + offsets[1:] = self.cumulative_length + offsets[0] = 0 + + layout = ak.contents.ListOffsetArray( + offsets=ak.index.Index(offsets), + content=ak.contents.NumpyArray(self.flattened_data), + ) + return ak.Array(layout) + + if library == "np": + return self.to_aoesa(preserve_dtype=preserve_dtype).view_as( + "np", with_units=with_units + ) + if library == "pd": + if attach_units: + raise ValueError( + "Pint does not support Awkward yet, you must view the data with_units=False" + ) + else: + return akpd.from_awkward(self.view_as("ak")) else: - raise TypeError(f"{fmt} is not a supported third-party format.") + raise ValueError(f"{library} is not a supported third-party format.") def build_cl( diff --git a/tests/types/test_vectorofvectors.py b/tests/types/test_vectorofvectors.py index 71c20ea8..6155c7eb 100644 --- a/tests/types/test_vectorofvectors.py +++ b/tests/types/test_vectorofvectors.py @@ -1,4 +1,6 
@@ +import awkward as ak import numpy as np +import pint import pytest import lgdo @@ -219,17 +221,11 @@ def test_replace(lgdo_vov): def test_iter(lgdo_vov): - desired = [ - np.array([1, 2]), - np.array([3, 4, 5]), - np.array([2]), - np.array([4, 8, 9, 7]), - np.array([5, 3, 1]), - ] + desired = [[1, 2], [3, 4, 5], [2], [4, 8, 9, 7], [5, 3, 1]] c = 0 for v in lgdo_vov: - assert (v == desired[c]).all() + assert np.array_equal(v, desired[c]) c += 1 @@ -270,3 +266,30 @@ def test_build_cl_and_explodes(): def test_copy(lgdo_vov): assert lgdo_vov == utils.copy(lgdo_vov) + + +def test_view(lgdo_vov): + lgdo_vov.attrs["units"] = "s" + with pytest.raises(ValueError): + lgdo_vov.view_as("ak") + + ak_arr = lgdo_vov.view_as("ak", with_units=False) + + assert isinstance(ak_arr, ak.Array) + assert len(ak_arr) == len(lgdo_vov) + assert ak.all(ak_arr == [[1, 2], [3, 4, 5], [2], [4, 8, 9, 7], [5, 3, 1]]) + + np_arr = lgdo_vov.view_as("np", with_units=True) + assert isinstance(np_arr, pint.Quantity) + assert np_arr.u == "second" + assert isinstance(np_arr.m, np.ndarray) + + np_arr = lgdo_vov.view_as("np", with_units=False) + assert isinstance(np_arr, np.ndarray) + assert np.issubdtype(np_arr.dtype, np.floating) + + np_arr = lgdo_vov.view_as("np", with_units=False, preserve_dtype=True) + assert np.issubdtype(np_arr.dtype, np.integer) + + np_arr = lgdo_vov.view_as("pd", with_units=True) + assert isinstance(np_arr, pint.Quantity) From f562fb73b736c30c3109face6df5938cd2b0337f Mon Sep 17 00:00:00 2001 From: Luigi Pertoldi Date: Fri, 24 Nov 2023 18:47:51 +0100 Subject: [PATCH 15/47] Switch attaching units to off by default They might result in poor performance when computing with large data sets? 
https://pint.readthedocs.io/en/stable/advanced/performance.html --- src/lgdo/types/array.py | 2 +- src/lgdo/types/arrayofequalsizedarrays.py | 2 +- src/lgdo/types/encoded.py | 4 ++-- src/lgdo/types/fixedsizearray.py | 2 +- src/lgdo/types/lgdo.py | 2 +- src/lgdo/types/scalar.py | 2 +- src/lgdo/types/struct.py | 2 +- src/lgdo/types/table.py | 2 +- src/lgdo/types/vectorofvectors.py | 2 +- src/lgdo/types/waveformtable.py | 2 +- 10 files changed, 11 insertions(+), 11 deletions(-) diff --git a/src/lgdo/types/array.py b/src/lgdo/types/array.py index 7f7a26d2..2dcf032a 100644 --- a/src/lgdo/types/array.py +++ b/src/lgdo/types/array.py @@ -144,7 +144,7 @@ def __repr__(self) -> str: ) def view_as( - self, library: str, with_units: bool = True + self, library: str, with_units: bool = False ) -> pd.DataFrame | np.NDArray | ak.Array: """View the Array data as a third-party format data structure. diff --git a/src/lgdo/types/arrayofequalsizedarrays.py b/src/lgdo/types/arrayofequalsizedarrays.py index 42796c20..1b7a6eb6 100644 --- a/src/lgdo/types/arrayofequalsizedarrays.py +++ b/src/lgdo/types/arrayofequalsizedarrays.py @@ -135,6 +135,6 @@ def to_vov(self, cumulative_length: np.ndarray = None) -> vov.VectorOfVectors: ) def view_as( - self, library: str, with_units: bool = True + self, library: str, with_units: bool = False ) -> pd.DataFrame | np.NDArray | ak.Array: return super().view_as(library, with_units=with_units) diff --git a/src/lgdo/types/encoded.py b/src/lgdo/types/encoded.py index 1ad3e35d..0c91c8ca 100644 --- a/src/lgdo/types/encoded.py +++ b/src/lgdo/types/encoded.py @@ -228,7 +228,7 @@ def __repr__(self) -> str: return out def view_as( - self, fmt: str, with_units: bool = True + self, fmt: str, with_units: bool = False ) -> pd.DataFrame | np.NDArray | ak.Array: raise NotImplementedError( "'view_as' not yet implemented for VectorOfEncodedVectors." 
@@ -399,7 +399,7 @@ def __repr__(self) -> str: return out def view_as( - self, fmt: str, with_units: bool = True + self, fmt: str, with_units: bool = False ) -> pd.DataFrame | np.NDArray | ak.Array: raise NotImplementedError( "'view_as' not yet implemented for ArrayOfEncodedEqualSizedArrays." diff --git a/src/lgdo/types/fixedsizearray.py b/src/lgdo/types/fixedsizearray.py index 0b1d7265..575a8eb7 100644 --- a/src/lgdo/types/fixedsizearray.py +++ b/src/lgdo/types/fixedsizearray.py @@ -42,5 +42,5 @@ def __init__( def datatype_name(self) -> str: return "fixedsize_array" - def view_as(self, library: str, with_units: bool = True): + def view_as(self, library: str, with_units: bool = False): return super.view_as(library, with_units=with_units) diff --git a/src/lgdo/types/lgdo.py b/src/lgdo/types/lgdo.py index 5227b5ec..dfccf8e6 100644 --- a/src/lgdo/types/lgdo.py +++ b/src/lgdo/types/lgdo.py @@ -36,7 +36,7 @@ def form_datatype(self) -> str: @abstractmethod def view_as( - self, library: str, with_units: bool = True + self, library: str, with_units: bool = False ) -> pd.DataFrame | np.NDArray | ak.Array: """View the LGDO data object as a third-party format data structure. 
diff --git a/src/lgdo/types/scalar.py b/src/lgdo/types/scalar.py index eb7aebff..a18a314c 100644 --- a/src/lgdo/types/scalar.py +++ b/src/lgdo/types/scalar.py @@ -42,7 +42,7 @@ def datatype_name(self) -> str: def form_datatype(self) -> str: return self.datatype_name() - def view_as(self, fmt: str, with_units: bool = True): + def view_as(self, fmt: str, with_units: bool = False): return self.value def __eq__(self, other: Scalar) -> bool: diff --git a/src/lgdo/types/struct.py b/src/lgdo/types/struct.py index 7fed7fb0..08af7a2b 100644 --- a/src/lgdo/types/struct.py +++ b/src/lgdo/types/struct.py @@ -110,7 +110,7 @@ def __repr__(self) -> str: return " ".join(out.replace("\n", " ").split()) def view_as( - self, fmt: str, with_units: bool = True + self, fmt: str, with_units: bool = False ) -> pd.DataFrame | np.NDArray | ak.Array: """Convert the data of the Struct object to a third-party format. diff --git a/src/lgdo/types/table.py b/src/lgdo/types/table.py index 7d436fc9..cf4f7708 100644 --- a/src/lgdo/types/table.py +++ b/src/lgdo/types/table.py @@ -353,7 +353,7 @@ def __str__(self): return string def view_as( - self, fmt: str, with_units: bool = True + self, fmt: str, with_units: bool = False ) -> pd.DataFrame | np.NDArray | ak.Array: """Convert the data of the Table object to a third-party format. diff --git a/src/lgdo/types/vectorofvectors.py b/src/lgdo/types/vectorofvectors.py index 3d595475..8a7f517b 100644 --- a/src/lgdo/types/vectorofvectors.py +++ b/src/lgdo/types/vectorofvectors.py @@ -423,7 +423,7 @@ def to_aoesa(self, preserve_dtype: bool = False) -> aoesa.ArrayOfEqualSizedArray return aoesa.ArrayOfEqualSizedArrays(nda=nda, attrs=self.getattrs()) def view_as( - self, library: str, with_units: bool = True, preserve_dtype: bool = False + self, library: str, with_units: bool = False, preserve_dtype: bool = False ) -> pd.DataFrame | np.NDArray | ak.Array: """View the data as third-party format structure. 
diff --git a/src/lgdo/types/waveformtable.py b/src/lgdo/types/waveformtable.py index 4f5147b6..421bee0d 100644 --- a/src/lgdo/types/waveformtable.py +++ b/src/lgdo/types/waveformtable.py @@ -266,7 +266,7 @@ def __str__(self): return string def view_as( - self, fmt: str, with_units: bool = True + self, fmt: str, with_units: bool = False ) -> pd.DataFrame | np.NDArray | ak.Array: """Convert the data of the WaveformTable object to a third-party format. From fd4c9bdcaacde427af1b9fbabb83d08f6c404605 Mon Sep 17 00:00:00 2001 From: Luigi Pertoldi Date: Fri, 24 Nov 2023 18:55:24 +0100 Subject: [PATCH 16/47] Fix failing test --- tests/types/test_vectorofvectors.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tests/types/test_vectorofvectors.py b/tests/types/test_vectorofvectors.py index 6155c7eb..66f9814b 100644 --- a/tests/types/test_vectorofvectors.py +++ b/tests/types/test_vectorofvectors.py @@ -1,5 +1,7 @@ import awkward as ak +import awkward_pandas as akpd import numpy as np +import pandas as pd import pint import pytest @@ -271,7 +273,7 @@ def test_copy(lgdo_vov): def test_view(lgdo_vov): lgdo_vov.attrs["units"] = "s" with pytest.raises(ValueError): - lgdo_vov.view_as("ak") + lgdo_vov.view_as("ak", with_units=True) ak_arr = lgdo_vov.view_as("ak", with_units=False) @@ -291,5 +293,6 @@ def test_view(lgdo_vov): np_arr = lgdo_vov.view_as("np", with_units=False, preserve_dtype=True) assert np.issubdtype(np_arr.dtype, np.integer) - np_arr = lgdo_vov.view_as("pd", with_units=True) - assert isinstance(np_arr, pint.Quantity) + np_arr = lgdo_vov.view_as("pd", with_units=False) + assert isinstance(np_arr, pd.Series) + assert isinstance(np_arr.ak, akpd.accessor.AwkwardAccessor) From b98afde0c725a245e15c4f79657b522544d04885 Mon Sep 17 00:00:00 2001 From: Luigi Pertoldi Date: Fri, 24 Nov 2023 19:02:09 +0100 Subject: [PATCH 17/47] Fix docstrings --- src/lgdo/types/vectorofvectors.py | 6 +++--- src/lgdo/types/waveformtable.py | 16 +--------------- 2 
files changed, 4 insertions(+), 18 deletions(-) diff --git a/src/lgdo/types/vectorofvectors.py b/src/lgdo/types/vectorofvectors.py index 8a7f517b..65b3178a 100644 --- a/src/lgdo/types/vectorofvectors.py +++ b/src/lgdo/types/vectorofvectors.py @@ -425,17 +425,17 @@ def to_aoesa(self, preserve_dtype: bool = False) -> aoesa.ArrayOfEqualSizedArray def view_as( self, library: str, with_units: bool = False, preserve_dtype: bool = False ) -> pd.DataFrame | np.NDArray | ak.Array: - """View the data as third-party format structure. + r"""View the data as third-party format structure. Note ---- Awkward array views partially involve memory re-allocation (the - `cumulative_length`s). + `cumulative_length`\ s). Parameters ---------- library - either ``pd``, ``np`` or `ak`. + either ``pd``, ``np`` or ``ak``. with_units forward physical units to the output data. """ diff --git a/src/lgdo/types/waveformtable.py b/src/lgdo/types/waveformtable.py index 421bee0d..b768d9ed 100644 --- a/src/lgdo/types/waveformtable.py +++ b/src/lgdo/types/waveformtable.py @@ -268,19 +268,5 @@ def __str__(self): def view_as( self, fmt: str, with_units: bool = False ) -> pd.DataFrame | np.NDArray | ak.Array: - """Convert the data of the WaveformTable object to a third-party format. - - Supported options are ... - - Supported options are ... - - ``pd``: :mod:`pandas` - - ``ak``: :mod:`awkward` - - Note - ---- - - The implementation of `view_as' for WaveformTable is just a wrapper - for the implementation in Table. 
- """ + """View WaveformTable object as a third-party format data structure.""" return super().view_as(fmt, with_units) - - # raise NotImplementedError("'convert' not yet implemented for WaveformTable.") From ad802d8c1f816eb27ca14e226b2eb6705954dedc Mon Sep 17 00:00:00 2001 From: Neuberger Date: Thu, 30 Nov 2023 11:11:28 +0100 Subject: [PATCH 18/47] merge while wip on table.py --- src/lgdo/types/table.py | 89 ++++++++++++++++++++++++++++------------- 1 file changed, 62 insertions(+), 27 deletions(-) diff --git a/src/lgdo/types/table.py b/src/lgdo/types/table.py index 7d436fc9..9f3bbfab 100644 --- a/src/lgdo/types/table.py +++ b/src/lgdo/types/table.py @@ -7,6 +7,7 @@ import logging import re from typing import Any +from warnings import warn import awkward as ak import numexpr as ne @@ -207,31 +208,14 @@ def get_dataframe( The prefix to be added to the column names. Used when recursively getting the dataframe of a Table inside this Table """ - df = pd.DataFrame(copy=copy) - if cols is None: - cols = self.keys() - for col in cols: - if isinstance(self[col], Table): - sub_df = self[col].get_dataframe(prefix=f"{prefix}{col}_") - if df.empty: - df = sub_df - else: - df = df.join(sub_df) - else: - if isinstance(self[col], VectorOfVectors): - column = self[col].to_aoesa() - else: - column = self[col] - - if not hasattr(column, "nda"): - raise ValueError(f"column {col} does not have an nda") - else: - if len(column.nda.shape) == 1: - df[prefix + str(col)] = column.nda - else: - df[prefix + str(col)] = column.nda.tolist() - - return df + warn( + "`get_dataframe` is deprecated and will be removed in a future release. " + "Instead use `view_as` to get the Table data as a pandas dataframe " + "or awkward Array. 
", + DeprecationWarning, + stacklevel=2, + ) + return self.view_as(fmt="pd", cols=cols, prefix=prefix) def eval(self, expr_config: dict) -> Table: """Apply column operations to the table and return a new table holding @@ -353,7 +337,11 @@ def __str__(self): return string def view_as( - self, fmt: str, with_units: bool = True + self, + fmt: str, + with_units: bool = True, + cols: list[str] = None, + prefix: str = "", ) -> pd.DataFrame | np.NDArray | ak.Array: """Convert the data of the Table object to a third-party format. @@ -371,10 +359,57 @@ def view_as( and values are of equal length """ if fmt == "pd": - return pd.DataFrame(self, copy=False) + return _view_table_as_pd(self, cols=cols, prefix=prefix) elif fmt == "np": raise TypeError(f"Format {fmt} is not a supported for Tables.") elif fmt == "ak": return ak.Array(self) else: raise TypeError(f"{fmt} is not a supported third-party format.") + + +def _view_table_as_pd( + table: Table, cols: list[str] = None, copy: bool = False, prefix: str = "" +) -> pd.DataFrame: + """Get a :class:`pandas.DataFrame` from the data in the table. + + Notes + ----- + The requested data must be array-like, with the ``nda`` attribute. + + Parameters + ---------- + cols + a list of column names specifying the subset of the table's columns + to be added to the dataframe. + copy + When ``True``, the dataframe allocates new memory and copies data + into it. Otherwise, the raw ``nda``'s from the table are used directly. + prefix + The prefix to be added to the column names. 
Used when recursively getting the + dataframe of a Table inside this Table + """ + df = pd.DataFrame(copy=copy) + if cols is None: + cols = table.keys() + for col in cols: + if isinstance(table[col], Table): + sub_df = _view_table_as_pd(table[col], prefix=f"{prefix}{col}_") + if df.empty: + df = sub_df + else: + df = df.join(sub_df) + else: + column = table[col] + + if not isinstance(column, VectorOfVectors) and not hasattr(column, "nda"): + raise ValueError(f"column {col} does not have an nda") + elif isinstance(column, VectorOfVectors): + df = df.join(ak.to_dataframe(column.view_as("ak"))) + else: + if len(column.nda.shape) == 1: + df[prefix + str(col)] = column.nda + else: + df[prefix + str(col)] = column.nda.tolist() + + return df From 7857cb3373501c69d2c8b140bb28ed7c9911fb17 Mon Sep 17 00:00:00 2001 From: Neuberger Date: Thu, 30 Nov 2023 12:12:30 +0100 Subject: [PATCH 19/47] fixed tests to not use get_dataframe anymore as it got a deprication warning --- tests/types/test_table.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/types/test_table.py b/tests/types/test_table.py index 1c82c350..4beb422d 100644 --- a/tests/types/test_table.py +++ b/tests/types/test_table.py @@ -84,7 +84,7 @@ def test_join(): assert list(tbl2.keys()) == ["c", "d", "a"] -def test_get_dataframe(): +def test_view_as_pd(): tbl = Table(4) tbl.add_column("a", lgdo.Array(np.array([1, 2, 3]))) tbl.add_column("b", lgdo.Array(np.array([[0, 1, 2], [3, 4, 5], [6, 7, 8]]))) @@ -104,7 +104,7 @@ def test_get_dataframe(): } ), ) - df = tbl.get_dataframe() + df = tbl.view_as("pd") assert isinstance(df, pd.DataFrame) assert list(df.keys()) == ["a", "b", "c", "d_a", "d_b"] From 16db16fbcbf05c5962d10b413e3da587bc45014c Mon Sep 17 00:00:00 2001 From: Neuberger Date: Thu, 30 Nov 2023 12:16:35 +0100 Subject: [PATCH 20/47] at least the get_dataframe errors should be fixed --- src/lgdo/types/table.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/src/lgdo/types/table.py b/src/lgdo/types/table.py index f3e0fb7a..b441b0fa 100644 --- a/src/lgdo/types/table.py +++ b/src/lgdo/types/table.py @@ -320,7 +320,7 @@ def __str__(self): opts["index"] = False try: - string = self.get_dataframe().to_string(**opts) + string = self.view_as("pd").to_string(**opts) except ValueError: string = "Cannot print Table with VectorOfVectors yet!" From 8d1e59bbe5c70b73fe077677bee9e160e98e3081 Mon Sep 17 00:00:00 2001 From: Neuberger Date: Thu, 30 Nov 2023 14:20:37 +0100 Subject: [PATCH 21/47] implemented with_units option for view_as of Table, added akpd transformation for arrays with ndim>1 in Array.py --- src/lgdo/types/array.py | 17 +++++++++++++---- src/lgdo/types/table.py | 38 ++++++++++++++++++++++++++++++-------- 2 files changed, 43 insertions(+), 12 deletions(-) diff --git a/src/lgdo/types/array.py b/src/lgdo/types/array.py index 2dcf032a..095a5db6 100644 --- a/src/lgdo/types/array.py +++ b/src/lgdo/types/array.py @@ -9,6 +9,7 @@ from typing import Any import awkward as ak +import awkward_pandas as akpd import numpy as np import pandas as pd import pint_pandas # noqa: F401 @@ -158,11 +159,19 @@ def view_as( if library == "pd": if attach_units: - return pd.Series( - self.nda, dtype=f"pint[{self.attrs['units']}]", copy=False - ) + if self.nda.ndim == 1: + return pd.Series( + self.nda, dtype=f"pint[{self.attrs['units']}]", copy=False + ) + else: + raise ValueError( + "view_as() for ndarrays uses awkard_pandas to convert to pandas which does not support Pint. 
You must view the data with_units=False" + ) else: - return pd.Series(self.nda, copy=False) + if self.nda.ndim == 1: + return pd.Series(self.nda, copy=False) + else: + return akpd.from_awkward(self.view_as("ak")) elif library == "np": if attach_units: return self.nda * u(self.attrs["units"]) diff --git a/src/lgdo/types/table.py b/src/lgdo/types/table.py index b441b0fa..687ff9d3 100644 --- a/src/lgdo/types/table.py +++ b/src/lgdo/types/table.py @@ -359,7 +359,9 @@ def view_as( and values are of equal length """ if library == "pd": - return _view_table_as_pd(self, cols=cols, prefix=prefix) + return _view_table_as_pd( + self, cols=cols, prefix=prefix, with_units=with_units + ) elif library == "np": raise TypeError(f"Format {library} is not a supported for Tables.") elif library == "ak": @@ -369,7 +371,11 @@ def view_as( def _view_table_as_pd( - table: Table, cols: list[str] = None, copy: bool = False, prefix: str = "" + table: Table, + cols: list[str] = None, + copy: bool = False, + prefix: str = "", + with_units: bool = False, ) -> pd.DataFrame: """Get a :class:`pandas.DataFrame` from the data in the table. 
@@ -396,17 +402,33 @@ def _view_table_as_pd( column = table[col] if isinstance(column, Array) or isinstance(column, VectorOfVectors): if df.empty: - df = pd.DataFrame(column.view_as("pd").rename(prefix + str(col))) + df = pd.DataFrame( + column.view_as("pd", with_units=with_units).rename( + prefix + str(col) + ) + ) else: - df = df.join(column.view_as("pd").rename(prefix + str(col))) + df = df.join( + column.view_as("pd", with_units=with_units).rename( + prefix + str(col) + ) + ) elif isinstance(column, Table): if df.empty: - df = column.view_as("pd", prefix=f"{prefix}{col}_") + df = column.view_as( + "pd", prefix=f"{prefix}{col}_", with_units=with_units + ) else: - df = df.join(column.view_as("pd", prefix=f"{prefix}{col}_")) + df = df.join( + column.view_as( + "pd", prefix=f"{prefix}{col}_", with_units=with_units + ) + ) else: if df.empty: - df[prefix + str(col)] = column.view_as("pd") + df[prefix + str(col)] = column.view_as("pd", with_units=with_units) else: - df[prefix + str(col)] = df.join(column.view_as("pd")) + df[prefix + str(col)] = df.join( + column.view_as("pd", with_units=with_units) + ) return df From 64ab2dde67a37cf78b4a839adbce882db78fff74 Mon Sep 17 00:00:00 2001 From: Neuberger Date: Thu, 30 Nov 2023 14:38:59 +0100 Subject: [PATCH 22/47] misc small bug fixes --- src/lgdo/types/array.py | 2 +- src/lgdo/types/table.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/src/lgdo/types/array.py b/src/lgdo/types/array.py index 095a5db6..b24ec12f 100644 --- a/src/lgdo/types/array.py +++ b/src/lgdo/types/array.py @@ -165,7 +165,7 @@ def view_as( ) else: raise ValueError( - "view_as() for ndarrays uses awkard_pandas to convert to pandas which does not support Pint. 
You must view the data with_units=False" + "Pint does not support Awkward yet, you must view the data with_units=False" ) else: if self.nda.ndim == 1: diff --git a/src/lgdo/types/table.py b/src/lgdo/types/table.py index 687ff9d3..b40a978b 100644 --- a/src/lgdo/types/table.py +++ b/src/lgdo/types/table.py @@ -373,7 +373,6 @@ def view_as( def _view_table_as_pd( table: Table, cols: list[str] = None, - copy: bool = False, prefix: str = "", with_units: bool = False, ) -> pd.DataFrame: From e2fcc9c6c7046c4e7a8671b3c021e82e15d97375 Mon Sep 17 00:00:00 2001 From: Neuberger Date: Thu, 30 Nov 2023 15:06:33 +0100 Subject: [PATCH 23/47] implemented aoesa tests for view_as --- src/lgdo/types/table.py | 7 ++++- tests/types/test_arrayofequalsizedarrays.py | 31 +++++++++++++++++++++ 2 files changed, 37 insertions(+), 1 deletion(-) diff --git a/src/lgdo/types/table.py b/src/lgdo/types/table.py index b40a978b..893fdaab 100644 --- a/src/lgdo/types/table.py +++ b/src/lgdo/types/table.py @@ -365,7 +365,12 @@ def view_as( elif library == "np": raise TypeError(f"Format {library} is not a supported for Tables.") elif library == "ak": - return ak.Array(self) + if with_units: + raise ValueError( + "Pint does not support Awkward yet, you must view the data with_units=False" + ) + else: + return ak.Array(self) else: raise TypeError(f"{library} is not a supported third-party format.") diff --git a/tests/types/test_arrayofequalsizedarrays.py b/tests/types/test_arrayofequalsizedarrays.py index 0e4f957a..43c29b40 100644 --- a/tests/types/test_arrayofequalsizedarrays.py +++ b/tests/types/test_arrayofequalsizedarrays.py @@ -1,4 +1,8 @@ +import awkward as ak import numpy as np +import pandas as pd +import pint +import pytest import lgdo @@ -37,3 +41,30 @@ def test_to_vov(): assert np.array_equal(vov[0], [53, 91]) assert np.array_equal(vov[1], [78, 57, 66]) assert np.array_equal(vov[2], [85]) + + +def test_view(): + aoesa = lgdo.ArrayOfEqualSizedArrays( + nda=np.array([[53, 91, 66, 58, 8], [78, 57, 
66, 88, 73], [85, 99, 86, 68, 53]]), + attrs={"units": "m"}, + ) + + v = aoesa.view_as("np", with_units=True) + assert isinstance(v, pint.Quantity) + assert v.u == "meter" + assert np.array_equal(v.m, aoesa.nda) + + v = aoesa.view_as("np", with_units=False) + assert isinstance(v, np.ndarray) + + v = aoesa.view_as("pd", with_units=False) + assert isinstance(v, pd.Series) + + v = aoesa.view_as("ak", with_units=False) + assert isinstance(v, ak.Array) + + with pytest.raises(ValueError): + aoesa.view_as("pd", with_units=True) + + with pytest.raises(ValueError): + aoesa.view_as("ak", with_units=True) From 52bba6aefecabf3bab041743bdb7c9f45d402bd0 Mon Sep 17 00:00:00 2001 From: Neuberger Date: Thu, 30 Nov 2023 15:31:31 +0100 Subject: [PATCH 24/47] implemented tests for Table view_as --- tests/types/test_table.py | 38 ++++++++++++++++++++++++++++++++++---- 1 file changed, 34 insertions(+), 4 deletions(-) diff --git a/tests/types/test_table.py b/tests/types/test_table.py index 4beb422d..e95b5f4e 100644 --- a/tests/types/test_table.py +++ b/tests/types/test_table.py @@ -1,3 +1,4 @@ +import awkward as ak import numpy as np import pandas as pd import pytest @@ -84,9 +85,9 @@ def test_join(): assert list(tbl2.keys()) == ["c", "d", "a"] -def test_view_as_pd(): +def test_view_as(): tbl = Table(4) - tbl.add_column("a", lgdo.Array(np.array([1, 2, 3]))) + tbl.add_column("a", lgdo.Array(np.array([1, 2, 3]), attrs={"units": "m"})) tbl.add_column("b", lgdo.Array(np.array([[0, 1, 2], [3, 4, 5], [6, 7, 8]]))) tbl.add_column( "c", @@ -99,14 +100,43 @@ def test_view_as_pd(): "d", lgdo.Table( col_dict={ - "a": lgdo.Array(np.array([2, 4, 6, 8])), + "a": lgdo.Array(np.array([2, 4, 6, 8]), attrs={"units": "m"}), "b": lgdo.Array(np.array([[1, 1], [2, 4], [3, 9], [4, 16]])), } ), ) - df = tbl.view_as("pd") + + df = tbl.view_as("pd", with_units=False) + assert isinstance(df, pd.DataFrame) + assert list(df.keys()) == ["a", "b", "c", "d_a", "d_b"] + + df = tbl.view_as("pd", with_units=True) assert 
isinstance(df, pd.DataFrame) assert list(df.keys()) == ["a", "b", "c", "d_a", "d_b"] + assert df["a"].dtype == "meter" + assert df["d_a"].dtype == "meter" + + ak_arr = tbl.view_as("ak", with_units=False) + assert isinstance(ak_arr, ak.Array) + assert list(ak_arr.fields) == ["a", "b", "c", "d"] + + with pytest.raises(ValueError): + tbl.view_as("ak", with_units=True) + + with pytest.raises(TypeError): + tbl.view_as("np") + + tbl.add_column( + "e", + lgdo.VectorOfVectors( + flattened_data=lgdo.Array(np.array([0, 1, 2, 3, 4, 5, 6])), + cumulative_length=lgdo.Array(np.array([3, 4, 7])), + attrs={"units": "m"}, + ), + ) + + with pytest.raises(ValueError): + tbl.view_as("pd", with_units=True) def test_remove_column(): From 7c5eeb14c4ee5ead3ce74b8589d5f6e520ab6263 Mon Sep 17 00:00:00 2001 From: Neuberger Date: Thu, 30 Nov 2023 15:48:48 +0100 Subject: [PATCH 25/47] cleaned up the view_as implementation of Table --- src/lgdo/types/table.py | 28 ++++++++++------------------ 1 file changed, 10 insertions(+), 18 deletions(-) diff --git a/src/lgdo/types/table.py b/src/lgdo/types/table.py index 893fdaab..21ed6078 100644 --- a/src/lgdo/types/table.py +++ b/src/lgdo/types/table.py @@ -405,29 +405,21 @@ def _view_table_as_pd( for col in cols: column = table[col] if isinstance(column, Array) or isinstance(column, VectorOfVectors): + tmp_ser = column.view_as("pd", with_units=with_units).rename( + prefix + str(col) + ) if df.empty: - df = pd.DataFrame( - column.view_as("pd", with_units=with_units).rename( - prefix + str(col) - ) - ) + df = pd.DataFrame(tmp_ser) else: - df = df.join( - column.view_as("pd", with_units=with_units).rename( - prefix + str(col) - ) - ) + df = df.join(tmp_ser) elif isinstance(column, Table): + tmp_df = column.view_as( + "pd", prefix=f"{prefix}{col}_", with_units=with_units + ) if df.empty: - df = column.view_as( - "pd", prefix=f"{prefix}{col}_", with_units=with_units - ) + df = tmp_df else: - df = df.join( - column.view_as( - "pd", prefix=f"{prefix}{col}_", 
with_units=with_units - ) - ) + df = df.join(tmp_df) else: if df.empty: df[prefix + str(col)] = column.view_as("pd", with_units=with_units) From b067820864c754bf21df163c05d4d958ca9664eb Mon Sep 17 00:00:00 2001 From: Neuberger Date: Thu, 30 Nov 2023 15:50:01 +0100 Subject: [PATCH 26/47] even more cleaned up the view_as implementation of Table --- src/lgdo/types/table.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lgdo/types/table.py b/src/lgdo/types/table.py index 21ed6078..b07403a1 100644 --- a/src/lgdo/types/table.py +++ b/src/lgdo/types/table.py @@ -404,7 +404,7 @@ def _view_table_as_pd( cols = table.keys() for col in cols: column = table[col] - if isinstance(column, Array) or isinstance(column, VectorOfVectors): + if isinstance(column, Array | VectorOfVectors): tmp_ser = column.view_as("pd", with_units=with_units).rename( prefix + str(col) ) From fcd3f540e824e7de9f04c2f7810c8e17ba7dff23 Mon Sep 17 00:00:00 2001 From: Neuberger Date: Thu, 30 Nov 2023 17:01:08 +0100 Subject: [PATCH 27/47] implemented view_as for voev and aoeesa --- src/lgdo/types/encoded.py | 136 +++++++++++++++++++++++++++++++++++--- 1 file changed, 128 insertions(+), 8 deletions(-) diff --git a/src/lgdo/types/encoded.py b/src/lgdo/types/encoded.py index 0c91c8ca..662edff5 100644 --- a/src/lgdo/types/encoded.py +++ b/src/lgdo/types/encoded.py @@ -4,6 +4,7 @@ from typing import Any import awkward as ak +import awkward_pandas as akpd import numpy as np import pandas as pd from numpy.typing import NDArray @@ -228,11 +229,68 @@ def __repr__(self) -> str: return out def view_as( - self, fmt: str, with_units: bool = False + self, library: str, with_units: bool = False ) -> pd.DataFrame | np.NDArray | ak.Array: - raise NotImplementedError( - "'view_as' not yet implemented for VectorOfEncodedVectors." - ) + r"""View the data as third-party format structure. + + Note + ---- + Awkward array views partially involve memory re-allocation (the + `cumulative_length`\ s). 
+ + Parameters + ---------- + library + either ``pd``, ``np`` or ``ak``. + with_units + forward physical units to the output data. + """ + attach_units = with_units and "units" in self.attrs + + if library == "ak": + if attach_units: + raise ValueError( + "Pint does not support Awkward yet, you must view the data with_units=False" + ) + + # cannot avoid making a copy here. we should add the leading 0 to + # cumulative_length inside VectorOfVectors at some point in the + # future + offsets = np.empty( + len(self.encoded_data.cumulative_length) + 1, + dtype=self.encoded_data.cumulative_length.dtype, + ) + offsets[1:] = self.encoded_data.cumulative_length + offsets[0] = 0 + + layout = ak.contents.ListOffsetArray( + offsets=ak.index.Index(offsets), + content=ak.contents.NumpyArray(self.encoded_data.flattened_data), + ) + + records_list = { + "encoded_data": ak.Array(layout), + "decoded_size": np.array(self.decoded_size), + } + return ak.Array(records_list) + + if library == "np": + raise TypeError(f"Format {library} is not a supported for voev.") + if library == "pd": + if attach_units: + raise ValueError( + "Pint does not support Awkward yet, you must view the data with_units=False" + ) + else: + ak_view = self.view_as("ak") + return pd.DataFrame( + { + "encoded_data": akpd.from_awkward(ak_view["encoded_data"]), + "decoded_size": self.decoded_size, + } + ) + else: + raise ValueError(f"{library} is not a supported third-party format.") class ArrayOfEncodedEqualSizedArrays(LGDO): @@ -399,8 +457,70 @@ def __repr__(self) -> str: return out def view_as( - self, fmt: str, with_units: bool = False + self, library: str, with_units: bool = False ) -> pd.DataFrame | np.NDArray | ak.Array: - raise NotImplementedError( - "'view_as' not yet implemented for ArrayOfEncodedEqualSizedArrays." - ) + r"""View the data as third-party format structure. + + Note + ---- + Awkward array views partially involve memory re-allocation (the + `cumulative_length`\ s). 
+ + Parameters + ---------- + library + either ``pd``, ``np`` or ``ak``. + with_units + forward physical units to the output data. + """ + attach_units = with_units and "units" in self.attrs + + if library == "ak": + if attach_units: + raise ValueError( + "Pint does not support Awkward yet, you must view the data with_units=False" + ) + + # cannot avoid making a copy here. we should add the leading 0 to + # cumulative_length inside VectorOfVectors at some point in the + # future + offsets = np.empty( + len(self.encoded_data.cumulative_length) + 1, + dtype=self.encoded_data.cumulative_length.dtype, + ) + offsets[1:] = self.encoded_data.cumulative_length + offsets[0] = 0 + + layout = ak.contents.ListOffsetArray( + offsets=ak.index.Index(offsets), + content=ak.contents.NumpyArray(self.encoded_data.flattened_data), + ) + + records_list = { + "encoded_data": ak.Array(layout), + "decoded_size": np.full( + len(self.encoded_data.cumulative_length), self.decoded_size.value + ), + } + return ak.Array(records_list) + + if library == "np": + raise TypeError(f"Format {library} is not a supported for voev.") + if library == "pd": + if attach_units: + raise ValueError( + "Pint does not support Awkward yet, you must view the data with_units=False" + ) + else: + ak_view = self.view_as("ak") + return pd.DataFrame( + { + "encoded_data": akpd.from_awkward(ak_view["encoded_data"]), + "decoded_size": np.full( + len(self.encoded_data.cumulative_length), + self.decoded_size.value, + ), + } + ) + else: + raise ValueError(f"{library} is not a supported third-party format.") From cd48dfbde2d22ae5e6b5d3228c7968f4729ac676 Mon Sep 17 00:00:00 2001 From: Neuberger Date: Thu, 30 Nov 2023 17:04:18 +0100 Subject: [PATCH 28/47] small cleanup --- src/lgdo/types/encoded.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/lgdo/types/encoded.py b/src/lgdo/types/encoded.py index 662edff5..e30ee949 100644 --- a/src/lgdo/types/encoded.py +++ b/src/lgdo/types/encoded.py 
@@ -241,7 +241,7 @@ def view_as( Parameters ---------- library - either ``pd``, ``np`` or ``ak``. + either ``pd`` or ``ak``. with_units forward physical units to the output data. """ @@ -282,10 +282,11 @@ def view_as( "Pint does not support Awkward yet, you must view the data with_units=False" ) else: - ak_view = self.view_as("ak") return pd.DataFrame( { - "encoded_data": akpd.from_awkward(ak_view["encoded_data"]), + "encoded_data": akpd.from_awkward( + self.view_as("ak")["encoded_data"] + ), "decoded_size": self.decoded_size, } ) @@ -469,7 +470,7 @@ def view_as( Parameters ---------- library - either ``pd``, ``np`` or ``ak``. + either ``pd`` or ``ak``. with_units forward physical units to the output data. """ @@ -512,10 +513,11 @@ def view_as( "Pint does not support Awkward yet, you must view the data with_units=False" ) else: - ak_view = self.view_as("ak") return pd.DataFrame( { - "encoded_data": akpd.from_awkward(ak_view["encoded_data"]), + "encoded_data": akpd.from_awkward( + self.view_as("ak")["encoded_data"] + ), "decoded_size": np.full( len(self.encoded_data.cumulative_length), self.decoded_size.value, From 6b4bfbaec388d9e7ffcca3d8858d1fbb3c865b10 Mon Sep 17 00:00:00 2001 From: Neuberger Date: Thu, 30 Nov 2023 17:04:59 +0100 Subject: [PATCH 29/47] typo --- src/lgdo/types/encoded.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lgdo/types/encoded.py b/src/lgdo/types/encoded.py index e30ee949..061b14dc 100644 --- a/src/lgdo/types/encoded.py +++ b/src/lgdo/types/encoded.py @@ -506,7 +506,7 @@ def view_as( return ak.Array(records_list) if library == "np": - raise TypeError(f"Format {library} is not a supported for voev.") + raise TypeError(f"Format {library} is not a supported for aoeesa.") if library == "pd": if attach_units: raise ValueError( From f05958412a11ff723a56b0de7ce6aebc60db3f93 Mon Sep 17 00:00:00 2001 From: Neuberger Date: Thu, 30 Nov 2023 17:24:42 +0100 Subject: [PATCH 30/47] a much easier implementation of view_as for the 
encoded types [credit Luigi] --- src/lgdo/types/encoded.py | 38 ++++---------------------------------- 1 file changed, 4 insertions(+), 34 deletions(-) diff --git a/src/lgdo/types/encoded.py b/src/lgdo/types/encoded.py index 061b14dc..a521729a 100644 --- a/src/lgdo/types/encoded.py +++ b/src/lgdo/types/encoded.py @@ -253,23 +253,8 @@ def view_as( "Pint does not support Awkward yet, you must view the data with_units=False" ) - # cannot avoid making a copy here. we should add the leading 0 to - # cumulative_length inside VectorOfVectors at some point in the - # future - offsets = np.empty( - len(self.encoded_data.cumulative_length) + 1, - dtype=self.encoded_data.cumulative_length.dtype, - ) - offsets[1:] = self.encoded_data.cumulative_length - offsets[0] = 0 - - layout = ak.contents.ListOffsetArray( - offsets=ak.index.Index(offsets), - content=ak.contents.NumpyArray(self.encoded_data.flattened_data), - ) - records_list = { - "encoded_data": ak.Array(layout), + "encoded_data": self.encoded_data.view_as("ak"), "decoded_size": np.array(self.decoded_size), } return ak.Array(records_list) @@ -285,7 +270,7 @@ def view_as( return pd.DataFrame( { "encoded_data": akpd.from_awkward( - self.view_as("ak")["encoded_data"] + self.encoded_data.view_as("ak") ), "decoded_size": self.decoded_size, } @@ -482,23 +467,8 @@ def view_as( "Pint does not support Awkward yet, you must view the data with_units=False" ) - # cannot avoid making a copy here. 
we should add the leading 0 to - # cumulative_length inside VectorOfVectors at some point in the - # future - offsets = np.empty( - len(self.encoded_data.cumulative_length) + 1, - dtype=self.encoded_data.cumulative_length.dtype, - ) - offsets[1:] = self.encoded_data.cumulative_length - offsets[0] = 0 - - layout = ak.contents.ListOffsetArray( - offsets=ak.index.Index(offsets), - content=ak.contents.NumpyArray(self.encoded_data.flattened_data), - ) - records_list = { - "encoded_data": ak.Array(layout), + "encoded_data": self.encoded_data.view_as("ak"), "decoded_size": np.full( len(self.encoded_data.cumulative_length), self.decoded_size.value ), @@ -516,7 +486,7 @@ def view_as( return pd.DataFrame( { "encoded_data": akpd.from_awkward( - self.view_as("ak")["encoded_data"] + self.encoded_data.view_as("ak") ), "decoded_size": np.full( len(self.encoded_data.cumulative_length), From c52599be5f9e63d4b56a74aa42cec8cccc8ed199 Mon Sep 17 00:00:00 2001 From: Neuberger Date: Thu, 30 Nov 2023 17:32:28 +0100 Subject: [PATCH 31/47] the tests might faile because of this pipe? 
--- src/lgdo/types/table.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lgdo/types/table.py b/src/lgdo/types/table.py index b07403a1..21ed6078 100644 --- a/src/lgdo/types/table.py +++ b/src/lgdo/types/table.py @@ -404,7 +404,7 @@ def _view_table_as_pd( cols = table.keys() for col in cols: column = table[col] - if isinstance(column, Array | VectorOfVectors): + if isinstance(column, Array) or isinstance(column, VectorOfVectors): tmp_ser = column.view_as("pd", with_units=with_units).rename( prefix + str(col) ) From 4f26d9866b32442a8e921817065cd6cd7996f3c0 Mon Sep 17 00:00:00 2001 From: Neuberger Date: Fri, 1 Dec 2023 10:04:04 +0100 Subject: [PATCH 32/47] implemented awkward based to_aoesa function including a clipping option --- src/lgdo/types/vectorofvectors.py | 33 +++++++++++++++++-------------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/src/lgdo/types/vectorofvectors.py b/src/lgdo/types/vectorofvectors.py index 65b3178a..8bf1f027 100644 --- a/src/lgdo/types/vectorofvectors.py +++ b/src/lgdo/types/vectorofvectors.py @@ -400,25 +400,28 @@ def __repr__(self) -> str: np.set_printoptions(**npopt) return out - def to_aoesa(self, preserve_dtype: bool = False) -> aoesa.ArrayOfEqualSizedArrays: + def to_aoesa( + self, preserve_dtype: bool = False, max_len: int = None + ) -> aoesa.ArrayOfEqualSizedArrays: """Convert to :class:`ArrayOfEqualSizedArrays`. - - If `preserve_dtype` is False, the output array will have dtype - :class:`numpy.float64` and is padded with :class:`numpy.nan`. - Otherwise, the dtype of the original :class:`VectorOfVectors` is - preserved. + If `preserve_dtype` is ``False``, the output array will have dtype + subtype of :class:`numpy.floating` and is padded with + :class:`numpy.nan`. Otherwise, the dtype of the original + :class:`VectorOfVectors` is preserved and the padded values are left + uninitialized (unless the dtype is already floating-point). 
""" - ind_lengths = np.diff(self.cumulative_length.nda, prepend=0) - arr_len = np.max(ind_lengths) + ak_arr = self.view_as("ak") - if not preserve_dtype: - nda = np.empty((len(self.cumulative_length), arr_len)) - nda.fill(np.nan) - else: - nda = np.empty((len(self.cumulative_length), arr_len), dtype=self.dtype) + if not max_len: + max_len = int(ak.max(ak.count(ak_arr, axis=-1))) + + nda_pad = ak.pad_none(ak_arr, max_len, clip=True).to_numpy() + + if not preserve_dtype and not np.issubdtype(nda_pad.dtype, np.floating): + nda_pad = nda_pad.astype(float) + nda_pad.set_fill_value(np.nan) - for i in range(len(self.cumulative_length)): - nda[i, : ind_lengths[i]] = self[i] + nda = nda_pad.filled() return aoesa.ArrayOfEqualSizedArrays(nda=nda, attrs=self.getattrs()) From dba303f78a031f84357805e2302513f25cd92d28 Mon Sep 17 00:00:00 2001 From: Neuberger Date: Fri, 1 Dec 2023 16:11:37 +0100 Subject: [PATCH 33/47] updated the doc strings --- src/lgdo/lh5/store.py | 4 +- src/lgdo/types/array.py | 14 ++++++- src/lgdo/types/arrayofequalsizedarrays.py | 24 ++++++++++++ src/lgdo/types/encoded.py | 46 +++++++++++++++++------ src/lgdo/types/fixedsizearray.py | 24 ++++++++++++ src/lgdo/types/lgdo.py | 4 +- src/lgdo/types/scalar.py | 18 ++++++++- src/lgdo/types/struct.py | 23 +++++++++--- src/lgdo/types/table.py | 26 +++++++++---- src/lgdo/types/vectorofvectors.py | 22 ++++++++--- src/lgdo/types/waveformtable.py | 23 +++++++++++- 11 files changed, 190 insertions(+), 38 deletions(-) diff --git a/src/lgdo/lh5/store.py b/src/lgdo/lh5/store.py index 8474414f..1f1dfc46 100644 --- a/src/lgdo/lh5/store.py +++ b/src/lgdo/lh5/store.py @@ -890,13 +890,13 @@ def write( `compression` attribute. Note - ---- + ---------- The `compression` LGDO attribute takes precedence over the default HDF5 compression settings. The `hdf5_settings` attribute takes precedence over `compression`. These attributes are not written to disk. 
Note - ---- + ---------- HDF5 compression is skipped for the `encoded_data.flattened_data` dataset of :class:`.VectorOfEncodedVectors` and :class:`.ArrayOfEncodedEqualSizedArrays`. diff --git a/src/lgdo/types/array.py b/src/lgdo/types/array.py index b24ec12f..6cbc69f0 100644 --- a/src/lgdo/types/array.py +++ b/src/lgdo/types/array.py @@ -147,12 +147,24 @@ def __repr__(self) -> str: def view_as( self, library: str, with_units: bool = False ) -> pd.DataFrame | np.NDArray | ak.Array: - """View the Array data as a third-party format data structure. + r"""View the Array data as a third-party format data structure. + + This is typically a zero-copy or nearly zero-copy operation unless + explicitly stated in the concrete LGDO documentation. + + Supported third-party formats are: + + - ``pd``: :mod:`pandas` + - ``np``: :mod:`numpy` + - ``ak``: :mod:`awkward` Parameters ---------- library format of the returned data view. + with_units + forward physical units to the output data. + """ # TODO: does attaching units imply a copy? attach_units = with_units and "units" in self.attrs diff --git a/src/lgdo/types/arrayofequalsizedarrays.py b/src/lgdo/types/arrayofequalsizedarrays.py index 1b7a6eb6..524217f3 100644 --- a/src/lgdo/types/arrayofequalsizedarrays.py +++ b/src/lgdo/types/arrayofequalsizedarrays.py @@ -137,4 +137,28 @@ def to_vov(self, cumulative_length: np.ndarray = None) -> vov.VectorOfVectors: def view_as( self, library: str, with_units: bool = False ) -> pd.DataFrame | np.NDArray | ak.Array: + r"""View the ArrayOfEqualSizedArrays data as a third-party format data structure. + + This is typically a zero-copy or nearly zero-copy operation unless + explicitly stated in the concrete LGDO documentation. + + Supported third-party formats are: + + - ``pd``: :mod:`pandas` + - ``np``: :mod:`numpy` + - ``ak``: :mod:`awkward` + + Notes + ----- + - Pint does not yet support Awkward yet. 
You will need to pass the data with_units=False + in the case of awkward and pandas (as it uses awkward_pandas for conversion). + + Parameters + ---------- + library + format of the returned data view. + with_units + forward physical units to the output data. + + """ return super().view_as(library, with_units=with_units) diff --git a/src/lgdo/types/encoded.py b/src/lgdo/types/encoded.py index a521729a..c8e9b2ce 100644 --- a/src/lgdo/types/encoded.py +++ b/src/lgdo/types/encoded.py @@ -231,17 +231,28 @@ def __repr__(self) -> str: def view_as( self, library: str, with_units: bool = False ) -> pd.DataFrame | np.NDArray | ak.Array: - r"""View the data as third-party format structure. + r"""View the VectorOfEncodedVectors data as a third-party format data structure. - Note - ---- - Awkward array views partially involve memory re-allocation (the - `cumulative_length`\ s). + This is typically a zero-copy or nearly zero-copy operation unless + explicitly stated in the concrete LGDO documentation. + + Supported third-party formats are: + + - ``pd``: :mod:`pandas` + - ``ak``: :mod:`awkward` + + Notes + ----- + - Pint does not yet support Awkward yet. You will need to pass the data with_units=False + in the case of awkward and pandas (as it uses awkward_pandas for conversion). + + - Awkward array views partially involve memory re-allocation (the + `cumulative_length`\ s). Parameters ---------- library - either ``pd`` or ``ak``. + format of the returned data view. with_units forward physical units to the output data. """ @@ -445,17 +456,28 @@ def __repr__(self) -> str: def view_as( self, library: str, with_units: bool = False ) -> pd.DataFrame | np.NDArray | ak.Array: - r"""View the data as third-party format structure. + r"""View the ArrayOfEncodedEqualSizedArrays data as a third-party format data structure. + + This is typically a zero-copy or nearly zero-copy operation unless + explicitly stated in the concrete LGDO documentation. 
+ + Supported third-party formats are: + + - ``pd``: :mod:`pandas` + - ``ak``: :mod:`awkward` + + Notes + ----- + - Pint does not yet support Awkward yet. You will need to pass the data with_units=False + in the case of awkward and pandas (as it uses awkward_pandas for conversion). - Note - ---- - Awkward array views partially involve memory re-allocation (the - `cumulative_length`\ s). + - Awkward array views partially involve memory re-allocation (the + `cumulative_length`\ s). Parameters ---------- library - either ``pd`` or ``ak``. + format of the returned data view. with_units forward physical units to the output data. """ diff --git a/src/lgdo/types/fixedsizearray.py b/src/lgdo/types/fixedsizearray.py index 575a8eb7..a9ca770e 100644 --- a/src/lgdo/types/fixedsizearray.py +++ b/src/lgdo/types/fixedsizearray.py @@ -43,4 +43,28 @@ def datatype_name(self) -> str: return "fixedsize_array" def view_as(self, library: str, with_units: bool = False): + r"""View the FixedSizeArray data as a third-party format data structure. + + This is typically a zero-copy or nearly zero-copy operation unless + explicitly stated in the concrete LGDO documentation. + + Supported third-party formats are: + + - ``pd``: :mod:`pandas` + - ``np``: :mod:`numpy` + - ``ak``: :mod:`awkward` + + Notes + ----- + - Pint does not yet support Awkward yet. You will need to pass the data with_units=False + in the case of awkward and pandas (as it uses awkward_pandas for conversion). + + Parameters + ---------- + library + format of the returned data view. + with_units + forward physical units to the output data. 
+ + """ return super.view_as(library, with_units=with_units) diff --git a/src/lgdo/types/lgdo.py b/src/lgdo/types/lgdo.py index dfccf8e6..5d6cf914 100644 --- a/src/lgdo/types/lgdo.py +++ b/src/lgdo/types/lgdo.py @@ -38,7 +38,7 @@ def form_datatype(self) -> str: def view_as( self, library: str, with_units: bool = False ) -> pd.DataFrame | np.NDArray | ak.Array: - """View the LGDO data object as a third-party format data structure. + r"""View the LGDO data object as a third-party format data structure. This is typically a zero-copy or nearly zero-copy operation unless explicitly stated in the concrete LGDO documentation. @@ -56,6 +56,8 @@ def view_as( ---------- library format of the returned data view. + with_units + forward physical units to the output data. """ pass diff --git a/src/lgdo/types/scalar.py b/src/lgdo/types/scalar.py index a18a314c..1474ce7f 100644 --- a/src/lgdo/types/scalar.py +++ b/src/lgdo/types/scalar.py @@ -42,7 +42,23 @@ def datatype_name(self) -> str: def form_datatype(self) -> str: return self.datatype_name() - def view_as(self, fmt: str, with_units: bool = False): + def view_as(self, library: str, with_units: bool = False): + r"""View the Scalar data object as a third-party format data structure. + + This implementation is very trivial as it only returns the value of the Scalar. + No data conversion is applied. + + Notes + ----- + - as of right now, units are not forwarded. + + Parameters + ---------- + library + format of the returned data view. + with_units + forward physical units to the output data. + """ return self.value def __eq__(self, other: Scalar) -> bool: diff --git a/src/lgdo/types/struct.py b/src/lgdo/types/struct.py index 7237618a..0526ab88 100644 --- a/src/lgdo/types/struct.py +++ b/src/lgdo/types/struct.py @@ -112,18 +112,29 @@ def __repr__(self) -> str: def view_as( self, library: str, with_units: bool = False ) -> pd.DataFrame | np.NDArray | ak.Array: - """Convert the data of the Struct object to a third-party format. 
+ r"""View the Struct data as a third-party format data structure. + + This is typically a zero-copy or nearly zero-copy operation unless + explicitly stated in the concrete LGDO documentation. + + Supported third-party formats are: - Supported options are ... - ``pd``: :mod:`pandas` - ``ak``: :mod:`awkward` - Note - ---- - - conversion to ndarray is not supported at the moment as there is - no clear way how to wrap the column names and the data into one array. + Notes + ----- + - conversion to ndarray is not supported - conversion to awkward array only works when the key is a string and values are of equal length + + Parameters + ---------- + library + format of the returned data view. + with_units + forward physical units to the output data. + """ if library == "pd": return pd.DataFrame(self, copy=False) diff --git a/src/lgdo/types/table.py b/src/lgdo/types/table.py index 21ed6078..11c85754 100644 --- a/src/lgdo/types/table.py +++ b/src/lgdo/types/table.py @@ -343,20 +343,29 @@ def view_as( cols: list[str] = None, prefix: str = "", ) -> pd.DataFrame | np.NDArray | ak.Array: - """Convert the data of the Table object to a third-party format. + r"""View the Table data as a third-party format data structure. - Supported options are ... + This is typically a zero-copy or nearly zero-copy operation unless + explicitly stated in the concrete LGDO documentation. + + Supported third-party formats are: - Supported options are ... - ``pd``: :mod:`pandas` - ``ak``: :mod:`awkward` - Note - ---- - - conversion to ndarray is not supported at the moment as there is - no clear way how to wrap the column names and the data into one array. + Notes + ----- + - conversion to ndarray is not supported - conversion to awkward array only works when the key is a string and values are of equal length + + Parameters + ---------- + library + format of the returned data view. + with_units + forward physical units to the output data. 
+ """ if library == "pd": return _view_table_as_pd( @@ -385,7 +394,8 @@ def _view_table_as_pd( Notes ----- - The requested data must be array-like, with the ``nda`` attribute. + The requested data must be array-like, with the ``nda`` attribute, a VectorOfVectors + or a Table. Parameters ---------- diff --git a/src/lgdo/types/vectorofvectors.py b/src/lgdo/types/vectorofvectors.py index 8bf1f027..ba61b817 100644 --- a/src/lgdo/types/vectorofvectors.py +++ b/src/lgdo/types/vectorofvectors.py @@ -428,19 +428,29 @@ def to_aoesa( def view_as( self, library: str, with_units: bool = False, preserve_dtype: bool = False ) -> pd.DataFrame | np.NDArray | ak.Array: - r"""View the data as third-party format structure. + r"""View the VectorOfVectors data as a third-party format data structure. - Note - ---- - Awkward array views partially involve memory re-allocation (the - `cumulative_length`\ s). + This is typically a zero-copy or nearly zero-copy operation unless + explicitly stated in the concrete LGDO documentation. + + Supported third-party formats are: + + - ``pd``: :mod:`pandas` + - ``np``: :mod:`numpy` + - ``ak``: :mod:`awkward` + + Notes + ----- + - Awkward array views partially involve memory re-allocation (the + `cumulative_length`\ s). Parameters ---------- library - either ``pd``, ``np`` or ``ak``. + format of the returned data view. with_units forward physical units to the output data. + """ attach_units = with_units and "units" in self.attrs diff --git a/src/lgdo/types/waveformtable.py b/src/lgdo/types/waveformtable.py index b768d9ed..2341defa 100644 --- a/src/lgdo/types/waveformtable.py +++ b/src/lgdo/types/waveformtable.py @@ -268,5 +268,26 @@ def __str__(self): def view_as( self, fmt: str, with_units: bool = False ) -> pd.DataFrame | np.NDArray | ak.Array: - """View WaveformTable object as a third-party format data structure.""" + r"""View the WaveformTable data as a third-party format data structure. 
+ + This is typically a zero-copy or nearly zero-copy operation unless + explicitly stated in the concrete LGDO documentation. + + Supported third-party formats are: + + - ``pd``: :mod:`pandas` + - ``ak``: :mod:`awkward` + + Notes + ----- + - conversion to ndarray is not supported + + Parameters + ---------- + library + format of the returned data view. + with_units + forward physical units to the output data. + + """ return super().view_as(fmt, with_units) From 9552bc84dc94d51ac2a126ad3cfb01cfa4f1360d Mon Sep 17 00:00:00 2001 From: Luigi Pertoldi Date: Fri, 1 Dec 2023 22:17:55 +0100 Subject: [PATCH 34/47] Improve view_as() docstrings --- src/lgdo/types/array.py | 16 ++++----- src/lgdo/types/arrayofequalsizedarrays.py | 26 +++----------- src/lgdo/types/encoded.py | 44 ++++++++++------------- src/lgdo/types/fixedsizearray.py | 26 +++----------- src/lgdo/types/lgdo.py | 14 ++++++-- src/lgdo/types/scalar.py | 18 +++------- src/lgdo/types/struct.py | 15 ++++---- src/lgdo/types/table.py | 14 ++++---- src/lgdo/types/vectorofvectors.py | 23 +++++++----- src/lgdo/types/waveformtable.py | 24 +++---------- 10 files changed, 84 insertions(+), 136 deletions(-) diff --git a/src/lgdo/types/array.py b/src/lgdo/types/array.py index 6cbc69f0..a8ac30b7 100644 --- a/src/lgdo/types/array.py +++ b/src/lgdo/types/array.py @@ -147,16 +147,13 @@ def __repr__(self) -> str: def view_as( self, library: str, with_units: bool = False ) -> pd.DataFrame | np.NDArray | ak.Array: - r"""View the Array data as a third-party format data structure. + """View the Array data as a third-party format data structure. - This is typically a zero-copy or nearly zero-copy operation unless - explicitly stated in the concrete LGDO documentation. + This is a zero-copy operation. 
Supported third-party formats are: - Supported third-party formats are: - - - ``pd``: :mod:`pandas` - - ``np``: :mod:`numpy` - - ``ak``: :mod:`awkward` + - ``pd``: returns a :class:`pandas.Series` + - ``np``: returns the internal `nda` attribute (:class:`numpy.ndarray`) + - ``ak``: returns an :class:`ak.Array` initialized with `self.nda` Parameters ---------- @@ -165,6 +162,9 @@ def view_as( with_units forward physical units to the output data. + See Also + -------- + .LGDO.view_as """ # TODO: does attaching units imply a copy? attach_units = with_units and "units" in self.attrs diff --git a/src/lgdo/types/arrayofequalsizedarrays.py b/src/lgdo/types/arrayofequalsizedarrays.py index 524217f3..58c08476 100644 --- a/src/lgdo/types/arrayofequalsizedarrays.py +++ b/src/lgdo/types/arrayofequalsizedarrays.py @@ -137,28 +137,10 @@ def to_vov(self, cumulative_length: np.ndarray = None) -> vov.VectorOfVectors: def view_as( self, library: str, with_units: bool = False ) -> pd.DataFrame | np.NDArray | ak.Array: - r"""View the ArrayOfEqualSizedArrays data as a third-party format data structure. - - This is typically a zero-copy or nearly zero-copy operation unless - explicitly stated in the concrete LGDO documentation. - - Supported third-party formats are: - - - ``pd``: :mod:`pandas` - - ``np``: :mod:`numpy` - - ``ak``: :mod:`awkward` - - Notes - ----- - - Pint does not yet support Awkward yet. You will need to pass the data with_units=False - in the case of awkward and pandas (as it uses awkward_pandas for conversion). - - Parameters - ---------- - library - format of the returned data view. - with_units - forward physical units to the output data. + """View the array as a third-party format data structure. 
+ See Also + -------- + .LGDO.view_as """ return super().view_as(library, with_units=with_units) diff --git a/src/lgdo/types/encoded.py b/src/lgdo/types/encoded.py index c8e9b2ce..ac5e5a08 100644 --- a/src/lgdo/types/encoded.py +++ b/src/lgdo/types/encoded.py @@ -231,23 +231,14 @@ def __repr__(self) -> str: def view_as( self, library: str, with_units: bool = False ) -> pd.DataFrame | np.NDArray | ak.Array: - r"""View the VectorOfEncodedVectors data as a third-party format data structure. + """View the encoded data as a third-party format data structure. - This is typically a zero-copy or nearly zero-copy operation unless - explicitly stated in the concrete LGDO documentation. + This is a zero-copy or nearly zero-copy operation. Supported third-party formats are: - - ``pd``: :mod:`pandas` - - ``ak``: :mod:`awkward` - - Notes - ----- - - Pint does not yet support Awkward yet. You will need to pass the data with_units=False - in the case of awkward and pandas (as it uses awkward_pandas for conversion). - - - Awkward array views partially involve memory re-allocation (the - `cumulative_length`\ s). + - ``pd``: returns a :class:`pandas.DataFrame` + - ``ak``: returns an :class:`ak.Array` (record type) Parameters ---------- @@ -255,6 +246,10 @@ def view_as( format of the returned data view. with_units forward physical units to the output data. + + See Also + -------- + .LGDO.view_as """ attach_units = with_units and "units" in self.attrs @@ -456,23 +451,18 @@ def __repr__(self) -> str: def view_as( self, library: str, with_units: bool = False ) -> pd.DataFrame | np.NDArray | ak.Array: - r"""View the ArrayOfEncodedEqualSizedArrays data as a third-party format data structure. + """View the encoded data as a third-party format data structure. - This is typically a zero-copy or nearly zero-copy operation unless - explicitly stated in the concrete LGDO documentation. + This is nearly a zero-copy operation. 
Supported third-party formats are: - - ``pd``: :mod:`pandas` - - ``ak``: :mod:`awkward` + - ``pd``: returns a :class:`pandas.DataFrame` + - ``ak``: returns an :class:`ak.Array` (record type) - Notes - ----- - - Pint does not yet support Awkward yet. You will need to pass the data with_units=False - in the case of awkward and pandas (as it uses awkward_pandas for conversion). - - - Awkward array views partially involve memory re-allocation (the - `cumulative_length`\ s). + Note + ---- + In the view, `decoded_size` is expanded into an array. Parameters ---------- @@ -480,6 +470,10 @@ def view_as( format of the returned data view. with_units forward physical units to the output data. + + See Also + -------- + .LGDO.view_as """ attach_units = with_units and "units" in self.attrs diff --git a/src/lgdo/types/fixedsizearray.py b/src/lgdo/types/fixedsizearray.py index a9ca770e..0692b960 100644 --- a/src/lgdo/types/fixedsizearray.py +++ b/src/lgdo/types/fixedsizearray.py @@ -43,28 +43,10 @@ def datatype_name(self) -> str: return "fixedsize_array" def view_as(self, library: str, with_units: bool = False): - r"""View the FixedSizeArray data as a third-party format data structure. - - This is typically a zero-copy or nearly zero-copy operation unless - explicitly stated in the concrete LGDO documentation. - - Supported third-party formats are: - - - ``pd``: :mod:`pandas` - - ``np``: :mod:`numpy` - - ``ak``: :mod:`awkward` - - Notes - ----- - - Pint does not yet support Awkward yet. You will need to pass the data with_units=False - in the case of awkward and pandas (as it uses awkward_pandas for conversion). - - Parameters - ---------- - library - format of the returned data view. - with_units - forward physical units to the output data. + """View the array as a third-party format data structure. 
+ See Also + -------- + .LGDO.view_as """ return super.view_as(library, with_units=with_units) diff --git a/src/lgdo/types/lgdo.py b/src/lgdo/types/lgdo.py index 5d6cf914..d1e102b7 100644 --- a/src/lgdo/types/lgdo.py +++ b/src/lgdo/types/lgdo.py @@ -41,15 +41,23 @@ def view_as( r"""View the LGDO data object as a third-party format data structure. This is typically a zero-copy or nearly zero-copy operation unless - explicitly stated in the concrete LGDO documentation. + explicitly stated in the concrete LGDO documentation. The view can be + turned into a copy explicitly by the user with the appropriate methods. + If requested by the user, the output format supports it and the LGDO + carries a ``units`` attribute, physical units are attached to the view + through the :mod:`pint` package. - Typical supported third-party formats are: + Typical supported third-party libraries are: - ``pd``: :mod:`pandas` - ``np``: :mod:`numpy` - ``ak``: :mod:`awkward` - But the actual supported formats may vary depending on the concrete + Note + ---- + Awkward does not support attaching units through Pint, at the moment. + + but the actual supported formats may vary depending on the concrete LGDO class. Parameters diff --git a/src/lgdo/types/scalar.py b/src/lgdo/types/scalar.py index 1474ce7f..50a14afd 100644 --- a/src/lgdo/types/scalar.py +++ b/src/lgdo/types/scalar.py @@ -43,21 +43,11 @@ def form_datatype(self) -> str: return self.datatype_name() def view_as(self, library: str, with_units: bool = False): - r"""View the Scalar data object as a third-party format data structure. + r"""Dummy function, returns the scalar value itself. - This implementation is very trivial as it only returns the value of the Scalar. - No data conversion is applied. - - Notes - ----- - - as of right now, units are not forwarded. - - Parameters - ---------- - library - format of the returned data view. - with_units - forward physical units to the output data. 
+ See Also + -------- + .LGDO.view_as """ return self.value diff --git a/src/lgdo/types/struct.py b/src/lgdo/types/struct.py index 0526ab88..5f793287 100644 --- a/src/lgdo/types/struct.py +++ b/src/lgdo/types/struct.py @@ -114,19 +114,17 @@ def view_as( ) -> pd.DataFrame | np.NDArray | ak.Array: r"""View the Struct data as a third-party format data structure. - This is typically a zero-copy or nearly zero-copy operation unless - explicitly stated in the concrete LGDO documentation. + This is a zero-copy operation. Supported third-party formats are: - - ``pd``: :mod:`pandas` - - ``ak``: :mod:`awkward` + - ``pd``: returns a :class:`pandas.DataFrame` + - ``ak``: returns an :class:`ak.Array` (record type) Notes ----- - - conversion to ndarray is not supported - - conversion to awkward array only works when the key is a string - and values are of equal length + Conversion to awkward array only works when the key is a string and + the columns have all the same length. Parameters ---------- @@ -135,6 +133,9 @@ def view_as( with_units forward physical units to the output data. + See Also + -------- + .LGDO.view_as """ if library == "pd": return pd.DataFrame(self, copy=False) diff --git a/src/lgdo/types/table.py b/src/lgdo/types/table.py index 11c85754..c1b14fcc 100644 --- a/src/lgdo/types/table.py +++ b/src/lgdo/types/table.py @@ -345,19 +345,16 @@ def view_as( ) -> pd.DataFrame | np.NDArray | ak.Array: r"""View the Table data as a third-party format data structure. - This is typically a zero-copy or nearly zero-copy operation unless - explicitly stated in the concrete LGDO documentation. + This is typically a zero-copy or nearly zero-copy operation. 
Supported third-party formats are: - - ``pd``: :mod:`pandas` - - ``ak``: :mod:`awkward` + - ``pd``: returns a :class:`pandas.DataFrame` + - ``ak``: returns an :class:`ak.Array` (record type) Notes ----- - - conversion to ndarray is not supported - - conversion to awkward array only works when the key is a string - and values are of equal length + Conversion to Awkward array only works when the key is a string. Parameters ---------- @@ -366,6 +363,9 @@ def view_as( with_units forward physical units to the output data. + See Also + -------- + .LGDO.view_as """ if library == "pd": return _view_table_as_pd( diff --git a/src/lgdo/types/vectorofvectors.py b/src/lgdo/types/vectorofvectors.py index ba61b817..3594d94a 100644 --- a/src/lgdo/types/vectorofvectors.py +++ b/src/lgdo/types/vectorofvectors.py @@ -428,21 +428,23 @@ def to_aoesa( def view_as( self, library: str, with_units: bool = False, preserve_dtype: bool = False ) -> pd.DataFrame | np.NDArray | ak.Array: - r"""View the VectorOfVectors data as a third-party format data structure. + r"""View the vector data as a third-party format data structure. - This is typically a zero-copy or nearly zero-copy operation unless - explicitly stated in the concrete LGDO documentation. + This is typically a zero-copy or nearly zero-copy operation. Supported third-party formats are: - - ``pd``: :mod:`pandas` - - ``np``: :mod:`numpy` - - ``ak``: :mod:`awkward` + - ``pd``: returns a :class:`pandas.Series` (supported through the + ``awkward-pandas`` package) + - ``np``: returns a :class:`numpy.ndarray`, padded to make it + rectangular. + - ``ak``: returns an :class:`ak.Array`. ``self.cumulative_length`` is + currently re-allocated for technical reasons. Notes ----- - - Awkward array views partially involve memory re-allocation (the - `cumulative_length`\ s). + Awkward array views partially involve memory re-allocation (the + `cumulative_length`\ s). 
Parameters ---------- @@ -450,7 +452,12 @@ def view_as( format of the returned data view. with_units forward physical units to the output data. + preserve_dtype + forwarded to :meth:`.to_aoesa`, if `library` is ``np``. + See Also + -------- + .LGDO.view_as """ attach_units = with_units and "units" in self.attrs diff --git a/src/lgdo/types/waveformtable.py b/src/lgdo/types/waveformtable.py index 2341defa..989747c3 100644 --- a/src/lgdo/types/waveformtable.py +++ b/src/lgdo/types/waveformtable.py @@ -268,26 +268,10 @@ def __str__(self): def view_as( self, fmt: str, with_units: bool = False ) -> pd.DataFrame | np.NDArray | ak.Array: - r"""View the WaveformTable data as a third-party format data structure. - - This is typically a zero-copy or nearly zero-copy operation unless - explicitly stated in the concrete LGDO documentation. - - Supported third-party formats are: - - - ``pd``: :mod:`pandas` - - ``ak``: :mod:`awkward` - - Notes - ----- - - conversion to ndarray is not supported - - Parameters - ---------- - library - format of the returned data view. - with_units - forward physical units to the output data. + r"""View the waveform data as a third-party format data structure. + See Also + -------- + .LGDO.view_as """ return super().view_as(fmt, with_units) From fb7e3631c0b9fdbd5ccf632e04c83fe51a5a9de8 Mon Sep 17 00:00:00 2001 From: Luigi Pertoldi Date: Fri, 1 Dec 2023 22:21:25 +0100 Subject: [PATCH 35/47] Move _view_table_as_pd() into view_as() --- src/lgdo/types/table.py | 99 +++++++++++++++++------------------------ 1 file changed, 40 insertions(+), 59 deletions(-) diff --git a/src/lgdo/types/table.py b/src/lgdo/types/table.py index c1b14fcc..a054da2b 100644 --- a/src/lgdo/types/table.py +++ b/src/lgdo/types/table.py @@ -362,17 +362,53 @@ def view_as( format of the returned data view. with_units forward physical units to the output data. + cols + a list of column names specifying the subset of the table's columns + to be added to the dataframe. 
+ prefix + The prefix to be added to the column names. Used when recursively getting the + dataframe of a table inside this table. See Also -------- .LGDO.view_as """ if library == "pd": - return _view_table_as_pd( - self, cols=cols, prefix=prefix, with_units=with_units - ) + df = pd.DataFrame() + if cols is None: + cols = self.keys() + for col in cols: + column = self[col] + if isinstance(column, Array) or isinstance(column, VectorOfVectors): + tmp_ser = column.view_as("pd", with_units=with_units).rename( + prefix + str(col) + ) + if df.empty: + df = pd.DataFrame(tmp_ser) + else: + df = df.join(tmp_ser) + elif isinstance(column, Table): + tmp_df = column.view_as( + "pd", prefix=f"{prefix}{col}_", with_units=with_units + ) + if df.empty: + df = tmp_df + else: + df = df.join(tmp_df) + else: + if df.empty: + df[prefix + str(col)] = column.view_as( + "pd", with_units=with_units + ) + else: + df[prefix + str(col)] = df.join( + column.view_as("pd", with_units=with_units) + ) + return df + elif library == "np": raise TypeError(f"Format {library} is not a supported for Tables.") + elif library == "ak": if with_units: raise ValueError( @@ -380,61 +416,6 @@ def view_as( ) else: return ak.Array(self) - else: - raise TypeError(f"{library} is not a supported third-party format.") - -def _view_table_as_pd( - table: Table, - cols: list[str] = None, - prefix: str = "", - with_units: bool = False, -) -> pd.DataFrame: - """Get a :class:`pandas.DataFrame` from the data in the table. - - Notes - ----- - The requested data must be array-like, with the ``nda`` attribute, a VectorOfVectors - or a Table. - - Parameters - ---------- - cols - a list of column names specifying the subset of the table's columns - to be added to the dataframe. - copy - When ``True``, the dataframe allocates new memory and copies data - into it. Otherwise, the raw ``nda``'s from the table are used directly. - prefix - The prefix to be added to the column names. 
Used when recursively getting the - dataframe of a Table inside this Table - """ - df = pd.DataFrame() - if cols is None: - cols = table.keys() - for col in cols: - column = table[col] - if isinstance(column, Array) or isinstance(column, VectorOfVectors): - tmp_ser = column.view_as("pd", with_units=with_units).rename( - prefix + str(col) - ) - if df.empty: - df = pd.DataFrame(tmp_ser) - else: - df = df.join(tmp_ser) - elif isinstance(column, Table): - tmp_df = column.view_as( - "pd", prefix=f"{prefix}{col}_", with_units=with_units - ) - if df.empty: - df = tmp_df - else: - df = df.join(tmp_df) else: - if df.empty: - df[prefix + str(col)] = column.view_as("pd", with_units=with_units) - else: - df[prefix + str(col)] = df.join( - column.view_as("pd", with_units=with_units) - ) - return df + raise TypeError(f"{library} is not a supported third-party format.") From 8d488ae1fc95db06e04e9c96ce5542517acd5e9d Mon Sep 17 00:00:00 2001 From: Luigi Pertoldi Date: Sat, 2 Dec 2023 19:21:21 +0100 Subject: [PATCH 36/47] Implement Struct.__setitem__() --- src/lgdo/types/struct.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/lgdo/types/struct.py b/src/lgdo/types/struct.py index 5f793287..611836d5 100644 --- a/src/lgdo/types/struct.py +++ b/src/lgdo/types/struct.py @@ -23,8 +23,6 @@ class Struct(LGDO, dict): datatype updated, or call :meth:`update_datatype` after adding. """ - # TODO: overload setattr to require add_field for setting? - def __init__( self, obj_dict: dict[str, LGDO] = None, attrs: dict[str, Any] = None ) -> None: @@ -55,9 +53,12 @@ def update_datatype(self) -> None: def add_field(self, name: str | int, obj: LGDO) -> None: """Add a field to the table.""" - self[name] = obj + super().__setitem__(name, obj) self.update_datatype() + def __setitem__(self, name, obj) -> None: + return self.add_field(name, obj) + def remove_field(self, name: str | int, delete: bool = False) -> None: """Remove a field from the table. 
From b11dd24db3f276636d5a67add01ff741740163fe Mon Sep 17 00:00:00 2001 From: Luigi Pertoldi Date: Sat, 2 Dec 2023 19:24:49 +0100 Subject: [PATCH 37/47] Implement Struct.__getattr__() --- src/lgdo/types/struct.py | 5 ++++- tests/types/test_struct.py | 9 +++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/src/lgdo/types/struct.py b/src/lgdo/types/struct.py index 611836d5..113bfe78 100644 --- a/src/lgdo/types/struct.py +++ b/src/lgdo/types/struct.py @@ -56,9 +56,12 @@ def add_field(self, name: str | int, obj: LGDO) -> None: super().__setitem__(name, obj) self.update_datatype() - def __setitem__(self, name, obj) -> None: + def __setitem__(self, name: str, obj: LGDO) -> None: return self.add_field(name, obj) + def __getattr__(self, name: str) -> LGDO: + return self.__getitem__(name) + def remove_field(self, name: str | int, delete: bool = False) -> None: """Remove a field from the table. diff --git a/tests/types/test_struct.py b/tests/types/test_struct.py index 2c18ccee..e6454942 100644 --- a/tests/types/test_struct.py +++ b/tests/types/test_struct.py @@ -35,6 +35,15 @@ def test_add_field(): struct.add_field("array1", lgdo.Array(shape=(700, 21), dtype="f", fill_val=2)) assert struct.attrs["datatype"] == "struct{scalar1,array1}" + struct["array2"] = lgdo.Array(shape=(700, 21), dtype="f", fill_val=2) + assert struct.attrs["datatype"] == "struct{scalar1,array1,array2}" + + +def test_getattr(): + struct = lgdo.Struct() + struct["scalar1"] = lgdo.Scalar(value=10) + assert struct.scalar1.value == 10 + def test_remove_field(): struct = lgdo.Struct() From 94a10c8212f0702f54cadd569237c37dc399f297 Mon Sep 17 00:00:00 2001 From: Luigi Pertoldi Date: Sat, 2 Dec 2023 19:37:23 +0100 Subject: [PATCH 38/47] Just throw a NotImplementedError in Struct.view_as() --- src/lgdo/types/struct.py | 34 +++++++++------------------------- 1 file changed, 9 insertions(+), 25 deletions(-) diff --git a/src/lgdo/types/struct.py b/src/lgdo/types/struct.py index 113bfe78..495eba3a 
100644 --- a/src/lgdo/types/struct.py +++ b/src/lgdo/types/struct.py @@ -118,34 +118,18 @@ def view_as( ) -> pd.DataFrame | np.NDArray | ak.Array: r"""View the Struct data as a third-party format data structure. - This is a zero-copy operation. - - Supported third-party formats are: - - - ``pd``: returns a :class:`pandas.DataFrame` - - ``ak``: returns an :class:`ak.Array` (record type) - - Notes + Error ----- - Conversion to awkward array only works when the key is a string and - the columns have all the same length. - - Parameters - ---------- - library - format of the returned data view. - with_units - forward physical units to the output data. + Not implemented. Since Struct's fields can have different lengths, + converting to a Numpy, Pandas or Awkward is generally not possible. + Call :meth:`.LGDO.view_as` on the fields instead. See Also -------- .LGDO.view_as """ - if library == "pd": - return pd.DataFrame(self, copy=False) - elif library == "np": - raise TypeError(f"Format {library} is not a supported for Structs.") - elif library == "ak": - return ak.Array(self) - else: - raise TypeError(f"{library} is not a supported third-party format.") + raise NotImplementedError( + "Since Struct's fields can have different lengths, " + "converting to a Numpy, Pandas or Awkward is generally " + "not possible. Call view_as() on the fields instead." 
+ ) From 02425cd94684e7202003ed29ae1ed5ba15a4607b Mon Sep 17 00:00:00 2001 From: Luigi Pertoldi Date: Sat, 2 Dec 2023 19:53:14 +0100 Subject: [PATCH 39/47] Add tests for view_as() in encoded types --- tests/types/test_encoded.py | 76 +++++++++++++++++++++++++++++++++++++ 1 file changed, 76 insertions(+) diff --git a/tests/types/test_encoded.py b/tests/types/test_encoded.py index 7a002cae..26a02ca0 100644 --- a/tests/types/test_encoded.py +++ b/tests/types/test_encoded.py @@ -1,4 +1,8 @@ +import awkward as ak +import awkward_pandas as akpd import numpy as np +import pandas as pd +import pytest from lgdo import ( Array, @@ -207,3 +211,75 @@ def test_aoeesa_iteration(): for i, v in enumerate(voev): assert np.array_equal(v, desired[i]) + + +def test_voev_view_as(): + voev = VectorOfEncodedVectors( + encoded_data=VectorOfVectors( + flattened_data=Array(nda=np.array([1, 2, 3, 4, 5, 2, 4, 8, 9, 7, 5, 3, 1])), + cumulative_length=Array(nda=np.array([2, 5, 6, 10, 13])), + ), + decoded_size=Array(shape=5, fill_val=6), + attrs={"units": "s"}, + ) + + ak_arr = voev.view_as("ak", with_units=False) + assert ak_arr.fields == ["encoded_data", "decoded_size"] + assert ak.all( + ak_arr.encoded_data + == [ + [1, 2], + [3, 4, 5], + [2], + [4, 8, 9, 7], + [5, 3, 1], + ] + ) + assert ak.all(ak_arr.decoded_size == [6, 6, 6, 6, 6]) + + df = voev.view_as("pd", with_units=False) + assert isinstance(df, pd.DataFrame) + assert isinstance(df.encoded_data.ak, akpd.accessor.AwkwardAccessor) + assert isinstance(df.decoded_size, pd.Series) + + with pytest.raises(ValueError): + df = voev.view_as("pd", with_units=True) + + with pytest.raises(TypeError): + df = voev.view_as("np") + + +def test_aoeesa_view_as(): + voev = ArrayOfEncodedEqualSizedArrays( + encoded_data=VectorOfVectors( + flattened_data=Array(nda=np.array([1, 2, 3, 4, 5, 2, 4, 8, 9, 7, 5, 3, 1])), + cumulative_length=Array(nda=np.array([2, 5, 6, 10, 13])), + ), + decoded_size=99, + attrs={"units": "s"}, + ) + + ak_arr = 
voev.view_as("ak", with_units=False) + assert ak_arr.fields == ["encoded_data", "decoded_size"] + assert ak.all( + ak_arr.encoded_data + == [ + [1, 2], + [3, 4, 5], + [2], + [4, 8, 9, 7], + [5, 3, 1], + ] + ) + assert ak.all(ak_arr.decoded_size == [99, 99, 99, 99, 99]) + + df = voev.view_as("pd", with_units=False) + assert isinstance(df, pd.DataFrame) + assert isinstance(df.encoded_data.ak, akpd.accessor.AwkwardAccessor) + assert isinstance(df.decoded_size, pd.Series) + + with pytest.raises(ValueError): + df = voev.view_as("pd", with_units=True) + + with pytest.raises(TypeError): + df = voev.view_as("np") From b0bd351acafebd0f6df09384739c0c4bbe8a1c23 Mon Sep 17 00:00:00 2001 From: Neuberger Date: Mon, 4 Dec 2023 11:59:33 +0100 Subject: [PATCH 40/47] first attempt at refactor of to_aoesa including the missing_value argument --- src/lgdo/types/vectorofvectors.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/src/lgdo/types/vectorofvectors.py b/src/lgdo/types/vectorofvectors.py index ba61b817..f1a23238 100644 --- a/src/lgdo/types/vectorofvectors.py +++ b/src/lgdo/types/vectorofvectors.py @@ -401,7 +401,7 @@ def __repr__(self) -> str: return out def to_aoesa( - self, preserve_dtype: bool = False, max_len: int = None + self, preserve_dtype: bool = False, max_len: int = None, missing_value=np.nan ) -> aoesa.ArrayOfEqualSizedArrays: """Convert to :class:`ArrayOfEqualSizedArrays`. 
If `preserve_dtype` is ``False``, the output array will have dtype @@ -412,16 +412,17 @@ def to_aoesa( """ ak_arr = self.view_as("ak") - if not max_len: + if max_len is None: max_len = int(ak.max(ak.count(ak_arr, axis=-1))) - nda_pad = ak.pad_none(ak_arr, max_len, clip=True).to_numpy() + nda = ak.fill_none( + ak.pad_none(ak_arr, max_len, clip=True), missing_value + ).to_numpy(allow_missing=False) - if not preserve_dtype and not np.issubdtype(nda_pad.dtype, np.floating): - nda_pad = nda_pad.astype(float) - nda_pad.set_fill_value(np.nan) - - nda = nda_pad.filled() + if preserve_dtype: + nda = nda.astype(type(self.flattened_data[0])) + else: + nda = nda.astype(type(missing_value)) return aoesa.ArrayOfEqualSizedArrays(nda=nda, attrs=self.getattrs()) From 67d8c2972e4e8777d85e42ad3dc7ac9558c22921 Mon Sep 17 00:00:00 2001 From: Neuberger Date: Mon, 4 Dec 2023 15:03:23 +0100 Subject: [PATCH 41/47] second attempt, removing preserve_dtype but keeping it indirectly when giving None as missing_value --- src/lgdo/compression/radware.py | 4 +-- src/lgdo/compression/varlen.py | 4 +-- src/lgdo/types/vectorofvectors.py | 26 +++++++++++++------ tests/compression/test_uleb128_zigzag_diff.py | 2 +- tests/types/test_vectorofvectors.py | 2 +- 5 files changed, 24 insertions(+), 14 deletions(-) diff --git a/src/lgdo/compression/radware.py b/src/lgdo/compression/radware.py index 0332dfb7..6b88b741 100644 --- a/src/lgdo/compression/radware.py +++ b/src/lgdo/compression/radware.py @@ -262,7 +262,7 @@ def decode( # convert vector of vectors to array of equal sized arrays # can now decode on the 2D matrix together with number of bytes to read per row _, siglen = decode( - (sig_in.encoded_data.to_aoesa(preserve_dtype=True).nda, nbytes), + (sig_in.encoded_data.to_aoesa(missing_value=None).nda, nbytes), sig_out if isinstance(sig_out, np.ndarray) else sig_out.nda, shift=shift, ) @@ -288,7 +288,7 @@ def decode( # convert vector of vectors to array of equal sized arrays # can now decode on the 2D 
matrix together with number of bytes to read per row sig_out, siglen = decode( - (sig_in.encoded_data.to_aoesa(preserve_dtype=True).nda, nbytes), shift=shift + (sig_in.encoded_data.to_aoesa(missing_value=None).nda, nbytes), shift=shift ) # sanity check diff --git a/src/lgdo/compression/varlen.py b/src/lgdo/compression/varlen.py index 9c3c0737..65787843 100644 --- a/src/lgdo/compression/varlen.py +++ b/src/lgdo/compression/varlen.py @@ -227,7 +227,7 @@ def decode( # convert vector of vectors to array of equal sized arrays # can now decode on the 2D matrix together with number of bytes to read per row _, siglen = decode( - (sig_in.encoded_data.to_aoesa(preserve_dtype=True).nda, nbytes), + (sig_in.encoded_data.to_aoesa(missing_value=None).nda, nbytes), sig_out if isinstance(sig_out, np.ndarray) else sig_out.nda, ) @@ -252,7 +252,7 @@ def decode( # convert vector of vectors to array of equal sized arrays # can now decode on the 2D matrix together with number of bytes to read per row sig_out, siglen = decode( - (sig_in.encoded_data.to_aoesa(preserve_dtype=True).nda, nbytes) + (sig_in.encoded_data.to_aoesa(missing_value=None).nda, nbytes) ) # sanity check diff --git a/src/lgdo/types/vectorofvectors.py b/src/lgdo/types/vectorofvectors.py index 7f8c31c7..070b8833 100644 --- a/src/lgdo/types/vectorofvectors.py +++ b/src/lgdo/types/vectorofvectors.py @@ -401,7 +401,7 @@ def __repr__(self) -> str: return out def to_aoesa( - self, preserve_dtype: bool = False, max_len: int = None, missing_value=np.nan + self, max_len: int = None, missing_value=np.nan ) -> aoesa.ArrayOfEqualSizedArrays: """Convert to :class:`ArrayOfEqualSizedArrays`. If `preserve_dtype` is ``False``, the output array will have dtype @@ -411,18 +411,25 @@ def to_aoesa( uninitialized (unless the dtype is already floating-point). 
""" ak_arr = self.view_as("ak") + preserve_dtype = False if max_len is None: max_len = int(ak.max(ak.count(ak_arr, axis=-1))) + # hack to keep preserve_dtype functionality if needed + if missing_value is None: + preserve_dtype = True + # this was introduced, as np.array([np.nan]).astype() would throw a RuntimeWarning if converted to a type not supporting nan values + if np.can_cast(np.nan, self.flattened_data.dtype): + missing_value = np.nan + else: + missing_value = 0 + nda = ak.fill_none( ak.pad_none(ak_arr, max_len, clip=True), missing_value ).to_numpy(allow_missing=False) - if preserve_dtype: - nda = nda.astype(type(self.flattened_data[0])) - else: - nda = nda.astype(type(missing_value)) + nda = nda.astype(self.flattened_data.dtype) return aoesa.ArrayOfEqualSizedArrays(nda=nda, attrs=self.getattrs()) @@ -484,9 +491,12 @@ def view_as( return ak.Array(layout) if library == "np": - return self.to_aoesa(preserve_dtype=preserve_dtype).view_as( - "np", with_units=with_units - ) + if preserve_dtype: + return self.to_aoesa(missing_value=None).view_as( + "np", with_units=with_units + ) + else: + return self.to_aoesa().view_as("np", with_units=with_units) if library == "pd": if attach_units: raise ValueError( diff --git a/tests/compression/test_uleb128_zigzag_diff.py b/tests/compression/test_uleb128_zigzag_diff.py index b524fd11..61e81490 100644 --- a/tests/compression/test_uleb128_zigzag_diff.py +++ b/tests/compression/test_uleb128_zigzag_diff.py @@ -111,7 +111,7 @@ def test_uleb128zzdiff_encode_decode_lgdo_aoesa(wftable): wf_dec, siglen = varlen.decode((wf_enc, nbytes)) assert np.array_equal(wf_dec[:siglen], wftable.values[i]) - assert voev.encoded_data.to_aoesa(preserve_dtype=True).nda.dtype == np.ubyte + assert voev.encoded_data.to_aoesa(missing_value=None).nda.dtype == np.ubyte sig_in_dec = varlen.decode(voev) assert isinstance(sig_in_dec, ArrayOfEqualSizedArrays) diff --git a/tests/types/test_vectorofvectors.py b/tests/types/test_vectorofvectors.py index 
66f9814b..a26ade8b 100644 --- a/tests/types/test_vectorofvectors.py +++ b/tests/types/test_vectorofvectors.py @@ -121,7 +121,7 @@ def test_aoesa(lgdo_vov): aoesa = v.to_aoesa() assert aoesa.dtype == np.float64 - aoesa = v.to_aoesa(preserve_dtype=True) + aoesa = v.to_aoesa(missing_value=None) assert aoesa.dtype == np.int16 From acfc837db1f5ff65d30dd31a2aab5cb3ff8698c2 Mon Sep 17 00:00:00 2001 From: Luigi Pertoldi Date: Tue, 5 Dec 2023 15:43:01 +0100 Subject: [PATCH 42/47] Yet another API change to VoV.to_aoesa() This is my opinion more transparent. --- src/lgdo/compression/radware.py | 9 ++-- src/lgdo/compression/varlen.py | 6 +-- src/lgdo/types/vectorofvectors.py | 54 +++++++++++-------- tests/compression/test_uleb128_zigzag_diff.py | 5 +- tests/types/test_vectorofvectors.py | 15 ++++-- 5 files changed, 57 insertions(+), 32 deletions(-) diff --git a/src/lgdo/compression/radware.py b/src/lgdo/compression/radware.py index 6b88b741..115992b0 100644 --- a/src/lgdo/compression/radware.py +++ b/src/lgdo/compression/radware.py @@ -129,7 +129,9 @@ def encode( ) # convert VectorOfVectors to ArrayOfEqualSizedArrays so it can be # directly passed to the low-level encoding routine - sig_out_nda, nbytes = encode(sig_in.to_aoesa(), shift=shift) + sig_out_nda, nbytes = encode( + sig_in.to_aoesa(fill_val=0, preserve_dtype=True), shift=shift + ) # build the encoded LGDO encoded_data = lgdo.ArrayOfEqualSizedArrays(nda=sig_out_nda).to_vov( @@ -262,7 +264,7 @@ def decode( # convert vector of vectors to array of equal sized arrays # can now decode on the 2D matrix together with number of bytes to read per row _, siglen = decode( - (sig_in.encoded_data.to_aoesa(missing_value=None).nda, nbytes), + (sig_in.encoded_data.to_aoesa(fill_val=0, preserve_dtype=True).nda, nbytes), sig_out if isinstance(sig_out, np.ndarray) else sig_out.nda, shift=shift, ) @@ -288,7 +290,8 @@ def decode( # convert vector of vectors to array of equal sized arrays # can now decode on the 2D matrix together with 
number of bytes to read per row sig_out, siglen = decode( - (sig_in.encoded_data.to_aoesa(missing_value=None).nda, nbytes), shift=shift + (sig_in.encoded_data.to_aoesa(fill_val=0, preserve_dtype=True).nda, nbytes), + shift=shift, ) # sanity check diff --git a/src/lgdo/compression/varlen.py b/src/lgdo/compression/varlen.py index 65787843..27bc61da 100644 --- a/src/lgdo/compression/varlen.py +++ b/src/lgdo/compression/varlen.py @@ -103,7 +103,7 @@ def encode( ) # convert VectorOfVectors to ArrayOfEqualSizedArrays so it can be # directly passed to the low-level encoding routine - sig_out_nda, nbytes = encode(sig_in.to_aoesa()) + sig_out_nda, nbytes = encode(sig_in.to_aoesa(fill_val=0, preserve_dtype=True)) # build the encoded LGDO encoded_data = lgdo.ArrayOfEqualSizedArrays(nda=sig_out_nda).to_vov( @@ -227,7 +227,7 @@ def decode( # convert vector of vectors to array of equal sized arrays # can now decode on the 2D matrix together with number of bytes to read per row _, siglen = decode( - (sig_in.encoded_data.to_aoesa(missing_value=None).nda, nbytes), + (sig_in.encoded_data.to_aoesa(fill_val=0, preserve_dtype=True).nda, nbytes), sig_out if isinstance(sig_out, np.ndarray) else sig_out.nda, ) @@ -252,7 +252,7 @@ def decode( # convert vector of vectors to array of equal sized arrays # can now decode on the 2D matrix together with number of bytes to read per row sig_out, siglen = decode( - (sig_in.encoded_data.to_aoesa(missing_value=None).nda, nbytes) + (sig_in.encoded_data.to_aoesa(fill_val=0, preserve_dtype=True).nda, nbytes) ) # sanity check diff --git a/src/lgdo/types/vectorofvectors.py b/src/lgdo/types/vectorofvectors.py index 070b8833..2a89d1b0 100644 --- a/src/lgdo/types/vectorofvectors.py +++ b/src/lgdo/types/vectorofvectors.py @@ -401,33 +401,45 @@ def __repr__(self) -> str: return out def to_aoesa( - self, max_len: int = None, missing_value=np.nan + self, + max_len: int = None, + fill_val: int | float = np.nan, + preserve_dtype: bool = False, ) -> 
aoesa.ArrayOfEqualSizedArrays: """Convert to :class:`ArrayOfEqualSizedArrays`. - If `preserve_dtype` is ``False``, the output array will have dtype - subtype of :class:`numpy.floating` and is padded with - :class:`numpy.nan`. Otherwise, the dtype of the original - :class:`VectorOfVectors` is preserved and the padded values are left - uninitialized (unless the dtype is already floating-point). + + Note + ---- + The dtype of the original vector is typically not strictly preserved. + The output dtype will be either :any:`np.float64` or :any:`np.int64`. + If you want to use the same exact dtype, set `preserve_dtype` to + ``True``. + + Parameters + ---------- + max_len + the length of the returned array along its second dimension. Longer + vectors will be truncated, shorter will be padded with `fill_val`. + If ``None``, the length will be equal to the length of the longest + vector. + fill_val + value used to pad shorter vectors up to `max_len`. The dtype of the + output array will be such that both `fill_val` and the vector + values can be represented in the same data structure. + preserve_dtype + whether the output array should have exactly the same dtype as the + original vector of vectors. The type `fill_val` must be a + compatible one. 
""" ak_arr = self.view_as("ak") - preserve_dtype = False if max_len is None: max_len = int(ak.max(ak.count(ak_arr, axis=-1))) - # hack to keep preserve_dtype functionality if needed - if missing_value is None: - preserve_dtype = True - # this was introduced, as np.array([np.nan]).astype() would throw a RuntimeWarning if converted to a type not supporting nan values - if np.can_cast(np.nan, self.flattened_data.dtype): - missing_value = np.nan - else: - missing_value = 0 + nda = ak.fill_none(ak.pad_none(ak_arr, max_len, clip=True), fill_val).to_numpy( + allow_missing=False + ) - nda = ak.fill_none( - ak.pad_none(ak_arr, max_len, clip=True), missing_value - ).to_numpy(allow_missing=False) if preserve_dtype: nda = nda.astype(self.flattened_data.dtype) @@ -444,8 +456,8 @@ def view_as( - ``pd``: returns a :class:`pandas.Series` (supported through the ``awkward-pandas`` package) - - ``np``: returns a :class:`numpy.ndarray`, padded to make it - rectangular. + - ``np``: returns a :class:`numpy.ndarray`, padded with zeros to make + it rectangular. - ``ak``: returns an :class:`ak.Array`. ``self.cumulative_length`` is currently re-allocated for technical reasons. 
@@ -492,7 +504,7 @@ def view_as( if library == "np": if preserve_dtype: - return self.to_aoesa(missing_value=None).view_as( + return self.to_aoesa(fill_val=0, preserve_dtype=True).view_as( "np", with_units=with_units ) else: diff --git a/tests/compression/test_uleb128_zigzag_diff.py b/tests/compression/test_uleb128_zigzag_diff.py index 61e81490..2a9fccaf 100644 --- a/tests/compression/test_uleb128_zigzag_diff.py +++ b/tests/compression/test_uleb128_zigzag_diff.py @@ -111,7 +111,10 @@ def test_uleb128zzdiff_encode_decode_lgdo_aoesa(wftable): wf_dec, siglen = varlen.decode((wf_enc, nbytes)) assert np.array_equal(wf_dec[:siglen], wftable.values[i]) - assert voev.encoded_data.to_aoesa(missing_value=None).nda.dtype == np.ubyte + assert ( + voev.encoded_data.to_aoesa(fill_val=0, preserve_dtype=True).nda.dtype + == np.ubyte + ) sig_in_dec = varlen.decode(voev) assert isinstance(sig_in_dec, ArrayOfEqualSizedArrays) diff --git a/tests/types/test_vectorofvectors.py b/tests/types/test_vectorofvectors.py index a26ade8b..ce28691f 100644 --- a/tests/types/test_vectorofvectors.py +++ b/tests/types/test_vectorofvectors.py @@ -109,7 +109,7 @@ def test_aoesa(lgdo_vov): ] ) assert isinstance(arr, lgdo.ArrayOfEqualSizedArrays) - assert arr.dtype == np.float64 + assert np.issubdtype(arr.dtype, np.floating) assert np.array_equal(arr.nda, desired, True) v = VectorOfVectors( @@ -119,10 +119,17 @@ def test_aoesa(lgdo_vov): cumulative_length=lgdo.Array(nda=np.array([2, 5, 6, 10, 13])), ) aoesa = v.to_aoesa() - assert aoesa.dtype == np.float64 - aoesa = v.to_aoesa(missing_value=None) - assert aoesa.dtype == np.int16 + assert np.issubdtype(aoesa.dtype, np.floating) + + aoesa = v.to_aoesa(fill_val=-999.9) + assert np.issubdtype(aoesa.nda.dtype, np.floating) + + aoesa = v.to_aoesa(fill_val=-999) + assert np.issubdtype(aoesa.nda.dtype, np.integer) + + aoesa = v.to_aoesa(fill_val=-999, preserve_dtype=True) + assert aoesa.nda.dtype == np.int16 def test_set_vector(lgdo_vov): From 
cef1c66d3691d267976cefcc0e4a107875764c77 Mon Sep 17 00:00:00 2001 From: Luigi Pertoldi Date: Tue, 5 Dec 2023 15:50:27 +0100 Subject: [PATCH 43/47] Fix docs --- src/lgdo/types/vectorofvectors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lgdo/types/vectorofvectors.py b/src/lgdo/types/vectorofvectors.py index 2a89d1b0..a5f13462 100644 --- a/src/lgdo/types/vectorofvectors.py +++ b/src/lgdo/types/vectorofvectors.py @@ -411,7 +411,7 @@ def to_aoesa( Note ---- The dtype of the original vector is typically not strictly preserved. - The output dtype will be either :any:`np.float64` or :any:`np.int64`. + The output dtype will be either :class:`np.float64` or :class:`np.int64`. If you want to use the same exact dtype, set `preserve_dtype` to ``True``. From a3c84b0354068fee55d49e7192b0191b9d0736cc Mon Sep 17 00:00:00 2001 From: Neuberger Date: Wed, 6 Dec 2023 10:26:21 +0100 Subject: [PATCH 44/47] adding copy=False option to astype --- src/lgdo/types/vectorofvectors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lgdo/types/vectorofvectors.py b/src/lgdo/types/vectorofvectors.py index a5f13462..33f18094 100644 --- a/src/lgdo/types/vectorofvectors.py +++ b/src/lgdo/types/vectorofvectors.py @@ -441,7 +441,7 @@ def to_aoesa( ) if preserve_dtype: - nda = nda.astype(self.flattened_data.dtype) + nda.astype(self.flattened_data.dtype, copy=False) return aoesa.ArrayOfEqualSizedArrays(nda=nda, attrs=self.getattrs()) From e49ed0c2f0495407bfbdcdb99e08e8bb22f41628 Mon Sep 17 00:00:00 2001 From: Neuberger Date: Wed, 6 Dec 2023 10:40:10 +0100 Subject: [PATCH 45/47] im confused, why do I still need the equal sign there --- src/lgdo/types/vectorofvectors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lgdo/types/vectorofvectors.py b/src/lgdo/types/vectorofvectors.py index 33f18094..66e5217f 100644 --- a/src/lgdo/types/vectorofvectors.py +++ b/src/lgdo/types/vectorofvectors.py @@ -441,7 +441,7 @@ def to_aoesa( ) if 
preserve_dtype: - nda.astype(self.flattened_data.dtype, copy=False) + nda = nda.astype(self.flattened_data.dtype, copy=False) return aoesa.ArrayOfEqualSizedArrays(nda=nda, attrs=self.getattrs()) From b233a264d84650f5cbb0599ab5677a75faa0c410 Mon Sep 17 00:00:00 2001 From: Luigi Pertoldi Date: Sat, 30 Dec 2023 17:03:04 +0100 Subject: [PATCH 46/47] Bug fix in WaveformTable.view_as() --- src/lgdo/types/table.py | 2 +- src/lgdo/types/waveformtable.py | 8 ++++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/src/lgdo/types/table.py b/src/lgdo/types/table.py index a054da2b..01d84aaf 100644 --- a/src/lgdo/types/table.py +++ b/src/lgdo/types/table.py @@ -389,7 +389,7 @@ def view_as( df = df.join(tmp_ser) elif isinstance(column, Table): tmp_df = column.view_as( - "pd", prefix=f"{prefix}{col}_", with_units=with_units + "pd", with_units=with_units, prefix=f"{prefix}{col}_" ) if df.empty: df = tmp_df diff --git a/src/lgdo/types/waveformtable.py b/src/lgdo/types/waveformtable.py index 989747c3..7840afc2 100644 --- a/src/lgdo/types/waveformtable.py +++ b/src/lgdo/types/waveformtable.py @@ -266,7 +266,11 @@ def __str__(self): return string def view_as( - self, fmt: str, with_units: bool = False + self, + library: str, + with_units: bool = False, + cols: list[str] = None, + prefix: str = "", ) -> pd.DataFrame | np.NDArray | ak.Array: r"""View the waveform data as a third-party format data structure. 
@@ -274,4 +278,4 @@ def view_as( -------- .LGDO.view_as """ - return super().view_as(fmt, with_units) + return super().view_as(library, with_units, cols, prefix) From f5a317c8d6e79e6df9b9c0c0fd8629cd44a6223a Mon Sep 17 00:00:00 2001 From: Luigi Pertoldi Date: Sat, 30 Dec 2023 17:50:57 +0100 Subject: [PATCH 47/47] [docs] update LH5Files tutorial --- docs/source/notebooks/LH5Files.ipynb | 139 ++++++++++++++++++++++++++- src/lgdo/types/encoded.py | 8 +- src/lgdo/types/table.py | 2 +- 3 files changed, 141 insertions(+), 8 deletions(-) diff --git a/docs/source/notebooks/LH5Files.ipynb b/docs/source/notebooks/LH5Files.ipynb index 9c594be9..2d77495b 100644 --- a/docs/source/notebooks/LH5Files.ipynb +++ b/docs/source/notebooks/LH5Files.ipynb @@ -48,7 +48,7 @@ "metadata": {}, "outputs": [], "source": [ - "from lgdo import ls\n", + "from lgdo.lh5 import ls\n", "\n", "ls(lh5_file)" ] @@ -91,7 +91,7 @@ "metadata": {}, "outputs": [], "source": [ - "from lgdo import show\n", + "from lgdo.lh5 import show\n", "\n", "show(lh5_file)" ] @@ -111,7 +111,7 @@ "metadata": {}, "outputs": [], "source": [ - "from lgdo import LH5Store\n", + "from lgdo.lh5 import LH5Store\n", "\n", "store = LH5Store()" ] @@ -210,12 +210,141 @@ "metadata": {}, "outputs": [], "source": [ - "from lgdo import LH5Iterator\n", + "from lgdo.lh5 import LH5Iterator\n", "\n", "for lh5_obj, entry, n_rows in LH5Iterator(lh5_file, \"geds/raw/energy\", buffer_len=20):\n", " print(f\"entry {entry}, energy = {lh5_obj} ({n_rows} rows)\")" ] }, + { + "cell_type": "markdown", + "id": "684f8530", + "metadata": {}, + "source": [ + "### Converting LGDO data to alternative formats\n", + "\n", + "Each LGDO is equipped with a method called `view_as()` [[docs]](https://legend-pydataobj.readthedocs.io/en/stable/api/lgdo.types.html#lgdo.types.lgdo.LGDO.view_as), which allows the user to \"view\" the data (i.e.
avoiding copying data as much as possible) in a different, third-party format.\n", + "\n", + "LGDOs generally support viewing as NumPy (`np`), Pandas (`pd`) or [Awkward](https://awkward-array.org) (`ak`) data structures, with some exceptions. We strongly recommend having a look at the `view_as()` API docs of each LGDO type for more details (for `Table.view_as()` [[docs]](https://legend-pydataobj.readthedocs.io/en/stable/api/lgdo.types.html#lgdo.types.table.Table.view_as), for example).\n", + "\n", + "
\n", + "\n", + "**Note:** To obtain a copy of the data in the selected third-party format, the user can call the appropriate third-party copy method on the view (e.g. `pandas.DataFrame.copy()`, if viewing the data as a Pandas dataframe).\n", + "\n", + "
\n", + "\n", + "Let's play around with our good old table, can we view it as a Pandas dataframe?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e2f48391", + "metadata": {}, + "outputs": [], + "source": [ + "obj, _ = store.read(\"geds/raw\", lh5_file)\n", + "df = obj.view_as(\"pd\")\n", + "df" + ] + }, + { + "cell_type": "markdown", + "id": "7f476362", + "metadata": {}, + "source": [ + "Yes! But how are the nested objects being handled?\n", + "\n", + "Nested tables have been flattened by prefixing their column names with the table object name (`obj.waveform.values` becomes `df.waveform_values`) and multi-dimensional columns are represented by Awkward arrays:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6261c8fe", + "metadata": {}, + "outputs": [], + "source": [ + "df.waveform_values" + ] + }, + { + "cell_type": "markdown", + "id": "6ed5904a", + "metadata": {}, + "source": [ + "But what if we wanted to have the waveform values as a NumPy array?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f4b45112", + "metadata": {}, + "outputs": [], + "source": [ + "obj.waveform.values.view_as(\"np\")" + ] + }, + { + "cell_type": "markdown", + "id": "d0c86728", + "metadata": {}, + "source": [ + "Can we just view the full table as a huge Awkward array? 
Of course:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "33ae5c21", + "metadata": {}, + "outputs": [], + "source": [ + "obj.view_as(\"ak\")" + ] + }, + { + "cell_type": "markdown", + "id": "cd5fa308", + "metadata": {}, + "source": [ + "Note that viewing a `VectorOfVectors` as an Awkward array is a nearly zero-copy operation and opens a new avenue of fast computational possibilities thanks to Awkward:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d75c8ff8", + "metadata": {}, + "outputs": [], + "source": [ + "import awkward as ak\n", + "\n", + "# tracelist is a VoV on disk\n", + "trlist = obj.tracelist.view_as(\"ak\")\n", + "ak.mean(trlist)" + ] + }, + { + "cell_type": "markdown", + "id": "d8d9ad8c", + "metadata": {}, + "source": [ + "Last but not least, we support attaching physical units (that might be stored in the `units` attribute of an LGDO) to data views through Pint, if the third-party format allows it:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4007efd4", + "metadata": {}, + "outputs": [], + "source": [ + "df = obj.view_as(\"pd\", with_units=True)\n", + "df.timestamp.dtype" + ] + }, { "cell_type": "markdown", "id": "3ab3794c", @@ -278,7 +407,7 @@ "metadata": {}, "outputs": [], "source": [ - "from lgdo import show\n", + "from lgdo.lh5 import show\n", "\n", "show(\"my_objects.lh5\")" ] diff --git a/src/lgdo/types/encoded.py b/src/lgdo/types/encoded.py index ac5e5a08..bcb8b71e 100644 --- a/src/lgdo/types/encoded.py +++ b/src/lgdo/types/encoded.py @@ -266,7 +266,9 @@ def view_as( return ak.Array(records_list) if library == "np": - raise TypeError(f"Format {library} is not a supported for voev.") + raise TypeError( + f"Format {library} is not supported for VectorOfEncodedVectors."
+ ) if library == "pd": if attach_units: raise ValueError( @@ -492,7 +494,9 @@ def view_as( return ak.Array(records_list) if library == "np": - raise TypeError(f"Format {library} is not a supported for aoeesa.") + raise TypeError( + f"Format {library} is not supported for ArrayOfEncodedEqualSizedArrays." + ) if library == "pd": if attach_units: raise ValueError( diff --git a/src/lgdo/types/table.py b/src/lgdo/types/table.py index 01d84aaf..4889faf1 100644 --- a/src/lgdo/types/table.py +++ b/src/lgdo/types/table.py @@ -407,7 +407,7 @@ def view_as( return df elif library == "np": - raise TypeError(f"Format {library} is not a supported for Tables.") + raise TypeError(f"Format {library} is not supported for Tables.") elif library == "ak": if with_units: