lincc-frameworks · gitosaurus · Nov 27, 2024 · Nov 21, 2024 · Nov 21, 2024 · Nov 22, 2024
diff --git a/src/nested_pandas/nestedframe/core.py b/src/nested_pandas/nestedframe/core.py
@@ -3,6 +3,7 @@
 
 import ast
 import os
+import re
 
 import numpy as np
 import pandas as pd
@@ -83,9 +84,9 @@
         super().__init__()
         # Pre-load the field resolvers for all columns which are known at present.
         for column in outer.nested_columns:
-            self._initialize_field_resolver(column, outer)
+            self._initialize_column_resolver(column, outer)
 
-    def _initialize_field_resolver(self, column: str, outer: NestedFrame):
+    def _initialize_column_resolver(self, column: str, outer: NestedFrame):
         """
         Initialize a resolver for the given nested column, and also an alias
         for it, in the case of column names that have spaces or are otherwise
@@ -107,7 +108,7 @@
         if not super().__contains__(top_nest):
             if top_nest not in self._outer.nested_columns:
                 raise KeyError(f"Unknown nest {top_nest}")
-            self._initialize_field_resolver(top_nest, self._outer)
+            self._initialize_column_resolver(top_nest, self._outer)
         return super().__getitem__(top_nest)
 
     def __setitem__(self, item, _):
@@ -139,8 +140,20 @@
         # Flattened only once for every access of this particular nest
         # within the expression.
         self._flat_nest = outer[nest_name].nest.to_flat()
+        # Save aliases to any columns that are not identifier-like.
+        # If our given frame has aliases for identifiers, use these instead
+        # of generating our own.
+        self._aliases = getattr(outer, "_aliases", None)
+        if self._aliases is None:
+            self._aliases = {}
+            for column in self._flat_nest.columns:
+                clean_id = clean_column_name(column)
+                if clean_id != column:
+                    self._aliases[clean_id] = column
 
     def __getattr__(self, item_name: str):
+        if self._aliases:
+            item_name = self._aliases.get(item_name, item_name)
         if item_name in self._flat_nest:
             result = _SeriesFromNest(self._flat_nest[item_name])
             # Assigning these properties directly in order to avoid any complication
@@ -193,6 +206,26 @@
     return result
 
 
+def _identify_aliases(expr: str) -> tuple[str, dict[str, str]]:
+    """
+    Rewrite every backtick-quoted name in ``expr`` to its cleaned,
+    identifier-like form.  Returns the rewritten expression and a
+    dictionary mapping each cleaned alias back to the original name;
+    names that cleaning leaves unchanged get no dictionary entry.
+    """
+    found: dict[str, str] = {}
+
+    def _swap(quoted):
+        name = quoted.group(0)[1:-1]  # strip the surrounding backticks
+        cleaned = clean_column_name(name)
+        if cleaned != name:
+            found[cleaned] = name
+        return cleaned
+
+    rewritten = re.sub(r"`[^`]+`", _swap, expr, flags=re.MULTILINE)
+    return rewritten, found
+
+
 class NestedFrame(pd.DataFrame):
     """A Pandas Dataframe extension with support for nested structure.
 
@@ -205,6 +238,12 @@
     # Series produce instances of this class, preserving the type and origin.
     __pandas_priority__ = 4500
 
+    # The "_aliases" attribute is usually None or not even present, but when it is present,
+    # it indicates that an evaluation is in progress, and that columns and fields with names
+    # that are not identifier-like have been aliased to cleaned names, and this attribute
+    # contains those aliases, keyed by the cleaned name.
+    _metadata = ["_aliases"]
+
     @property
     def _constructor(self) -> Self:  # type: ignore[name-defined] # noqa: F821
         return NestedFrame
@@ -232,62 +271,86 @@
                 nest_cols.append(column)
         return nest_cols
 
-    def _is_known_hierarchical_column(self, colname) -> bool:
+    def _parse_hierarchical_components(self, delimited_path: str, delimiter: str = ".") -> list[str]:
+        """Split a delimited path into components; backticks protect names containing the delimiter."""
+        # Non-string keys (lists, masks, ...) cannot be paths: return them as a single component.
+        if not isinstance(delimited_path, str):
+            return [delimited_path]
+        aliases = getattr(self, "_aliases", None)  # present only when aliases were precomputed
+        if aliases is None:  # otherwise derive aliases from the backticks in the path itself
+            delimited_path, aliases = _identify_aliases(delimited_path)
+        return [aliases.get(x, x) for x in delimited_path.split(delimiter)]
+
+    def _is_known_hierarchical_column(self, components: list[str] | str) -> bool:
         """Determine whether a string is a known hierarchical column name"""
-        if "." in colname:
-            base_name = colname.split(".")[0]
-            if base_name in self.nested_columns:
-                # TODO: only handles one level of nesting for now
-                nested_name = ".".join(colname.split(".")[1:])
-                return nested_name in self.all_columns[base_name]
+        if isinstance(components, str):
+            components = self._parse_hierarchical_components(components)
+        if len(components) < 2:
             return False
+        base_name = components[0]
+        if base_name in self.nested_columns:
+            nested_name = ".".join(components[1:])
+            return nested_name in self.all_columns[base_name]
         return False
 
-    def _is_known_column(self, colname) -> bool:
-        """Determine whether a string is a known column name"""
-        return colname in self.columns or self._is_known_hierarchical_column(colname)
+    def _is_known_column(self, components: list[str] | str) -> bool:
+        """Report whether a name, or its already-parsed components, refers to a known column"""
+        if isinstance(components, str):
+            components = self._parse_hierarchical_components(components)
+        # The re-joined name may itself be a base column; failing that, try it as nest.field.
+        dotted = ".".join(components)
+        return dotted in self.columns or self._is_known_hierarchical_column(components)
 
     def __getitem__(self, item):
         """Adds custom __getitem__ behavior for nested columns"""
 
-        if isinstance(item, str):
-            # Preempt the nested check if the item is a base column
-            if item in self.columns:
-                return super().__getitem__(item)
-            # If a nested column name is passed, return a flat series for that column
-            # flat series is chosen over list series for utility
-            # e.g. native ability to do something like ndf["nested.a"] + 3
-            elif self._is_known_hierarchical_column(item):
-                # TODO: only handles one level of nesting for now
-                nested = item.split(".")[0]
-                col = ".".join(item.split(".")[1:])
-                return self[nested].nest.get_flat_series(col)
-            else:
-                raise KeyError(f"Column '{item}' not found in nested columns or base columns")
-        else:
+        if not isinstance(item, str):
             return super().__getitem__(item)
 
+        # Preempt the nested check if the item is a base column, with or without
+        # dots and backticks.
+        if item in self.columns:
+            return super().__getitem__(item)
+        components = self._parse_hierarchical_components(item)
+        # One more check on the entirety of the item name, in case backticks were used
+        # (even if they weren't necessary).
+        cleaned_item = ".".join(components)
+        if cleaned_item in self.columns:
+            return super().__getitem__(cleaned_item)
+
+        # If a nested column name is passed, return a flat series for that column
+        # flat series is chosen over list series for utility
+        # e.g. native ability to do something like ndf["nested.a"] + 3
+        if self._is_known_hierarchical_column(components):
+            nested = components[0]
+            field = ".".join(components[1:])
+            return self[nested].nest.get_flat_series(field)
+        else:
+            raise KeyError(f"Column '{cleaned_item}' not found in nested columns or base columns")
+
     def __setitem__(self, key, value):
         """Adds custom __setitem__ behavior for nested columns"""
-
+        components = self._parse_hierarchical_components(key)
         # Replacing or adding columns to a nested structure
         # Allows statements like ndf["nested.t"] = ndf["nested.t"] - 5
         # Or ndf["nested.base_t"] = ndf["nested.t"] - 5
         # Performance note: This requires building a new nested structure
         # TODO: Support assignment of a new column to an existing nested col from a list series
-        if self._is_known_hierarchical_column(key) or (
-            "." in key and key.split(".")[0] in self.nested_columns
+        if self._is_known_hierarchical_column(components) or (
+            len(components) > 1 and components[0] in self.nested_columns
         ):
-            nested, col = key.split(".")
-            new_nested_series = self[nested].nest.with_flat_field(col, value)
+            if len(components) != 2:
+                raise ValueError(f"Only one level of nesting is supported; given {key}")
+            nested, field = components
+            new_nested_series = self[nested].nest.with_flat_field(field, value)
             return super().__setitem__(nested, new_nested_series)
 
         # Adding a new nested structure from a column
         # Allows statements like ndf["new_nested.t"] = ndf["nested.t"] - 5
-        if "." in key:
-            new_nested, col = key.split(".")
+        if len(components) > 1:
+            new_nested, field = components
             if isinstance(value, pd.Series):
-                value.name = col
+                value.name = field
                 value = value.to_frame()
             new_df = self.add_nested(value, name=new_nested)
             self._update_inplace(new_df)
@@ -512,10 +575,15 @@
         --------
         https://pandas.pydata.org/docs/reference/api/pandas.eval.html
         """
+        _, aliases = _identify_aliases(expr)
+        self._aliases: dict[str, str] | None = aliases
+
         kwargs["resolvers"] = tuple(kwargs.get("resolvers", ())) + (_NestResolver(self),)
         kwargs["inplace"] = inplace
         kwargs["parser"] = "nested-pandas"
-        return super().eval(expr, **kwargs)
+        answer = super().eval(expr, **kwargs)
+        self._aliases = None
+        return answer
 
     def extract_nest_names(
         self,
@@ -838,12 +906,15 @@
         # Parse through the initial args to determine the columns to apply the function to
         requested_columns = []
         for arg in args:
-            if not isinstance(arg, str) or not self._is_known_column(arg):
-                # We've reached an argument that is not a valid column, so we assume
-                # the remaining args are extra arguments to the function
+            # Stop when we reach an argument that is not a valid column, as we assume
+            # that the remaining args are extra arguments to the function
+            if not isinstance(arg, str):
+                break
+            components = self._parse_hierarchical_components(arg)
+            if not self._is_known_column(components):
                 break
-            layer = "base" if "." not in arg else arg.split(".")[0]
-            col = arg.split(".")[-1]
+            layer = "base" if len(components) < 2 else components[0]
+            col = components[-1]
             requested_columns.append((layer, col))
 
         # We require the first *args to be the columns to apply the function to

diff --git a/tests/nested_pandas/nestedframe/test_nestedframe.py b/tests/nested_pandas/nestedframe/test_nestedframe.py
@@ -612,6 +612,17 @@ def test_query_on_non_identifier_columns():
     nf3 = nf.query("`bad dog`.a > 2")
     assert nf3["bad dog"].nest["a"].size == 4
 
+    # And also for fields within the nested columns.
+    # Taken from GH#176
+    nf = NestedFrame(data={"dog": [1, 2, 3], "good dog": [2, 4, 6]}, index=[0, 1, 2])
+    nested = pd.DataFrame(
+        data={"n/a": [0, 2, 4, 1, 4, 3, 1, 4, 1], "n/b": [5, 4, 7, 5, 3, 1, 9, 3, 4]},
+        index=[0, 0, 0, 1, 1, 1, 2, 2, 2],
+    )
+    nf = nf.add_nested(nested, "bad dog")
+    nf4 = nf.query("`bad dog`.`n/a` > 2")
+    assert nf4["bad dog"].nest["n/a"].size == 4
+
 
 def test_dropna():
     """Test that dropna works on all layers"""
@@ -1034,6 +1045,20 @@ def test_eval_assignment():
     assert (nf["p2.e"] == nf["packed.d"] * 2 + nf.c).all()
     assert (nf["p2.f"] == nf["p2.e"] + nf.b).all()
 
+    # Verify that assignment can be done to nested columns and fields
+    # having names which are not valid Python identifiers, and must
+    # be quoted with backticks.
+    nf = NestedFrame(data={"dog": [1, 2, 3], "good dog": [2, 4, 6]}, index=[0, 1, 2])
+    nested = pd.DataFrame(
+        data={"n/a": [0, 2, 4, 1, 4, 3, 1, 4, 1], "n/b": [5, 4, 7, 5, 3, 1, 9, 3, 4]},
+        index=[0, 0, 0, 1, 1, 1, 2, 2, 2],
+    )
+    nf = nf.add_nested(nested, "bad dog")
+    nfx = nf.eval("`bad dog`.`n/c` = `bad dog`.`n/b` + 2.5")
+    # The number of columns at the top should not have changed
+    assert len(nfx.columns) == len(nf.columns)
+    assert (nfx["bad dog"].nest["n/c"] == nf["bad dog"].nest["n/b"] + 2.5).all()
+
 
 def test_access_non_existing_column():
     """Test that accessing a non-existing column raises a KeyError"""