Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow nested field names to be escaped with backticks. #177

Merged
merged 8 commits into from
Nov 27, 2024
160 changes: 117 additions & 43 deletions src/nested_pandas/nestedframe/core.py
gitosaurus marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

import ast
import os
import re

import numpy as np
import pandas as pd
Expand All @@ -18,6 +19,10 @@
from nested_pandas.series.dtype import NestedDtype
from nested_pandas.series.packer import pack, pack_lists, pack_sorted_df_into_struct

# Used to identify backtick-protected names in the expressions
# used in NestedFrame.eval() and NestedFrame.query().
_backtick_protected_names = re.compile(r"`[^`]+`", re.MULTILINE)


class NestedPandasExprVisitor(PandasExprVisitor):
"""
Expand Down Expand Up @@ -83,9 +88,9 @@ def __init__(self, outer: NestedFrame):
super().__init__()
# Pre-load the field resolvers for all columns which are known at present.
for column in outer.nested_columns:
self._initialize_field_resolver(column, outer)
self._initialize_column_resolver(column, outer)

def _initialize_field_resolver(self, column: str, outer: NestedFrame):
def _initialize_column_resolver(self, column: str, outer: NestedFrame):
"""
Initialize a resolver for the given nested column, and also an alias
for it, in the case of column names that have spaces or are otherwise
Expand All @@ -107,7 +112,7 @@ def __getitem__(self, item):
if not super().__contains__(top_nest):
if top_nest not in self._outer.nested_columns:
raise KeyError(f"Unknown nest {top_nest}")
self._initialize_field_resolver(top_nest, self._outer)
self._initialize_column_resolver(top_nest, self._outer)
return super().__getitem__(top_nest)

def __setitem__(self, item, _):
Expand Down Expand Up @@ -139,8 +144,20 @@ def __init__(self, nest_name: str, outer: NestedFrame):
# Flattened only once for every access of this particular nest
# within the expression.
self._flat_nest = outer[nest_name].nest.to_flat()
# Save aliases to any columns that are not identifier-like.
# If our given frame has aliases for identifiers, use these instead
# of generating our own.
self._aliases = getattr(outer, "_aliases", None)
if self._aliases is None:
self._aliases = {}
for column in self._flat_nest.columns:
clean_id = clean_column_name(column)
if clean_id != column:
self._aliases[clean_id] = column

def __getattr__(self, item_name: str):
if self._aliases:
item_name = self._aliases.get(item_name, item_name)
if item_name in self._flat_nest:
result = _SeriesFromNest(self._flat_nest[item_name])
# Assigning these properties directly in order to avoid any complication
Expand Down Expand Up @@ -193,6 +210,25 @@ def _subexprs_by_nest(parents: list, node) -> dict[str, list]:
return result


def _identify_aliases(expr: str) -> tuple[str, dict[str, str]]:
    """
    Given an expression string, identify backtick-quoted names
    and replace them with cleaned names, returning the cleaned
    expression and a dictionary of aliases, where the keys are
    clean aliases to the original names.

    Parameters
    ----------
    expr : str
        The expression to scan for backtick-quoted names.

    Returns
    -------
    tuple[str, dict[str, str]]
        The expression with every backtick-quoted name replaced by its
        cleaned form, and a mapping from each cleaned name back to the
        original name. Names whose cleaned form equals the original
        are not recorded in the mapping.
    """
    aliases: dict[str, str] = {}

    def sub_and_alias(match: re.Match) -> str:
        original = match.group(0)[1:-1]  # remove backticks
        alias = clean_column_name(original)
        # Only record an alias when cleaning actually changed the name;
        # otherwise the name can be used as-is once the backticks are gone.
        if alias != original:
            aliases[alias] = original
        return alias

    return _backtick_protected_names.sub(sub_and_alias, expr), aliases


class NestedFrame(pd.DataFrame):
"""A Pandas Dataframe extension with support for nested structure.

Expand All @@ -205,6 +241,12 @@ class NestedFrame(pd.DataFrame):
# Series produce instances of this class, preserving the type and origin.
__pandas_priority__ = 4500

# The "_aliases" attribute is usually None or not even present, but when it is present,
# it indicates that an evaluation is in progress, and that columns and fields with names
# that are not identifier-like have been aliased to cleaned names, and this attribute
# contains those aliases, keyed by the cleaned name.
_metadata = ["_aliases"]

@property
def _constructor(self) -> Self: # type: ignore[name-defined] # noqa: F821
return NestedFrame
Expand Down Expand Up @@ -232,62 +274,86 @@ def nested_columns(self) -> list:
nest_cols.append(column)
return nest_cols

def _is_known_hierarchical_column(self, colname) -> bool:
def _parse_hierarchical_components(self, delimited_path: str, delimiter: str = ".") -> list[str]:
    """
    Given a string that may be a delimited path, parse it into its components,
    respecting backticks that are used to protect component names that may contain the delimiter.

    Parameters
    ----------
    delimited_path : str
        The path to split, e.g. ``"nested.a"``; components may be
        backtick-quoted.
    delimiter : str, default "."
        The separator between components.

    Returns
    -------
    list[str]
        The path components, with any aliased component mapped back to
        its original name.
    """
    aliases = getattr(self, "_aliases", None)
    if aliases is None:
        # No alias table is stored on the frame, so derive one from the
        # backtick-quoted names in the path itself.
        delimited_path, aliases = _identify_aliases(delimited_path)
    # Split on the delimiter, then translate each cleaned component back
    # to the original name it stands for (if it has an alias).
    return [aliases.get(x, x) for x in delimited_path.split(delimiter)]

def _is_known_hierarchical_column(self, components: list[str] | str) -> bool:
"""Determine whether a string is a known hierarchical column name"""
if "." in colname:
base_name = colname.split(".")[0]
if base_name in self.nested_columns:
# TODO: only handles one level of nesting for now
nested_name = ".".join(colname.split(".")[1:])
return nested_name in self.all_columns[base_name]
if isinstance(components, str):
components = self._parse_hierarchical_components(components)
if len(components) < 2:
return False
base_name = components[0]
if base_name in self.nested_columns:
nested_name = ".".join(components[1:])
return nested_name in self.all_columns[base_name]
return False

def _is_known_column(self, colname) -> bool:
"""Determine whether a string is a known column name"""
return colname in self.columns or self._is_known_hierarchical_column(colname)
def _is_known_column(self, components: list[str] | str) -> bool:
"""Determine whether a list of field components describes a known column name"""
if isinstance(components, str):
components = self._parse_hierarchical_components(components)
if ".".join(components) in self.columns:
return True
return self._is_known_hierarchical_column(components)

def __getitem__(self, item):
"""Adds custom __getitem__ behavior for nested columns"""

if isinstance(item, str):
# Preempt the nested check if the item is a base column
if item in self.columns:
return super().__getitem__(item)
# If a nested column name is passed, return a flat series for that column
# flat series is chosen over list series for utility
# e.g. native ability to do something like ndf["nested.a"] + 3
elif self._is_known_hierarchical_column(item):
# TODO: only handles one level of nesting for now
nested = item.split(".")[0]
col = ".".join(item.split(".")[1:])
return self[nested].nest.get_flat_series(col)
else:
raise KeyError(f"Column '{item}' not found in nested columns or base columns")
else:
if not isinstance(item, str):
return super().__getitem__(item)

# Preempt the nested check if the item is a base column, with or without
# dots and backticks.
if item in self.columns:
return super().__getitem__(item)
components = self._parse_hierarchical_components(item)
# One more check on the entirety of the item name, in case backticks were used
# (even if they weren't necessary).
cleaned_item = ".".join(components)
if cleaned_item in self.columns:
return super().__getitem__(cleaned_item)
gitosaurus marked this conversation as resolved.
Show resolved Hide resolved

# If a nested column name is passed, return a flat series for that column
# flat series is chosen over list series for utility
# e.g. native ability to do something like ndf["nested.a"] + 3
if self._is_known_hierarchical_column(components):
nested = components[0]
field = ".".join(components[1:])
return self[nested].nest.get_flat_series(field)
else:
raise KeyError(f"Column '{cleaned_item}' not found in nested columns or base columns")

def __setitem__(self, key, value):
"""Adds custom __setitem__ behavior for nested columns"""

components = self._parse_hierarchical_components(key)
# Replacing or adding columns to a nested structure
# Allows statements like ndf["nested.t"] = ndf["nested.t"] - 5
# Or ndf["nested.base_t"] = ndf["nested.t"] - 5
# Performance note: This requires building a new nested structure
# TODO: Support assignment of a new column to an existing nested col from a list series
if self._is_known_hierarchical_column(key) or (
"." in key and key.split(".")[0] in self.nested_columns
if self._is_known_hierarchical_column(components) or (
len(components) > 1 and components[0] in self.nested_columns
):
nested, col = key.split(".")
new_nested_series = self[nested].nest.with_flat_field(col, value)
if len(components) != 2:
raise ValueError(f"Only one level of nesting is supported; given {key}")
gitosaurus marked this conversation as resolved.
Show resolved Hide resolved
nested, field = components
new_nested_series = self[nested].nest.with_flat_field(field, value)
return super().__setitem__(nested, new_nested_series)

# Adding a new nested structure from a column
# Allows statements like ndf["new_nested.t"] = ndf["nested.t"] - 5
if "." in key:
new_nested, col = key.split(".")
if len(components) > 1:
new_nested, field = components
if isinstance(value, pd.Series):
value.name = col
value.name = field
value = value.to_frame()
new_df = self.add_nested(value, name=new_nested)
self._update_inplace(new_df)
Expand Down Expand Up @@ -512,10 +578,15 @@ def eval(self, expr: str, *, inplace: bool = False, **kwargs) -> Any | None:
--------
https://pandas.pydata.org/docs/reference/api/pandas.eval.html
"""
_, aliases = _identify_aliases(expr)
self._aliases: dict[str, str] | None = aliases

kwargs["resolvers"] = tuple(kwargs.get("resolvers", ())) + (_NestResolver(self),)
kwargs["inplace"] = inplace
kwargs["parser"] = "nested-pandas"
return super().eval(expr, **kwargs)
answer = super().eval(expr, **kwargs)
self._aliases = None
return answer

def extract_nest_names(
self,
Expand Down Expand Up @@ -801,7 +872,7 @@ def reduce(self, func, *args, **kwargs) -> NestedFrame: # type: ignore[override
Takes a function and applies it to each top-level row of the NestedFrame.

The user may specify which columns the function is applied to, with
columns from the 'base' layer being passsed to the function as
columns from the 'base' layer being passed to the function as
scalars and columns from the nested layers being passed as numpy arrays.

Parameters
Expand Down Expand Up @@ -838,12 +909,15 @@ def reduce(self, func, *args, **kwargs) -> NestedFrame: # type: ignore[override
# Parse through the initial args to determine the columns to apply the function to
requested_columns = []
for arg in args:
if not isinstance(arg, str) or not self._is_known_column(arg):
# We've reached an argument that is not a valid column, so we assume
# the remaining args are extra arguments to the function
# Stop when we reach an argument that is not a valid column, as we assume
# that the remaining args are extra arguments to the function
if not isinstance(arg, str):
break
components = self._parse_hierarchical_components(arg)
if not self._is_known_column(components):
break
layer = "base" if "." not in arg else arg.split(".")[0]
col = arg.split(".")[-1]
layer = "base" if len(components) < 2 else components[0]
col = components[-1]
requested_columns.append((layer, col))

# We require the first *args to be the columns to apply the function to
Expand Down
77 changes: 77 additions & 0 deletions tests/nested_pandas/nestedframe/test_nestedframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,35 @@ def test_is_known_hierarchical_column():
assert not base._is_known_hierarchical_column("base.a")


def test_is_known_column():
    """
    Test that known (non-hierarchical) columns can be identified.  The key
    point to test is that columns which might look like they are nested,
    but which are already known to not be, are correctly identified.
    """
    base = NestedFrame(data={"R. A.": [1, 2, 3], "nested.b": [2, 4, 6]}, index=[0, 1, 2])
    nested = pd.DataFrame(
        data={"c": [0, 2, 4, 1, 4, 3, 1, 4, 1], "d": [5, 4, 7, 5, 3, 1, 9, 3, 4]},
        index=[0, 0, 0, 1, 1, 1, 2, 2, 2],
    )

    base = base.add_nested(nested, "nested")

    # Base columns are recognized with or without backticks, including a
    # base column whose name merely looks like a nested path ("nested.b").
    assert base._is_known_column("R. A.")
    assert base._is_known_column("`R. A.`")
    assert base._is_known_column("nested.b")

    # In this context, the "." delimiter matters a lot, so the following, which would be
    # acceptable in an .eval() context, is not acceptable here.
    assert not base._is_known_column("nested . b")
    assert not base._is_known_column("nested. c")
    assert not base._is_known_column("nested .d")

    # But hierarchical ones should also work
    assert base._is_known_column("nested.c")
    assert base._is_known_column("nested.d")


def test_get_nested_column():
"""Test that __getitem__ can retrieve a nested column"""

Expand Down Expand Up @@ -167,6 +196,21 @@ def test_get_dot_names():
assert len(nf["nested.R.A."]) == 4


def test_nesting_limit():
    """Test the ability to prevent nesting beyond a depth of 1."""
    nf = NestedFrame.from_flat(
        NestedFrame({"a": [1, 2, 3, 4], ".b.": [1, 1, 3, 3], "R.A.": [3, None, 6, 5]}, index=[1, 1, 2, 2]),
        base_columns=[".b."],
    )
    with pytest.raises(ValueError):
        # The error gets triggered for the attempt to create new nested columns; if the column has
        # already been created, it should be fine.
        nf["nested.c.d.e"] = nf[".b."]
    nf["nested.c"] = nf["nested.R.A."]
    # Test that the above works with backticks, too, even in cases where they are not strictly necessary.
    nf["`nested.d`"] = nf["`.b.`"]


def test_add_nested_with_flat_df():
"""Test that add_nested correctly adds a nested column to the base df"""

Expand Down Expand Up @@ -612,6 +656,17 @@ def test_query_on_non_identifier_columns():
nf3 = nf.query("`bad dog`.a > 2")
assert nf3["bad dog"].nest["a"].size == 4

# And also for fields within the nested columns.
# Taken from GH#176
nf = NestedFrame(data={"dog": [1, 2, 3], "good dog": [2, 4, 6]}, index=[0, 1, 2])
nested = pd.DataFrame(
data={"n/a": [0, 2, 4, 1, 4, 3, 1, 4, 1], "n/b": [5, 4, 7, 5, 3, 1, 9, 3, 4]},
index=[0, 0, 0, 1, 1, 1, 2, 2, 2],
)
nf = nf.add_nested(nested, "bad dog")
nf4 = nf.query("`bad dog`.`n/a` > 2")
assert nf4["bad dog"].nest["n/a"].size == 4


def test_dropna():
"""Test that dropna works on all layers"""
Expand Down Expand Up @@ -815,6 +870,14 @@ def offset_avg(offset, col_to_avg, column_names):
for i in range(len(result)):
assert result["offset_avg"].values[i] == expected_offset_avg[i]

# Verify that we can understand a string argument to the reduce function,
# so long as it isn't a column name.
def make_id(col1, prefix_str):
return f"{prefix_str}{col1}"

result = nf.reduce(make_id, "b", "some_id_")
assert result[0][1] == "some_id_4"


def test_reduce_duplicated_cols():
"""Tests nf.reduce() to correctly handle duplicated column names."""
Expand Down Expand Up @@ -1034,6 +1097,20 @@ def test_eval_assignment():
assert (nf["p2.e"] == nf["packed.d"] * 2 + nf.c).all()
assert (nf["p2.f"] == nf["p2.e"] + nf.b).all()

# Verify that assignment can be done to nested columns and fields
# having names which are not valid Python identifiers, and must
# be quoted with backticks.
nf = NestedFrame(data={"dog": [1, 2, 3], "good dog": [2, 4, 6]}, index=[0, 1, 2])
nested = pd.DataFrame(
data={"n/a": [0, 2, 4, 1, 4, 3, 1, 4, 1], "n/b": [5, 4, 7, 5, 3, 1, 9, 3, 4]},
index=[0, 0, 0, 1, 1, 1, 2, 2, 2],
)
nf = nf.add_nested(nested, "bad dog")
nfx = nf.eval("`bad dog`.`n/c` = `bad dog`.`n/b` + 2.5")
# The number of columns at the top should not have changed
assert len(nfx.columns) == len(nf.columns)
assert (nfx["bad dog"].nest["n/c"] == nf["bad dog"].nest["n/b"] + 2.5).all()


def test_access_non_existing_column():
"""Test that accessing a non-existing column raises a KeyError"""
Expand Down