Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow nested field names to be escaped with backticks. #177

Merged
merged 8 commits into from
Nov 27, 2024
155 changes: 113 additions & 42 deletions src/nested_pandas/nestedframe/core.py
gitosaurus marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

import ast
import os
import re

import numpy as np
import pandas as pd
Expand Down Expand Up @@ -83,9 +84,9 @@
super().__init__()
# Pre-load the field resolvers for all columns which are known at present.
for column in outer.nested_columns:
self._initialize_field_resolver(column, outer)
self._initialize_column_resolver(column, outer)

def _initialize_field_resolver(self, column: str, outer: NestedFrame):
def _initialize_column_resolver(self, column: str, outer: NestedFrame):
"""
Initialize a resolver for the given nested column, and also an alias
for it, in the case of column names that have spaces or are otherwise
Expand All @@ -107,7 +108,7 @@
if not super().__contains__(top_nest):
if top_nest not in self._outer.nested_columns:
raise KeyError(f"Unknown nest {top_nest}")
self._initialize_field_resolver(top_nest, self._outer)
self._initialize_column_resolver(top_nest, self._outer)
return super().__getitem__(top_nest)

def __setitem__(self, item, _):
Expand Down Expand Up @@ -139,8 +140,20 @@
# Flattened only once for every access of this particular nest
# within the expression.
self._flat_nest = outer[nest_name].nest.to_flat()
# Save aliases to any columns that are not identifier-like.
# If our given frame has aliases for identifiers, use these instead
# of generating our own.
self._aliases = getattr(outer, "_aliases", None)
if self._aliases is None:
self._aliases = {}
for column in self._flat_nest.columns:
clean_id = clean_column_name(column)
if clean_id != column:
self._aliases[clean_id] = column

def __getattr__(self, item_name: str):
if self._aliases:
item_name = self._aliases.get(item_name, item_name)
if item_name in self._flat_nest:
result = _SeriesFromNest(self._flat_nest[item_name])
# Assigning these properties directly in order to avoid any complication
Expand Down Expand Up @@ -193,6 +206,26 @@
return result


def _identify_aliases(expr: str) -> tuple[str, dict[str, str]]:
gitosaurus marked this conversation as resolved.
Show resolved Hide resolved
"""
Given an expression string, identify backtick-quoted names
and replace them with cleaned names, returning the cleaned
expression and a dictionary of aliases, where the keys are
clean aliases to the original names.
"""
aliases = {}
pattern = re.compile(r"`[^`]+`", re.MULTILINE)

def sub_and_alias(match):
original = match.group(0)[1:-1] # remove backticks
alias = clean_column_name(original)
if alias != original:
aliases[alias] = original
return alias

return pattern.sub(sub_and_alias, expr), aliases


class NestedFrame(pd.DataFrame):
"""A Pandas Dataframe extension with support for nested structure.

Expand All @@ -205,6 +238,12 @@
# Series produce instances of this class, preserving the type and origin.
__pandas_priority__ = 4500

# The "_aliases" attribute is usually None or not even present, but when it is present,
# it indicates that an evaluation is in progress, and that columns and fields with names
# that are not identifier-like have been aliased to cleaned names, and this attribute
# contains those aliases, keyed by the cleaned name.
_metadata = ["_aliases"]

@property
def _constructor(self) -> Self: # type: ignore[name-defined] # noqa: F821
return NestedFrame
Expand Down Expand Up @@ -232,62 +271,86 @@
nest_cols.append(column)
return nest_cols

def _is_known_hierarchical_column(self, colname) -> bool:
def _parse_hierarchical_components(self, delimited_path: str, delimiter: str = ".") -> list[str]:
"""
Given a string that may be a delimited path, parse it into its components,
respecting backticks that are used to protect component names that may contain the delimiter.
"""
aliases = getattr(self, "_aliases", None)
if aliases is None:
delimited_path, aliases = _identify_aliases(delimited_path)
return [aliases.get(x, x) for x in delimited_path.split(delimiter)]

def _is_known_hierarchical_column(self, components: list[str] | str) -> bool:
"""Determine whether a string is a known hierarchical column name"""
if "." in colname:
base_name = colname.split(".")[0]
if base_name in self.nested_columns:
# TODO: only handles one level of nesting for now
nested_name = ".".join(colname.split(".")[1:])
return nested_name in self.all_columns[base_name]
if isinstance(components, str):
components = self._parse_hierarchical_components(components)
if len(components) < 2:
return False
base_name = components[0]
if base_name in self.nested_columns:
nested_name = ".".join(components[1:])
return nested_name in self.all_columns[base_name]
return False

def _is_known_column(self, colname) -> bool:
"""Determine whether a string is a known column name"""
return colname in self.columns or self._is_known_hierarchical_column(colname)
def _is_known_column(self, components: list[str] | str) -> bool:
"""Determine whether a list of field components describes a known column name"""
if isinstance(components, str):
components = self._parse_hierarchical_components(components)

Check warning on line 299 in src/nested_pandas/nestedframe/core.py

View check run for this annotation

Codecov / codecov/patch

src/nested_pandas/nestedframe/core.py#L299

Added line #L299 was not covered by tests
if ".".join(components) in self.columns:
return True
return self._is_known_hierarchical_column(components)

def __getitem__(self, item):
    """Adds custom __getitem__ behavior for nested columns

    Non-string keys are passed straight to the parent implementation.  A
    string key is looked up first as a base column (both verbatim and after
    backtick handling), then as a ``nested.field`` path.

    Raises
    ------
    KeyError
        If a string key is neither a base column nor a known nested field.
    """
    if not isinstance(item, str):
        return super().__getitem__(item)

    # Preempt the nested check if the item is a base column, with or without
    # dots and backticks.
    if item in self.columns:
        return super().__getitem__(item)
    components = self._parse_hierarchical_components(item)
    # One more check on the entirety of the item name, in case backticks were used
    # (even if they weren't necessary).
    cleaned_item = ".".join(components)
    if cleaned_item in self.columns:
        return super().__getitem__(cleaned_item)

    # If a nested column name is passed, return a flat series for that column
    # flat series is chosen over list series for utility
    # e.g. native ability to do something like ndf["nested.a"] + 3
    if self._is_known_hierarchical_column(components):
        nested = components[0]
        field = ".".join(components[1:])
        return self[nested].nest.get_flat_series(field)
    raise KeyError(f"Column '{cleaned_item}' not found in nested columns or base columns")

def __setitem__(self, key, value):
"""Adds custom __setitem__ behavior for nested columns"""

components = self._parse_hierarchical_components(key)
# Replacing or adding columns to a nested structure
# Allows statements like ndf["nested.t"] = ndf["nested.t"] - 5
# Or ndf["nested.base_t"] = ndf["nested.t"] - 5
# Performance note: This requires building a new nested structure
# TODO: Support assignment of a new column to an existing nested col from a list series
if self._is_known_hierarchical_column(key) or (
"." in key and key.split(".")[0] in self.nested_columns
if self._is_known_hierarchical_column(components) or (
len(components) > 1 and components[0] in self.nested_columns
):
nested, col = key.split(".")
new_nested_series = self[nested].nest.with_flat_field(col, value)
if len(components) != 2:
raise ValueError(f"Only one level of nesting is supported; given {key}")

Check warning on line 343 in src/nested_pandas/nestedframe/core.py

View check run for this annotation

Codecov / codecov/patch

src/nested_pandas/nestedframe/core.py#L343

Added line #L343 was not covered by tests
gitosaurus marked this conversation as resolved.
Show resolved Hide resolved
nested, field = components
new_nested_series = self[nested].nest.with_flat_field(field, value)
return super().__setitem__(nested, new_nested_series)

# Adding a new nested structure from a column
# Allows statements like ndf["new_nested.t"] = ndf["nested.t"] - 5
if "." in key:
new_nested, col = key.split(".")
if len(components) > 1:
new_nested, field = components
if isinstance(value, pd.Series):
value.name = col
value.name = field
value = value.to_frame()
new_df = self.add_nested(value, name=new_nested)
self._update_inplace(new_df)
Expand Down Expand Up @@ -512,10 +575,15 @@
--------
https://pandas.pydata.org/docs/reference/api/pandas.eval.html
"""
_, aliases = _identify_aliases(expr)
self._aliases: dict[str, str] | None = aliases

kwargs["resolvers"] = tuple(kwargs.get("resolvers", ())) + (_NestResolver(self),)
kwargs["inplace"] = inplace
kwargs["parser"] = "nested-pandas"
return super().eval(expr, **kwargs)
answer = super().eval(expr, **kwargs)
self._aliases = None
return answer

def extract_nest_names(
self,
Expand Down Expand Up @@ -838,12 +906,15 @@
# Parse through the initial args to determine the columns to apply the function to
requested_columns = []
for arg in args:
if not isinstance(arg, str) or not self._is_known_column(arg):
# We've reached an argument that is not a valid column, so we assume
# the remaining args are extra arguments to the function
# Stop when we reach an argument that is not a valid column, as we assume
# that the remaining args are extra arguments to the function
if not isinstance(arg, str):
break
components = self._parse_hierarchical_components(arg)
if not self._is_known_column(components):
break
layer = "base" if "." not in arg else arg.split(".")[0]
col = arg.split(".")[-1]
layer = "base" if len(components) < 2 else components[0]
col = components[-1]
requested_columns.append((layer, col))

# We require the first *args to be the columns to apply the function to
Expand Down
25 changes: 25 additions & 0 deletions tests/nested_pandas/nestedframe/test_nestedframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -612,6 +612,17 @@ def test_query_on_non_identifier_columns():
nf3 = nf.query("`bad dog`.a > 2")
assert nf3["bad dog"].nest["a"].size == 4

# And also for fields within the nested columns.
# Taken from GH#176
nf = NestedFrame(data={"dog": [1, 2, 3], "good dog": [2, 4, 6]}, index=[0, 1, 2])
nested = pd.DataFrame(
data={"n/a": [0, 2, 4, 1, 4, 3, 1, 4, 1], "n/b": [5, 4, 7, 5, 3, 1, 9, 3, 4]},
index=[0, 0, 0, 1, 1, 1, 2, 2, 2],
)
nf = nf.add_nested(nested, "bad dog")
nf4 = nf.query("`bad dog`.`n/a` > 2")
assert nf4["bad dog"].nest["n/a"].size == 4


def test_dropna():
"""Test that dropna works on all layers"""
Expand Down Expand Up @@ -1034,6 +1045,20 @@ def test_eval_assignment():
assert (nf["p2.e"] == nf["packed.d"] * 2 + nf.c).all()
assert (nf["p2.f"] == nf["p2.e"] + nf.b).all()

# Verify that assignment can be done to nested columns and fields
# having names which are not valid Python identifiers, and must
# be quoted with backticks.
nf = NestedFrame(data={"dog": [1, 2, 3], "good dog": [2, 4, 6]}, index=[0, 1, 2])
nested = pd.DataFrame(
data={"n/a": [0, 2, 4, 1, 4, 3, 1, 4, 1], "n/b": [5, 4, 7, 5, 3, 1, 9, 3, 4]},
index=[0, 0, 0, 1, 1, 1, 2, 2, 2],
)
nf = nf.add_nested(nested, "bad dog")
nfx = nf.eval("`bad dog`.`n/c` = `bad dog`.`n/b` + 2.5")
# The number of columns at the top should not have changed
assert len(nfx.columns) == len(nf.columns)
assert (nfx["bad dog"].nest["n/c"] == nf["bad dog"].nest["n/b"] + 2.5).all()


def test_access_non_existing_column():
"""Test that accessing a non-existing column raises a KeyError"""
Expand Down