Skip to content

Commit

Permalink
Merge pull request #93 from lincc-frameworks/int32-for-offsets
Browse files Browse the repository at this point in the history
Use Int32 for offset arrays
  • Loading branch information
hombit authored May 30, 2024
2 parents f861313 + 952c98d commit aadb12f
Show file tree
Hide file tree
Showing 4 changed files with 34 additions and 10 deletions.
7 changes: 3 additions & 4 deletions src/nested_pandas/series/ext_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -621,12 +621,11 @@ def list_offsets(self) -> pa.Array:

chunks = []
# The offset of the current chunk in the flat array.
# It is 0 for the first chunk, and the last offset of the previous chunk for the next chunks,
# as a pa.Scalar.
chunk_offset: pa.Scalar | int = 0
# List offset arrays are int32, so the running chunk offset is kept as an int32 scalar as well
chunk_offset = pa.scalar(0, type=pa.int32())
for chunk in self._chunked_array.iterchunks():
list_array = cast(pa.ListArray, chunk.field(0))
if chunk_offset == 0:
if chunk_offset.equals(pa.scalar(0, type=pa.int32())):
offsets = list_array.offsets
else:
offsets = pa.compute.add(list_array.offsets[1:], chunk_offset)
Expand Down
3 changes: 3 additions & 0 deletions src/nested_pandas/series/packer.py
Original file line number Diff line number Diff line change
Expand Up @@ -297,4 +297,7 @@ def calculate_sorted_index_offsets(index: pd.Index) -> np.ndarray:
offset_but_last = np.nonzero(~index.duplicated(keep="first"))[0]
offset = np.append(offset_but_last, len(index))

# Arrow uses int32 for offsets
offset = offset.astype(np.int32)

return offset
8 changes: 5 additions & 3 deletions tests/nested_pandas/series/test_ext_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -614,8 +614,9 @@ def test_list_offsets_single_chunk():
)
ext_array = NestedExtensionArray(struct_array)

desired = pa.chunked_array([pa.array([0, 3, 6])])
assert_array_equal(ext_array.list_offsets, desired)
desired = pa.array([0, 3, 6], type=pa.int32())
# `==` between two pyarrow arrays returns a single bool (whole-array equality), so a plain assert works
assert ext_array.list_offsets == desired


def test_list_offsets_multiple_chunks():
Expand All @@ -631,7 +632,8 @@ def test_list_offsets_multiple_chunks():
ext_array = NestedExtensionArray(chunked_arrray)

desired = chunked_arrray.combine_chunks().field("a").offsets
assert_array_equal(ext_array.list_offsets, desired)
# `==` between two pyarrow arrays returns a single bool (whole-array equality), so a plain assert works
assert ext_array.list_offsets == desired


def test___getitem___with_integer():
Expand Down
26 changes: 23 additions & 3 deletions tests/nested_pandas/series/test_packer.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,17 @@
from pandas.testing import assert_frame_equal, assert_series_equal


def offsets_reused(nested_series):
    """Check if the offset buffers are reused for all columns of the nested series.

    The series is converted to a data frame of list columns with
    ``nested_series.nest.to_lists()``; the offset buffers of every column are
    then compared against those of the first column.

    Two checks are made for each column:

    1. the buffers hold the same data (``pa.Buffer.__eq__`` compares contents);
    2. the buffers live at the same memory address, i.e. they really are the
       same buffers and not equal-valued copies.

    Raises
    ------
    AssertionError
        If any column's offset buffers differ from the first column's, either
        by content or by memory location.
    """

    def addresses(buffers):
        # A missing buffer (e.g. no validity bitmap) is represented by None.
        return [None if buf is None else buf.address for buf in buffers]

    lists_df = nested_series.nest.to_lists()
    first_column = None
    first_offset_buffers = None
    for column in lists_df.columns:
        offset_buffers = pa.array(lists_df[column]).offsets.buffers()
        if first_offset_buffers is None:
            first_column = column
            first_offset_buffers = offset_buffers
            continue
        assert (
            offset_buffers == first_offset_buffers
        ), f"Offsets of column {column!r} differ from offsets of column {first_column!r}"
        # Content equality alone would also pass for copied buffers, so compare
        # the memory addresses to make sure the buffers are actually shared.
        assert addresses(offset_buffers) == addresses(
            first_offset_buffers
        ), f"Offset buffers of column {column!r} are a copy, not shared with column {first_column!r}"


def test_pack_with_flat_df():
"""Test pack(pd.DataFrame)."""
df = pd.DataFrame(
Expand All @@ -28,6 +39,7 @@ def test_pack_with_flat_df():
dtype=NestedDtype.from_fields(dict(a=pa.int64(), b=pa.int64())),
name="series",
)
offsets_reused(series)
assert_series_equal(series, desired)


Expand All @@ -51,6 +63,7 @@ def test_pack_with_flat_df_and_index():
dtype=NestedDtype.from_fields(dict(a=pa.int64(), b=pa.int64())),
name="series",
)
offsets_reused(series)
assert_series_equal(series, desired)


Expand Down Expand Up @@ -85,6 +98,7 @@ def test_pack_with_series_of_dfs():
name="nested",
dtype=NestedDtype.from_fields(dict(a=pa.int64(), b=pa.int64())),
)
offsets_reused(series)
assert_series_equal(series, desired)


Expand All @@ -109,7 +123,7 @@ def test_pack_flat():
index=[1, 2, 3, 4],
dtype=NestedDtype.from_fields(dict(a=pa.int64(), b=pa.int64())),
)

offsets_reused(actual)
assert_series_equal(actual, desired)


Expand All @@ -134,7 +148,7 @@ def test_pack_sorted_df_into_struct():
index=[1, 2, 3, 4],
dtype=NestedDtype.from_fields(dict(a=pa.int64(), b=pa.int64())),
)

offsets_reused(actual)
assert_series_equal(actual, desired)


Expand Down Expand Up @@ -172,6 +186,7 @@ def test_pack_lists():
dtype=pd.ArrowDtype(pa.list_(pa.int64())),
)
series = packer.pack_lists(packed_df)
offsets_reused(series)

for field_name in packed_df.columns:
assert_series_equal(series.nest.get_list_series(field_name), packed_df[field_name])
Expand Down Expand Up @@ -221,6 +236,7 @@ def test_pack_seq_with_dfs_and_index():
index=[100, 101, 102, 103],
dtype=NestedDtype.from_fields(dict(a=pa.int64(), b=pa.int64())),
)
offsets_reused(series)
assert_series_equal(series, desired)


Expand Down Expand Up @@ -249,6 +265,7 @@ def test_pack_seq_with_different_elements_and_index():
index=[100, 101, 102, 103],
dtype=NestedDtype.from_fields(dict(a=pa.int64(), b=pa.int64())),
)
offsets_reused(series)
assert_series_equal(series, desired)


Expand Down Expand Up @@ -290,6 +307,7 @@ def test_pack_seq_with_series_of_dfs():
dtype=NestedDtype.from_fields(dict(a=pa.int64(), b=pa.int64())),
name="series",
)
offsets_reused(series)
assert_series_equal(series, desired)


Expand Down Expand Up @@ -402,7 +420,9 @@ def test_view_sorted_series_as_list_array_raises_when_not_sorted():
)
def test_calculate_sorted_index_offsets(index, offsets):
"""Test calculate_sorted_index_offsets()."""
assert_array_equal(packer.calculate_sorted_index_offsets(index), offsets)
actual = packer.calculate_sorted_index_offsets(index)
assert actual.dtype == np.int32
assert_array_equal(actual, offsets)


def test_calculate_sorted_index_offsets_raises_when_not_sorted():
Expand Down

0 comments on commit aadb12f

Please sign in to comment.