diff --git a/src/nested_pandas/series/ext_array.py b/src/nested_pandas/series/ext_array.py index 388e786..0b204eb 100644 --- a/src/nested_pandas/series/ext_array.py +++ b/src/nested_pandas/series/ext_array.py @@ -621,12 +621,11 @@ def list_offsets(self) -> pa.Array: chunks = [] # The offset of the current chunk in the flat array. - # It is 0 for the first chunk, and the last offset of the previous chunk for the next chunks, - # as a pa.Scalar. - chunk_offset: pa.Scalar | int = 0 + # Offset arrays use int32 type, so we cast to it + chunk_offset = pa.scalar(0, type=pa.int32()) for chunk in self._chunked_array.iterchunks(): list_array = cast(pa.ListArray, chunk.field(0)) - if chunk_offset == 0: + if chunk_offset.equals(pa.scalar(0, type=pa.int32())): offsets = list_array.offsets else: offsets = pa.compute.add(list_array.offsets[1:], chunk_offset) diff --git a/src/nested_pandas/series/packer.py b/src/nested_pandas/series/packer.py index c8acb59..ff22930 100644 --- a/src/nested_pandas/series/packer.py +++ b/src/nested_pandas/series/packer.py @@ -297,4 +297,7 @@ def calculate_sorted_index_offsets(index: pd.Index) -> np.ndarray: offset_but_last = np.nonzero(~index.duplicated(keep="first"))[0] offset = np.append(offset_but_last, len(index)) + # Arrow uses int32 for offsets + offset = offset.astype(np.int32) + return offset diff --git a/tests/nested_pandas/series/test_ext_array.py b/tests/nested_pandas/series/test_ext_array.py index c44732c..9110413 100644 --- a/tests/nested_pandas/series/test_ext_array.py +++ b/tests/nested_pandas/series/test_ext_array.py @@ -614,8 +614,9 @@ def test_list_offsets_single_chunk(): ) ext_array = NestedExtensionArray(struct_array) - desired = pa.chunked_array([pa.array([0, 3, 6])]) - assert_array_equal(ext_array.list_offsets, desired) + desired = pa.array([0, 3, 6], type=pa.int32()) + # pyarrow returns a single bool for == + assert ext_array.list_offsets == desired def test_list_offsets_multiple_chunks(): @@ -631,7 +632,8 @@ def test_list_offsets_multiple_chunks(): ext_array = NestedExtensionArray(chunked_arrray) desired = chunked_arrray.combine_chunks().field("a").offsets - assert_array_equal(ext_array.list_offsets, desired) + # pyarrow returns a single bool for == + assert ext_array.list_offsets == desired def test___getitem___with_integer(): diff --git a/tests/nested_pandas/series/test_packer.py b/tests/nested_pandas/series/test_packer.py index a77edf4..55801c0 100644 --- a/tests/nested_pandas/series/test_packer.py +++ b/tests/nested_pandas/series/test_packer.py @@ -8,6 +8,17 @@ from pandas.testing import assert_frame_equal, assert_series_equal +def offsets_reused(nested_series): + """Check if the offset buffers are reused for all columns of the nested series""" + lists_df = nested_series.nest.to_lists() + first_offset_buffers = None + for column in lists_df.columns: + offset_buffers = pa.array(lists_df[column]).offsets.buffers() + if first_offset_buffers is None: + first_offset_buffers = offset_buffers + assert offset_buffers == first_offset_buffers + + def test_pack_with_flat_df(): """Test pack(pd.DataFrame).""" df = pd.DataFrame( @@ -28,6 +39,7 @@ def test_pack_with_flat_df(): dtype=NestedDtype.from_fields(dict(a=pa.int64(), b=pa.int64())), name="series", ) + offsets_reused(series) assert_series_equal(series, desired) @@ -51,6 +63,7 @@ def test_pack_with_flat_df_and_index(): dtype=NestedDtype.from_fields(dict(a=pa.int64(), b=pa.int64())), name="series", ) + offsets_reused(series) assert_series_equal(series, desired) @@ -85,6 +98,7 @@ def test_pack_with_series_of_dfs(): name="nested", dtype=NestedDtype.from_fields(dict(a=pa.int64(), b=pa.int64())), ) + offsets_reused(series) assert_series_equal(series, desired) @@ -109,7 +123,7 @@ def test_pack_flat(): index=[1, 2, 3, 4], dtype=NestedDtype.from_fields(dict(a=pa.int64(), b=pa.int64())), ) - + offsets_reused(actual) assert_series_equal(actual, desired) @@ -134,7 +148,7 @@ def test_pack_sorted_df_into_struct(): index=[1, 2, 3, 4], dtype=NestedDtype.from_fields(dict(a=pa.int64(), b=pa.int64())), ) - + offsets_reused(actual) assert_series_equal(actual, desired) @@ -172,6 +186,7 @@ def test_pack_lists(): dtype=pd.ArrowDtype(pa.list_(pa.int64())), ) series = packer.pack_lists(packed_df) + offsets_reused(series) for field_name in packed_df.columns: assert_series_equal(series.nest.get_list_series(field_name), packed_df[field_name]) @@ -221,6 +236,7 @@ def test_pack_seq_with_dfs_and_index(): index=[100, 101, 102, 103], dtype=NestedDtype.from_fields(dict(a=pa.int64(), b=pa.int64())), ) + offsets_reused(series) assert_series_equal(series, desired) @@ -249,6 +265,7 @@ def test_pack_seq_with_different_elements_and_index(): index=[100, 101, 102, 103], dtype=NestedDtype.from_fields(dict(a=pa.int64(), b=pa.int64())), ) + offsets_reused(series) assert_series_equal(series, desired) @@ -290,6 +307,7 @@ def test_pack_seq_with_series_of_dfs(): dtype=NestedDtype.from_fields(dict(a=pa.int64(), b=pa.int64())), name="series", ) + offsets_reused(series) assert_series_equal(series, desired) @@ -402,7 +420,9 @@ def test_view_sorted_series_as_list_array_raises_when_not_sorted(): ) def test_calculate_sorted_index_offsets(index, offsets): """Test calculate_sorted_index_offsets().""" - assert_array_equal(packer.calculate_sorted_index_offsets(index), offsets) + actual = packer.calculate_sorted_index_offsets(index) + assert actual.dtype == np.int32 + assert_array_equal(actual, offsets) def test_calculate_sorted_index_offsets_raises_when_not_sorted():