Skip to content

Commit

Permalink
REF: implement Manager.concat_vertical, concat_horizontal (pandas-dev…
Browse files Browse the repository at this point in the history
…#53066)

* REF: implement Manager.concat_vertical, concat_horizontal

* Update pandas/core/internals/managers.py

Co-authored-by: Matthew Roeschke <[email protected]>

---------

Co-authored-by: Matthew Roeschke <[email protected]>
  • Loading branch information
jbrockmendel and mroeschke authored May 4, 2023
1 parent d3bc372 commit 86a4ee0
Show file tree
Hide file tree
Showing 3 changed files with 125 additions and 102 deletions.
87 changes: 86 additions & 1 deletion pandas/core/internals/array_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
"""
from __future__ import annotations

import itertools
from typing import (
TYPE_CHECKING,
Any,
Expand All @@ -20,9 +21,13 @@
)
from pandas.util._validators import validate_bool_kwarg

from pandas.core.dtypes.astype import astype_array_safe
from pandas.core.dtypes.astype import (
astype_array,
astype_array_safe,
)
from pandas.core.dtypes.cast import (
ensure_dtype_can_hold_na,
find_common_type,
infer_dtype_from_scalar,
)
from pandas.core.dtypes.common import (
Expand Down Expand Up @@ -1132,6 +1137,30 @@ def as_array(

return result

@classmethod
def concat_horizontal(cls, mgrs: list[Self], axes: list[Index]) -> Self:
    """
    Concatenate uniformly-indexed ArrayManagers horizontally.

    Parameters
    ----------
    mgrs : list of ArrayManager
        Managers whose ``arrays`` are gathered, in order, into the result.
    axes : list of Index
        Axes of the result; the first two entries are swapped before
        being handed to the constructor.

    Returns
    -------
    ArrayManager
        New manager built with ``verify_integrity=False``.
    """
    # concatting along the columns -> collect every manager's arrays into one
    # flat list; the arrays themselves are passed through untouched.
    arrays = [arr for mgr in mgrs for arr in mgr.arrays]
    return cls(arrays, [axes[1], axes[0]], verify_integrity=False)

@classmethod
def concat_vertical(cls, mgrs: list[Self], axes: list[Index]) -> Self:
    """
    Concatenate uniformly-indexed ArrayManagers vertically.

    Parameters
    ----------
    mgrs : list of ArrayManager
        Managers to stack; the number of arrays is taken from the first one.
    axes : list of Index
        Axes of the result; the first two entries are swapped before
        being handed to the constructor.

    Returns
    -------
    ArrayManager
        New manager built with ``verify_integrity=False``.
    """
    # concatting along the rows -> concat the reindexed arrays
    # TODO(ArrayManager) doesn't yet preserve the correct dtype
    n_arrays = len(mgrs[0].arrays)
    arrays = [
        # the j-th array of every manager, joined end to end
        concat_arrays([mgr.arrays[j] for mgr in mgrs])
        for j in range(n_arrays)
    ]
    return cls(arrays, [axes[1], axes[0]], verify_integrity=False)


class SingleArrayManager(BaseArrayManager, SingleDataManager):
__slots__ = [
Expand Down Expand Up @@ -1350,3 +1379,59 @@ def to_array(self, dtype: DtypeObj) -> ArrayLike:
arr = np.empty(self.n, dtype=dtype)
arr.fill(fill_value)
return ensure_wrapped_if_datetimelike(arr)


def concat_arrays(to_concat: list) -> ArrayLike:
    """
    Alternative for concat_compat but specialized for use in the ArrayManager.

    Differences: only deals with 1D arrays (no axis keyword), assumes
    ensure_wrapped_if_datetimelike and does not skip empty arrays to determine
    the dtype.
    In addition ensures that all NullArrayProxies get replaced with actual
    arrays.

    Parameters
    ----------
    to_concat : list of arrays

    Returns
    -------
    np.ndarray or ExtensionArray
    """
    # ignore the all-NA proxies to determine the resulting dtype
    to_concat_no_proxy = [x for x in to_concat if not isinstance(x, NullArrayProxy)]

    dtypes = {x.dtype for x in to_concat_no_proxy}

    if len(dtypes) == 1:
        target_dtype = to_concat_no_proxy[0].dtype
    elif all(x.kind in "iub" and isinstance(x, np.dtype) for x in dtypes):
        # GH#42092
        # np.find_common_type is deprecated since NumPy 1.25 and removed in
        # NumPy 2.0; np.result_type applies the same promotion to plain
        # integer / unsigned / bool dtypes.
        # NOTE(review): if every input is a NullArrayProxy, `dtypes` is empty
        # and np.result_type raises ValueError, whereas np.find_common_type
        # returned None -- confirm callers never pass only proxies.
        target_dtype = np.result_type(*dtypes)
    else:
        target_dtype = find_common_type([arr.dtype for arr in to_concat_no_proxy])

    # materialize proxies with the target dtype; cast everything else to it
    to_concat = [
        arr.to_array(target_dtype)
        if isinstance(arr, NullArrayProxy)
        else astype_array(arr, target_dtype, copy=False)
        for arr in to_concat
    ]

    if isinstance(to_concat[0], ExtensionArray):
        # ExtensionArrays are joined by their own class
        cls = type(to_concat[0])
        return cls._concat_same_type(to_concat)

    result = np.concatenate(to_concat)

    # TODO decide on exact behaviour (we shouldn't do this only for empty result)
    # see https://github.com/pandas-dev/pandas/issues/39817
    if len(result) == 0:
        # all empties -> check for bool to not coerce to float
        kinds = {obj.dtype.kind for obj in to_concat_no_proxy}
        if len(kinds) != 1 and "b" in kinds:
            result = result.astype(object)
    return result
109 changes: 8 additions & 101 deletions pandas/core/internals/concat.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
from __future__ import annotations

import itertools
from typing import (
TYPE_CHECKING,
Sequence,
Expand All @@ -20,7 +19,6 @@
from pandas.util._decorators import cache_readonly
from pandas.util._exceptions import find_stack_level

from pandas.core.dtypes.astype import astype_array
from pandas.core.dtypes.cast import (
ensure_dtype_can_hold_na,
find_common_type,
Expand All @@ -38,13 +36,9 @@
isna_all,
)

from pandas.core.arrays import ExtensionArray
from pandas.core.arrays.sparse import SparseDtype
from pandas.core.construction import ensure_wrapped_if_datetimelike
from pandas.core.internals.array_manager import (
ArrayManager,
NullArrayProxy,
)
from pandas.core.internals.array_manager import ArrayManager
from pandas.core.internals.blocks import (
ensure_block_shape,
new_block_2d,
Expand All @@ -59,7 +53,7 @@
ArrayLike,
AxisInt,
DtypeObj,
Manager,
Manager2D,
Shape,
)

Expand All @@ -71,8 +65,8 @@


def _concatenate_array_managers(
mgrs: list[Manager], axes: list[Index], concat_axis: AxisInt
) -> Manager:
mgrs: list[ArrayManager], axes: list[Index], concat_axis: AxisInt
) -> Manager2D:
"""
Concatenate array managers into one.
Expand All @@ -87,80 +81,16 @@ def _concatenate_array_managers(
ArrayManager
"""
if concat_axis == 1:
# concatting along the rows -> concat the reindexed arrays
# TODO(ArrayManager) doesn't yet preserve the correct dtype
arrays = [
concat_arrays([mgrs[i].arrays[j] for i in range(len(mgrs))])
for j in range(len(mgrs[0].arrays))
]
return mgrs[0].concat_vertical(mgrs, axes)
else:
# concatting along the columns -> combine reindexed arrays in a single manager
assert concat_axis == 0
arrays = list(itertools.chain.from_iterable([mgr.arrays for mgr in mgrs]))

new_mgr = ArrayManager(arrays, [axes[1], axes[0]], verify_integrity=False)
return new_mgr


def concat_arrays(to_concat: list) -> ArrayLike:
"""
Alternative for concat_compat but specialized for use in the ArrayManager.
Differences: only deals with 1D arrays (no axis keyword), assumes
ensure_wrapped_if_datetimelike and does not skip empty arrays to determine
the dtype.
In addition ensures that all NullArrayProxies get replaced with actual
arrays.
Parameters
----------
to_concat : list of arrays
Returns
-------
np.ndarray or ExtensionArray
"""
# ignore the all-NA proxies to determine the resulting dtype
to_concat_no_proxy = [x for x in to_concat if not isinstance(x, NullArrayProxy)]

dtypes = {x.dtype for x in to_concat_no_proxy}
single_dtype = len(dtypes) == 1

if single_dtype:
target_dtype = to_concat_no_proxy[0].dtype
elif all(x.kind in "iub" and isinstance(x, np.dtype) for x in dtypes):
# GH#42092
target_dtype = np.find_common_type(list(dtypes), [])
else:
target_dtype = find_common_type([arr.dtype for arr in to_concat_no_proxy])

to_concat = [
arr.to_array(target_dtype)
if isinstance(arr, NullArrayProxy)
else astype_array(arr, target_dtype, copy=False)
for arr in to_concat
]

if isinstance(to_concat[0], ExtensionArray):
cls = type(to_concat[0])
return cls._concat_same_type(to_concat)

result = np.concatenate(to_concat)

# TODO decide on exact behaviour (we shouldn't do this only for empty result)
# see https://github.com/pandas-dev/pandas/issues/39817
if len(result) == 0:
# all empties -> check for bool to not coerce to float
kinds = {obj.dtype.kind for obj in to_concat_no_proxy}
if len(kinds) != 1:
if "b" in kinds:
result = result.astype(object)
return result
return mgrs[0].concat_horizontal(mgrs, axes)


def concatenate_managers(
mgrs_indexers, axes: list[Index], concat_axis: AxisInt, copy: bool
) -> Manager:
) -> Manager2D:
"""
Concatenate block managers into one.
Expand Down Expand Up @@ -196,7 +126,7 @@ def concatenate_managers(

if concat_axis == 0:
mgrs = _maybe_reindex_columns_na_proxy(axes, mgrs_indexers, needs_copy)
return _concat_managers_axis0(mgrs, axes)
return mgrs[0].concat_horizontal(mgrs, axes)

if len(mgrs_indexers) > 0 and mgrs_indexers[0][0].nblocks > 0:
first_dtype = mgrs_indexers[0][0].blocks[0].dtype
Expand Down Expand Up @@ -266,29 +196,6 @@ def concatenate_managers(
return BlockManager(tuple(blocks), axes)


def _concat_managers_axis0(mgrs: list[BlockManager], axes: list[Index]) -> BlockManager:
"""
concat_managers specialized to concat_axis=0, with reindexing already
having been done in _maybe_reindex_columns_na_proxy.
"""

offset = 0
blocks: list[Block] = []
for i, mgr in enumerate(mgrs):
for blk in mgr.blocks:
# We need to do getitem_block here otherwise we would be altering
# blk.mgr_locs in place, which would render it invalid. This is only
# relevant in the copy=False case.
nb = blk.getitem_block(slice(None))
nb._mgr_locs = nb._mgr_locs.add(offset)
blocks.append(nb)

offset += len(mgr.items)

result = BlockManager(tuple(blocks), axes)
return result


def _maybe_reindex_columns_na_proxy(
axes: list[Index],
mgrs_indexers: list[tuple[BlockManager, dict[int, np.ndarray]]],
Expand Down
31 changes: 31 additions & 0 deletions pandas/core/internals/managers.py
Original file line number Diff line number Diff line change
Expand Up @@ -1841,6 +1841,37 @@ def _consolidate_inplace(self) -> None:
self._known_consolidated = True
self._rebuild_blknos_and_blklocs()

# ----------------------------------------------------------------
# Concatenation

@classmethod
def concat_horizontal(cls, mgrs: list[Self], axes: list[Index]) -> Self:
    """
    Concatenate uniformly-indexed BlockManagers horizontally.

    Every block of every manager is carried over in order, with its
    manager locations shifted by the number of items held by the
    managers that precede it.
    """
    shifted_blocks: list[Block] = []
    items_seen = 0
    for manager in mgrs:
        for block in manager.blocks:
            # Work on a fresh block obtained through getitem_block: shifting
            # the original's mgr_locs in place would render it invalid.
            # This is only relevant in the copy=False case.
            shifted = block.getitem_block(slice(None))
            shifted._mgr_locs = shifted._mgr_locs.add(items_seen)
            shifted_blocks.append(shifted)

        # blocks of the next manager start after all of this manager's items
        items_seen += len(manager.items)

    return cls(tuple(shifted_blocks), axes)

@classmethod
def concat_vertical(cls, mgrs: list[Self], axes: list[Index]) -> Self:
    """
    Concatenate uniformly-indexed BlockManagers vertically.

    Not implemented on this class: calling it always raises.

    Raises
    ------
    NotImplementedError
        Always.
    """
    message = "This logic lives (for now) in internals.concat"
    raise NotImplementedError(message)


class SingleBlockManager(BaseBlockManager, SingleDataManager):
"""manage a single block with"""
Expand Down

0 comments on commit 86a4ee0

Please sign in to comment.