From 2ff60d610847ae4a3a983617f70d2138bf0fd239 Mon Sep 17 00:00:00 2001 From: er-eis Date: Fri, 3 May 2024 17:47:57 -0400 Subject: [PATCH] Concatenate dictionary of objects along axis=1 (#15623) Note: This work is heavily based on [amanlai's](https://github.com/amanlai) PR [raised here](https://github.com/rapidsai/cudf/pull/15160). I wasn't able to base my branch on amanlai's because that branch had been deleted. > Closes https://github.com/rapidsai/cudf/issues/15115. >Unlike `pandas.concat`, `cudf.concat` doesn't work with a dictionary of objects. The following code raises an error. ```python d = { 'first': cudf.DataFrame({'A': [1, 2], 'B': [3, 4]}), 'second': cudf.DataFrame({'A': [5, 6], 'B': [7, 8]}), } cudf.concat(d, axis=1) ``` >This commit resolves this issue. Authors: - https://github.com/er-eis - Lawrence Mitchell (https://github.com/wence-) Approvers: - Lawrence Mitchell (https://github.com/wence-) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/15623 --- python/cudf/cudf/core/reshape.py | 192 +++++++++++++++++--------- python/cudf/cudf/tests/test_concat.py | 148 ++++++++++++++++++-- 2 files changed, 268 insertions(+), 72 deletions(-) diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index 9008d2f3a1b..26d91bed173 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -122,9 +122,10 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None): Parameters ---------- - objs : list of DataFrame, Series, or Index + objs : list or dictionary of DataFrame, Series, or Index axis : {0/'index', 1/'columns'}, default 0 The axis to concatenate along. + `axis=1` must be passed if a dictionary is passed. join : {'inner', 'outer'}, default 'outer' How to handle indexes on other axis (or axes). 
ignore_index : bool, default False @@ -231,27 +232,71 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None): letter number animal name 0 a 1 bird polly 1 b 2 monkey george + + Combine a dictionary of DataFrame objects horizontally: + + >>> d = {'first': df1, 'second': df2} + >>> cudf.concat(d, axis=1) + first second + letter number letter number + 0 a 1 c 3 + 1 b 2 d 4 """ # TODO: Do we really need to have different error messages for an empty # list and a list of None? if not objs: raise ValueError("No objects to concatenate") - objs = [obj for obj in objs if obj is not None] - - if not objs: - raise ValueError("All objects passed were None") - axis = _AXIS_MAP.get(axis, None) if axis is None: raise ValueError( f'`axis` must be 0 / "index" or 1 / "columns", got: {axis}' ) + if isinstance(objs, dict): + if axis != 1: + raise NotImplementedError( + f"Can only concatenate dictionary input along axis=1, not {axis}" + ) + objs = {k: obj for k, obj in objs.items() if obj is not None} + keys = list(objs) + objs = list(objs.values()) + if any(isinstance(o, cudf.BaseIndex) for o in objs): + raise TypeError( + "cannot concatenate a dictionary containing indices" + ) + else: + objs = [obj for obj in objs if obj is not None] + keys = None + + if not objs: + raise ValueError("All objects passed were None") + + # Retrieve the base types of `objs`. 
In order to support sub-types + # and object wrappers, we use `isinstance()` instead of comparing + # types directly + allowed_typs = { + cudf.Series, + cudf.DataFrame, + cudf.BaseIndex, + } + if not all(isinstance(o, tuple(allowed_typs)) for o in objs): + raise TypeError( + f"can only concatenate objects which are instances of " + f"{allowed_typs}, instead received {[type(o) for o in objs]}" + ) + + if any(isinstance(o, cudf.BaseIndex) for o in objs): + if not all(isinstance(o, cudf.BaseIndex) for o in objs): + raise TypeError( + "when concatenating indices you must provide ONLY indices" + ) + + only_series = all(isinstance(o, cudf.Series) for o in objs) + # Return for single object if len(objs) == 1: obj = objs[0] - if ignore_index: if axis == 1: result = cudf.DataFrame._from_data( @@ -290,6 +335,15 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None): result = cudf.DataFrame._from_data( data, index=obj.index.copy(deep=True) ) + if keys is not None: + if isinstance(result, cudf.DataFrame): + k = keys[0] + result.columns = cudf.MultiIndex.from_tuples( + [ + (k, *c) if isinstance(c, tuple) else (k, c) + for c in result._column_names + ] + ) if isinstance(result, cudf.Series) and axis == 0: # sort has no effect for series concatted along axis 0 @@ -297,27 +351,9 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None): else: return result.sort_index(axis=(1 - axis)) if sort else result - # Retrieve the base types of `objs`. 
In order to support sub-types - # and object wrappers, we use `isinstance()` instead of comparing - # types directly - typs = set() - for o in objs: - if isinstance(o, cudf.MultiIndex): - typs.add(cudf.MultiIndex) - elif isinstance(o, cudf.BaseIndex): - typs.add(type(o)) - elif isinstance(o, cudf.DataFrame): - typs.add(cudf.DataFrame) - elif isinstance(o, cudf.Series): - typs.add(cudf.Series) - else: - raise TypeError(f"cannot concatenate object of type {type(o)}") - - allowed_typs = {cudf.Series, cudf.DataFrame} - # when axis is 1 (column) we can concat with Series and Dataframes if axis == 1: - if not typs.issubset(allowed_typs): + if not all(isinstance(o, (cudf.Series, cudf.DataFrame)) for o in objs): raise TypeError( "Can only concatenate Series and DataFrame objects when axis=1" ) @@ -353,35 +389,71 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None): objs = _align_objs(objs, how=join, sort=sort) df.index = objs[0].index - for o in objs: - for name, col in o._data.items(): - if name in df._data: - raise NotImplementedError( - f"A Column with duplicate name found: {name}, cuDF " - f"doesn't support having multiple columns with " - f"same names yet." - ) - if empty_inner: - # if join is inner and it contains an empty df - # we return an empty df, hence creating an empty - # column with dtype metadata retained. - df[name] = cudf.core.column.column_empty_like( - col, newsize=0 - ) - else: - df[name] = col - - result_columns = ( - objs[0] - ._data.to_pandas_index() - .append([obj._data.to_pandas_index() for obj in objs[1:]]) - ) + if keys is None: + for o in objs: + for name, col in o._data.items(): + if name in df._data: + raise NotImplementedError( + f"A Column with duplicate name found: {name}, cuDF " + f"doesn't support having multiple columns with " + f"same names yet." + ) + if empty_inner: + # if join is inner and it contains an empty df + # we return an empty df, hence creating an empty + # column with dtype metadata retained. 
+ df[name] = cudf.core.column.column_empty_like( + col, newsize=0 + ) + else: + df[name] = col + + result_columns = ( + objs[0] + ._data.to_pandas_index() + .append([obj._data.to_pandas_index() for obj in objs[1:]]) + .unique() + ) - if ignore_index: - # with ignore_index the column names change to numbers - df.columns = pd.RangeIndex(len(result_columns.unique())) + # need to create a MultiIndex column else: + # All levels in the multiindex label must have the same type + has_multiple_level_types = ( + len({type(name) for o in objs for name in o._data.keys()}) > 1 + ) + if has_multiple_level_types: + raise NotImplementedError( + "Cannot construct a MultiIndex column with multiple " + "label types in cuDF at this time. You must convert " + "the labels to the same type." + ) + for k, o in zip(keys, objs): + for name, col in o._data.items(): + # if only series, then only keep keys as column labels + # if the existing column is multiindex, prepend it + # to handle cases where dfs and srs are concatenated + if only_series: + col_label = k + elif isinstance(name, tuple): + col_label = (k, *name) + else: + col_label = (k, name) + if empty_inner: + df[col_label] = cudf.core.column.column_empty_like( + col, newsize=0 + ) + else: + df[col_label] = col + + if keys is None: df.columns = result_columns.unique() + if ignore_index: + df.columns = cudf.RangeIndex(len(result_columns.unique())) + elif ignore_index: + # with ignore_index the column names change to numbers + df.columns = cudf.RangeIndex(len(result_columns)) + elif not only_series: + df.columns = cudf.MultiIndex.from_tuples(df._column_names) if empty_inner: # if join is inner and it contains an empty df @@ -391,18 +463,10 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None): return df # If we get here, we are always concatenating along axis 0 (the rows). 
- typ = list(typs)[0] - if len(typs) > 1: - if allowed_typs == typs: - # This block of code will run when `objs` has - # both Series & DataFrame kind of inputs. - _normalize_series_and_dataframe(objs, axis=axis) - typ = cudf.DataFrame - else: - raise TypeError( - f"`concat` cannot concatenate objects of " - f"types: {sorted([t.__name__ for t in typs])}." - ) + typ = type(objs[0]) + if len({type(o) for o in objs}) > 1: + _normalize_series_and_dataframe(objs, axis=axis) + typ = cudf.DataFrame if typ is cudf.DataFrame: old_objs = objs diff --git a/python/cudf/cudf/tests/test_concat.py b/python/cudf/cudf/tests/test_concat.py index 87b3beb5589..4b43a33c8c8 100644 --- a/python/cudf/cudf/tests/test_concat.py +++ b/python/cudf/cudf/tests/test_concat.py @@ -218,7 +218,8 @@ def test_concat_columns(axis): assert_eq(expect, got, check_index_type=True) -def test_concat_multiindex_dataframe(): +@pytest.mark.parametrize("axis", [0, 1]) +def test_concat_multiindex_dataframe(axis): gdf = cudf.DataFrame( { "w": np.arange(4), @@ -233,14 +234,11 @@ def test_concat_multiindex_dataframe(): pdg2 = pdg.iloc[:, 1:] gdg1 = cudf.from_pandas(pdg1) gdg2 = cudf.from_pandas(pdg2) + expected = pd.concat([pdg1, pdg2], axis=axis) + result = cudf.concat([gdg1, gdg2], axis=axis) assert_eq( - cudf.concat([gdg1, gdg2]).astype("float64"), - pd.concat([pdg1, pdg2]), - check_index_type=True, - ) - assert_eq( - cudf.concat([gdg1, gdg2], axis=1), - pd.concat([pdg1, pdg2], axis=1), + expected, + result, check_index_type=True, ) @@ -1865,3 +1863,137 @@ def test_concat_mixed_list_types_error(s1, s2): with pytest.raises(NotImplementedError): cudf.concat([s1, s2], ignore_index=True) + + +@pytest.mark.parametrize( + "axis", + [ + pytest.param( + 0, + marks=pytest.mark.xfail( + reason="concat dictionaries with axis=0 not implemented" + ), + ), + 1, + "columns", + ], +) +@pytest.mark.parametrize( + "d", + [ + {"first": (cudf.DataFrame, {"data": {"A": [1, 2], "B": [3, 4]}})}, + { + "first": (cudf.DataFrame, {"data": 
{"A": [1, 2], "B": [3, 4]}}), + "second": (cudf.DataFrame, {"data": {"A": [5, 6], "B": [7, 8]}}), + "third": (cudf.DataFrame, {"data": {"C": [1, 2, 3]}}), + }, + { + "first": (cudf.DataFrame, {"data": {"A": [1, 2], "B": [3, 4]}}), + "second": (cudf.DataFrame, {"data": {"C": [1, 2, 3]}}), + "third": (cudf.DataFrame, {"data": {"A": [5, 6], "B": [7, 8]}}), + }, + { + "first": (cudf.DataFrame, {"data": {"A": [1, 2], "B": [3, 4]}}), + "second": (cudf.DataFrame, {"data": {"C": [1, 2, 3]}}), + "third": (cudf.DataFrame, {"data": {"A": [5, 6], "C": [7, 8]}}), + "fourth": (cudf.DataFrame, {"data": {"B": [9, 10]}}), + }, + pytest.param( + { + "first": (cudf.DataFrame, {"data": {2.0: [1, 1]}}), + "second": (cudf.DataFrame, {"data": {"test": ["abc", "def"]}}), + }, + marks=pytest.mark.xfail( + reason=( + "Cannot construct a MultiIndex column with multiple " + "label types in cuDF at this time. You must convert " + "the labels to the same type." + ) + ), + ), + { + "first": (cudf.Series, {"data": [1, 2, 3]}), + "second": (cudf.Series, {"data": [4, 5, 6]}), + }, + { + "first": (cudf.DataFrame, {"data": {"A": [1, 2], "B": [3, 4]}}), + "second": (cudf.Series, {"data": [5, 6], "name": "C"}), + }, + pytest.param( + { + "first": ( + cudf.DataFrame, + {"data": {("A", "B"): [1, 2], "C": [3, 4]}}, + ), + "second": ( + cudf.DataFrame, + {"data": {"D": [5, 6], ("A", "B"): [7, 8]}}, + ), + }, + marks=pytest.mark.xfail( + reason=( + "Cannot construct a MultiIndex column with multiple " + "label types in cuDF at this time. You must convert " + "the labels to the same type." + ) + ), + ), + pytest.param( + { + "first": ( + cudf.DataFrame, + {"data": {("A", "B"): [3, 4], 2.0: [1, 1]}}, + ), + "second": ( + cudf.DataFrame, + {"data": {("C", "D"): [3, 4], 3.0: [5, 6]}}, + ), + }, + marks=pytest.mark.xfail( + reason=( + "Cannot construct a MultiIndex column with multiple " + "label types in cuDF at this time. You must convert " + "the labels to the same type." 
+ ) + ), + ), + { + "first": ( + cudf.DataFrame, + {"data": {(1, 2): [1, 2], (3, 4): [3, 4]}}, + ), + "second": ( + cudf.DataFrame, + {"data": {(1, 2): [5, 6], (5, 6): [7, 8]}}, + ), + }, + ], +) +def test_concat_dictionary(d, axis): + _dict = {k: c(**v) for k, (c, v) in d.items()} + result = cudf.concat(_dict, axis=axis) + expected = cudf.from_pandas( + pd.concat({k: df.to_pandas() for k, df in _dict.items()}, axis=axis) + ) + assert_eq(expected, result) + + +@pytest.mark.parametrize( + "d", + [ + {"first": cudf.Index([1, 2, 3])}, + { + "first": cudf.MultiIndex( + levels=[[1, 2], ["blue", "red"]], + codes=[[0, 0, 1, 1], [1, 0, 1, 0]], + ) + }, + {"first": cudf.CategoricalIndex([1, 2, 3])}, + ], +) +def test_concat_dict_incorrect_type_index(d): + with pytest.raises( + TypeError, + match="cannot concatenate a dictionary containing indices", + ): + cudf.concat(d, axis=1)