Skip to content

Commit

Permalink
Concatenate dictionary of objects along axis=1 (#15623)
Browse files Browse the repository at this point in the history
Note: This work is heavily based on [amanlai's](https://github.com/amanlai) PR [raised here](#15160). I wasn't able to base my branch on amanlai's because that branch had been deleted.
 
> Closes #15115.
> Unlike `pandas.concat`, `cudf.concat` doesn't work with a dictionary of objects. The following code raises an error:
```python
d = {
    'first': cudf.DataFrame({'A': [1, 2], 'B': [3, 4]}),
    'second': cudf.DataFrame({'A': [5, 6], 'B': [7, 8]}),
}

cudf.concat(d, axis=1)
```
> This commit resolves this issue by allowing a dictionary of objects to be concatenated along `axis=1`.

Authors:
  - https://github.com/er-eis
  - Lawrence Mitchell (https://github.com/wence-)

Approvers:
  - Lawrence Mitchell (https://github.com/wence-)
  - Bradley Dice (https://github.com/bdice)

URL: #15623
  • Loading branch information
er-eis authored May 3, 2024
1 parent 09f8ff3 commit 2ff60d6
Show file tree
Hide file tree
Showing 2 changed files with 268 additions and 72 deletions.
192 changes: 128 additions & 64 deletions python/cudf/cudf/core/reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,9 +122,10 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None):
Parameters
----------
objs : list of DataFrame, Series, or Index
objs : list or dictionary of DataFrame, Series, or Index
axis : {0/'index', 1/'columns'}, default 0
The axis to concatenate along.
`axis=1` must be passed if a dictionary is passed.
join : {'inner', 'outer'}, default 'outer'
How to handle indexes on other axis (or axes).
ignore_index : bool, default False
Expand Down Expand Up @@ -231,27 +232,71 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None):
letter number animal name
0 a 1 bird polly
1 b 2 monkey george
Combine a dictionary of DataFrame objects horizontally:
>>> d = {'first': df1, 'second': df2}
>>> cudf.concat(d, axis=1)
first second
letter number letter number
0 a 1 c 3
1 b 2 d 4
"""
# TODO: Do we really need to have different error messages for an empty
# list and a list of None?
if not objs:
raise ValueError("No objects to concatenate")

objs = [obj for obj in objs if obj is not None]

if not objs:
raise ValueError("All objects passed were None")

axis = _AXIS_MAP.get(axis, None)
if axis is None:
raise ValueError(
f'`axis` must be 0 / "index" or 1 / "columns", got: {axis}'
)

if isinstance(objs, dict):
if axis != 1:
raise NotImplementedError(
f"Can only concatenate dictionary input along axis=1, not {axis}"
)
objs = {k: obj for k, obj in objs.items() if obj is not None}
keys = list(objs)
objs = list(objs.values())
if any(isinstance(o, cudf.BaseIndex) for o in objs):
raise TypeError(
"cannot concatenate a dictionary containing indices"
)
else:
objs = [obj for obj in objs if obj is not None]
keys = None

if not objs:
raise ValueError("All objects passed were None")

# Retrieve the base types of `objs`. In order to support sub-types
# and object wrappers, we use `isinstance()` instead of comparing
# types directly
allowed_typs = {
cudf.Series,
cudf.DataFrame,
cudf.BaseIndex,
}
if not all(isinstance(o, tuple(allowed_typs)) for o in objs):
raise TypeError(
f"can only concatenate objects which are instances of "
f"{allowed_typs}, instead received {[type(o) for o in objs]}"
)

if any(isinstance(o, cudf.BaseIndex) for o in objs):
if not all(isinstance(o, cudf.BaseIndex) for o in objs):
raise TypeError(
"when concatenating indices you must provide ONLY indices"
)

only_series = all(isinstance(o, cudf.Series) for o in objs)

# Return for single object
if len(objs) == 1:
obj = objs[0]

if ignore_index:
if axis == 1:
result = cudf.DataFrame._from_data(
Expand Down Expand Up @@ -290,34 +335,25 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None):
result = cudf.DataFrame._from_data(
data, index=obj.index.copy(deep=True)
)
if keys is not None:
if isinstance(result, cudf.DataFrame):
k = keys[0]
result.columns = cudf.MultiIndex.from_tuples(
[
(k, *c) if isinstance(c, tuple) else (k, c)
for c in result._column_names
]
)

if isinstance(result, cudf.Series) and axis == 0:
# sort has no effect for series concatted along axis 0
return result
else:
return result.sort_index(axis=(1 - axis)) if sort else result

# Retrieve the base types of `objs`. In order to support sub-types
# and object wrappers, we use `isinstance()` instead of comparing
# types directly
typs = set()
for o in objs:
if isinstance(o, cudf.MultiIndex):
typs.add(cudf.MultiIndex)
elif isinstance(o, cudf.BaseIndex):
typs.add(type(o))
elif isinstance(o, cudf.DataFrame):
typs.add(cudf.DataFrame)
elif isinstance(o, cudf.Series):
typs.add(cudf.Series)
else:
raise TypeError(f"cannot concatenate object of type {type(o)}")

allowed_typs = {cudf.Series, cudf.DataFrame}

# when axis is 1 (column) we can concat with Series and Dataframes
if axis == 1:
if not typs.issubset(allowed_typs):
if not all(isinstance(o, (cudf.Series, cudf.DataFrame)) for o in objs):
raise TypeError(
"Can only concatenate Series and DataFrame objects when axis=1"
)
Expand Down Expand Up @@ -353,35 +389,71 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None):
objs = _align_objs(objs, how=join, sort=sort)
df.index = objs[0].index

for o in objs:
for name, col in o._data.items():
if name in df._data:
raise NotImplementedError(
f"A Column with duplicate name found: {name}, cuDF "
f"doesn't support having multiple columns with "
f"same names yet."
)
if empty_inner:
# if join is inner and it contains an empty df
# we return an empty df, hence creating an empty
# column with dtype metadata retained.
df[name] = cudf.core.column.column_empty_like(
col, newsize=0
)
else:
df[name] = col

result_columns = (
objs[0]
._data.to_pandas_index()
.append([obj._data.to_pandas_index() for obj in objs[1:]])
)
if keys is None:
for o in objs:
for name, col in o._data.items():
if name in df._data:
raise NotImplementedError(
f"A Column with duplicate name found: {name}, cuDF "
f"doesn't support having multiple columns with "
f"same names yet."
)
if empty_inner:
# if join is inner and it contains an empty df
# we return an empty df, hence creating an empty
# column with dtype metadata retained.
df[name] = cudf.core.column.column_empty_like(
col, newsize=0
)
else:
df[name] = col

result_columns = (
objs[0]
._data.to_pandas_index()
.append([obj._data.to_pandas_index() for obj in objs[1:]])
.unique()
)

if ignore_index:
# with ignore_index the column names change to numbers
df.columns = pd.RangeIndex(len(result_columns.unique()))
# need to create a MultiIndex column
else:
# All levels in the multiindex label must have the same type
has_multiple_level_types = (
len({type(name) for o in objs for name in o._data.keys()}) > 1
)
if has_multiple_level_types:
raise NotImplementedError(
"Cannot construct a MultiIndex column with multiple "
"label types in cuDF at this time. You must convert "
"the labels to the same type."
)
for k, o in zip(keys, objs):
for name, col in o._data.items():
# if only series, then only keep keys as column labels
# if the existing column is multiindex, prepend it
# to handle cases where dfs and srs are concatenated
if only_series:
col_label = k
elif isinstance(name, tuple):
col_label = (k, *name)
else:
col_label = (k, name)
if empty_inner:
df[col_label] = cudf.core.column.column_empty_like(
col, newsize=0
)
else:
df[col_label] = col

if keys is None:
df.columns = result_columns.unique()
if ignore_index:
df.columns = cudf.RangeIndex(len(result_columns.unique()))
elif ignore_index:
# with ignore_index the column names change to numbers
df.columns = cudf.RangeIndex(len(result_columns))
elif not only_series:
df.columns = cudf.MultiIndex.from_tuples(df._column_names)

if empty_inner:
# if join is inner and it contains an empty df
Expand All @@ -391,18 +463,10 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None):
return df

# If we get here, we are always concatenating along axis 0 (the rows).
typ = list(typs)[0]
if len(typs) > 1:
if allowed_typs == typs:
# This block of code will run when `objs` has
# both Series & DataFrame kind of inputs.
_normalize_series_and_dataframe(objs, axis=axis)
typ = cudf.DataFrame
else:
raise TypeError(
f"`concat` cannot concatenate objects of "
f"types: {sorted([t.__name__ for t in typs])}."
)
typ = type(objs[0])
if len({type(o) for o in objs}) > 1:
_normalize_series_and_dataframe(objs, axis=axis)
typ = cudf.DataFrame

if typ is cudf.DataFrame:
old_objs = objs
Expand Down
Loading

0 comments on commit 2ff60d6

Please sign in to comment.