Skip to content

Commit

Permalink
feat: Adding dataframe estimated size (#1549)
Browse files Browse the repository at this point in the history
  • Loading branch information
DeaMariaLeon authored Dec 10, 2024
1 parent 77d11e8 commit 36afa70
Show file tree
Hide file tree
Showing 7 changed files with 126 additions and 0 deletions.
1 change: 1 addition & 0 deletions docs/api-reference/dataframe.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
- columns
- drop
- drop_nulls
- estimated_size
- filter
- gather_every
- get_column
Expand Down
6 changes: 6 additions & 0 deletions narwhals/_arrow/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
from narwhals.utils import generate_temporary_column_name
from narwhals.utils import is_sequence_but_not_str
from narwhals.utils import parse_columns_to_drop
from narwhals.utils import scale_bytes

if TYPE_CHECKING:
from types import ModuleType
Expand All @@ -35,6 +36,7 @@
from narwhals._arrow.series import ArrowSeries
from narwhals._arrow.typing import IntoArrowExpr
from narwhals.dtypes import DType
from narwhals.typing import SizeUnit
from narwhals.utils import Version


Expand Down Expand Up @@ -285,6 +287,10 @@ def schema(self: Self) -> dict[str, DType]:
def collect_schema(self: Self) -> dict[str, DType]:
    """Return `self.schema` unchanged."""
    return self.schema

def estimated_size(self: Self, unit: SizeUnit) -> int | float:
    """Report the native frame's `nbytes`, converted to `unit` by `scale_bytes`.

    Arguments:
        unit: size unit to report the byte count in.

    Returns:
        Whatever `scale_bytes` returns for the native byte count.
    """
    return scale_bytes(self._native_frame.nbytes, unit)

@property
def columns(self: Self) -> list[str]:
    """Return the `names` of the native frame's schema."""
    return self._native_frame.schema.names  # type: ignore[no-any-return]
Expand Down
6 changes: 6 additions & 0 deletions narwhals/_pandas_like/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
from narwhals.utils import import_dtypes_module
from narwhals.utils import is_sequence_but_not_str
from narwhals.utils import parse_columns_to_drop
from narwhals.utils import scale_bytes

if TYPE_CHECKING:
from types import ModuleType
Expand All @@ -38,6 +39,7 @@
from narwhals._pandas_like.series import PandasLikeSeries
from narwhals._pandas_like.typing import IntoPandasLikeExpr
from narwhals.dtypes import DType
from narwhals.typing import SizeUnit
from narwhals.utils import Version


Expand Down Expand Up @@ -371,6 +373,10 @@ def drop_nulls(self, subset: str | list[str] | None) -> Self:
plx = self.__narwhals_namespace__()
return self.filter(~plx.any_horizontal(plx.col(*subset).is_null()))

def estimated_size(self, unit: SizeUnit) -> int | float:
    """Sum the native frame's deep memory usage and convert the total to `unit`.

    Arguments:
        unit: size unit to report the byte count in.

    Returns:
        Whatever `scale_bytes` returns for the summed byte count.
    """
    usage = self._native_frame.memory_usage(deep=True)
    return scale_bytes(usage.sum(), unit=unit)

def with_row_index(self, name: str) -> Self:
row_index = create_compliant_series(
range(len(self._native_frame)),
Expand Down
45 changes: 45 additions & 0 deletions narwhals/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
from narwhals.typing import IntoDataFrame
from narwhals.typing import IntoExpr
from narwhals.typing import IntoFrame
from narwhals.typing import SizeUnit
from narwhals.utils import Implementation

FrameT = TypeVar("FrameT", bound="IntoFrame")
Expand Down Expand Up @@ -764,6 +765,50 @@ def get_column(self, name: str) -> Series[Any]:
level=self._level,
)

def estimated_size(self, unit: SizeUnit = "b") -> int | float:
    """Return an estimation of the total (heap) allocated size of the `DataFrame`.

    Estimated size is given in the specified unit (bytes by default).

    Arguments:
        unit: 'b', 'kb', 'mb', 'gb', 'tb', 'bytes', 'kilobytes', 'megabytes',
            'gigabytes', or 'terabytes'.

    Returns:
        Integer or Float.

    Examples:
        >>> import pandas as pd
        >>> import polars as pl
        >>> import pyarrow as pa
        >>> import narwhals as nw
        >>> from narwhals.typing import IntoDataFrameT
        >>> data = {
        ...     "foo": [1, 2, 3],
        ...     "bar": [6.0, 7.0, 8.0],
        ...     "ham": ["a", "b", "c"],
        ... }
        >>> df_pd = pd.DataFrame(data)
        >>> df_pl = pl.DataFrame(data)
        >>> df_pa = pa.table(data)

        Let's define a dataframe-agnostic function:

        >>> def agnostic_estimated_size(df_native: IntoDataFrameT) -> int | float:
        ...     df = nw.from_native(df_native)
        ...     return df.estimated_size()

        We can then pass either pandas, Polars or PyArrow to `agnostic_estimated_size`:

        >>> agnostic_estimated_size(df_pd)
        np.int64(330)
        >>> agnostic_estimated_size(df_pl)
        51
        >>> agnostic_estimated_size(df_pa)
        63
    """
    # The computation is delegated to the backend-specific compliant frame.
    return self._compliant_frame.estimated_size(unit=unit)  # type: ignore[no-any-return]

@overload
def __getitem__(self, item: tuple[Sequence[int], slice]) -> Self: ...
@overload
Expand Down
14 changes: 14 additions & 0 deletions narwhals/typing.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

from typing import TYPE_CHECKING
from typing import Any
from typing import Literal
from typing import Protocol
from typing import TypeVar
from typing import Union
Expand Down Expand Up @@ -173,6 +174,19 @@ def __dataframe__(self, *args: Any, **kwargs: Any) -> Any: ...
... return s.abs().to_native()
"""

# Spellings accepted for a size `unit` argument: bytes through terabytes,
# each in a short form (e.g. "kb") and a long form (e.g. "kilobytes").
SizeUnit: TypeAlias = Literal[
    "b",
    "kb",
    "mb",
    "gb",
    "tb",
    "bytes",
    "kilobytes",
    "megabytes",
    "gigabytes",
    "terabytes",
]


class DTypes:
Int64: type[dtypes.Int64]
Expand Down
26 changes: 26 additions & 0 deletions narwhals/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
from narwhals.series import Series
from narwhals.typing import DTypes
from narwhals.typing import IntoSeriesT
from narwhals.typing import SizeUnit

FrameOrSeriesT = TypeVar(
"FrameOrSeriesT", bound=Union[LazyFrame[Any], DataFrame[Any], Series[Any]]
Expand Down Expand Up @@ -681,6 +682,31 @@ def maybe_convert_dtypes(
return obj_any # type: ignore[no-any-return]


def scale_bytes(sz: int, unit: SizeUnit) -> int | float:
    """Scale size in bytes to other size units (eg: "kb", "mb", "gb", "tb").

    Units are binary multiples: each step up divides by a further factor of 1024.

    Arguments:
        sz: original size in bytes
        unit: size unit to convert into; both the short ("kb") and the long
            ("kilobytes") spellings are accepted.

    Returns:
        `sz` itself when `unit` is bytes, otherwise `sz` divided by the
        matching power of 1024 (true division).

    Raises:
        ValueError: if `unit` is not one of the accepted spellings.
    """
    # Power of 1024 associated with every accepted spelling.
    powers = {
        "b": 0,
        "bytes": 0,
        "kb": 1,
        "kilobytes": 1,
        "mb": 2,
        "megabytes": 2,
        "gb": 3,
        "gigabytes": 3,
        "tb": 4,
        "terabytes": 4,
    }
    power = powers.get(unit)
    if power is None:
        msg = f"`unit` must be one of {{'b', 'kb', 'mb', 'gb', 'tb'}}, got {unit!r}"
        raise ValueError(msg)
    if power == 0:
        # Hand the input back untouched so a byte count keeps its original type.
        return sz
    return sz / 1024**power


def is_ordered_categorical(series: Series[Any]) -> bool:
"""Return whether indices of categories are semantically meaningful.
Expand Down
28 changes: 28 additions & 0 deletions tests/frame/estimated_size_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
from __future__ import annotations

from typing import TYPE_CHECKING

import pytest

import narwhals.stable.v1 as nw

if TYPE_CHECKING:
from tests.utils import ConstructorEager

data = {"a": list(range(100))}


def test_estimated_size(constructor_eager: ConstructorEager) -> None:
    """Check unit scaling and the invalid-unit error of `estimated_size`."""
    frame = nw.from_native(constructor_eager(data), eager_only=True)

    assert frame.estimated_size("b") > 0

    # Each unit must be exactly 1024 times smaller than the one below it.
    unit_steps = (("kb", "b"), ("mb", "kb"), ("gb", "mb"), ("tb", "gb"))
    for larger, smaller in unit_steps:
        assert frame.estimated_size(larger) == (frame.estimated_size(smaller) / 1024)

    expected_message = (
        "`unit` must be one of {'b', 'kb', 'mb', 'gb', 'tb'}, got 'pizza'"
    )
    with pytest.raises(ValueError, match=expected_message):
        frame.estimated_size("pizza")  # type: ignore[arg-type]

0 comments on commit 36afa70

Please sign in to comment.