From 110d4cd3e1ec19c98f11583d7a24f81f89e8414d Mon Sep 17 00:00:00 2001 From: mattijn Date: Sun, 24 Mar 2024 23:44:09 +0100 Subject: [PATCH 01/10] adapt tools files --- tools/generate_schema_wrapper.py | 3 +-- tools/schemapi/schemapi.py | 14 +++++++++++--- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/tools/generate_schema_wrapper.py b/tools/generate_schema_wrapper.py index fdd73df54..b071df34d 100644 --- a/tools/generate_schema_wrapper.py +++ b/tools/generate_schema_wrapper.py @@ -139,7 +139,7 @@ def to_dict( # We still parse it out of the shorthand, but drop it here. parsed.pop("type", None) elif not (type_in_shorthand or type_defined_explicitly): - if isinstance(context.get("data", None), pd.DataFrame): + if isinstance(context.get("data", None), DataFrameLike): raise ValueError( 'Unable to determine data type for the field "{}";' " verify that the field name is not misspelled." @@ -547,7 +547,6 @@ def generate_vegalite_channel_wrappers( imports = [ "import sys", "from . import core", - "import pandas as pd", "from altair.utils.schemapi import Undefined, UndefinedType, with_property_setters", "from altair.utils import parse_shorthand", "from typing import Any, overload, Sequence, List, Literal, Union, Optional", diff --git a/tools/schemapi/schemapi.py b/tools/schemapi/schemapi.py index 52d58d1d3..db008e5ef 100644 --- a/tools/schemapi/schemapi.py +++ b/tools/schemapi/schemapi.py @@ -20,6 +20,7 @@ overload, Literal, TypeVar, + TYPE_CHECKING, ) from itertools import zip_longest from importlib.metadata import version as importlib_version @@ -29,7 +30,6 @@ import jsonschema.exceptions import jsonschema.validators import numpy as np -import pandas as pd from packaging.version import Version # This leads to circular imports with the vegalite module. 
Currently, this works @@ -42,6 +42,13 @@ else: from typing_extensions import Self +if TYPE_CHECKING: + import pandas as pd + +class _PandasTimestamp: + def isoformat(self): + return "dummy_isoformat" # Return a dummy ISO format string + TSchemaBase = TypeVar("TSchemaBase", bound=Type["SchemaBase"]) ValidationErrorList = List[jsonschema.exceptions.ValidationError] @@ -475,7 +482,8 @@ def _todict(obj: Any, context: Optional[Dict[str, Any]]) -> Any: return obj.to_dict() elif isinstance(obj, np.number): return float(obj) - elif isinstance(obj, (pd.Timestamp, np.datetime64)): + elif isinstance(obj, (_PandasTimestamp, np.datetime64)): + import pandas as pd return pd.Timestamp(obj).isoformat() else: return obj @@ -934,7 +942,7 @@ def to_dict( # parsed_shorthand is removed from context if it exists so that it is # not passed to child to_dict function calls parsed_shorthand = context.pop("parsed_shorthand", {}) - # Prevent that pandas categorical data is automatically sorted + # Prevent that categorical data is automatically sorted # when a non-ordinal data type is specifed manually # or if the encoding channel does not support sorting if "sort" in parsed_shorthand and ( From f40d951d14f22b03c511a33247e30c0d3582440f Mon Sep 17 00:00:00 2001 From: mattijn Date: Sun, 24 Mar 2024 23:45:29 +0100 Subject: [PATCH 02/10] changes from rerun generate_schema_wrapper --- altair/utils/schemapi.py | 14 +++++++++++--- altair/vegalite/v5/schema/channels.py | 3 +-- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/altair/utils/schemapi.py b/altair/utils/schemapi.py index b6cfa0ded..4e4961c10 100644 --- a/altair/utils/schemapi.py +++ b/altair/utils/schemapi.py @@ -22,6 +22,7 @@ overload, Literal, TypeVar, + TYPE_CHECKING, ) from itertools import zip_longest from importlib.metadata import version as importlib_version @@ -31,7 +32,6 @@ import jsonschema.exceptions import jsonschema.validators import numpy as np -import pandas as pd from packaging.version import Version # This leads to circular imports with the vegalite module. Currently, this works @@ -44,6 +44,13 @@ else: from typing_extensions import Self +if TYPE_CHECKING: + import pandas as pd + +class _PandasTimestamp: + def isoformat(self): + return "dummy_isoformat" # Return a dummy ISO format string + TSchemaBase = TypeVar("TSchemaBase", bound=Type["SchemaBase"]) ValidationErrorList = List[jsonschema.exceptions.ValidationError] @@ -477,7 +484,8 @@ def _todict(obj: Any, context: Optional[Dict[str, Any]]) -> Any: return obj.to_dict() elif isinstance(obj, np.number): return float(obj) - elif isinstance(obj, (pd.Timestamp, np.datetime64)): + elif isinstance(obj, (_PandasTimestamp, np.datetime64)): + import pandas as pd return pd.Timestamp(obj).isoformat() else: return obj @@ -936,7 +944,7 @@ def to_dict( # parsed_shorthand is removed from context if it exists so that it is # not passed to child to_dict function calls parsed_shorthand = context.pop("parsed_shorthand", {}) - # Prevent that pandas categorical data is automatically sorted + # Prevent that categorical data is automatically sorted # when a non-ordinal data type is specifed manually # or if the encoding channel does not support sorting if "sort" in parsed_shorthand and ( diff --git a/altair/vegalite/v5/schema/channels.py b/altair/vegalite/v5/schema/channels.py index 089a534a6..818b9b10b 100644 --- a/altair/vegalite/v5/schema/channels.py +++ b/altair/vegalite/v5/schema/channels.py @@ -11,7 +11,6 @@ import sys from . 
import core -import pandas as pd from altair.utils.schemapi import Undefined, UndefinedType, with_property_setters from altair.utils import parse_shorthand from typing import Any, overload, Sequence, List, Literal, Union, Optional @@ -60,7 +59,7 @@ def to_dict( # We still parse it out of the shorthand, but drop it here. parsed.pop("type", None) elif not (type_in_shorthand or type_defined_explicitly): - if isinstance(context.get("data", None), pd.DataFrame): + if isinstance(context.get("data", None), DataFrameLike): raise ValueError( 'Unable to determine data type for the field "{}";' " verify that the field name is not misspelled." From ba4b7786f72389769c9c8cea3bb4b866671d8a9f Mon Sep 17 00:00:00 2001 From: mattijn Date: Sun, 24 Mar 2024 23:46:51 +0100 Subject: [PATCH 03/10] add importer for pandas --- altair/utils/_importers.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/altair/utils/_importers.py b/altair/utils/_importers.py index 718fa9129..158f7d577 100644 --- a/altair/utils/_importers.py +++ b/altair/utils/_importers.py @@ -95,3 +95,26 @@ def pyarrow_available() -> bool: return True except ImportError: return False + +def import_pandas() -> ModuleType: + min_version = "0.25" + try: + version = importlib_version("pandas") + if Version(version) < Version(min_version): + raise RuntimeError( + f"The pandas package must be version {min_version} or greater. " + f"Found version {version}" + ) + import pandas as pd + + return pd + except ImportError as err: + raise ImportError( + f"Serialization of the DataFrame requires\n" + f"version {min_version} or greater of the 'pandas' package. \n" + f"This can be installed with pip using:\n" + f' pip install "pandas>={min_version}"\n' + "or conda:\n" + f' conda install -c conda-forge "pandas>={min_version}"\n\n' + f"ImportError: {err.args[0]}" + ) from err \ No newline at end of file From 21910b1451bd3f9e43a6bd5a6320cf1a64bb9637 Mon Sep 17 00:00:00 2001 From: mattijn Date: Sun, 24 Mar 2024 23:57:21 +0100 Subject: [PATCH 04/10] prioritize DataFrameLike, use the pandas importer only when needed. --- altair/utils/core.py | 17 ++++---- altair/utils/data.py | 98 ++++++++++++++++++++++---------------------- 2 files changed, 59 insertions(+), 56 deletions(-) diff --git a/altair/utils/core.py b/altair/utils/core.py index baf1013f7..83cfe76a8 100644 --- a/altair/utils/core.py +++ b/altair/utils/core.py @@ -25,9 +25,7 @@ from types import ModuleType import jsonschema -import pandas as pd import numpy as np -from pandas.api.types import infer_dtype from altair.utils.schemapi import SchemaBase from altair.utils._dfi_types import Column, DtypeKind, DataFrame as DfiDataFrame @@ -40,6 +38,7 @@ from typing import Literal, Protocol, TYPE_CHECKING, runtime_checkable if TYPE_CHECKING: + import pandas as pd from pandas.core.interchange.dataframe_protocol import Column as PandasColumn V = TypeVar("V") @@ -208,7 +207,9 @@ def infer_vegalite_type( ---------- data: object """ - typ = infer_dtype(data, skipna=False) + from altair.utils._importers import import_pandas + pd = import_pandas() + typ = pd.api.types.infer_dtype(data, skipna=False) if typ in [ "floating", @@ -299,7 +300,7 @@ def numpy_is_subtype(dtype: Any, subtype: Any) -> bool: return False -def sanitize_dataframe(df: pd.DataFrame) -> pd.DataFrame: # noqa: C901 +def sanitize_dataframe(df: "pd.DataFrame") -> "pd.DataFrame": # noqa: C901 """Sanitize a DataFrame to prepare it for serialization. 
* Make a copy @@ -316,6 +317,8 @@ def sanitize_dataframe(df: pd.DataFrame) -> pd.DataFrame: # noqa: C901 * convert dedicated string column to objects and replace NaN with None * Raise a ValueError for TimeDelta dtypes """ + from altair.utils._importers import import_pandas + pd = import_pandas() df = df.copy() if isinstance(df.columns, pd.RangeIndex): @@ -448,7 +451,7 @@ def sanitize_arrow_table(pa_table): def parse_shorthand( shorthand: Union[Dict[str, Any], str], - data: Optional[Union[pd.DataFrame, DataFrameLike]] = None, + data: Optional[Union[DataFrameLike, "pd.DataFrame"]] = None, parse_aggregates: bool = True, parse_window_ops: bool = False, parse_timeunits: bool = True, @@ -601,7 +604,7 @@ def parse_shorthand( # Fall back to pandas-based inference. # Note: The AttributeError catch is a workaround for # https://github.com/pandas-dev/pandas/issues/55332 - if isinstance(data, pd.DataFrame): + if isinstance(data, "pd.DataFrame"): attrs["type"] = infer_vegalite_type(data[unescaped_field]) else: raise @@ -609,7 +612,7 @@ def parse_shorthand( if isinstance(attrs["type"], tuple): attrs["sort"] = attrs["type"][1] attrs["type"] = attrs["type"][0] - elif isinstance(data, pd.DataFrame): + elif isinstance(data, "pd.DataFrame"): # Fallback if pyarrow is not installed or if pandas is older than 1.5 # # Remove escape sequences so that types can be inferred for columns with special characters diff --git a/altair/utils/data.py b/altair/utils/data.py index 871b43092..cc21e9d1f 100644 --- a/altair/utils/data.py +++ b/altair/utils/data.py @@ -5,7 +5,6 @@ import warnings from typing import Union, MutableMapping, Optional, Dict, Sequence, TYPE_CHECKING, List -import pandas as pd from toolz import curried from typing import TypeVar @@ -21,13 +20,14 @@ if TYPE_CHECKING: import pyarrow.lib + import pandas as pd class SupportsGeoInterface(Protocol): __geo_interface__: MutableMapping -DataType = Union[dict, pd.DataFrame, SupportsGeoInterface, DataFrameLike] +DataType = Union[dict, DataFrameLike, SupportsGeoInterface, "pd.DataFrame"] TDataType = TypeVar("TDataType", bound=DataType) VegaLiteDataDict = Dict[str, Union[str, dict, List[dict]]] @@ -96,15 +96,6 @@ def raise_max_rows_error(): values = data.__geo_interface__["features"] else: values = data.__geo_interface__ - elif isinstance(data, pd.DataFrame): - values = data - elif isinstance(data, dict): - if "values" in data: - values = data["values"] - else: - # mypy gets confused as it doesn't see Dict[Any, Any] - # as equivalent to TDataType - return data # type: ignore[return-value] elif isinstance(data, DataFrameLike): pa_table = arrow_table_from_dfi_dataframe(data) if max_rows is not None and pa_table.num_rows > max_rows: @@ -112,6 +103,15 @@ def raise_max_rows_error(): # Return pyarrow Table instead of input since the # `arrow_table_from_dfi_dataframe` call above may be expensive return pa_table + elif isinstance(data, dict): + if "values" in data: + values = data["values"] + else: + # mypy gets confused as it doesn't see Dict[Any, Any] + # as equivalent to TDataType + return data # type: ignore[return-value] + elif isinstance(data, "pd.DataFrame"): + values = data if max_rows is not None and len(values) > max_rows: raise_max_rows_error() @@ -122,11 +122,19 @@ def raise_max_rows_error(): @curried.curry def sample( data: DataType, n: Optional[int] = None, frac: Optional[float] = None -) -> Optional[Union[pd.DataFrame, Dict[str, Sequence], "pyarrow.lib.Table"]]: +) -> Optional[Union["pd.DataFrame", Dict[str, Sequence], "pyarrow.lib.Table"]]: """Reduce the 
size of the data model by sampling without replacement.""" check_data_type(data) - if isinstance(data, pd.DataFrame): - return data.sample(n=n, frac=frac) + if isinstance(data, DataFrameLike): + pa_table = arrow_table_from_dfi_dataframe(data) + if not n: + if frac is None: + raise ValueError( + "frac cannot be None if n is None with this data input type" + ) + n = int(frac * len(pa_table)) + indices = random.sample(range(len(pa_table)), n) + return pa_table.take(indices) elif isinstance(data, dict): if "values" in data: values = data["values"] @@ -141,16 +149,8 @@ def sample( else: # Maybe this should raise an error or return something useful? return None - elif isinstance(data, DataFrameLike): - pa_table = arrow_table_from_dfi_dataframe(data) - if not n: - if frac is None: - raise ValueError( - "frac cannot be None if n is None with this data input type" - ) - n = int(frac * len(pa_table)) - indices = random.sample(range(len(pa_table)), n) - return pa_table.take(indices) + elif isinstance(data, "pd.DataFrame"): + return data.sample(n=n, frac=frac) else: # Maybe this should raise an error or return something useful? Currently, # if data is of type SupportsGeoInterface it lands here @@ -196,7 +196,7 @@ def to_json( @curried.curry def to_csv( - data: Union[dict, pd.DataFrame, DataFrameLike], + data: Union[dict, DataFrameLike, "pd.DataFrame"], prefix: str = "altair-data", extension: str = "csv", filename: str = "{prefix}-{hash}.{extension}", @@ -216,29 +216,30 @@ def to_values(data: DataType) -> ToValuesReturnType: """Replace a DataFrame by a data model with values.""" check_data_type(data) if hasattr(data, "__geo_interface__"): - if isinstance(data, pd.DataFrame): + if isinstance(data, "pd.DataFrame"): data = sanitize_dataframe(data) # Maybe the type could be further clarified here that it is # SupportGeoInterface and then the ignore statement is not needed? data_sanitized = sanitize_geo_interface(data.__geo_interface__) # type: ignore[arg-type] return {"values": data_sanitized} - elif isinstance(data, pd.DataFrame): - data = sanitize_dataframe(data) - return {"values": data.to_dict(orient="records")} + elif isinstance(data, DataFrameLike): + pa_table = sanitize_arrow_table(arrow_table_from_dfi_dataframe(data)) + return {"values": pa_table.to_pylist()} elif isinstance(data, dict): if "values" not in data: raise KeyError("values expected in data dict, but not present.") return data - elif isinstance(data, DataFrameLike): - pa_table = sanitize_arrow_table(arrow_table_from_dfi_dataframe(data)) - return {"values": pa_table.to_pylist()} + elif isinstance(data, "pd.DataFrame"): + data = sanitize_dataframe(data) + return {"values": data.to_dict(orient="records")} + else: # Should never reach this state as tested by check_data_type raise ValueError("Unrecognized data type: {}".format(type(data))) def check_data_type(data: DataType) -> None: - if not isinstance(data, (dict, pd.DataFrame, DataFrameLike)) and not any( + if not isinstance(data, (dict, DataFrameLike, "pd.DataFrame")) and not any( hasattr(data, attr) for attr in ["__geo_interface__"] ): raise TypeError( @@ -259,29 +260,29 @@ def _data_to_json_string(data: DataType) -> str: """Return a JSON string representation of the input data""" check_data_type(data) if hasattr(data, "__geo_interface__"): - if isinstance(data, pd.DataFrame): + if isinstance(data, "pd.DataFrame"): data = sanitize_dataframe(data) # Maybe the type could be further clarified here that it is # SupportGeoInterface and then the ignore statement is not needed? 
            data = sanitize_geo_interface(data.__geo_interface__)  # type: ignore[arg-type]
         return json.dumps(data)
-    elif isinstance(data, pd.DataFrame):
-        data = sanitize_dataframe(data)
-        return data.to_json(orient="records", double_precision=15)
+    elif isinstance(data, DataFrameLike):
+        pa_table = arrow_table_from_dfi_dataframe(data)
+        return json.dumps(pa_table.to_pylist())
     elif isinstance(data, dict):
         if "values" not in data:
             raise KeyError("values expected in data dict, but not present.")
         return json.dumps(data["values"], sort_keys=True)
+    elif isinstance(data, "pd.DataFrame"):
+        data = sanitize_dataframe(data)
+        return data.to_json(orient="records", double_precision=15)
     else:
         raise NotImplementedError(
             "to_json only works with data expressed as " "a DataFrame or as a dict"
         )
 
 
-def _data_to_csv_string(data: Union[dict, pd.DataFrame, DataFrameLike]) -> str:
+def _data_to_csv_string(data: Union[dict, DataFrameLike, "pd.DataFrame"]) -> str:
     """return a CSV string representation of the input data"""
     check_data_type(data)
     if hasattr(data, "__geo_interface__"):
@@ -289,15 +290,7 @@ def _data_to_csv_string(data: Union[dict, pd.DataFrame, DataFrameLike]) -> str:
             "to_csv does not work with data that "
             "contains the __geo_interface__ attribute"
         )
-    elif isinstance(data, pd.DataFrame):
-        data = sanitize_dataframe(data)
-        return data.to_csv(index=False)
-    elif isinstance(data, dict):
-        if "values" not in data:
-            raise KeyError("values expected in data dict, but not present")
-        return pd.DataFrame.from_dict(data["values"]).to_csv(index=False)
     elif isinstance(data, DataFrameLike):
-        # experimental interchange dataframe support
         import pyarrow as pa
         import pyarrow.csv as pa_csv
 
@@ -305,6 +298,13 @@ def _data_to_csv_string(data: Union[dict, pd.DataFrame, DataFrameLike]) -> str:
         csv_buffer = pa.BufferOutputStream()
         pa_csv.write_csv(pa_table, csv_buffer)
         return csv_buffer.getvalue().to_pybytes().decode()
+    elif isinstance(data, dict):
+        if "values" not in data:
+            raise KeyError("values expected in data dict, but not present")
+        return pd.DataFrame.from_dict(data["values"]).to_csv(index=False)
+    elif isinstance(data, "pd.DataFrame"):
+        data = sanitize_dataframe(data)
+        return data.to_csv(index=False)
     else:
         raise NotImplementedError(
             "to_csv only works with data expressed as " "a DataFrame or as a dict"

From 88a8870c5f4601b5a7c4f2a4fb08a960daf3d86f Mon Sep 17 00:00:00 2001
From: mattijn
Date: Mon, 25 Mar 2024 00:01:43 +0100
Subject: [PATCH 05/10] prioritize DataFrameLike, check pandas dataframe using
 iloc, columns and index attributes over type

---
 altair/_magics.py         |  7 +++++--
 altair/vegalite/v5/api.py | 10 ++++++----
 2 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/altair/_magics.py b/altair/_magics.py
index bac190aa3..6b014dbbe 100644
--- a/altair/_magics.py
+++ b/altair/_magics.py
@@ -9,10 +9,11 @@
 
 import IPython
 from IPython.core import magic_arguments
-import pandas as pd
+from altair.utils.core import DataFrameLike
 from toolz import curried
 
 from altair.vegalite import v5 as vegalite_v5
+from altair.vegalite.api import _is_pandas_dataframe
 
 try:
     import yaml
@@ -40,10 +41,12 @@ def _prepare_data(data, data_transformers):
     """Convert input data to data for use within schema"""
     if data is None or isinstance(data, dict):
         return data
-    elif isinstance(data, pd.DataFrame):
+    elif isinstance(data, DataFrameLike):
         return curried.pipe(data, data_transformers.get())
     elif 
isinstance(data, str): return {"url": data} + elif _is_pandas_dataframe(data): + return curried.pipe(data, data_transformers.get()) else: warnings.warn("data of type {} not recognized".format(type(data)), stacklevel=1) return data diff --git a/altair/vegalite/v5/api.py b/altair/vegalite/v5/api.py index dfde5ee7e..720753d33 100644 --- a/altair/vegalite/v5/api.py +++ b/altair/vegalite/v5/api.py @@ -4,7 +4,6 @@ import io import json import jsonschema -import pandas as pd from toolz.curried import pipe as _pipe import itertools import sys @@ -89,6 +88,9 @@ def _consolidate_data(data, context): return data +def _is_pandas_dataframe(obj: Any) -> bool: + """Check if the object is an instance of a pandas DataFrame.""" + return all(attr in dir(obj) for attr in ['iloc', 'columns', 'index']) def _prepare_data(data, context=None): """Convert input data to data for use within schema @@ -106,15 +108,15 @@ def _prepare_data(data, context=None): if data is Undefined: return data - # convert dataframes or objects with __geo_interface__ to dict - elif isinstance(data, pd.DataFrame) or hasattr(data, "__geo_interface__"): + # convert dataframes or objects with __geo_interface__ to dict + elif isinstance(data, DataFrameLike) or hasattr(data, "__geo_interface__"): data = _pipe(data, data_transformers.get()) # convert string input to a URLData elif isinstance(data, str): data = core.UrlData(data) - elif isinstance(data, DataFrameLike): + elif _is_pandas_dataframe(data): data = _pipe(data, data_transformers.get()) # consolidate inline data to top-level datasets From c14d94a0b6472918bf0e770b9e500a9dcadba1b5 Mon Sep 17 00:00:00 2001 From: mattijn Date: Mon, 25 Mar 2024 00:06:55 +0100 Subject: [PATCH 06/10] ruff --- altair/utils/_importers.py | 2 +- altair/utils/schemapi.py | 4 ++-- tools/schemapi/schemapi.py | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/altair/utils/_importers.py b/altair/utils/_importers.py index 158f7d577..a87688088 100644 --- a/altair/utils/_importers.py +++ b/altair/utils/_importers.py @@ -117,4 +117,4 @@ def import_pandas() -> ModuleType: "or conda:\n" f' conda install -c conda-forge "pandas>={min_version}"\n\n' f"ImportError: {err.args[0]}" - ) from err \ No newline at end of file + ) from err diff --git a/altair/utils/schemapi.py b/altair/utils/schemapi.py index 4e4961c10..5a56bfc37 100644 --- a/altair/utils/schemapi.py +++ b/altair/utils/schemapi.py @@ -45,12 +45,12 @@ from typing_extensions import Self if TYPE_CHECKING: - import pandas as pd + pass class _PandasTimestamp: def isoformat(self): return "dummy_isoformat" # Return a dummy ISO format string - + TSchemaBase = TypeVar("TSchemaBase", bound=Type["SchemaBase"]) ValidationErrorList = List[jsonschema.exceptions.ValidationError] diff --git a/tools/schemapi/schemapi.py b/tools/schemapi/schemapi.py index db008e5ef..bd6eaa7f9 100644 --- a/tools/schemapi/schemapi.py +++ b/tools/schemapi/schemapi.py @@ -43,12 +43,12 @@ from typing_extensions import Self if TYPE_CHECKING: - import pandas as pd + pass class _PandasTimestamp: def isoformat(self): return "dummy_isoformat" # Return a dummy ISO format string - + TSchemaBase = TypeVar("TSchemaBase", bound=Type["SchemaBase"]) ValidationErrorList = List[jsonschema.exceptions.ValidationError] From d00cd107661e1575096d806e394ea36d9454973f Mon Sep 17 00:00:00 2001 From: mattijn Date: Mon, 25 Mar 2024 00:10:54 +0100 Subject: [PATCH 07/10] ruff format --- altair/utils/_importers.py | 1 + altair/utils/core.py | 2 ++ altair/utils/schemapi.py | 3 +++ altair/vegalite/v5/api.py | 4 
+++- tools/schemapi/schemapi.py | 3 +++ 5 files changed, 12 insertions(+), 1 deletion(-) diff --git a/altair/utils/_importers.py b/altair/utils/_importers.py index a87688088..2a8c6aee6 100644 --- a/altair/utils/_importers.py +++ b/altair/utils/_importers.py @@ -96,6 +96,7 @@ def pyarrow_available() -> bool: except ImportError: return False + def import_pandas() -> ModuleType: min_version = "0.25" try: diff --git a/altair/utils/core.py b/altair/utils/core.py index 83cfe76a8..6d5075289 100644 --- a/altair/utils/core.py +++ b/altair/utils/core.py @@ -208,6 +208,7 @@ def infer_vegalite_type( data: object """ from altair.utils._importers import import_pandas + pd = import_pandas() typ = pd.api.types.infer_dtype(data, skipna=False) @@ -318,6 +319,7 @@ def sanitize_dataframe(df: "pd.DataFrame") -> "pd.DataFrame": # noqa: C901 * Raise a ValueError for TimeDelta dtypes """ from altair.utils._importers import import_pandas + pd = import_pandas() df = df.copy() diff --git a/altair/utils/schemapi.py b/altair/utils/schemapi.py index 5a56bfc37..c34d0d4ff 100644 --- a/altair/utils/schemapi.py +++ b/altair/utils/schemapi.py @@ -47,10 +47,12 @@ if TYPE_CHECKING: pass + class _PandasTimestamp: def isoformat(self): return "dummy_isoformat" # Return a dummy ISO format string + TSchemaBase = TypeVar("TSchemaBase", bound=Type["SchemaBase"]) ValidationErrorList = List[jsonschema.exceptions.ValidationError] @@ -486,6 +488,7 @@ def _todict(obj: Any, context: Optional[Dict[str, Any]]) -> Any: return float(obj) elif isinstance(obj, (_PandasTimestamp, np.datetime64)): import pandas as pd + return pd.Timestamp(obj).isoformat() else: return obj diff --git a/altair/vegalite/v5/api.py b/altair/vegalite/v5/api.py index 720753d33..9810f7a5a 100644 --- a/altair/vegalite/v5/api.py +++ b/altair/vegalite/v5/api.py @@ -88,9 +88,11 @@ def _consolidate_data(data, context): return data + def _is_pandas_dataframe(obj: Any) -> bool: """Check if the object is an instance of a pandas DataFrame.""" - return all(attr in dir(obj) for attr in ['iloc', 'columns', 'index']) + return all(attr in dir(obj) for attr in ["iloc", "columns", "index"]) + def _prepare_data(data, context=None): """Convert input data to data for use within schema diff --git a/tools/schemapi/schemapi.py b/tools/schemapi/schemapi.py index bd6eaa7f9..ef494f3ff 100644 --- a/tools/schemapi/schemapi.py +++ b/tools/schemapi/schemapi.py @@ -45,10 +45,12 @@ if TYPE_CHECKING: pass + class _PandasTimestamp: def isoformat(self): return "dummy_isoformat" # Return a dummy ISO format string + TSchemaBase = TypeVar("TSchemaBase", bound=Type["SchemaBase"]) ValidationErrorList = List[jsonschema.exceptions.ValidationError] @@ -484,6 +486,7 @@ def _todict(obj: Any, context: Optional[Dict[str, Any]]) -> Any: return float(obj) elif isinstance(obj, (_PandasTimestamp, np.datetime64)): import pandas as pd + return pd.Timestamp(obj).isoformat() else: return obj From bd70bf4cb0322c7f631fd9788a4fa43d5e376032 Mon Sep 17 00:00:00 2001 From: mattijn Date: Mon, 25 Mar 2024 23:20:40 +0100 Subject: [PATCH 08/10] relocate function --- altair/_magics.py | 3 +-- altair/utils/core.py | 9 +++++++-- altair/utils/data.py | 21 +++++++++++++-------- altair/vegalite/v5/api.py | 7 +------ altair/vegalite/v5/schema/channels.py | 1 + tools/generate_schema_wrapper.py | 1 + 6 files changed, 24 insertions(+), 18 deletions(-) diff --git a/altair/_magics.py b/altair/_magics.py index 6b014dbbe..99bc8bc67 100644 --- a/altair/_magics.py +++ b/altair/_magics.py @@ -9,11 +9,10 @@ import IPython from IPython.core import 
magic_arguments -from altair.utils.core import DataFrameLike +from altair.utils.core import DataFrameLike, _is_pandas_dataframe from toolz import curried from altair.vegalite import v5 as vegalite_v5 -from altair.vegalite.api import _is_pandas_dataframe try: import yaml diff --git a/altair/utils/core.py b/altair/utils/core.py index 6d5075289..b8ecaba07 100644 --- a/altair/utils/core.py +++ b/altair/utils/core.py @@ -52,6 +52,11 @@ def __dataframe__( ) -> DfiDataFrame: ... +def _is_pandas_dataframe(obj: Any) -> bool: + """Check if the object is an instance of a pandas DataFrame.""" + return all(attr in dir(obj) for attr in ["iloc", "columns", "index"]) + + TYPECODE_MAP = { "ordinal": "O", "nominal": "N", @@ -606,7 +611,7 @@ def parse_shorthand( # Fall back to pandas-based inference. # Note: The AttributeError catch is a workaround for # https://github.com/pandas-dev/pandas/issues/55332 - if isinstance(data, "pd.DataFrame"): + if _is_pandas_dataframe(data): attrs["type"] = infer_vegalite_type(data[unescaped_field]) else: raise @@ -614,7 +619,7 @@ def parse_shorthand( if isinstance(attrs["type"], tuple): attrs["sort"] = attrs["type"][1] attrs["type"] = attrs["type"][0] - elif isinstance(data, "pd.DataFrame"): + elif _is_pandas_dataframe(data): # Fallback if pyarrow is not installed or if pandas is older than 1.5 # # Remove escape sequences so that types can be inferred for columns with special characters diff --git a/altair/utils/data.py b/altair/utils/data.py index cc21e9d1f..ec74f42d3 100644 --- a/altair/utils/data.py +++ b/altair/utils/data.py @@ -9,7 +9,12 @@ from typing import TypeVar from ._importers import import_pyarrow_interchange -from .core import sanitize_dataframe, sanitize_arrow_table, DataFrameLike +from .core import ( + sanitize_dataframe, + sanitize_arrow_table, + DataFrameLike, + _is_pandas_dataframe, +) from .core import sanitize_geo_interface from .deprecation import AltairDeprecationWarning from .plugin_registry import PluginRegistry @@ -110,7 +115,7 @@ def raise_max_rows_error(): # mypy gets confused as it doesn't see Dict[Any, Any] # as equivalent to TDataType return data # type: ignore[return-value] - elif isinstance(data, "pd.DataFrame"): + elif _is_pandas_dataframe(data): values = data if max_rows is not None and len(values) > max_rows: @@ -149,7 +154,7 @@ def sample( else: # Maybe this should raise an error or return something useful? return None - elif isinstance(data, "pd.DataFrame"): + elif _is_pandas_dataframe(data): return data.sample(n=n, frac=frac) else: # Maybe this should raise an error or return something useful? Currently, @@ -216,7 +221,7 @@ def to_values(data: DataType) -> ToValuesReturnType: """Replace a DataFrame by a data model with values.""" check_data_type(data) if hasattr(data, "__geo_interface__"): - if isinstance(data, "pd.DataFrame"): + if _is_pandas_dataframe(data): data = sanitize_dataframe(data) # Maybe the type could be further clarified here that it is # SupportGeoInterface and then the ignore statement is not needed? 
@@ -229,7 +234,7 @@ def to_values(data: DataType) -> ToValuesReturnType: if "values" not in data: raise KeyError("values expected in data dict, but not present.") return data - elif isinstance(data, "pd.DataFrame"): + elif _is_pandas_dataframe(data): data = sanitize_dataframe(data) return {"values": data.to_dict(orient="records")} @@ -260,7 +265,7 @@ def _data_to_json_string(data: DataType) -> str: """Return a JSON string representation of the input data""" check_data_type(data) if hasattr(data, "__geo_interface__"): - if isinstance(data, "pd.DataFrame"): + if _is_pandas_dataframe(data): data = sanitize_dataframe(data) # Maybe the type could be further clarified here that it is # SupportGeoInterface and then the ignore statement is not needed? @@ -273,7 +278,7 @@ def _data_to_json_string(data: DataType) -> str: if "values" not in data: raise KeyError("values expected in data dict, but not present.") return json.dumps(data["values"], sort_keys=True) - elif isinstance(data, "pd.DataFrame"): + elif _is_pandas_dataframe(data): data = sanitize_dataframe(data) return data.to_json(orient="records", double_precision=15) else: @@ -302,7 +307,7 @@ def _data_to_csv_string(data: Union[dict, DataFrameLike, "pd.DataFrame"]) -> str if "values" not in data: raise KeyError("values expected in data dict, but not present") return pd.DataFrame.from_dict(data["values"]).to_csv(index=False) - elif isinstance(data, "pd.DataFrame"): + elif _is_pandas_dataframe(data): data = sanitize_dataframe(data) return data.to_csv(index=False) else: diff --git a/altair/vegalite/v5/api.py b/altair/vegalite/v5/api.py index 9810f7a5a..789b0fc4f 100644 --- a/altair/vegalite/v5/api.py +++ b/altair/vegalite/v5/api.py @@ -25,7 +25,7 @@ using_vegafusion as _using_vegafusion, compile_with_vegafusion as _compile_with_vegafusion, ) -from ...utils.core import DataFrameLike +from ...utils.core import DataFrameLike, _is_pandas_dataframe from ...utils.data import DataType if sys.version_info >= (3, 11): @@ -89,11 +89,6 @@ def _consolidate_data(data, context): return data -def _is_pandas_dataframe(obj: Any) -> bool: - """Check if the object is an instance of a pandas DataFrame.""" - return all(attr in dir(obj) for attr in ["iloc", "columns", "index"]) - - def _prepare_data(data, context=None): """Convert input data to data for use within schema diff --git a/altair/vegalite/v5/schema/channels.py b/altair/vegalite/v5/schema/channels.py index 818b9b10b..62adc8301 100644 --- a/altair/vegalite/v5/schema/channels.py +++ b/altair/vegalite/v5/schema/channels.py @@ -11,6 +11,7 @@ import sys from . import core +from altair.utils.core import DataFrameLike from altair.utils.schemapi import Undefined, UndefinedType, with_property_setters from altair.utils import parse_shorthand from typing import Any, overload, Sequence, List, Literal, Union, Optional diff --git a/tools/generate_schema_wrapper.py b/tools/generate_schema_wrapper.py index b071df34d..9481693b8 100644 --- a/tools/generate_schema_wrapper.py +++ b/tools/generate_schema_wrapper.py @@ -547,6 +547,7 @@ def generate_vegalite_channel_wrappers( imports = [ "import sys", "from . 
import core",
+        "from altair.utils.core import DataFrameLike",
         "from altair.utils.schemapi import Undefined, UndefinedType, with_property_setters",
         "from altair.utils import parse_shorthand",
         "from typing import Any, overload, Sequence, List, Literal, Union, Optional",

From 8b41305d160ca5be37780840a2dcf5677593ab05 Mon Sep 17 00:00:00 2001
From: mattijn
Date: Tue, 26 Mar 2024 00:05:03 +0100
Subject: [PATCH 09/10] prioritize pd.DataFrame, currently no dependency on
 pyarrow

---
 altair/utils/data.py     | 85 ++++++++++++++++++++--------------------
 tests/utils/test_core.py |  2 +-
 2 files changed, 44 insertions(+), 43 deletions(-)

diff --git a/altair/utils/data.py b/altair/utils/data.py
index ec74f42d3..a2086c16e 100644
--- a/altair/utils/data.py
+++ b/altair/utils/data.py
@@ -101,13 +101,8 @@ def raise_max_rows_error():
             values = data.__geo_interface__["features"]
         else:
             values = data.__geo_interface__
-    elif isinstance(data, DataFrameLike):
-        pa_table = arrow_table_from_dfi_dataframe(data)
-        if max_rows is not None and pa_table.num_rows > max_rows:
-            raise_max_rows_error()
-        # Return pyarrow Table instead of input since the
-        # `arrow_table_from_dfi_dataframe` call above may be expensive
-        return pa_table
+    elif _is_pandas_dataframe(data):
+        values = data
     elif isinstance(data, dict):
         if "values" in data:
             values = data["values"]
@@ -115,8 +110,13 @@ def raise_max_rows_error():
             # mypy gets confused as it doesn't see Dict[Any, Any]
             # as equivalent to TDataType
             return data  # type: ignore[return-value]
-    elif _is_pandas_dataframe(data):
-        values = data
+    elif isinstance(data, DataFrameLike):
+        pa_table = arrow_table_from_dfi_dataframe(data)
+        if max_rows is not None and pa_table.num_rows > max_rows:
+            raise_max_rows_error()
+        # Return pyarrow Table instead of input since the
+        # `arrow_table_from_dfi_dataframe` call above may be expensive
+        return pa_table
 
     if max_rows is not None and len(values) > max_rows:
         raise_max_rows_error()
@@ -130,16 +130,8 @@ def sample(
 ) -> Optional[Union["pd.DataFrame", Dict[str, Sequence], "pyarrow.lib.Table"]]:
     """Reduce the size of the data model by sampling without replacement."""
     check_data_type(data)
-    if isinstance(data, DataFrameLike):
-        pa_table = arrow_table_from_dfi_dataframe(data)
-        if not n:
-            if frac is None:
-                raise ValueError(
-                    "frac cannot be None if n is None with this data input type"
-                )
-            n = int(frac * len(pa_table))
-        indices = random.sample(range(len(pa_table)), n)
-        return pa_table.take(indices)
+    if _is_pandas_dataframe(data):
+        return data.sample(n=n, frac=frac)
     elif isinstance(data, dict):
         if "values" in data:
             values = data["values"]
@@ -154,7 +146,16 @@ def sample(
         else:
             # Maybe this should raise an error or return something useful?
             return None
-    elif _is_pandas_dataframe(data):
-        return data.sample(n=n, frac=frac)
+    elif isinstance(data, DataFrameLike):
+        pa_table = arrow_table_from_dfi_dataframe(data)
+        if not n:
+            if frac is None:
+                raise ValueError(
+                    "frac cannot be None if n is None with this data input type"
+                )
+            n = int(frac * len(pa_table))
+        indices = random.sample(range(len(pa_table)), n)
+        return pa_table.take(indices)
     else:
         # Maybe this should raise an error or return something useful? Currently,
         # if data is of type SupportsGeoInterface it lands here
@@ -227,24 +227,23 @@ def to_values(data: DataType) -> ToValuesReturnType:
         # SupportGeoInterface and then the ignore statement is not needed?
data_sanitized = sanitize_geo_interface(data.__geo_interface__) # type: ignore[arg-type] return {"values": data_sanitized} - elif isinstance(data, DataFrameLike): - pa_table = sanitize_arrow_table(arrow_table_from_dfi_dataframe(data)) - return {"values": pa_table.to_pylist()} + elif _is_pandas_dataframe(data): + data = sanitize_dataframe(data) + return {"values": data.to_dict(orient="records")} elif isinstance(data, dict): if "values" not in data: raise KeyError("values expected in data dict, but not present.") return data - elif _is_pandas_dataframe(data): - data = sanitize_dataframe(data) - return {"values": data.to_dict(orient="records")} - + elif isinstance(data, DataFrameLike): + pa_table = sanitize_arrow_table(arrow_table_from_dfi_dataframe(data)) + return {"values": pa_table.to_pylist()} else: # Should never reach this state as tested by check_data_type raise ValueError("Unrecognized data type: {}".format(type(data))) def check_data_type(data: DataType) -> None: - if not isinstance(data, (dict, DataFrameLike, "pd.DataFrame")) and not any( + if not isinstance(data, (dict, DataFrameLike)) and not _is_pandas_dataframe(data) and not any( hasattr(data, attr) for attr in ["__geo_interface__"] ): raise TypeError( @@ -271,16 +270,16 @@ def _data_to_json_string(data: DataType) -> str: # SupportGeoInterface and then the ignore statement is not needed? data = sanitize_geo_interface(data.__geo_interface__) # type: ignore[arg-type] return json.dumps(data) - elif isinstance(data, DataFrameLike): - pa_table = arrow_table_from_dfi_dataframe(data) - return json.dumps(pa_table.to_pylist()) + elif _is_pandas_dataframe(data): + data = sanitize_dataframe(data) + return data.to_json(orient="records", double_precision=15) elif isinstance(data, dict): if "values" not in data: raise KeyError("values expected in data dict, but not present.") return json.dumps(data["values"], sort_keys=True) - elif _is_pandas_dataframe(data): - data = sanitize_dataframe(data) - return data.to_json(orient="records", double_precision=15) + elif isinstance(data, DataFrameLike): + pa_table = arrow_table_from_dfi_dataframe(data) + return json.dumps(pa_table.to_pylist()) else: raise NotImplementedError( "to_json only works with data expressed as " "a DataFrame or as a dict" @@ -295,6 +294,15 @@ def _data_to_csv_string(data: Union[dict, DataFrameLike, "pd.DataFrame"]) -> str "to_csv does not work with data that " "contains the __geo_interface__ attribute" ) + elif _is_pandas_dataframe(data): + data = sanitize_dataframe(data) + return data.to_csv(index=False) + elif isinstance(data, dict): + from altair.utils._importers import import_pandas + if "values" not in data: + raise KeyError("values expected in data dict, but not present") + pd = import_pandas() + return pd.DataFrame.from_dict(data["values"]).to_csv(index=False) elif isinstance(data, DataFrameLike): import pyarrow as pa import pyarrow.csv as pa_csv @@ -303,13 +311,6 @@ def _data_to_csv_string(data: Union[dict, DataFrameLike, "pd.DataFrame"]) -> str csv_buffer = pa.BufferOutputStream() pa_csv.write_csv(pa_table, csv_buffer) return csv_buffer.getvalue().to_pybytes().decode() - elif isinstance(data, dict): - if "values" not in data: - raise KeyError("values expected in data dict, but not present") - return pd.DataFrame.from_dict(data["values"]).to_csv(index=False) - elif _is_pandas_dataframe(data): - data = sanitize_dataframe(data) - return data.to_csv(index=False) else: raise NotImplementedError( "to_csv only works with data expressed as " "a DataFrame or as a dict" diff --git 
a/tests/utils/test_core.py b/tests/utils/test_core.py index 27cd3b7ee..185588456 100644 --- a/tests/utils/test_core.py +++ b/tests/utils/test_core.py @@ -4,11 +4,11 @@ import numpy as np import pandas as pd +from pandas.api.types import infer_dtype import pytest import altair as alt from altair.utils.core import parse_shorthand, update_nested, infer_encoding_types -from altair.utils.core import infer_dtype json_schema_specification = alt.load_schema()["$schema"] json_schema_dict_str = f'{{"$schema": "{json_schema_specification}"}}' From 62ab14dcedb52f0a32c8de613c4e6883bdba8796 Mon Sep 17 00:00:00 2001 From: mattijn Date: Tue, 26 Mar 2024 00:06:46 +0100 Subject: [PATCH 10/10] ruff --- altair/utils/data.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/altair/utils/data.py b/altair/utils/data.py index a2086c16e..1b68e8518 100644 --- a/altair/utils/data.py +++ b/altair/utils/data.py @@ -243,8 +243,10 @@ def to_values(data: DataType) -> ToValuesReturnType: def check_data_type(data: DataType) -> None: - if not isinstance(data, (dict, DataFrameLike)) and not _is_pandas_dataframe(data) and not any( - hasattr(data, attr) for attr in ["__geo_interface__"] + if ( + not isinstance(data, (dict, DataFrameLike)) + and not _is_pandas_dataframe(data) + and not any(hasattr(data, attr) for attr in ["__geo_interface__"]) ): raise TypeError( "Expected dict, DataFrame or a __geo_interface__ attribute, got: {}".format( @@ -299,6 +301,7 @@ def _data_to_csv_string(data: Union[dict, DataFrameLike, "pd.DataFrame"]) -> str return data.to_csv(index=False) elif isinstance(data, dict): from altair.utils._importers import import_pandas + if "values" not in data: raise KeyError("values expected in data dict, but not present") pd = import_pandas()
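
Taken together, the series converges on three techniques for making pandas an optional dependency: a duck-typed check that recognizes a pandas DataFrame without importing pandas (PATCH 05 and 08), a lazy importer that enforces a version floor only when pandas is actually needed (PATCH 03), and TYPE_CHECKING-only imports so annotations keep working (PATCH 01 and 02). The sketch below condenses that pattern into one standalone module. _is_pandas_dataframe and import_pandas follow the patches above; the to_records consumer and its error messages are illustrative assumptions, not code from this PR.

from importlib.metadata import version as importlib_version
from types import ModuleType
from typing import TYPE_CHECKING, Any, Dict, List

from packaging.version import Version

if TYPE_CHECKING:
    import pandas as pd  # noqa: F401 - annotation-only, never executed at runtime


def _is_pandas_dataframe(obj: Any) -> bool:
    # Duck typing, as in PATCH 05/08: anything exposing iloc, columns and
    # index is treated as a pandas DataFrame, so pandas never has to be
    # imported just to detect one.
    return all(attr in dir(obj) for attr in ["iloc", "columns", "index"])


def import_pandas(min_version: str = "0.25") -> ModuleType:
    # Lazy import with a version floor, mirroring PATCH 03. Note that
    # importlib_version raises PackageNotFoundError, an ImportError
    # subclass, when pandas is not installed at all.
    try:
        version = importlib_version("pandas")
        if Version(version) < Version(min_version):
            raise RuntimeError(
                f"The pandas package must be version {min_version} or greater. "
                f"Found version {version}"
            )
        import pandas as pd

        return pd
    except ImportError as err:
        raise ImportError(
            "This operation requires pandas. Install it with:\n"
            f'    pip install "pandas>={min_version}"'
        ) from err


def to_records(data: Any) -> List[Dict[str, Any]]:
    # Hypothetical consumer: the pandas import is deferred to the one
    # branch that genuinely needs it.
    if _is_pandas_dataframe(data):
        return data.to_dict(orient="records")
    elif isinstance(data, dict) and "values" in data:
        pd = import_pandas()  # only normalizing dict input requires pandas
        return pd.DataFrame.from_dict(data["values"]).to_dict(orient="records")
    raise TypeError(f"Unrecognized data type: {type(data)}")

With this arrangement, importing the module costs nothing when pandas is absent: the ImportError with install instructions surfaces only on the first code path that genuinely needs pandas, while DataFrames passed in by the caller are still recognized and handled without any eager import.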