From c835c131282cc189b9bc4cc91bef2492c0b2dd36 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 20 Nov 2024 13:25:27 +0000 Subject: [PATCH] feat(DRAFT): Adds `altair.datasets.url` A dataframe package is still required currently. Can later be adapted to fit the requirements of (https://github.com/vega/altair/pull/3631#discussion_r1846662053). Related: - https://github.com/vega/altair/pull/3631#issuecomment-2484826592 - https://github.com/vega/altair/pull/3631#issuecomment-2480832711 - https://github.com/vega/altair/discussions/3150#discussioncomment-11280516 @mattijn, @joelostblom --- altair/datasets/__init__.py | 415 ++++-------------------------------- altair/datasets/_loader.py | 394 ++++++++++++++++++++++++++++++++++ tests/test_datasets.py | 59 ++++- 3 files changed, 491 insertions(+), 377 deletions(-) create mode 100644 altair/datasets/_loader.py diff --git a/altair/datasets/__init__.py b/altair/datasets/__init__.py index 26fd39b20..ac7ac9f06 100644 --- a/altair/datasets/__init__.py +++ b/altair/datasets/__init__.py @@ -1,380 +1,23 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Generic, final, overload +from typing import TYPE_CHECKING -from narwhals.typing import IntoDataFrameT, IntoFrameT - -from altair.datasets._readers import _Reader, backend +from altair.datasets._loader import Loader if TYPE_CHECKING: import sys - from pathlib import Path - from typing import Any, Literal - - import pandas as pd - import polars as pl - import pyarrow as pa - from _typeshed import StrPath + from typing import Any if sys.version_info >= (3, 11): from typing import LiteralString else: from typing_extensions import LiteralString - from altair.datasets._readers import _Backend - from altair.datasets._typing import Dataset, Extension, Version - -__all__ = ["Loader", "load"] - - -class Loader(Generic[IntoDataFrameT, IntoFrameT]): - """ - Load examples **remotely** from `vega-datasets`_, with 
*optional* caching. - - A new ``Loader`` must be initialized by specifying a backend: - - from altair.datasets import Loader - - data = Loader.from_backend("polars") - >>> data # doctest: +SKIP - Loader[polars] - - .. _vega-datasets: - https://github.com/vega/vega-datasets - """ - - _reader: _Reader[IntoDataFrameT, IntoFrameT] - - @overload - @classmethod - def from_backend( - cls, backend_name: Literal["polars", "polars[pyarrow]"], / - ) -> Loader[pl.DataFrame, pl.LazyFrame]: ... - - @overload - @classmethod - def from_backend( - cls, backend_name: Literal["pandas", "pandas[pyarrow]"], / - ) -> Loader[pd.DataFrame, pd.DataFrame]: ... - - @overload - @classmethod - def from_backend( - cls, backend_name: Literal["pyarrow"], / - ) -> Loader[pa.Table, pa.Table]: ... - - @classmethod - def from_backend(cls, backend_name: _Backend, /) -> Loader[Any, Any]: - """ - Initialize a new loader, with the specified backend. - - Parameters - ---------- - backend_name - DataFrame package/config used to return data. - - * *polars*: Using `polars defaults`_ - * *polars[pyarrow]*: Using ``use_pyarrow=True`` - * *pandas*: Using `pandas defaults`_. - * *pandas[pyarrow]*: Using ``dtype_backend="pyarrow"`` - * *pyarrow*: (*Experimental*) - - .. warning:: - Most datasets use a `JSON format not supported`_ by ``pyarrow`` - - .. _polars defaults: - https://docs.pola.rs/api/python/stable/reference/io.html - .. _pandas defaults: - https://pandas.pydata.org/docs/reference/io.html - .. 
_JSON format not supported: - https://arrow.apache.org/docs/python/json.html#reading-json-files - - Examples - -------- - Using ``polars``: - - from altair.datasets import Loader - - data = Loader.from_backend("polars") - cars = data("cars") - - >>> type(cars) # doctest: +SKIP - polars.dataframe.frame.DataFrame - - Using ``pandas``: - - data = Loader.from_backend("pandas") - cars = data("cars") - - >>> type(cars) # doctest: +SKIP - pandas.core.frame.DataFrame - - Using ``pandas``, backed by ``pyarrow`` dtypes: - - data = Loader.from_backend("pandas[pyarrow]") - cars = data("cars", tag="v1.29.0") - - >>> type(cars) # doctest: +SKIP - pandas.core.frame.DataFrame - - >>> cars.dtypes # doctest: +SKIP - Name string[pyarrow] - Miles_per_Gallon double[pyarrow] - Cylinders int64[pyarrow] - Displacement double[pyarrow] - Horsepower int64[pyarrow] - Weight_in_lbs int64[pyarrow] - Acceleration double[pyarrow] - Year string[pyarrow] - Origin string[pyarrow] - dtype: object - """ - obj = Loader.__new__(Loader) - obj._reader = backend(backend_name) - return obj - - def __call__( - self, - name: Dataset | LiteralString, - suffix: Extension | None = None, - /, - tag: Version | None = None, - **kwds: Any, - ) -> IntoDataFrameT: - """ - Get a remote dataset and load as tabular data. - - Parameters - ---------- - name - Name of the dataset/`Path.stem`_. - suffix - File extension/`Path.suffix`_. - - .. note:: - Only needed if ``name`` is available in multiple formats. - tag - Version identifier for a `vega-datasets release`_. - **kwds - Arguments passed to the underlying read function. - - .. _Path.stem: - https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.stem - .. _Path.suffix: - https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.suffix - .. 
_vega-datasets release: - https://github.com/vega/vega-datasets/releases - - Examples - -------- - Using ``polars``: - - from altair.datasets import Loader - data = Loader.from_backend("polars") - source = data("stocks", tag="v2.10.0") - - >>> source.columns # doctest: +SKIP - ['symbol', 'date', 'price'] - - >>> source # doctest: +SKIP - shape: (560, 3) - ┌────────┬────────────┬────────┐ - │ symbol ┆ date ┆ price │ - │ --- ┆ --- ┆ --- │ - │ str ┆ str ┆ f64 │ - ╞════════╪════════════╪════════╡ - │ MSFT ┆ Jan 1 2000 ┆ 39.81 │ - │ MSFT ┆ Feb 1 2000 ┆ 36.35 │ - │ MSFT ┆ Mar 1 2000 ┆ 43.22 │ - │ MSFT ┆ Apr 1 2000 ┆ 28.37 │ - │ MSFT ┆ May 1 2000 ┆ 25.45 │ - │ … ┆ … ┆ … │ - │ AAPL ┆ Nov 1 2009 ┆ 199.91 │ - │ AAPL ┆ Dec 1 2009 ┆ 210.73 │ - │ AAPL ┆ Jan 1 2010 ┆ 192.06 │ - │ AAPL ┆ Feb 1 2010 ┆ 204.62 │ - │ AAPL ┆ Mar 1 2010 ┆ 223.02 │ - └────────┴────────────┴────────┘ - - Using ``pandas``: - - data = Loader.from_backend("pandas") - source = data("stocks", tag="v2.10.0") - - >>> source.columns # doctest: +SKIP - Index(['symbol', 'date', 'price'], dtype='object') - - >>> source # doctest: +SKIP - symbol date price - 0 MSFT Jan 1 2000 39.81 - 1 MSFT Feb 1 2000 36.35 - 2 MSFT Mar 1 2000 43.22 - 3 MSFT Apr 1 2000 28.37 - 4 MSFT May 1 2000 25.45 - .. ... ... ... 
- 555 AAPL Nov 1 2009 199.91 - 556 AAPL Dec 1 2009 210.73 - 557 AAPL Jan 1 2010 192.06 - 558 AAPL Feb 1 2010 204.62 - 559 AAPL Mar 1 2010 223.02 - - [560 rows x 3 columns] - - Using ``pyarrow``: - - data = Loader.from_backend("pyarrow") - source = data("stocks", tag="v2.10.0") - - >>> source.column_names # doctest: +SKIP - ['symbol', 'date', 'price'] - - >>> source # doctest: +SKIP - pyarrow.Table - symbol: string - date: string - price: double - ---- - symbol: [["MSFT","MSFT","MSFT","MSFT","MSFT",...,"AAPL","AAPL","AAPL","AAPL","AAPL"]] - date: [["Jan 1 2000","Feb 1 2000","Mar 1 2000","Apr 1 2000","May 1 2000",...,"Nov 1 2009","Dec 1 2009","Jan 1 2010","Feb 1 2010","Mar 1 2010"]] - price: [[39.81,36.35,43.22,28.37,25.45,...,199.91,210.73,192.06,204.62,223.02]] - """ - return self._reader.dataset(name, suffix, tag=tag, **kwds) - - def url( - self, - name: Dataset | LiteralString, - suffix: Extension | None = None, - /, - tag: Version | None = None, - ) -> str: - """ - Return the address of a remote dataset. - - Parameters - ---------- - name - Name of the dataset/`Path.stem`_. - suffix - File extension/`Path.suffix`_. - - .. note:: - Only needed if ``name`` is available in multiple formats. - tag - Version identifier for a `vega-datasets release`_. - - .. _Path.stem: - https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.stem - .. _Path.suffix: - https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.suffix - .. 
_vega-datasets release: - https://github.com/vega/vega-datasets/releases - - Examples - -------- - The returned url will always point to an accessible dataset: - - import altair as alt - from altair.datasets import Loader - - data = Loader.from_backend("polars") - >>> data.url("cars", tag="v2.9.0") # doctest: +SKIP - 'https://cdn.jsdelivr.net/npm/vega-datasets@v2.9.0/data/cars.json' - - We can pass the result directly to a chart: - - url = data.url("cars", tag="v2.9.0") - alt.Chart(url).mark_point().encode(x="Horsepower:Q", y="Miles_per_Gallon:Q") - """ - return self._reader.url(name, suffix, tag=tag) - - @property - def cache_dir(self) -> Path | None: - """ - Returns path to datasets cache. - - By default, this can be configured using the environment variable: - - "ALTAIR_DATASETS_DIR" - - You *may* also set this directly, but the value will **not** persist between sessions: - - from pathlib import Path - - from altair.datasets import Loader - - data = Loader.from_backend("polars") - data.cache_dir = Path.home() / ".altair_cache" - - >>> data.cache_dir.relative_to(Path.home()).as_posix() # doctest: +SKIP - '.altair_cache' - """ - return self._reader._cache - - @cache_dir.setter - def cache_dir(self, source: StrPath, /) -> None: - import os - - os.environ[self._reader._ENV_VAR] = str(source) - - def __repr__(self) -> str: - return f"{type(self).__name__}[{self._reader._name}]" + from altair.datasets._loader import _Load + from altair.datasets._typing import Dataset, Extension, Version -@final -class _Load(Loader[IntoDataFrameT, IntoFrameT]): - @overload - def __call__( # pyright: ignore[reportOverlappingOverload] - self, - name: Dataset | LiteralString, - suffix: Extension | None = ..., - /, - tag: Version | None = ..., - backend: None = ..., - **kwds: Any, - ) -> IntoDataFrameT: ... 
- @overload - def __call__( - self, - name: Dataset | LiteralString, - suffix: Extension | None = ..., - /, - tag: Version | None = ..., - backend: Literal["polars", "polars[pyarrow]"] = ..., - **kwds: Any, - ) -> pl.DataFrame: ... - @overload - def __call__( - self, - name: Dataset | LiteralString, - suffix: Extension | None = ..., - /, - tag: Version | None = ..., - backend: Literal["pandas", "pandas[pyarrow]"] = ..., - **kwds: Any, - ) -> pd.DataFrame: ... - @overload - def __call__( - self, - name: Dataset | LiteralString, - suffix: Extension | None = ..., - /, - tag: Version | None = ..., - backend: Literal["pyarrow"] = ..., - **kwds: Any, - ) -> pa.Table: ... - def __call__( - self, - name: Dataset | LiteralString, - suffix: Extension | None = None, - /, - tag: Version | None = None, - backend: _Backend | None = None, - **kwds: Any, - ) -> IntoDataFrameT | pl.DataFrame | pd.DataFrame | pa.Table: - if backend is None: - return super().__call__(name, suffix, tag, **kwds) - else: - return self.from_backend(backend)(name, suffix, tag=tag, **kwds) +__all__ = ["Loader", "load", "url"] load: _Load[Any, Any] @@ -400,14 +43,50 @@ def __call__( """ +def url( + name: Dataset | LiteralString, + suffix: Extension | None = None, + /, + tag: Version | None = None, +) -> str: + """ + Return the address of a remote dataset. + + Parameters + ---------- + name + Name of the dataset/`Path.stem`_. + suffix + File extension/`Path.suffix`_. + + .. note:: + Only needed if ``name`` is available in multiple formats. + tag + Version identifier for a `vega-datasets release`_. + + .. _Path.stem: + https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.stem + .. _Path.suffix: + https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.suffix + .. 
_vega-datasets release: + https://github.com/vega/vega-datasets/releases + + Related + ------- + - https://github.com/vega/altair/pull/3631#issuecomment-2484826592 + - https://github.com/vega/altair/pull/3631#issuecomment-2480832711 + - https://github.com/vega/altair/discussions/3150#discussioncomment-11280516 + - https://github.com/vega/altair/pull/3631#discussion_r1846662053 + """ + from altair.datasets._loader import load + + return load.url(name, suffix, tag=tag) + + def __getattr__(name): if name == "load": - from altair.datasets._readers import infer_backend + from altair.datasets._loader import load - reader = infer_backend() - global load - load = _Load.__new__(_Load) - load._reader = reader return load else: msg = f"module {__name__!r} has no attribute {name!r}" diff --git a/altair/datasets/_loader.py b/altair/datasets/_loader.py new file mode 100644 index 000000000..3c2a0ee21 --- /dev/null +++ b/altair/datasets/_loader.py @@ -0,0 +1,394 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, Generic, final, overload + +from narwhals.typing import IntoDataFrameT, IntoFrameT + +from altair.datasets._readers import _Reader, backend + +if TYPE_CHECKING: + import sys + from pathlib import Path + from typing import Any, Literal + + import pandas as pd + import polars as pl + import pyarrow as pa + from _typeshed import StrPath + + if sys.version_info >= (3, 11): + from typing import LiteralString + else: + from typing_extensions import LiteralString + from altair.datasets._readers import _Backend + from altair.datasets._typing import Dataset, Extension, Version + +__all__ = ["Loader", "load"] + + +class Loader(Generic[IntoDataFrameT, IntoFrameT]): + """ + Load examples **remotely** from `vega-datasets`_, with *optional* caching. + + A new ``Loader`` must be initialized by specifying a backend: + + from altair.datasets import Loader + + data = Loader.from_backend("polars") + >>> data # doctest: +SKIP + Loader[polars] + + .. 
_vega-datasets: + https://github.com/vega/vega-datasets + """ + + _reader: _Reader[IntoDataFrameT, IntoFrameT] + + @overload + @classmethod + def from_backend( + cls, backend_name: Literal["polars", "polars[pyarrow]"], / + ) -> Loader[pl.DataFrame, pl.LazyFrame]: ... + + @overload + @classmethod + def from_backend( + cls, backend_name: Literal["pandas", "pandas[pyarrow]"], / + ) -> Loader[pd.DataFrame, pd.DataFrame]: ... + + @overload + @classmethod + def from_backend( + cls, backend_name: Literal["pyarrow"], / + ) -> Loader[pa.Table, pa.Table]: ... + + @classmethod + def from_backend(cls, backend_name: _Backend, /) -> Loader[Any, Any]: + """ + Initialize a new loader, with the specified backend. + + Parameters + ---------- + backend_name + DataFrame package/config used to return data. + + * *polars*: Using `polars defaults`_ + * *polars[pyarrow]*: Using ``use_pyarrow=True`` + * *pandas*: Using `pandas defaults`_. + * *pandas[pyarrow]*: Using ``dtype_backend="pyarrow"`` + * *pyarrow*: (*Experimental*) + + .. warning:: + Most datasets use a `JSON format not supported`_ by ``pyarrow`` + + .. _polars defaults: + https://docs.pola.rs/api/python/stable/reference/io.html + .. _pandas defaults: + https://pandas.pydata.org/docs/reference/io.html + .. 
_JSON format not supported: + https://arrow.apache.org/docs/python/json.html#reading-json-files + + Examples + -------- + Using ``polars``: + + from altair.datasets import Loader + + data = Loader.from_backend("polars") + cars = data("cars") + + >>> type(cars) # doctest: +SKIP + polars.dataframe.frame.DataFrame + + Using ``pandas``: + + data = Loader.from_backend("pandas") + cars = data("cars") + + >>> type(cars) # doctest: +SKIP + pandas.core.frame.DataFrame + + Using ``pandas``, backed by ``pyarrow`` dtypes: + + data = Loader.from_backend("pandas[pyarrow]") + cars = data("cars", tag="v1.29.0") + + >>> type(cars) # doctest: +SKIP + pandas.core.frame.DataFrame + + >>> cars.dtypes # doctest: +SKIP + Name string[pyarrow] + Miles_per_Gallon double[pyarrow] + Cylinders int64[pyarrow] + Displacement double[pyarrow] + Horsepower int64[pyarrow] + Weight_in_lbs int64[pyarrow] + Acceleration double[pyarrow] + Year string[pyarrow] + Origin string[pyarrow] + dtype: object + """ + obj = Loader.__new__(Loader) + obj._reader = backend(backend_name) + return obj + + def __call__( + self, + name: Dataset | LiteralString, + suffix: Extension | None = None, + /, + tag: Version | None = None, + **kwds: Any, + ) -> IntoDataFrameT: + """ + Get a remote dataset and load as tabular data. + + Parameters + ---------- + name + Name of the dataset/`Path.stem`_. + suffix + File extension/`Path.suffix`_. + + .. note:: + Only needed if ``name`` is available in multiple formats. + tag + Version identifier for a `vega-datasets release`_. + **kwds + Arguments passed to the underlying read function. + + .. _Path.stem: + https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.stem + .. _Path.suffix: + https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.suffix + .. 
_vega-datasets release: + https://github.com/vega/vega-datasets/releases + + Examples + -------- + Using ``polars``: + + from altair.datasets import Loader + + data = Loader.from_backend("polars") + source = data("stocks", tag="v2.10.0") + + >>> source.columns # doctest: +SKIP + ['symbol', 'date', 'price'] + + >>> source # doctest: +SKIP + shape: (560, 3) + ┌────────┬────────────┬────────┐ + │ symbol ┆ date ┆ price │ + │ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ f64 │ + ╞════════╪════════════╪════════╡ + │ MSFT ┆ Jan 1 2000 ┆ 39.81 │ + │ MSFT ┆ Feb 1 2000 ┆ 36.35 │ + │ MSFT ┆ Mar 1 2000 ┆ 43.22 │ + │ MSFT ┆ Apr 1 2000 ┆ 28.37 │ + │ MSFT ┆ May 1 2000 ┆ 25.45 │ + │ … ┆ … ┆ … │ + │ AAPL ┆ Nov 1 2009 ┆ 199.91 │ + │ AAPL ┆ Dec 1 2009 ┆ 210.73 │ + │ AAPL ┆ Jan 1 2010 ┆ 192.06 │ + │ AAPL ┆ Feb 1 2010 ┆ 204.62 │ + │ AAPL ┆ Mar 1 2010 ┆ 223.02 │ + └────────┴────────────┴────────┘ + + Using ``pandas``: + + data = Loader.from_backend("pandas") + source = data("stocks", tag="v2.10.0") + + >>> source.columns # doctest: +SKIP + Index(['symbol', 'date', 'price'], dtype='object') + + >>> source # doctest: +SKIP + symbol date price + 0 MSFT Jan 1 2000 39.81 + 1 MSFT Feb 1 2000 36.35 + 2 MSFT Mar 1 2000 43.22 + 3 MSFT Apr 1 2000 28.37 + 4 MSFT May 1 2000 25.45 + .. ... ... ... 
+ 555 AAPL Nov 1 2009 199.91 + 556 AAPL Dec 1 2009 210.73 + 557 AAPL Jan 1 2010 192.06 + 558 AAPL Feb 1 2010 204.62 + 559 AAPL Mar 1 2010 223.02 + + [560 rows x 3 columns] + + Using ``pyarrow``: + + data = Loader.from_backend("pyarrow") + source = data("stocks", tag="v2.10.0") + + >>> source.column_names # doctest: +SKIP + ['symbol', 'date', 'price'] + + >>> source # doctest: +SKIP + pyarrow.Table + symbol: string + date: string + price: double + ---- + symbol: [["MSFT","MSFT","MSFT","MSFT","MSFT",...,"AAPL","AAPL","AAPL","AAPL","AAPL"]] + date: [["Jan 1 2000","Feb 1 2000","Mar 1 2000","Apr 1 2000","May 1 2000",...,"Nov 1 2009","Dec 1 2009","Jan 1 2010","Feb 1 2010","Mar 1 2010"]] + price: [[39.81,36.35,43.22,28.37,25.45,...,199.91,210.73,192.06,204.62,223.02]] + """ + return self._reader.dataset(name, suffix, tag=tag, **kwds) + + def url( + self, + name: Dataset | LiteralString, + suffix: Extension | None = None, + /, + tag: Version | None = None, + ) -> str: + """ + Return the address of a remote dataset. + + Parameters + ---------- + name + Name of the dataset/`Path.stem`_. + suffix + File extension/`Path.suffix`_. + + .. note:: + Only needed if ``name`` is available in multiple formats. + tag + Version identifier for a `vega-datasets release`_. + + .. _Path.stem: + https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.stem + .. _Path.suffix: + https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.suffix + .. 
_vega-datasets release: + https://github.com/vega/vega-datasets/releases + + Examples + -------- + The returned url will always point to an accessible dataset: + + import altair as alt + from altair.datasets import Loader + + data = Loader.from_backend("polars") + >>> data.url("cars", tag="v2.9.0") # doctest: +SKIP + 'https://cdn.jsdelivr.net/npm/vega-datasets@v2.9.0/data/cars.json' + + We can pass the result directly to a chart: + + url = data.url("cars", tag="v2.9.0") + alt.Chart(url).mark_point().encode(x="Horsepower:Q", y="Miles_per_Gallon:Q") + """ + return self._reader.url(name, suffix, tag=tag) + + @property + def cache_dir(self) -> Path | None: + """ + Returns path to datasets cache. + + By default, this can be configured using the environment variable: + + "ALTAIR_DATASETS_DIR" + + You *may* also set this directly, but the value will **not** persist between sessions: + + from pathlib import Path + + from altair.datasets import Loader + + data = Loader.from_backend("polars") + data.cache_dir = Path.home() / ".altair_cache" + + >>> data.cache_dir.relative_to(Path.home()).as_posix() # doctest: +SKIP + '.altair_cache' + """ + return self._reader._cache + + @cache_dir.setter + def cache_dir(self, source: StrPath, /) -> None: + import os + + os.environ[self._reader._ENV_VAR] = str(source) + + def __repr__(self) -> str: + return f"{type(self).__name__}[{self._reader._name}]" + + +@final +class _Load(Loader[IntoDataFrameT, IntoFrameT]): + @overload + def __call__( # pyright: ignore[reportOverlappingOverload] + self, + name: Dataset | LiteralString, + suffix: Extension | None = ..., + /, + tag: Version | None = ..., + backend: None = ..., + **kwds: Any, + ) -> IntoDataFrameT: ... + @overload + def __call__( + self, + name: Dataset | LiteralString, + suffix: Extension | None = ..., + /, + tag: Version | None = ..., + backend: Literal["polars", "polars[pyarrow]"] = ..., + **kwds: Any, + ) -> pl.DataFrame: ... 
+ @overload + def __call__( + self, + name: Dataset | LiteralString, + suffix: Extension | None = ..., + /, + tag: Version | None = ..., + backend: Literal["pandas", "pandas[pyarrow]"] = ..., + **kwds: Any, + ) -> pd.DataFrame: ... + @overload + def __call__( + self, + name: Dataset | LiteralString, + suffix: Extension | None = ..., + /, + tag: Version | None = ..., + backend: Literal["pyarrow"] = ..., + **kwds: Any, + ) -> pa.Table: ... + def __call__( + self, + name: Dataset | LiteralString, + suffix: Extension | None = None, + /, + tag: Version | None = None, + backend: _Backend | None = None, + **kwds: Any, + ) -> IntoDataFrameT | pl.DataFrame | pd.DataFrame | pa.Table: + if backend is None: + return super().__call__(name, suffix, tag, **kwds) + else: + return self.from_backend(backend)(name, suffix, tag=tag, **kwds) + + +load: _Load[Any, Any] + + +def __getattr__(name): + if name == "load": + from altair.datasets._readers import infer_backend + + reader = infer_backend() + global load + load = _Load.__new__(_Load) + load._reader = reader + return load + else: + msg = f"module {__name__!r} has no attribute {name!r}" + raise AttributeError(msg) diff --git a/tests/test_datasets.py b/tests/test_datasets.py index 3d986ec75..6de691ff2 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -141,11 +141,11 @@ def test_load(monkeypatch: pytest.MonkeyPatch) -> None: priority: Sequence[_Backend] = "polars", "pandas[pyarrow]", "pandas", "pyarrow" """ - import altair.datasets + import altair.datasets._loader from altair.datasets import load assert load._reader._name == "polars" - monkeypatch.delattr(altair.datasets, "load") + monkeypatch.delattr(altair.datasets._loader, "load", raising=False) monkeypatch.setitem(sys.modules, "polars", None) @@ -154,20 +154,20 @@ def test_load(monkeypatch: pytest.MonkeyPatch) -> None: if find_spec("pyarrow") is None: # NOTE: We can end the test early for the CI job that removes `pyarrow` assert load._reader._name == "pandas" - 
monkeypatch.delattr(altair.datasets, "load") + monkeypatch.delattr(altair.datasets._loader, "load") monkeypatch.setitem(sys.modules, "pandas", None) with pytest.raises(NotImplementedError, match="no.+backend"): from altair.datasets import load else: assert load._reader._name == "pandas[pyarrow]" - monkeypatch.delattr(altair.datasets, "load") + monkeypatch.delattr(altair.datasets._loader, "load") monkeypatch.setitem(sys.modules, "pyarrow", None) from altair.datasets import load assert load._reader._name == "pandas" - monkeypatch.delattr(altair.datasets, "load") + monkeypatch.delattr(altair.datasets._loader, "load") monkeypatch.setitem(sys.modules, "pandas", None) monkeypatch.delitem(sys.modules, "pyarrow") @@ -175,7 +175,7 @@ def test_load(monkeypatch: pytest.MonkeyPatch) -> None: from altair.datasets import load assert load._reader._name == "pyarrow" - monkeypatch.delattr(altair.datasets, "load") + monkeypatch.delattr(altair.datasets._loader, "load") monkeypatch.setitem(sys.modules, "pyarrow", None) with pytest.raises(NotImplementedError, match="no.+backend"): @@ -184,11 +184,11 @@ def test_load(monkeypatch: pytest.MonkeyPatch) -> None: @requires_pyarrow def test_load_call(monkeypatch: pytest.MonkeyPatch) -> None: - import altair.datasets + import altair.datasets._loader - monkeypatch.delattr(altair.datasets, "load", raising=False) + monkeypatch.delattr(altair.datasets._loader, "load", raising=False) + from altair.datasets import load - load = altair.datasets.load assert load._reader._name == "polars" default = load("cars") @@ -204,6 +204,47 @@ def test_load_call(monkeypatch: pytest.MonkeyPatch) -> None: assert is_polars_dataframe(df_polars) +@pytest.mark.parametrize( + "name", + [ + "jobs", + "la-riots", + "londonBoroughs", + "londonCentroids", + "londonTubeLines", + "lookup_groups", + "lookup_people", + "miserables", + "monarchs", + "movies", + "normal-2d", + "obesity", + "ohlc", + "penguins", + "platformer-terrain", + "points", + "political-contributions", + 
"population", + "population_engineers_hurricanes", + "seattle-temps", + "seattle-weather", + "seattle-weather-hourly-normals", + "sf-temps", + "sp500", + "sp500-2000", + "stocks", + "udistrict", + ], +) +def test_url(name: Dataset) -> None: + from altair.datasets import url + + pattern = re.compile(rf".+jsdelivr\.net/npm/vega-datasets@.+/data/{name}\..+") + result = url(name) + assert isinstance(result, str) + assert pattern.match(result) is not None + + @backends def test_loader_call(backend: _Backend, monkeypatch: pytest.MonkeyPatch) -> None: monkeypatch.delenv(CACHE_ENV_VAR, raising=False)