From d487e1093f7a2ec9bba0f5b230305016cdbe55e7 Mon Sep 17 00:00:00 2001 From: Phillip Cloud <417981+cpcloud@users.noreply.github.com> Date: Sun, 20 Aug 2023 08:22:16 -0400 Subject: [PATCH] feat(pyspark): enable reading csv and parquet globs and implement `read_json` --- ibis/backends/pyspark/__init__.py | 34 ++++++++++++++++++++++++++++ ibis/backends/tests/test_register.py | 3 --- 2 files changed, 34 insertions(+), 3 deletions(-) diff --git a/ibis/backends/pyspark/__init__.py b/ibis/backends/pyspark/__init__.py index 65afef53d450..76a8d97fbbf5 100644 --- a/ibis/backends/pyspark/__init__.py +++ b/ibis/backends/pyspark/__init__.py @@ -33,6 +33,8 @@ from ibis.backends.pyspark.datatypes import PySparkType if TYPE_CHECKING: + from collections.abc import Sequence + import pandas as pd import pyarrow as pa @@ -682,6 +684,38 @@ def read_csv( spark_df.createOrReplaceTempView(table_name) return self.table(table_name) + def read_json( + self, + source_list: str | Sequence[str], + table_name: str | None = None, + **kwargs: Any, + ) -> ir.Table: + """Register a JSON file as a table in the current database. + + Parameters + ---------- + source_list + The data source(s). May be a path to a file or directory of JSON files, or an + iterable of JSON files. + table_name + An optional name to use for the created table. This defaults to + a sequentially generated name. + kwargs + Additional keyword arguments passed to PySpark loading function. + https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrameReader.json.html + + Returns + ------- + ir.Table + The just-registered table + """ + source_list = normalize_filenames(source_list) + spark_df = self._session.read.json(source_list, **kwargs) + table_name = table_name or util.gen_name("read_json") + + spark_df.createOrReplaceTempView(table_name) + return self.table(table_name) + def register( self, source: str | Path | Any, diff --git a/ibis/backends/tests/test_register.py b/ibis/backends/tests/test_register.py index 3732609e6cb1..5fb3155b84bb 100644 --- a/ibis/backends/tests/test_register.py +++ b/ibis/backends/tests/test_register.py @@ -455,7 +455,6 @@ def ft_data(data_dir): "mysql", "pandas", "postgres", - "pyspark", "sqlite", "trino", ] @@ -485,7 +484,6 @@ def test_read_parquet_glob(con, tmp_path, ft_data): "mysql", "pandas", "postgres", - "pyspark", "sqlite", "trino", ] @@ -517,7 +515,6 @@ def test_read_csv_glob(con, tmp_path, ft_data): "mysql", "pandas", "postgres", - "pyspark", "sqlite", "trino", ]