From d487e1093f7a2ec9bba0f5b230305016cdbe55e7 Mon Sep 17 00:00:00 2001
From: Phillip Cloud <417981+cpcloud@users.noreply.github.com>
Date: Sun, 20 Aug 2023 08:22:16 -0400
Subject: [PATCH] feat(pyspark): enable reading csv and parquet globs and
 implement `read_json`

---
 ibis/backends/pyspark/__init__.py    | 34 ++++++++++++++++++++++++++++
 ibis/backends/tests/test_register.py |  3 ---
 2 files changed, 34 insertions(+), 3 deletions(-)

diff --git a/ibis/backends/pyspark/__init__.py b/ibis/backends/pyspark/__init__.py
index 65afef53d450..76a8d97fbbf5 100644
--- a/ibis/backends/pyspark/__init__.py
+++ b/ibis/backends/pyspark/__init__.py
@@ -33,6 +33,8 @@
 from ibis.backends.pyspark.datatypes import PySparkType
 
 if TYPE_CHECKING:
+    from collections.abc import Sequence
+
     import pandas as pd
     import pyarrow as pa
 
@@ -682,6 +684,38 @@ def read_csv(
         spark_df.createOrReplaceTempView(table_name)
         return self.table(table_name)
 
+    def read_json(
+        self,
+        source_list: str | Sequence[str],
+        table_name: str | None = None,
+        **kwargs: Any,
+    ) -> ir.Table:
+        """Register a JSON file as a table in the current database.
+
+        Parameters
+        ----------
+        source_list
+            The data source(s). May be a path to a file or directory of JSON files, or an
+            iterable of JSON files.
+        table_name
+            An optional name to use for the created table. This defaults to
+            a sequentially generated name.
+        kwargs
+            Additional keyword arguments passed to PySpark loading function.
+            https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrameReader.json.html
+
+        Returns
+        -------
+        ir.Table
+            The just-registered table
+        """
+        source_list = normalize_filenames(source_list)
+        spark_df = self._session.read.json(source_list, **kwargs)
+        table_name = table_name or util.gen_name("read_json")
+
+        spark_df.createOrReplaceTempView(table_name)
+        return self.table(table_name)
+
     def register(
         self,
         source: str | Path | Any,
diff --git a/ibis/backends/tests/test_register.py b/ibis/backends/tests/test_register.py
index 3732609e6cb1..5fb3155b84bb 100644
--- a/ibis/backends/tests/test_register.py
+++ b/ibis/backends/tests/test_register.py
@@ -455,7 +455,6 @@ def ft_data(data_dir):
         "mysql",
         "pandas",
         "postgres",
-        "pyspark",
         "sqlite",
         "trino",
     ]
@@ -485,7 +484,6 @@ def test_read_parquet_glob(con, tmp_path, ft_data):
         "mysql",
         "pandas",
         "postgres",
-        "pyspark",
         "sqlite",
         "trino",
     ]
@@ -517,7 +515,6 @@ def test_read_csv_glob(con, tmp_path, ft_data):
         "mysql",
         "pandas",
         "postgres",
-        "pyspark",
         "sqlite",
         "trino",
     ]