diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst
index 48dc8a13c3e..1e8f45def2f 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst
@@ -12,6 +12,7 @@ strings
     find_multiple
     findall
     padding
+    json
     regex_flags
     regex_program
     repeat
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/json.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/json.rst
new file mode 100644
index 00000000000..d53f5b15cc9
--- /dev/null
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/json.rst
@@ -0,0 +1,6 @@
+====
+json
+====
+
+.. automodule:: pylibcudf.strings.json
+    :members:
diff --git a/python/cudf/cudf/_lib/strings/__init__.py b/python/cudf/cudf/_lib/strings/__init__.py
index e712937f816..ffa5e603408 100644
--- a/python/cudf/cudf/_lib/strings/__init__.py
+++ b/python/cudf/cudf/_lib/strings/__init__.py
@@ -72,7 +72,7 @@
 )
 from cudf._lib.strings.find_multiple import find_multiple
 from cudf._lib.strings.findall import find_re, findall
-from cudf._lib.strings.json import GetJsonObjectOptions, get_json_object
+from cudf._lib.strings.json import get_json_object
 from cudf._lib.strings.padding import center, ljust, pad, rjust, zfill
 from cudf._lib.strings.repeat import repeat_scalar, repeat_sequence
 from cudf._lib.strings.replace import (
diff --git a/python/cudf/cudf/_lib/strings/json.pyx b/python/cudf/cudf/_lib/strings/json.pyx
index c9b0bba088d..226a9e961bf 100644
--- a/python/cudf/cudf/_lib/strings/json.pyx
+++ b/python/cudf/cudf/_lib/strings/json.pyx
@@ -1,84 +1,26 @@
 # Copyright (c) 2021-2024, NVIDIA CORPORATION.
 
-from libcpp.memory cimport unique_ptr
-from libcpp.utility cimport move
+import pylibcudf as plc
+from pylibcudf.strings.json cimport GetJsonObjectOptions
 
 from cudf.core.buffer import acquire_spill_lock
 
-from pylibcudf.libcudf.column.column cimport column
-from pylibcudf.libcudf.column.column_view cimport column_view
-from pylibcudf.libcudf.scalar.scalar cimport string_scalar
-from pylibcudf.libcudf.strings.json cimport (
-    get_json_object as cpp_get_json_object,
-    get_json_object_options,
-)
-
 from cudf._lib.column cimport Column
-from cudf._lib.scalar cimport DeviceScalar
 
 
 @acquire_spill_lock()
 def get_json_object(
-        Column col, object py_json_path, GetJsonObjectOptions options):
+    Column col,
+    object py_json_path,
+    GetJsonObjectOptions options
+):
     """
     Apply a JSONPath string to all rows in an input column
     of json strings.
     """
-    cdef unique_ptr[column] c_result
-
-    cdef column_view col_view = col.view()
-    cdef DeviceScalar json_path = py_json_path.device_value
-
-    cdef const string_scalar* scalar_json_path = <const string_scalar*>(
-        json_path.get_raw_ptr()
+    plc_column = plc.strings.json.get_json_object(
+        col.to_pylibcudf(mode="read"),
+        py_json_path.device_value.c_value,
+        options
     )
-
-    with nogil:
-        c_result = move(cpp_get_json_object(
-            col_view,
-            scalar_json_path[0],
-            options.options,
-        ))
-
-    return Column.from_unique_ptr(move(c_result))
-
-
-cdef class GetJsonObjectOptions:
-    cdef get_json_object_options options
-
-    def __init__(
-        self,
-        *,
-        allow_single_quotes=False,
-        strip_quotes_from_single_strings=True,
-        missing_fields_as_nulls=False
-    ):
-        self.options.set_allow_single_quotes(allow_single_quotes)
-        self.options.set_strip_quotes_from_single_strings(
-            strip_quotes_from_single_strings
-        )
-        self.options.set_missing_fields_as_nulls(missing_fields_as_nulls)
-
-    @property
-    def allow_single_quotes(self):
-        return self.options.get_allow_single_quotes()
-
-    @property
-    def strip_quotes_from_single_strings(self):
-        return self.options.get_strip_quotes_from_single_strings()
-
-    @property
-    def missing_fields_as_nulls(self):
-        return self.options.get_missing_fields_as_nulls()
-
-    @allow_single_quotes.setter
-    def allow_single_quotes(self, val):
-        self.options.set_allow_single_quotes(val)
-
-    @strip_quotes_from_single_strings.setter
-    def strip_quotes_from_single_strings(self, val):
-        self.options.set_strip_quotes_from_single_strings(val)
-
-    @missing_fields_as_nulls.setter
-    def missing_fields_as_nulls(self, val):
-        self.options.set_missing_fields_as_nulls(val)
+    return Column.from_pylibcudf(plc_column)
diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
index b50e23bd52e..a86a29d2cfb 100644
--- a/python/cudf/cudf/core/column/string.py
+++ b/python/cudf/cudf/core/column/string.py
@@ -2385,8 +2385,7 @@ def get_json_object(
         0    [\n        { "category": "reference",\n       ...
         dtype: object
         """
-
-        options = libstrings.GetJsonObjectOptions(
+        options = plc.strings.json.GetJsonObjectOptions(
             allow_single_quotes=allow_single_quotes,
             strip_quotes_from_single_strings=(
                 strip_quotes_from_single_strings
diff --git a/python/pylibcudf/pylibcudf/strings/CMakeLists.txt b/python/pylibcudf/pylibcudf/strings/CMakeLists.txt
index eeb44d19333..feebfe198dc 100644
--- a/python/pylibcudf/pylibcudf/strings/CMakeLists.txt
+++ b/python/pylibcudf/pylibcudf/strings/CMakeLists.txt
@@ -22,6 +22,7 @@ set(cython_sources
     find.pyx
     find_multiple.pyx
     findall.pyx
+    json.pyx
     padding.pyx
     regex_flags.pyx
     regex_program.pyx
diff --git a/python/pylibcudf/pylibcudf/strings/__init__.pxd b/python/pylibcudf/pylibcudf/strings/__init__.pxd
index 187ef113073..b5cf7ef13fa 100644
--- a/python/pylibcudf/pylibcudf/strings/__init__.pxd
+++ b/python/pylibcudf/pylibcudf/strings/__init__.pxd
@@ -11,9 +11,11 @@ from . cimport (
     find,
     find_multiple,
     findall,
+    json,
     padding,
     regex_flags,
     regex_program,
+    repeat,
     replace,
     side_type,
     slice,
@@ -33,9 +35,13 @@ __all__ = [
     "convert",
     "extract",
     "find",
+    "find_multiple",
     "findall",
+    "json",
+    "padding",
     "regex_flags",
     "regex_program",
+    "repeat",
     "replace",
     "slice",
     "strip",
diff --git a/python/pylibcudf/pylibcudf/strings/__init__.py b/python/pylibcudf/pylibcudf/strings/__init__.py
index 6033cea0625..f8b4211cb7f 100644
--- a/python/pylibcudf/pylibcudf/strings/__init__.py
+++ b/python/pylibcudf/pylibcudf/strings/__init__.py
@@ -11,6 +11,7 @@
     find,
     find_multiple,
     findall,
+    json,
     padding,
     regex_flags,
     regex_program,
@@ -34,9 +35,13 @@
     "convert",
     "extract",
     "find",
+    "find_multiple",
     "findall",
+    "json",
+    "padding",
     "regex_flags",
     "regex_program",
+    "repeat",
     "replace",
     "slice",
     "strip",
diff --git a/python/pylibcudf/pylibcudf/strings/json.pxd b/python/pylibcudf/pylibcudf/strings/json.pxd
new file mode 100644
index 00000000000..fe1c5f695fc
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/json.pxd
@@ -0,0 +1,16 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.strings.json cimport get_json_object_options
+from pylibcudf.scalar cimport Scalar
+
+
+cdef class GetJsonObjectOptions:
+    cdef get_json_object_options options
+
+
+cpdef Column get_json_object(
+    Column col,
+    Scalar json_path,
+    GetJsonObjectOptions options=*
+)
diff --git a/python/pylibcudf/pylibcudf/strings/json.pyx b/python/pylibcudf/pylibcudf/strings/json.pyx
new file mode 100644
index 00000000000..99056a1075e
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/json.pyx
@@ -0,0 +1,154 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from cython.operator cimport dereference
+from libcpp cimport bool
+from libcpp.memory cimport unique_ptr
+from libcpp.utility cimport move
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from pylibcudf.libcudf.strings cimport json as cpp_json
+from pylibcudf.scalar cimport Scalar
+
+
+cdef class GetJsonObjectOptions:
+    """Settings for ``get_json_object()``"""
+    def __init__(
+        self,
+        *,
+        allow_single_quotes=False,
+        strip_quotes_from_single_strings=True,
+        missing_fields_as_nulls=False
+    ):
+        self.set_allow_single_quotes(allow_single_quotes)
+        self.set_strip_quotes_from_single_strings(
+            strip_quotes_from_single_strings
+        )
+        self.set_missing_fields_as_nulls(missing_fields_as_nulls)
+
+    def get_allow_single_quotes(self):
+        """
+        Returns true/false depending on whether single-quotes for representing strings
+        are allowed.
+
+        Returns
+        -------
+        bool
+            true if single-quotes are allowed, false otherwise.
+        """
+        return self.options.get_allow_single_quotes()
+
+    def get_strip_quotes_from_single_strings(self):
+        """
+        Returns true/false depending on whether individually returned string values have
+        their quotes stripped.
+
+        Returns
+        -------
+        bool
+            true if individually returned string values have their quotes stripped.
+        """
+        return self.options.get_strip_quotes_from_single_strings()
+
+    def get_missing_fields_as_nulls(self):
+        """
+        Whether a field not contained by an object is to be interpreted as null.
+
+        Returns
+        -------
+        bool
+            true if missing fields are interpreted as null.
+        """
+        return self.options.get_missing_fields_as_nulls()
+
+    def set_allow_single_quotes(self, bool val):
+        """
+        Set whether single-quotes for strings are allowed.
+
+        Parameters
+        ----------
+        val : bool
+            Whether to allow single quotes
+
+        Returns
+        -------
+        None
+        """
+        self.options.set_allow_single_quotes(val)
+
+    def set_strip_quotes_from_single_strings(self, bool val):
+        """
+        Set whether individually returned string values have their quotes stripped.
+
+        Parameters
+        ----------
+        val : bool
+            Whether to strip quotes from single strings.
+
+        Returns
+        -------
+        None
+        """
+        self.options.set_strip_quotes_from_single_strings(val)
+
+    def set_missing_fields_as_nulls(self, bool val):
+        """
+        Set whether missing fields are interpreted as null.
+
+        Parameters
+        ----------
+        val : bool
+            Whether to treat missing fields as nulls.
+
+        Returns
+        -------
+        None
+        """
+        self.options.set_missing_fields_as_nulls(val)
+
+
+cpdef Column get_json_object(
+    Column col,
+    Scalar json_path,
+    GetJsonObjectOptions options=None
+):
+    """
+    Apply a JSONPath string to all rows in an input strings column.
+
+    For details, see :cpp:func:`cudf::get_json_object`
+
+    Parameters
+    ----------
+    col : Column
+        The input strings column. Each row must contain a valid json string.
+
+    json_path : Scalar
+        The JSONPath string to be applied to each row.
+
+    options : GetJsonObjectOptions
+        Options for controlling the behavior of the function.
+
+    Returns
+    -------
+    Column
+        New strings column containing the retrieved json object strings.
+    """
+    cdef unique_ptr[column] c_result
+    cdef string_scalar* c_json_path = <string_scalar*>(
+        json_path.c_obj.get()
+    )
+    if options is None:
+        options = GetJsonObjectOptions()
+
+    cdef cpp_json.get_json_object_options c_options = options.options
+
+    with nogil:
+        c_result = move(
+            cpp_json.get_json_object(
+                col.view(),
+                dereference(c_json_path),
+                c_options
+            )
+        )
+
+    return Column.from_libcudf(move(c_result))
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_json.py b/python/pylibcudf/pylibcudf/tests/test_string_json.py
new file mode 100644
index 00000000000..405a84c9cb2
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/tests/test_string_json.py
@@ -0,0 +1,42 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+import pyarrow as pa
+import pylibcudf as plc
+import pytest
+from utils import assert_column_eq
+
+
+@pytest.fixture(scope="module")
+def plc_col():
+    arr = pa.array(
+        ['{"foo": {"bar": [{"a": 1, "b": 2}, {"a": 3, "b": 4}]}}', None]
+    )
+    return plc.interop.from_arrow(arr)
+
+
+@pytest.fixture(scope="module")
+def json_path():
+    slr = pa.scalar("$.foo.bar")
+    return plc.interop.from_arrow(slr)
+
+
+@pytest.mark.parametrize("allow_single_quotes", [True, False])
+@pytest.mark.parametrize("strip_quotes_from_single_strings", [True, False])
+@pytest.mark.parametrize("missing_fields_as_nulls", [True, False])
+def test_get_json_object(
+    plc_col,
+    json_path,
+    allow_single_quotes,
+    strip_quotes_from_single_strings,
+    missing_fields_as_nulls,
+):
+    result = plc.strings.json.get_json_object(
+        plc_col,
+        json_path,
+        plc.strings.json.GetJsonObjectOptions(
+            allow_single_quotes=allow_single_quotes,
+            strip_quotes_from_single_strings=strip_quotes_from_single_strings,
+            missing_fields_as_nulls=missing_fields_as_nulls,
+        ),
+    )
+    expected = pa.array(['[{"a": 1, "b": 2}, {"a": 3, "b": 4}]', None])
+    assert_column_eq(result, expected)