Skip to content

Commit

Permalink
Add string.json APIs to pylibcudf
Browse files Browse the repository at this point in the history
  • Loading branch information
mroeschke committed Oct 8, 2024
1 parent 349ba5d commit 42aaf46
Show file tree
Hide file tree
Showing 11 changed files with 244 additions and 72 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ strings
find_multiple
findall
padding
json
regex_flags
regex_program
repeat
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
====
json
====

.. automodule:: pylibcudf.strings.json
    :members:
2 changes: 1 addition & 1 deletion python/cudf/cudf/_lib/strings/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@
)
from cudf._lib.strings.find_multiple import find_multiple
from cudf._lib.strings.findall import find_re, findall
from cudf._lib.strings.json import GetJsonObjectOptions, get_json_object
from cudf._lib.strings.json import get_json_object
from cudf._lib.strings.padding import center, ljust, pad, rjust, zfill
from cudf._lib.strings.repeat import repeat_scalar, repeat_sequence
from cudf._lib.strings.replace import (
Expand Down
80 changes: 11 additions & 69 deletions python/cudf/cudf/_lib/strings/json.pyx
Original file line number Diff line number Diff line change
@@ -1,84 +1,26 @@
# Copyright (c) 2021-2024, NVIDIA CORPORATION.

from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move
import pylibcudf as plc
from pylibcudf.strings.json cimport GetJsonObjectOptions

from cudf.core.buffer import acquire_spill_lock

from pylibcudf.libcudf.column.column cimport column
from pylibcudf.libcudf.column.column_view cimport column_view
from pylibcudf.libcudf.scalar.scalar cimport string_scalar
from pylibcudf.libcudf.strings.json cimport (
get_json_object as cpp_get_json_object,
get_json_object_options,
)

from cudf._lib.column cimport Column
from cudf._lib.scalar cimport DeviceScalar


@acquire_spill_lock()
def get_json_object(
Column col, object py_json_path, GetJsonObjectOptions options):
Column col,
object py_json_path,
GetJsonObjectOptions options
):
"""
Apply a JSONPath string to all rows in an input column
of json strings.
"""
cdef unique_ptr[column] c_result

cdef column_view col_view = col.view()
cdef DeviceScalar json_path = py_json_path.device_value

cdef const string_scalar* scalar_json_path = <const string_scalar*>(
json_path.get_raw_ptr()
plc_column = plc.strings.json.get_json_object(
col.to_pylibcudf(mode="read"),
py_json_path.device_value.c_value,
options
)

with nogil:
c_result = move(cpp_get_json_object(
col_view,
scalar_json_path[0],
options.options,
))

return Column.from_unique_ptr(move(c_result))


cdef class GetJsonObjectOptions:
cdef get_json_object_options options

def __init__(
self,
*,
allow_single_quotes=False,
strip_quotes_from_single_strings=True,
missing_fields_as_nulls=False
):
self.options.set_allow_single_quotes(allow_single_quotes)
self.options.set_strip_quotes_from_single_strings(
strip_quotes_from_single_strings
)
self.options.set_missing_fields_as_nulls(missing_fields_as_nulls)

@property
def allow_single_quotes(self):
return self.options.get_allow_single_quotes()

@property
def strip_quotes_from_single_strings(self):
return self.options.get_strip_quotes_from_single_strings()

@property
def missing_fields_as_nulls(self):
return self.options.get_missing_fields_as_nulls()

@allow_single_quotes.setter
def allow_single_quotes(self, val):
self.options.set_allow_single_quotes(val)

@strip_quotes_from_single_strings.setter
def strip_quotes_from_single_strings(self, val):
self.options.set_strip_quotes_from_single_strings(val)

@missing_fields_as_nulls.setter
def missing_fields_as_nulls(self, val):
self.options.set_missing_fields_as_nulls(val)
return Column.from_pylibcudf(plc_column)
3 changes: 1 addition & 2 deletions python/cudf/cudf/core/column/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -2385,8 +2385,7 @@ def get_json_object(
0 [\n { "category": "reference",\n ...
dtype: object
"""

options = libstrings.GetJsonObjectOptions(
options = plc.strings.json.GetJsonObjectOptions(
allow_single_quotes=allow_single_quotes,
strip_quotes_from_single_strings=(
strip_quotes_from_single_strings
Expand Down
1 change: 1 addition & 0 deletions python/pylibcudf/pylibcudf/strings/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ set(cython_sources
find.pyx
find_multiple.pyx
findall.pyx
json.pyx
padding.pyx
regex_flags.pyx
regex_program.pyx
Expand Down
6 changes: 6 additions & 0 deletions python/pylibcudf/pylibcudf/strings/__init__.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,11 @@ from . cimport (
find,
find_multiple,
findall,
json,
padding,
regex_flags,
regex_program,
repeat,
replace,
side_type,
slice,
Expand All @@ -33,9 +35,13 @@ __all__ = [
"convert",
"extract",
"find",
"find_multiple",
"findall",
"json",
"padding",
"regex_flags",
"regex_program",
"repeat",
"replace",
"slice",
"strip",
Expand Down
5 changes: 5 additions & 0 deletions python/pylibcudf/pylibcudf/strings/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
find,
find_multiple,
findall,
json,
padding,
regex_flags,
regex_program,
Expand All @@ -34,9 +35,13 @@
"convert",
"extract",
"find",
"find_multiple",
"findall",
"json",
"padding",
"regex_flags",
"regex_program",
"repeat",
"replace",
"slice",
"strip",
Expand Down
16 changes: 16 additions & 0 deletions python/pylibcudf/pylibcudf/strings/json.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from pylibcudf.column cimport Column
from pylibcudf.libcudf.strings.json cimport get_json_object_options
from pylibcudf.scalar cimport Scalar


# Extension type wrapping libcudf's ``get_json_object_options``. It is
# declared in this .pxd so that other Cython modules can cimport it.
cdef class GetJsonObjectOptions:
# The C++ options struct, stored by value on the instance.
cdef get_json_object_options options


# Declaration of the cpdef entry point implemented in json.pyx.
# ``options=*`` marks the argument as optional at the declaration level;
# the actual default value is supplied by the implementation.
cpdef Column get_json_object(
Column col,
Scalar json_path,
GetJsonObjectOptions options=*
)
154 changes: 154 additions & 0 deletions python/pylibcudf/pylibcudf/strings/json.pyx
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from cython.operator cimport dereference
from libcpp cimport bool
from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move
from pylibcudf.column cimport Column
from pylibcudf.libcudf.column.column cimport column
from pylibcudf.libcudf.scalar.scalar cimport string_scalar
from pylibcudf.libcudf.strings cimport json as cpp_json
from pylibcudf.scalar cimport Scalar


cdef class GetJsonObjectOptions:
    """
    Settings for ``get_json_object()``.

    Parameters
    ----------
    allow_single_quotes : bool, default False
        Whether single-quotes for representing strings are allowed.
    strip_quotes_from_single_strings : bool, default True
        Whether individually returned string values have their quotes
        stripped.
    missing_fields_as_nulls : bool, default False
        Whether a field not contained by an object is interpreted as null.
    """
    def __init__(
        self,
        *,
        allow_single_quotes=False,
        strip_quotes_from_single_strings=True,
        missing_fields_as_nulls=False
    ):
        # Route every keyword through its setter so that construction and
        # later updates share a single code path.
        self.set_allow_single_quotes(allow_single_quotes)
        self.set_strip_quotes_from_single_strings(
            strip_quotes_from_single_strings
        )
        self.set_missing_fields_as_nulls(missing_fields_as_nulls)

    # --- allow_single_quotes -------------------------------------------

    def get_allow_single_quotes(self):
        """
        Report whether single-quotes are accepted for representing strings.

        Returns
        -------
        bool
            True if single-quotes are allowed, False otherwise.
        """
        return self.options.get_allow_single_quotes()

    def set_allow_single_quotes(self, bool val):
        """
        Choose whether single-quotes for strings are allowed.

        Parameters
        ----------
        val : bool
            Whether to allow single quotes.

        Returns
        -------
        None
        """
        self.options.set_allow_single_quotes(val)

    # --- strip_quotes_from_single_strings ------------------------------

    def get_strip_quotes_from_single_strings(self):
        """
        Report whether individually returned string values have their
        quotes stripped.

        Returns
        -------
        bool
            True if individually returned string values have their quotes
            stripped.
        """
        return self.options.get_strip_quotes_from_single_strings()

    def set_strip_quotes_from_single_strings(self, bool val):
        """
        Choose whether individually returned string values have their
        quotes stripped.

        Parameters
        ----------
        val : bool
            Whether to strip quotes from single strings.

        Returns
        -------
        None
        """
        self.options.set_strip_quotes_from_single_strings(val)

    # --- missing_fields_as_nulls ---------------------------------------

    def get_missing_fields_as_nulls(self):
        """
        Report whether a field not contained by an object is interpreted
        as null.

        Returns
        -------
        bool
            True if missing fields are interpreted as null.
        """
        return self.options.get_missing_fields_as_nulls()

    def set_missing_fields_as_nulls(self, bool val):
        """
        Choose whether missing fields are interpreted as null.

        Parameters
        ----------
        val : bool
            Whether to treat missing fields as nulls.

        Returns
        -------
        None
        """
        self.options.set_missing_fields_as_nulls(val)


cpdef Column get_json_object(
    Column col,
    Scalar json_path,
    GetJsonObjectOptions options=None
):
    """
    Apply a JSONPath string to all rows in an input strings column.

    For details, see :cpp:func:`cpp::strings::get_json_object`

    Parameters
    ----------
    col : Column
        The input strings column. Each row must contain a valid json string.
    json_path : Scalar
        The JSONPath string to be applied to each row.
    options : GetJsonObjectOptions, optional
        Options for controlling the behavior of the function. When ``None``,
        a default-constructed ``GetJsonObjectOptions`` is used.

    Returns
    -------
    Column
        New strings column containing the retrieved json object strings.
    """
    # NOTE(review): the ``cpp::strings::`` prefix in the cross-reference
    # above looks unusual -- confirm it resolves to the intended libcudf
    # symbol when the docs are built.
    cdef unique_ptr[column] owned_result
    cdef string_scalar* path_ptr
    cdef cpp_json.get_json_object_options cpp_opts

    # Reinterpret the scalar held by ``json_path`` as a string_scalar.
    path_ptr = <string_scalar*>(json_path.c_obj.get())

    # Fall back to default settings when the caller passed no options.
    if options is None:
        options = GetJsonObjectOptions()
    cpp_opts = options.options

    # Release the GIL for the duration of the libcudf call.
    with nogil:
        owned_result = move(
            cpp_json.get_json_object(
                col.view(),
                dereference(path_ptr),
                cpp_opts
            )
        )

    return Column.from_libcudf(move(owned_result))
Loading

0 comments on commit 42aaf46

Please sign in to comment.