Skip to content

Commit

Permalink
Merge remote-tracking branch 'upstream/branch-24.12' into pylibcudf/s…
Browse files Browse the repository at this point in the history
…trings/convert_urls
  • Loading branch information
mroeschke committed Oct 7, 2024
2 parents 62378c8 + f926a61 commit d27685e
Show file tree
Hide file tree
Showing 30 changed files with 541 additions and 162 deletions.
30 changes: 30 additions & 0 deletions .github/workflows/pr_issue_status_automation.yml
Original file line number Diff line number Diff line change
Expand Up @@ -62,3 +62,33 @@ jobs:
UPDATE_ITEM: true
UPDATE_LINKED_ISSUES: true
secrets: inherit

# Derives a release name from the pull request's base branch and exposes it as
# the job output `branch-name` (consumed by the update-release job as the value
# of the "Release" field).
process-branch-name:
# Runs only for open PRs for which get-project-id produced a non-empty ITEM_PROJECT_ID.
if: ${{ github.event.pull_request.state == 'open' && needs.get-project-id.outputs.ITEM_PROJECT_ID != '' }}
needs: get-project-id
runs-on: ubuntu-latest
outputs:
# Re-exports the step output of the same name as a job-level output.
branch-name: ${{ steps.process-branch-name.outputs.branch-name }}
steps:
# Strips a leading "branch-" from the base ref (e.g. "branch-24.12" -> "24.12")
# and writes the remainder to $GITHUB_OUTPUT under the key `branch-name`.
# A base ref without that prefix is passed through unchanged.
# NOTE(review): the base ref is expanded by `${{ }}` directly into the shell
# script; consider passing it through `env:` instead to rule out script
# injection via a crafted branch name -- confirm whether that is a concern here.
- name: Extract branch name
id: process-branch-name
run: |
branch=${{ github.event.pull_request.base.ref }}
release=${branch#branch-}
echo "branch-name=$release" >> "$GITHUB_OUTPUT"
update-release:
# This job sets the PR and its linked issues to the release they are targeting.
# The release value is the `branch-name` output of the process-branch-name job,
# written into the single-select project field named "Release".
uses: rapidsai/shared-workflows/.github/workflows/project-get-set-single-select-field.yaml@branch-24.12
# Runs only for open PRs for which get-project-id produced a non-empty ITEM_PROJECT_ID.
if: ${{ github.event.pull_request.state == 'open' && needs.get-project-id.outputs.ITEM_PROJECT_ID != '' }}
# Both jobs are required: one supplies ITEM_PROJECT_ID, the other the release name.
needs: [get-project-id, process-branch-name]
with:
PROJECT_ID: "PVT_kwDOAp2shc4AiNzl"
SINGLE_SELECT_FIELD_ID: "PVTSSF_lADOAp2shc4AiNzlzgg52UQ"
SINGLE_SELECT_FIELD_NAME: "Release"
SINGLE_SELECT_OPTION_VALUE: "${{ needs.process-branch-name.outputs.branch-name }}"
ITEM_PROJECT_ID: "${{ needs.get-project-id.outputs.ITEM_PROJECT_ID }}"
ITEM_NODE_ID: "${{ github.event.pull_request.node_id }}"
UPDATE_ITEM: true
UPDATE_LINKED_ISSUES: true
secrets: inherit
2 changes: 1 addition & 1 deletion ci/test_python_cudf.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ source ./ci/test_python_common.sh test_python_cudf

rapids-logger "Check GPU usage"
nvidia-smi

rapids-print-env
EXITCODE=0
trap "EXITCODE=1" ERR
set +e
Expand Down
2 changes: 1 addition & 1 deletion conda/environments/all_cuda-118_arch-x86_64.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ dependencies:
- openpyxl
- packaging
- pandas
- pandas>=2.0,<2.2.3dev0
- pandas>=2.0,<2.2.4dev0
- pandoc
- polars>=1.8,<1.9
- pre-commit
Expand Down
2 changes: 1 addition & 1 deletion conda/environments/all_cuda-125_arch-x86_64.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ dependencies:
- openpyxl
- packaging
- pandas
- pandas>=2.0,<2.2.3dev0
- pandas>=2.0,<2.2.4dev0
- pandoc
- polars>=1.8,<1.9
- pre-commit
Expand Down
2 changes: 1 addition & 1 deletion conda/recipes/cudf/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ requirements:
run:
- python
- typing_extensions >=4.0.0
- pandas >=2.0,<2.2.3dev0
- pandas >=2.0,<2.2.4dev0
- cupy >=12.0.0
- numba-cuda >=0.0.13
- numpy >=1.23,<3.0a0
Expand Down
2 changes: 1 addition & 1 deletion conda/recipes/pylibcudf/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ requirements:
run:
- python
- typing_extensions >=4.0.0
- pandas >=2.0,<2.2.3dev0
- pandas >=2.0,<2.2.4dev0
- numpy >=1.23,<3.0a0
- pyarrow>=14.0.0,<18.0.0a0
- {{ pin_compatible('rmm', max_pin='x.x') }}
Expand Down
3 changes: 3 additions & 0 deletions cpp/src/io/json/write_json.cu
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,9 @@ struct escape_strings_fn {
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
{
if (column_v.is_empty()) { // empty begets empty
return make_empty_column(type_id::STRING);
}
auto [offsets_column, chars] =
cudf::strings::detail::make_strings_children(*this, column_v.size(), stream, mr);

Expand Down
37 changes: 37 additions & 0 deletions cpp/tests/io/json/json_writer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,43 @@ TEST_F(JsonWriterTest, EmptyInput)
EXPECT_EQ(expected_lines, std::string(out_buffer.data(), out_buffer.size()));
}

// Checks JSON output for a one-row table whose leaf values are all empty:
// an empty string, a list of strings with no elements, and an empty list of ints.
// Both the JSON-array form and the JSON Lines form are verified.
TEST_F(JsonWriterTest, EmptyLeaf)
{
// One row holding an empty string.
cudf::test::strings_column_wrapper col1{""};
// Offsets {0, 0} describe a single list row that contains zero child elements.
cudf::test::fixed_width_column_wrapper<cudf::size_type> offsets{0, 0};
// List column with 1 row built on top of an empty strings child column.
auto col2 = make_lists_column(1,
offsets.release(),
cudf::test::strings_column_wrapper{}.release(),
0,
rmm::device_buffer{},
cudf::test::get_default_stream());
auto col3 = cudf::test::lists_column_wrapper<int>::make_one_empty_row_column();
cudf::table_view tbl_view{{col1, *col2, col3}};
cudf::io::table_metadata mt{{{"col1"}, {"col2"}, {"col3"}}};

// Write into an in-memory buffer so the produced text can be compared directly.
std::vector<char> out_buffer;
auto destination = cudf::io::sink_info(&out_buffer);
auto out_options = cudf::io::json_writer_options_builder(destination, tbl_view)
.include_nulls(true)
.metadata(mt)
.lines(false)
.na_rep("null")
.build();

// One row with empty leaves - JSON array output (lines disabled)
cudf::io::write_json(out_options, cudf::test::get_default_stream());
std::string const expected = R"([{"col1":"","col2":[],"col3":[]}])";
EXPECT_EQ(expected, std::string(out_buffer.data(), out_buffer.size()));

// Same table - JSON Lines output (buffer is reset first so only the new output is compared)
out_buffer.clear();
out_options.enable_lines(true);
cudf::io::write_json(out_options, cudf::test::get_default_stream());
std::string const expected_lines = R"({"col1":"","col2":[],"col3":[]})"
"\n";
EXPECT_EQ(expected_lines, std::string(out_buffer.data(), out_buffer.size()));
}

TEST_F(JsonWriterTest, ErrorCases)
{
cudf::test::strings_column_wrapper col1{"a", "b", "c"};
Expand Down
6 changes: 5 additions & 1 deletion dependencies.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -602,7 +602,7 @@ dependencies:
packages:
- fsspec>=0.6.0
- &numpy numpy>=1.23,<3.0a0
- pandas>=2.0,<2.2.3dev0
- pandas>=2.0,<2.2.4dev0
run_pylibcudf:
common:
- output_types: [conda, requirements, pyproject]
Expand Down Expand Up @@ -748,6 +748,10 @@ dependencies:
packages:
- *numba-cuda-dep
- pandas==2.0.*
- matrix: {dependencies: "latest"}
packages:
- numba-cuda==0.0.15
- pandas==2.2.3
- matrix:
packages:
- output_types: conda
Expand Down
50 changes: 14 additions & 36 deletions python/cudf/cudf/_lib/string_casting.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@ from cudf._lib.scalar import as_device_scalar
from cudf._lib.types import SUPPORTED_NUMPY_TO_LIBCUDF_TYPES

from libcpp.memory cimport unique_ptr
from libcpp.string cimport string
from libcpp.utility cimport move

from pylibcudf.libcudf.column.column cimport column
Expand All @@ -22,11 +21,6 @@ from pylibcudf.libcudf.strings.convert.convert_integers cimport (
is_hex as cpp_is_hex,
to_integers as cpp_to_integers,
)
from pylibcudf.libcudf.strings.convert.convert_ipv4 cimport (
integers_to_ipv4 as cpp_integers_to_ipv4,
ipv4_to_integers as cpp_ipv4_to_integers,
is_ipv4 as cpp_is_ipv4,
)
from pylibcudf.libcudf.types cimport data_type, type_id

from cudf._lib.types cimport underlying_type_t_type_id
Expand Down Expand Up @@ -525,12 +519,11 @@ def timedelta2int(Column input_col, dtype, format):
"""
dtype = dtype_to_pylibcudf_type(dtype)
cdef string c_timestamp_format = format.encode('UTF-8')
return Column.from_pylibcudf(
plc.strings.convert.convert_durations.to_durations(
input_col.to_pylibcudf(mode="read"),
dtype,
c_timestamp_format
format
)
)

Expand All @@ -549,12 +542,10 @@ def int2timedelta(Column input_col, str format):
A Column with Timedelta represented in string format
"""

cdef string c_duration_format = format.encode('UTF-8')
return Column.from_pylibcudf(
plc.strings.convert.convert_durations.from_durations(
input_col.to_pylibcudf(mode="read"),
c_duration_format
format
)
)

Expand All @@ -572,14 +563,10 @@ def int2ip(Column input_col):
A Column with integer represented in string ipv4 format
"""

cdef column_view input_column_view = input_col.view()
cdef unique_ptr[column] c_result
with nogil:
c_result = move(
cpp_integers_to_ipv4(input_column_view))

return Column.from_unique_ptr(move(c_result))
plc_column = plc.strings.convert.convert_ipv4.integers_to_ipv4(
input_col.to_pylibcudf(mode="read")
)
return Column.from_pylibcudf(plc_column)


def ip2int(Column input_col):
Expand All @@ -595,14 +582,10 @@ def ip2int(Column input_col):
A Column with ipv4 represented as integer
"""

cdef column_view input_column_view = input_col.view()
cdef unique_ptr[column] c_result
with nogil:
c_result = move(
cpp_ipv4_to_integers(input_column_view))

return Column.from_unique_ptr(move(c_result))
plc_column = plc.strings.convert.convert_ipv4.ipv4_to_integers(
input_col.to_pylibcudf(mode="read")
)
return Column.from_pylibcudf(plc_column)


def is_ipv4(Column source_strings):
Expand All @@ -611,15 +594,10 @@ def is_ipv4(Column source_strings):
that have strings in IPv4 format. This format is nnn.nnn.nnn.nnn
where nnn is integer digits in [0,255].
"""
cdef unique_ptr[column] c_result
cdef column_view source_view = source_strings.view()

with nogil:
c_result = move(cpp_is_ipv4(
source_view
))

return Column.from_unique_ptr(move(c_result))
plc_column = plc.strings.convert.convert_ipv4.is_ipv4(
source_strings.to_pylibcudf(mode="read")
)
return Column.from_pylibcudf(plc_column)


def htoi(Column input_col, **kwargs):
Expand Down
69 changes: 17 additions & 52 deletions python/cudf/cudf/_lib/strings/convert/convert_fixed_point.pyx
Original file line number Diff line number Diff line change
@@ -1,22 +1,11 @@
# Copyright (c) 2021-2024, NVIDIA CORPORATION.

import cudf

from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move

from cudf.core.buffer import acquire_spill_lock

from pylibcudf.libcudf.column.column cimport column
from pylibcudf.libcudf.column.column_view cimport column_view
from pylibcudf.libcudf.strings.convert.convert_fixed_point cimport (
from_fixed_point as cpp_from_fixed_point,
is_fixed_point as cpp_is_fixed_point,
to_fixed_point as cpp_to_fixed_point,
)
from pylibcudf.libcudf.types cimport data_type, type_id

from cudf._lib.column cimport Column
from cudf._lib.types cimport dtype_to_pylibcudf_type

import pylibcudf as plc


@acquire_spill_lock()
Expand All @@ -32,14 +21,10 @@ def from_decimal(Column input_col):
-------
A column of strings representing the input decimal values.
"""
cdef column_view input_column_view = input_col.view()
cdef unique_ptr[column] c_result
with nogil:
c_result = move(
cpp_from_fixed_point(
input_column_view))

return Column.from_unique_ptr(move(c_result))
plc_column = plc.strings.convert.convert_fixed_point.from_fixed_point(
input_col.to_pylibcudf(mode="read"),
)
return Column.from_pylibcudf(plc_column)


@acquire_spill_lock()
Expand All @@ -57,25 +42,11 @@ def to_decimal(Column input_col, object out_type):
-------
A column of decimals parsed from the string values.
"""
cdef column_view input_column_view = input_col.view()
cdef unique_ptr[column] c_result
cdef int scale = out_type.scale
cdef data_type c_out_type
if isinstance(out_type, cudf.Decimal32Dtype):
c_out_type = data_type(type_id.DECIMAL32, -scale)
elif isinstance(out_type, cudf.Decimal64Dtype):
c_out_type = data_type(type_id.DECIMAL64, -scale)
elif isinstance(out_type, cudf.Decimal128Dtype):
c_out_type = data_type(type_id.DECIMAL128, -scale)
else:
raise TypeError("should be a decimal dtype")
with nogil:
c_result = move(
cpp_to_fixed_point(
input_column_view,
c_out_type))

result = Column.from_unique_ptr(move(c_result))
plc_column = plc.strings.convert.convert_fixed_point.to_fixed_point(
input_col.to_pylibcudf(mode="read"),
dtype_to_pylibcudf_type(out_type),
)
result = Column.from_pylibcudf(plc_column)
result.dtype.precision = out_type.precision
return result

Expand All @@ -98,14 +69,8 @@ def is_fixed_point(Column input_col, object dtype):
-------
A Column of booleans indicating valid decimal conversion.
"""
cdef unique_ptr[column] c_result
cdef column_view source_view = input_col.view()
cdef int scale = dtype.scale
cdef data_type c_dtype = data_type(type_id.DECIMAL64, -scale)
with nogil:
c_result = move(cpp_is_fixed_point(
source_view,
c_dtype
))

return Column.from_unique_ptr(move(c_result))
plc_column = plc.strings.convert.convert_fixed_point.is_fixed_point(
input_col.to_pylibcudf(mode="read"),
dtype_to_pylibcudf_type(dtype),
)
return Column.from_pylibcudf(plc_column)
2 changes: 1 addition & 1 deletion python/cudf/cudf/core/_compat.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import pandas as pd
from packaging import version

PANDAS_CURRENT_SUPPORTED_VERSION = version.parse("2.2.2")
PANDAS_CURRENT_SUPPORTED_VERSION = version.parse("2.2.3")
PANDAS_VERSION = version.parse(pd.__version__)


Expand Down
2 changes: 1 addition & 1 deletion python/cudf/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ dependencies = [
"numpy>=1.23,<3.0a0",
"nvtx>=0.2.1",
"packaging",
"pandas>=2.0,<2.2.3dev0",
"pandas>=2.0,<2.2.4dev0",
"ptxcompiler",
"pyarrow>=14.0.0,<18.0.0a0",
"pylibcudf==24.12.*,>=0.0.0a0",
Expand Down
2 changes: 1 addition & 1 deletion python/dask_cudf/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ dependencies = [
"cupy-cuda11x>=12.0.0",
"fsspec>=0.6.0",
"numpy>=1.23,<3.0a0",
"pandas>=2.0,<2.2.3dev0",
"pandas>=2.0,<2.2.4dev0",
"rapids-dask-dependency==24.12.*,>=0.0.0a0",
] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
classifiers = [
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ from pylibcudf.libcudf.types cimport data_type
cdef extern from "cudf/strings/convert/convert_durations.hpp" namespace \
"cudf::strings" nogil:
cdef unique_ptr[column] to_durations(
const column_view & strings_col,
const column_view & input,
data_type duration_type,
const string & format) except +

Expand Down
Loading

0 comments on commit d27685e

Please sign in to comment.