Skip to content

Commit

Permalink
Merge branch 'branch-24.06' into fea/move_to_latest_nanoarrow
Browse files Browse the repository at this point in the history
  • Loading branch information
robertmaynard committed Apr 12, 2024
2 parents e007013 + f19d4eb commit d649bf4
Show file tree
Hide file tree
Showing 14 changed files with 160 additions and 255 deletions.
184 changes: 0 additions & 184 deletions cpp/cmake/thirdparty/patches/nanoarrow_cmake.diff

This file was deleted.

18 changes: 0 additions & 18 deletions cpp/cmake/thirdparty/patches/nanoarrow_override.json

This file was deleted.

30 changes: 17 additions & 13 deletions cpp/src/io/json/json_column.cu
Original file line number Diff line number Diff line change
Expand Up @@ -76,16 +76,16 @@ void print_tree(host_span<SymbolT const> input,
tree_meta_t const& d_gpu_tree,
rmm::cuda_stream_view stream)
{
print_vec(cudf::detail::make_std_vector_async(d_gpu_tree.node_categories, stream),
print_vec(cudf::detail::make_std_vector_sync(d_gpu_tree.node_categories, stream),
"node_categories",
to_cat);
print_vec(cudf::detail::make_std_vector_async(d_gpu_tree.parent_node_ids, stream),
print_vec(cudf::detail::make_std_vector_sync(d_gpu_tree.parent_node_ids, stream),
"parent_node_ids",
to_int);
print_vec(
cudf::detail::make_std_vector_async(d_gpu_tree.node_levels, stream), "node_levels", to_int);
auto node_range_begin = cudf::detail::make_std_vector_async(d_gpu_tree.node_range_begin, stream);
auto node_range_end = cudf::detail::make_std_vector_async(d_gpu_tree.node_range_end, stream);
cudf::detail::make_std_vector_sync(d_gpu_tree.node_levels, stream), "node_levels", to_int);
auto node_range_begin = cudf::detail::make_std_vector_sync(d_gpu_tree.node_range_begin, stream);
auto node_range_end = cudf::detail::make_std_vector_sync(d_gpu_tree.node_range_end, stream);
print_vec(node_range_begin, "node_range_begin", to_int);
print_vec(node_range_end, "node_range_end", to_int);
for (int i = 0; i < int(node_range_begin.size()); i++) {
Expand Down Expand Up @@ -333,10 +333,11 @@ rmm::device_uvector<NodeIndexT> get_values_column_indices(TreeDepthT const row_a
* @param stream CUDA stream
* @return Vector of strings
*/
std::vector<std::string> copy_strings_to_host(device_span<SymbolT const> input,
device_span<SymbolOffsetT const> node_range_begin,
device_span<SymbolOffsetT const> node_range_end,
rmm::cuda_stream_view stream)
std::vector<std::string> copy_strings_to_host_sync(
device_span<SymbolT const> input,
device_span<SymbolOffsetT const> node_range_begin,
device_span<SymbolOffsetT const> node_range_end,
rmm::cuda_stream_view stream)
{
CUDF_FUNC_RANGE();
auto const num_strings = node_range_begin.size();
Expand Down Expand Up @@ -371,12 +372,13 @@ std::vector<std::string> copy_strings_to_host(device_span<SymbolT const> input,
auto to_host = [stream](auto const& col) {
if (col.is_empty()) return std::vector<std::string>{};
auto const scv = cudf::strings_column_view(col);
auto const h_chars = cudf::detail::make_std_vector_sync<char>(
auto const h_chars = cudf::detail::make_std_vector_async<char>(
cudf::device_span<char const>(scv.chars_begin(stream), scv.chars_size(stream)), stream);
auto const h_offsets = cudf::detail::make_std_vector_sync(
auto const h_offsets = cudf::detail::make_std_vector_async(
cudf::device_span<cudf::size_type const>(scv.offsets().data<cudf::size_type>() + scv.offset(),
scv.size() + 1),
stream);
stream.synchronize();

// build std::string vector from chars and offsets
std::vector<std::string> host_data;
Expand Down Expand Up @@ -528,15 +530,17 @@ void make_device_json_column(device_span<SymbolT const> input,
auto column_range_beg =
cudf::detail::make_std_vector_async(d_column_tree.node_range_begin, stream);
auto max_row_offsets = cudf::detail::make_std_vector_async(d_max_row_offsets, stream);
std::vector<std::string> column_names = copy_strings_to_host(
std::vector<std::string> column_names = copy_strings_to_host_sync(
input, d_column_tree.node_range_begin, d_column_tree.node_range_end, stream);
stream.synchronize();
// array of arrays column names
if (is_array_of_arrays) {
TreeDepthT const row_array_children_level = is_enabled_lines ? 1 : 2;
auto values_column_indices =
get_values_column_indices(row_array_children_level, tree, col_ids, num_columns, stream);
auto h_values_column_indices =
cudf::detail::make_std_vector_async(values_column_indices, stream);
stream.synchronize();
std::transform(unique_col_ids.begin(),
unique_col_ids.end(),
column_names.begin(),
Expand Down Expand Up @@ -609,7 +613,7 @@ void make_device_json_column(device_span<SymbolT const> input,

std::vector<uint8_t> is_str_column_all_nulls{};
if (is_enabled_mixed_types_as_string) {
is_str_column_all_nulls = cudf::detail::make_std_vector_async(
is_str_column_all_nulls = cudf::detail::make_std_vector_sync(
is_all_nulls_each_column(input, d_column_tree, tree, col_ids, options, stream), stream);
}

Expand Down
27 changes: 19 additions & 8 deletions docs/cudf/source/user_guide/pandas-comparison.md
Original file line number Diff line number Diff line change
Expand Up @@ -87,9 +87,17 @@ using `.from_arrow()` or `.from_pandas()`.

## Result ordering

By default, `join` (or `merge`), `value_counts` and `groupby` operations in cuDF
do *not* guarantee output ordering.
Compare the results obtained from Pandas and cuDF below:
In Pandas, `join` (or `merge`), `value_counts` and `groupby` operations provide
certain guarantees about the order of rows in the result returned. In a Pandas
`join`, the order of join keys is (depending on the particular style of join
being performed) either preserved or sorted lexicographically by default.
`groupby` sorts the group keys by default, and preserves the order of rows
within each group. In some cases, disabling this sorting in Pandas (by passing
`sort=False`) can yield better performance.

By contrast, cuDF's default behavior is to return rows in a
non-deterministic order to maximize performance. Compare the results
obtained from Pandas and cuDF below:

```{code} python
>>> import cupy as cp
Expand All @@ -114,13 +122,16 @@ a
4 342.000000
```

To match Pandas behavior, you must explicitly pass `sort=True`
or enable the `mode.pandas_compatible` option when trying to
match Pandas behavior with `sort=False`:
In most cases, the rows of a DataFrame are accessed by index labels
rather than by position, so the order in which rows are returned
doesn't matter. However, if you require that results be returned in a
predictable (sorted) order, you can either pass the `sort=True` option
explicitly, or enable the `mode.pandas_compatible` option so that results
match Pandas ordering even with `sort=False`:

```{code} python
>>> df.to_pandas().groupby("a", sort=True).mean().head()
b
>>> df.groupby("a", sort=True).mean().head()
b
a
0 70.000000
1 356.333333
Expand Down
2 changes: 2 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -44,3 +44,5 @@ rapids_cython_create_modules(
LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX pylibcudf_ ASSOCIATED_TARGETS cudf
)
link_to_pyarrow_headers(pylibcudf_interop)

add_subdirectory(strings)
2 changes: 2 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/__init__.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ from . cimport (
search,
sorting,
stream_compaction,
strings,
types,
unary,
)
Expand Down Expand Up @@ -48,6 +49,7 @@ __all__ = [
"rolling",
"search",
"stream_compaction",
"strings",
"sorting",
"types",
"unary",
Expand Down
2 changes: 2 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
search,
sorting,
stream_compaction,
strings,
types,
unary,
)
Expand Down Expand Up @@ -48,6 +49,7 @@
"rolling",
"search",
"stream_compaction",
"strings",
"sorting",
"types",
"unary",
Expand Down
21 changes: 21 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# =============================================================================
# Copyright (c) 2024, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
# in compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under the License
# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
# or implied. See the License for the specific language governing permissions and limitations under
# the License.
# =============================================================================

# Cython sources in this directory; each listed .pyx file is handed to
# rapids_cython_create_modules below.
set(cython_sources case.pyx)
# Libraries passed as LINKED_LIBRARIES for the modules created below.
set(linked_libraries cudf::cudf)
# Create the modules from the Cython sources above, using C++ (CXX).
# NOTE(review): MODULE_PREFIX presumably prefixes the generated CMake target
# names (not the importable module names) to avoid clashes with targets built
# elsewhere in the project -- confirm against the rapids-cmake documentation.
rapids_cython_create_modules(
CXX
SOURCE_FILES "${cython_sources}"
LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX pylibcudf_ ASSOCIATED_TARGETS cudf
)
3 changes: 3 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from . import case
3 changes: 3 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/strings/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from . import case
Loading

0 comments on commit d649bf4

Please sign in to comment.