Skip to content

Commit

Permalink
Merge branch 'branch-24.06' into eagerly-populate-class-dict
Browse files Browse the repository at this point in the history
  • Loading branch information
galipremsagar authored Apr 12, 2024
2 parents a078724 + f19d4eb commit fa542bb
Show file tree
Hide file tree
Showing 2 changed files with 36 additions and 21 deletions.
30 changes: 17 additions & 13 deletions cpp/src/io/json/json_column.cu
Original file line number Diff line number Diff line change
Expand Up @@ -76,16 +76,16 @@ void print_tree(host_span<SymbolT const> input,
tree_meta_t const& d_gpu_tree,
rmm::cuda_stream_view stream)
{
print_vec(cudf::detail::make_std_vector_async(d_gpu_tree.node_categories, stream),
print_vec(cudf::detail::make_std_vector_sync(d_gpu_tree.node_categories, stream),
"node_categories",
to_cat);
print_vec(cudf::detail::make_std_vector_async(d_gpu_tree.parent_node_ids, stream),
print_vec(cudf::detail::make_std_vector_sync(d_gpu_tree.parent_node_ids, stream),
"parent_node_ids",
to_int);
print_vec(
cudf::detail::make_std_vector_async(d_gpu_tree.node_levels, stream), "node_levels", to_int);
auto node_range_begin = cudf::detail::make_std_vector_async(d_gpu_tree.node_range_begin, stream);
auto node_range_end = cudf::detail::make_std_vector_async(d_gpu_tree.node_range_end, stream);
cudf::detail::make_std_vector_sync(d_gpu_tree.node_levels, stream), "node_levels", to_int);
auto node_range_begin = cudf::detail::make_std_vector_sync(d_gpu_tree.node_range_begin, stream);
auto node_range_end = cudf::detail::make_std_vector_sync(d_gpu_tree.node_range_end, stream);
print_vec(node_range_begin, "node_range_begin", to_int);
print_vec(node_range_end, "node_range_end", to_int);
for (int i = 0; i < int(node_range_begin.size()); i++) {
Expand Down Expand Up @@ -333,10 +333,11 @@ rmm::device_uvector<NodeIndexT> get_values_column_indices(TreeDepthT const row_a
* @param stream CUDA stream
* @return Vector of strings
*/
std::vector<std::string> copy_strings_to_host(device_span<SymbolT const> input,
device_span<SymbolOffsetT const> node_range_begin,
device_span<SymbolOffsetT const> node_range_end,
rmm::cuda_stream_view stream)
std::vector<std::string> copy_strings_to_host_sync(
device_span<SymbolT const> input,
device_span<SymbolOffsetT const> node_range_begin,
device_span<SymbolOffsetT const> node_range_end,
rmm::cuda_stream_view stream)
{
CUDF_FUNC_RANGE();
auto const num_strings = node_range_begin.size();
Expand Down Expand Up @@ -371,12 +372,13 @@ std::vector<std::string> copy_strings_to_host(device_span<SymbolT const> input,
auto to_host = [stream](auto const& col) {
if (col.is_empty()) return std::vector<std::string>{};
auto const scv = cudf::strings_column_view(col);
auto const h_chars = cudf::detail::make_std_vector_sync<char>(
auto const h_chars = cudf::detail::make_std_vector_async<char>(
cudf::device_span<char const>(scv.chars_begin(stream), scv.chars_size(stream)), stream);
auto const h_offsets = cudf::detail::make_std_vector_sync(
auto const h_offsets = cudf::detail::make_std_vector_async(
cudf::device_span<cudf::size_type const>(scv.offsets().data<cudf::size_type>() + scv.offset(),
scv.size() + 1),
stream);
stream.synchronize();

// build std::string vector from chars and offsets
std::vector<std::string> host_data;
Expand Down Expand Up @@ -528,15 +530,17 @@ void make_device_json_column(device_span<SymbolT const> input,
auto column_range_beg =
cudf::detail::make_std_vector_async(d_column_tree.node_range_begin, stream);
auto max_row_offsets = cudf::detail::make_std_vector_async(d_max_row_offsets, stream);
std::vector<std::string> column_names = copy_strings_to_host(
std::vector<std::string> column_names = copy_strings_to_host_sync(
input, d_column_tree.node_range_begin, d_column_tree.node_range_end, stream);
stream.synchronize();
// array of arrays column names
if (is_array_of_arrays) {
TreeDepthT const row_array_children_level = is_enabled_lines ? 1 : 2;
auto values_column_indices =
get_values_column_indices(row_array_children_level, tree, col_ids, num_columns, stream);
auto h_values_column_indices =
cudf::detail::make_std_vector_async(values_column_indices, stream);
stream.synchronize();
std::transform(unique_col_ids.begin(),
unique_col_ids.end(),
column_names.begin(),
Expand Down Expand Up @@ -609,7 +613,7 @@ void make_device_json_column(device_span<SymbolT const> input,

std::vector<uint8_t> is_str_column_all_nulls{};
if (is_enabled_mixed_types_as_string) {
is_str_column_all_nulls = cudf::detail::make_std_vector_async(
is_str_column_all_nulls = cudf::detail::make_std_vector_sync(
is_all_nulls_each_column(input, d_column_tree, tree, col_ids, options, stream), stream);
}

Expand Down
27 changes: 19 additions & 8 deletions docs/cudf/source/user_guide/pandas-comparison.md
Original file line number Diff line number Diff line change
Expand Up @@ -87,9 +87,17 @@ using `.from_arrow()` or `.from_pandas()`.

## Result ordering

By default, `join` (or `merge`), `value_counts` and `groupby` operations in cuDF
do *not* guarantee output ordering.
Compare the results obtained from Pandas and cuDF below:
In Pandas, `join` (or `merge`), `value_counts` and `groupby` operations provide
certain guarantees about the order of rows in the result returned. In a Pandas
`join`, the order of join keys is (depending on the particular style of join
being performed) either preserved or sorted lexicographically by default.
`groupby` sorts the group keys, and preserves the order of rows within each
group. In some cases, disabling this sorting in Pandas (by passing
`sort=False`) can yield better performance.

By contrast, cuDF's default behavior is to return rows in a
non-deterministic order to maximize performance. Compare the results
obtained from Pandas and cuDF below:

```{code} python
>>> import cupy as cp
Expand All @@ -114,13 +122,16 @@ a
4 342.000000
```

To match Pandas behavior, you must explicitly pass `sort=True`
or enable the `mode.pandas_compatible` option when trying to
match Pandas behavior with `sort=False`:
In most cases, the rows of a DataFrame are accessed by index labels
rather than by position, so the order in which rows are returned
doesn't matter. However, if you require that results be returned in a
predictable (sorted) order, you can either pass `sort=True` explicitly,
or enable the `mode.pandas_compatible` option if you need to match
Pandas behavior with `sort=False`:

```{code} python
>>> df.to_pandas().groupby("a", sort=True).mean().head()
b
>>> df.groupby("a", sort=True).mean().head()
b
a
0 70.000000
1 356.333333
Expand Down

0 comments on commit fa542bb

Please sign in to comment.