Skip to content

Commit

Permalink
Merge branch 'branch-24.06' into stod-overflow-exp
Browse files Browse the repository at this point in the history
  • Loading branch information
davidwendt committed Apr 12, 2024
2 parents 0755cb3 + 2e00cb1 commit 66e72ab
Show file tree
Hide file tree
Showing 9 changed files with 95 additions and 43 deletions.
18 changes: 18 additions & 0 deletions .github/workflows/build.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -108,3 +108,21 @@ jobs:
sha: ${{ inputs.sha }}
date: ${{ inputs.date }}
package-name: dask_cudf
trigger-pandas-tests:
  # Dispatch the separate pandas-tests.yaml workflow. Runs only for nightly
  # builds, and only after the wheel-build-cudf job has finished.
  if: inputs.build_type == 'nightly'
  needs: wheel-build-cudf
  runs-on: ubuntu-latest
  steps:
    - name: Checkout code repo
      uses: actions/checkout@v4
      with:
        ref: ${{ inputs.sha }}
        # Do not keep the token in the local git config after checkout.
        persist-credentials: false
    - name: Trigger pandas-tests
      env:
        GH_TOKEN: ${{ github.token }}
        # Hand the workflow inputs to the script through the environment
        # rather than expanding them inside the script text, so their values
        # are treated as data and never parsed as shell syntax.
        DISPATCH_BRANCH: ${{ inputs.branch }}
        DISPATCH_SHA: ${{ inputs.sha }}
        DISPATCH_DATE: ${{ inputs.date }}
      run: |
        gh workflow run pandas-tests.yaml \
          -f branch="${DISPATCH_BRANCH}" \
          -f sha="${DISPATCH_SHA}" \
          -f date="${DISPATCH_DATE}"
27 changes: 27 additions & 0 deletions .github/workflows/pandas-tests.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Runs the Pandas unit tests through the shared wheels-test workflow.
# Started on demand via workflow_dispatch; all three inputs are required.
name: Pandas Test Job

on:
  workflow_dispatch:
    inputs:
      branch:
        required: true
        type: string
      date:
        required: true
        type: string
      sha:
        required: true
        type: string

jobs:
  pandas-tests:
    # run the Pandas unit tests
    secrets: inherit
    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.06
    with:
      # Keep only the amd64 / Python 3.9 / CUDA 12.2.2 entry of the test matrix.
      matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.9" and .CUDA_VER == "12.2.2" ))
      build_type: nightly
      branch: ${{ inputs.branch }}
      date: ${{ inputs.date }}
      sha: ${{ inputs.sha }}
      # "main" is the first positional argument handed to run.sh.
      script: ci/cudf_pandas_scripts/pandas-tests/run.sh main
2 changes: 1 addition & 1 deletion .github/workflows/pr.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -174,7 +174,7 @@ jobs:
secrets: inherit
uses: rapidsai/shared-workflows/.github/workflows/[email protected]
with:
matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.9" and .CUDA_VER == "12.2.2" ))
build_type: pull-request
script: ci/cudf_pandas_scripts/pandas-tests/run.sh pr
# Hide test failures because they exceed the GITHUB_STEP_SUMMARY output limit.
Expand Down
11 changes: 0 additions & 11 deletions .github/workflows/test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -125,14 +125,3 @@ jobs:
date: ${{ inputs.date }}
sha: ${{ inputs.sha }}
script: ci/cudf_pandas_scripts/run_tests.sh
pandas-tests:
# run the Pandas unit tests
secrets: inherit
uses: rapidsai/shared-workflows/.github/workflows/[email protected]
with:
matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(min_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
build_type: nightly
branch: ${{ inputs.branch }}
date: ${{ inputs.date }}
sha: ${{ inputs.sha }}
script: ci/cudf_pandas_scripts/pandas-tests/run.sh main
8 changes: 5 additions & 3 deletions ci/cudf_pandas_scripts/pandas-tests/diff.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,16 @@

# Hard-coded needs to match the version deduced by rapids-upload-artifacts-dir
GH_JOB_NAME="pandas-tests-diff / build"
RAPIDS_FULL_VERSION=$(<./VERSION)
rapids-logger "Github job name: ${GH_JOB_NAME}"
rapids-logger "Rapids version: ${RAPIDS_FULL_VERSION}"

PY_VER="39"
MAIN_ARTIFACT=$(rapids-s3-path)cuda12_$(arch)_py${PY_VER}.main-results.json
PR_ARTIFACT=$(rapids-s3-path)cuda12_$(arch)_py${PY_VER}.pr-results.json
MAIN_ARTIFACT=$(rapids-s3-path)cuda12_$(arch)_py${PY_VER}.main-${RAPIDS_FULL_VERSION}-results.json
PR_ARTIFACT=$(rapids-s3-path)cuda12_$(arch)_py${PY_VER}.pr-${RAPIDS_FULL_VERSION}-results.json

rapids-logger "Fetching latest available results from nightly"
aws s3api list-objects-v2 --bucket rapids-downloads --prefix "nightly/" --query "sort_by(Contents[?ends_with(Key, '_py${PY_VER}.main-results.json')], &LastModified)[::-1].[Key]" --output text > s3_output.txt
aws s3api list-objects-v2 --bucket rapids-downloads --prefix "nightly/" --query "sort_by(Contents[?ends_with(Key, '_py${PY_VER}.main-${RAPIDS_FULL_VERSION}-results.json')], &LastModified)[::-1].[Key]" --output text > s3_output.txt

read -r COMPARE_ENV < s3_output.txt
export COMPARE_ENV
Expand Down
11 changes: 6 additions & 5 deletions ci/cudf_pandas_scripts/pandas-tests/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@
set -euo pipefail

PANDAS_TESTS_BRANCH=${1}

rapids-logger "Running Pandas tests using $PANDAS_TESTS_BRANCH branch"
RAPIDS_FULL_VERSION=$(<./VERSION)
rapids-logger "Running Pandas tests using $PANDAS_TESTS_BRANCH branch and rapids-version $RAPIDS_FULL_VERSION"
rapids-logger "PR number: ${RAPIDS_REF_NAME:-"unknown"}"

RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
Expand All @@ -27,9 +27,10 @@ bash python/cudf/cudf/pandas/scripts/run-pandas-tests.sh \
--dist worksteal \
--report-log=${PANDAS_TESTS_BRANCH}.json 2>&1

# Name the summary after the version read from ./VERSION instead of a
# hard-coded release, so it matches the "<branch>-<version>-results.json"
# name that diff.sh looks up and does not need editing at every release.
SUMMARY_FILE_NAME=${PANDAS_TESTS_BRANCH}-${RAPIDS_FULL_VERSION}-results.json
# summarize the results and save them to artifacts:
python python/cudf/cudf/pandas/scripts/summarize-test-results.py --output json pandas-testing/${PANDAS_TESTS_BRANCH}.json > pandas-testing/${PANDAS_TESTS_BRANCH}-results.json
python python/cudf/cudf/pandas/scripts/summarize-test-results.py --output json pandas-testing/${PANDAS_TESTS_BRANCH}.json > pandas-testing/${SUMMARY_FILE_NAME}
RAPIDS_ARTIFACTS_DIR=${RAPIDS_ARTIFACTS_DIR:-"${PWD}/artifacts"}
mkdir -p "${RAPIDS_ARTIFACTS_DIR}"
mv pandas-testing/${PANDAS_TESTS_BRANCH}-results.json ${RAPIDS_ARTIFACTS_DIR}/
rapids-upload-to-s3 ${RAPIDS_ARTIFACTS_DIR}/${PANDAS_TESTS_BRANCH}-results.json "${RAPIDS_ARTIFACTS_DIR}"
mv pandas-testing/${SUMMARY_FILE_NAME} ${RAPIDS_ARTIFACTS_DIR}/
rapids-upload-to-s3 ${RAPIDS_ARTIFACTS_DIR}/${SUMMARY_FILE_NAME} "${RAPIDS_ARTIFACTS_DIR}"
30 changes: 17 additions & 13 deletions cpp/src/io/json/json_column.cu
Original file line number Diff line number Diff line change
Expand Up @@ -76,16 +76,16 @@ void print_tree(host_span<SymbolT const> input,
tree_meta_t const& d_gpu_tree,
rmm::cuda_stream_view stream)
{
print_vec(cudf::detail::make_std_vector_async(d_gpu_tree.node_categories, stream),
print_vec(cudf::detail::make_std_vector_sync(d_gpu_tree.node_categories, stream),
"node_categories",
to_cat);
print_vec(cudf::detail::make_std_vector_async(d_gpu_tree.parent_node_ids, stream),
print_vec(cudf::detail::make_std_vector_sync(d_gpu_tree.parent_node_ids, stream),
"parent_node_ids",
to_int);
print_vec(
cudf::detail::make_std_vector_async(d_gpu_tree.node_levels, stream), "node_levels", to_int);
auto node_range_begin = cudf::detail::make_std_vector_async(d_gpu_tree.node_range_begin, stream);
auto node_range_end = cudf::detail::make_std_vector_async(d_gpu_tree.node_range_end, stream);
cudf::detail::make_std_vector_sync(d_gpu_tree.node_levels, stream), "node_levels", to_int);
auto node_range_begin = cudf::detail::make_std_vector_sync(d_gpu_tree.node_range_begin, stream);
auto node_range_end = cudf::detail::make_std_vector_sync(d_gpu_tree.node_range_end, stream);
print_vec(node_range_begin, "node_range_begin", to_int);
print_vec(node_range_end, "node_range_end", to_int);
for (int i = 0; i < int(node_range_begin.size()); i++) {
Expand Down Expand Up @@ -333,10 +333,11 @@ rmm::device_uvector<NodeIndexT> get_values_column_indices(TreeDepthT const row_a
* @param stream CUDA stream
* @return Vector of strings
*/
std::vector<std::string> copy_strings_to_host(device_span<SymbolT const> input,
device_span<SymbolOffsetT const> node_range_begin,
device_span<SymbolOffsetT const> node_range_end,
rmm::cuda_stream_view stream)
std::vector<std::string> copy_strings_to_host_sync(
device_span<SymbolT const> input,
device_span<SymbolOffsetT const> node_range_begin,
device_span<SymbolOffsetT const> node_range_end,
rmm::cuda_stream_view stream)
{
CUDF_FUNC_RANGE();
auto const num_strings = node_range_begin.size();
Expand Down Expand Up @@ -371,12 +372,13 @@ std::vector<std::string> copy_strings_to_host(device_span<SymbolT const> input,
auto to_host = [stream](auto const& col) {
if (col.is_empty()) return std::vector<std::string>{};
auto const scv = cudf::strings_column_view(col);
auto const h_chars = cudf::detail::make_std_vector_sync<char>(
auto const h_chars = cudf::detail::make_std_vector_async<char>(
cudf::device_span<char const>(scv.chars_begin(stream), scv.chars_size(stream)), stream);
auto const h_offsets = cudf::detail::make_std_vector_sync(
auto const h_offsets = cudf::detail::make_std_vector_async(
cudf::device_span<cudf::size_type const>(scv.offsets().data<cudf::size_type>() + scv.offset(),
scv.size() + 1),
stream);
stream.synchronize();

// build std::string vector from chars and offsets
std::vector<std::string> host_data;
Expand Down Expand Up @@ -528,15 +530,17 @@ void make_device_json_column(device_span<SymbolT const> input,
auto column_range_beg =
cudf::detail::make_std_vector_async(d_column_tree.node_range_begin, stream);
auto max_row_offsets = cudf::detail::make_std_vector_async(d_max_row_offsets, stream);
std::vector<std::string> column_names = copy_strings_to_host(
std::vector<std::string> column_names = copy_strings_to_host_sync(
input, d_column_tree.node_range_begin, d_column_tree.node_range_end, stream);
stream.synchronize();
// array of arrays column names
if (is_array_of_arrays) {
TreeDepthT const row_array_children_level = is_enabled_lines ? 1 : 2;
auto values_column_indices =
get_values_column_indices(row_array_children_level, tree, col_ids, num_columns, stream);
auto h_values_column_indices =
cudf::detail::make_std_vector_async(values_column_indices, stream);
stream.synchronize();
std::transform(unique_col_ids.begin(),
unique_col_ids.end(),
column_names.begin(),
Expand Down Expand Up @@ -609,7 +613,7 @@ void make_device_json_column(device_span<SymbolT const> input,

std::vector<uint8_t> is_str_column_all_nulls{};
if (is_enabled_mixed_types_as_string) {
is_str_column_all_nulls = cudf::detail::make_std_vector_async(
is_str_column_all_nulls = cudf::detail::make_std_vector_sync(
is_all_nulls_each_column(input, d_column_tree, tree, col_ids, options, stream), stream);
}

Expand Down
27 changes: 19 additions & 8 deletions docs/cudf/source/user_guide/pandas-comparison.md
Original file line number Diff line number Diff line change
Expand Up @@ -87,9 +87,17 @@ using `.from_arrow()` or `.from_pandas()`.

## Result ordering

By default, `join` (or `merge`), `value_counts` and `groupby` operations in cuDF
do *not* guarantee output ordering.
Compare the results obtained from Pandas and cuDF below:
In Pandas, `join` (or `merge`), `value_counts` and `groupby` operations provide
certain guarantees about the order of rows in the result returned. In a Pandas
`join`, the order of join keys is (depending on the particular style of join
being performed) either preserved or sorted lexicographically by default.
`groupby` sorts the group keys, and preserves the order of rows within each
group. In some cases, disabling this sorting in Pandas (for example, by
passing `sort=False`) can yield better performance.

By contrast, cuDF's default behavior is to return rows in a
non-deterministic order to maximize performance. Compare the results
obtained from Pandas and cuDF below:

```{code} python
>>> import cupy as cp
Expand All @@ -114,13 +122,16 @@ a
4 342.000000
```

To match Pandas behavior, you must explicitly pass `sort=True`
or enable the `mode.pandas_compatible` option when trying to
match Pandas behavior with `sort=False`:
In most cases, the rows of a DataFrame are accessed by index labels
rather than by position, so the order in which rows are returned
doesn't matter. However, if you require that results be returned in a
predictable (sorted) order, you can pass the `sort=True` option
explicitly or enable the `mode.pandas_compatible` option when trying
to match Pandas behavior with `sort=False`:

```{code} python
>>> df.to_pandas().groupby("a", sort=True).mean().head()
b
>>> df.groupby("a", sort=True).mean().head()
b
a
0 70.000000
1 356.333333
Expand Down
4 changes: 2 additions & 2 deletions python/cudf/cudf/pandas/_wrappers/pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,7 +174,7 @@ def _DataFrame__dir__(self):
"__arrow_array__": arrow_array_method,
"__cuda_array_interface__": cuda_array_interface,
"__iter__": custom_iter,
"dt": _AccessorAttr(DatetimeProperties),
"dt": _AccessorAttr(CombinedDatetimelikeProperties),
"str": _AccessorAttr(StringMethods),
"cat": _AccessorAttr(_CategoricalAccessor),
"_constructor": _FastSlowAttribute("_constructor"),
Expand Down Expand Up @@ -208,7 +208,7 @@ def Index__new__(cls, *args, **kwargs):
"__array_function__": array_function_method,
"__arrow_array__": arrow_array_method,
"__cuda_array_interface__": cuda_array_interface,
"dt": _AccessorAttr(DatetimeProperties),
"dt": _AccessorAttr(CombinedDatetimelikeProperties),
"str": _AccessorAttr(StringMethods),
"cat": _AccessorAttr(_CategoricalAccessor),
"__iter__": custom_iter,
Expand Down

0 comments on commit 66e72ab

Please sign in to comment.