Merge pull request #14372 from bdice/branch-23.12-merge-23.10

Forward-merge branch-23.10 to branch-23.12
rapidsai · Nov 8, 2023 · a35f90c · a35f90c
2 parents d3dcc75 + 8cdedd8
commit a35f90c
Show file tree

Hide file tree

Showing 128 changed files with 9,508 additions and 531 deletions.
diff --git a/.github/ISSUE_TEMPLATE/pandas_function_request.md b/.github/ISSUE_TEMPLATE/pandas_function_request.md
@@ -0,0 +1,22 @@
+---
+name: Request a Missing Pandas Function
+about: Request GPU support for a function executed on the CPU in pandas accelerator mode.
+title: "[FEA]"
+labels: "? - Needs Triage, feature request"
+assignees: ''
+
+---
+
+This issue template is intended to be used primarily for requests related to pandas accelerator mode. If you'd like to file a general cuDF feature request, please [click here](https://github.com/rapidsai/cudf/issues/new?assignees=&labels=%3F+-+Needs+Triage%2C+feature+request&projects=&template=feature_request.md&title=%5BFEA%5D).
+
+
+**Missing Pandas Feature Request**
+A clear and concise summary of the pandas function(s) you'd like to be able to run with cuDF.
+
+
+**Profiler Output**
+If you used the profiler in pandas accelerator mode, please provide the full output of your profiling report.
+
+
+**Additional context**
+Add any other context, code examples, or references to existing implementations about the feature request here.
diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
@@ -25,6 +25,10 @@ jobs:
       - wheel-tests-cudf
       - wheel-build-dask-cudf
       - wheel-tests-dask-cudf
+      - unit-tests-cudf-pandas
+      - pandas-tests
+      #- pandas-tests-diff
+      #- pandas-tests-diff-comment
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/[email protected]
   checks:
@@ -126,3 +130,52 @@ jobs:
       matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.10" and (.CUDA_VER == "11.8.0" or .CUDA_VER == "12.0.1")))
       build_type: pull-request
       script: ci/test_wheel_dask_cudf.sh
+  unit-tests-cudf-pandas:
+    # run the cudf.pandas unit tests once the cudf wheel build has finished
+    needs: wheel-build-cudf
+    secrets: inherit
+    uses: rapidsai/shared-workflows/.github/workflows/[email protected]
+    with:
+      # amd64 + Python 3.10 only, on CUDA 11.8.0 and 12.0.1
+      matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.10" and (.CUDA_VER == "11.8.0" or .CUDA_VER == "12.0.1")))
+      build_type: pull-request
+      script: ci/cudf_pandas_scripts/run_tests.sh
+  pandas-tests:
+    # run the Pandas unit tests using PR branch
+    needs: wheel-build-cudf
+    secrets: inherit
+    uses: rapidsai/shared-workflows/.github/workflows/[email protected]
+    with:
+      # keep a single amd64 matrix entry: the one with the highest CUDA_VER
+      matrix_filter: map(select(.ARCH == "amd64")) | max_by(.CUDA_VER) | [.]
+      build_type: pull-request
+      # `pr` is the label run.sh uses to name its report files
+      script: ci/cudf_pandas_scripts/pandas-tests/run.sh pr
+  #pandas-tests-diff:
+  #  # diff the results of running the Pandas unit tests and publish a job summary
+  #  needs: [pandas-tests-main, pandas-tests-pr]
+  #  secrets: inherit
+  #  # This branch exports a `job_output` output that the downstream job reads.
+  #  uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@wence/fea/custom-job-output
+  #  with:
+  #    node_type: cpu4
+  #    build_type: pull-request
+  #    run_script: ci/cudf_pandas_scripts/pandas-tests/diff.sh
+  #pandas-tests-diff-comment:
+  #  # Post comment of pass/fail rate on PR
+  #  runs-on: ubuntu-latest
+  #  needs: pandas-tests-diff
+  #  steps:
+  #    - uses: actions/github-script@v6
+  #      with:
+  #        script: |
+  #          const branch = process.env.GITHUB_REF_NAME;
+  #          const prBranchPattern = new RegExp("^pull-request/[0-9]+$");
+  #          if (!branch.match(prBranchPattern)) {
+  #            throw new Error(`${branch} does not match PR branch pattern.`);
+  #          }
+  #          const summary_url = `${context.serverUrl}/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}`;
+  #          const prNumber = branch.split("/")[1];
+  #          const summary_comment = `${{ needs.pandas-tests-diff.outputs.job_output }}`;
+  #          github.rest.issues.createComment({
+  #            issue_number: prNumber,
+  #            owner: context.repo.owner,
+  #            repo: context.repo.repo,
+  #            body: `${summary_comment}\n\nHere is [a link to the full test summary](${summary_url}).\n`
+  #          })
diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
@@ -96,3 +96,25 @@ jobs:
       date: ${{ inputs.date }}
       sha: ${{ inputs.sha }}
       script: ci/test_wheel_dask_cudf.sh
+  unit-tests-cudf-pandas:
+    # run the cudf.pandas unit tests for the nightly build
+    # NOTE(review): the sibling pandas-tests job in this workflow declares no
+    # `needs`; confirm that a wheel-build-cudf job is actually defined here.
+    needs: wheel-build-cudf
+    secrets: inherit
+    uses: rapidsai/shared-workflows/.github/workflows/[email protected]
+    with:
+      build_type: nightly
+      branch: ${{ inputs.branch }}
+      date: ${{ inputs.date }}
+      sha: ${{ inputs.sha }}
+      script: ci/cudf_pandas_scripts/run_tests.sh
+  pandas-tests:
+    # run the Pandas unit tests
+    secrets: inherit
+    uses: rapidsai/shared-workflows/.github/workflows/[email protected]
+    with:
+      # keep a single amd64 matrix entry: the one with the highest CUDA_VER
+      matrix_filter: map(select(.ARCH == "amd64")) | max_by(.CUDA_VER) | [.]
+      build_type: nightly
+      branch: ${{ inputs.branch }}
+      date: ${{ inputs.date }}
+      sha: ${{ inputs.sha }}
+      # pr mode uses the HEAD of the branch, which is also correct for nightlies
+      script: ci/cudf_pandas_scripts/pandas-tests/run.sh pr
diff --git a/.gitignore b/.gitignore
@@ -164,7 +164,7 @@ dask-worker-space/
 
 # Sphinx docs & build artifacts
 docs/cudf/source/api_docs/generated/*
-docs/cudf/source/api_docs/api/*
+docs/cudf/source/user_guide/api_docs/api/*
 docs/cudf/source/user_guide/example_output/*
 docs/cudf/source/user_guide/cudf.*Dtype.*.rst
 _html

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -52,6 +52,11 @@ repos:
                 # https://github.com/PyCQA/pydocstyle/issues/603
                 additional_dependencies: [toml]
                 args: ["--config=pyproject.toml"]
+                exclude: |
+                  (?x)^(
+                    ^python/cudf/cudf/pandas/scripts/.*|
+                    ^python/cudf/cudf_pandas_tests/.*
+                  )
       - repo: https://github.com/nbQA-dev/nbQA
         rev: 1.6.3
         hooks:

diff --git a/ci/build_wheel.sh b/ci/build_wheel.sh
@@ -21,7 +21,7 @@ PACKAGE_CUDA_SUFFIX="-${RAPIDS_PY_CUDA_SUFFIX}"
 # Patch project metadata files to include the CUDA version suffix and version override.
 pyproject_file="${package_dir}/pyproject.toml"
 
-sed -i "s/name = \"${package_name}\"/name = \"${package_name}${PACKAGE_CUDA_SUFFIX}\"/g" ${pyproject_file}
+sed -i "s/^name = \"${package_name}\"/name = \"${package_name}${PACKAGE_CUDA_SUFFIX}\"/g" ${pyproject_file}
 echo "${version}" > VERSION
 sed -i "/^__git_commit__/ s/= .*/= \"${commit}\"/g" "${package_dir}/${package_name}/_version.py"
 

diff --git a/ci/build_wheel_cudf.sh b/ci/build_wheel_cudf.sh
@@ -11,5 +11,6 @@ export SKBUILD_CONFIGURE_OPTIONS="-DCUDF_BUILD_WHEELS=ON -DDETECT_CONDA_ENV=OFF"
 
 python -m auditwheel repair -w ${package_dir}/final_dist ${package_dir}/dist/*
 
+
 RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
 RAPIDS_PY_WHEEL_NAME="cudf_${AUDITWHEEL_POLICY}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 ${package_dir}/final_dist
diff --git a/ci/cudf_pandas_scripts/pandas-tests/diff.sh b/ci/cudf_pandas_scripts/pandas-tests/diff.sh
@@ -0,0 +1,24 @@
+#!/usr/bin/env bash
+# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+# Download the summarized results of running the Pandas tests on both the main
+# branch and the PR branch, diff them, and publish the result as the job
+# summary plus a one-line `job_output` value.
+
+# Abort on the first failing command (also inside pipelines) and on unset
+# variables, so that a missing artifact or a crashed summary script fails the
+# job instead of being masked by the final `echo` succeeding.
+set -euo pipefail
+
+# Hard-coded needs to match the version deduced by rapids-upload-artifacts-dir
+MAIN_ARTIFACT="$(rapids-s3-path)cuda12_$(arch)_py310.main-results.json"
+PR_ARTIFACT="$(rapids-s3-path)cuda12_$(arch)_py310.pr-results.json"
+aws s3 cp "${MAIN_ARTIFACT}" main-results.json
+aws s3 cp "${PR_ARTIFACT}" pr-results.json
+
+# Compute the diff and prepare job summary:
+python -m pip install pandas tabulate
+python ci/cudf_pandas_scripts/pandas-tests/job-summary.py main-results.json pr-results.json | tee summary.txt >> "${GITHUB_STEP_SUMMARY}"
+
+# job-summary.py prints its one-sentence headline first; reuse it as the comment.
+COMMENT="$(head -1 summary.txt)"
+
+echo "${COMMENT}"
+
+# Magic name that the custom-job.yaml workflow reads and re-exports
+echo "job_output=${COMMENT}" >> "${GITHUB_OUTPUT}"
diff --git a/ci/cudf_pandas_scripts/pandas-tests/job-summary.py b/ci/cudf_pandas_scripts/pandas-tests/job-summary.py
@@ -0,0 +1,100 @@
+# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+import json
+import sys
+
+import pandas as pd
+
+
+def get_total_and_passed(results):
+    """Sum per-module outcome counts into overall totals.
+
+    ``results`` maps a module name to a dict of outcome counts; a missing
+    "failed", "errored" or "passed" key counts as 0.  Returns the tuple
+    ``(total_tests, total_passed)``, where the total is the sum of those
+    three outcomes.
+    """
+    tallies = {"failed": 0, "errored": 0, "passed": 0}
+    for row in results.values():
+        for outcome in tallies:
+            tallies[outcome] += row.get(outcome, 0)
+    total_tests = tallies["failed"] + tallies["errored"] + tallies["passed"]
+    return total_tests, tallies["passed"]
+
+
+# Usage: job-summary.py <main-results.json> <pr-results.json>
+main_json = sys.argv[1]
+pr_json = sys.argv[2]
+
+# read the per-module JSON results written by summarize-test-results.py
+with open(main_json) as f:
+    main_results = json.load(f)
+main_total, main_passed = get_total_and_passed(main_results)
+
+with open(pr_json) as f:
+    pr_results = json.load(f)
+pr_total, pr_passed = get_total_and_passed(pr_results)
+
+# Guard both divisions: an empty or all-failing results file would otherwise
+# raise ZeroDivisionError before any summary is printed.
+passing_percentage = pr_passed / pr_total * 100 if pr_total else 0.0
+passed_delta = abs(pr_passed - main_passed)
+if main_passed:
+    pass_rate_change = passed_delta / main_passed * 100
+else:
+    # Nothing passed on trunk: the relative change is unbounded unless the PR
+    # passes nothing either.
+    pass_rate_change = float("inf") if passed_delta else 0.0
+rate_change_type = "a decrease" if pr_passed < main_passed else "an increase"
+
+# One-sentence headline; it is printed first, ahead of the per-module table.
+comment = (
+    "Merging this PR would result in "
+    f"{pr_passed}/{pr_total} ({passing_percentage:.2f}%) "
+    "Pandas tests passing, "
+    f"{rate_change_type} in the test pass rate by "
+    f"{pass_rate_change:.2f}%. "
+    f"Trunk stats: {main_passed}/{main_total}."
+)
+
+
+def emoji_passed(x):
+    """Render a passed-count delta: a gain gets ✅, a loss gets ❌.
+
+    Any value that is neither greater nor less than zero is rendered bare.
+    """
+    marker = ""
+    if x > 0:
+        marker = "✅"
+    elif x < 0:
+        marker = "❌"
+    return f"{x}{marker}"
+
+
+def emoji_failed(x):
+    """Render a failed-count delta: an increase gets ❌, a decrease gets ✅.
+
+    Any value that is neither greater nor less than zero is rendered bare.
+    """
+    marker = ""
+    if x > 0:
+        marker = "❌"
+    elif x < 0:
+        marker = "✅"
+    return f"{x}{marker}"
+
+
+# Convert both result dicts to DataFrames with one row per test module, sorted
+# by module name, and subtract them to get the per-module change (PR - main).
+pr_df = pd.DataFrame.from_dict(pr_results, orient="index").sort_index()
+main_df = pd.DataFrame.from_dict(main_results, orient="index").sort_index()
+diff_df = pr_df - main_df
+
+# Keep only the reported count columns, and suffix the delta columns with
+# "_diff" so they do not clash with the PR columns in the concat below.
+pr_df = pr_df[["total", "passed", "failed", "skipped"]]
+diff_df = diff_df[["total", "passed", "failed", "skipped"]]
+diff_df.columns = diff_df.columns + "_diff"
+# Decorate the deltas: more passes is good, more failures is bad. Skipped
+# deltas reuse emoji_failed, i.e. an increase in skips is marked as bad.
+diff_df["passed_diff"] = diff_df["passed_diff"].map(emoji_passed)
+diff_df["failed_diff"] = diff_df["failed_diff"].map(emoji_failed)
+diff_df["skipped_diff"] = diff_df["skipped_diff"].map(emoji_failed)
+
+# Put the PR counts and their deltas side by side, one row per module.
+df = pd.concat([pr_df, diff_df], axis=1)
+df = df.rename_axis("Test module")
+
+# Human-readable column headers for the published table.
+df = df.rename(
+    columns={
+        "total": "Total tests",
+        "passed": "Passed tests",
+        "failed": "Failed tests",
+        "skipped": "Skipped tests",
+        "total_diff": "Total delta",
+        "passed_diff": "Passed delta",
+        "failed_diff": "Failed delta",
+        "skipped_diff": "Skipped delta",
+    }
+)
+# Modules with the most failures (then the most skips) come first.
+df = df.sort_values(by=["Failed tests", "Skipped tests"], ascending=False)
+
+# Emit the headline sentence first, then the markdown table.
+print(comment)
+print()
+print("Here are the results of running the Pandas tests against this PR:")
+print()
+print(df.to_markdown())
diff --git a/ci/cudf_pandas_scripts/pandas-tests/run.sh b/ci/cudf_pandas_scripts/pandas-tests/run.sh
@@ -0,0 +1,42 @@
+#!/usr/bin/env bash
+# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+# Install the cudf wheel built by CI, run the Pandas test suite through
+# run-pandas-tests.sh, and store a JSON summary of the results as an artifact.
+#
+# Usage: run.sh <label>
+#   <label> is used in the log message and to name the report files
+#   (<label>.json and <label>-results.json).
+#
+# NOTE(review): this script never enables `set -e`, presumably so that the
+# summary is still produced when some Pandas tests fail - confirm.
+
+PANDAS_TESTS_BRANCH=${1}
+
+rapids-logger "Running Pandas tests using $PANDAS_TESTS_BRANCH branch"
+# The message says "PR number" but the value logged is RAPIDS_REF_NAME.
+rapids-logger "PR number: $RAPIDS_REF_NAME"
+
+# Set the manylinux version used for downloading the wheels so that we test the
+# newer ABI wheels on the newer images that support their installation.
+# Need to disable pipefail for the head not to fail, see
+# https://stackoverflow.com/questions/19120263/why-exit-code-141-with-grep-q
+set +o pipefail
+# Take the last "X.Y" match on the first line of `ldd --version` and keep Y.
+glibc_minor_version=$(ldd --version | head -1 | grep -o "[0-9]\.[0-9]\+" | tail -1 | cut -d '.' -f2)
+set -o pipefail
+# Default to the 2_17 wheels; use the 2_28 wheels when glibc's minor is >= 28.
+manylinux_version="2_17"
+if [[ ${glibc_minor_version} -ge 28 ]]; then
+    manylinux_version="2_28"
+fi
+manylinux="manylinux_${manylinux_version}"
+
+# Download the matching cudf wheel into ./local-cudf-dep and install it with
+# the `test` and `pandas_tests` extras.
+RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
+RAPIDS_PY_WHEEL_NAME="cudf_${manylinux}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-cudf-dep
+python -m pip install $(ls ./local-cudf-dep/cudf*.whl)[test,pandas_tests]
+
+# NOTE(review): COMMIT is not set anywhere in this script; if the environment
+# does not provide it, this expands to a bare `git checkout` - confirm intent.
+git checkout $COMMIT
+
+# Run the Pandas test suite; stderr is merged into stdout.
+bash python/cudf/cudf/pandas/scripts/run-pandas-tests.sh \
+  -n 10 \
+  --tb=line \
+  --skip-slow \
+  --max-worker-restart=3 \
+  --import-mode=importlib \
+  --report-log=${PANDAS_TESTS_BRANCH}.json 2>&1
+
+# summarize the results and save them to artifacts:
+python python/cudf/cudf/pandas/scripts/summarize-test-results.py --output json pandas-testing/${PANDAS_TESTS_BRANCH}.json > pandas-testing/${PANDAS_TESTS_BRANCH}-results.json
+# Fall back to ./artifacts when RAPIDS_ARTIFACTS_DIR is unset or empty.
+RAPIDS_ARTIFACTS_DIR=${RAPIDS_ARTIFACTS_DIR:-"${PWD}/artifacts"}
+mkdir -p "${RAPIDS_ARTIFACTS_DIR}"
+mv pandas-testing/${PANDAS_TESTS_BRANCH}-results.json ${RAPIDS_ARTIFACTS_DIR}/
diff --git a/ci/cudf_pandas_scripts/run_tests.sh b/ci/cudf_pandas_scripts/run_tests.sh
@@ -0,0 +1,52 @@
+#!/bin/bash
+# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+# Run the cudf.pandas unit tests. Unless --no-cudf is given, first download and
+# install the cudf wheel built by CI.
+
+# -e: exit on error, -x: trace commands, -u: unset variables are errors,
+# -o pipefail: a pipeline fails if any command in it fails.
+set -eoxu pipefail
+
+# Function to display script usage
+function display_usage {
+    echo "Usage: $0 [--no-cudf]"
+}
+
+# Default value for the --no-cudf option
+no_cudf=false
+
+# Parse command-line arguments; any unrecognized argument prints the usage
+# and exits with status 1.
+while [[ $# -gt 0 ]]; do
+    case "$1" in
+        --no-cudf)
+            no_cudf=true
+            shift
+            ;;
+        *)
+            echo "Error: Unknown option $1"
+            display_usage
+            exit 1
+            ;;
+    esac
+done
+
+if [ "$no_cudf" = true ]; then
+    echo "Skipping cudf install"
+else
+    # Set the manylinux version used for downloading the wheels so that we test the
+    # newer ABI wheels on the newer images that support their installation.
+    # Need to disable pipefail for the head not to fail, see
+    # https://stackoverflow.com/questions/19120263/why-exit-code-141-with-grep-q
+    set +o pipefail
+    # Take the last "X.Y" match on the first line of `ldd --version` and keep Y.
+    glibc_minor_version=$(ldd --version | head -1 | grep -o "[0-9]\.[0-9]\+" | tail -1 | cut -d '.' -f2)
+    set -o pipefail
+    # Default to the 2_17 wheels; use the 2_28 wheels when glibc's minor is >= 28.
+    manylinux_version="2_17"
+    if [[ ${glibc_minor_version} -ge 28 ]]; then
+        manylinux_version="2_28"
+    fi
+    manylinux="manylinux_${manylinux_version}"
+
+    # Download the matching cudf wheel into ./local-cudf-dep and install it
+    # with the `test` and `cudf_pandas_tests` extras.
+    RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
+    RAPIDS_PY_WHEEL_NAME="cudf_${manylinux}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-cudf-dep
+    python -m pip install $(ls ./local-cudf-dep/cudf*.whl)[test,cudf_pandas_tests]
+fi
+
+# Run the test directory with cudf.pandas loaded as a pytest plugin (-p).
+python -m pytest -p cudf.pandas ./python/cudf/cudf_pandas_tests/
diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -80,6 +80,7 @@ dependencies:
 - python-snappy>=0.6.0
 - python>=3.9,<3.11
 - pytorch<1.12.0
+- rich
 - rmm==23.12.*
 - s3fs>=2022.3.0
 - scikit-build>=0.13.1

diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml
@@ -78,6 +78,7 @@ dependencies:
 - python-snappy>=0.6.0
 - python>=3.9,<3.11
 - pytorch<1.12.0
+- rich
 - rmm==23.12.*
 - s3fs>=2022.3.0
 - scikit-build>=0.13.1

diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml
@@ -103,6 +103,7 @@ requirements:
     - nvtx >=0.2.1
     - packaging
     - cachetools
+    - rich
 
 test:
   requires:
Original file line number	Diff line number	Diff line change
Expand Up		@@ -11,5 +11,6 @@ export SKBUILD_CONFIGURE_OPTIONS="-DCUDF_BUILD_WHEELS=ON -DDETECT_CONDA_ENV=OFF"

		python -m auditwheel repair -w ${package_dir}/final_dist ${package_dir}/dist/*


		RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
		RAPIDS_PY_WHEEL_NAME="cudf_${AUDITWHEEL_POLICY}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 ${package_dir}/final_dist