Skip to content

Commit

Permalink
Merge branch 'branch-24.06' of github.com:rapidsai/cudf into cow_and_…
Browse files Browse the repository at this point in the history
…spilling
  • Loading branch information
madsbk committed Apr 17, 2024
2 parents 6fe2d58 + 02f8e2f commit 51b4b82
Show file tree
Hide file tree
Showing 50 changed files with 748 additions and 589 deletions.
1 change: 1 addition & 0 deletions .devcontainer/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -33,3 +33,4 @@ ENV SCCACHE_REGION="us-east-2"
ENV SCCACHE_BUCKET="rapids-sccache-devs"
ENV VAULT_HOST="https://vault.ops.k8s.rapids.ai"
ENV HISTFILE="/home/coder/.cache/._bash_history"
ENV LIBCUDF_KERNEL_CACHE_PATH="/home/coder/cudf/cpp/build/${PYTHON_PACKAGE_MANAGER}/cuda-${CUDA_VERSION}/latest/jitify_cache"
18 changes: 18 additions & 0 deletions .github/workflows/build.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -108,3 +108,21 @@ jobs:
sha: ${{ inputs.sha }}
date: ${{ inputs.date }}
package-name: dask_cudf
trigger-pandas-tests:
if: inputs.build_type == 'nightly'
needs: wheel-build-cudf
runs-on: ubuntu-latest
steps:
- name: Checkout code repo
uses: actions/checkout@v4
with:
ref: ${{ inputs.sha }}
persist-credentials: false
- name: Trigger pandas-tests
env:
GH_TOKEN: ${{ github.token }}
run: |
gh workflow run pandas-tests.yaml \
-f branch=${{ inputs.branch }} \
-f sha=${{ inputs.sha }} \
-f date=${{ inputs.date }}
27 changes: 27 additions & 0 deletions .github/workflows/pandas-tests.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
name: Pandas Test Job

on:
workflow_dispatch:
inputs:
branch:
required: true
type: string
date:
required: true
type: string
sha:
required: true
type: string

jobs:
pandas-tests:
# run the Pandas unit tests
secrets: inherit
uses: rapidsai/shared-workflows/.github/workflows/[email protected]
with:
matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.9" and .CUDA_VER == "12.2.2" ))
build_type: nightly
branch: ${{ inputs.branch }}
date: ${{ inputs.date }}
sha: ${{ inputs.sha }}
script: ci/cudf_pandas_scripts/pandas-tests/run.sh main
4 changes: 2 additions & 2 deletions .github/workflows/pr.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -174,15 +174,15 @@ jobs:
secrets: inherit
uses: rapidsai/shared-workflows/.github/workflows/[email protected]
with:
matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.9" and .CUDA_VER == "12.2.2" ))
build_type: pull-request
script: ci/cudf_pandas_scripts/pandas-tests/run.sh pr
# Hide test failures because they exceed the GITHUB_STEP_SUMMARY output limit.
test_summary_show: "none"
pandas-tests-diff:
# diff the results of running the Pandas unit tests and publish a job summary
needs: pandas-tests
uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@patch-1
uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.06
with:
node_type: cpu4
build_type: pull-request
Expand Down
12 changes: 0 additions & 12 deletions .github/workflows/test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,6 @@ jobs:
container_image: "rapidsai/ci-conda:latest"
run_script: "ci/test_cpp_memcheck.sh"
static-configure:
needs: checks
secrets: inherit
uses: rapidsai/shared-workflows/.github/workflows/[email protected]
with:
Expand Down Expand Up @@ -125,14 +124,3 @@ jobs:
date: ${{ inputs.date }}
sha: ${{ inputs.sha }}
script: ci/cudf_pandas_scripts/run_tests.sh
pandas-tests:
# run the Pandas unit tests
secrets: inherit
uses: rapidsai/shared-workflows/.github/workflows/[email protected]
with:
matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(min_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
build_type: nightly
branch: ${{ inputs.branch }}
date: ${{ inputs.date }}
sha: ${{ inputs.sha }}
script: ci/cudf_pandas_scripts/pandas-tests/run.sh main
2 changes: 1 addition & 1 deletion CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ Instructions for a minimal build environment without conda are included below.
# create the conda environment (assuming in base `cudf` directory)
# note: RAPIDS currently doesn't support `channel_priority: strict`;
# use `channel_priority: flexible` instead
conda env create --name cudf_dev --file conda/environments/all_cuda-118_arch-x86_64.yaml
conda env create --name cudf_dev --file conda/environments/all_cuda-122_arch-x86_64.yaml
# activate the environment
conda activate cudf_dev
```
Expand Down
2 changes: 0 additions & 2 deletions ci/configure_cpp_static.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,6 @@

set -euo pipefail

rapids-configure-conda-channels

source rapids-date-string

rapids-logger "Configure static cpp build"
Expand Down
8 changes: 5 additions & 3 deletions ci/cudf_pandas_scripts/pandas-tests/diff.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,16 @@

# Hard-coded needs to match the version deduced by rapids-upload-artifacts-dir
GH_JOB_NAME="pandas-tests-diff / build"
RAPIDS_FULL_VERSION=$(<./VERSION)
rapids-logger "Github job name: ${GH_JOB_NAME}"
rapids-logger "Rapids version: ${RAPIDS_FULL_VERSION}"

PY_VER="39"
MAIN_ARTIFACT=$(rapids-s3-path)cuda12_$(arch)_py${PY_VER}.main-results.json
PR_ARTIFACT=$(rapids-s3-path)cuda12_$(arch)_py${PY_VER}.pr-results.json
MAIN_ARTIFACT=$(rapids-s3-path)cuda12_$(arch)_py${PY_VER}.main-${RAPIDS_FULL_VERSION}-results.json
PR_ARTIFACT=$(rapids-s3-path)cuda12_$(arch)_py${PY_VER}.pr-${RAPIDS_FULL_VERSION}-results.json

rapids-logger "Fetching latest available results from nightly"
aws s3api list-objects-v2 --bucket rapids-downloads --prefix "nightly/" --query "sort_by(Contents[?ends_with(Key, '_py${PY_VER}.main-results.json')], &LastModified)[::-1].[Key]" --output text > s3_output.txt
aws s3api list-objects-v2 --bucket rapids-downloads --prefix "nightly/" --query "sort_by(Contents[?ends_with(Key, '_py${PY_VER}.main-${RAPIDS_FULL_VERSION}-results.json')], &LastModified)[::-1].[Key]" --output text > s3_output.txt

read -r COMPARE_ENV < s3_output.txt
export COMPARE_ENV
Expand Down
11 changes: 6 additions & 5 deletions ci/cudf_pandas_scripts/pandas-tests/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@
set -euo pipefail

PANDAS_TESTS_BRANCH=${1}

rapids-logger "Running Pandas tests using $PANDAS_TESTS_BRANCH branch"
RAPIDS_FULL_VERSION=$(<./VERSION)
rapids-logger "Running Pandas tests using $PANDAS_TESTS_BRANCH branch and rapids-version $RAPIDS_FULL_VERSION"
rapids-logger "PR number: ${RAPIDS_REF_NAME:-"unknown"}"

RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
Expand All @@ -27,9 +27,10 @@ bash python/cudf/cudf/pandas/scripts/run-pandas-tests.sh \
--dist worksteal \
--report-log=${PANDAS_TESTS_BRANCH}.json 2>&1

SUMMARY_FILE_NAME=${PANDAS_TESTS_BRANCH}-${RAPIDS_FULL_VERSION}-results.json
# summarize the results and save them to artifacts:
python python/cudf/cudf/pandas/scripts/summarize-test-results.py --output json pandas-testing/${PANDAS_TESTS_BRANCH}.json > pandas-testing/${PANDAS_TESTS_BRANCH}-results.json
python python/cudf/cudf/pandas/scripts/summarize-test-results.py --output json pandas-testing/${PANDAS_TESTS_BRANCH}.json > pandas-testing/${SUMMARY_FILE_NAME}
RAPIDS_ARTIFACTS_DIR=${RAPIDS_ARTIFACTS_DIR:-"${PWD}/artifacts"}
mkdir -p "${RAPIDS_ARTIFACTS_DIR}"
mv pandas-testing/${PANDAS_TESTS_BRANCH}-results.json ${RAPIDS_ARTIFACTS_DIR}/
rapids-upload-to-s3 ${RAPIDS_ARTIFACTS_DIR}/${PANDAS_TESTS_BRANCH}-results.json "${RAPIDS_ARTIFACTS_DIR}"
mv pandas-testing/${SUMMARY_FILE_NAME} ${RAPIDS_ARTIFACTS_DIR}/
rapids-upload-to-s3 ${RAPIDS_ARTIFACTS_DIR}/${SUMMARY_FILE_NAME} "${RAPIDS_ARTIFACTS_DIR}"
2 changes: 1 addition & 1 deletion conda/environments/all_cuda-118_arch-x86_64.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ dependencies:
- pandoc
- pip
- pre-commit
- protobuf>=4.21,<5
- protobuf>=3.20,<5
- ptxcompiler
- pyarrow==14.0.2.*
- pydata-sphinx-theme!=0.14.2
Expand Down
2 changes: 1 addition & 1 deletion conda/environments/all_cuda-122_arch-x86_64.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ dependencies:
- pandoc
- pip
- pre-commit
- protobuf>=4.21,<5
- protobuf>=3.20,<5
- pyarrow==14.0.2.*
- pydata-sphinx-theme!=0.14.2
- pynvjitlink
Expand Down
2 changes: 1 addition & 1 deletion conda/recipes/cudf/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ requirements:
{% endif %}
- cuda-version ={{ cuda_version }}
run:
- {{ pin_compatible('protobuf', min_pin='x.x', max_pin='x') }}
- protobuf >=3.20,<5.0a0
- python
- typing_extensions >=4.0.0
- pandas >=2.0,<2.2.2dev0
Expand Down
5 changes: 5 additions & 0 deletions cpp/benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -339,6 +339,11 @@ ConfigureNVBench(JSON_WRITER_NVBENCH io/json/json_writer.cpp)
ConfigureNVBench(MULTIBYTE_SPLIT_NVBENCH io/text/multibyte_split.cpp)
target_link_libraries(MULTIBYTE_SPLIT_NVBENCH PRIVATE ZLIB::ZLIB)

# ##################################################################################################
# * decimal benchmark
# ---------------------------------------------------------------------------------
ConfigureNVBench(DECIMAL_NVBENCH decimal/convert_floating.cpp)

add_custom_target(
run_benchmarks
DEPENDS CUDF_BENCHMARKS
Expand Down
5 changes: 3 additions & 2 deletions cpp/benchmarks/common/generate_input.cu
Original file line number Diff line number Diff line change
Expand Up @@ -324,10 +324,11 @@ struct random_value_fn<T, std::enable_if_t<cudf::is_fixed_point<T>()>> {
distribution_fn<DeviceType> dist;
std::optional<numeric::scale_type> scale;

random_value_fn(distribution_params<DeviceType> const& desc)
random_value_fn(distribution_params<T> const& desc)
: lower_bound{desc.lower_bound},
upper_bound{desc.upper_bound},
dist{make_distribution<DeviceType>(desc.id, desc.lower_bound, desc.upper_bound)}
dist{make_distribution<DeviceType>(desc.id, lower_bound, upper_bound)},
scale{desc.scale}
{
}

Expand Down
42 changes: 36 additions & 6 deletions cpp/benchmarks/common/generate_input.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -182,9 +182,17 @@ struct distribution_params<T, std::enable_if_t<std::is_same_v<T, cudf::struct_vi
cudf::size_type max_depth;
};

// Present for compilation only. To be implemented once reader/writers support the fixed width type.
/**
* @brief Fixed-point values are parameterized with a distribution type, scale, and bounds of the
* same type.
*/
template <typename T>
struct distribution_params<T, std::enable_if_t<cudf::is_fixed_point<T>()>> {};
struct distribution_params<T, std::enable_if_t<cudf::is_fixed_point<T>()>> {
distribution_id id;
typename T::rep lower_bound;
typename T::rep upper_bound;
std::optional<numeric::scale_type> scale;
};

/**
* @brief Returns a vector of types, corresponding to the input type or a type group.
Expand Down Expand Up @@ -226,7 +234,7 @@ class data_profile {
cudf::type_id::INT32, {distribution_id::GEOMETRIC, 0, 64}, 2};
distribution_params<cudf::struct_view> struct_dist_desc{
{cudf::type_id::INT32, cudf::type_id::FLOAT32, cudf::type_id::STRING}, 2};
std::map<cudf::type_id, distribution_params<__uint128_t>> decimal_params;
std::map<cudf::type_id, distribution_params<numeric::decimal128>> decimal_params;

double bool_probability_true = 0.5;
std::optional<double> null_probability = 0.01;
Expand Down Expand Up @@ -300,16 +308,21 @@ class data_profile {
}

template <typename T, std::enable_if_t<cudf::is_fixed_point<T>()>* = nullptr>
distribution_params<typename T::rep> get_distribution_params() const
distribution_params<T> get_distribution_params() const
{
using rep = typename T::rep;
auto it = decimal_params.find(cudf::type_to_id<T>());
if (it == decimal_params.end()) {
auto const range = default_range<rep>();
return distribution_params<rep>{default_distribution_id<rep>(), range.first, range.second};
auto const scale = std::optional<numeric::scale_type>{};
return distribution_params<T>{
default_distribution_id<rep>(), range.first, range.second, scale};
} else {
auto& desc = it->second;
return {desc.id, static_cast<rep>(desc.lower_bound), static_cast<rep>(desc.upper_bound)};
return {desc.id,
static_cast<rep>(desc.lower_bound),
static_cast<rep>(desc.upper_bound),
desc.scale};
}
}

Expand Down Expand Up @@ -359,6 +372,23 @@ class data_profile {
}
}

// Users should pass integral values for bounds when setting the parameters for fixed-point.
// Otherwise the call will have no effect.
template <typename T,
typename Type_enum,
std::enable_if_t<cuda::std::is_integral_v<T>, T>* = nullptr>
void set_distribution_params(Type_enum type_or_group,
distribution_id dist,
T lower_bound,
T upper_bound,
numeric::scale_type scale)
{
for (auto tid : get_type_or_group(static_cast<int32_t>(type_or_group))) {
decimal_params[tid] = {
dist, static_cast<__int128_t>(lower_bound), static_cast<__int128_t>(upper_bound), scale};
}
}

template <typename T, typename Type_enum, std::enable_if_t<cudf::is_chrono<T>(), T>* = nullptr>
void set_distribution_params(Type_enum type_or_group,
distribution_id dist,
Expand Down
Loading

0 comments on commit 51b4b82

Please sign in to comment.