diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index d6d3e3fdd33..b5d17022a3a 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -28,7 +28,7 @@ concurrency: jobs: cpp-build: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -37,7 +37,7 @@ jobs: python-build: needs: [cpp-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -46,7 +46,7 @@ jobs: upload-conda: needs: [cpp-build, python-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -57,7 +57,7 @@ jobs: if: github.ref_type == 'branch' needs: python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 with: arch: "amd64" branch: ${{ inputs.branch }} @@ -69,7 +69,7 @@ jobs: sha: ${{ inputs.sha }} wheel-build-libcudf: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 with: # build for every combination of arch and CUDA version, but only for the latest Python matrix_filter: group_by([.ARCH, (.CUDA_VER|split(".")|map(tonumber)|.[0])]) | map(max_by(.PY_VER|split(".")|map(tonumber))) @@ -81,7 +81,7 @@ jobs: wheel-publish-libcudf: needs: wheel-build-libcudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -92,7 +92,7 @@ jobs: wheel-build-pylibcudf: needs: [wheel-publish-libcudf] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -102,7 +102,7 @@ jobs: wheel-publish-pylibcudf: needs: wheel-build-pylibcudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -113,7 +113,7 @@ jobs: wheel-build-cudf: needs: wheel-publish-pylibcudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -123,7 +123,7 @@ jobs: wheel-publish-cudf: needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -134,7 +134,7 @@ jobs: wheel-build-dask-cudf: needs: wheel-publish-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -146,7 +146,7 @@ jobs: wheel-publish-dask-cudf: needs: wheel-build-dask-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -157,7 +157,7 @@ jobs: wheel-build-cudf-polars: needs: wheel-publish-pylibcudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -169,7 +169,7 @@ jobs: wheel-publish-cudf-polars: needs: wheel-build-cudf-polars secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/pandas-tests.yaml b/.github/workflows/pandas-tests.yaml index d670132cca9..10c803f7921 100644 --- a/.github/workflows/pandas-tests.yaml +++ b/.github/workflows/pandas-tests.yaml @@ -17,7 +17,7 @@ jobs: pandas-tests: # run the Pandas unit tests secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index a4a8f036174..b515dbff9f3 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -37,7 +37,7 @@ jobs: - pandas-tests - pandas-tests-diff secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.10 if: always() with: needs: ${{ toJSON(needs) }} @@ -52,7 +52,7 @@ jobs: steps: - name: Get PR info id: get-pr-info - uses: rapidsai/shared-actions/get-pr-info@main + uses: nv-gha-runners/get-pr-info@main - name: Checkout code repo uses: actions/checkout@v4 with: @@ -104,39 +104,39 @@ jobs: - '!notebooks/**' checks: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.10 with: enable_check_generated_files: false conda-cpp-build: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.10 with: build_type: pull-request conda-cpp-checks: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.10 with: build_type: pull-request enable_check_symbols: true conda-cpp-tests: needs: [conda-cpp-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.10 if: needs.changed-files.outputs.test_cpp == 'true' with: build_type: pull-request conda-python-build: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.10 with: build_type: pull-request conda-python-cudf-tests: needs: [conda-python-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.10 if: needs.changed-files.outputs.test_python == 'true' with: build_type: pull-request @@ -145,7 +145,7 @@ jobs: # Tests for dask_cudf, custreamz, cudf_kafka are separated for CI parallelism needs: [conda-python-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.10 if: needs.changed-files.outputs.test_python == 'true' with: build_type: pull-request @@ -153,7 +153,7 @@ jobs: conda-java-tests: needs: [conda-cpp-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 if: needs.changed-files.outputs.test_java == 'true' with: build_type: pull-request @@ -164,7 +164,7 @@ jobs: static-configure: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 with: build_type: pull-request # Use the wheel container so we can skip conda solves and since our @@ -174,7 +174,7 @@ jobs: conda-notebook-tests: needs: [conda-python-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 if: needs.changed-files.outputs.test_notebooks == 'true' with: build_type: pull-request @@ -185,7 +185,7 @@ jobs: docs-build: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -195,7 +195,7 @@ jobs: wheel-build-libcudf: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 with: # build for every combination of arch and CUDA version, but only for the latest Python matrix_filter: group_by([.ARCH, (.CUDA_VER|split(".")|map(tonumber)|.[0])]) | map(max_by(.PY_VER|split(".")|map(tonumber))) @@ -204,21 +204,21 @@ jobs: wheel-build-pylibcudf: needs: [checks, wheel-build-libcudf] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 with: build_type: pull-request script: "ci/build_wheel_pylibcudf.sh" wheel-build-cudf: needs: wheel-build-pylibcudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 with: build_type: pull-request script: "ci/build_wheel_cudf.sh" wheel-tests-cudf: needs: [wheel-build-cudf, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 if: needs.changed-files.outputs.test_python == 'true' with: build_type: pull-request @@ -226,7 +226,7 @@ jobs: wheel-build-cudf-polars: needs: wheel-build-pylibcudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -235,7 +235,7 @@ jobs: wheel-tests-cudf-polars: needs: [wheel-build-cudf-polars, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 if: needs.changed-files.outputs.test_python == 'true' with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". @@ -247,7 +247,7 @@ jobs: wheel-build-dask-cudf: needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -256,7 +256,7 @@ jobs: wheel-tests-dask-cudf: needs: [wheel-build-dask-cudf, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 if: needs.changed-files.outputs.test_python == 'true' with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". @@ -265,7 +265,7 @@ jobs: script: ci/test_wheel_dask_cudf.sh devcontainer: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-24.10 with: arch: '["amd64"]' cuda: '["12.5"]' @@ -276,7 +276,7 @@ jobs: unit-tests-cudf-pandas: needs: [wheel-build-cudf, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 if: needs.changed-files.outputs.test_python == 'true' with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". @@ -287,7 +287,7 @@ jobs: # run the Pandas unit tests using PR branch needs: [wheel-build-cudf, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 if: needs.changed-files.outputs.test_python == 'true' with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". @@ -299,7 +299,7 @@ jobs: pandas-tests-diff: # diff the results of running the Pandas unit tests and publish a job summary needs: pandas-tests - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 with: node_type: cpu4 build_type: pull-request diff --git a/.github/workflows/pr_issue_status_automation.yml b/.github/workflows/pr_issue_status_automation.yml index fe77ad4b6b2..45e5191eb54 100644 --- a/.github/workflows/pr_issue_status_automation.yml +++ b/.github/workflows/pr_issue_status_automation.yml @@ -23,7 +23,7 @@ on: jobs: get-project-id: - uses: rapidsai/shared-workflows/.github/workflows/project-get-item-id.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/project-get-item-id.yaml@branch-24.10 if: github.event.pull_request.state == 'open' secrets: inherit permissions: @@ -34,7 +34,7 @@ jobs: update-status: # This job sets the PR and its linked issues to "In Progress" status - uses: rapidsai/shared-workflows/.github/workflows/project-get-set-single-select-field.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/project-get-set-single-select-field.yaml@branch-24.10 if: ${{ github.event.pull_request.state == 'open' && needs.get-project-id.outputs.ITEM_PROJECT_ID != '' }} needs: get-project-id with: @@ -50,7 +50,7 @@ jobs: update-sprint: # This job sets the PR and its linked issues to the current "Weekly Sprint" - uses: rapidsai/shared-workflows/.github/workflows/project-get-set-iteration-field.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/project-get-set-iteration-field.yaml@branch-24.10 if: ${{ github.event.pull_request.state == 'open' && needs.get-project-id.outputs.ITEM_PROJECT_ID != '' }} needs: get-project-id with: diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 4af6a0d690d..8605fa46f68 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -16,7 +16,7 @@ on: jobs: conda-cpp-checks: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -25,7 +25,7 @@ jobs: enable_check_symbols: true conda-cpp-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -33,7 +33,7 @@ jobs: sha: ${{ inputs.sha }} conda-cpp-memcheck-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -45,7 +45,7 @@ jobs: run_script: "ci/test_cpp_memcheck.sh" static-configure: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 with: build_type: pull-request # Use the wheel container so we can skip conda solves and since our @@ -54,7 +54,7 @@ jobs: run_script: "ci/configure_cpp_static.sh" conda-python-cudf-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -64,7 +64,7 @@ jobs: conda-python-other-tests: # Tests for dask_cudf, custreamz, cudf_kafka are separated for CI parallelism secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -73,7 +73,7 @@ jobs: script: "ci/test_python_other.sh" conda-java-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -85,7 +85,7 @@ jobs: run_script: "ci/test_java.sh" conda-notebook-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -97,7 +97,7 @@ jobs: run_script: "ci/test_notebooks.sh" wheel-tests-cudf: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -106,7 +106,7 @@ jobs: script: ci/test_wheel_cudf.sh wheel-tests-dask-cudf: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -117,7 +117,7 @@ jobs: script: ci/test_wheel_dask_cudf.sh unit-tests-cudf-pandas: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -126,7 +126,7 @@ jobs: script: ci/cudf_pandas_scripts/run_tests.sh third-party-integration-tests-cudf-pandas: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 with: build_type: nightly branch: ${{ inputs.branch }} diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 3bf9d02b384..6c5f4a68a4c 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -337,7 +337,7 @@ ConfigureBench(TEXT_BENCH text/ngrams.cpp text/subword.cpp) ConfigureNVBench( TEXT_NVBENCH text/edit_distance.cpp text/hash_ngrams.cpp text/jaccard.cpp text/minhash.cpp - text/normalize.cpp text/replace.cpp text/tokenize.cpp text/vocab.cpp + text/normalize.cpp text/replace.cpp text/tokenize.cpp text/vocab.cpp text/word_minhash.cpp ) # ################################################################################################## diff --git a/cpp/benchmarks/io/parquet/parquet_reader_input.cpp b/cpp/benchmarks/io/parquet/parquet_reader_input.cpp index 7563c823454..ce115fd7723 100644 --- a/cpp/benchmarks/io/parquet/parquet_reader_input.cpp +++ b/cpp/benchmarks/io/parquet/parquet_reader_input.cpp @@ -32,7 +32,8 @@ constexpr cudf::size_type num_cols = 64; void parquet_read_common(cudf::size_type num_rows_to_read, cudf::size_type num_cols_to_read, cuio_source_sink_pair& source_sink, - nvbench::state& state) + nvbench::state& state, + size_t table_data_size = data_size) { cudf::io::parquet_reader_options read_opts = cudf::io::parquet_reader_options::builder(source_sink.make_source_info()); @@ -52,7 +53,7 @@ void parquet_read_common(cudf::size_type num_rows_to_read, }); auto const time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value"); - state.add_element_count(static_cast(data_size) / time, "bytes_per_second"); + state.add_element_count(static_cast(table_data_size) / time, "bytes_per_second"); state.add_buffer_size( mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage"); state.add_buffer_size(source_sink.size(), "encoded_file_size", "encoded_file_size"); @@ -231,6 +232,70 @@ void BM_parquet_read_chunks(nvbench::state& state, nvbench::type_list +void BM_parquet_read_wide_tables(nvbench::state& state, + nvbench::type_list> type_list) +{ + auto const d_type = get_type_or_group(static_cast(DataType)); + + auto const n_col = static_cast(state.get_int64("num_cols")); + auto const data_size_bytes = static_cast(state.get_int64("data_size_mb") << 20); + auto const cardinality = static_cast(state.get_int64("cardinality")); + auto const run_length = static_cast(state.get_int64("run_length")); + auto const source_type = io_type::DEVICE_BUFFER; + cuio_source_sink_pair source_sink(source_type); + + auto const num_rows_written = [&]() { + auto const tbl = create_random_table( + cycle_dtypes(d_type, n_col), + table_size_bytes{data_size_bytes}, + data_profile_builder().cardinality(cardinality).avg_run_length(run_length)); + auto const view = tbl->view(); + + cudf::io::parquet_writer_options write_opts = + cudf::io::parquet_writer_options::builder(source_sink.make_sink_info(), view) + .compression(cudf::io::compression_type::NONE); + cudf::io::write_parquet(write_opts); + return view.num_rows(); + }(); + + parquet_read_common(num_rows_written, n_col, source_sink, state, data_size_bytes); +} + +void BM_parquet_read_wide_tables_mixed(nvbench::state& state) +{ + auto const d_type = []() { + auto d_type1 = get_type_or_group(static_cast(data_type::INTEGRAL)); + auto d_type2 = get_type_or_group(static_cast(data_type::FLOAT)); + d_type1.reserve(d_type1.size() + d_type2.size()); + std::move(d_type2.begin(), d_type2.end(), std::back_inserter(d_type1)); + return d_type1; + }(); + + auto const n_col = static_cast(state.get_int64("num_cols")); + auto const data_size_bytes = static_cast(state.get_int64("data_size_mb") << 20); + auto const cardinality = static_cast(state.get_int64("cardinality")); + auto const run_length = static_cast(state.get_int64("run_length")); + auto const source_type = io_type::DEVICE_BUFFER; + cuio_source_sink_pair source_sink(source_type); + + auto const num_rows_written = [&]() { + auto const tbl = create_random_table( + cycle_dtypes(d_type, n_col), + table_size_bytes{data_size_bytes}, + data_profile_builder().cardinality(cardinality).avg_run_length(run_length)); + auto const view = tbl->view(); + + cudf::io::parquet_writer_options write_opts = + cudf::io::parquet_writer_options::builder(source_sink.make_sink_info(), view) + .compression(cudf::io::compression_type::NONE); + cudf::io::write_parquet(write_opts); + return view.num_rows(); + }(); + + parquet_read_common(num_rows_written, n_col, source_sink, state, data_size_bytes); +} + using d_type_list = nvbench::enum_type_list; +NVBENCH_BENCH_TYPES(BM_parquet_read_wide_tables, NVBENCH_TYPE_AXES(d_type_list_wide_table)) + .set_name("parquet_read_wide_tables") + .set_min_samples(4) + .set_type_axes_names({"data_type"}) + .add_int64_axis("data_size_mb", {1024, 2048, 4096}) + .add_int64_axis("num_cols", {256, 512, 1024}) + .add_int64_axis("cardinality", {0, 1000}) + .add_int64_axis("run_length", {1, 32}); + +NVBENCH_BENCH(BM_parquet_read_wide_tables_mixed) + .set_name("parquet_read_wide_tables_mixed") + .set_min_samples(4) + .add_int64_axis("data_size_mb", {1024, 2048, 4096}) + .add_int64_axis("num_cols", {256, 512, 1024}) + .add_int64_axis("cardinality", {0, 1000}) + .add_int64_axis("run_length", {1, 32}); + // a benchmark for structs that only contain fixed-width types using d_type_list_struct_only = nvbench::enum_type_list; NVBENCH_BENCH_TYPES(BM_parquet_read_fixed_width_struct, NVBENCH_TYPE_AXES(d_type_list_struct_only)) diff --git a/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp b/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp index 3abd4280081..7121cb9f034 100644 --- a/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp +++ b/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp @@ -50,7 +50,7 @@ std::string get_label(std::string const& test_name, nvbench::state const& state) } std::tuple, size_t, size_t> write_file_data( - nvbench::state& state, std::vector const& d_types) + nvbench::state& state, std::vector const& d_types, io_type io_source_type) { cudf::size_type const cardinality = state.get_int64("cardinality"); cudf::size_type const run_length = state.get_int64("run_length"); @@ -63,7 +63,7 @@ std::tuple, size_t, size_t> write_file_data( size_t total_file_size = 0; for (size_t i = 0; i < num_files; ++i) { - cuio_source_sink_pair source_sink{io_type::HOST_BUFFER}; + cuio_source_sink_pair source_sink{io_source_type}; auto const tbl = create_random_table( cycle_dtypes(d_types, num_cols), @@ -92,11 +92,13 @@ void BM_parquet_multithreaded_read_common(nvbench::state& state, { size_t const data_size = state.get_int64("total_data_size"); auto const num_threads = state.get_int64("num_threads"); + auto const source_type = retrieve_io_type_enum(state.get_string("io_type")); auto streams = cudf::detail::fork_streams(cudf::get_default_stream(), num_threads); BS::thread_pool threads(num_threads); - auto [source_sink_vector, total_file_size, num_files] = write_file_data(state, d_types); + auto [source_sink_vector, total_file_size, num_files] = + write_file_data(state, d_types, source_type); std::vector source_info_vector; std::transform(source_sink_vector.begin(), source_sink_vector.end(), @@ -173,10 +175,12 @@ void BM_parquet_multithreaded_read_chunked_common(nvbench::state& state, auto const num_threads = state.get_int64("num_threads"); size_t const input_limit = state.get_int64("input_limit"); size_t const output_limit = state.get_int64("output_limit"); + auto const source_type = retrieve_io_type_enum(state.get_string("io_type")); auto streams = cudf::detail::fork_streams(cudf::get_default_stream(), num_threads); BS::thread_pool threads(num_threads); - auto [source_sink_vector, total_file_size, num_files] = write_file_data(state, d_types); + auto [source_sink_vector, total_file_size, num_files] = + write_file_data(state, d_types, source_type); std::vector source_info_vector; std::transform(source_sink_vector.begin(), source_sink_vector.end(), @@ -264,7 +268,8 @@ NVBENCH_BENCH(BM_parquet_multithreaded_read_mixed) .add_int64_axis("total_data_size", {512 * 1024 * 1024, 1024 * 1024 * 1024}) .add_int64_axis("num_threads", {1, 2, 4, 8}) .add_int64_axis("num_cols", {4}) - .add_int64_axis("run_length", {8}); + .add_int64_axis("run_length", {8}) + .add_string_axis("io_type", {"PINNED_BUFFER"}); NVBENCH_BENCH(BM_parquet_multithreaded_read_fixed_width) .set_name("parquet_multithreaded_read_decode_fixed_width") @@ -273,7 +278,8 @@ NVBENCH_BENCH(BM_parquet_multithreaded_read_fixed_width) .add_int64_axis("total_data_size", {512 * 1024 * 1024, 1024 * 1024 * 1024}) .add_int64_axis("num_threads", {1, 2, 4, 8}) .add_int64_axis("num_cols", {4}) - .add_int64_axis("run_length", {8}); + .add_int64_axis("run_length", {8}) + .add_string_axis("io_type", {"PINNED_BUFFER"}); NVBENCH_BENCH(BM_parquet_multithreaded_read_string) .set_name("parquet_multithreaded_read_decode_string") @@ -282,7 +288,8 @@ NVBENCH_BENCH(BM_parquet_multithreaded_read_string) .add_int64_axis("total_data_size", {512 * 1024 * 1024, 1024 * 1024 * 1024}) .add_int64_axis("num_threads", {1, 2, 4, 8}) .add_int64_axis("num_cols", {4}) - .add_int64_axis("run_length", {8}); + .add_int64_axis("run_length", {8}) + .add_string_axis("io_type", {"PINNED_BUFFER"}); NVBENCH_BENCH(BM_parquet_multithreaded_read_list) .set_name("parquet_multithreaded_read_decode_list") @@ -291,7 +298,8 @@ NVBENCH_BENCH(BM_parquet_multithreaded_read_list) .add_int64_axis("total_data_size", {512 * 1024 * 1024, 1024 * 1024 * 1024}) .add_int64_axis("num_threads", {1, 2, 4, 8}) .add_int64_axis("num_cols", {4}) - .add_int64_axis("run_length", {8}); + .add_int64_axis("run_length", {8}) + .add_string_axis("io_type", {"PINNED_BUFFER"}); // mixed data types: fixed width, strings NVBENCH_BENCH(BM_parquet_multithreaded_read_chunked_mixed) @@ -303,7 +311,8 @@ NVBENCH_BENCH(BM_parquet_multithreaded_read_chunked_mixed) .add_int64_axis("num_cols", {4}) .add_int64_axis("run_length", {8}) .add_int64_axis("input_limit", {640 * 1024 * 1024}) - .add_int64_axis("output_limit", {640 * 1024 * 1024}); + .add_int64_axis("output_limit", {640 * 1024 * 1024}) + .add_string_axis("io_type", {"PINNED_BUFFER"}); NVBENCH_BENCH(BM_parquet_multithreaded_read_chunked_fixed_width) .set_name("parquet_multithreaded_read_decode_chunked_fixed_width") @@ -314,7 +323,8 @@ NVBENCH_BENCH(BM_parquet_multithreaded_read_chunked_fixed_width) .add_int64_axis("num_cols", {4}) .add_int64_axis("run_length", {8}) .add_int64_axis("input_limit", {640 * 1024 * 1024}) - .add_int64_axis("output_limit", {640 * 1024 * 1024}); + .add_int64_axis("output_limit", {640 * 1024 * 1024}) + .add_string_axis("io_type", {"PINNED_BUFFER"}); NVBENCH_BENCH(BM_parquet_multithreaded_read_chunked_string) .set_name("parquet_multithreaded_read_decode_chunked_string") @@ -325,7 +335,8 @@ NVBENCH_BENCH(BM_parquet_multithreaded_read_chunked_string) .add_int64_axis("num_cols", {4}) .add_int64_axis("run_length", {8}) .add_int64_axis("input_limit", {640 * 1024 * 1024}) - .add_int64_axis("output_limit", {640 * 1024 * 1024}); + .add_int64_axis("output_limit", {640 * 1024 * 1024}) + .add_string_axis("io_type", {"PINNED_BUFFER"}); NVBENCH_BENCH(BM_parquet_multithreaded_read_chunked_list) .set_name("parquet_multithreaded_read_decode_chunked_list") @@ -336,4 +347,5 @@ NVBENCH_BENCH(BM_parquet_multithreaded_read_chunked_list) .add_int64_axis("num_cols", {4}) .add_int64_axis("run_length", {8}) .add_int64_axis("input_limit", {640 * 1024 * 1024}) - .add_int64_axis("output_limit", {640 * 1024 * 1024}); + .add_int64_axis("output_limit", {640 * 1024 * 1024}) + .add_string_axis("io_type", {"PINNED_BUFFER"}); diff --git a/cpp/benchmarks/text/word_minhash.cpp b/cpp/benchmarks/text/word_minhash.cpp new file mode 100644 index 00000000000..adc3dddc59c --- /dev/null +++ b/cpp/benchmarks/text/word_minhash.cpp @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include +#include + +#include + +#include + +#include + +static void bench_word_minhash(nvbench::state& state) +{ + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const row_width = static_cast(state.get_int64("row_width")); + auto const seed_count = static_cast(state.get_int64("seed_count")); + auto const base64 = state.get_int64("hash_type") == 64; + + data_profile const strings_profile = + data_profile_builder().distribution(cudf::type_id::STRING, distribution_id::NORMAL, 0, 5); + auto strings_table = + create_random_table({cudf::type_id::STRING}, row_count{num_rows}, strings_profile); + + auto const num_offsets = (num_rows / row_width) + 1; + auto offsets = cudf::sequence(num_offsets, + cudf::numeric_scalar(0), + cudf::numeric_scalar(row_width)); + + auto source = cudf::make_lists_column(num_offsets - 1, + std::move(offsets), + std::move(strings_table->release().front()), + 0, + rmm::device_buffer{}); + + data_profile const seeds_profile = data_profile_builder().no_validity().distribution( + cudf::type_to_id(), distribution_id::NORMAL, 0, 256); + auto const seed_type = base64 ? cudf::type_id::UINT64 : cudf::type_id::UINT32; + auto const seeds_table = create_random_table({seed_type}, row_count{seed_count}, seeds_profile); + auto seeds = seeds_table->get_column(0); + + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); + + cudf::strings_column_view input(cudf::lists_column_view(source->view()).child()); + auto chars_size = input.chars_size(cudf::get_default_stream()); + state.add_global_memory_reads(chars_size); + state.add_global_memory_writes(num_rows); // output are hashes + + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + auto result = base64 ? nvtext::word_minhash64(source->view(), seeds.view()) + : nvtext::word_minhash(source->view(), seeds.view()); + }); +} + +NVBENCH_BENCH(bench_word_minhash) + .set_name("word_minhash") + .add_int64_axis("num_rows", {131072, 262144, 524288, 1048576, 2097152}) + .add_int64_axis("row_width", {10, 100, 1000}) + .add_int64_axis("seed_count", {2, 25}) + .add_int64_axis("hash_type", {32, 64}); diff --git a/cpp/doxygen/regex.md b/cpp/doxygen/regex.md index 8d206f245dc..6d1c91a5752 100644 --- a/cpp/doxygen/regex.md +++ b/cpp/doxygen/regex.md @@ -17,6 +17,12 @@ The details are based on features documented at https://www.regular-expressions. **Note:** The alternation character is the pipe character `|` and not the character included in the tables on this page. There is an issue including the pipe character inside the table markdown that is rendered by doxygen. +By default, only the `\n` character is recognized as a line break. The [cudf::strings::regex_flags::EXT_NEWLINE](@ref cudf::strings::regex_flags) increases the set of line break characters to include: +- Paragraph separator (Unicode: `2029`, UTF-8: `E280A9`) +- Line separator (Unicode: `2028`, UTF-8: `E280A8`) +- Next line (Unicode: `0085`, UTF-8: `C285`) +- Carriage return (Unicode: `000D`, UTF-8: `0D`) + **Invalid regex patterns will result in undefined behavior**. This includes but is not limited to the following: - Unescaped special characters (listed in the third row of the Characters table below) when they are intended to match as literals. - Unmatched paired special characters like `()`, `[]`, and `{}`. diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp index ed7b2ac0850..ee03a382bec 100644 --- a/cpp/include/cudf/io/parquet.hpp +++ b/cpp/include/cudf/io/parquet.hpp @@ -39,8 +39,9 @@ namespace io { * @file */ -constexpr size_t default_row_group_size_bytes = 128 * 1024 * 1024; ///< 128MB per row group -constexpr size_type default_row_group_size_rows = 1000000; ///< 1 million rows per row group +constexpr size_t default_row_group_size_bytes = + std::numeric_limits::max(); ///< Infinite bytes per row group +constexpr size_type default_row_group_size_rows = 1'000'000; ///< 1 million rows per row group constexpr size_t default_max_page_size_bytes = 512 * 1024; ///< 512KB per page constexpr size_type default_max_page_size_rows = 20000; ///< 20k rows per page constexpr int32_t default_column_index_truncate_length = 64; ///< truncate to 64 bytes diff --git a/cpp/include/cudf/strings/regex/flags.hpp b/cpp/include/cudf/strings/regex/flags.hpp index f7108129dee..4f3fc7086f2 100644 --- a/cpp/include/cudf/strings/regex/flags.hpp +++ b/cpp/include/cudf/strings/regex/flags.hpp @@ -35,10 +35,11 @@ namespace strings { * and to match the Python flag values. */ enum regex_flags : uint32_t { - DEFAULT = 0, ///< default - MULTILINE = 8, ///< the '^' and '$' honor new-line characters - DOTALL = 16, ///< the '.' matching includes new-line characters - ASCII = 256 ///< use only ASCII when matching built-in character classes + DEFAULT = 0, ///< default + MULTILINE = 8, ///< the '^' and '$' honor new-line characters + DOTALL = 16, ///< the '.' matching includes new-line characters + ASCII = 256, ///< use only ASCII when matching built-in character classes + EXT_NEWLINE = 512 ///< new-line matches extended characters }; /** @@ -74,6 +75,17 @@ constexpr bool is_ascii(regex_flags const f) return (f & regex_flags::ASCII) == regex_flags::ASCII; } +/** + * @brief Returns true if the given flags contain EXT_NEWLINE + * + * @param f Regex flags to check + * @return true if `f` includes EXT_NEWLINE + */ +constexpr bool is_ext_newline(regex_flags const f) +{ + return (f & regex_flags::EXT_NEWLINE) == regex_flags::EXT_NEWLINE; +} + /** * @brief Capture groups setting * diff --git a/cpp/include/cudf/strings/string_view.cuh b/cpp/include/cudf/strings/string_view.cuh index abb26d7ccb4..14695c3bb27 100644 --- a/cpp/include/cudf/strings/string_view.cuh +++ b/cpp/include/cudf/strings/string_view.cuh @@ -191,9 +191,14 @@ __device__ inline string_view::const_iterator& string_view::const_iterator::oper __device__ inline string_view::const_iterator& string_view::const_iterator::operator--() { - if (byte_pos > 0) - while (strings::detail::bytes_in_utf8_byte(static_cast(p[--byte_pos])) == 0) - ; + if (byte_pos > 0) { + if (byte_pos == char_pos) { + --byte_pos; + } else { + while (strings::detail::bytes_in_utf8_byte(static_cast(p[--byte_pos])) == 0) + ; + } + } --char_pos; return *this; } diff --git a/cpp/include/nvtext/minhash.hpp b/cpp/include/nvtext/minhash.hpp index c83a4260c19..7c909f1a948 100644 --- a/cpp/include/nvtext/minhash.hpp +++ b/cpp/include/nvtext/minhash.hpp @@ -17,6 +17,7 @@ #include #include +#include #include #include #include @@ -72,7 +73,7 @@ std::unique_ptr minhash( * * @throw std::invalid_argument if the width < 2 * @throw std::invalid_argument if seeds is empty - * @throw std::overflow_error if `seeds * input.size()` exceeds the column size limit + * @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit * * @param input Strings column to compute minhash * @param seeds Seed values used for the hash algorithm @@ -133,7 +134,7 @@ std::unique_ptr minhash64( * * @throw std::invalid_argument if the width < 2 * @throw std::invalid_argument if seeds is empty - * @throw std::overflow_error if `seeds * input.size()` exceeds the column size limit + * @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit * * @param input Strings column to compute minhash * @param seeds Seed values used for the hash algorithm @@ -150,5 +151,61 @@ std::unique_ptr minhash64( rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); +/** + * @brief Returns the minhash values for each row of strings per seed + * + * Hash values are computed from each string in each row and the + * minimum hash value is returned for each row for each seed. + * Each row of the output list column are seed results for the corresponding + * input row. The order of the elements in each row match the order of + * the seeds provided in the `seeds` parameter. + * + * This function uses MurmurHash3_x86_32 for the hash algorithm. + * + * Any null row entries result in corresponding null output rows. + * + * @throw std::invalid_argument if seeds is empty + * @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit + * + * @param input Lists column of strings to compute minhash + * @param seeds Seed values used for the hash algorithm + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return List column of minhash values for each string per seed + */ +std::unique_ptr word_minhash( + cudf::lists_column_view const& input, + cudf::device_span seeds, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); + +/** + * @brief Returns the minhash values for each row of strings per seed + * + * Hash values are computed from each string in each row and the + * minimum hash value is returned for each row for each seed. + * Each row of the output list column are seed results for the corresponding + * input row. The order of the elements in each row match the order of + * the seeds provided in the `seeds` parameter. + * + * This function uses MurmurHash3_x64_128 for the hash algorithm though + * only the first 64-bits of the hash are used in computing the output. + * + * Any null row entries result in corresponding null output rows. + * + * @throw std::invalid_argument if seeds is empty + * @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit + * + * @param input Lists column of strings to compute minhash + * @param seeds Seed values used for the hash algorithm + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return List column of minhash values for each string per seed + */ +std::unique_ptr word_minhash64( + cudf::lists_column_view const& input, + cudf::device_span seeds, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @} */ // end of group } // namespace CUDF_EXPORT nvtext diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 81fd4ab9f82..ec05f35d405 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -1819,8 +1819,14 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, auto const table_size = std::reduce(column_sizes.begin(), column_sizes.end()); auto const avg_row_len = util::div_rounding_up_safe(table_size, input.num_rows()); if (avg_row_len > 0) { - auto const rg_frag_size = util::div_rounding_up_safe(max_row_group_size, avg_row_len); - max_page_fragment_size = std::min(rg_frag_size, max_page_fragment_size); + // Ensure `rg_frag_size` is not bigger than size_type::max for default max_row_group_size + // value (=uint64::max) to avoid a sign overflow when comparing + auto const rg_frag_size = + std::min(std::numeric_limits::max(), + util::div_rounding_up_safe(max_row_group_size, avg_row_len)); + // Safe comparison as rg_frag_size fits in size_type + max_page_fragment_size = + std::min(static_cast(rg_frag_size), max_page_fragment_size); } // dividing page size by average row length will tend to overshoot the desired diff --git a/cpp/src/join/join_common_utils.hpp b/cpp/src/join/join_common_utils.hpp index 86402a0e7de..573101cefd9 100644 --- a/cpp/src/join/join_common_utils.hpp +++ b/cpp/src/join/join_common_utils.hpp @@ -22,7 +22,6 @@ #include #include -#include #include #include @@ -51,11 +50,6 @@ using mixed_multimap_type = cudf::detail::cuco_allocator, cuco::legacy::double_hashing<1, hash_type, hash_type>>; -using semi_map_type = cuco::legacy::static_map>; - using row_hash_legacy = cudf::row_hasher; diff --git a/cpp/src/join/mixed_join_common_utils.cuh b/cpp/src/join/mixed_join_common_utils.cuh index 19701816867..89c13285cfe 100644 --- a/cpp/src/join/mixed_join_common_utils.cuh +++ b/cpp/src/join/mixed_join_common_utils.cuh @@ -25,6 +25,7 @@ #include #include +#include namespace cudf { namespace detail { @@ -160,6 +161,38 @@ struct pair_expression_equality : public expression_equality { } }; +/** + * @brief Equality comparator that composes two row_equality comparators. + */ +struct double_row_equality_comparator { + row_equality const equality_comparator; + row_equality const conditional_comparator; + + __device__ bool operator()(size_type lhs_row_index, size_type rhs_row_index) const noexcept + { + using experimental::row::lhs_index_type; + using experimental::row::rhs_index_type; + + return equality_comparator(lhs_index_type{lhs_row_index}, rhs_index_type{rhs_row_index}) && + conditional_comparator(lhs_index_type{lhs_row_index}, rhs_index_type{rhs_row_index}); + } +}; + +// A CUDA Cooperative Group of 4 threads for the hash set. +auto constexpr DEFAULT_MIXED_JOIN_CG_SIZE = 4; + +// The hash set type used by mixed_semi_join with the build_table. +using hash_set_type = cuco::static_set, + cuda::thread_scope_device, + double_row_equality_comparator, + cuco::linear_probing, + cudf::detail::cuco_allocator, + cuco::storage<1>>; + +// The hash_set_ref_type used by mixed_semi_join kerenels for probing. +using hash_set_ref_type = hash_set_type::ref_type; + } // namespace detail } // namespace cudf diff --git a/cpp/src/join/mixed_join_kernels_semi.cu b/cpp/src/join/mixed_join_kernels_semi.cu index 7459ac3e99c..f2c5ff13638 100644 --- a/cpp/src/join/mixed_join_kernels_semi.cu +++ b/cpp/src/join/mixed_join_kernels_semi.cu @@ -38,12 +38,16 @@ CUDF_KERNEL void __launch_bounds__(block_size) table_device_view right_table, table_device_view probe, table_device_view build, - row_hash const hash_probe, row_equality const equality_probe, - cudf::detail::semi_map_type::device_view hash_table_view, + hash_set_ref_type set_ref, cudf::device_span left_table_keep_mask, cudf::ast::detail::expression_device_view device_expression_data) { + auto constexpr cg_size = hash_set_ref_type::cg_size; + + auto const tile = + cooperative_groups::tiled_partition(cooperative_groups::this_thread_block()); + // Normally the casting of a shared memory array is used to create multiple // arrays of different types from the shared memory buffer, but here it is // used to circumvent conflicts between arrays of different types between @@ -52,24 +56,24 @@ CUDF_KERNEL void __launch_bounds__(block_size) cudf::ast::detail::IntermediateDataType* intermediate_storage = reinterpret_cast*>(raw_intermediate_storage); auto thread_intermediate_storage = - &intermediate_storage[threadIdx.x * device_expression_data.num_intermediates]; - - cudf::size_type const left_num_rows = left_table.num_rows(); - cudf::size_type const right_num_rows = right_table.num_rows(); - auto const outer_num_rows = left_num_rows; + &intermediate_storage[tile.meta_group_rank() * device_expression_data.num_intermediates]; - cudf::size_type outer_row_index = threadIdx.x + blockIdx.x * block_size; + cudf::size_type const outer_num_rows = left_table.num_rows(); + auto const outer_row_index = cudf::detail::grid_1d::global_thread_id() / cg_size; auto evaluator = cudf::ast::detail::expression_evaluator( left_table, right_table, device_expression_data); if (outer_row_index < outer_num_rows) { + // Make sure to swap_tables here as hash_set will use probe table as the left one. + auto constexpr swap_tables = true; // Figure out the number of elements for this key. auto equality = single_expression_equality{ - evaluator, thread_intermediate_storage, false, equality_probe}; + evaluator, thread_intermediate_storage, swap_tables, equality_probe}; - left_table_keep_mask[outer_row_index] = - hash_table_view.contains(outer_row_index, hash_probe, equality); + auto const set_ref_equality = set_ref.with_key_eq(equality); + auto const result = set_ref_equality.contains(tile, outer_row_index); + if (tile.thread_rank() == 0) left_table_keep_mask[outer_row_index] = result; } } @@ -78,9 +82,8 @@ void launch_mixed_join_semi(bool has_nulls, table_device_view right_table, table_device_view probe, table_device_view build, - row_hash const hash_probe, row_equality const equality_probe, - cudf::detail::semi_map_type::device_view hash_table_view, + hash_set_ref_type set_ref, cudf::device_span left_table_keep_mask, cudf::ast::detail::expression_device_view device_expression_data, detail::grid_1d const config, @@ -94,9 +97,8 @@ void launch_mixed_join_semi(bool has_nulls, right_table, probe, build, - hash_probe, equality_probe, - hash_table_view, + set_ref, left_table_keep_mask, device_expression_data); } else { @@ -106,9 +108,8 @@ void launch_mixed_join_semi(bool has_nulls, right_table, probe, build, - hash_probe, equality_probe, - hash_table_view, + set_ref, left_table_keep_mask, device_expression_data); } diff --git a/cpp/src/join/mixed_join_kernels_semi.cuh b/cpp/src/join/mixed_join_kernels_semi.cuh index 43714ffb36a..b08298e64e4 100644 --- a/cpp/src/join/mixed_join_kernels_semi.cuh +++ b/cpp/src/join/mixed_join_kernels_semi.cuh @@ -45,9 +45,8 @@ namespace detail { * @param[in] right_table The right table * @param[in] probe The table with which to probe the hash table for matches. * @param[in] build The table with which the hash table was built. - * @param[in] hash_probe The hasher used for the probe table. * @param[in] equality_probe The equality comparator used when probing the hash table. - * @param[in] hash_table_view The hash table built from `build`. + * @param[in] set_ref The hash table device view built from `build`. * @param[out] left_table_keep_mask The result of the join operation with "true" element indicating * the corresponding index from left table is present in output * @param[in] device_expression_data Container of device data required to evaluate the desired @@ -58,9 +57,8 @@ void launch_mixed_join_semi(bool has_nulls, table_device_view right_table, table_device_view probe, table_device_view build, - row_hash const hash_probe, row_equality const equality_probe, - cudf::detail::semi_map_type::device_view hash_table_view, + hash_set_ref_type set_ref, cudf::device_span left_table_keep_mask, cudf::ast::detail::expression_device_view device_expression_data, detail::grid_1d const config, diff --git a/cpp/src/join/mixed_join_semi.cu b/cpp/src/join/mixed_join_semi.cu index cfb785e242c..719b1d47105 100644 --- a/cpp/src/join/mixed_join_semi.cu +++ b/cpp/src/join/mixed_join_semi.cu @@ -46,45 +46,6 @@ namespace cudf { namespace detail { -namespace { -/** - * @brief Device functor to create a pair of hash value and index for a given row. - */ -struct make_pair_function_semi { - __device__ __forceinline__ cudf::detail::pair_type operator()(size_type i) const noexcept - { - // The value is irrelevant since we only ever use the hash map to check for - // membership of a particular row index. - return cuco::make_pair(static_cast(i), 0); - } -}; - -/** - * @brief Equality comparator that composes two row_equality comparators. - */ -class double_row_equality { - public: - double_row_equality(row_equality equality_comparator, row_equality conditional_comparator) - : _equality_comparator{equality_comparator}, _conditional_comparator{conditional_comparator} - { - } - - __device__ bool operator()(size_type lhs_row_index, size_type rhs_row_index) const noexcept - { - using experimental::row::lhs_index_type; - using experimental::row::rhs_index_type; - - return _equality_comparator(lhs_index_type{lhs_row_index}, rhs_index_type{rhs_row_index}) && - _conditional_comparator(lhs_index_type{lhs_row_index}, rhs_index_type{rhs_row_index}); - } - - private: - row_equality _equality_comparator; - row_equality _conditional_comparator; -}; - -} // namespace - std::unique_ptr> mixed_join_semi( table_view const& left_equality, table_view const& right_equality, @@ -96,7 +57,7 @@ std::unique_ptr> mixed_join_semi( rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - CUDF_EXPECTS((join_type != join_kind::INNER_JOIN) && (join_type != join_kind::LEFT_JOIN) && + CUDF_EXPECTS((join_type != join_kind::INNER_JOIN) and (join_type != join_kind::LEFT_JOIN) and (join_type != join_kind::FULL_JOIN), "Inner, left, and full joins should use mixed_join."); @@ -137,7 +98,7 @@ std::unique_ptr> mixed_join_semi( // output column and follow the null-supporting expression evaluation code // path. auto const has_nulls = cudf::nullate::DYNAMIC{ - cudf::has_nulls(left_equality) || cudf::has_nulls(right_equality) || + cudf::has_nulls(left_equality) or cudf::has_nulls(right_equality) or binary_predicate.may_evaluate_null(left_conditional, right_conditional, stream)}; auto const parser = ast::detail::expression_parser{ @@ -156,27 +117,20 @@ std::unique_ptr> mixed_join_semi( auto right_conditional_view = table_device_view::create(right_conditional, stream); auto const preprocessed_build = - experimental::row::equality::preprocessed_table::create(build, stream); + cudf::experimental::row::equality::preprocessed_table::create(build, stream); auto const preprocessed_probe = - experimental::row::equality::preprocessed_table::create(probe, stream); + cudf::experimental::row::equality::preprocessed_table::create(probe, stream); auto const row_comparator = - cudf::experimental::row::equality::two_table_comparator{preprocessed_probe, preprocessed_build}; + cudf::experimental::row::equality::two_table_comparator{preprocessed_build, preprocessed_probe}; auto const equality_probe = row_comparator.equal_to(has_nulls, compare_nulls); - semi_map_type hash_table{ - compute_hash_table_size(build.num_rows()), - cuco::empty_key{std::numeric_limits::max()}, - cuco::empty_value{cudf::detail::JoinNoneValue}, - cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator{}, stream}, - stream.value()}; - // Create hash table containing all keys found in right table // TODO: To add support for nested columns we will need to flatten in many // places. However, this probably isn't worth adding any time soon since we // won't be able to support AST conditions for those types anyway. auto const build_nulls = cudf::nullate::DYNAMIC{cudf::has_nulls(build)}; auto const row_hash_build = cudf::experimental::row::hash::row_hasher{preprocessed_build}; - auto const hash_build = row_hash_build.device_hasher(build_nulls); + // Since we may see multiple rows that are identical in the equality tables // but differ in the conditional tables, the equality comparator used for // insertion must account for both sets of tables. An alternative solution @@ -191,20 +145,28 @@ std::unique_ptr> mixed_join_semi( auto const equality_build_equality = row_comparator_build.equal_to(build_nulls, compare_nulls); auto const preprocessed_build_condtional = - experimental::row::equality::preprocessed_table::create(right_conditional, stream); + cudf::experimental::row::equality::preprocessed_table::create(right_conditional, stream); auto const row_comparator_conditional_build = cudf::experimental::row::equality::two_table_comparator{preprocessed_build_condtional, preprocessed_build_condtional}; auto const equality_build_conditional = row_comparator_conditional_build.equal_to(build_nulls, compare_nulls); - double_row_equality equality_build{equality_build_equality, equality_build_conditional}; - make_pair_function_semi pair_func_build{}; - auto iter = cudf::detail::make_counting_transform_iterator(0, pair_func_build); + hash_set_type row_set{ + {compute_hash_table_size(build.num_rows())}, + cuco::empty_key{JoinNoneValue}, + {equality_build_equality, equality_build_conditional}, + {row_hash_build.device_hasher(build_nulls)}, + {}, + {}, + cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator{}, stream}, + {stream.value()}}; + + auto iter = thrust::make_counting_iterator(0); // skip rows that are null here. if ((compare_nulls == null_equality::EQUAL) or (not nullable(build))) { - hash_table.insert(iter, iter + right_num_rows, hash_build, equality_build, stream.value()); + row_set.insert(iter, iter + right_num_rows, stream.value()); } else { thrust::counting_iterator stencil(0); auto const [row_bitmask, _] = @@ -212,18 +174,19 @@ std::unique_ptr> mixed_join_semi( row_is_valid pred{static_cast(row_bitmask.data())}; // insert valid rows - hash_table.insert_if( - iter, iter + right_num_rows, stencil, pred, hash_build, equality_build, stream.value()); + row_set.insert_if(iter, iter + right_num_rows, stencil, pred, stream.value()); } - auto hash_table_view = hash_table.get_device_view(); - detail::grid_1d const config(outer_num_rows, DEFAULT_JOIN_BLOCK_SIZE); - auto const shmem_size_per_block = parser.shmem_per_thread * config.num_threads_per_block; + auto const shmem_size_per_block = + parser.shmem_per_thread * + cuco::detail::int_div_ceil(config.num_threads_per_block, hash_set_type::cg_size); auto const row_hash = cudf::experimental::row::hash::row_hasher{preprocessed_probe}; auto const hash_probe = row_hash.device_hasher(has_nulls); + hash_set_ref_type const row_set_ref = row_set.ref(cuco::contains).with_hash_function(hash_probe); + // Vector used to indicate indices from left/probe table which are present in output auto left_table_keep_mask = rmm::device_uvector(probe.num_rows(), stream); @@ -232,9 +195,8 @@ std::unique_ptr> mixed_join_semi( *right_conditional_view, *probe_view, *build_view, - hash_probe, equality_probe, - hash_table_view, + row_set_ref, cudf::device_span(left_table_keep_mask), parser.device_expression_data, config, diff --git a/cpp/src/strings/regex/regcomp.cpp b/cpp/src/strings/regex/regcomp.cpp index adf650a4f27..7c4c89bd3fb 100644 --- a/cpp/src/strings/regex/regcomp.cpp +++ b/cpp/src/strings/regex/regcomp.cpp @@ -539,15 +539,26 @@ class regex_parser { : static_cast(LBRA); case ')': return RBRA; case '^': { - _chr = is_multiline(_flags) ? chr : '\n'; + if (is_ext_newline(_flags)) { + _chr = is_multiline(_flags) ? 'S' : 'N'; + } else { + _chr = is_multiline(_flags) ? chr : '\n'; + } return BOL; } case '$': { - _chr = is_multiline(_flags) ? chr : '\n'; + if (is_ext_newline(_flags)) { + _chr = is_multiline(_flags) ? 'S' : 'N'; + } else { + _chr = is_multiline(_flags) ? chr : '\n'; + } return EOL; } case '[': return build_cclass(); - case '.': return dot_type; + case '.': { + _chr = is_ext_newline(_flags) ? 'N' : chr; + return dot_type; + } } if (std::find(quantifiers.begin(), quantifiers.end(), static_cast(chr)) == @@ -959,7 +970,7 @@ class regex_compiler { _prog.inst_at(inst_id).u1.cls_id = class_id; } else if (token == CHAR) { _prog.inst_at(inst_id).u1.c = yy; - } else if (token == BOL || token == EOL) { + } else if (token == BOL || token == EOL || token == ANY) { _prog.inst_at(inst_id).u1.c = yy; } push_and(inst_id, inst_id); @@ -1194,7 +1205,7 @@ void reprog::print(regex_flags const flags) case STAR: printf(" STAR next=%d", inst.u2.next_id); break; case PLUS: printf(" PLUS next=%d", inst.u2.next_id); break; case QUEST: printf(" QUEST next=%d", inst.u2.next_id); break; - case ANY: printf(" ANY next=%d", inst.u2.next_id); break; + case ANY: printf(" ANY '%c', next=%d", inst.u1.c, inst.u2.next_id); break; case ANYNL: printf(" ANYNL next=%d", inst.u2.next_id); break; case NOP: printf(" NOP next=%d", inst.u2.next_id); break; case BOL: { diff --git a/cpp/src/strings/regex/regex.inl b/cpp/src/strings/regex/regex.inl index 3b899e4edc1..e34a1e12015 100644 --- a/cpp/src/strings/regex/regex.inl +++ b/cpp/src/strings/regex/regex.inl @@ -126,6 +126,16 @@ __device__ __forceinline__ void reprog_device::reljunk::swaplist() list2 = tmp; } +/** + * @brief Check for supported new-line characters + * + * '\n, \r, \u0085, \u2028, or \u2029' + */ +constexpr bool is_newline(char32_t const ch) +{ + return (ch == '\n' || ch == '\r' || ch == 0x00c285 || ch == 0x00e280a8 || ch == 0x00e280a9); +} + /** * @brief Utility to check a specific character against this class instance. * @@ -258,11 +268,14 @@ __device__ __forceinline__ match_result reprog_device::regexec(string_view const if (checkstart) { auto startchar = static_cast(jnk.startchar); switch (jnk.starttype) { - case BOL: - if (pos == 0) break; - if (jnk.startchar != '^') { return cuda::std::nullopt; } + case BOL: { + if (pos == 0) { break; } + if (startchar != '^' && startchar != 'S') { return cuda::std::nullopt; } + if (startchar != '\n') { break; } --itr; startchar = static_cast('\n'); + [[fallthrough]]; + } case CHAR: { auto const find_itr = find_char(startchar, dstr, itr); if (find_itr.byte_offset() >= dstr.size_bytes()) { return cuda::std::nullopt; } @@ -312,26 +325,34 @@ __device__ __forceinline__ match_result reprog_device::regexec(string_view const id_activate = inst.u2.next_id; expanded = true; break; - case BOL: - if ((pos == 0) || ((inst.u1.c == '^') && (dstr[pos - 1] == '\n'))) { + case BOL: { + auto titr = itr; + auto const prev_c = pos > 0 ? *(--titr) : 0; + if ((pos == 0) || ((inst.u1.c == '^') && (prev_c == '\n')) || + ((inst.u1.c == 'S') && (is_newline(prev_c)))) { id_activate = inst.u2.next_id; expanded = true; } break; - case EOL: + } + case EOL: { // after the last character OR: // - for MULTILINE, if current character is new-line // - for non-MULTILINE, the very last character of the string can also be a new-line + bool const nl = (inst.u1.c == 'S' || inst.u1.c == 'N') ? is_newline(c) : (c == '\n'); if (last_character || - ((c == '\n') && (inst.u1.c != 'Z') && - ((inst.u1.c == '$') || (itr.byte_offset() + 1 == dstr.size_bytes())))) { + (nl && (inst.u1.c != 'Z') && + ((inst.u1.c == '$' || inst.u1.c == 'S') || + (itr.byte_offset() + bytes_in_char_utf8(c) == dstr.size_bytes())))) { id_activate = inst.u2.next_id; expanded = true; } break; + } case BOW: case NBOW: { - auto const prev_c = pos > 0 ? dstr[pos - 1] : 0; + auto titr = itr; + auto const prev_c = pos > 0 ? *(--titr) : 0; auto const word_class = reclass_device{CCLASS_W}; bool const curr_is_word = word_class.is_match(c, _codepoint_flags); bool const prev_is_word = word_class.is_match(prev_c, _codepoint_flags); @@ -366,9 +387,10 @@ __device__ __forceinline__ match_result reprog_device::regexec(string_view const case CHAR: if (inst.u1.c == c) id_activate = inst.u2.next_id; break; - case ANY: - if (c != '\n') id_activate = inst.u2.next_id; - break; + case ANY: { + if ((c == '\n') || ((inst.u1.c == 'N') && is_newline(c))) { break; } + [[fallthrough]]; + } case ANYNL: id_activate = inst.u2.next_id; break; case NCCLASS: case CCLASS: { diff --git a/cpp/src/text/minhash.cu b/cpp/src/text/minhash.cu index 605582f28a6..a03a34f5fa7 100644 --- a/cpp/src/text/minhash.cu +++ b/cpp/src/text/minhash.cu @@ -25,6 +25,8 @@ #include #include #include +#include +#include #include #include #include @@ -151,15 +153,111 @@ std::unique_ptr minhash_fn(cudf::strings_column_view const& input, mr); auto d_hashes = hashes->mutable_view().data(); - constexpr int block_size = 256; - cudf::detail::grid_1d grid{input.size() * cudf::detail::warp_size, block_size}; + constexpr cudf::thread_index_type block_size = 256; + cudf::detail::grid_1d grid{ + static_cast(input.size()) * cudf::detail::warp_size, block_size}; minhash_kernel<<>>( *d_strings, seeds, width, d_hashes); return hashes; } -std::unique_ptr build_list_result(cudf::strings_column_view const& input, +/** + * @brief Compute the minhash of each list row of strings for each seed + * + * This is a warp-per-row algorithm where parallel threads within a warp + * work on strings in a single list row. + * + * @tparam HashFunction hash function to use on each string + * + * @param d_input List of strings to process + * @param seeds Seeds for hashing each string + * @param d_hashes Minhash output values (one per row) + */ +template < + typename HashFunction, + typename hash_value_type = std:: + conditional_t, uint32_t, uint64_t>> +CUDF_KERNEL void minhash_word_kernel(cudf::detail::lists_column_device_view const d_input, + cudf::device_span seeds, + hash_value_type* d_hashes) +{ + auto const idx = cudf::detail::grid_1d::global_thread_id(); + auto const row_idx = idx / cudf::detail::warp_size; + + if (row_idx >= d_input.size()) { return; } + if (d_input.is_null(row_idx)) { return; } + + auto const d_row = cudf::list_device_view(d_input, row_idx); + auto const d_output = d_hashes + (row_idx * seeds.size()); + + // initialize hashes output for this row + auto const lane_idx = static_cast(idx % cudf::detail::warp_size); + if (lane_idx == 0) { + auto const init = d_row.size() == 0 ? 0 : std::numeric_limits::max(); + thrust::fill(thrust::seq, d_output, d_output + seeds.size(), init); + } + __syncwarp(); + + // each lane hashes a string from the input row + for (auto str_idx = lane_idx; str_idx < d_row.size(); str_idx += cudf::detail::warp_size) { + auto const hash_str = + d_row.is_null(str_idx) ? cudf::string_view{} : d_row.element(str_idx); + for (std::size_t seed_idx = 0; seed_idx < seeds.size(); ++seed_idx) { + auto const hasher = HashFunction(seeds[seed_idx]); + // hash string and store the min value + hash_value_type hv; + if constexpr (std::is_same_v) { + hv = hasher(hash_str); + } else { + // This code path assumes the use of MurmurHash3_x64_128 which produces 2 uint64 values + // but only uses the first uint64 value as requested by the LLM team. + hv = thrust::get<0>(hasher(hash_str)); + } + cuda::atomic_ref ref{*(d_output + seed_idx)}; + ref.fetch_min(hv, cuda::std::memory_order_relaxed); + } + } +} + +template < + typename HashFunction, + typename hash_value_type = std:: + conditional_t, uint32_t, uint64_t>> +std::unique_ptr word_minhash_fn(cudf::lists_column_view const& input, + cudf::device_span seeds, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_EXPECTS(!seeds.empty(), "Parameter seeds cannot be empty", std::invalid_argument); + CUDF_EXPECTS((static_cast(input.size()) * seeds.size()) < + static_cast(std::numeric_limits::max()), + "The number of seeds times the number of input rows exceeds the column size limit", + std::overflow_error); + + auto const output_type = cudf::data_type{cudf::type_to_id()}; + if (input.is_empty()) { return cudf::make_empty_column(output_type); } + + auto const d_input = cudf::column_device_view::create(input.parent(), stream); + + auto hashes = cudf::make_numeric_column(output_type, + input.size() * static_cast(seeds.size()), + cudf::mask_state::UNALLOCATED, + stream, + mr); + auto d_hashes = hashes->mutable_view().data(); + auto lcdv = cudf::detail::lists_column_device_view(*d_input); + + constexpr cudf::thread_index_type block_size = 256; + cudf::detail::grid_1d grid{ + static_cast(input.size()) * cudf::detail::warp_size, block_size}; + minhash_word_kernel + <<>>(lcdv, seeds, d_hashes); + + return hashes; +} + +std::unique_ptr build_list_result(cudf::column_view const& input, std::unique_ptr&& hashes, cudf::size_type seeds_size, rmm::cuda_stream_view stream, @@ -176,7 +274,7 @@ std::unique_ptr build_list_result(cudf::strings_column_view const& std::move(offsets), std::move(hashes), input.null_count(), - cudf::detail::copy_bitmask(input.parent(), stream, mr), + cudf::detail::copy_bitmask(input, stream, mr), stream, mr); // expect this condition to be very rare @@ -208,7 +306,7 @@ std::unique_ptr minhash(cudf::strings_column_view const& input, { using HashFunction = cudf::hashing::detail::MurmurHash3_x86_32; auto hashes = detail::minhash_fn(input, seeds, width, stream, mr); - return build_list_result(input, std::move(hashes), seeds.size(), stream, mr); + return build_list_result(input.parent(), std::move(hashes), seeds.size(), stream, mr); } std::unique_ptr minhash64(cudf::strings_column_view const& input, @@ -232,7 +330,27 @@ std::unique_ptr minhash64(cudf::strings_column_view const& input, { using HashFunction = cudf::hashing::detail::MurmurHash3_x64_128; auto hashes = detail::minhash_fn(input, seeds, width, stream, mr); - return build_list_result(input, std::move(hashes), seeds.size(), stream, mr); + return build_list_result(input.parent(), std::move(hashes), seeds.size(), stream, mr); +} + +std::unique_ptr word_minhash(cudf::lists_column_view const& input, + cudf::device_span seeds, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + using HashFunction = cudf::hashing::detail::MurmurHash3_x86_32; + auto hashes = detail::word_minhash_fn(input, seeds, stream, mr); + return build_list_result(input.parent(), std::move(hashes), seeds.size(), stream, mr); +} + +std::unique_ptr word_minhash64(cudf::lists_column_view const& input, + cudf::device_span seeds, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + using HashFunction = cudf::hashing::detail::MurmurHash3_x64_128; + auto hashes = detail::word_minhash_fn(input, seeds, stream, mr); + return build_list_result(input.parent(), std::move(hashes), seeds.size(), stream, mr); } } // namespace detail @@ -276,4 +394,21 @@ std::unique_ptr minhash64(cudf::strings_column_view const& input, return detail::minhash64(input, seeds, width, stream, mr); } +std::unique_ptr word_minhash(cudf::lists_column_view const& input, + cudf::device_span seeds, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + return detail::word_minhash(input, seeds, stream, mr); +} + +std::unique_ptr word_minhash64(cudf::lists_column_view const& input, + cudf::device_span seeds, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + return detail::word_minhash64(input, seeds, stream, mr); +} } // namespace nvtext diff --git a/cpp/tests/join/mixed_join_tests.cu b/cpp/tests/join/mixed_join_tests.cu index 6c147c8a128..08a0136700d 100644 --- a/cpp/tests/join/mixed_join_tests.cu +++ b/cpp/tests/join/mixed_join_tests.cu @@ -778,6 +778,21 @@ TYPED_TEST(MixedLeftSemiJoinTest, BasicEquality) {1}); } +TYPED_TEST(MixedLeftSemiJoinTest, MixedLeftSemiJoinGatherMap) +{ + auto const col_ref_left_1 = cudf::ast::column_reference(0, cudf::ast::table_reference::LEFT); + auto const col_ref_right_1 = cudf::ast::column_reference(0, cudf::ast::table_reference::RIGHT); + auto left_one_greater_right_one = + cudf::ast::operation(cudf::ast::ast_operator::GREATER, col_ref_left_1, col_ref_right_1); + + this->test({{2, 3, 9, 0, 1, 7, 4, 6, 5, 8}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 0}}, + {{6, 5, 9, 8, 10, 32}, {0, 1, 2, 3, 4, 5}, {7, 8, 9, 0, 1, 2}}, + {0}, + {1}, + left_one_greater_right_one, + {2, 7, 8}); +} + TYPED_TEST(MixedLeftSemiJoinTest, BasicEqualityDuplicates) { this->test({{0, 1, 2, 1}, {3, 4, 5, 6}, {10, 20, 30, 40}}, @@ -900,3 +915,18 @@ TYPED_TEST(MixedLeftAntiJoinTest, AsymmetricLeftLargerEquality) left_zero_eq_right_zero, {0, 1, 3}); } + +TYPED_TEST(MixedLeftAntiJoinTest, MixedLeftAntiJoinGatherMap) +{ + auto const col_ref_left_1 = cudf::ast::column_reference(0, cudf::ast::table_reference::LEFT); + auto const col_ref_right_1 = cudf::ast::column_reference(0, cudf::ast::table_reference::RIGHT); + auto left_one_greater_right_one = + cudf::ast::operation(cudf::ast::ast_operator::GREATER, col_ref_left_1, col_ref_right_1); + + this->test({{2, 3, 9, 0, 1, 7, 4, 6, 5, 8}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 0}}, + {{6, 5, 9, 8, 10, 32}, {0, 1, 2, 3, 4, 5}, {7, 8, 9, 0, 1, 2}}, + {0}, + {1}, + left_one_greater_right_one, + {0, 1, 3, 4, 5, 6, 9}); +} diff --git a/cpp/tests/strings/contains_tests.cpp b/cpp/tests/strings/contains_tests.cpp index c816316d0ff..acf850c7a66 100644 --- a/cpp/tests/strings/contains_tests.cpp +++ b/cpp/tests/strings/contains_tests.cpp @@ -14,6 +14,8 @@ * limitations under the License. */ +#include "special_chars.h" + #include #include #include @@ -613,6 +615,63 @@ TEST_F(StringsContainsTests, MultiLine) CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_count); } +TEST_F(StringsContainsTests, SpecialNewLines) +{ + auto input = cudf::test::strings_column_wrapper({"zzé" LINE_SEPARATOR "qqq" NEXT_LINE "zzé", + "qqq\rzzé" LINE_SEPARATOR "lll", + "zzé", + "", + "zzé" PARAGRAPH_SEPARATOR, + "abc\nzzé" NEXT_LINE}); + auto view = cudf::strings_column_view(input); + + auto pattern = std::string("^zzé$"); + auto prog = + cudf::strings::regex_program::create(pattern, cudf::strings::regex_flags::EXT_NEWLINE); + auto ml_flags = static_cast(cudf::strings::regex_flags::EXT_NEWLINE | + cudf::strings::regex_flags::MULTILINE); + auto prog_ml = cudf::strings::regex_program::create(pattern, ml_flags); + + auto expected = cudf::test::fixed_width_column_wrapper({0, 0, 1, 0, 1, 0}); + auto results = cudf::strings::contains_re(view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + expected = cudf::test::fixed_width_column_wrapper({1, 1, 1, 0, 1, 1}); + results = cudf::strings::contains_re(view, *prog_ml); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + expected = cudf::test::fixed_width_column_wrapper({0, 0, 1, 0, 1, 0}); + results = cudf::strings::matches_re(view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + expected = cudf::test::fixed_width_column_wrapper({1, 0, 1, 0, 1, 0}); + results = cudf::strings::matches_re(view, *prog_ml); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + auto counts = cudf::test::fixed_width_column_wrapper({0, 0, 1, 0, 1, 0}); + results = cudf::strings::count_re(view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, counts); + counts = cudf::test::fixed_width_column_wrapper({2, 1, 1, 0, 1, 1}); + results = cudf::strings::count_re(view, *prog_ml); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, counts); + + pattern = std::string("q.*l"); + prog = cudf::strings::regex_program::create(pattern); + expected = cudf::test::fixed_width_column_wrapper({0, 1, 0, 0, 0, 0}); + results = cudf::strings::contains_re(view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + // inst ANY will stop matching on first 'newline' and so should not match anything here + prog = cudf::strings::regex_program::create(pattern, cudf::strings::regex_flags::EXT_NEWLINE); + expected = cudf::test::fixed_width_column_wrapper({0, 0, 0, 0, 0, 0}); + results = cudf::strings::contains_re(view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + // including the DOTALL flag accepts the newline characters + auto dot_flags = static_cast(cudf::strings::regex_flags::EXT_NEWLINE | + cudf::strings::regex_flags::DOTALL); + prog = cudf::strings::regex_program::create(pattern, dot_flags); + expected = cudf::test::fixed_width_column_wrapper({0, 1, 0, 0, 0, 0}); + results = cudf::strings::contains_re(view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); +} + TEST_F(StringsContainsTests, EndOfString) { auto input = cudf::test::strings_column_wrapper( diff --git a/cpp/tests/strings/extract_tests.cpp b/cpp/tests/strings/extract_tests.cpp index b26cbd5a549..1491da758d5 100644 --- a/cpp/tests/strings/extract_tests.cpp +++ b/cpp/tests/strings/extract_tests.cpp @@ -14,9 +14,12 @@ * limitations under the License. */ +#include "special_chars.h" + #include #include #include +#include #include #include @@ -200,6 +203,43 @@ TEST_F(StringsExtractTests, DotAll) CUDF_TEST_EXPECT_TABLES_EQUAL(*results, expected); } +TEST_F(StringsExtractTests, SpecialNewLines) +{ + auto input = cudf::test::strings_column_wrapper({"zzé" NEXT_LINE "qqq" LINE_SEPARATOR "zzé", + "qqq" LINE_SEPARATOR "zzé\rlll", + "zzé", + "", + "zzé" NEXT_LINE, + "abc" PARAGRAPH_SEPARATOR "zzé\n"}); + auto view = cudf::strings_column_view(input); + + auto prog = + cudf::strings::regex_program::create("(^zzé$)", cudf::strings::regex_flags::EXT_NEWLINE); + auto results = cudf::strings::extract(view, *prog); + auto expected = + cudf::test::strings_column_wrapper({"", "", "zzé", "", "zzé", ""}, {0, 0, 1, 0, 1, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view().column(0), expected); + + auto both_flags = static_cast( + cudf::strings::regex_flags::EXT_NEWLINE | cudf::strings::regex_flags::MULTILINE); + auto prog_ml = cudf::strings::regex_program::create("^(zzé)$", both_flags); + results = cudf::strings::extract(view, *prog_ml); + expected = + cudf::test::strings_column_wrapper({"zzé", "zzé", "zzé", "", "zzé", "zzé"}, {1, 1, 1, 0, 1, 1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view().column(0), expected); + + prog = cudf::strings::regex_program::create("q(q.*l)l"); + expected = cudf::test::strings_column_wrapper({"", "qq" LINE_SEPARATOR "zzé\rll", "", "", "", ""}, + {0, 1, 0, 0, 0, 0}); + results = cudf::strings::extract(view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view().column(0), expected); + // expect no matches here since the newline(s) interrupts the pattern + prog = cudf::strings::regex_program::create("q(q.*l)l", cudf::strings::regex_flags::EXT_NEWLINE); + expected = cudf::test::strings_column_wrapper({"", "", "", "", "", ""}, {0, 0, 0, 0, 0, 0}); + results = cudf::strings::extract(view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view().column(0), expected); +} + TEST_F(StringsExtractTests, EmptyExtractTest) { std::vector h_strings{nullptr, "AAA", "AAA_A", "AAA_AAA_", "A__", ""}; diff --git a/cpp/tests/strings/findall_tests.cpp b/cpp/tests/strings/findall_tests.cpp index 4582dcb1e38..47606b9b3ed 100644 --- a/cpp/tests/strings/findall_tests.cpp +++ b/cpp/tests/strings/findall_tests.cpp @@ -14,6 +14,8 @@ * limitations under the License. */ +#include "special_chars.h" + #include #include #include @@ -80,6 +82,32 @@ TEST_F(StringsFindallTests, DotAll) CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected); } +TEST_F(StringsFindallTests, SpecialNewLines) +{ + auto input = cudf::test::strings_column_wrapper({"zzé" PARAGRAPH_SEPARATOR "qqq\nzzé", + "qqq\nzzé" PARAGRAPH_SEPARATOR "lll", + "zzé", + "", + "zzé\r", + "zzé" LINE_SEPARATOR "zzé" NEXT_LINE}); + auto view = cudf::strings_column_view(input); + + auto prog = + cudf::strings::regex_program::create("(^zzé$)", cudf::strings::regex_flags::EXT_NEWLINE); + auto results = cudf::strings::findall(view, *prog); + using LCW = cudf::test::lists_column_wrapper; + LCW expected({LCW{}, LCW{}, LCW{"zzé"}, LCW{}, LCW{"zzé"}, LCW{}}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); + + auto both_flags = static_cast( + cudf::strings::regex_flags::EXT_NEWLINE | cudf::strings::regex_flags::MULTILINE); + auto prog_ml = cudf::strings::regex_program::create("^(zzé)$", both_flags); + results = cudf::strings::findall(view, *prog_ml); + LCW expected_ml( + {LCW{"zzé", "zzé"}, LCW{"zzé"}, LCW{"zzé"}, LCW{}, LCW{"zzé"}, LCW{"zzé", "zzé"}}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected_ml); +} + TEST_F(StringsFindallTests, MediumRegex) { // This results in 15 regex instructions and falls in the 'medium' range. diff --git a/cpp/tests/strings/replace_regex_tests.cpp b/cpp/tests/strings/replace_regex_tests.cpp index 8c0482653fb..9847d8d6bb5 100644 --- a/cpp/tests/strings/replace_regex_tests.cpp +++ b/cpp/tests/strings/replace_regex_tests.cpp @@ -14,6 +14,8 @@ * limitations under the License. */ +#include "special_chars.h" + #include #include #include @@ -245,6 +247,53 @@ TEST_F(StringsReplaceRegexTest, Multiline) CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, br_expected); } +TEST_F(StringsReplaceRegexTest, SpecialNewLines) +{ + auto input = cudf::test::strings_column_wrapper({"zzé" NEXT_LINE "qqq" NEXT_LINE "zzé", + "qqq" NEXT_LINE "zzé" NEXT_LINE "lll", + "zzé", + "", + "zzé" PARAGRAPH_SEPARATOR, + "abc\rzzé\r"}); + auto view = cudf::strings_column_view(input); + auto repl = cudf::string_scalar("_"); + auto pattern = std::string("^zzé$"); + auto prog = + cudf::strings::regex_program::create(pattern, cudf::strings::regex_flags::EXT_NEWLINE); + auto results = cudf::strings::replace_re(view, *prog, repl); + auto expected = cudf::test::strings_column_wrapper({"zzé" NEXT_LINE "qqq" NEXT_LINE "zzé", + "qqq" NEXT_LINE "zzé" NEXT_LINE "lll", + "_", + "", + "_" PARAGRAPH_SEPARATOR, + "abc\rzzé\r"}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); + + auto both_flags = static_cast( + cudf::strings::regex_flags::EXT_NEWLINE | cudf::strings::regex_flags::MULTILINE); + auto prog_ml = cudf::strings::regex_program::create(pattern, both_flags); + results = cudf::strings::replace_re(view, *prog_ml, repl); + expected = cudf::test::strings_column_wrapper({"_" NEXT_LINE "qqq" NEXT_LINE "_", + "qqq" NEXT_LINE "_" NEXT_LINE "lll", + "_", + "", + "_" PARAGRAPH_SEPARATOR, + "abc\r_\r"}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); + + auto repl_template = std::string("[\\1]"); + pattern = std::string("(^zzé$)"); + prog = cudf::strings::regex_program::create(pattern, both_flags); + results = cudf::strings::replace_with_backrefs(view, *prog, repl_template); + expected = cudf::test::strings_column_wrapper({"[zzé]" NEXT_LINE "qqq" NEXT_LINE "[zzé]", + "qqq" NEXT_LINE "[zzé]" NEXT_LINE "lll", + "[zzé]", + "", + "[zzé]" PARAGRAPH_SEPARATOR, + "abc\r[zzé]\r"}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); +} + TEST_F(StringsReplaceRegexTest, ReplaceBackrefsRegexTest) { std::vector h_strings{"the quick brown fox jumps over the lazy dog", diff --git a/cpp/tests/strings/special_chars.h b/cpp/tests/strings/special_chars.h new file mode 100644 index 00000000000..0d630f6bb52 --- /dev/null +++ b/cpp/tests/strings/special_chars.h @@ -0,0 +1,25 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +namespace cudf::test { + +// special new-line characters for use with regex_flags::EXT_NEWLINE +#define NEXT_LINE "\xC2\x85" +#define LINE_SEPARATOR "\xE2\x80\xA8" +#define PARAGRAPH_SEPARATOR "\xE2\x80\xA9" + +} // namespace cudf::test diff --git a/cpp/tests/text/minhash_tests.cpp b/cpp/tests/text/minhash_tests.cpp index 7575a3ba846..e23f3f6e7d8 100644 --- a/cpp/tests/text/minhash_tests.cpp +++ b/cpp/tests/text/minhash_tests.cpp @@ -139,6 +139,41 @@ TEST_F(MinHashTest, MultiSeedWithNullInputRow) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results64, expected64); } +TEST_F(MinHashTest, WordsMinHash) +{ + using LCWS = cudf::test::lists_column_wrapper; + auto validity = cudf::test::iterators::null_at(1); + + LCWS input( + {LCWS({"hello", "abcdéfgh"}), + LCWS{}, + LCWS({"rapids", "moré", "test", "text"}), + LCWS({"The", "quick", "brown", "fox", "jumpéd", "over", "the", "lazy", "brown", "dog"})}, + validity); + + auto view = cudf::lists_column_view(input); + + auto seeds = cudf::test::fixed_width_column_wrapper({1, 2}); + auto results = nvtext::word_minhash(view, cudf::column_view(seeds)); + using LCW32 = cudf::test::lists_column_wrapper; + LCW32 expected({LCW32{2069617641u, 1975382903u}, + LCW32{}, + LCW32{657297235u, 1010955999u}, + LCW32{644643885u, 310002789u}}, + validity); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + auto seeds64 = cudf::test::fixed_width_column_wrapper({11, 22}); + auto results64 = nvtext::word_minhash64(view, cudf::column_view(seeds64)); + using LCW64 = cudf::test::lists_column_wrapper; + LCW64 expected64({LCW64{1940333969930105370ul, 272615362982418219ul}, + LCW64{}, + LCW64{5331949571924938590ul, 2088583894581919741ul}, + LCW64{3400468157617183341ul, 2398577492366130055ul}}, + validity); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results64, expected64); +} + TEST_F(MinHashTest, EmptyTest) { auto input = cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING}); diff --git a/dependencies.yaml b/dependencies.yaml index 483335c02ff..7a13043cc5f 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -710,7 +710,16 @@ dependencies: - numpy==1.23.* - pandas==2.0.* - pyarrow==14.0.0 - - cupy==12.0.0 # ignored as pip constraint + - matrix: + packages: + - output_types: conda + matrices: + - matrix: {dependencies: "oldest", arch: "aarch64", cuda: "12.*"} + packages: + - cupy==12.2.0 # cupy 12.2.0 is the earliest with CUDA 12 ARM packages. + - matrix: {dependencies: "oldest"} + packages: + - cupy==12.0.0 - matrix: packages: - output_types: requirements diff --git a/docs/cudf/source/cudf_pandas/usage.md b/docs/cudf/source/cudf_pandas/usage.md index 0398a8d7086..41838e01dd9 100644 --- a/docs/cudf/source/cudf_pandas/usage.md +++ b/docs/cudf/source/cudf_pandas/usage.md @@ -120,3 +120,23 @@ To profile a script being run from the command line, pass the ```bash python -m cudf.pandas --profile script.py ``` + +### cudf.pandas CLI Features + +Several of the ways to provide input to the `python` interpreter also work with `python -m cudf.pandas`, such as the REPL, the `-c` flag, and reading from stdin. + +Executing `python -m cudf.pandas` with no script name will enter a REPL (read-eval-print loop) similar to the behavior of the normal `python` interpreter. + +The `-c` flag accepts a code string to run, like this: + +```bash +$ python -m cudf.pandas -c "import pandas; print(pandas)" + +``` + +Users can also provide code to execute from stdin, like this: + +```bash +$ echo "import pandas; print(pandas)" | python -m cudf.pandas + +``` diff --git a/java/src/main/java/ai/rapids/cudf/ColumnVector.java b/java/src/main/java/ai/rapids/cudf/ColumnVector.java index 5a0fbd224ad..6a0f0f6f169 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnVector.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnVector.java @@ -218,7 +218,13 @@ static long initViewHandle(DType type, int numRows, int nullCount, od, vd, nullCount, numRows, childHandles); } - static ColumnVector fromViewWithContiguousAllocation(long columnViewAddress, DeviceMemoryBuffer buffer) { + /** + * Creates a ColumnVector from a native column_view using a contiguous device allocation. + * + * @param columnViewAddress address of the native column_view + * @param buffer device buffer containing the data referenced by the column view + */ + public static ColumnVector fromViewWithContiguousAllocation(long columnViewAddress, DeviceMemoryBuffer buffer) { return new ColumnVector(columnViewAddress, buffer); } diff --git a/java/src/main/java/ai/rapids/cudf/ParquetWriterOptions.java b/java/src/main/java/ai/rapids/cudf/ParquetWriterOptions.java index 7b58817550d..8c8180436e6 100644 --- a/java/src/main/java/ai/rapids/cudf/ParquetWriterOptions.java +++ b/java/src/main/java/ai/rapids/cudf/ParquetWriterOptions.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -24,9 +24,13 @@ */ public final class ParquetWriterOptions extends CompressionMetadataWriterOptions { private final StatisticsFrequency statsGranularity; + private int rowGroupSizeRows; + private long rowGroupSizeBytes; private ParquetWriterOptions(Builder builder) { super(builder); + this.rowGroupSizeRows = builder.rowGroupSizeRows; + this.rowGroupSizeBytes = builder.rowGroupSizeBytes; this.statsGranularity = builder.statsGranularity; } @@ -51,18 +55,38 @@ public static Builder builder() { return new Builder(); } + public int getRowGroupSizeRows() { + return rowGroupSizeRows; + } + + public long getRowGroupSizeBytes() { + return rowGroupSizeBytes; + } + public StatisticsFrequency getStatisticsFrequency() { return statsGranularity; } public static class Builder extends CompressionMetadataWriterOptions.Builder { + private int rowGroupSizeRows = 1000000; //Max of 1 million rows per row group + private long rowGroupSizeBytes = 128 * 1024 * 1024; //Max of 128MB per row group private StatisticsFrequency statsGranularity = StatisticsFrequency.ROWGROUP; public Builder() { super(); } + public Builder withRowGroupSizeRows(int rowGroupSizeRows) { + this.rowGroupSizeRows = rowGroupSizeRows; + return this; + } + + public Builder withRowGroupSizeBytes(long rowGroupSizeBytes) { + this.rowGroupSizeBytes = rowGroupSizeBytes; + return this; + } + public Builder withStatisticsFrequency(StatisticsFrequency statsGranularity) { this.statsGranularity = statsGranularity; return this; diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index cbb126d7ee5..09da43374ae 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -332,20 +332,22 @@ private static native long[] readAvroFromDataSource(String[] filterColumnNames, /** * Setup everything to write parquet formatted data to a file. - * @param columnNames names that correspond to the table columns - * @param numChildren Children of the top level - * @param flatNumChildren flattened list of children per column - * @param nullable true if the column can have nulls else false - * @param metadataKeys Metadata key names to place in the Parquet file - * @param metadataValues Metadata values corresponding to metadataKeys - * @param compression native compression codec ID - * @param statsFreq native statistics frequency ID - * @param isInt96 true if timestamp type is int96 - * @param precisions precision list containing all the precisions of the decimal types in - * the columns - * @param isMapValues true if a column is a map - * @param isBinaryValues true if a column is a binary - * @param filename local output path + * @param columnNames names that correspond to the table columns + * @param numChildren Children of the top level + * @param flatNumChildren flattened list of children per column + * @param nullable true if the column can have nulls else false + * @param metadataKeys Metadata key names to place in the Parquet file + * @param metadataValues Metadata values corresponding to metadataKeys + * @param compression native compression codec ID + * @param rowGroupSizeRows max #rows in a row group + * @param rowGroupSizeBytes max #bytes in a row group + * @param statsFreq native statistics frequency ID + * @param isInt96 true if timestamp type is int96 + * @param precisions precision list containing all the precisions of the decimal types in + * the columns + * @param isMapValues true if a column is a map + * @param isBinaryValues true if a column is a binary + * @param filename local output path * @return a handle that is used in later calls to writeParquetChunk and writeParquetEnd. */ private static native long writeParquetFileBegin(String[] columnNames, @@ -355,6 +357,8 @@ private static native long writeParquetFileBegin(String[] columnNames, String[] metadataKeys, String[] metadataValues, int compression, + int rowGroupSizeRows, + long rowGroupSizeBytes, int statsFreq, boolean[] isInt96, int[] precisions, @@ -366,20 +370,22 @@ private static native long writeParquetFileBegin(String[] columnNames, /** * Setup everything to write parquet formatted data to a buffer. - * @param columnNames names that correspond to the table columns - * @param numChildren Children of the top level - * @param flatNumChildren flattened list of children per column - * @param nullable true if the column can have nulls else false - * @param metadataKeys Metadata key names to place in the Parquet file - * @param metadataValues Metadata values corresponding to metadataKeys - * @param compression native compression codec ID - * @param statsFreq native statistics frequency ID - * @param isInt96 true if timestamp type is int96 - * @param precisions precision list containing all the precisions of the decimal types in - * the columns - * @param isMapValues true if a column is a map - * @param isBinaryValues true if a column is a binary - * @param consumer consumer of host buffers produced. + * @param columnNames names that correspond to the table columns + * @param numChildren Children of the top level + * @param flatNumChildren flattened list of children per column + * @param nullable true if the column can have nulls else false + * @param metadataKeys Metadata key names to place in the Parquet file + * @param metadataValues Metadata values corresponding to metadataKeys + * @param compression native compression codec ID + * @param rowGroupSizeRows max #rows in a row group + * @param rowGroupSizeBytes max #bytes in a row group + * @param statsFreq native statistics frequency ID + * @param isInt96 true if timestamp type is int96 + * @param precisions precision list containing all the precisions of the decimal types in + * the columns + * @param isMapValues true if a column is a map + * @param isBinaryValues true if a column is a binary + * @param consumer consumer of host buffers produced. * @return a handle that is used in later calls to writeParquetChunk and writeParquetEnd. */ private static native long writeParquetBufferBegin(String[] columnNames, @@ -389,6 +395,8 @@ private static native long writeParquetBufferBegin(String[] columnNames, String[] metadataKeys, String[] metadataValues, int compression, + int rowGroupSizeRows, + long rowGroupSizeBytes, int statsFreq, boolean[] isInt96, int[] precisions, @@ -1820,6 +1828,8 @@ private ParquetTableWriter(ParquetWriterOptions options, File outputFile) { options.getMetadataKeys(), options.getMetadataValues(), options.getCompressionType().nativeId, + options.getRowGroupSizeRows(), + options.getRowGroupSizeBytes(), options.getStatisticsFrequency().nativeId, options.getFlatIsTimeTypeInt96(), options.getFlatPrecision(), @@ -1840,6 +1850,8 @@ private ParquetTableWriter(ParquetWriterOptions options, HostBufferConsumer cons options.getMetadataKeys(), options.getMetadataValues(), options.getCompressionType().nativeId, + options.getRowGroupSizeRows(), + options.getRowGroupSizeBytes(), options.getStatisticsFrequency().nativeId, options.getFlatIsTimeTypeInt96(), options.getFlatPrecision(), diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index 40a111209b0..92e213bcb60 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -2150,6 +2150,8 @@ Java_ai_rapids_cudf_Table_writeParquetBufferBegin(JNIEnv* env, jobjectArray j_metadata_keys, jobjectArray j_metadata_values, jint j_compression, + jint j_row_group_size_rows, + jlong j_row_group_size_bytes, jint j_stats_freq, jbooleanArray j_isInt96, jintArray j_precisions, @@ -2205,6 +2207,8 @@ Java_ai_rapids_cudf_Table_writeParquetBufferBegin(JNIEnv* env, chunked_parquet_writer_options::builder(sink) .metadata(std::move(metadata)) .compression(static_cast(j_compression)) + .row_group_size_rows(j_row_group_size_rows) + .row_group_size_bytes(j_row_group_size_bytes) .stats_level(static_cast(j_stats_freq)) .key_value_metadata({kv_metadata}) .compression_statistics(stats) @@ -2227,6 +2231,8 @@ Java_ai_rapids_cudf_Table_writeParquetFileBegin(JNIEnv* env, jobjectArray j_metadata_keys, jobjectArray j_metadata_values, jint j_compression, + jint j_row_group_size_rows, + jlong j_row_group_size_bytes, jint j_stats_freq, jbooleanArray j_isInt96, jintArray j_precisions, @@ -2280,6 +2286,8 @@ Java_ai_rapids_cudf_Table_writeParquetFileBegin(JNIEnv* env, chunked_parquet_writer_options::builder(sink) .metadata(std::move(metadata)) .compression(static_cast(j_compression)) + .row_group_size_rows(j_row_group_size_rows) + .row_group_size_bytes(j_row_group_size_bytes) .stats_level(static_cast(j_stats_freq)) .key_value_metadata({kv_metadata}) .compression_statistics(stats) diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index 56fe63598d9..830f2b33b32 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -9122,7 +9122,11 @@ void testParquetWriteToBufferChunked() { columns.add(Columns.STRUCT.name); WriteUtils.buildWriterOptions(optBuilder, columns); ParquetWriterOptions options = optBuilder.build(); - ParquetWriterOptions optionsNoCompress = optBuilder.withCompressionType(CompressionType.NONE).build(); + ParquetWriterOptions optionsNoCompress = + optBuilder.withCompressionType(CompressionType.NONE) + .withRowGroupSizeRows(10000) + .withRowGroupSizeBytes(10000) + .build(); try (Table table0 = getExpectedFileTable(columns); MyBufferConsumer consumer = new MyBufferConsumer()) { try (TableWriter writer = Table.writeParquetChunked(options, consumer)) { @@ -9208,6 +9212,8 @@ void testParquetWriteToFileUncompressedNoStats() throws IOException { .withDecimalColumn("_c7", 4) .withDecimalColumn("_c8", 6) .withCompressionType(CompressionType.NONE) + .withRowGroupSizeRows(10000) + .withRowGroupSizeBytes(10000) .withStatisticsFrequency(ParquetWriterOptions.StatisticsFrequency.NONE) .build(); try (TableWriter writer = Table.writeParquetChunked(options, tempFile.getAbsoluteFile())) { diff --git a/python/cudf/cudf/_lib/nvtext/minhash.pyx b/python/cudf/cudf/_lib/nvtext/minhash.pyx index 5ee15d0e409..59cb8d51440 100644 --- a/python/cudf/cudf/_lib/nvtext/minhash.pyx +++ b/python/cudf/cudf/_lib/nvtext/minhash.pyx @@ -10,6 +10,8 @@ from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.nvtext.minhash cimport ( minhash as cpp_minhash, minhash64 as cpp_minhash64, + word_minhash as cpp_word_minhash, + word_minhash64 as cpp_word_minhash64, ) from pylibcudf.libcudf.types cimport size_type @@ -54,3 +56,39 @@ def minhash64(Column strings, Column seeds, int width): ) return Column.from_unique_ptr(move(c_result)) + + +@acquire_spill_lock() +def word_minhash(Column input, Column seeds): + + cdef column_view c_input = input.view() + cdef column_view c_seeds = seeds.view() + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_word_minhash( + c_input, + c_seeds + ) + ) + + return Column.from_unique_ptr(move(c_result)) + + +@acquire_spill_lock() +def word_minhash64(Column input, Column seeds): + + cdef column_view c_input = input.view() + cdef column_view c_seeds = seeds.view() + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_word_minhash64( + c_input, + c_seeds + ) + ) + + return Column.from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index a0155671a26..e6c9d60b05b 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -438,7 +438,7 @@ def write_parquet( object statistics="ROWGROUP", object metadata_file_path=None, object int96_timestamps=False, - object row_group_size_bytes=_ROW_GROUP_SIZE_BYTES_DEFAULT, + object row_group_size_bytes=None, object row_group_size_rows=None, object max_page_size_bytes=None, object max_page_size_rows=None, @@ -616,9 +616,9 @@ cdef class ParquetWriter: Name of the compression to use. Use ``None`` for no compression. statistics : {'ROWGROUP', 'PAGE', 'COLUMN', 'NONE'}, default 'ROWGROUP' Level at which column statistics should be included in file. - row_group_size_bytes: int, default 134217728 + row_group_size_bytes: int, default ``uint64 max`` Maximum size of each stripe of the output. - By default, 134217728 (128MB) will be used. + By default, a virtually infinite size equal to ``uint64 max`` will be used. row_group_size_rows: int, default 1000000 Maximum number of rows of each stripe of the output. By default, 1000000 (10^6 rows) will be used. @@ -661,11 +661,11 @@ cdef class ParquetWriter: def __cinit__(self, object filepath_or_buffer, object index=None, object compression="snappy", str statistics="ROWGROUP", - int row_group_size_bytes=_ROW_GROUP_SIZE_BYTES_DEFAULT, - int row_group_size_rows=1000000, - int max_page_size_bytes=524288, - int max_page_size_rows=20000, - int max_dictionary_size=1048576, + size_t row_group_size_bytes=_ROW_GROUP_SIZE_BYTES_DEFAULT, + size_type row_group_size_rows=1000000, + size_t max_page_size_bytes=524288, + size_type max_page_size_rows=20000, + size_t max_dictionary_size=1048576, bool use_dictionary=True, bool store_schema=False): filepaths_or_buffers = ( diff --git a/python/cudf/cudf/_lib/strings/__init__.py b/python/cudf/cudf/_lib/strings/__init__.py index 47a194c4fda..4bf8a9b1a8f 100644 --- a/python/cudf/cudf/_lib/strings/__init__.py +++ b/python/cudf/cudf/_lib/strings/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. from cudf._lib.nvtext.edit_distance import edit_distance, edit_distance_matrix from cudf._lib.nvtext.generate_ngrams import ( generate_character_ngrams, @@ -6,7 +6,12 @@ hash_character_ngrams, ) from cudf._lib.nvtext.jaccard import jaccard_index -from cudf._lib.nvtext.minhash import minhash, minhash64 +from cudf._lib.nvtext.minhash import ( + minhash, + minhash64, + word_minhash, + word_minhash64, +) from cudf._lib.nvtext.ngrams_tokenize import ngrams_tokenize from cudf._lib.nvtext.normalize import normalize_characters, normalize_spaces from cudf._lib.nvtext.replace import filter_tokens, replace_tokens diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 16e6908f308..e059917b0b8 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -5349,6 +5349,76 @@ def minhash64( libstrings.minhash64(self._column, seeds_column, width) ) + def word_minhash(self, seeds: ColumnLike | None = None) -> SeriesOrIndex: + """ + Compute the minhash of a list column of strings. + This uses the MurmurHash3_x86_32 algorithm for the hash function. + + Parameters + ---------- + seeds : ColumnLike + The seeds used for the hash algorithm. + Must be of type uint32. + + Examples + -------- + >>> import cudf + >>> import numpy as np + >>> ls = cudf.Series([["this", "is", "my"], ["favorite", "book"]]) + >>> seeds = cudf.Series([0, 1, 2], dtype=np.uint32) + >>> ls.str.word_minhash(seeds=seeds) + 0 [21141582, 1232889953, 1268336794] + 1 [962346254, 2321233602, 1354839212] + dtype: list + """ + if seeds is None: + seeds_column = column.as_column(0, dtype=np.uint32, length=1) + else: + seeds_column = column.as_column(seeds) + if seeds_column.dtype != np.uint32: + raise ValueError( + f"Expecting a Series with dtype uint32, got {type(seeds)}" + ) + return self._return_or_inplace( + libstrings.word_minhash(self._column, seeds_column) + ) + + def word_minhash64(self, seeds: ColumnLike | None = None) -> SeriesOrIndex: + """ + Compute the minhash of a list column of strings. + This uses the MurmurHash3_x64_128 algorithm for the hash function. + This function generates 2 uint64 values but only the first + uint64 value is used. + + Parameters + ---------- + seeds : ColumnLike + The seeds used for the hash algorithm. + Must be of type uint64. + + Examples + -------- + >>> import cudf + >>> import numpy as np + >>> ls = cudf.Series([["this", "is", "my"], ["favorite", "book"]]) + >>> seeds = cudf.Series([0, 1, 2], dtype=np.uint64) + >>> ls.str.word_minhash64(seeds) + 0 [2603139454418834912, 8644371945174847701, 5541030711534384340] + 1 [5240044617220523711, 5847101123925041457, 153762819128779913] + dtype: list + """ + if seeds is None: + seeds_column = column.as_column(0, dtype=np.uint64, length=1) + else: + seeds_column = column.as_column(seeds) + if seeds_column.dtype != np.uint64: + raise ValueError( + f"Expecting a Series with dtype uint64, got {type(seeds)}" + ) + return self._return_or_inplace( + libstrings.word_minhash64(self._column, seeds_column) + ) + def jaccard_index(self, input: cudf.Series, width: int) -> SeriesOrIndex: """ Compute the Jaccard index between this column and the given diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 58a16a6d504..d73ad8225ca 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -6840,7 +6840,7 @@ def to_parquet( statistics="ROWGROUP", metadata_file_path=None, int96_timestamps=False, - row_group_size_bytes=ioutils._ROW_GROUP_SIZE_BYTES_DEFAULT, + row_group_size_bytes=None, row_group_size_rows=None, max_page_size_bytes=None, max_page_size_rows=None, diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index 3d205957126..c026579b8b5 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -738,7 +738,8 @@ def get_dummies( sparse : boolean, optional Right now this is NON-FUNCTIONAL argument in rapids. drop_first : boolean, optional - Right now this is NON-FUNCTIONAL argument in rapids. + Whether to get k-1 dummies out of k categorical levels by removing the + first level. columns : sequence of str, optional Names of columns to encode. If not provided, will attempt to encode all columns. Note this is different from pandas default behavior, which @@ -806,9 +807,6 @@ def get_dummies( if sparse: raise NotImplementedError("sparse is not supported yet") - if drop_first: - raise NotImplementedError("drop_first is not supported yet") - if isinstance(data, cudf.DataFrame): encode_fallback_dtypes = ["object", "category"] @@ -862,6 +860,7 @@ def get_dummies( prefix=prefix_map.get(name, prefix), prefix_sep=prefix_sep_map.get(name, prefix_sep), dtype=dtype, + drop_first=drop_first, ) result_data.update(col_enc_data) return cudf.DataFrame._from_data(result_data, index=data.index) @@ -874,6 +873,7 @@ def get_dummies( prefix=prefix, prefix_sep=prefix_sep, dtype=dtype, + drop_first=drop_first, ) return cudf.DataFrame._from_data(data, index=ser.index) @@ -1256,6 +1256,7 @@ def _one_hot_encode_column( prefix: str | None, prefix_sep: str | None, dtype: Dtype | None, + drop_first: bool, ) -> dict[str, ColumnBase]: """Encode a single column with one hot encoding. The return dictionary contains pairs of (category, encodings). The keys may be prefixed with @@ -1276,6 +1277,8 @@ def _one_hot_encode_column( ) data = one_hot_encode(column, categories) + if drop_first and len(data): + data.pop(next(iter(data))) if prefix is not None and prefix_sep is not None: data = {f"{prefix}{prefix_sep}{col}": enc for col, enc in data.items()} if dtype: diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index 62be7378e9e..ce99f98b559 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -64,7 +64,7 @@ def _write_parquet( statistics="ROWGROUP", metadata_file_path=None, int96_timestamps=False, - row_group_size_bytes=ioutils._ROW_GROUP_SIZE_BYTES_DEFAULT, + row_group_size_bytes=None, row_group_size_rows=None, max_page_size_bytes=None, max_page_size_rows=None, @@ -149,7 +149,7 @@ def write_to_dataset( return_metadata=False, statistics="ROWGROUP", int96_timestamps=False, - row_group_size_bytes=ioutils._ROW_GROUP_SIZE_BYTES_DEFAULT, + row_group_size_bytes=None, row_group_size_rows=None, max_page_size_bytes=None, max_page_size_rows=None, @@ -205,7 +205,7 @@ def write_to_dataset( If ``False``, timestamps will not be altered. row_group_size_bytes: integer or None, default None Maximum size of each stripe of the output. - If None, 134217728 (128MB) will be used. + If None, no limit on row group stripe size will be used. row_group_size_rows: integer or None, default None Maximum number of rows of each stripe of the output. If None, 1000000 will be used. @@ -980,7 +980,7 @@ def to_parquet( statistics="ROWGROUP", metadata_file_path=None, int96_timestamps=False, - row_group_size_bytes=ioutils._ROW_GROUP_SIZE_BYTES_DEFAULT, + row_group_size_bytes=None, row_group_size_rows=None, max_page_size_bytes=None, max_page_size_rows=None, diff --git a/python/cudf/cudf/pandas/__main__.py b/python/cudf/cudf/pandas/__main__.py index 3a82829eb7a..e0d3d9101a9 100644 --- a/python/cudf/cudf/pandas/__main__.py +++ b/python/cudf/cudf/pandas/__main__.py @@ -10,6 +10,7 @@ """ import argparse +import code import runpy import sys import tempfile @@ -21,6 +22,8 @@ @contextmanager def profile(function_profile, line_profile, fn): + if fn is None and (line_profile or function_profile): + raise RuntimeError("Enabling the profiler requires a script name.") if line_profile: with open(fn) as f: lines = f.readlines() @@ -54,6 +57,11 @@ def main(): dest="module", nargs=1, ) + parser.add_argument( + "-c", + dest="cmd", + nargs=1, + ) parser.add_argument( "--profile", action="store_true", @@ -72,9 +80,18 @@ def main(): args = parser.parse_args() + if args.cmd: + f = tempfile.NamedTemporaryFile(mode="w+b", suffix=".py") + f.write(args.cmd[0].encode()) + f.seek(0) + args.args.insert(0, f.name) + install() - with profile(args.profile, args.line_profile, args.args[0]) as fn: - args.args[0] = fn + + script_name = args.args[0] if len(args.args) > 0 else None + with profile(args.profile, args.line_profile, script_name) as fn: + if script_name is not None: + args.args[0] = fn if args.module: (module,) = args.module # run the module passing the remaining arguments @@ -85,6 +102,21 @@ def main(): # Remove ourself from argv and continue sys.argv[:] = args.args runpy.run_path(args.args[0], run_name="__main__") + else: + if sys.stdin.isatty(): + banner = f"Python {sys.version} on {sys.platform}" + site_import = not sys.flags.no_site + if site_import: + cprt = 'Type "help", "copyright", "credits" or "license" for more information.' + banner += "\n" + cprt + else: + # Don't show prompts or banners if stdin is not a TTY + sys.ps1 = "" + sys.ps2 = "" + banner = "" + + # Launch an interactive interpreter + code.interact(banner=banner, exitmsg="") if __name__ == "__main__": diff --git a/python/cudf/cudf/tests/test_onehot.py b/python/cudf/cudf/tests/test_onehot.py index cc17dc46e0a..e054143b438 100644 --- a/python/cudf/cudf/tests/test_onehot.py +++ b/python/cudf/cudf/tests/test_onehot.py @@ -161,3 +161,20 @@ def test_get_dummies_cats_deprecated(): df = cudf.DataFrame(range(3)) with pytest.warns(FutureWarning): cudf.get_dummies(df, cats={0: [0, 1, 2]}) + + +def test_get_dummies_drop_first_series(): + result = cudf.get_dummies(cudf.Series(list("abcaa")), drop_first=True) + expected = pd.get_dummies(pd.Series(list("abcaa")), drop_first=True) + assert_eq(result, expected) + + +def test_get_dummies_drop_first_dataframe(): + result = cudf.get_dummies( + cudf.DataFrame({"A": list("abcaa"), "B": list("bcaab")}), + drop_first=True, + ) + expected = pd.get_dummies( + pd.DataFrame({"A": list("abcaa"), "B": list("bcaab")}), drop_first=True + ) + assert_eq(result, expected) diff --git a/python/cudf/cudf/tests/text/test_text_methods.py b/python/cudf/cudf/tests/text/test_text_methods.py index 52179f55da3..997ca357986 100644 --- a/python/cudf/cudf/tests/text/test_text_methods.py +++ b/python/cudf/cudf/tests/text/test_text_methods.py @@ -946,6 +946,66 @@ def test_minhash(): strings.str.minhash64(seeds=seeds) +def test_word_minhash(): + ls = cudf.Series([["this", "is", "my"], ["favorite", "book"]]) + + expected = cudf.Series( + [ + cudf.Series([21141582], dtype=np.uint32), + cudf.Series([962346254], dtype=np.uint32), + ] + ) + actual = ls.str.word_minhash() + assert_eq(expected, actual) + seeds = cudf.Series([0, 1, 2], dtype=np.uint32) + expected = cudf.Series( + [ + cudf.Series([21141582, 1232889953, 1268336794], dtype=np.uint32), + cudf.Series([962346254, 2321233602, 1354839212], dtype=np.uint32), + ] + ) + actual = ls.str.word_minhash(seeds=seeds) + assert_eq(expected, actual) + + expected = cudf.Series( + [ + cudf.Series([2603139454418834912], dtype=np.uint64), + cudf.Series([5240044617220523711], dtype=np.uint64), + ] + ) + actual = ls.str.word_minhash64() + assert_eq(expected, actual) + seeds = cudf.Series([0, 1, 2], dtype=np.uint64) + expected = cudf.Series( + [ + cudf.Series( + [ + 2603139454418834912, + 8644371945174847701, + 5541030711534384340, + ], + dtype=np.uint64, + ), + cudf.Series( + [5240044617220523711, 5847101123925041457, 153762819128779913], + dtype=np.uint64, + ), + ] + ) + actual = ls.str.word_minhash64(seeds=seeds) + assert_eq(expected, actual) + + # test wrong seed types + with pytest.raises(ValueError): + ls.str.word_minhash(seeds="a") + with pytest.raises(ValueError): + seeds = cudf.Series([0, 1, 2], dtype=np.int32) + ls.str.word_minhash(seeds=seeds) + with pytest.raises(ValueError): + seeds = cudf.Series([0, 1, 2], dtype=np.uint32) + ls.str.word_minhash64(seeds=seeds) + + def test_jaccard_index(): str1 = cudf.Series(["the brown dog", "jumped about"]) str2 = cudf.Series(["the black cat", "jumped around"]) diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index 1627107b57d..1180da321e6 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -27,7 +27,7 @@ fsspec_parquet = None _BYTES_PER_THREAD_DEFAULT = 256 * 1024 * 1024 -_ROW_GROUP_SIZE_BYTES_DEFAULT = 128 * 1024 * 1024 +_ROW_GROUP_SIZE_BYTES_DEFAULT = np.iinfo(np.uint64).max _docstring_remote_sources = """ - cuDF supports local and remote data stores. See configuration details for @@ -275,10 +275,9 @@ timestamp[us] to the int96 format, which is the number of Julian days and the number of nanoseconds since midnight of 1970-01-01. If ``False``, timestamps will not be altered. -row_group_size_bytes: integer, default {row_group_size_bytes_val} +row_group_size_bytes: integer, default None Maximum size of each stripe of the output. - If None, {row_group_size_bytes_val} - ({row_group_size_bytes_val_in_mb} MB) will be used. + If None, no limit on row group stripe size will be used. row_group_size_rows: integer or None, default None Maximum number of rows of each stripe of the output. If None, 1000000 will be used. @@ -346,10 +345,7 @@ See Also -------- cudf.read_parquet -""".format( - row_group_size_bytes_val=_ROW_GROUP_SIZE_BYTES_DEFAULT, - row_group_size_bytes_val_in_mb=_ROW_GROUP_SIZE_BYTES_DEFAULT / 1024 / 1024, -) +""" doc_to_parquet = docfmt_partial(docstring=_docstring_to_parquet) _docstring_merge_parquet_filemetadata = """ diff --git a/python/cudf/cudf_pandas_tests/test_main.py b/python/cudf/cudf_pandas_tests/test_main.py new file mode 100644 index 00000000000..326224c8fc0 --- /dev/null +++ b/python/cudf/cudf_pandas_tests/test_main.py @@ -0,0 +1,100 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +import subprocess +import tempfile +import textwrap + + +def _run_python(*, cudf_pandas, command): + executable = "python " + if cudf_pandas: + executable += "-m cudf.pandas " + return subprocess.run( + executable + command, + shell=True, + capture_output=True, + check=True, + text=True, + ) + + +def test_run_cudf_pandas_with_script(): + with tempfile.NamedTemporaryFile(mode="w", suffix=".py", delete=True) as f: + code = textwrap.dedent( + """ + import pandas as pd + df = pd.DataFrame({'a': [1, 2, 3]}) + print(df['a'].sum()) + """ + ) + f.write(code) + f.flush() + + res = _run_python(cudf_pandas=True, command=f.name) + expect = _run_python(cudf_pandas=False, command=f.name) + + assert res.stdout != "" + assert res.stdout == expect.stdout + + +def test_run_cudf_pandas_with_script_with_cmd_args(): + input_args_and_code = """-c 'import pandas as pd; df = pd.DataFrame({"a": [1, 2, 3]}); print(df["a"].sum())'""" + + res = _run_python(cudf_pandas=True, command=input_args_and_code) + expect = _run_python(cudf_pandas=False, command=input_args_and_code) + + assert res.stdout != "" + assert res.stdout == expect.stdout + + +def test_run_cudf_pandas_with_script_with_cmd_args_check_cudf(): + """Verify that cudf is active with -m cudf.pandas.""" + input_args_and_code = """-c 'import pandas as pd; print(pd)'""" + + res = _run_python(cudf_pandas=True, command=input_args_and_code) + expect = _run_python(cudf_pandas=False, command=input_args_and_code) + + assert "cudf" in res.stdout + assert "cudf" not in expect.stdout + + +def test_cudf_pandas_script_repl(): + def start_repl_process(cmd): + return subprocess.Popen( + cmd.split(), + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + text=True, + ) + + def get_repl_output(process, commands): + for command in commands: + process.stdin.write(command) + process.stdin.flush() + return process.communicate() + + p1 = start_repl_process("python -m cudf.pandas") + p2 = start_repl_process("python") + commands = [ + "import pandas as pd\n", + "print(pd.Series(range(2)).sum())\n", + "print(pd.Series(range(5)).sum())\n", + "import sys\n", + "print(pd.Series(list('abcd')), out=sys.stderr)\n", + ] + + res = get_repl_output(p1, commands) + expect = get_repl_output(p2, commands) + + # Check stdout + assert res[0] != "" + assert res[0] == expect[0] + + # Check stderr + assert res[1] != "" + assert res[1] == expect[1] + + p1.kill() + p2.kill() diff --git a/python/dask_cudf/dask_cudf/expr/_collection.py b/python/dask_cudf/dask_cudf/expr/_collection.py index f60e4ff81ef..97e1dffc65b 100644 --- a/python/dask_cudf/dask_cudf/expr/_collection.py +++ b/python/dask_cudf/dask_cudf/expr/_collection.py @@ -49,8 +49,24 @@ def to_dask_dataframe(self, **kwargs): return self.to_backend("pandas", **kwargs) + def _prepare_cov_corr(self, min_periods, numeric_only): + # Upstream version of this method sets min_periods + # to 2 by default (which is not supported by cudf) + # TODO: Remove when cudf supports both min_periods + # and numeric_only + # See: https://github.com/rapidsai/cudf/issues/12626 + # See: https://github.com/rapidsai/cudf/issues/9009 + self._meta.cov(min_periods=min_periods) + + frame = self + if numeric_only: + numerics = self._meta._get_numeric_data() + if len(numerics.columns) != len(self.columns): + frame = frame[list(numerics.columns)] + return frame, min_periods + # var can be removed if cudf#15179 is addressed. - # See: https://github.com/rapidsai/cudf/issues/15179 + # See: https://github.com/rapidsai/cudf/issues/14935 def var( self, axis=0, diff --git a/python/dask_cudf/dask_cudf/io/parquet.py b/python/dask_cudf/dask_cudf/io/parquet.py index e793d4381d1..a781b8242fe 100644 --- a/python/dask_cudf/dask_cudf/io/parquet.py +++ b/python/dask_cudf/dask_cudf/io/parquet.py @@ -23,7 +23,6 @@ from cudf.io import write_to_dataset from cudf.io.parquet import _apply_post_filters, _normalize_filters from cudf.utils.dtypes import cudf_dtype_from_pa_type -from cudf.utils.ioutils import _ROW_GROUP_SIZE_BYTES_DEFAULT class CudfEngine(ArrowDatasetEngine): @@ -341,9 +340,7 @@ def write_partition( return_metadata=return_metadata, statistics=kwargs.get("statistics", "ROWGROUP"), int96_timestamps=kwargs.get("int96_timestamps", False), - row_group_size_bytes=kwargs.get( - "row_group_size_bytes", _ROW_GROUP_SIZE_BYTES_DEFAULT - ), + row_group_size_bytes=kwargs.get("row_group_size_bytes", None), row_group_size_rows=kwargs.get("row_group_size_rows", None), max_page_size_bytes=kwargs.get("max_page_size_bytes", None), max_page_size_rows=kwargs.get("max_page_size_rows", None), @@ -365,7 +362,7 @@ def write_partition( statistics=kwargs.get("statistics", "ROWGROUP"), int96_timestamps=kwargs.get("int96_timestamps", False), row_group_size_bytes=kwargs.get( - "row_group_size_bytes", _ROW_GROUP_SIZE_BYTES_DEFAULT + "row_group_size_bytes", None ), row_group_size_rows=kwargs.get( "row_group_size_rows", None diff --git a/python/dask_cudf/dask_cudf/tests/test_core.py b/python/dask_cudf/dask_cudf/tests/test_core.py index 905d8c08135..7aa0f6320f2 100644 --- a/python/dask_cudf/dask_cudf/tests/test_core.py +++ b/python/dask_cudf/dask_cudf/tests/test_core.py @@ -1007,3 +1007,20 @@ def test_to_backend_simplify(): df2 = df.to_backend("cudf")[["y"]].simplify() df3 = df[["y"]].to_backend("cudf").to_backend("cudf").simplify() assert df2._name == df3._name + + +@pytest.mark.parametrize("numeric_only", [True, False]) +@pytest.mark.parametrize("op", ["corr", "cov"]) +def test_cov_corr(op, numeric_only): + df = cudf.DataFrame.from_dict( + { + "x": np.random.randint(0, 5, size=10), + "y": np.random.normal(size=10), + } + ) + ddf = dd.from_pandas(df, npartitions=2) + res = getattr(ddf, op)(numeric_only=numeric_only) + # Use to_pandas until cudf supports numeric_only + # (See: https://github.com/rapidsai/cudf/issues/12626) + expect = getattr(df.to_pandas(), op)(numeric_only=numeric_only) + dd.assert_eq(res, expect) diff --git a/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd b/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd index 0c352a5068b..f2dd22f43aa 100644 --- a/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd @@ -19,3 +19,13 @@ cdef extern from "nvtext/minhash.hpp" namespace "nvtext" nogil: const column_view &seeds, const size_type width, ) except + + + cdef unique_ptr[column] word_minhash( + const column_view &input, + const column_view &seeds + ) except + + + cdef unique_ptr[column] word_minhash64( + const column_view &input, + const column_view &seeds + ) except +