From 4fddb30f61f027747b12919888d3e3a66e08b623 Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Fri, 23 Aug 2024 07:42:53 -0700 Subject: [PATCH 01/24] Move tests and ci files to cudf --- .../ci/check_style.sh | 18 + .../ci/ci_run_library_tests.sh | 65 ++++ .../ci/extract_lib.sh | 28 ++ .../ci/release/update-version.sh | 41 ++ .../third_party_integration_tests/ci/test.sh | 55 +++ .../third_party_integration_tests/conftest.py | 173 +++++++++ .../third_party_integration_tests/pytest.ini | 7 + .../run_library_tests.sh | 11 + .../test_cugraph.py | 94 +++++ .../test_cuml.py | 152 ++++++++ .../test_dask.py | 10 + .../test_featureengine.py | 47 +++ .../test_holoviews.py | 79 ++++ .../test_hvplot.py | 72 ++++ .../test_ibis.py | 169 ++++++++ .../test_matplotlib.py | 70 ++++ .../test_numpy.py | 59 +++ .../test_plotly.py | 67 ++++ .../test_pytorch.py | 126 ++++++ .../test_scipy.py | 65 ++++ .../test_seaborn.py | 60 +++ .../test_sklearn.py | 82 ++++ .../test_stumpy.py | 94 +++++ .../test_stumpy_distributed.py | 48 +++ .../test_tensorflow.py | 367 ++++++++++++++++++ .../test_xgboost.py | 135 +++++++ 26 files changed, 2194 insertions(+) create mode 100755 python/cudf/cudf_pandas_tests/third_party_integration_tests/ci/check_style.sh create mode 100755 python/cudf/cudf_pandas_tests/third_party_integration_tests/ci/ci_run_library_tests.sh create mode 100755 python/cudf/cudf_pandas_tests/third_party_integration_tests/ci/extract_lib.sh create mode 100755 python/cudf/cudf_pandas_tests/third_party_integration_tests/ci/release/update-version.sh create mode 100755 python/cudf/cudf_pandas_tests/third_party_integration_tests/ci/test.sh create mode 100644 python/cudf/cudf_pandas_tests/third_party_integration_tests/conftest.py create mode 100644 python/cudf/cudf_pandas_tests/third_party_integration_tests/pytest.ini create mode 100755 python/cudf/cudf_pandas_tests/third_party_integration_tests/run_library_tests.sh create mode 100644 
python/cudf/cudf_pandas_tests/third_party_integration_tests/test_cugraph.py create mode 100644 python/cudf/cudf_pandas_tests/third_party_integration_tests/test_cuml.py create mode 100644 python/cudf/cudf_pandas_tests/third_party_integration_tests/test_dask.py create mode 100644 python/cudf/cudf_pandas_tests/third_party_integration_tests/test_featureengine.py create mode 100644 python/cudf/cudf_pandas_tests/third_party_integration_tests/test_holoviews.py create mode 100644 python/cudf/cudf_pandas_tests/third_party_integration_tests/test_hvplot.py create mode 100644 python/cudf/cudf_pandas_tests/third_party_integration_tests/test_ibis.py create mode 100644 python/cudf/cudf_pandas_tests/third_party_integration_tests/test_matplotlib.py create mode 100644 python/cudf/cudf_pandas_tests/third_party_integration_tests/test_numpy.py create mode 100644 python/cudf/cudf_pandas_tests/third_party_integration_tests/test_plotly.py create mode 100644 python/cudf/cudf_pandas_tests/third_party_integration_tests/test_pytorch.py create mode 100644 python/cudf/cudf_pandas_tests/third_party_integration_tests/test_scipy.py create mode 100644 python/cudf/cudf_pandas_tests/third_party_integration_tests/test_seaborn.py create mode 100644 python/cudf/cudf_pandas_tests/third_party_integration_tests/test_sklearn.py create mode 100644 python/cudf/cudf_pandas_tests/third_party_integration_tests/test_stumpy.py create mode 100644 python/cudf/cudf_pandas_tests/third_party_integration_tests/test_stumpy_distributed.py create mode 100644 python/cudf/cudf_pandas_tests/third_party_integration_tests/test_tensorflow.py create mode 100644 python/cudf/cudf_pandas_tests/third_party_integration_tests/test_xgboost.py diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/ci/check_style.sh b/python/cudf/cudf_pandas_tests/third_party_integration_tests/ci/check_style.sh new file mode 100755 index 00000000000..b81b36ddb45 --- /dev/null +++ 
#!/bin/bash
# Copyright (c) 2020-2024, NVIDIA CORPORATION.

# Style-check entry point: generates a conda environment file for the
# "checks" file-key, creates and activates that environment, then runs every
# pre-commit hook against all files.

set -euo pipefail

rapids-logger "Create checks conda environment"
. /opt/conda/etc/profile.d/conda.sh

rapids-dependency-file-generator \
  --output conda \
  --file-key checks \
  --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" | tee env.yaml

rapids-mamba-retry env create --yes -f env.yaml -n checks

# Temporarily allow unbound variables for conda activation, mirroring what
# the test script in this directory does around `conda activate`.
set +u
conda activate checks
set -u

# Run pre-commit checks
pre-commit run --hook-stage manual --all-files --show-diff-on-failure
# Remove the pickled result files left behind by the test passes.
# Installed as the EXIT trap, so it also runs when no pickle was ever
# written; `-f` keeps that case silent instead of printing an rm error.
# Uses TEST_DIR (the directory the test files are run from) when it is set
# and falls back to the original cwd-relative "tests" directory otherwise.
cleanup() {
    rm -f "${TEST_DIR:-tests}"/results-*.pickle
}
# Print the JSON array of library names that have a "test_<lib>" entry under
# the "files" section of a dependencies.yaml, and record it as the LIBS
# output via write_output.
#   $1 - path to dependencies.yaml
extract_lib_from_dependencies_yaml() {
    local file="$1"
    # Declare separately from the assignment: `local var="$(cmd)"` returns the
    # status of `local`, which would hide a yq/jq failure from `set -e`.
    local extracted_libs
    # Keep only the keys under "files" that start with "test_" and strip
    # that prefix, leaving the bare library names.
    extracted_libs="$(yq -o json "$file" | jq -rc '.files | with_entries( select(.key | startswith("test_")) ) | keys | map(sub("^test_"; ""))')"
    echo "$extracted_libs"
    write_output "LIBS" "$extracted_libs"
}
# Inplace sed replace; workaround for Linux and Mac
# (always passes a backup suffix to -i, then deletes the backup file).
#   $1 - sed expression to apply
#   $2 - file to edit in place
function sed_runner() {
    # Quote both arguments so paths containing spaces or glob characters are
    # passed through unchanged.
    sed -i.bak "$1" "$2" && rm -f "${2}.bak"
}
+set +u +conda activate test +set -u + +RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${PWD}/test-results"} +mkdir -p "${RAPIDS_TESTS_DIR}" + +repo_root=$(git rev-parse --show-toplevel) +TEST_DIR=${repo_root}/tests + +rapids-print-env + +rapids-logger "Check GPU usage" +nvidia-smi + +EXITCODE=0 +trap "EXITCODE=1" ERR +set +e + +rapids-logger "pytest ${LIB}" + +NUM_PROCESSES=8 +serial_libraries=( + "tensorflow" +) +for serial_library in "${serial_libraries[@]}"; do + if [ "${LIB}" = "${serial_library}" ]; then + NUM_PROCESSES=1 + fi +done + +RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR} TEST_DIR=${TEST_DIR} NUM_PROCESSES=${NUM_PROCESSES} ci/ci_run_library_tests.sh ${LIB} + +rapids-logger "Test script exiting with value: ${EXITCODE}" +exit ${EXITCODE} diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/conftest.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/conftest.py new file mode 100644 index 00000000000..33b6ffdbd5c --- /dev/null +++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/conftest.py @@ -0,0 +1,173 @@ +# Copyright (c) 2023-2024, NVIDIA CORPORATION. 
def pytest_collection_modifyitems(
    session, config: _pytest.config.Config, items: list[_pytest.nodes.Item]
):
    """Turn the pass-specific ``xfail_*`` mark of each test into a real ``xfail``.

    The current pass is "compare" when ``--compare`` was given, "cudf_pandas"
    when ``cudf.pandas`` is among the requested plugins, and "gold" otherwise.
    Only the custom mark belonging to the current pass is converted; the
    custom marks of the other passes are left untouched.
    """
    if config.getoption("--compare"):
        pass_name = "compare"
    else:
        accelerated = "cudf.pandas" in config.option.plugins
        pass_name = "cudf_pandas" if accelerated else "gold"

    # Custom mark that applies to each pass.
    mark_for_pass = {
        "gold": "xfail_gold",
        "cudf_pandas": "xfail_cudf_pandas",
        "compare": "xfail_compare",
    }
    custom_name = mark_for_pass[pass_name]

    def promote_to_xfail(test_item: _pytest.nodes.Item):
        """Swap the custom mark for an ``xfail`` mark carrying the same kwargs."""
        replacement = pytest.mark.xfail(
            **test_item.keywords[custom_name].kwargs
        )
        # Substitute the mark on every node of the chain that owns it; nodes
        # that do not carry the mark are left unmodified.
        for owner, found in test_item.iter_markers_with_node(custom_name):
            position = owner.own_markers.index(found)
            owner.own_markers[position] = replacement

    for test_item in items:
        if custom_name in test_item.keywords:
            promote_to_xfail(test_item)
def pytest_pyfunc_call(pyfuncitem: _pytest.python.Function):
    """Replace pytest's default test call with a record-or-compare step.

    With ``--compare``: look up the results pickled for this test's node id
    by the gold and cudf.pandas passes and compare them, using the ``fn``
    keyword of the closest ``assert_eq`` marker when one is present and plain
    ``==`` otherwise.

    Without ``--compare``: call the test function with its fixture arguments
    and append ``(nodeid, result)`` to the pickle file opened for this pass.

    Returns ``True`` so that pytest does not call the test function itself.

    Raises:
        AssertionError: a recorded result is missing, or the results differ.
        ValueError: both recorded results are ``None``.
    """
    if pyfuncitem.config.getoption("--compare"):
        gold_results, cudf_results = pyfuncitem.config.stash[results]
        key = pyfuncitem.nodeid
        # Raise explicitly rather than `assert False` inside an `except`
        # block: the failure no longer carries a chained KeyError and does
        # not depend on assert statements being enabled.
        if key not in gold_results:
            raise AssertionError("pickled gold result is not available")
        if key not in cudf_results:
            raise AssertionError("pickled cudf result is not available")
        gold = gold_results[key]
        cudf = cudf_results[key]
        if gold is None and cudf is None:
            raise ValueError(f"Integration test {key} did not return a value")
        asserter = pyfuncitem.get_closest_marker("assert_eq")
        if asserter is None:
            assert gold == cudf, "Test failed"
        else:
            asserter.kwargs["fn"](gold, cudf)
    else:
        # Replace default call of test function with one that captures the
        # result
        testfunction = pyfuncitem.obj
        funcargs = pyfuncitem.funcargs
        testargs = {
            arg: funcargs[arg] for arg in pyfuncitem._fixtureinfo.argnames
        }
        result = testfunction(**testargs)
        # Tuple-based key-value pairs, key is the node-id.
        # Serialize to bytes first and write only on success: pickling
        # directly into the shared file could leave a truncated record behind
        # if it failed part-way, which would break reading every later record.
        try:
            payload = pickle.dumps((pyfuncitem.nodeid, result))
        except pickle.PicklingError:
            # Best effort: an unpicklable result is simply not recorded.
            pass
        else:
            pyfuncitem.config.stash[file_handle_key].write(payload)
    return True
config.stash[test_folder_key], + f"{config.stash[basename_key]}-gw{i}.pickle", + ) + with open(worker_result, "rb") as f: + config.stash[file_handle_key].write(f.read()) + os.remove(worker_result) + # Close our file + del config.stash[file_handle_key] diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/pytest.ini b/python/cudf/cudf_pandas_tests/third_party_integration_tests/pytest.ini new file mode 100644 index 00000000000..817d98e6ba2 --- /dev/null +++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/pytest.ini @@ -0,0 +1,7 @@ +[pytest] +xfail_strict=true +markers= + assert_eq: custom binary asserter for a test + xfail_gold: this test is expected to fail in the gold pass + xfail_cudf_pandas: this test is expected to fail in the cudf_pandas pass + xfail_compare: this test is expected to fail in the comparison pass diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/run_library_tests.sh b/python/cudf/cudf_pandas_tests/third_party_integration_tests/run_library_tests.sh new file mode 100755 index 00000000000..dafd2e77761 --- /dev/null +++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/run_library_tests.sh @@ -0,0 +1,11 @@ +#!/bin/bash +# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. +# All rights reserved. 
def assert_cugraph_equal(expect, got):
    """Assert that two recorded cugraph/networkx results are equal.

    cupy arrays on either side are first copied to the host. Two numpy
    arrays are compared with ``np.array_equal``; anything else is compared
    with ``==``.
    """
    # Coerce GPU arrays to CPU
    if isinstance(expect, cp.ndarray):
        expect = expect.get()
    if isinstance(got, cp.ndarray):
        got = got.get()

    # This must be a fresh `if`, not an `elif` of the coercion above;
    # otherwise nothing at all is compared whenever `got` was a cupy array.
    if isinstance(expect, np.ndarray) and isinstance(got, np.ndarray):
        assert np.array_equal(expect, got)
    else:
        assert expect == got
@pytest.mark.parametrize("algo", cugraph_algos)
def test_cugraph_from_numpy_array(df, algo):
    """Build a cugraph graph from the frame's numpy values and run ``algo`` on it.

    Returns the algorithm output, converted to pandas, as a numpy array.
    """
    # NOTE(review): `df` is the source/destination fixture; confirm that
    # passing its raw values to `from_numpy_array` is the intended input.
    graph = cugraph.Graph()
    graph.from_numpy_array(df.values)
    algorithm = getattr(cugraph, algo)
    result_frame = algorithm(graph).to_pandas()
    return result_frame.values
def assert_cuml_equal(expect, got):
    """Assert that two recorded cuml results are equal.

    cupy arrays on either side are first copied to the host. ``KMeans``
    models are compared by their cluster centers, numpy arrays with
    ``assert_allclose``, tuples element by element, pandas objects with the
    pandas testing helpers, and anything else with ``==``.
    """
    # Coerce GPU arrays to CPU
    if isinstance(expect, cp.ndarray):
        expect = expect.get()
    if isinstance(got, cp.ndarray):
        got = got.get()

    # Handle equality
    if isinstance(expect, KMeans) and isinstance(got, KMeans):
        # same clusters
        np.testing.assert_allclose(
            expect.cluster_centers_, got.cluster_centers_
        )
    elif isinstance(expect, np.ndarray) and isinstance(got, np.ndarray):
        np.testing.assert_allclose(expect, got)
    elif isinstance(expect, tuple) and isinstance(got, tuple):
        assert len(expect) == len(got)
        for e, g in zip(expect, got):
            assert_cuml_equal(e, g)
    elif isinstance(expect, pd.DataFrame):
        # The pandas helpers raise on mismatch and return None on success,
        # so they must not be wrapped in `assert` (that would always fail).
        pd.testing.assert_frame_equal(expect, got)
    elif isinstance(expect, pd.Series):
        pd.testing.assert_series_equal(expect, got)
    else:
        assert expect == got
def test_pipeline(binary_classification_data):
    """Fit a scaler -> PCA -> logistic-regression pipeline and predict on the training data.

    Returns the predictions as an array via ``.values``.
    """
    X = binary_classification_data[["feature1", "feature2"]]
    y = binary_classification_data["target"]

    pipe = Pipeline(
        [
            ("scaler", StandardScaler()),
            ("pca", PCA()),
            # The final estimator is a LogisticRegression, so label the step
            # accordingly (it was previously labelled "random_forest").
            ("logistic_regression", LogisticRegression()),
        ]
    )

    pipe.fit(X, y)
    results = pipe.predict(X)
    return results.values
def test_sum():
    """Sum the integers 1..10 held in a two-partition dask DataFrame."""
    frame = pd.DataFrame({"x": range(1, 11)})
    ddf = dd.from_pandas(frame, npartitions=2)
    total = ddf["x"].sum()
    return total.compute()
def test_match_variables():
    """Fit ``MatchVariables`` on a training frame and transform a test frame.

    The test frame lacks the training frame's "City" column and has an extra
    "Hobbies" column. Returns the transformed test frame.
    """
    train_columns = {
        "Name": ["tom", "nick", "krish", "jack"],
        "City": ["London", "Manchester", "Liverpool", "Bristol"],
        "Age": [20, 21, 19, 18],
        "Marks": [0.9, 0.8, 0.7, 0.6],
    }
    test_columns = {
        "Name": ["tom", "sam", "nick"],
        "Age": [20, 22, 23],
        "Marks": [0.9, 0.7, 0.6],
        "Hobbies": ["tennis", "rugby", "football"],
    }
    train = pd.DataFrame(train_columns)
    test = pd.DataFrame(test_columns)

    matcher = MatchVariables()
    matcher.fit(train)
    return matcher.transform(test)
def assert_holoviews_equal(expect, got):
    """Assert that two recorded plot-info tuples describe the same plot.

    Each argument is a ``(data, ndims, kdims, vdims, shape)`` tuple. When the
    expected data is a dict, its "x" and "Frequency" entries are compared
    with ``assert_allclose``; otherwise the data is compared as a DataFrame.
    The remaining four fields are compared with ``==``.
    """
    expect_data, expect_ndims, expect_kdims, expect_vdims, expect_shape = (
        expect
    )
    got_data, got_ndims, got_kdims, got_vdims, got_shape = got

    if isinstance(expect_data, dict):
        np.testing.assert_allclose(expect_data["x"], got_data["x"])
        np.testing.assert_allclose(
            expect_data["Frequency"], got_data["Frequency"]
        )
    else:
        # Use the public pandas testing API rather than the private
        # `pd._testing` module.
        pd.testing.assert_frame_equal(expect_data, got_data)
    assert expect_ndims == got_ndims
    assert expect_kdims == got_kdims
    assert expect_vdims == got_vdims
    assert expect_shape == got_shape
def assert_hvplot_equal(expect, got):
    """Assert that two recorded plot-info tuples describe the same plot.

    Each argument is a ``(data, ndims, kdims, vdims, shape)`` tuple. When the
    expected data is a dict, its "x" and "Frequency" entries are compared
    with ``assert_allclose``; otherwise the data is compared as a DataFrame.
    The remaining four fields are compared with ``==``.
    """
    expect_data, expect_ndims, expect_kdims, expect_vdims, expect_shape = (
        expect
    )
    got_data, got_ndims, got_kdims, got_vdims, got_shape = got

    if isinstance(expect_data, dict):
        np.testing.assert_allclose(expect_data["x"], got_data["x"])
        np.testing.assert_allclose(
            expect_data["Frequency"], got_data["Frequency"]
        )
    else:
        # Use the public pandas testing API rather than the private
        # `pd._testing` module.
        pd.testing.assert_frame_equal(expect_data, got_data)
    assert expect_ndims == got_ndims
    assert expect_kdims == got_kdims
    assert expect_vdims == got_vdims
    assert expect_shape == got_shape
@pytest.fixture
def ibis_table_num_str():
    """Return an ibis memtable named "t" with numeric, key and string columns.

    The backing frame has 1000 rows: five integer columns ``col0``..``col4``
    drawn from [0, 100), a ``key`` column drawn from 0..9, and a ``str_col``
    column drawn from five fixed words. All draws come from one generator
    seeded with 42.
    """
    n_rows = 1000
    n_cols = 5
    rng = np.random.default_rng(42)

    values = rng.integers(0, 100, (n_rows, n_cols))
    names = [f"col{x}" for x in np.arange(n_cols)]
    frame = pd.DataFrame(values, columns=names)
    frame["key"] = rng.choice(np.arange(10), n_rows)
    frame["str_col"] = rng.choice(
        ["Hello", "World", "It's", "Me", "Again"], n_rows
    )
    return ibis.memtable(frame, name="t")
t.mutate(col1_sin=expr).to_pandas() + + +@pytest.mark.parametrize("op", STRING_UNARY_FUNCS) +def test_string_unary(ibis_table_num_str, op): + t = ibis_table_num_str + return getattr(t.str_col, op)().to_pandas() + + +def test_nunique(ibis_table_num_str): + t = ibis_table_num_str + return t.col1.nunique().to_pandas() + + +def test_count(ibis_table_num_str): + t = ibis_table_num_str + return t.col1.count().to_pandas() + + +def test_select(ibis_table_num_str): + t = ibis_table_num_str + return t.select("col0", "col1").to_pandas() + + +def test_between(ibis_table_num_str): + t = ibis_table_num_str + return t.key.between(4, 8).to_pandas() + + +def test_notin(ibis_table_num_str): + t = ibis_table_num_str + return t.key.notin([0, 1, 8, 3]).to_pandas() + + +def test_window(ibis_table_num_str): + t = ibis_table_num_str + return ( + t.group_by("key").mutate(demeaned=t.col1 - t.col1.mean()).to_pandas() + ) + + +def test_limit(ibis_table_num_str): + t = ibis_table_num_str + return t.limit(5).to_pandas() + + +def test_filter(ibis_table_num_str): + t = ibis_table_num_str + return t.filter([t.key == 4, t.col0 > 15]).to_pandas() + + +@pytest.mark.skip(reason="Join ordering not currently guaranteed, i.e., flaky") +@pytest.mark.parametrize("join_type", ["inner", "left", "right"]) +def test_join_exact_ordering(ibis_table_num_str, ibis_table_num, join_type): + t1 = ibis_table_num_str + t2 = ibis_table_num + res = t1.join(t2, "key", how=join_type).to_pandas() + return res + + +@pytest.mark.parametrize("join_type", ["inner", "left", "right"]) +def test_join_sort_correctness(ibis_table_num_str, ibis_table_num, join_type): + """ + While we don't currently guarantee exact row ordering + we can still test join correctness with ex-post sorting. 
def assert_plots_equal(expect, got):
    """Assert that two plotting results are equivalent.

    * Two ``Axes``: they must have the same number of children, the children
      must pairwise have the same type, ``Line2D`` children must have equal
      x/y data, and ``Rectangle`` children must have equal heights.  Other
      child types are only checked for type.
    * Two ``PathCollection`` objects: the x and y columns of their offsets
      must be equal.
    * Anything else is delegated to ``pandas._testing.assert_equal``.
    """
    if isinstance(expect, Axes) and isinstance(got, Axes):
        expect_children = expect.get_children()
        got_children = got.get_children()
        # zip() stops at the shorter sequence, so a missing or extra artist
        # would otherwise go unnoticed.
        assert len(expect_children) == len(got_children)
        for expect_ch, got_ch in zip(expect_children, got_children):
            assert type(expect_ch) is type(got_ch)
            if isinstance(expect_ch, Line2D):
                assert_equal(expect_ch.get_xdata(), got_ch.get_xdata())
                assert_equal(expect_ch.get_ydata(), got_ch.get_ydata())
            elif isinstance(expect_ch, Rectangle):
                assert expect_ch.get_height() == got_ch.get_height()
    elif isinstance(expect, PathCollection) and isinstance(
        got, PathCollection
    ):
        assert_equal(expect.get_offsets()[:, 0], got.get_offsets()[:, 0])
        assert_equal(expect.get_offsets()[:, 1], got.get_offsets()[:, 1])
    else:
        assert_equal(expect, got)
def test_line():
    """Plot ``y`` against ``x`` from a DataFrame and return the Axes.

    Returns
    -------
    The ``Axes`` that the line was drawn on.
    """
    df = pd.DataFrame({"x": [1, 2, 3, 4, 5], "y": [2, 4, 6, 8, 10]})

    # Draw on a dedicated figure rather than whatever "current" Axes pyplot
    # happens to hold, so the returned Axes contains only this test's line
    # regardless of which tests ran before it.
    _, ax = plt.subplots()
    ax.plot(df["x"], df["y"], marker="o", linestyle="-")

    return ax
+ +import numpy as np +import pandas as pd +import pytest + +nsamps = 1000 +reductions = ["sum", "min", "max", "mean", "var", "std"] + + +pytestmark = pytest.mark.assert_eq(fn=np.testing.assert_allclose) + + +@pytest.fixture(scope="module") +def sr(): + rng = np.random.default_rng(42) + return pd.Series(rng.random(nsamps)) + + +@pytest.mark.parametrize("op", reductions) +def test_numpy_series_reductions(sr, op): + return getattr(np, op)(sr) + + +@pytest.fixture(scope="module") +def df(): + rng = np.random.default_rng(42) + return pd.DataFrame({"A": rng.random(nsamps), "B": rng.random(nsamps)}) + + +@pytest.mark.parametrize("op", reductions) +def test_numpy_dataframe_reductions(df, op): + return getattr(np, op)(df) + + +def test_numpy_dot(df): + return np.dot(df, df.T) + + +def test_numpy_fft(sr): + fft = np.fft.fft(sr) + return fft + + +def test_numpy_sort(df): + return np.sort(df) + + +@pytest.mark.parametrize("percentile", [0, 25, 50, 75, 100]) +def test_numpy_percentile(df, percentile): + return np.percentile(df, percentile) + + +def test_numpy_unique(df): + return np.unique(df) + + +def test_numpy_transpose(df): + return np.transpose(df) diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/test_plotly.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/test_plotly.py new file mode 100644 index 00000000000..27d9df83476 --- /dev/null +++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/test_plotly.py @@ -0,0 +1,67 @@ +# Copyright (c) 2023-2024, NVIDIA CORPORATION. 
def assert_plotly_equal(expect, got):
    """Recursively assert that two plotly JSON structures are equal.

    Both values must have exactly the same type.  Then:

    * ``dict``: same key set, and values compared recursively per key.
    * ``list``: same length, and elements compared recursively in order.
    * ``np.ndarray``: numeric arrays are compared with
      ``np.testing.assert_allclose``; other dtypes (e.g. strings) are
      compared exactly, since a tolerance only makes sense for numbers.
    * anything else: compared with ``==``.

    Raises
    ------
    AssertionError
        If any part of the two structures differs.
    """
    assert type(expect) is type(got)
    if isinstance(expect, dict):
        assert expect.keys() == got.keys()
        for key in expect:
            assert_plotly_equal(expect[key], got[key])
    elif isinstance(expect, list):
        assert len(expect) == len(got)
        for expect_item, got_item in zip(expect, got):
            assert_plotly_equal(expect_item, got_item)
    elif isinstance(expect, np.ndarray):
        if expect.dtype.kind in "biufc" and got.dtype.kind in "biufc":
            np.testing.assert_allclose(expect, got)
        else:
            np.testing.assert_array_equal(expect, got)
    else:
        assert expect == got
class Model(torch.nn.Module):
    """Small fully connected classifier over two scalar features.

    The network is ``Linear(2, 10) -> ReLU -> Linear(10, 10) -> ReLU ->
    Linear(10, 4)`` followed by a softmax over the four outputs.
    """

    def __init__(self):
        super().__init__()
        self.fc1 = torch.nn.Linear(2, 10)
        self.relu1 = torch.nn.ReLU()
        self.fc2 = torch.nn.Linear(10, 10)
        self.relu2 = torch.nn.ReLU()
        self.output = torch.nn.Linear(10, 4)

    def forward(self, x1, x2):
        """Return per-row class probabilities.

        Parameters
        ----------
        x1, x2
            1-D tensors of equal length ``N``; they are stacked and
            transposed into an ``(N, 2)`` input.

        Returns
        -------
        Tensor of shape ``(N, 4)`` whose rows sum to 1.
        """
        x = torch.stack([x1, x2], dim=0).T
        x = self.fc1(x)
        x = self.relu1(x)
        x = self.fc2(x)
        x = self.relu2(x)
        # Project onto the 4 outputs; previously this layer was constructed
        # but never applied, so the model emitted 10 columns.
        x = self.output(x)
        return torch.nn.functional.softmax(x, dim=1)
def test_torch_train(data):
    """Train ``Model`` for one pass over the data and return a prediction.

    The fixture arrays are wrapped in ``pd.Series`` and batched by a
    ``DataLoader``; the model is trained with SGD and cross-entropy loss on
    the ``"cuda"`` device.

    Returns
    -------
    The model output for the first batch of the dataloader.
    """
    torch.manual_seed(0)

    x1, x2, y = (pd.Series(i) for i in data)
    dataset = Dataset(x1, x2, y)
    # default collate_fn
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=10)

    model = Model().to("cuda")
    optimizer = torch.optim.SGD(model.parameters(), lr=0.001)
    criterion = torch.nn.CrossEntropyLoss()

    train(model, dataloader, optimizer, criterion)

    test_x1, test_x2 = next(iter(dataloader))[0]
    test_x1 = test_x1.to("cuda")
    test_x2 = test_x2.to("cuda")

    # Predict in inference mode: train() left the module in training mode,
    # and the returned tensor does not need an autograd graph attached.
    model.eval()
    with torch.no_grad():
        return model(test_x1, test_x2)
def test_regression():
    """Fit a simple linear regression of ``y`` on ``x`` with SciPy.

    Returns
    -------
    tuple
        ``(slope, intercept)`` of the least-squares line.
    """
    data = pd.DataFrame({"x": [1, 2, 3, 4, 5], "y": [2, 4, 5, 4, 5]})
    # Regress y on x; passing "y" for both arguments would only fit the
    # trivial identity line and leave column "x" unused.
    result = scipy.stats.linregress(data["x"], data["y"])
    return result.slope, result.intercept
def assert_plots_equal(expect, got):
    """Assert that two plotting results are equivalent.

    * Two ``Axes``: they must have the same number of children, the children
      must pairwise have the same type, ``Line2D`` children must have equal
      x/y data, and ``Rectangle`` children must have equal heights.  Other
      child types are only checked for type.
    * Two ``PathCollection`` objects: the x and y columns of their offsets
      must be equal.
    * Anything else is delegated to ``pandas._testing.assert_equal``.
    """
    if isinstance(expect, Axes) and isinstance(got, Axes):
        expect_children = expect.get_children()
        got_children = got.get_children()
        # zip() stops at the shorter sequence, so a missing or extra artist
        # would otherwise go unnoticed.
        assert len(expect_children) == len(got_children)
        for expect_ch, got_ch in zip(expect_children, got_children):
            assert type(expect_ch) is type(got_ch)
            if isinstance(expect_ch, Line2D):
                assert_equal(expect_ch.get_xdata(), got_ch.get_xdata())
                assert_equal(expect_ch.get_ydata(), got_ch.get_ydata())
            elif isinstance(expect_ch, Rectangle):
                assert expect_ch.get_height() == got_ch.get_height()
    elif isinstance(expect, PathCollection) and isinstance(
        got, PathCollection
    ):
        assert_equal(expect.get_offsets()[:, 0], got.get_offsets()[:, 0])
        assert_equal(expect.get_offsets()[:, 1], got.get_offsets()[:, 1])
    else:
        assert_equal(expect, got)
def test_feature_selection():
    """Select the 5 best features with ``SelectKBest`` and return their names.

    Random features and a random binary target are placed in a pandas
    DataFrame, the selector is fitted on those pandas objects, and the
    selected column names are returned sorted.
    """
    rng = np.random.default_rng(42)
    n_samples = 100
    n_features = 10

    X = rng.random((n_samples, n_features))
    y = rng.integers(0, 2, size=n_samples)

    feature_names = [f"feature{i}" for i in range(1, n_features + 1)]
    data = pd.DataFrame(X, columns=feature_names)
    data["target"] = y

    # Select the top k features.  Fit on the pandas objects rather than the
    # raw NumPy arrays so that the DataFrame/Series are what scikit-learn
    # actually consumes; only the fitted support mask is needed afterwards.
    k_best = SelectKBest(score_func=f_classif, k=5)
    k_best.fit(data[feature_names], data["target"])

    feat_inds = k_best.get_support(indices=True)
    features = data.iloc[:, feat_inds]

    return sorted(features.columns.tolist())
def stumpy_assert_equal(expected, got):
    """Compare two stumpy results after casting everything to ``float64``.

    Tuples and lists are converted (recursively) into lists of their
    converted elements; every other object is cast with
    ``.astype(np.float64)``.  The two normalized structures are then
    compared with ``pandas._testing.assert_equal``.
    """

    def _to_f64(obj):
        if not isinstance(obj, (tuple, list)):
            return obj.astype(np.float64)
        return [_to_f64(item) for item in obj]

    normalized_expected = _to_f64(expected)
    normalized_got = _to_f64(got)
    assert_equal(normalized_expected, normalized_got)
def test_semantic_segmentation():
    """Run ``stumpy.fluss`` on the ``stumpy.stump`` output of a random series.

    A seeded 10000-point random series is passed to ``stumpy.stump`` with a
    window of 50; column 1 of the result is then passed to ``stumpy.fluss``
    with ``L=50``, ``n_regimes=2`` and ``excl_factor=1``.

    Returns whatever ``stumpy.fluss`` returns.
    """
    rng = np.random.default_rng(42)
    your_time_series = rng.random(10000)
    window_size = (
        50  # Approximately, how many data points might be found in a pattern
    )

    matrix_profile = stumpy.stump(your_time_series, m=window_size)

    # NOTE(review): column 1 is presumably the matrix profile index column
    # of the stump output - confirm against the stumpy documentation.
    subseq_len = 50
    return stumpy.fluss(
        matrix_profile[:, 1], L=subseq_len, n_regimes=2, excl_factor=1
    )
def test_1d_distributed(dask_client):
    """Compute ``stumpy.stumped`` for a 1-D pandas Series on a dask client.

    Parameters
    ----------
    dask_client
        The module-scoped dask ``Client`` fixture.

    Returns
    -------
    The result of ``stumpy.stumped`` for a 100-point random series with a
    window size of 10.
    """
    # Use a local seeded Generator instead of np.random.seed(), which
    # mutates the process-wide legacy RNG state shared with other tests.
    rng = np.random.default_rng(42)
    ts = pd.Series(rng.random(100))
    m = 10
    return stumpy.stumped(dask_client, ts, m)
@pytest.fixture(scope="module")
def target(df):
    """Return the ``"target"`` column, removing it from ``df``.

    ``DataFrame.pop`` deletes the column from ``df`` in place.  Both this
    fixture and ``df`` are module-scoped, so tests that request ``target``
    see a ``df`` without the ``"target"`` column.
    """
    return df.pop("target")
def stack_dict(inputs, func=tf.stack):
    """Cast every value of ``inputs`` to float32 and combine them.

    Values are taken in sorted key order, each passed through a fresh
    ``CastLayer``, and the resulting list is combined by a small Keras layer
    that calls ``func(values, axis=-1)``.

    Parameters
    ----------
    inputs
        Mapping from feature name to tensor-like value.
    func
        Combining function accepting ``(values, axis=...)``; defaults to
        ``tf.stack``.
    """

    class MyLayer(tf.keras.layers.Layer):
        def call(self, val):
            return func(val, axis=-1)

    casted = [CastLayer()(inputs[name]) for name in sorted(inputs.keys())]
    return MyLayer()(casted)
model.fit(dict(numeric_features), target, epochs=1, batch_size=BATCH_SIZE) + + # Train with a dataset of dictionary-elements + numeric_dict_ds = tf.data.Dataset.from_tensor_slices( + (dict(numeric_features), target) + ) + numeric_dict_batches = numeric_dict_ds.shuffle(SHUFFLE_BUFFER).batch( + BATCH_SIZE + ) + model.fit(numeric_dict_batches, epochs=1) + + # Predict + return model.predict(numeric_dict_batches.take(1)) + + +def test_full_example_train_with_ds(df, target): + # https://www.tensorflow.org/tutorials/load_data/pandas_dataframe#full_example + # Inputs are converted to tf.dataset and then batched + + # ensure deterministic results + tf.keras.utils.set_random_seed(42) + + numeric_feature_names = ["age", "thalach", "trestbps", "chol", "oldpeak"] + binary_feature_names = ["sex", "fbs", "exang"] + categorical_feature_names = ["cp", "restecg", "slope", "thal", "ca"] + + numeric_features = df[numeric_feature_names] + + inputs = {} + for name, column in df.items(): + if isinstance(column[0], str): + dtype = tf.string + elif name in categorical_feature_names or name in binary_feature_names: + dtype = tf.int64 + else: + dtype = tf.float32 + + inputs[name] = tf.keras.Input(shape=(), name=name, dtype=dtype) + + preprocessed = [] + + # Process binary features + for name in binary_feature_names: + inp = inputs[name] + inp = inp[:, tf.newaxis] + float_value = CastLayer()(inp) + preprocessed.append(float_value) + + normalizer = tf.keras.layers.Normalization(axis=-1) + normalizer.adapt(stack_dict(dict(numeric_features))) + + # Process numeric features + numeric_inputs = {} + for name in numeric_feature_names: + numeric_inputs[name] = inputs[name] + + numeric_inputs = stack_dict(numeric_inputs) + numeric_normalized = normalizer(numeric_inputs) + + preprocessed.append(numeric_normalized) + + # Process categorical features + for name in categorical_feature_names: + vocab = sorted(set(df[name])) + print(f"name: {name}") + print(f"vocab: {vocab}\n") + + if isinstance(vocab[0], 
class CastLayer(tf.keras.layers.Layer):
    """Keras layer that casts its input to ``tf.float32``."""

    def __init__(self, **kwargs):
        # Forward all keyword arguments to the base Layer constructor.
        super().__init__(**kwargs)

    def call(self, inp):
        """Return ``inp`` cast to ``tf.float32``."""
        return tf.cast(inp, tf.float32)
binary_feature_names: + dtype = tf.int64 + else: + dtype = tf.float32 + + inputs[name] = tf.keras.Input(shape=(), name=name, dtype=dtype) + + preprocessed = [] + + # Process binary features + for name in binary_feature_names: + inp = inputs[name] + inp = inp[:, tf.newaxis] + float_value = CastLayer()(inp) + preprocessed.append(float_value) + + normalizer = tf.keras.layers.Normalization(axis=-1) + normalizer.adapt(stack_dict(dict(numeric_features))) + + # Process numeric features + numeric_inputs = {} + for name in numeric_feature_names: + numeric_inputs[name] = inputs[name] + + numeric_inputs = stack_dict(numeric_inputs) + numeric_normalized = normalizer(numeric_inputs) + + preprocessed.append(numeric_normalized) + + # Process categorical features + for name in categorical_feature_names: + vocab = sorted(set(df[name])) + print(f"name: {name}") + print(f"vocab: {vocab}\n") + + if isinstance(vocab[0], str): + lookup = tf.keras.layers.StringLookup( + vocabulary=vocab, output_mode="one_hot" + ) + else: + lookup = tf.keras.layers.IntegerLookup( + vocabulary=vocab, output_mode="one_hot" + ) + + x = inputs[name][:, tf.newaxis] + x = lookup(x) + preprocessed.append(x) + + # Concatenate all tensors + preprocesssed_result = MyConcatLayer()(preprocessed) + + preprocessor = tf.keras.Model(inputs, preprocesssed_result) + + # Create the model + body = tf.keras.Sequential( + [ + tf.keras.layers.Dense(10, activation="relu"), + tf.keras.layers.Dense(10, activation="relu"), + tf.keras.layers.Dense(1), + ] + ) + + x = preprocessor(inputs) + result = body(x) + + model = tf.keras.Model(inputs, result) + + model.compile( + optimizer="adam", + loss=tf.keras.losses.BinaryCrossentropy(from_logits=True), + metrics=["accuracy"], + ) + + model.fit(dict(df), target, epochs=1, batch_size=BATCH_SIZE) + + return model.predict(dict(df[:BATCH_SIZE])) diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/test_xgboost.py 
b/python/cudf/cudf_pandas_tests/third_party_integration_tests/test_xgboost.py new file mode 100644 index 00000000000..70f1e6a4250 --- /dev/null +++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/test_xgboost.py @@ -0,0 +1,135 @@ +# Copyright (c) 2023-2024, NVIDIA CORPORATION. + +from __future__ import annotations + +import numpy as np +import pandas as pd +import pytest +import scipy.sparse +import xgboost as xgb +from sklearn.datasets import make_regression +from xgboost.testing import IteratorForTest, make_categorical + +n_samples = 128 +n_features = 16 + + +def xgboost_assert_equal(expect, got, rtol: float = 1e-7, atol: float = 0.0): + if isinstance(expect, (tuple, list)): + assert len(expect) == len(got) + for e, g in zip(expect, got): + xgboost_assert_equal(e, g, rtol, atol) + elif isinstance(expect, scipy.sparse.csr_matrix): + np.testing.assert_allclose(expect.data, got.data, rtol=rtol, atol=atol) + np.testing.assert_equal(expect.indices, got.indices) + np.testing.assert_equal(expect.indptr, got.indptr) + else: + pd._testing.assert_almost_equal(expect, got, rtol=rtol, atol=atol) + + +pytestmark = pytest.mark.assert_eq(fn=xgboost_assert_equal) + + +@pytest.fixture +def reg_data() -> tuple[np.ndarray, np.ndarray]: + X, y = make_regression(n_samples, n_features, random_state=11) + return X, y + + +@pytest.fixture +def reg_batches_data() -> tuple[list[pd.DataFrame], list[pd.Series]]: + cov = [] + res = [] + for i in range(3): + X, y = make_regression(n_samples, n_features, random_state=i + 1) + cov.append(pd.DataFrame(X)) + res.append(pd.Series(y)) + return cov, res + + +def test_with_dmatrix( + reg_data: tuple[np.ndarray, np.ndarray], +) -> tuple[scipy.sparse.csr_matrix, scipy.sparse.csr_matrix]: + """DMatrix is the primary interface for XGBoost.""" + X, y = reg_data + X_df = pd.DataFrame(X) + y_ser = pd.Series(y) + Xy = xgb.DMatrix(X_df, y_ser) + assert Xy.feature_names == list(map(str, X_df.columns)) + csr_0 = Xy.get_data() + + Xc, yc = 
make_categorical( + n_samples, n_features, n_categories=13, onehot=False + ) + Xy = xgb.DMatrix(Xc, yc, enable_categorical=True) + csr_1 = Xy.get_data() + return csr_0, csr_1 + + +def test_with_quantile_dmatrix( + reg_data: tuple[np.ndarray, np.ndarray], +) -> tuple[scipy.sparse.csr_matrix, scipy.sparse.csr_matrix]: + """QuantileDMatrix is an optimization for the `hist` tree method for XGBoost.""" + from xgboost.testing.data import memory + + memory.clear(warn=False) + + X, y = reg_data + X_df = pd.DataFrame(X) + y_ser = pd.Series(y) + Xy = xgb.QuantileDMatrix(X_df, y_ser) + assert Xy.feature_names == list(map(str, X_df.columns)) + csr_0 = Xy.get_data() + + Xc, yc = make_categorical( + n_samples, n_features, n_categories=13, onehot=False + ) + Xy = xgb.QuantileDMatrix(Xc, yc, enable_categorical=True) + csr_1 = Xy.get_data() + return csr_0, csr_1 + + +def test_with_iter_quantile_dmatrix( + reg_batches_data: tuple[list[pd.DataFrame], list[pd.DataFrame]], +) -> scipy.sparse.csr_matrix: + """Using iterator to initialize QuantileDMatrix.""" + cov, res = reg_batches_data + it = IteratorForTest(cov, res, w=None, cache=None) + Xy = xgb.QuantileDMatrix(it) + csr = Xy.get_data() + return csr + + +@pytest.mark.parametrize("device", ["cpu", "cuda"]) +def test_with_external_memory( + device: str, + reg_batches_data: tuple[list[pd.DataFrame], list[pd.DataFrame]], +) -> np.ndarray: + """Test with iterator-based external memory.""" + cov, res = reg_batches_data + it = IteratorForTest(cov, res, w=None, cache="cache") + Xy = xgb.DMatrix(it) + predt = xgb.train({"device": device}, Xy, num_boost_round=1).predict(Xy) + return predt + + +@pytest.mark.parametrize("device", ["cpu", "cuda"]) +def test_predict(device: str) -> np.ndarray: + reg = xgb.XGBRegressor(n_estimators=2, device=device) + X, y = make_regression(n_samples, n_features, random_state=11) + X_df = pd.DataFrame(X) + reg.fit(X_df, y) + booster = reg.get_booster() + + predt0 = reg.predict(X_df) + + predt1 = 
booster.inplace_predict(X_df) + np.testing.assert_allclose(predt0, predt1) + + predt2 = booster.predict(xgb.DMatrix(X_df)) + np.testing.assert_allclose(predt0, predt2) + + predt3 = booster.inplace_predict(X) + np.testing.assert_allclose(predt0, predt3) + + return predt0 From c2c88a9b6e15fd20bd41edd82bf21b9fb2884471 Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Fri, 23 Aug 2024 08:36:50 -0700 Subject: [PATCH 02/24] Add missing ci --- .github/workflows/nightly.yaml | 58 ++++ .../dependencies.yaml | 268 ++++++++++++++++++ .../{ => tests}/conftest.py | 0 .../{ => tests}/pytest.ini | 0 .../{ => tests}/run_library_tests.sh | 0 .../{ => tests}/test_cugraph.py | 0 .../{ => tests}/test_cuml.py | 0 .../{ => tests}/test_dask.py | 0 .../{ => tests}/test_featureengine.py | 0 .../{ => tests}/test_holoviews.py | 0 .../{ => tests}/test_hvplot.py | 0 .../{ => tests}/test_ibis.py | 0 .../{ => tests}/test_matplotlib.py | 0 .../{ => tests}/test_numpy.py | 0 .../{ => tests}/test_plotly.py | 0 .../{ => tests}/test_pytorch.py | 0 .../{ => tests}/test_scipy.py | 0 .../{ => tests}/test_seaborn.py | 0 .../{ => tests}/test_sklearn.py | 0 .../{ => tests}/test_stumpy.py | 0 .../{ => tests}/test_stumpy_distributed.py | 0 .../{ => tests}/test_tensorflow.py | 0 .../{ => tests}/test_xgboost.py | 0 23 files changed, 326 insertions(+) create mode 100644 .github/workflows/nightly.yaml create mode 100644 python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml rename python/cudf/cudf_pandas_tests/third_party_integration_tests/{ => tests}/conftest.py (100%) rename python/cudf/cudf_pandas_tests/third_party_integration_tests/{ => tests}/pytest.ini (100%) rename python/cudf/cudf_pandas_tests/third_party_integration_tests/{ => tests}/run_library_tests.sh (100%) rename python/cudf/cudf_pandas_tests/third_party_integration_tests/{ => tests}/test_cugraph.py (100%) rename python/cudf/cudf_pandas_tests/third_party_integration_tests/{ => tests}/test_cuml.py (100%) rename 
python/cudf/cudf_pandas_tests/third_party_integration_tests/{ => tests}/test_dask.py (100%) rename python/cudf/cudf_pandas_tests/third_party_integration_tests/{ => tests}/test_featureengine.py (100%) rename python/cudf/cudf_pandas_tests/third_party_integration_tests/{ => tests}/test_holoviews.py (100%) rename python/cudf/cudf_pandas_tests/third_party_integration_tests/{ => tests}/test_hvplot.py (100%) rename python/cudf/cudf_pandas_tests/third_party_integration_tests/{ => tests}/test_ibis.py (100%) rename python/cudf/cudf_pandas_tests/third_party_integration_tests/{ => tests}/test_matplotlib.py (100%) rename python/cudf/cudf_pandas_tests/third_party_integration_tests/{ => tests}/test_numpy.py (100%) rename python/cudf/cudf_pandas_tests/third_party_integration_tests/{ => tests}/test_plotly.py (100%) rename python/cudf/cudf_pandas_tests/third_party_integration_tests/{ => tests}/test_pytorch.py (100%) rename python/cudf/cudf_pandas_tests/third_party_integration_tests/{ => tests}/test_scipy.py (100%) rename python/cudf/cudf_pandas_tests/third_party_integration_tests/{ => tests}/test_seaborn.py (100%) rename python/cudf/cudf_pandas_tests/third_party_integration_tests/{ => tests}/test_sklearn.py (100%) rename python/cudf/cudf_pandas_tests/third_party_integration_tests/{ => tests}/test_stumpy.py (100%) rename python/cudf/cudf_pandas_tests/third_party_integration_tests/{ => tests}/test_stumpy_distributed.py (100%) rename python/cudf/cudf_pandas_tests/third_party_integration_tests/{ => tests}/test_tensorflow.py (100%) rename python/cudf/cudf_pandas_tests/third_party_integration_tests/{ => tests}/test_xgboost.py (100%) diff --git a/.github/workflows/nightly.yaml b/.github/workflows/nightly.yaml new file mode 100644 index 00000000000..e0660c627a0 --- /dev/null +++ b/.github/workflows/nightly.yaml @@ -0,0 +1,58 @@ +# Copyright (c) 2023-2024, NVIDIA CORPORATION. 
+name: cudf-pandas-integration test on default branch (nightly / manually) + +on: + workflow_dispatch: + # The below exists in alignment with rest of RAPIDS nightly pipeline. They are currently unused. + inputs: + branch: + required: true + type: string + date: + required: true + type: string + sha: + required: true + type: string + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + checkout: + runs-on: ubuntu-latest + steps: + - name: Checkout repo + uses: actions/checkout@v4 + extract_libs: + name: Extract libraries from dependencies.yaml + runs-on: ubuntu-latest + outputs: + LIBS: ${{ steps.extractlib.outputs.LIBS }} + steps: + - name: Checkout repo + uses: actions/checkout@v4 + - name: Extract libraries + id: extractlib + run: | + python/cudf/cudf_pandas_tests/third_party_integration_tests/ci/extract_lib.sh python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml + + integration-tests: + secrets: inherit + needs: extract_libs + strategy: + fail-fast: false # Continue testing other libraries even if one fails + matrix: + lib: ${{ fromJSON(needs.extract_libs.outputs.LIBS) }} + include: + - CUDA_MAJOR: "12" # By default, test libraries with the latest CUDA 12 + - lib: "tensorflow" + CUDA_MAJOR: "11" # Tensorflow does not have cuda 12 build, use one CUDA 11 instance + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.10 + with: + build_type: nightly + script: "python/cudf/cudf_pandas_tests/third_party_integration_tests/ci/test.sh ${{ matrix.lib }}" + run_codecov: false + # Select a single configuration using amd64 and the desired CUDA major version, with the latest (Python, CUDA) versions. 
+ matrix_filter: 'map(select(.ARCH == "amd64" and (.CUDA_VER|startswith("${{ matrix.CUDA_MAJOR }}")))) | max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]) | [.]' diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml b/python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml new file mode 100644 index 00000000000..c894e0cfb2d --- /dev/null +++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml @@ -0,0 +1,268 @@ +# Copyright (c) 2023-2024, NVIDIA CORPORATION. +# Dependency list for https://github.com/rapidsai/dependency-file-generator +files: + checks: + output: none + includes: + - develop + - py_version + test_dask: + output: none + includes: + - cuda_version + - py_version + - test_base + - test_dask + test_matplotlib: + output: none + includes: + - cuda_version + - py_version + - test_base + - test_matplotlib + test_numpy: + output: none + includes: + - cuda_version + - py_version + - test_base + - test_numpy + test_pytorch: + output: none + includes: + - cuda_version + - py_version + - test_base + - test_pytorch + test_seaborn: + output: none + includes: + - cuda_version + - py_version + - test_base + - test_seaborn + test_scipy: + output: none + includes: + - cuda_version + - py_version + - test_base + - test_scipy + test_sklearn: + output: none + includes: + - cuda_version + - py_version + - test_base + - test_sklearn + test_stumpy: + output: none + includes: + - cuda_version + - py_version + - test_base + - test_stumpy + test_tensorflow: + output: none + includes: + - cuda_version + - py_version + - test_base + - test_tensorflow + test_xgboost: + output: none + includes: + - cuda_version + - py_version + - test_base + - test_xgboost + test_cuml: + output: none + includes: + - cuda_version + - py_version + - test_base + - test_cuml + test_cugraph: + output: none + includes: + - cuda_version + - py_version + - test_base + - test_cugraph + 
test_ibis: + output: none + includes: + - cuda_version + - py_version + - test_base + - test_ibis + test_hvplot: + output: none + includes: + - cuda_version + - py_version + - test_base + - test_hvplot + test_holoviews: + output: none + includes: + - cuda_version + - py_version + - test_base + - test_holoviews + test_plotly: + output: none + includes: + - cuda_version + - py_version + - test_base + - test_plotly + +channels: + - rapidsai-nightly + - rapidsai + - conda-forge + - nvidia + +dependencies: + develop: + common: + - output_types: conda + packages: + - pre-commit + cuda_version: + specific: + - output_types: conda + matrices: + - matrix: + cuda: "11.8" + packages: + - cuda-version=11.8 + - matrix: + cuda: "12.0" + packages: + - cuda-version=12.0 + - matrix: + cuda: "12.2" + packages: + - cuda-version=12.2 + - matrix: + cuda: "12.5" + packages: + - cuda-version=12.5 + py_version: + specific: + - output_types: conda + matrices: + - matrix: + py: "3.10" + packages: + - python=3.10 + - matrix: + py: "3.11" + packages: + - python=3.11 + - matrix: + packages: + - python>=3.10,<3.12 + test_base: + common: + - output_types: conda + packages: + - cudf==24.10.*,>=0.0.0a0 + - pandas + - pytest + - pytest-xdist + test_dask: + common: + - output_types: conda + packages: + - dask + test_matplotlib: + common: + - output_types: conda + packages: + - matplotlib-base + test_numpy: + common: + - output_types: conda + packages: + - numpy + test_pytorch: + common: + - output_types: conda + packages: + - numpy + - pytorch>=2.1.0 + test_seaborn: + common: + - output_types: conda + packages: + - seaborn + test_scipy: + common: + - output_types: conda + packages: + - scipy + test_sklearn: + common: + - output_types: conda + packages: + - scikit-learn + test_stumpy: + common: + - output_types: conda + packages: + - dask + - stumpy + test_tensorflow: + common: + - output_types: conda + packages: + - tensorflow + test_xgboost: + common: + - output_types: conda + packages: + - 
hypothesis + - numpy + - scipy + - scikit-learn + - pip + - pip: + - xgboost>=2.0.1 + test_cuml: + common: + - output_types: conda + packages: + - cuml==24.10.*,>=0.0.0a0 + - scikit-learn + test_cugraph: + common: + - output_types: conda + packages: + - cugraph==24.10.*,>=0.0.0a0 + - networkx + test_ibis: + common: + - output_types: conda + packages: + - pip + - pip: + - ibis-framework[pandas] + test_hvplot: + common: + - output_types: conda + packages: + - hvplot + test_holoviews: + common: + - output_types: conda + packages: + - holoviews + test_plotly: + common: + - output_types: conda + packages: + - plotly diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/conftest.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/conftest.py similarity index 100% rename from python/cudf/cudf_pandas_tests/third_party_integration_tests/conftest.py rename to python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/conftest.py diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/pytest.ini b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/pytest.ini similarity index 100% rename from python/cudf/cudf_pandas_tests/third_party_integration_tests/pytest.ini rename to python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/pytest.ini diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/run_library_tests.sh b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/run_library_tests.sh similarity index 100% rename from python/cudf/cudf_pandas_tests/third_party_integration_tests/run_library_tests.sh rename to python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/run_library_tests.sh diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/test_cugraph.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_cugraph.py similarity index 100% rename from 
python/cudf/cudf_pandas_tests/third_party_integration_tests/test_cugraph.py rename to python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_cugraph.py diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/test_cuml.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_cuml.py similarity index 100% rename from python/cudf/cudf_pandas_tests/third_party_integration_tests/test_cuml.py rename to python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_cuml.py diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/test_dask.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_dask.py similarity index 100% rename from python/cudf/cudf_pandas_tests/third_party_integration_tests/test_dask.py rename to python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_dask.py diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/test_featureengine.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_featureengine.py similarity index 100% rename from python/cudf/cudf_pandas_tests/third_party_integration_tests/test_featureengine.py rename to python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_featureengine.py diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/test_holoviews.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_holoviews.py similarity index 100% rename from python/cudf/cudf_pandas_tests/third_party_integration_tests/test_holoviews.py rename to python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_holoviews.py diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/test_hvplot.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_hvplot.py similarity index 100% rename from python/cudf/cudf_pandas_tests/third_party_integration_tests/test_hvplot.py rename to 
python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_hvplot.py diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/test_ibis.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_ibis.py similarity index 100% rename from python/cudf/cudf_pandas_tests/third_party_integration_tests/test_ibis.py rename to python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_ibis.py diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/test_matplotlib.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_matplotlib.py similarity index 100% rename from python/cudf/cudf_pandas_tests/third_party_integration_tests/test_matplotlib.py rename to python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_matplotlib.py diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/test_numpy.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_numpy.py similarity index 100% rename from python/cudf/cudf_pandas_tests/third_party_integration_tests/test_numpy.py rename to python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_numpy.py diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/test_plotly.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_plotly.py similarity index 100% rename from python/cudf/cudf_pandas_tests/third_party_integration_tests/test_plotly.py rename to python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_plotly.py diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/test_pytorch.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_pytorch.py similarity index 100% rename from python/cudf/cudf_pandas_tests/third_party_integration_tests/test_pytorch.py rename to python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_pytorch.py diff --git 
a/python/cudf/cudf_pandas_tests/third_party_integration_tests/test_scipy.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_scipy.py similarity index 100% rename from python/cudf/cudf_pandas_tests/third_party_integration_tests/test_scipy.py rename to python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_scipy.py diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/test_seaborn.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_seaborn.py similarity index 100% rename from python/cudf/cudf_pandas_tests/third_party_integration_tests/test_seaborn.py rename to python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_seaborn.py diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/test_sklearn.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_sklearn.py similarity index 100% rename from python/cudf/cudf_pandas_tests/third_party_integration_tests/test_sklearn.py rename to python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_sklearn.py diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/test_stumpy.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_stumpy.py similarity index 100% rename from python/cudf/cudf_pandas_tests/third_party_integration_tests/test_stumpy.py rename to python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_stumpy.py diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/test_stumpy_distributed.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_stumpy_distributed.py similarity index 100% rename from python/cudf/cudf_pandas_tests/third_party_integration_tests/test_stumpy_distributed.py rename to python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_stumpy_distributed.py diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/test_tensorflow.py 
b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_tensorflow.py similarity index 100% rename from python/cudf/cudf_pandas_tests/third_party_integration_tests/test_tensorflow.py rename to python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_tensorflow.py diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/test_xgboost.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_xgboost.py similarity index 100% rename from python/cudf/cudf_pandas_tests/third_party_integration_tests/test_xgboost.py rename to python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_xgboost.py From 88789eb024e4861b93933e4d6cb6bc857d244589 Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Fri, 23 Aug 2024 09:49:32 -0700 Subject: [PATCH 03/24] Combine jobs --- .github/workflows/nightly.yaml | 42 +++++++------------ .../ci/check_style.sh | 18 -------- .../ci/release/update-version.sh | 41 ------------------ .../tests/run_library_tests.sh | 11 ----- 4 files changed, 14 insertions(+), 98 deletions(-) delete mode 100755 python/cudf/cudf_pandas_tests/third_party_integration_tests/ci/check_style.sh delete mode 100755 python/cudf/cudf_pandas_tests/third_party_integration_tests/ci/release/update-version.sh delete mode 100755 python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/run_library_tests.sh diff --git a/.github/workflows/nightly.yaml b/.github/workflows/nightly.yaml index e0660c627a0..0c844efdc4e 100644 --- a/.github/workflows/nightly.yaml +++ b/.github/workflows/nightly.yaml @@ -20,39 +20,25 @@ concurrency: cancel-in-progress: true jobs: - checkout: - runs-on: ubuntu-latest - steps: - - name: Checkout repo - uses: actions/checkout@v4 - extract_libs: - name: Extract libraries from dependencies.yaml + integration-tests: runs-on: ubuntu-latest - outputs: - LIBS: ${{ steps.extractlib.outputs.LIBS }} steps: - name: Checkout repo uses: actions/checkout@v4 - - name: Extract libraries + + - 
name: Extract libraries from dependencies.yaml id: extractlib run: | - python/cudf/cudf_pandas_tests/third_party_integration_tests/ci/extract_lib.sh python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml + LIBS=$(python/cudf/cudf_pandas_tests/third_party_integration_tests/ci/extract_lib.sh python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml) + echo "LIBS=${LIBS}" >> $GITHUB_ENV - integration-tests: + - name: Run integration tests + run: | + for lib in ${{ env.LIBS }}; do + echo "Running tests for $lib" + CUDA_MAJOR=$(if [ "$lib" = "tensorflow" ]; then echo "11"; else echo "12"; fi) + python/cudf/cudf_pandas_tests/third_party_integration_tests/ci/test.sh $lib $CUDA_MAJOR + done + env: + LIBS: ${{ env.LIBS }} secrets: inherit - needs: extract_libs - strategy: - fail-fast: false # Continue testing other libraries even if one fails - matrix: - lib: ${{ fromJSON(needs.extract_libs.outputs.LIBS) }} - include: - - CUDA_MAJOR: "12" # By default, test libraries with the latest CUDA 12 - - lib: "tensorflow" - CUDA_MAJOR: "11" # Tensorflow does not have cuda 12 build, use one CUDA 11 instance - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.10 - with: - build_type: nightly - script: "python/cudf/cudf_pandas_tests/third_party_integration_tests/ci/test.sh ${{ matrix.lib }}" - run_codecov: false - # Select a single configuration using amd64 and the desired CUDA major version, with the latest (Python, CUDA) versions. 
- matrix_filter: 'map(select(.ARCH == "amd64" and (.CUDA_VER|startswith("${{ matrix.CUDA_MAJOR }}")))) | max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]) | [.]' diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/ci/check_style.sh b/python/cudf/cudf_pandas_tests/third_party_integration_tests/ci/check_style.sh deleted file mode 100755 index b81b36ddb45..00000000000 --- a/python/cudf/cudf_pandas_tests/third_party_integration_tests/ci/check_style.sh +++ /dev/null @@ -1,18 +0,0 @@ -#!/bin/bash -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -set -euo pipefail - -rapids-logger "Create checks conda environment" -. /opt/conda/etc/profile.d/conda.sh - -rapids-dependency-file-generator \ - --output conda \ - --file-key checks \ - --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" | tee env.yaml - -rapids-mamba-retry env create --yes -f env.yaml -n checks -conda activate checks - -# Run pre-commit checks -pre-commit run --hook-stage manual --all-files --show-diff-on-failure diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/ci/release/update-version.sh b/python/cudf/cudf_pandas_tests/third_party_integration_tests/ci/release/update-version.sh deleted file mode 100755 index 5b6f8f5ce1c..00000000000 --- a/python/cudf/cudf_pandas_tests/third_party_integration_tests/ci/release/update-version.sh +++ /dev/null @@ -1,41 +0,0 @@ -#!/bin/bash -# Copyright (c) 2020-2024, NVIDIA CORPORATION. -########################################### -# cudf.pandas integration Version Updater # -########################################### - -## Usage -# bash update-version.sh - - -# Format is YY.MM.PP - no leading 'v' or trailing 'a' -NEXT_FULL_TAG=$1 - -#Get . 
for next version -NEXT_MAJOR=$(echo $NEXT_FULL_TAG | awk '{split($0, a, "."); print a[1]}') -NEXT_MINOR=$(echo $NEXT_FULL_TAG | awk '{split($0, a, "."); print a[2]}') -NEXT_SHORT_TAG=${NEXT_MAJOR}.${NEXT_MINOR} - -# Need to distutils-normalize the versions for some use cases -NEXT_SHORT_TAG_PEP440=$(python -c "from setuptools.extern import packaging; print(packaging.version.Version('${NEXT_SHORT_TAG}'))") - -# Inplace sed replace; workaround for Linux and Mac -function sed_runner() { - sed -i.bak ''"$1"'' $2 && rm -f ${2}.bak -} - -# CI files -for FILE in .github/workflows/*.yaml; do - sed_runner "/shared-workflows/ s/@.*/@branch-${NEXT_SHORT_TAG}/g" "${FILE}" -done - -DEPENDENCIES=( - cugraph - cudf - cuml -) -for DEP in "${DEPENDENCIES[@]}"; do - for FILE in dependencies.yaml; do - sed_runner "/-.* ${DEP}\(-cu[[:digit:]]\{2\}\)\{0,1\}==/ s/==.*/==${NEXT_SHORT_TAG_PEP440}.*/g" "${FILE}" - done -done diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/run_library_tests.sh b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/run_library_tests.sh deleted file mode 100755 index dafd2e77761..00000000000 --- a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/run_library_tests.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/bash -# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. -# All rights reserved. 
-# SPDX-License-Identifier: Apache-2.0 - -RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"$(dirname "$0")"} -mkdir -p "${RAPIDS_TESTS_DIR}/test-results" - -repo_root=$(git rev-parse --show-toplevel) - -TEST_DIR="${repo_root}/tests/" RAPIDS_TESTS_DIR="${RAPIDS_TESTS_DIR}" ${repo_root}/ci/ci_run_library_tests.sh "$@" From f3ccceabf08530b7a9a893d83fa2b6bc38d5e370 Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Sat, 24 Aug 2024 06:05:48 -0700 Subject: [PATCH 04/24] Address review: mv nightly.yml to pr.yml and test.yml --- .github/workflows/nightly.yaml | 44 ---------------------------------- .github/workflows/pr.yaml | 16 +++++++++++++ .github/workflows/test.yaml | 16 +++++++++++++ 3 files changed, 32 insertions(+), 44 deletions(-) delete mode 100644 .github/workflows/nightly.yaml diff --git a/.github/workflows/nightly.yaml b/.github/workflows/nightly.yaml deleted file mode 100644 index 0c844efdc4e..00000000000 --- a/.github/workflows/nightly.yaml +++ /dev/null @@ -1,44 +0,0 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. -name: cudf-pandas-integration test on default branch (nightly / manually) - -on: - workflow_dispatch: - # The below exists in alignment with rest of RAPIDS nightly pipeline. They are currently unused. 
- inputs: - branch: - required: true - type: string - date: - required: true - type: string - sha: - required: true - type: string - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -jobs: - integration-tests: - runs-on: ubuntu-latest - steps: - - name: Checkout repo - uses: actions/checkout@v4 - - - name: Extract libraries from dependencies.yaml - id: extractlib - run: | - LIBS=$(python/cudf/cudf_pandas_tests/third_party_integration_tests/ci/extract_lib.sh python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml) - echo "LIBS=${LIBS}" >> $GITHUB_ENV - - - name: Run integration tests - run: | - for lib in ${{ env.LIBS }}; do - echo "Running tests for $lib" - CUDA_MAJOR=$(if [ "$lib" = "tensorflow" ]; then echo "11"; else echo "12"; fi) - python/cudf/cudf_pandas_tests/third_party_integration_tests/ci/test.sh $lib $CUDA_MAJOR - done - env: - LIBS: ${{ env.LIBS }} - secrets: inherit diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 2e2a8b6b9bc..1c4f7131699 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -228,3 +228,19 @@ jobs: node_type: cpu4 build_type: pull-request run_script: "ci/cudf_pandas_scripts/pandas-tests/diff.sh" + integration-tests: + needs: wheel-build-cudf + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 + with: + build_type: pull-request + node_type: "gpu-v100-latest-1" + arch: "amd64" + container_image: "rapidsai/ci-conda:latest" + run_script: | + LIBS=$(python/cudf/cudf_pandas_tests/third_party_integration_tests/ci/extract_lib.sh python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml) + for lib in $LIBS; do + echo "Running tests for $lib" + CUDA_MAJOR=$(if [ "$lib" = "tensorflow" ]; then echo "11"; else echo "12"; fi) + python/cudf/cudf_pandas_tests/third_party_integration_tests/ci/test.sh $lib $CUDA_MAJOR + done diff --git a/.github/workflows/test.yaml 
b/.github/workflows/test.yaml index 9feea050b19..ce08d372d89 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -124,3 +124,19 @@ jobs: date: ${{ inputs.date }} sha: ${{ inputs.sha }} script: ci/cudf_pandas_scripts/run_tests.sh + integration-tests: + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 + with: + build_type: nightly + branch: ${{ inputs.branch }} + date: ${{ inputs.date }} + sha: ${{ inputs.sha }} + container_image: "rapidsai/ci-conda:latest" + run_script: | + LIBS=$(python/cudf/cudf_pandas_tests/third_party_integration_tests/ci/extract_lib.sh python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml) + for lib in $LIBS; do + echo "Running tests for $lib" + CUDA_MAJOR=$(if [ "$lib" = "tensorflow" ]; then echo "11"; else echo "12"; fi) + python/cudf/cudf_pandas_tests/third_party_integration_tests/ci/test.sh $lib $CUDA_MAJOR + done From 8bd1378d068f247bc15ad8866d153d5fdca8c8ee Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Sat, 24 Aug 2024 06:11:34 -0700 Subject: [PATCH 05/24] add job to pr builder --- .github/workflows/pr.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 1c4f7131699..d173b719f46 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -35,6 +35,7 @@ jobs: - unit-tests-cudf-pandas - pandas-tests - pandas-tests-diff + - integration-tests secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.10 checks: From d2a6fc8ef707b4a5bbca7a66626dc30cfb8a8f46 Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Mon, 26 Aug 2024 05:45:24 -0700 Subject: [PATCH 06/24] preprocess test names --- .github/workflows/pr.yaml | 5 ++++- .../third_party_integration_tests/ci/extract_lib.sh | 1 - 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index d173b719f46..0f64321dbfb 100644
--- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -240,7 +240,10 @@ jobs: container_image: "rapidsai/ci-conda:latest" run_script: | LIBS=$(python/cudf/cudf_pandas_tests/third_party_integration_tests/ci/extract_lib.sh python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml) - for lib in $LIBS; do + LIBS=${LIBS#[} + LIBS=${LIBS%]} + for lib in ${LIBS//,/ }; do + lib=$(echo "$lib" | tr -d '""') echo "Running tests for $lib" CUDA_MAJOR=$(if [ "$lib" = "tensorflow" ]; then echo "11"; else echo "12"; fi) python/cudf/cudf_pandas_tests/third_party_integration_tests/ci/test.sh $lib $CUDA_MAJOR diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/ci/extract_lib.sh b/python/cudf/cudf_pandas_tests/third_party_integration_tests/ci/extract_lib.sh index 4511363146e..67ec5c773bc 100755 --- a/python/cudf/cudf_pandas_tests/third_party_integration_tests/ci/extract_lib.sh +++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/ci/extract_lib.sh @@ -16,7 +16,6 @@ extract_lib_from_dependencies_yaml() { # rest local extracted_libs="$(yq -o json $file | jq -rc '.files | with_entries( select(.key | contains("test_")) ) | keys | map(sub("^test_"; ""))')" echo $extracted_libs - write_output "LIBS" $extracted_libs } From de531e2bd1357677a396911cd99927922fad0311 Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Mon, 26 Aug 2024 07:43:35 -0700 Subject: [PATCH 07/24] Add --config to rdfg --- .github/workflows/test.yaml | 5 ++++- .../third_party_integration_tests/ci/test.sh | 1 + 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index ce08d372d89..26754b1a1b0 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -135,7 +135,10 @@ jobs: container_image: "rapidsai/ci-conda:latest" run_script: | LIBS=$(python/cudf/cudf_pandas_tests/third_party_integration_tests/ci/extract_lib.sh 
python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml) - for lib in $LIBS; do + LIBS=${LIBS#[} + LIBS=${LIBS%]} + for lib in ${LIBS//,/ }; do + lib=$(echo "$lib" | tr -d '""') echo "Running tests for $lib" CUDA_MAJOR=$(if [ "$lib" = "tensorflow" ]; then echo "11"; else echo "12"; fi) python/cudf/cudf_pandas_tests/third_party_integration_tests/ci/test.sh $lib $CUDA_MAJOR diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/ci/test.sh b/python/cudf/cudf_pandas_tests/third_party_integration_tests/ci/test.sh index a012513b93a..0d66e27e21a 100755 --- a/python/cudf/cudf_pandas_tests/third_party_integration_tests/ci/test.sh +++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/ci/test.sh @@ -11,6 +11,7 @@ set -euo pipefail rapids-logger "Generate Python testing dependencies" rapids-dependency-file-generator \ + --config "../dependencies.yaml" --output conda \ --file-key test_${LIB} \ --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" | tee env.yaml From aba7509c13996b4cee75889bea58df39674d2ddf Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Mon, 26 Aug 2024 09:52:14 -0700 Subject: [PATCH 08/24] Change --config arg in rdfg --- .../cudf_pandas_tests/third_party_integration_tests/ci/test.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/ci/test.sh b/python/cudf/cudf_pandas_tests/third_party_integration_tests/ci/test.sh index 0d66e27e21a..1c32d1d8d3d 100755 --- a/python/cudf/cudf_pandas_tests/third_party_integration_tests/ci/test.sh +++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/ci/test.sh @@ -11,7 +11,7 @@ set -euo pipefail rapids-logger "Generate Python testing dependencies" rapids-dependency-file-generator \ - --config "../dependencies.yaml" + --config "python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml" --output conda \ --file-key test_${LIB} \ --matrix 
"cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" | tee env.yaml From 555ffd67266678854d1211e5592a061ebe9b98a7 Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Mon, 26 Aug 2024 11:42:38 -0700 Subject: [PATCH 09/24] continue --output on next line --- .../cudf_pandas_tests/third_party_integration_tests/ci/test.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/ci/test.sh b/python/cudf/cudf_pandas_tests/third_party_integration_tests/ci/test.sh index 1c32d1d8d3d..5c3ebd0af98 100755 --- a/python/cudf/cudf_pandas_tests/third_party_integration_tests/ci/test.sh +++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/ci/test.sh @@ -11,7 +11,7 @@ set -euo pipefail rapids-logger "Generate Python testing dependencies" rapids-dependency-file-generator \ - --config "python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml" + --config "python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml" \ --output conda \ --file-key test_${LIB} \ --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" | tee env.yaml From 72c806a0e9124180a71706ad3a16fd1cd68d952b Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Mon, 26 Aug 2024 13:29:01 -0700 Subject: [PATCH 10/24] Point to ci script --- .../cudf_pandas_tests/third_party_integration_tests/ci/test.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/ci/test.sh b/python/cudf/cudf_pandas_tests/third_party_integration_tests/ci/test.sh index 5c3ebd0af98..4734e2382fb 100755 --- a/python/cudf/cudf_pandas_tests/third_party_integration_tests/ci/test.sh +++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/ci/test.sh @@ -50,7 +50,7 @@ for serial_library in "${serial_libraries[@]}"; do fi done -RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR} TEST_DIR=${TEST_DIR} NUM_PROCESSES=${NUM_PROCESSES} 
ci/ci_run_library_tests.sh ${LIB} +RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR} TEST_DIR=${TEST_DIR} NUM_PROCESSES=${NUM_PROCESSES} python/cudf/cudf_pandas_tests/third_party_integration_tests/ci/ci_run_library_tests.sh ${LIB} rapids-logger "Test script exiting with value: ${EXITCODE}" exit ${EXITCODE} From e11bff682fd8be9ae135527470bb888537b5feb4 Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Mon, 26 Aug 2024 18:31:01 -0700 Subject: [PATCH 11/24] preprend pythonpath to pytest --- .../ci/ci_run_library_tests.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/ci/ci_run_library_tests.sh b/python/cudf/cudf_pandas_tests/third_party_integration_tests/ci/ci_run_library_tests.sh index d1627d6436b..0ec46c70028 100755 --- a/python/cudf/cudf_pandas_tests/third_party_integration_tests/ci/ci_run_library_tests.sh +++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/ci/ci_run_library_tests.sh @@ -13,7 +13,7 @@ runtest_gold() { local lib=$1 local test_keys=${@:2} - pytest \ + PYTHONPATH=python/cudf/cudf_pandas_tests/third_party_integration_tests/tests pytest \ -v \ --continue-on-collection-errors \ --cache-clear \ @@ -28,7 +28,7 @@ runtest_cudf_pandas() { local lib=$1 local test_keys=${@:2} - pytest \ + PYTHONPATH=python/cudf/cudf_pandas_tests/third_party_integration_tests/tests pytest \ -p cudf.pandas \ -v \ --continue-on-collection-errors \ @@ -49,7 +49,7 @@ main() { runtest_cudf_pandas ${lib} ${test_keys} # assertion phase - pytest \ + PYTHONPATH=python/cudf/cudf_pandas_tests/third_party_integration_tests/tests pytest \ --compare \ -p cudf.pandas \ -v \ From 8443f55ffeda65e412a0c8dc62c8e8a8a1a0c89a Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Tue, 27 Aug 2024 04:28:57 -0700 Subject: [PATCH 12/24] set the test_dir --- .../ci/ci_run_library_tests.sh | 6 +++--- .../third_party_integration_tests/ci/test.sh | 3 ++- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git 
a/python/cudf/cudf_pandas_tests/third_party_integration_tests/ci/ci_run_library_tests.sh b/python/cudf/cudf_pandas_tests/third_party_integration_tests/ci/ci_run_library_tests.sh index 0ec46c70028..d1627d6436b 100755 --- a/python/cudf/cudf_pandas_tests/third_party_integration_tests/ci/ci_run_library_tests.sh +++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/ci/ci_run_library_tests.sh @@ -13,7 +13,7 @@ runtest_gold() { local lib=$1 local test_keys=${@:2} - PYTHONPATH=python/cudf/cudf_pandas_tests/third_party_integration_tests/tests pytest \ + pytest \ -v \ --continue-on-collection-errors \ --cache-clear \ @@ -28,7 +28,7 @@ runtest_cudf_pandas() { local lib=$1 local test_keys=${@:2} - PYTHONPATH=python/cudf/cudf_pandas_tests/third_party_integration_tests/tests pytest \ + pytest \ -p cudf.pandas \ -v \ --continue-on-collection-errors \ @@ -49,7 +49,7 @@ main() { runtest_cudf_pandas ${lib} ${test_keys} # assertion phase - PYTHONPATH=python/cudf/cudf_pandas_tests/third_party_integration_tests/tests pytest \ + pytest \ --compare \ -p cudf.pandas \ -v \ diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/ci/test.sh b/python/cudf/cudf_pandas_tests/third_party_integration_tests/ci/test.sh index 4734e2382fb..09caea65004 100755 --- a/python/cudf/cudf_pandas_tests/third_party_integration_tests/ci/test.sh +++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/ci/test.sh @@ -24,10 +24,11 @@ conda activate test set -u RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${PWD}/test-results"} +echo "The working directory is ${PWD}" mkdir -p "${RAPIDS_TESTS_DIR}" repo_root=$(git rev-parse --show-toplevel) -TEST_DIR=${repo_root}/tests +TEST_DIR=${repo_root}/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests rapids-print-env From 8ffbc2f651340464e90d75931512174bd4638e18 Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Tue, 27 Aug 2024 06:42:50 -0700 Subject: [PATCH 13/24] xfail pytorch test and move integration tests out of 
cudf_pandas_tests --- .github/workflows/pr.yaml | 4 ++-- .github/workflows/test.yaml | 4 ++-- .../ci/ci_run_library_tests.sh | 0 .../ci/extract_lib.sh | 0 .../ci/test.sh | 6 +++--- .../dependencies.yaml | 0 .../tests/conftest.py | 0 .../tests/pytest.ini | 0 .../tests/test_cugraph.py | 0 .../tests/test_cuml.py | 0 .../tests/test_dask.py | 0 .../tests/test_featureengine.py | 0 .../tests/test_holoviews.py | 0 .../tests/test_hvplot.py | 0 .../tests/test_ibis.py | 0 .../tests/test_matplotlib.py | 0 .../tests/test_numpy.py | 0 .../tests/test_plotly.py | 0 .../tests/test_pytorch.py | 2 ++ .../tests/test_scipy.py | 0 .../tests/test_seaborn.py | 0 .../tests/test_sklearn.py | 0 .../tests/test_stumpy.py | 0 .../tests/test_stumpy_distributed.py | 0 .../tests/test_tensorflow.py | 0 .../tests/test_xgboost.py | 0 26 files changed, 9 insertions(+), 7 deletions(-) rename python/cudf/{cudf_pandas_tests/third_party_integration_tests => cudf_pandas_third_party_integration_tests}/ci/ci_run_library_tests.sh (100%) rename python/cudf/{cudf_pandas_tests/third_party_integration_tests => cudf_pandas_third_party_integration_tests}/ci/extract_lib.sh (100%) rename python/cudf/{cudf_pandas_tests/third_party_integration_tests => cudf_pandas_third_party_integration_tests}/ci/test.sh (79%) rename python/cudf/{cudf_pandas_tests/third_party_integration_tests => cudf_pandas_third_party_integration_tests}/dependencies.yaml (100%) rename python/cudf/{cudf_pandas_tests/third_party_integration_tests => cudf_pandas_third_party_integration_tests}/tests/conftest.py (100%) rename python/cudf/{cudf_pandas_tests/third_party_integration_tests => cudf_pandas_third_party_integration_tests}/tests/pytest.ini (100%) rename python/cudf/{cudf_pandas_tests/third_party_integration_tests => cudf_pandas_third_party_integration_tests}/tests/test_cugraph.py (100%) rename python/cudf/{cudf_pandas_tests/third_party_integration_tests => cudf_pandas_third_party_integration_tests}/tests/test_cuml.py (100%) rename 
python/cudf/{cudf_pandas_tests/third_party_integration_tests => cudf_pandas_third_party_integration_tests}/tests/test_dask.py (100%) rename python/cudf/{cudf_pandas_tests/third_party_integration_tests => cudf_pandas_third_party_integration_tests}/tests/test_featureengine.py (100%) rename python/cudf/{cudf_pandas_tests/third_party_integration_tests => cudf_pandas_third_party_integration_tests}/tests/test_holoviews.py (100%) rename python/cudf/{cudf_pandas_tests/third_party_integration_tests => cudf_pandas_third_party_integration_tests}/tests/test_hvplot.py (100%) rename python/cudf/{cudf_pandas_tests/third_party_integration_tests => cudf_pandas_third_party_integration_tests}/tests/test_ibis.py (100%) rename python/cudf/{cudf_pandas_tests/third_party_integration_tests => cudf_pandas_third_party_integration_tests}/tests/test_matplotlib.py (100%) rename python/cudf/{cudf_pandas_tests/third_party_integration_tests => cudf_pandas_third_party_integration_tests}/tests/test_numpy.py (100%) rename python/cudf/{cudf_pandas_tests/third_party_integration_tests => cudf_pandas_third_party_integration_tests}/tests/test_plotly.py (100%) rename python/cudf/{cudf_pandas_tests/third_party_integration_tests => cudf_pandas_third_party_integration_tests}/tests/test_pytorch.py (96%) rename python/cudf/{cudf_pandas_tests/third_party_integration_tests => cudf_pandas_third_party_integration_tests}/tests/test_scipy.py (100%) rename python/cudf/{cudf_pandas_tests/third_party_integration_tests => cudf_pandas_third_party_integration_tests}/tests/test_seaborn.py (100%) rename python/cudf/{cudf_pandas_tests/third_party_integration_tests => cudf_pandas_third_party_integration_tests}/tests/test_sklearn.py (100%) rename python/cudf/{cudf_pandas_tests/third_party_integration_tests => cudf_pandas_third_party_integration_tests}/tests/test_stumpy.py (100%) rename python/cudf/{cudf_pandas_tests/third_party_integration_tests => cudf_pandas_third_party_integration_tests}/tests/test_stumpy_distributed.py 
(100%) rename python/cudf/{cudf_pandas_tests/third_party_integration_tests => cudf_pandas_third_party_integration_tests}/tests/test_tensorflow.py (100%) rename python/cudf/{cudf_pandas_tests/third_party_integration_tests => cudf_pandas_third_party_integration_tests}/tests/test_xgboost.py (100%) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 0f64321dbfb..459c0fc5189 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -239,12 +239,12 @@ jobs: arch: "amd64" container_image: "rapidsai/ci-conda:latest" run_script: | - LIBS=$(python/cudf/cudf_pandas_tests/third_party_integration_tests/ci/extract_lib.sh python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml) + LIBS=$(python/cudf/cudf_pandas_third_party_integration_tests/ci/extract_lib.sh python/cudf/cudf_pandas_third_party_integration_tests/dependencies.yaml) LIBS=${LIBS#[} LIBS=${LIBS%]} for lib in ${LIBS//,/ }; do lib=$(echo "$lib" | tr -d '""') echo "Running tests for $lib" CUDA_MAJOR=$(if [ "$lib" = "tensorflow" ]; then echo "11"; else echo "12"; fi) - python/cudf/cudf_pandas_tests/third_party_integration_tests/ci/test.sh $lib $CUDA_MAJOR + python/cudf/cudf_pandas_third_party_integration_tests/ci/test.sh $lib $CUDA_MAJOR done diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 26754b1a1b0..e7558511ab2 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -134,12 +134,12 @@ jobs: sha: ${{ inputs.sha }} container_image: "rapidsai/ci-conda:latest" run_script: | - LIBS=$(python/cudf/cudf_pandas_tests/third_party_integration_tests/ci/extract_lib.sh python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml) + LIBS=$(python/cudf/cudf_pandas_third_party_integration_tests/ci/extract_lib.sh python/cudf/cudf_pandas_third_party_integration_tests/dependencies.yaml) LIBS=${LIBS#[} LIBS=${LIBS%]} for lib in ${LIBS//,/ }; do lib=$(echo "$lib" | tr -d '""') echo "Running tests for $lib" 
CUDA_MAJOR=$(if [ "$lib" = "tensorflow" ]; then echo "11"; else echo "12"; fi) - python/cudf/cudf_pandas_tests/third_party_integration_tests/ci/test.sh $lib $CUDA_MAJOR + python/cudf/cudf_pandas_third_party_integration_tests/ci/test.sh $lib $CUDA_MAJOR done diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/ci/ci_run_library_tests.sh b/python/cudf/cudf_pandas_third_party_integration_tests/ci/ci_run_library_tests.sh similarity index 100% rename from python/cudf/cudf_pandas_tests/third_party_integration_tests/ci/ci_run_library_tests.sh rename to python/cudf/cudf_pandas_third_party_integration_tests/ci/ci_run_library_tests.sh diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/ci/extract_lib.sh b/python/cudf/cudf_pandas_third_party_integration_tests/ci/extract_lib.sh similarity index 100% rename from python/cudf/cudf_pandas_tests/third_party_integration_tests/ci/extract_lib.sh rename to python/cudf/cudf_pandas_third_party_integration_tests/ci/extract_lib.sh diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/ci/test.sh b/python/cudf/cudf_pandas_third_party_integration_tests/ci/test.sh similarity index 79% rename from python/cudf/cudf_pandas_tests/third_party_integration_tests/ci/test.sh rename to python/cudf/cudf_pandas_third_party_integration_tests/ci/test.sh index 09caea65004..53c295b7e7a 100755 --- a/python/cudf/cudf_pandas_tests/third_party_integration_tests/ci/test.sh +++ b/python/cudf/cudf_pandas_third_party_integration_tests/ci/test.sh @@ -11,7 +11,7 @@ set -euo pipefail rapids-logger "Generate Python testing dependencies" rapids-dependency-file-generator \ - --config "python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml" \ + --config "python/cudf/cudf_pandas_third_party_integration_tests/dependencies.yaml" \ --output conda \ --file-key test_${LIB} \ --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" | tee env.yaml @@ -28,7 +28,7 @@ echo "The 
working directory is ${PWD}" mkdir -p "${RAPIDS_TESTS_DIR}" repo_root=$(git rev-parse --show-toplevel) -TEST_DIR=${repo_root}/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests +TEST_DIR=${repo_root}/python/cudf/cudf_pandas_third_party_integration_tests/tests rapids-print-env @@ -51,7 +51,7 @@ for serial_library in "${serial_libraries[@]}"; do fi done -RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR} TEST_DIR=${TEST_DIR} NUM_PROCESSES=${NUM_PROCESSES} python/cudf/cudf_pandas_tests/third_party_integration_tests/ci/ci_run_library_tests.sh ${LIB} +RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR} TEST_DIR=${TEST_DIR} NUM_PROCESSES=${NUM_PROCESSES} python/cudf/cudf_pandas_third_party_integration_tests/ci/ci_run_library_tests.sh ${LIB} rapids-logger "Test script exiting with value: ${EXITCODE}" exit ${EXITCODE} diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml b/python/cudf/cudf_pandas_third_party_integration_tests/dependencies.yaml similarity index 100% rename from python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml rename to python/cudf/cudf_pandas_third_party_integration_tests/dependencies.yaml diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/conftest.py b/python/cudf/cudf_pandas_third_party_integration_tests/tests/conftest.py similarity index 100% rename from python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/conftest.py rename to python/cudf/cudf_pandas_third_party_integration_tests/tests/conftest.py diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/pytest.ini b/python/cudf/cudf_pandas_third_party_integration_tests/tests/pytest.ini similarity index 100% rename from python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/pytest.ini rename to python/cudf/cudf_pandas_third_party_integration_tests/tests/pytest.ini diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_cugraph.py 
b/python/cudf/cudf_pandas_third_party_integration_tests/tests/test_cugraph.py similarity index 100% rename from python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_cugraph.py rename to python/cudf/cudf_pandas_third_party_integration_tests/tests/test_cugraph.py diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_cuml.py b/python/cudf/cudf_pandas_third_party_integration_tests/tests/test_cuml.py similarity index 100% rename from python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_cuml.py rename to python/cudf/cudf_pandas_third_party_integration_tests/tests/test_cuml.py diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_dask.py b/python/cudf/cudf_pandas_third_party_integration_tests/tests/test_dask.py similarity index 100% rename from python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_dask.py rename to python/cudf/cudf_pandas_third_party_integration_tests/tests/test_dask.py diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_featureengine.py b/python/cudf/cudf_pandas_third_party_integration_tests/tests/test_featureengine.py similarity index 100% rename from python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_featureengine.py rename to python/cudf/cudf_pandas_third_party_integration_tests/tests/test_featureengine.py diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_holoviews.py b/python/cudf/cudf_pandas_third_party_integration_tests/tests/test_holoviews.py similarity index 100% rename from python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_holoviews.py rename to python/cudf/cudf_pandas_third_party_integration_tests/tests/test_holoviews.py diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_hvplot.py b/python/cudf/cudf_pandas_third_party_integration_tests/tests/test_hvplot.py similarity index 100% rename from 
python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_hvplot.py rename to python/cudf/cudf_pandas_third_party_integration_tests/tests/test_hvplot.py diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_ibis.py b/python/cudf/cudf_pandas_third_party_integration_tests/tests/test_ibis.py similarity index 100% rename from python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_ibis.py rename to python/cudf/cudf_pandas_third_party_integration_tests/tests/test_ibis.py diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_matplotlib.py b/python/cudf/cudf_pandas_third_party_integration_tests/tests/test_matplotlib.py similarity index 100% rename from python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_matplotlib.py rename to python/cudf/cudf_pandas_third_party_integration_tests/tests/test_matplotlib.py diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_numpy.py b/python/cudf/cudf_pandas_third_party_integration_tests/tests/test_numpy.py similarity index 100% rename from python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_numpy.py rename to python/cudf/cudf_pandas_third_party_integration_tests/tests/test_numpy.py diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_plotly.py b/python/cudf/cudf_pandas_third_party_integration_tests/tests/test_plotly.py similarity index 100% rename from python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_plotly.py rename to python/cudf/cudf_pandas_third_party_integration_tests/tests/test_plotly.py diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_pytorch.py b/python/cudf/cudf_pandas_third_party_integration_tests/tests/test_pytorch.py similarity index 96% rename from python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_pytorch.py rename to 
python/cudf/cudf_pandas_third_party_integration_tests/tests/test_pytorch.py index ad287471aa0..ae9db3836a6 100644 --- a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_pytorch.py +++ b/python/cudf/cudf_pandas_third_party_integration_tests/tests/test_pytorch.py @@ -121,6 +121,8 @@ def test_torch_tensor_ctor(): return torch.tensor(s.values) +@pytest.mark.xfail_cudf_pandas(reason="Known failure, see xdf/#210") +@pytest.mark.xfail_compare def test_torch_tensor_from_numpy(): s = pd.Series(range(5)) return torch.from_numpy(s.values) diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_scipy.py b/python/cudf/cudf_pandas_third_party_integration_tests/tests/test_scipy.py similarity index 100% rename from python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_scipy.py rename to python/cudf/cudf_pandas_third_party_integration_tests/tests/test_scipy.py diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_seaborn.py b/python/cudf/cudf_pandas_third_party_integration_tests/tests/test_seaborn.py similarity index 100% rename from python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_seaborn.py rename to python/cudf/cudf_pandas_third_party_integration_tests/tests/test_seaborn.py diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_sklearn.py b/python/cudf/cudf_pandas_third_party_integration_tests/tests/test_sklearn.py similarity index 100% rename from python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_sklearn.py rename to python/cudf/cudf_pandas_third_party_integration_tests/tests/test_sklearn.py diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_stumpy.py b/python/cudf/cudf_pandas_third_party_integration_tests/tests/test_stumpy.py similarity index 100% rename from python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_stumpy.py rename to 
python/cudf/cudf_pandas_third_party_integration_tests/tests/test_stumpy.py diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_stumpy_distributed.py b/python/cudf/cudf_pandas_third_party_integration_tests/tests/test_stumpy_distributed.py similarity index 100% rename from python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_stumpy_distributed.py rename to python/cudf/cudf_pandas_third_party_integration_tests/tests/test_stumpy_distributed.py diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_tensorflow.py b/python/cudf/cudf_pandas_third_party_integration_tests/tests/test_tensorflow.py similarity index 100% rename from python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_tensorflow.py rename to python/cudf/cudf_pandas_third_party_integration_tests/tests/test_tensorflow.py diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_xgboost.py b/python/cudf/cudf_pandas_third_party_integration_tests/tests/test_xgboost.py similarity index 100% rename from python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_xgboost.py rename to python/cudf/cudf_pandas_third_party_integration_tests/tests/test_xgboost.py From ef773c5bb712c12ebdbefc34f979edbdacb76d4e Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Tue, 27 Aug 2024 12:15:02 -0700 Subject: [PATCH 14/24] mrefactor --- .github/workflows/pr.yaml | 6 +++--- .github/workflows/test.yaml | 4 ++-- ci/cudf_pandas_scripts/run_tests.sh | 2 ++ .../third-party-integration}/ci_run_library_tests.sh | 2 +- .../third-party-integration}/extract_lib.sh | 0 .../cudf_pandas_scripts/third-party-integration}/test.sh | 8 ++++---- .../third_party_integration_tests}/dependencies.yaml | 0 .../third_party_integration_tests}/tests/conftest.py | 0 .../third_party_integration_tests}/tests/pytest.ini | 0 .../third_party_integration_tests}/tests/test_cugraph.py | 0 
.../third_party_integration_tests}/tests/test_cuml.py | 0 .../third_party_integration_tests}/tests/test_dask.py | 0 .../tests/test_featureengine.py | 0 .../tests/test_holoviews.py | 0 .../third_party_integration_tests}/tests/test_hvplot.py | 0 .../third_party_integration_tests}/tests/test_ibis.py | 0 .../tests/test_matplotlib.py | 0 .../third_party_integration_tests}/tests/test_numpy.py | 0 .../third_party_integration_tests}/tests/test_plotly.py | 0 .../third_party_integration_tests}/tests/test_pytorch.py | 0 .../third_party_integration_tests}/tests/test_scipy.py | 0 .../third_party_integration_tests}/tests/test_seaborn.py | 0 .../third_party_integration_tests}/tests/test_sklearn.py | 0 .../third_party_integration_tests}/tests/test_stumpy.py | 0 .../tests/test_stumpy_distributed.py | 0 .../tests/test_tensorflow.py | 0 .../third_party_integration_tests}/tests/test_xgboost.py | 0 27 files changed, 12 insertions(+), 10 deletions(-) rename {python/cudf/cudf_pandas_third_party_integration_tests/ci => ci/cudf_pandas_scripts/third-party-integration}/ci_run_library_tests.sh (97%) rename {python/cudf/cudf_pandas_third_party_integration_tests/ci => ci/cudf_pandas_scripts/third-party-integration}/extract_lib.sh (100%) rename {python/cudf/cudf_pandas_third_party_integration_tests/ci => ci/cudf_pandas_scripts/third-party-integration}/test.sh (78%) rename python/cudf/{cudf_pandas_third_party_integration_tests => cudf_pandas_tests/third_party_integration_tests}/dependencies.yaml (100%) rename python/cudf/{cudf_pandas_third_party_integration_tests => cudf_pandas_tests/third_party_integration_tests}/tests/conftest.py (100%) rename python/cudf/{cudf_pandas_third_party_integration_tests => cudf_pandas_tests/third_party_integration_tests}/tests/pytest.ini (100%) rename python/cudf/{cudf_pandas_third_party_integration_tests => cudf_pandas_tests/third_party_integration_tests}/tests/test_cugraph.py (100%) rename python/cudf/{cudf_pandas_third_party_integration_tests => 
cudf_pandas_tests/third_party_integration_tests}/tests/test_cuml.py (100%) rename python/cudf/{cudf_pandas_third_party_integration_tests => cudf_pandas_tests/third_party_integration_tests}/tests/test_dask.py (100%) rename python/cudf/{cudf_pandas_third_party_integration_tests => cudf_pandas_tests/third_party_integration_tests}/tests/test_featureengine.py (100%) rename python/cudf/{cudf_pandas_third_party_integration_tests => cudf_pandas_tests/third_party_integration_tests}/tests/test_holoviews.py (100%) rename python/cudf/{cudf_pandas_third_party_integration_tests => cudf_pandas_tests/third_party_integration_tests}/tests/test_hvplot.py (100%) rename python/cudf/{cudf_pandas_third_party_integration_tests => cudf_pandas_tests/third_party_integration_tests}/tests/test_ibis.py (100%) rename python/cudf/{cudf_pandas_third_party_integration_tests => cudf_pandas_tests/third_party_integration_tests}/tests/test_matplotlib.py (100%) rename python/cudf/{cudf_pandas_third_party_integration_tests => cudf_pandas_tests/third_party_integration_tests}/tests/test_numpy.py (100%) rename python/cudf/{cudf_pandas_third_party_integration_tests => cudf_pandas_tests/third_party_integration_tests}/tests/test_plotly.py (100%) rename python/cudf/{cudf_pandas_third_party_integration_tests => cudf_pandas_tests/third_party_integration_tests}/tests/test_pytorch.py (100%) rename python/cudf/{cudf_pandas_third_party_integration_tests => cudf_pandas_tests/third_party_integration_tests}/tests/test_scipy.py (100%) rename python/cudf/{cudf_pandas_third_party_integration_tests => cudf_pandas_tests/third_party_integration_tests}/tests/test_seaborn.py (100%) rename python/cudf/{cudf_pandas_third_party_integration_tests => cudf_pandas_tests/third_party_integration_tests}/tests/test_sklearn.py (100%) rename python/cudf/{cudf_pandas_third_party_integration_tests => cudf_pandas_tests/third_party_integration_tests}/tests/test_stumpy.py (100%) rename python/cudf/{cudf_pandas_third_party_integration_tests => 
cudf_pandas_tests/third_party_integration_tests}/tests/test_stumpy_distributed.py (100%) rename python/cudf/{cudf_pandas_third_party_integration_tests => cudf_pandas_tests/third_party_integration_tests}/tests/test_tensorflow.py (100%) rename python/cudf/{cudf_pandas_third_party_integration_tests => cudf_pandas_tests/third_party_integration_tests}/tests/test_xgboost.py (100%) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 459c0fc5189..7c7fef53d9a 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -229,7 +229,7 @@ jobs: node_type: cpu4 build_type: pull-request run_script: "ci/cudf_pandas_scripts/pandas-tests/diff.sh" - integration-tests: + third-party-integration-tests-cudf-pandas: needs: wheel-build-cudf secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 @@ -239,12 +239,12 @@ jobs: arch: "amd64" container_image: "rapidsai/ci-conda:latest" run_script: | - LIBS=$(python/cudf/cudf_pandas_third_party_integration_tests/ci/extract_lib.sh python/cudf/cudf_pandas_third_party_integration_tests/dependencies.yaml) + LIBS=$(ci/cudf_pandas_scripts/third-party-integration/extract_lib.sh python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml) LIBS=${LIBS#[} LIBS=${LIBS%]} for lib in ${LIBS//,/ }; do lib=$(echo "$lib" | tr -d '""') echo "Running tests for $lib" CUDA_MAJOR=$(if [ "$lib" = "tensorflow" ]; then echo "11"; else echo "12"; fi) - python/cudf/cudf_pandas_third_party_integration_tests/ci/test.sh $lib $CUDA_MAJOR + ci/cudf_pandas_scripts/third-party-integration/test.sh $lib $CUDA_MAJOR done diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index e7558511ab2..10e0ed2bd9f 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -134,12 +134,12 @@ jobs: sha: ${{ inputs.sha }} container_image: "rapidsai/ci-conda:latest" run_script: | - LIBS=$(python/cudf/cudf_pandas_third_party_integration_tests/ci/extract_lib.sh 
python/cudf/cudf_pandas_third_party_integration_tests/dependencies.yaml) + LIBS=$(ci/cudf_pandas_scripts/third-party-integration/extract_lib.sh python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml) LIBS=${LIBS#[} LIBS=${LIBS%]} for lib in ${LIBS//,/ }; do lib=$(echo "$lib" | tr -d '""') echo "Running tests for $lib" CUDA_MAJOR=$(if [ "$lib" = "tensorflow" ]; then echo "11"; else echo "12"; fi) - python/cudf/cudf_pandas_third_party_integration_tests/ci/test.sh $lib $CUDA_MAJOR + ci/cudf_pandas_scripts/third-party-integration/test.sh $lib $CUDA_MAJOR done diff --git a/ci/cudf_pandas_scripts/run_tests.sh b/ci/cudf_pandas_scripts/run_tests.sh index 39056d58d56..38cc785af00 100755 --- a/ci/cudf_pandas_scripts/run_tests.sh +++ b/ci/cudf_pandas_scripts/run_tests.sh @@ -62,6 +62,7 @@ else fi python -m pytest -p cudf.pandas \ + --ignore=./python/cudf/cudf_pandas_tests/third_party_integration_tests/ \ --cov-config=./python/cudf/.coveragerc \ --cov=cudf \ --cov-report=xml:"${RAPIDS_COVERAGE_DIR}/cudf-pandas-coverage.xml" \ @@ -77,6 +78,7 @@ for version in "${versions[@]}"; do echo "Installing pandas version: ${version}" python -m pip install "numpy>=1.23,<2.0a0" "pandas==${version}" python -m pytest -p cudf.pandas \ + --ignore=./python/cudf/cudf_pandas_tests/third_party_integration_tests/ \ --cov-config=./python/cudf/.coveragerc \ --cov=cudf \ --cov-report=xml:"${RAPIDS_COVERAGE_DIR}/cudf-pandas-coverage.xml" \ diff --git a/python/cudf/cudf_pandas_third_party_integration_tests/ci/ci_run_library_tests.sh b/ci/cudf_pandas_scripts/third-party-integration/ci_run_library_tests.sh similarity index 97% rename from python/cudf/cudf_pandas_third_party_integration_tests/ci/ci_run_library_tests.sh rename to ci/cudf_pandas_scripts/third-party-integration/ci_run_library_tests.sh index d1627d6436b..678ee36b3d9 100755 --- a/python/cudf/cudf_pandas_third_party_integration_tests/ci/ci_run_library_tests.sh +++ 
b/ci/cudf_pandas_scripts/third-party-integration/ci_run_library_tests.sh @@ -4,7 +4,7 @@ # SPDX-License-Identifier: Apache-2.0 cleanup() { - rm tests/results-*.pickle + rm ${TEST_DIR}/results-*.pickle } trap cleanup EXIT diff --git a/python/cudf/cudf_pandas_third_party_integration_tests/ci/extract_lib.sh b/ci/cudf_pandas_scripts/third-party-integration/extract_lib.sh similarity index 100% rename from python/cudf/cudf_pandas_third_party_integration_tests/ci/extract_lib.sh rename to ci/cudf_pandas_scripts/third-party-integration/extract_lib.sh diff --git a/python/cudf/cudf_pandas_third_party_integration_tests/ci/test.sh b/ci/cudf_pandas_scripts/third-party-integration/test.sh similarity index 78% rename from python/cudf/cudf_pandas_third_party_integration_tests/ci/test.sh rename to ci/cudf_pandas_scripts/third-party-integration/test.sh index 53c295b7e7a..6c1c7efe489 100755 --- a/python/cudf/cudf_pandas_third_party_integration_tests/ci/test.sh +++ b/ci/cudf_pandas_scripts/third-party-integration/test.sh @@ -11,7 +11,7 @@ set -euo pipefail rapids-logger "Generate Python testing dependencies" rapids-dependency-file-generator \ - --config "python/cudf/cudf_pandas_third_party_integration_tests/dependencies.yaml" \ + --config "python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml" \ --output conda \ --file-key test_${LIB} \ --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" | tee env.yaml @@ -24,11 +24,11 @@ conda activate test set -u RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${PWD}/test-results"} -echo "The working directory is ${PWD}" + mkdir -p "${RAPIDS_TESTS_DIR}" repo_root=$(git rev-parse --show-toplevel) -TEST_DIR=${repo_root}/python/cudf/cudf_pandas_third_party_integration_tests/tests +TEST_DIR=${repo_root}/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests rapids-print-env @@ -51,7 +51,7 @@ for serial_library in "${serial_libraries[@]}"; do fi done -RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR} 
TEST_DIR=${TEST_DIR} NUM_PROCESSES=${NUM_PROCESSES} python/cudf/cudf_pandas_third_party_integration_tests/ci/ci_run_library_tests.sh ${LIB} +RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR} TEST_DIR=${TEST_DIR} NUM_PROCESSES=${NUM_PROCESSES} ci/cudf_pandas_scripts/third-party-integration/ci_run_library_tests.sh ${LIB} rapids-logger "Test script exiting with value: ${EXITCODE}" exit ${EXITCODE} diff --git a/python/cudf/cudf_pandas_third_party_integration_tests/dependencies.yaml b/python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml similarity index 100% rename from python/cudf/cudf_pandas_third_party_integration_tests/dependencies.yaml rename to python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml diff --git a/python/cudf/cudf_pandas_third_party_integration_tests/tests/conftest.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/conftest.py similarity index 100% rename from python/cudf/cudf_pandas_third_party_integration_tests/tests/conftest.py rename to python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/conftest.py diff --git a/python/cudf/cudf_pandas_third_party_integration_tests/tests/pytest.ini b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/pytest.ini similarity index 100% rename from python/cudf/cudf_pandas_third_party_integration_tests/tests/pytest.ini rename to python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/pytest.ini diff --git a/python/cudf/cudf_pandas_third_party_integration_tests/tests/test_cugraph.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_cugraph.py similarity index 100% rename from python/cudf/cudf_pandas_third_party_integration_tests/tests/test_cugraph.py rename to python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_cugraph.py diff --git a/python/cudf/cudf_pandas_third_party_integration_tests/tests/test_cuml.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_cuml.py 
similarity index 100% rename from python/cudf/cudf_pandas_third_party_integration_tests/tests/test_cuml.py rename to python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_cuml.py diff --git a/python/cudf/cudf_pandas_third_party_integration_tests/tests/test_dask.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_dask.py similarity index 100% rename from python/cudf/cudf_pandas_third_party_integration_tests/tests/test_dask.py rename to python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_dask.py diff --git a/python/cudf/cudf_pandas_third_party_integration_tests/tests/test_featureengine.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_featureengine.py similarity index 100% rename from python/cudf/cudf_pandas_third_party_integration_tests/tests/test_featureengine.py rename to python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_featureengine.py diff --git a/python/cudf/cudf_pandas_third_party_integration_tests/tests/test_holoviews.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_holoviews.py similarity index 100% rename from python/cudf/cudf_pandas_third_party_integration_tests/tests/test_holoviews.py rename to python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_holoviews.py diff --git a/python/cudf/cudf_pandas_third_party_integration_tests/tests/test_hvplot.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_hvplot.py similarity index 100% rename from python/cudf/cudf_pandas_third_party_integration_tests/tests/test_hvplot.py rename to python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_hvplot.py diff --git a/python/cudf/cudf_pandas_third_party_integration_tests/tests/test_ibis.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_ibis.py similarity index 100% rename from python/cudf/cudf_pandas_third_party_integration_tests/tests/test_ibis.py rename to 
python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_ibis.py diff --git a/python/cudf/cudf_pandas_third_party_integration_tests/tests/test_matplotlib.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_matplotlib.py similarity index 100% rename from python/cudf/cudf_pandas_third_party_integration_tests/tests/test_matplotlib.py rename to python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_matplotlib.py diff --git a/python/cudf/cudf_pandas_third_party_integration_tests/tests/test_numpy.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_numpy.py similarity index 100% rename from python/cudf/cudf_pandas_third_party_integration_tests/tests/test_numpy.py rename to python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_numpy.py diff --git a/python/cudf/cudf_pandas_third_party_integration_tests/tests/test_plotly.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_plotly.py similarity index 100% rename from python/cudf/cudf_pandas_third_party_integration_tests/tests/test_plotly.py rename to python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_plotly.py diff --git a/python/cudf/cudf_pandas_third_party_integration_tests/tests/test_pytorch.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_pytorch.py similarity index 100% rename from python/cudf/cudf_pandas_third_party_integration_tests/tests/test_pytorch.py rename to python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_pytorch.py diff --git a/python/cudf/cudf_pandas_third_party_integration_tests/tests/test_scipy.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_scipy.py similarity index 100% rename from python/cudf/cudf_pandas_third_party_integration_tests/tests/test_scipy.py rename to python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_scipy.py diff --git 
a/python/cudf/cudf_pandas_third_party_integration_tests/tests/test_seaborn.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_seaborn.py similarity index 100% rename from python/cudf/cudf_pandas_third_party_integration_tests/tests/test_seaborn.py rename to python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_seaborn.py diff --git a/python/cudf/cudf_pandas_third_party_integration_tests/tests/test_sklearn.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_sklearn.py similarity index 100% rename from python/cudf/cudf_pandas_third_party_integration_tests/tests/test_sklearn.py rename to python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_sklearn.py diff --git a/python/cudf/cudf_pandas_third_party_integration_tests/tests/test_stumpy.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_stumpy.py similarity index 100% rename from python/cudf/cudf_pandas_third_party_integration_tests/tests/test_stumpy.py rename to python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_stumpy.py diff --git a/python/cudf/cudf_pandas_third_party_integration_tests/tests/test_stumpy_distributed.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_stumpy_distributed.py similarity index 100% rename from python/cudf/cudf_pandas_third_party_integration_tests/tests/test_stumpy_distributed.py rename to python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_stumpy_distributed.py diff --git a/python/cudf/cudf_pandas_third_party_integration_tests/tests/test_tensorflow.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_tensorflow.py similarity index 100% rename from python/cudf/cudf_pandas_third_party_integration_tests/tests/test_tensorflow.py rename to python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_tensorflow.py diff --git 
a/python/cudf/cudf_pandas_third_party_integration_tests/tests/test_xgboost.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_xgboost.py similarity index 100% rename from python/cudf/cudf_pandas_third_party_integration_tests/tests/test_xgboost.py rename to python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_xgboost.py From a92ef3f09576b7f5306154c176aa63e4eae85b51 Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Tue, 27 Aug 2024 13:23:58 -0700 Subject: [PATCH 15/24] change job name to match pr.ymal --- .github/workflows/test.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 10e0ed2bd9f..dcd9ade932c 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -124,7 +124,7 @@ jobs: date: ${{ inputs.date }} sha: ${{ inputs.sha }} script: ci/cudf_pandas_scripts/run_tests.sh - integration-tests: + third-party-integration-tests-cudf-pandas: secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 with: From 7245f768f8779876cbc0c25f07b5ae9ab48e4fad Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Tue, 27 Aug 2024 16:36:49 -0400 Subject: [PATCH 16/24] Update pr.yaml --- .github/workflows/pr.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 222e276bd11..19fea53e22b 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -36,7 +36,7 @@ jobs: - unit-tests-cudf-pandas - pandas-tests - pandas-tests-diff - - integration-tests + - third-party-integration-tests-cudf-pandas secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.10 if: always() From 9855a0ceb4cf7d5c77a7699c144c0b473fdfad24 Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Tue, 27 Aug 2024 18:40:43 -0700 Subject: [PATCH 17/24] merge extract_lib.sh 
and test.sh --- .github/workflows/pr.yaml | 10 +-- .github/workflows/test.yaml | 10 +-- .../third-party-integration/test_new.sh | 87 +++++++++++++++++++ 3 files changed, 89 insertions(+), 18 deletions(-) create mode 100644 ci/cudf_pandas_scripts/third-party-integration/test_new.sh diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 222e276bd11..8578a71ce7c 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -307,12 +307,4 @@ jobs: arch: "amd64" container_image: "rapidsai/ci-conda:latest" run_script: | - LIBS=$(ci/cudf_pandas_scripts/third-party-integration/extract_lib.sh python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml) - LIBS=${LIBS#[} - LIBS=${LIBS%]} - for lib in ${LIBS//,/ }; do - lib=$(echo "$lib" | tr -d '""') - echo "Running tests for $lib" - CUDA_MAJOR=$(if [ "$lib" = "tensorflow" ]; then echo "11"; else echo "12"; fi) - ci/cudf_pandas_scripts/third-party-integration/test.sh $lib $CUDA_MAJOR - done + ci/cudf_pandas_scripts/third-party-integration/test_new.sh python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index dcd9ade932c..f499e55a713 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -134,12 +134,4 @@ jobs: sha: ${{ inputs.sha }} container_image: "rapidsai/ci-conda:latest" run_script: | - LIBS=$(ci/cudf_pandas_scripts/third-party-integration/extract_lib.sh python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml) - LIBS=${LIBS#[} - LIBS=${LIBS%]} - for lib in ${LIBS//,/ }; do - lib=$(echo "$lib" | tr -d '""') - echo "Running tests for $lib" - CUDA_MAJOR=$(if [ "$lib" = "tensorflow" ]; then echo "11"; else echo "12"; fi) - ci/cudf_pandas_scripts/third-party-integration/test.sh $lib $CUDA_MAJOR - done + ci/cudf_pandas_scripts/third-party-integration/test_new.sh python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml 
diff --git a/ci/cudf_pandas_scripts/third-party-integration/test_new.sh b/ci/cudf_pandas_scripts/third-party-integration/test_new.sh new file mode 100644 index 00000000000..c0937ceeb7c --- /dev/null +++ b/ci/cudf_pandas_scripts/third-party-integration/test_new.sh @@ -0,0 +1,87 @@ +#!/bin/bash +# Copyright (c) 2023-2024, NVIDIA CORPORATION. + +# Common setup steps shared by Python test jobs + +set -euo pipefail + +write_output() { + local key="$1" + local value="$2" + echo "$key=$value" | tee --append "${GITHUB_OUTPUT:-/dev/null}" +} + +extract_lib_from_dependencies_yaml() { + local file=$1 + # Parse all keys in dependencies.yaml under the "files" section, + # extract all the keys that start with "test_", and extract the rest + local extracted_libs="$(yq -o json $file | jq -rc '.files | with_entries(select(.key | contains("test_"))) | keys | map(sub("^test_"; ""))')" + echo $extracted_libs +} + +main() { + local dependencies_yaml="$1" + + LIBS=$(extract_lib_from_dependencies_yaml "$dependencies_yaml") + LIBS=${LIBS#[} + LIBS=${LIBS%]} + + for lib in ${LIBS//,/ }; do + lib=$(echo "$lib" | tr -d '""') + echo "Running tests for $lib" + + CUDA_MAJOR=$(if [ "$lib" = "tensorflow" ]; then echo "11"; else echo "12"; fi) + + . /opt/conda/etc/profile.d/conda.sh + + rapids-logger "Generate Python testing dependencies" + rapids-dependency-file-generator \ + --config "$dependencies_yaml" \ + --output conda \ + --file-key test_${lib} \ + --matrix "cuda=${CUDA_MAJOR};arch=$(arch);py=${RAPIDS_PY_VERSION}" | tee env.yaml + + rapids-mamba-retry env create --yes -f env.yaml -n test + + # Temporarily allow unbound variables for conda activation. 
+ set +u + conda activate test + set -u + + RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${PWD}/test-results"} + + mkdir -p "${RAPIDS_TESTS_DIR}" + + repo_root=$(git rev-parse --show-toplevel) + TEST_DIR=${repo_root}/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests + + rapids-print-env + + rapids-logger "Check GPU usage" + nvidia-smi + + EXITCODE=0 + trap "EXITCODE=1" ERR + set +e + + rapids-logger "pytest ${lib}" + + NUM_PROCESSES=8 + serial_libraries=( + "tensorflow" + ) + for serial_library in "${serial_libraries[@]}"; do + if [ "${lib}" = "${serial_library}" ]; then + NUM_PROCESSES=1 + fi + done + + RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR} TEST_DIR=${TEST_DIR} NUM_PROCESSES=${NUM_PROCESSES} ci/cudf_pandas_scripts/third-party-integration/ci_run_library_tests.sh ${lib} + + rapids-logger "Test script exiting with value: ${EXITCODE}" + done + + exit ${EXITCODE} +} + +main "$@" From 560eb827ff5e918d300378514a98535463f9a810 Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Wed, 28 Aug 2024 05:15:36 -0700 Subject: [PATCH 18/24] chmod test.sh --- ci/cudf_pandas_scripts/third-party-integration/test_new.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/cudf_pandas_scripts/third-party-integration/test_new.sh b/ci/cudf_pandas_scripts/third-party-integration/test_new.sh index c0937ceeb7c..61370a1dfdb 100644 --- a/ci/cudf_pandas_scripts/third-party-integration/test_new.sh +++ b/ci/cudf_pandas_scripts/third-party-integration/test_new.sh @@ -28,7 +28,7 @@ main() { for lib in ${LIBS//,/ }; do lib=$(echo "$lib" | tr -d '""') - echo "Running tests for $lib" + echo "Running tests for library $lib" CUDA_MAJOR=$(if [ "$lib" = "tensorflow" ]; then echo "11"; else echo "12"; fi) From 609313c0117a5cd76589696efc4839f0d15100e2 Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Wed, 28 Aug 2024 06:46:03 -0700 Subject: [PATCH 19/24] remove extract_lib.sh --- .github/workflows/pr.yaml | 2 +- .github/workflows/test.yaml | 2 +- 
.../third-party-integration/extract_lib.sh | 27 ----- .../third-party-integration/test.sh | 104 +++++++++++------- .../third-party-integration/test_new.sh | 87 --------------- 5 files changed, 69 insertions(+), 153 deletions(-) delete mode 100755 ci/cudf_pandas_scripts/third-party-integration/extract_lib.sh delete mode 100644 ci/cudf_pandas_scripts/third-party-integration/test_new.sh diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 41c71d30a0c..d55207dd0dd 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -307,4 +307,4 @@ jobs: arch: "amd64" container_image: "rapidsai/ci-conda:latest" run_script: | - ci/cudf_pandas_scripts/third-party-integration/test_new.sh python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml + ci/cudf_pandas_scripts/third-party-integration/test.sh python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index f499e55a713..2c68f2861bb 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -134,4 +134,4 @@ jobs: sha: ${{ inputs.sha }} container_image: "rapidsai/ci-conda:latest" run_script: | - ci/cudf_pandas_scripts/third-party-integration/test_new.sh python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml + ci/cudf_pandas_scripts/third-party-integration/test.sh python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml diff --git a/ci/cudf_pandas_scripts/third-party-integration/extract_lib.sh b/ci/cudf_pandas_scripts/third-party-integration/extract_lib.sh deleted file mode 100755 index 67ec5c773bc..00000000000 --- a/ci/cudf_pandas_scripts/third-party-integration/extract_lib.sh +++ /dev/null @@ -1,27 +0,0 @@ -#!/bin/bash -# Copyright (c) 2023-2024, NVIDIA CORPORATION. 
- -set -euo pipefail - -write_output() { - local key="$1" - local value="$2" - echo "$key=$value" | tee --append "${GITHUB_OUTPUT:-/dev/null}" -} - -extract_lib_from_dependencies_yaml() { - local file=$1 - # Parse all keys in dependencies.yaml under the "files" section, - # extract all the keys that starts with "test_", and extract the - # rest - local extracted_libs="$(yq -o json $file | jq -rc '.files | with_entries( select(.key | contains("test_")) ) | keys | map(sub("^test_"; ""))')" - echo $extracted_libs -} - - -main() { - local dependencies_yaml="$1" - extract_lib_from_dependencies_yaml "$dependencies_yaml" -} - -main "$@" diff --git a/ci/cudf_pandas_scripts/third-party-integration/test.sh b/ci/cudf_pandas_scripts/third-party-integration/test.sh index 6c1c7efe489..61370a1dfdb 100755 --- a/ci/cudf_pandas_scripts/third-party-integration/test.sh +++ b/ci/cudf_pandas_scripts/third-party-integration/test.sh @@ -3,55 +3,85 @@ # Common setup steps shared by Python test jobs -LIB=$1 - set -euo pipefail -. /opt/conda/etc/profile.d/conda.sh +write_output() { + local key="$1" + local value="$2" + echo "$key=$value" | tee --append "${GITHUB_OUTPUT:-/dev/null}" +} + +extract_lib_from_dependencies_yaml() { + local file=$1 + # Parse all keys in dependencies.yaml under the "files" section, + # extract all the keys that start with "test_", and extract the rest + local extracted_libs="$(yq -o json $file | jq -rc '.files | with_entries(select(.key | contains("test_"))) | keys | map(sub("^test_"; ""))')" + echo $extracted_libs +} + +main() { + local dependencies_yaml="$1" + + LIBS=$(extract_lib_from_dependencies_yaml "$dependencies_yaml") + LIBS=${LIBS#[} + LIBS=${LIBS%]} + + for lib in ${LIBS//,/ }; do + lib=$(echo "$lib" | tr -d '""') + echo "Running tests for library $lib" + + CUDA_MAJOR=$(if [ "$lib" = "tensorflow" ]; then echo "11"; else echo "12"; fi) + + . 
/opt/conda/etc/profile.d/conda.sh + + rapids-logger "Generate Python testing dependencies" + rapids-dependency-file-generator \ + --config "$dependencies_yaml" \ + --output conda \ + --file-key test_${lib} \ + --matrix "cuda=${CUDA_MAJOR};arch=$(arch);py=${RAPIDS_PY_VERSION}" | tee env.yaml + + rapids-mamba-retry env create --yes -f env.yaml -n test -rapids-logger "Generate Python testing dependencies" -rapids-dependency-file-generator \ - --config "python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml" \ - --output conda \ - --file-key test_${LIB} \ - --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" | tee env.yaml + # Temporarily allow unbound variables for conda activation. + set +u + conda activate test + set -u -rapids-mamba-retry env create --yes -f env.yaml -n test + RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${PWD}/test-results"} -# Temporarily allow unbound variables for conda activation. -set +u -conda activate test -set -u + mkdir -p "${RAPIDS_TESTS_DIR}" -RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${PWD}/test-results"} + repo_root=$(git rev-parse --show-toplevel) + TEST_DIR=${repo_root}/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests -mkdir -p "${RAPIDS_TESTS_DIR}" + rapids-print-env -repo_root=$(git rev-parse --show-toplevel) -TEST_DIR=${repo_root}/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests + rapids-logger "Check GPU usage" + nvidia-smi -rapids-print-env + EXITCODE=0 + trap "EXITCODE=1" ERR + set +e -rapids-logger "Check GPU usage" -nvidia-smi + rapids-logger "pytest ${lib}" -EXITCODE=0 -trap "EXITCODE=1" ERR -set +e + NUM_PROCESSES=8 + serial_libraries=( + "tensorflow" + ) + for serial_library in "${serial_libraries[@]}"; do + if [ "${lib}" = "${serial_library}" ]; then + NUM_PROCESSES=1 + fi + done -rapids-logger "pytest ${LIB}" + RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR} TEST_DIR=${TEST_DIR} NUM_PROCESSES=${NUM_PROCESSES} 
ci/cudf_pandas_scripts/third-party-integration/ci_run_library_tests.sh ${lib} -NUM_PROCESSES=8 -serial_libraries=( - "tensorflow" -) -for serial_library in "${serial_libraries[@]}"; do - if [ "${LIB}" = "${serial_library}" ]; then - NUM_PROCESSES=1 - fi -done + rapids-logger "Test script exiting with value: ${EXITCODE}" + done -RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR} TEST_DIR=${TEST_DIR} NUM_PROCESSES=${NUM_PROCESSES} ci/cudf_pandas_scripts/third-party-integration/ci_run_library_tests.sh ${LIB} + exit ${EXITCODE} +} -rapids-logger "Test script exiting with value: ${EXITCODE}" -exit ${EXITCODE} +main "$@" diff --git a/ci/cudf_pandas_scripts/third-party-integration/test_new.sh b/ci/cudf_pandas_scripts/third-party-integration/test_new.sh deleted file mode 100644 index 61370a1dfdb..00000000000 --- a/ci/cudf_pandas_scripts/third-party-integration/test_new.sh +++ /dev/null @@ -1,87 +0,0 @@ -#!/bin/bash -# Copyright (c) 2023-2024, NVIDIA CORPORATION. - -# Common setup steps shared by Python test jobs - -set -euo pipefail - -write_output() { - local key="$1" - local value="$2" - echo "$key=$value" | tee --append "${GITHUB_OUTPUT:-/dev/null}" -} - -extract_lib_from_dependencies_yaml() { - local file=$1 - # Parse all keys in dependencies.yaml under the "files" section, - # extract all the keys that start with "test_", and extract the rest - local extracted_libs="$(yq -o json $file | jq -rc '.files | with_entries(select(.key | contains("test_"))) | keys | map(sub("^test_"; ""))')" - echo $extracted_libs -} - -main() { - local dependencies_yaml="$1" - - LIBS=$(extract_lib_from_dependencies_yaml "$dependencies_yaml") - LIBS=${LIBS#[} - LIBS=${LIBS%]} - - for lib in ${LIBS//,/ }; do - lib=$(echo "$lib" | tr -d '""') - echo "Running tests for library $lib" - - CUDA_MAJOR=$(if [ "$lib" = "tensorflow" ]; then echo "11"; else echo "12"; fi) - - . 
/opt/conda/etc/profile.d/conda.sh - - rapids-logger "Generate Python testing dependencies" - rapids-dependency-file-generator \ - --config "$dependencies_yaml" \ - --output conda \ - --file-key test_${lib} \ - --matrix "cuda=${CUDA_MAJOR};arch=$(arch);py=${RAPIDS_PY_VERSION}" | tee env.yaml - - rapids-mamba-retry env create --yes -f env.yaml -n test - - # Temporarily allow unbound variables for conda activation. - set +u - conda activate test - set -u - - RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${PWD}/test-results"} - - mkdir -p "${RAPIDS_TESTS_DIR}" - - repo_root=$(git rev-parse --show-toplevel) - TEST_DIR=${repo_root}/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests - - rapids-print-env - - rapids-logger "Check GPU usage" - nvidia-smi - - EXITCODE=0 - trap "EXITCODE=1" ERR - set +e - - rapids-logger "pytest ${lib}" - - NUM_PROCESSES=8 - serial_libraries=( - "tensorflow" - ) - for serial_library in "${serial_libraries[@]}"; do - if [ "${lib}" = "${serial_library}" ]; then - NUM_PROCESSES=1 - fi - done - - RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR} TEST_DIR=${TEST_DIR} NUM_PROCESSES=${NUM_PROCESSES} ci/cudf_pandas_scripts/third-party-integration/ci_run_library_tests.sh ${lib} - - rapids-logger "Test script exiting with value: ${EXITCODE}" - done - - exit ${EXITCODE} -} - -main "$@" From 0922bfe4ea0a26318b97c0a481c02f84f454fa94 Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Wed, 28 Aug 2024 08:53:15 -0700 Subject: [PATCH 20/24] default to 11.8 and 12.5 --- .../third_party_integration_tests/dependencies.yaml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml b/python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml index c894e0cfb2d..05e1d8178d5 100644 --- a/python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml +++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml @@ -135,6 +135,10 @@ 
dependencies: specific: - output_types: conda matrices: + - matrix: + cuda: "11" + packages: + - cuda-version=11.8 - matrix: cuda: "11.8" packages: @@ -151,6 +155,10 @@ dependencies: cuda: "12.5" packages: - cuda-version=12.5 + - matrix: + cuda: "12" + packages: + - cuda-version=12.5 py_version: specific: - output_types: conda From 9561ea2d2fe87151eb7bae80393572b6ecb00478 Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Wed, 28 Aug 2024 15:29:46 -0700 Subject: [PATCH 21/24] remove some pytest flags --- .../third-party-integration/ci_run_library_tests.sh | 3 --- ci/cudf_pandas_scripts/third-party-integration/test.sh | 6 +----- 2 files changed, 1 insertion(+), 8 deletions(-) diff --git a/ci/cudf_pandas_scripts/third-party-integration/ci_run_library_tests.sh b/ci/cudf_pandas_scripts/third-party-integration/ci_run_library_tests.sh index 678ee36b3d9..f082c4dfa58 100755 --- a/ci/cudf_pandas_scripts/third-party-integration/ci_run_library_tests.sh +++ b/ci/cudf_pandas_scripts/third-party-integration/ci_run_library_tests.sh @@ -17,7 +17,6 @@ runtest_gold() { -v \ --continue-on-collection-errors \ --cache-clear \ - --junitxml="${RAPIDS_TESTS_DIR}/junit-${lib}-gold.xml" \ --numprocesses=${NUM_PROCESSES} \ --dist=worksteal \ ${TEST_DIR}/test_${lib}*.py \ @@ -33,7 +32,6 @@ runtest_cudf_pandas() { -v \ --continue-on-collection-errors \ --cache-clear \ - --junitxml="${RAPIDS_TESTS_DIR}/junit-${lib}-cudf-pandas.xml" \ --numprocesses=${NUM_PROCESSES} \ --dist=worksteal \ ${TEST_DIR}/test_${lib}*.py \ @@ -55,7 +53,6 @@ main() { -v \ --continue-on-collection-errors \ --cache-clear \ - --junitxml="${RAPIDS_TESTS_DIR}/junit-${lib}-assertion.xml" \ --numprocesses=${NUM_PROCESSES} \ --dist=worksteal \ ${TEST_DIR}/test_${lib}*.py \ diff --git a/ci/cudf_pandas_scripts/third-party-integration/test.sh b/ci/cudf_pandas_scripts/third-party-integration/test.sh index 61370a1dfdb..89b28c30e39 100755 --- a/ci/cudf_pandas_scripts/third-party-integration/test.sh +++ 
b/ci/cudf_pandas_scripts/third-party-integration/test.sh @@ -48,10 +48,6 @@ main() { conda activate test set -u - RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${PWD}/test-results"} - - mkdir -p "${RAPIDS_TESTS_DIR}" - repo_root=$(git rev-parse --show-toplevel) TEST_DIR=${repo_root}/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests @@ -76,7 +72,7 @@ main() { fi done - RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR} TEST_DIR=${TEST_DIR} NUM_PROCESSES=${NUM_PROCESSES} ci/cudf_pandas_scripts/third-party-integration/ci_run_library_tests.sh ${lib} + TEST_DIR=${TEST_DIR} NUM_PROCESSES=${NUM_PROCESSES} ci/cudf_pandas_scripts/third-party-integration/ci_run_library_tests.sh ${lib} rapids-logger "Test script exiting with value: ${EXITCODE}" done From ef3dc723148f92d683b40d3cc80e03a56b4326b4 Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Wed, 28 Aug 2024 17:58:23 -0700 Subject: [PATCH 22/24] remove test keys --- .../ci_run_library_tests.sh | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/ci/cudf_pandas_scripts/third-party-integration/ci_run_library_tests.sh b/ci/cudf_pandas_scripts/third-party-integration/ci_run_library_tests.sh index f082c4dfa58..54a56508cdc 100755 --- a/ci/cudf_pandas_scripts/third-party-integration/ci_run_library_tests.sh +++ b/ci/cudf_pandas_scripts/third-party-integration/ci_run_library_tests.sh @@ -11,7 +11,6 @@ trap cleanup EXIT runtest_gold() { local lib=$1 - local test_keys=${@:2} pytest \ -v \ @@ -19,13 +18,11 @@ runtest_gold() { --cache-clear \ --numprocesses=${NUM_PROCESSES} \ --dist=worksteal \ - ${TEST_DIR}/test_${lib}*.py \ - ${test_keys} + ${TEST_DIR}/test_${lib}*.py } runtest_cudf_pandas() { local lib=$1 - local test_keys=${@:2} pytest \ -p cudf.pandas \ @@ -34,17 +31,15 @@ runtest_cudf_pandas() { --cache-clear \ --numprocesses=${NUM_PROCESSES} \ --dist=worksteal \ - ${TEST_DIR}/test_${lib}*.py \ - ${test_keys} + ${TEST_DIR}/test_${lib}*.py } main() { local lib=$1 - local test_keys=${@:2} # generation 
phase - runtest_gold ${lib} ${test_keys} - runtest_cudf_pandas ${lib} ${test_keys} + runtest_gold ${lib} + runtest_cudf_pandas ${lib} # assertion phase pytest \ @@ -55,8 +50,7 @@ main() { --cache-clear \ --numprocesses=${NUM_PROCESSES} \ --dist=worksteal \ - ${TEST_DIR}/test_${lib}*.py \ - ${test_keys} + ${TEST_DIR}/test_${lib}*.py } main $@ From 9c10daee1e01e93f769496bf87773e03c44add00 Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Thu, 29 Aug 2024 15:33:02 -0700 Subject: [PATCH 23/24] address review --- .github/workflows/pr.yaml | 11 ----------- ci/cudf_pandas_scripts/run_tests.sh | 1 + 2 files changed, 1 insertion(+), 11 deletions(-) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index d55207dd0dd..c43c523a78e 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -297,14 +297,3 @@ jobs: node_type: cpu4 build_type: pull-request run_script: "ci/cudf_pandas_scripts/pandas-tests/diff.sh" - third-party-integration-tests-cudf-pandas: - needs: wheel-build-cudf - secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 - with: - build_type: pull-request - node_type: "gpu-v100-latest-1" - arch: "amd64" - container_image: "rapidsai/ci-conda:latest" - run_script: | - ci/cudf_pandas_scripts/third-party-integration/test.sh python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml diff --git a/ci/cudf_pandas_scripts/run_tests.sh b/ci/cudf_pandas_scripts/run_tests.sh index bf618a48001..8b85695c861 100755 --- a/ci/cudf_pandas_scripts/run_tests.sh +++ b/ci/cudf_pandas_scripts/run_tests.sh @@ -64,6 +64,7 @@ fi python -m pip install ipykernel python -m ipykernel install --user --name python3 +# The third-party integration tests are ignored because they are run nightly in a separate CI job python -m pytest -p cudf.pandas \ --ignore=./python/cudf/cudf_pandas_tests/third_party_integration_tests/ \ --cov-config=./python/cudf/.coveragerc \ From
1afea54128126a9498fe865df92f7bb4e75970a2 Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Thu, 29 Aug 2024 15:34:22 -0700 Subject: [PATCH 24/24] remove job completely from pr.yaml --- .github/workflows/pr.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index c43c523a78e..35c7e3d95b6 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -36,7 +36,6 @@ jobs: - unit-tests-cudf-pandas - pandas-tests - pandas-tests-diff - - third-party-integration-tests-cudf-pandas secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.10 if: always()