Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add catboost to the third-party integration tests #17267

Merged
merged 10 commits into from
Nov 13, 2024
Merged
12 changes: 12 additions & 0 deletions .github/workflows/pr.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ jobs:
- unit-tests-cudf-pandas
- pandas-tests
- pandas-tests-diff
- third-party-integration-tests-cudf-pandas
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ignore the CI changes. I added them to show the third-party integration tests were passing. I'll delete them once this PR is approved.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry for the ping

secrets: inherit
uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.12
if: always()
Expand Down Expand Up @@ -302,3 +303,14 @@ jobs:
node_type: cpu4
build_type: pull-request
run_script: "ci/cudf_pandas_scripts/pandas-tests/diff.sh"
third-party-integration-tests-cudf-pandas:
needs: wheel-build-cudf
secrets: inherit
uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.12
with:
build_type: pull-request
node_type: "gpu-v100-latest-1"
arch: "amd64"
container_image: "rapidsai/ci-conda:latest"
run_script: |
ci/cudf_pandas_scripts/third-party-integration/test.sh python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,13 @@ files:
- py_version
- test_base
- test_xgboost
test_catboost:
output: none
includes:
- cuda_version
- py_version
- test_base
- test_catboost
test_cuml:
output: none
includes:
Expand Down Expand Up @@ -244,6 +251,16 @@ dependencies:
- pip
- pip:
- xgboost>=2.0.1
test_catboost:
common:
- output_types: conda
packages:
- numpy
- scipy
- scikit-learn
- pip
- pip:
- catboost
Matt711 marked this conversation as resolved.
Show resolved Hide resolved
test_cuml:
common:
- output_types: conda
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

import numpy as np
import pandas as pd
import pytest
from catboost import CatBoostClassifier, CatBoostRegressor, Pool
from sklearn.datasets import make_classification, make_regression

# Module-level, fixed-seed generator used to build the random inputs for the
# categorical-feature test and the parametrized train/test-split cases below.
# The stream is shared, so the data each consumer sees depends on the order
# in which values are drawn from it.
rng = np.random.default_rng(seed=42)


def assert_catboost_equal(expect, got, rtol=1e-7, atol=0.0):
    """Assert that two CatBoost test results are equal.

    The comparison strategy is chosen from the type of ``expect``:
    tuples and lists are compared element-wise (recursively), NumPy
    arrays with ``np.testing.assert_allclose``, pandas DataFrames and
    Series with the matching ``pd.testing`` helper, and anything else
    with ``==``.

    Parameters
    ----------
    expect : object
        Reference result.
    got : object
        Result under test.
    rtol : float, default 1e-7
        Relative tolerance used for array and pandas comparisons.
    atol : float, default 0.0
        Absolute tolerance used for array and pandas comparisons.

    Raises
    ------
    AssertionError
        If the two results differ.
    """
    if isinstance(expect, (tuple, list)):
        assert len(expect) == len(got)
        for e, g in zip(expect, got):
            assert_catboost_equal(e, g, rtol, atol)
    elif isinstance(expect, np.ndarray):
        np.testing.assert_allclose(expect, got, rtol=rtol, atol=atol)
    elif isinstance(expect, pd.DataFrame):
        # Forward the tolerances so pandas results honour the same
        # rtol/atol as arrays instead of pandas' own defaults.
        pd.testing.assert_frame_equal(expect, got, rtol=rtol, atol=atol)
    elif isinstance(expect, pd.Series):
        pd.testing.assert_series_equal(expect, got, rtol=rtol, atol=atol)
    else:
        assert expect == got


# Apply the ``assert_eq`` marker to every test in this module, registering
# ``assert_catboost_equal`` as its ``fn`` argument.
# NOTE(review): presumably the suite's conftest reads this marker to compare
# the values the tests return -- confirm against the conftest.
pytestmark = pytest.mark.assert_eq(fn=assert_catboost_equal)


@pytest.fixture
def regression_data():
    """Provide a synthetic regression dataset as a (DataFrame, Series) pair."""
    features, target = make_regression(
        n_samples=100,
        n_features=10,
        random_state=42,
    )
    feature_frame = pd.DataFrame(features)
    target_series = pd.Series(target)
    return feature_frame, target_series


@pytest.fixture
def classification_data():
    """Provide a synthetic two-class dataset as a (DataFrame, Series) pair."""
    features, labels = make_classification(
        n_samples=100,
        n_features=10,
        n_classes=2,
        random_state=42,
    )
    feature_frame = pd.DataFrame(features)
    label_series = pd.Series(labels)
    return feature_frame, label_series


def test_catboost_regressor_with_dataframe(regression_data):
    """Fit a regressor on DataFrame/Series inputs and return its predictions."""
    features, target = regression_data
    regressor = CatBoostRegressor(iterations=10, verbose=0)
    regressor.fit(features, target)
    return regressor.predict(features)


def test_catboost_regressor_with_numpy(regression_data):
    """Fit a regressor on the ``.values`` of the fixture data and return its predictions."""
    features, target = regression_data
    regressor = CatBoostRegressor(iterations=10, verbose=0)
    regressor.fit(features.values, target.values)
    return regressor.predict(features.values)


def test_catboost_classifier_with_dataframe(classification_data):
    """Fit a classifier on DataFrame/Series inputs and return its predictions."""
    features, labels = classification_data
    classifier = CatBoostClassifier(iterations=10, verbose=0)
    classifier.fit(features, labels)
    return classifier.predict(features)


def test_catboost_classifier_with_numpy(classification_data):
    """Fit a classifier on the ``.values`` of the fixture data and return its predictions."""
    features, labels = classification_data
    classifier = CatBoostClassifier(iterations=10, verbose=0)
    classifier.fit(features.values, labels.values)
    return classifier.predict(features.values)


def test_catboost_with_pool_and_dataframe(regression_data):
    """Train a regressor from a ``Pool`` built on DataFrame/Series inputs.

    Returns the predictions for the same DataFrame the pool was built from.
    """
    features, target = regression_data
    regressor = CatBoostRegressor(iterations=10, verbose=0)
    regressor.fit(Pool(features, target))
    return regressor.predict(features)


def test_catboost_with_pool_and_numpy(regression_data):
    """Train a regressor from a ``Pool`` built on the ``.values`` of the fixture data.

    Returns the predictions for the ``.values`` of the feature frame.
    """
    features, target = regression_data
    regressor = CatBoostRegressor(iterations=10, verbose=0)
    regressor.fit(Pool(features.values, target.values))
    return regressor.predict(features.values)


def test_catboost_with_categorical_features():
    """Fit a classifier on one numeric and one categorical column.

    The inputs are drawn from the module-level ``rng``; the predictions for
    the training features are returned.
    """
    # Draw in this exact order: the generator is shared, so reordering the
    # calls would change the generated data.
    numeric_values = rng.standard_normal(100)
    category_values = rng.choice(["A", "B", "C"], size=100)
    target_values = rng.integers(0, 2, size=100)

    frame = pd.DataFrame(
        {
            "numerical_feature": numeric_values,
            "categorical_feature": category_values,
            "target": target_values,
        }
    )
    features = frame[["numerical_feature", "categorical_feature"]]
    labels = frame["target"]

    classifier = CatBoostClassifier(
        iterations=10,
        verbose=0,
        cat_features=["categorical_feature"],
    )
    classifier.fit(features, labels)
    return classifier.predict(features)


@pytest.mark.parametrize(
    "X, y",
    [
        (
            pd.DataFrame(rng.standard_normal((100, 5))),
            pd.Series(rng.standard_normal(100)),
        ),
        (rng.standard_normal((100, 5)), rng.standard_normal(100)),
    ],
)
def test_catboost_train_test_split(X, y):
    """Split the inputs, fit a regressor on the training part, and predict the rest.

    Returns the lengths of the four split pieces (X_train, X_test, y_train,
    y_test) followed by the predictions for the held-out features.
    """
    from sklearn.model_selection import train_test_split

    # train_test_split yields X_train, X_test, y_train, y_test in that order.
    pieces = train_test_split(X, y, random_state=42)
    X_train, X_test, y_train, _ = pieces

    regressor = CatBoostRegressor(iterations=10, verbose=0)
    regressor.fit(X_train, y_train)
    held_out_predictions = regressor.predict(X_test)

    piece_sizes = tuple(len(piece) for piece in pieces)
    return (*piece_sizes, held_out_predictions)
Loading