Skip to content

Commit

Permalink
Add catboost to the third-party integration tests (#17267)
Browse files Browse the repository at this point in the history
Closes #15397

Authors:
  - Matthew Murray (https://github.com/Matt711)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Matthew Roeschke (https://github.com/mroeschke)

URL: #17267
  • Loading branch information
Matt711 authored Nov 13, 2024
1 parent 918266a commit 1b045dd
Show file tree
Hide file tree
Showing 2 changed files with 144 additions and 0 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,13 @@ files:
- py_version
- test_base
- test_xgboost
# File entry for the catboost integration tests: `output: none`, combining
# the cuda_version, py_version, test_base and test_catboost dependency sets.
test_catboost:
output: none
includes:
- cuda_version
- py_version
- test_base
- test_catboost
test_cuml:
output: none
includes:
Expand Down Expand Up @@ -244,6 +251,14 @@ dependencies:
- pip
- pip:
- xgboost>=2.0.1
# Packages for the catboost integration tests; listed for conda output only.
test_catboost:
common:
- output_types: conda
packages:
- numpy
- scipy
- scikit-learn
- catboost
test_cuml:
common:
- output_types: conda
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

import numpy as np
import pandas as pd
import pytest
from catboost import CatBoostClassifier, CatBoostRegressor, Pool
from sklearn.datasets import make_classification, make_regression

# Seeded generator for the synthetic data built at module level (the
# parametrize arguments below) and inside the categorical-feature test.
rng = np.random.default_rng(seed=42)


def assert_catboost_equal(expect, got, rtol=1e-7, atol=0.0):
    """Assert that ``got`` matches ``expect``, dispatching on ``expect``'s type.

    Parameters
    ----------
    expect
        Reference value. Lists/tuples are compared element by element
        (recursively), NumPy arrays with ``np.testing.assert_allclose``,
        pandas DataFrames/Series with the matching ``pd.testing`` helper,
        and anything else with ``==``.
    got
        Value under test.
    rtol, atol
        Tolerances forwarded to ``np.testing.assert_allclose``. They only
        affect ndarray comparisons; the pandas helpers are called with
        their own default settings.

    Raises
    ------
    AssertionError
        If the two values differ (including sequences of unequal length).
    """
    if isinstance(expect, pd.Series):
        pd.testing.assert_series_equal(expect, got)
        return
    if isinstance(expect, pd.DataFrame):
        pd.testing.assert_frame_equal(expect, got)
        return
    if isinstance(expect, np.ndarray):
        np.testing.assert_allclose(expect, got, rtol=rtol, atol=atol)
        return
    if not isinstance(expect, (list, tuple)):
        # Scalars and any other type fall back to plain equality.
        assert expect == got
        return

    # Sequences: same length, then pairwise recursive comparison.
    assert len(expect) == len(got)
    for expected_item, actual_item in zip(expect, got):
        assert_catboost_equal(
            expected_item, actual_item, rtol=rtol, atol=atol
        )


# Apply the `assert_eq` mark to every test in this module, passing
# `assert_catboost_equal` as its `fn` argument.
# NOTE(review): the mark's handling lives outside this file; presumably it
# uses `fn` to compare the values the tests return — confirm in conftest.
pytestmark = pytest.mark.assert_eq(fn=assert_catboost_equal)


@pytest.fixture
def regression_data():
    """Return a synthetic regression problem as ``(DataFrame, Series)``.

    The data comes from ``make_regression`` with 100 samples, 10 features
    and a fixed ``random_state`` of 42.
    """
    features, target = make_regression(
        n_samples=100,
        n_features=10,
        random_state=42,
    )
    feature_frame = pd.DataFrame(features)
    target_series = pd.Series(target)
    return feature_frame, target_series


@pytest.fixture
def classification_data():
    """Return a synthetic two-class problem as ``(DataFrame, Series)``.

    The data comes from ``make_classification`` with 100 samples,
    10 features, 2 classes and a fixed ``random_state`` of 42.
    """
    settings = {
        "n_samples": 100,
        "n_features": 10,
        "n_classes": 2,
        "random_state": 42,
    }
    features, labels = make_classification(**settings)
    return pd.DataFrame(features), pd.Series(labels)


def test_catboost_regressor_with_dataframe(regression_data):
    """Fit a CatBoostRegressor on pandas inputs and return its predictions.

    The predictions on the training features are returned rather than
    asserted on here.
    """
    features, target = regression_data
    regressor = CatBoostRegressor(iterations=10, verbose=0)
    regressor.fit(features, target)
    return regressor.predict(features)


def test_catboost_regressor_with_numpy(regression_data):
    """Fit a CatBoostRegressor on ``.values`` arrays and return predictions.

    The fixture's DataFrame/Series are converted through ``.values`` before
    being handed to catboost; predictions on the training features are
    returned rather than asserted on here.
    """
    frame, series = regression_data
    regressor = CatBoostRegressor(iterations=10, verbose=0)
    regressor.fit(frame.values, series.values)
    return regressor.predict(frame.values)


def test_catboost_classifier_with_dataframe(classification_data):
    """Fit a CatBoostClassifier on pandas inputs and return its predictions.

    The predicted labels for the training features are returned rather
    than asserted on here.
    """
    features, labels = classification_data
    classifier = CatBoostClassifier(iterations=10, verbose=0)
    classifier.fit(features, labels)
    return classifier.predict(features)


def test_catboost_classifier_with_numpy(classification_data):
    """Fit a CatBoostClassifier on ``.values`` arrays and return predictions.

    The fixture's DataFrame/Series are converted through ``.values`` before
    being handed to catboost; the predicted labels for the training
    features are returned rather than asserted on here.
    """
    frame, series = classification_data
    classifier = CatBoostClassifier(iterations=10, verbose=0)
    classifier.fit(frame.values, series.values)
    return classifier.predict(frame.values)


def test_catboost_with_pool_and_dataframe(regression_data):
    """Train a CatBoostRegressor from a ``Pool`` built on pandas inputs.

    The model is fitted on the pool and then asked to predict on the
    original DataFrame; those predictions are returned.
    """
    features, target = regression_data
    regressor = CatBoostRegressor(iterations=10, verbose=0)
    regressor.fit(Pool(features, target))
    return regressor.predict(features)


def test_catboost_with_pool_and_numpy(regression_data):
    """Train a CatBoostRegressor from a ``Pool`` built on ``.values`` arrays.

    The model is fitted on the pool and then asked to predict on the
    feature array obtained through ``.values``; those predictions are
    returned.
    """
    frame, series = regression_data
    training_pool = Pool(frame.values, series.values)
    regressor = CatBoostRegressor(iterations=10, verbose=0)
    regressor.fit(training_pool)
    return regressor.predict(frame.values)


def test_catboost_with_categorical_features():
    """Fit a CatBoostClassifier on one numeric and one string-valued column.

    A 100-row frame is drawn from the module-level ``rng``: a standard
    normal feature, a categorical feature chosen from "A"/"B"/"C", and a
    0/1 integer target. The categorical column is declared through
    ``cat_features`` and the predictions on the training frame are returned.
    """
    # Draw order matters: the shared generator is consumed sequentially.
    numeric_column = rng.standard_normal(100)
    category_column = rng.choice(["A", "B", "C"], size=100)
    target_column = rng.integers(0, 2, size=100)

    frame = pd.DataFrame(
        {
            "numerical_feature": numeric_column,
            "categorical_feature": category_column,
            "target": target_column,
        }
    )
    features = frame[["numerical_feature", "categorical_feature"]]
    labels = frame["target"]

    classifier = CatBoostClassifier(
        iterations=10,
        verbose=0,
        cat_features=["categorical_feature"],
    )
    classifier.fit(features, labels)
    return classifier.predict(features)


@pytest.mark.parametrize(
    "X, y",
    [
        # pandas containers
        (
            pd.DataFrame(rng.standard_normal((100, 5))),
            pd.Series(rng.standard_normal(100)),
        ),
        # plain NumPy arrays
        (rng.standard_normal((100, 5)), rng.standard_normal(100)),
    ],
)
def test_catboost_train_test_split(X, y):
    """Split the data with scikit-learn, fit on the train part, predict on test.

    Returns the four split lengths (X_train, X_test, y_train, y_test, in
    that order) followed by the predictions for the test features.
    """
    from sklearn.model_selection import train_test_split

    train_X, holdout_X, train_y, holdout_y = train_test_split(
        X, y, random_state=42
    )

    regressor = CatBoostRegressor(iterations=10, verbose=0)
    regressor.fit(train_X, train_y)
    holdout_predictions = regressor.predict(holdout_X)

    split_sizes = (
        len(train_X),
        len(holdout_X),
        len(train_y),
        len(holdout_y),
    )
    return (*split_sizes, holdout_predictions)

0 comments on commit 1b045dd

Please sign in to comment.