Skip to content

Commit

Permalink
Support for Pandas Dataframe and Series (#127)
Browse files Browse the repository at this point in the history
This PR is attempting to add support for Pandas Dataframes and Series
directly.

Resolves #125
  • Loading branch information
kspieks authored Jun 12, 2023
2 parents ff81acf + 6f032ea commit 34ca06d
Show file tree
Hide file tree
Showing 6 changed files with 310 additions and 117 deletions.
6 changes: 6 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,12 @@ __Note for Windows Powershell or MacOS Catalina or newer__: On these systems the
## Using `astartes`
`astartes` is designed as a drop-in replacement for `sklearn`'s `train_test_split` function. To switch to `astartes`, change `from sklearn.model_selection import train_test_split` to `from astartes import train_test_split`.

Like `sklearn`, `astartes` accepts any iterable object as `X`, `y`, and `labels`.
Each will be converted to a `numpy` array for internal operations, and returned as a `numpy` array with limited exceptions: if `X` is a `pandas` `DataFrame`, `y` is a `Series`, or `labels` is a `Series`, `astartes` will cast it back to its original type including its index and column names.

> **Note**
> The developers recommend passing `X`, `y`, and `labels` as `numpy` arrays and handling the conversion to and from other types explicitly on your own. Behind-the-scenes type casting can lead to unexpected behavior!
By default, `astartes` will split data randomly. Additionally, a variety of algorithmic sampling approaches can be used by specifying the `sampler` argument to the function:

```python
Expand Down
117 changes: 43 additions & 74 deletions astartes/main.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,19 @@
from math import floor
from typing import Union
from warnings import warn

import numpy as np
import pandas as pd

from astartes.samplers import (
IMPLEMENTED_EXTRAPOLATION_SAMPLERS,
IMPLEMENTED_INTERPOLATION_SAMPLERS,
)
from astartes.utils.convert_to_array import convert_to_array
from astartes.utils.array_type_helpers import (
convert_to_array,
panda_handla,
return_helper,
)
from astartes.utils.exceptions import InvalidConfigurationError
from astartes.utils.sampler_factory import SamplerFactory
from astartes.utils.warnings import ImperfectSplittingWarning, NormalizationWarning
Expand All @@ -17,9 +23,9 @@


def train_val_test_split(
X: np.array,
y: np.array = None,
labels: np.array = None,
X: Union[np.array, pd.DataFrame],
y: Union[np.array, pd.Series] = None,
labels: Union[np.array, pd.Series] = None,
train_size: float = 0.8,
val_size: float = 0.1,
test_size: float = 0.1,
Expand All @@ -31,9 +37,9 @@ def train_val_test_split(
"""Deterministic train_test_splitting of arbitrary arrays.
Args:
X (np.array): Numpy array of feature vectors.
y (np.array, optional): Targets corresponding to X, must be of same size. Defaults to None.
labels (np.array, optional): Labels corresponding to X, must be of same size. Defaults to None.
X (np.array, pd.DataFrame): Numpy array or pandas DataFrame of feature vectors.
y (np.array, pd.Series, optional): Targets corresponding to X, must be of same size. Defaults to None.
labels (np.array, pd.Series, optional): Labels corresponding to X, must be of same size. Defaults to None.
train_size (float, optional): Fraction of dataset to use in training set. Defaults to 0.8.
val_size (float, optional): Fraction of dataset to use in validation set. Defaults to 0.1.
test_size (float, optional): Fraction of dataset to use in test set. Defaults to 0.1.
Expand All @@ -43,15 +49,20 @@ def train_val_test_split(
return_indices (bool, optional): True to return indices of train/test after values. Defaults to False.
Returns:
np.array: X, y, and labels train/val/test data, or indices.
np.array(s): X, y, and labels train/val/test data, or indices.
"""
# special case for casting back to pandas
output_is_pandas = panda_handla(X, y, labels)

# now convert everything to numpy arrays for our internal algorithms
if type(X) is not np.ndarray:
X = convert_to_array(X, "X")
if y is not None and type(y) is not np.ndarray:
y = convert_to_array(y, "y")
if labels is not None and type(labels) is not np.ndarray:
labels = convert_to_array(labels, "labels")

# check for consistent length after conversion
msg = ""
if y is not None and len(y) != len(X):
msg += "len(y)={:d} ".format(len(y))
Expand Down Expand Up @@ -80,6 +91,7 @@ def train_val_test_split(
val_size,
train_size,
return_indices,
output_is_pandas,
)
else:
return _extrapolative_sampling(
Expand All @@ -88,6 +100,7 @@ def train_val_test_split(
val_size,
train_size,
return_indices,
output_is_pandas,
random_state,
)

Expand Down Expand Up @@ -139,6 +152,7 @@ def _extrapolative_sampling(
val_size,
train_size,
return_indices,
output_is_pandas,
random_state,
):
"""Helper function to perform extrapolative sampling.
Expand All @@ -153,10 +167,12 @@ def _extrapolative_sampling(
val_size (float): Fraction of data to use in val.
train_size (float): Fraction of data to use in train.
return_indices (bool): Return indices or the arrays themselves.
output_is_pandas (array[str] or bool): True/False if output should cast to pandas,
data needed to perform casting if True.
random_state (int, optional): The random state used to shuffle small clusters. Default to no shuffle.
Returns:
calls: _return_helper
calls: return_helper
"""
# calculate "goal" splitting sizes
n_test_samples = floor(len(sampler_instance.X) * test_size)
Expand Down Expand Up @@ -194,8 +210,13 @@ def _extrapolative_sampling(
_check_actual_split(
train_idxs, val_idxs, test_idxs, train_size, val_size, test_size
)
return _return_helper(
sampler_instance, train_idxs, val_idxs, test_idxs, return_indices
return return_helper(
sampler_instance,
train_idxs,
val_idxs,
test_idxs,
return_indices,
output_is_pandas,
)


Expand All @@ -205,6 +226,7 @@ def _interpolative_sampling(
val_size,
train_size,
return_indices,
output_is_pandas,
):
"""Helper function to perform interpolative sampling.
Expand All @@ -218,9 +240,11 @@ def _interpolative_sampling(
val_size (float): Fraction of data to use in val.
train_size (float): Fraction of data to use in train.
return_indices (bool): Return indices or the arrays themselves.
output_is_pandas (array[str] or bool): True/False if output should cast to pandas,
data needed to perform casting if True.
Returns:
calls: _return_helper
calls: return_helper
"""
n_train_samples = floor(len(sampler_instance.X) * train_size)
n_val_samples = floor(len(sampler_instance.X) * val_size)
Expand All @@ -233,71 +257,16 @@ def _interpolative_sampling(
_check_actual_split(
train_idxs, val_idxs, test_idxs, train_size, val_size, test_size
)
return _return_helper(
sampler_instance, train_idxs, val_idxs, test_idxs, return_indices
return return_helper(
sampler_instance,
train_idxs,
val_idxs,
test_idxs,
return_indices,
output_is_pandas,
)


def _return_helper(
    sampler_instance,
    train_idxs,
    val_idxs,
    test_idxs,
    return_indices,
):
    """Convenience function to return the requested arrays appropriately.

    The output is built in a fixed order: X, then y (if not None), then
    labels (if not None), then clusters (if the sampler reports any), then
    optionally the indices themselves. Each group is emitted as
    train, [val], test, where the val entry is present only when
    ``val_idxs`` is non-empty.

    Args:
        sampler_instance (sampler): The fit sampler instance. Its ``X``, ``y``
            and ``labels`` attributes and the result of ``get_clusters()``
            are indexed with the index arrays below.
        train_idxs (np.array): Indices of the samples in the training set.
        val_idxs (np.array): Indices of the samples in the validation set,
            may be empty.
        test_idxs (np.array): Indices of the samples in the test set.
        return_indices (bool): Return indices after the value arrays.

    Returns:
        tuple: Either many arrays or indices in arrays.
    """
    # Decide once whether a validation split exists. The length (not the
    # truthiness of the values) is what matters: a validation set made up of
    # only sample 0 is still a validation set, and ``val_idxs.any()`` would
    # wrongly report it as absent.
    has_val = len(val_idxs) > 0

    def _split(values):
        """Slice ``values`` into its train, [val], test pieces."""
        pieces = [values[train_idxs]]
        if has_val:
            pieces.append(values[val_idxs])
        pieces.append(values[test_idxs])
        return pieces

    out = _split(sampler_instance.X)
    if sampler_instance.y is not None:
        out.extend(_split(sampler_instance.y))
    if sampler_instance.labels is not None:
        out.extend(_split(sampler_instance.labels))

    clusters = sampler_instance.get_clusters()
    if len(clusters):  # true when the list has been filled
        out.extend(_split(clusters))

    if return_indices:
        out.append(train_idxs)
        if has_val:
            out.append(val_idxs)
        out.append(test_idxs)
    return tuple(out)


def _check_actual_split(
train_idxs,
val_idxs,
Expand Down
Loading

0 comments on commit 34ca06d

Please sign in to comment.