Commit

Merge branch 'develop'
PGijsbers committed Nov 7, 2019
2 parents d73b9a8 + 268b655 commit c424d38
Showing 8 changed files with 59 additions and 23 deletions.
15 changes: 15 additions & 0 deletions docs/source/releases.rst
@@ -1,6 +1,21 @@
Release Notes
=============

Version 19.11.0
---------------
Features:
- `gama.__version__` can now be used to retrieve gama's version.
- `fit_arff`, `score_arff` and `predict_arff` now accept a `target_column` parameter to specify the target.
If left unset, the last column of the ARFF file is assumed to be the target column.

Bugfixes:
- `fit(x, y)` may now be called with `y` as an (N, 1) array.
- Ensemble post-processing is now compatible with non-zero-indexed class labels.

Maintenance:
- `__version__.py` is now the only place with a hard-coded version.
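
As a quick illustration of the items above (a minimal sketch; `load_digits` is only a stand-in dataset and is not part of this release):

    import gama
    from gama import GamaClassifier
    from sklearn.datasets import load_digits

    print(gama.__version__)       # version string is now exposed at package level

    X, y = load_digits(return_X_y=True)
    clf = GamaClassifier()        # default settings
    clf.fit(X, y.reshape(-1, 1))  # an (N, 1) y is now accepted and flattened internally

A sketch of the new `target_column` argument for the ARFF methods appears with the gama/gama.py changes below.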


Version 19.08.0
---------------
- Prototype dash app for visualizing GAMA logs.
1 change: 1 addition & 0 deletions gama/__init__.py
@@ -1,5 +1,6 @@
from .GamaClassifier import GamaClassifier
from .GamaRegressor import GamaRegressor
from .__version__ import __version__

name = "gama"

2 changes: 2 additions & 0 deletions gama/__version__.py
@@ -0,0 +1,2 @@
# We employ YY.0M.micro scheme in 2019. In 2020 we move to YY.minor.micro.
__version__ = '19.11.0'
15 changes: 9 additions & 6 deletions gama/data.py
@@ -1,5 +1,5 @@
""" This module contains functions for loading data. """
from typing import Tuple
from typing import Tuple, Optional

import arff
import pandas as pd
@@ -19,6 +19,9 @@ def arff_to_pandas(file_path: str) -> pd.DataFrame:
A dataframe of the data in the ARFF file,
with categorical columns having category dtype.
"""
if not isinstance(file_path, str):
raise TypeError(f"`file_path` must be of type `str` but is of type {type(file_path)}")

with open(file_path, 'r') as arff_file:
arff_dict = arff.load(arff_file)

@@ -31,17 +34,17 @@ def arff_to_pandas(file_path: str) -> pd.DataFrame:
return data


def X_y_from_arff(file_path: str, split_column: str = 'last') -> Tuple[pd.DataFrame, pd.Series]:
def X_y_from_arff(file_path: str, split_column: Optional[str] = None) -> Tuple[pd.DataFrame, pd.Series]:
""" Load data from the ARFF file into pandas DataFrame and specified column to pd.Series. "
Parameters
----------
file_path: str
path to the ARFF file.
split_column: str (default='last')
split_column: str, optional (default=None)
Column to split and return separately (e.g. target column).
Value should either match a column name or 'last'.
If 'last' is specified, the last column is returned separately.
Value should either match a column name or None.
If None is specified, the last column is returned separately.
Returns
-------
@@ -50,7 +53,7 @@ def X_y_from_arff(file_path: str, split_column: str = 'last') -> Tuple[pd.DataFr
"""
data = arff_to_pandas(file_path)

if split_column == 'last':
if split_column is None:
return data.iloc[:, :-1], data.iloc[:, -1]
elif split_column in data.columns:
return data.loc[:, data.columns != split_column], data.loc[:, split_column]
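
A usage sketch of the updated function (the path 'data/iris.arff' and the column name 'class' are hypothetical placeholders):

    from gama.data import X_y_from_arff

    # Default behaviour: the last column is returned as y, the remaining columns as X.
    X, y = X_y_from_arff('data/iris.arff')

    # Named split column: that column is returned as y, all other columns as X.
    X, y = X_y_from_arff('data/iris.arff', split_column='class')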
33 changes: 20 additions & 13 deletions gama/gama.py
@@ -21,6 +21,7 @@
from gama.utilities.metrics import scoring_to_metric
from .utilities.observer import Observer

from gama.__version__ import __version__
from gama.data import X_y_from_arff
from gama.search_methods.async_ea import AsyncEA
from gama.utilities.generic.timekeeper import TimeKeeper
@@ -42,7 +43,6 @@
STR_NO_OPTIMAL_PIPELINE = """Gama did not yet establish an optimal pipeline.
This can be because `fit` was not yet called, or
did not terminate successfully."""
__version__ = '19.01.0'

for module_to_ignore in ["sklearn", "numpy"]:
warnings.filterwarnings("ignore", module=module_to_ignore)
@@ -202,23 +202,24 @@ def predict(self, x: Union[pd.DataFrame, np.ndarray]):
x[col] = x[col].astype(self._X[col].dtype)
return self._predict(x)

def predict_arff(self, arff_file_path: str):
def predict_arff(self, arff_file_path: str, target_column: Optional[str] = None) -> np.ndarray:
""" Predict the target for input found in the ARFF file.
Parameters
----------
arff_file_path: str
An ARFF file with the same columns as the one used in fit.
The target column is ignored (but must be present).
Target column must be present in file, but its values are ignored (can be '?').
target_column: str, optional (default=None)
Specifies which column the model should predict.
If left None, the last column is taken to be the target.
Returns
-------
numpy.ndarray
array with predictions for each row in the ARFF file.
"""
if not isinstance(arff_file_path, str):
raise TypeError(f"`arff_file_path` must be of type `str` but is of type {type(arff_file_path)}")
X, _ = X_y_from_arff(arff_file_path)
X, _ = X_y_from_arff(arff_file_path, split_column=target_column)
return self._predict(X)

def score(self, x: Union[pd.DataFrame, np.ndarray], y: Union[pd.Series, np.ndarray]) -> float:
@@ -239,32 +240,38 @@ def score(self, x: Union[pd.DataFrame, np.ndarr
predictions = self.predict_proba(x) if self._metrics[0].requires_probabilities else self.predict(x)
return self._metrics[0].score(y, predictions)

def score_arff(self, arff_file_path: str) -> float:
def score_arff(self, arff_file_path: str, target_column: Optional[str] = None) -> float:
""" Calculate the score of the model according to the `scoring` metric and input in the ARFF file.
Parameters
----------
arff_file_path: string
arff_file_path: str
An ARFF file with which to calculate the score.
target_column: str, optional (default=None)
Specifies which column the model should predict.
If left None, the last column is taken to be the target.
Returns
-------
float
The score obtained on the given test data according to the `scoring` metric.
"""
X, y = X_y_from_arff(arff_file_path)
X, y = X_y_from_arff(arff_file_path, split_column=target_column)
return self.score(X, y)

def fit_arff(self, arff_file_path: str, *args, **kwargs):
def fit_arff(self, arff_file_path: str, target_column: Optional[str] = None, *args, **kwargs):
""" Find and fit a model to predict the target column (last) from other columns.
Parameters
----------
arff_file_path: string
arff_file_path: str
Path to an ARFF file containing the training data.
The last column is always taken to be the target.
target_column: str, optional (default=None)
Specifies which column the model should predict.
If left None, the last column is taken to be the target.
"""
X, y = X_y_from_arff(arff_file_path)
X, y = X_y_from_arff(arff_file_path, split_column=target_column)
self.fit(X, y, *args, **kwargs)

def fit(self,
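
A short sketch of the three ARFF entry points with the new `target_column` argument (the file names and the column name 'class' below are placeholders):

    from gama import GamaClassifier

    clf = GamaClassifier()
    clf.fit_arff('train.arff', target_column='class')
    score = clf.score_arff('test.arff', target_column='class')
    # For predict_arff the target column must be present in the file,
    # but its values are ignored (they may be '?').
    predictions = clf.predict_arff('new_data.arff', target_column='class')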
8 changes: 6 additions & 2 deletions gama/utilities/preprocessing.py
@@ -86,8 +86,12 @@ def format_x_y(x: Union[pd.DataFrame, np.ndarray], y: Union[pd.DataFrame, pd.Ser

if isinstance(x, np.ndarray):
x = heuristic_numpy_to_dataframe(x)
if isinstance(y, np.ndarray) and y.ndim == 2 and y.shape[1] > 1:
y = np.argmax(y, axis=1)
if isinstance(y, np.ndarray) and y.ndim == 2:
# Either an indicator matrix or a column vector that should be flattened.
if y.shape[1] > 1:
y = np.argmax(y, axis=1)
else:
y = y.squeeze()

if y_type == pd.Series:
if isinstance(y, pd.DataFrame):
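
The two 2-D cases handled above can be illustrated in isolation (a standalone numpy sketch of the same logic, not a call into gama):

    import numpy as np

    y_indicator = np.array([[1, 0, 0], [0, 0, 1]])  # one-hot indicator matrix
    y_column = np.array([[2], [0]])                 # (N, 1) column vector

    np.argmax(y_indicator, axis=1)  # -> array([0, 2]); class index per row
    y_column.squeeze()              # -> array([2, 0]); flattened to shape (N,)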
5 changes: 4 additions & 1 deletion setup.py
@@ -3,6 +3,9 @@

from setuptools import setup, find_packages

with open("gama/__version__.py", 'r') as fh:
version = fh.readlines()[-1].split()[-1].strip("\"'")

requirements = [
'numpy>=1.14.0',
'scipy>=1.0.0',
@@ -37,7 +40,7 @@

setup(
name='gama',
version='19.08.0',
version=version,
description='A package for automated machine learning based on scikit-learn.',
long_description=README,
long_description_content_type='text/markdown',
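
The version-parsing step added at the top of setup.py can be checked in isolation; this is just the same expression applied to the last line of gama/__version__.py shown earlier:

    line = "__version__ = '19.11.0'"          # last line of gama/__version__.py
    version = line.split()[-1].strip("\"'")   # -> '19.11.0'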
3 changes: 2 additions & 1 deletion tests/unit/test_preprocessing.py
@@ -14,7 +14,8 @@ def well_formatted_x_y(x, y, y_type):
X_np, y_np = load_digits(return_X_y=True)
X_df, y_df = pd.DataFrame(X_np), pd.DataFrame(y_np)
y_series = pd.Series(y_np)
y_2d = y_np.reshape(-1, 1)

for X, y in itertools.product([X_np, X_df], [y_np, y_series, y_df]):
for X, y in itertools.product([X_np, X_df], [y_np, y_series, y_df, y_2d]):
well_formatted_x_y(*format_x_y(X, y), y_type=pd.Series)
well_formatted_x_y(*format_x_y(X, y, y_type=pd.DataFrame), y_type=pd.DataFrame)
