Skip to content

Commit

Permalink
Allow missing values in CategoricalMatrix (#281)
Browse files Browse the repository at this point in the history
* Add missing support to categoricals

* Rename functions

* Parametrize missing behavior in constructors

* Return a maskedarray from recover_orig

* Propagate missing_method when indexing

* Add tests

* Template all the things!

* Privatize has_missing attribute

* Add changelog entry

* Add option to treat missing values as a category

* Update changelog

* Raise if the missing category already exists

* Add tests for missing name and raise on existing

* Don't skip tests (they are fast)

* Apply suggestions from review

* Fix indexing

* Fix intercept name in formulas

* Add missing categorical functionality to formulas

* Much cooler handling of missing categoricals
  • Loading branch information
stanmart authored Aug 17, 2023
1 parent 3bec539 commit a830107
Show file tree
Hide file tree
Showing 10 changed files with 657 additions and 163 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ Unreleased

- Add column name and term name metadata to ``MatrixBase`` objects. These are automatically populated when initializing a ``MatrixBase`` from a ``pandas.DataFrame``. In addition, they can be accessed and modified via the ``column_names`` and ``term_names`` properties.
- Add a formula interface for creating tabmat matrices from pandas data frames. See :func:`tabmat.from_formula` for details.
- Add support for missing values in ``CategoricalMatrix`` by either creating a separate category for them or treating them as all-zero rows.

**Other changes:**

Expand Down
134 changes: 104 additions & 30 deletions src/tabmat/categorical_matrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,14 +169,14 @@ def matvec(mat, vec):
from scipy import sparse as sps

from .ext.categorical import (
matvec,
matvec_drop_first,
multiply_drop_first,
sandwich_categorical,
sandwich_categorical_drop_first,
subset_categorical_drop_first,
transpose_matvec,
transpose_matvec_drop_first,
matvec_complex,
matvec_fast,
multiply_complex,
sandwich_categorical_complex,
sandwich_categorical_fast,
subset_categorical_complex,
transpose_matvec_complex,
transpose_matvec_fast,
)
from .ext.split import sandwich_cat_cat, sandwich_cat_dense
from .matrix_base import MatrixBase
Expand Down Expand Up @@ -237,6 +237,17 @@ class CategoricalMatrix(MatrixBase):
drop the first level of the dummy encoding. This allows a CategoricalMatrix
to be used in an unregularized setting.
cat_missing_method: str {'fail'|'zero'|'convert'}, default 'fail'
- if 'fail', raise an error if there are missing values.
- if 'zero', missing values will represent all-zero indicator columns.
- if 'convert', missing values will be converted to the ``cat_missing_name``
category.
cat_missing_name: str, default '(MISSING)'
Name of the category to which missing values will be converted if
``cat_missing_method='convert'``. If this category already exists, an error
will be raised.
dtype:
data type
"""
Expand All @@ -249,15 +260,46 @@ def __init__(
column_name: Optional[str] = None,
term_name: Optional[str] = None,
column_name_format: str = "{name}[{category}]",
cat_missing_method: str = "fail",
cat_missing_name: str = "(MISSING)",
):
if pd.isnull(cat_vec).any():
raise ValueError("Categorical data can't have missing values.")
if cat_missing_method not in ["fail", "zero", "convert"]:
raise ValueError(
"cat_missing_method must be one of 'fail' 'zero' or 'convert', "
f" got {cat_missing_method}"
)
self._missing_method = cat_missing_method
self._missing_category = cat_missing_name

if isinstance(cat_vec, pd.Categorical):
self.cat = cat_vec
else:
self.cat = pd.Categorical(cat_vec)

if pd.isnull(self.cat).any():
if self._missing_method == "fail":
raise ValueError(
"Categorical data can't have missing values "
"if cat_missing_method='fail'."
)

elif self._missing_method == "convert":
if self._missing_category in self.cat.categories:
raise ValueError(
f"Missing category {self._missing_category} already exists."
)

self.cat = self.cat.add_categories([self._missing_category])

self.cat[pd.isnull(self.cat)] = self._missing_category
self._has_missings = False

else:
self._has_missings = True

else:
self._has_missings = False

self.drop_first = drop_first
self.shape = (len(self.cat), len(self.cat.categories) - int(drop_first))
self.indices = self.cat.codes.astype(np.int32)
Expand All @@ -279,7 +321,20 @@ def recover_orig(self) -> np.ndarray:
Test: matrix/test_categorical_matrix::test_recover_orig
"""
return self.cat.categories[self.cat.codes]
orig = self.cat.categories[self.cat.codes].to_numpy()

if self._has_missings:
orig = orig.view(np.ma.MaskedArray)
orig.mask = self.cat.codes == -1
elif (
self._missing_method == "convert"
and self._missing_category in self.cat.categories
):
orig = orig.view(np.ma.MaskedArray)
missing_code = self.cat.categories.get_loc(self._missing_category)
orig.mask = self.cat.codes == missing_code

return orig

def _matvec_setup(
self,
Expand Down Expand Up @@ -335,12 +390,18 @@ def matvec(
if out is None:
out = np.zeros(self.shape[0], dtype=other_m.dtype)

if self.drop_first:
matvec_drop_first(
self.indices, other_m, self.shape[0], cols, self.shape[1], out
if self.drop_first or self._has_missings:
matvec_complex(
self.indices,
other_m,
self.shape[0],
cols,
self.shape[1],
out,
self.drop_first,
)
else:
matvec(self.indices, other_m, self.shape[0], cols, self.shape[1], out)
matvec_fast(self.indices, other_m, self.shape[0], cols, self.shape[1], out)

if is_int:
return out.astype(int)
Expand Down Expand Up @@ -402,12 +463,19 @@ def transpose_matvec(
if cols is not None:
cols = set_up_rows_or_cols(cols, self.shape[1])

if self.drop_first:
transpose_matvec_drop_first(
self.indices, vec, self.shape[1], vec.dtype, rows, cols, out
if self.drop_first or self._has_missings:
transpose_matvec_complex(
self.indices,
vec,
self.shape[1],
vec.dtype,
rows,
cols,
out,
self.drop_first,
)
else:
transpose_matvec(
transpose_matvec_fast(
self.indices, vec, self.shape[1], vec.dtype, rows, cols, out
)

Expand Down Expand Up @@ -438,12 +506,12 @@ def sandwich(
"""
d = np.asarray(d)
rows = set_up_rows_or_cols(rows, self.shape[0])
if self.drop_first:
res_diag = sandwich_categorical_drop_first(
self.indices, d, rows, d.dtype, self.shape[1]
if self.drop_first or self._has_missings:
res_diag = sandwich_categorical_complex(
self.indices, d, rows, d.dtype, self.shape[1], self.drop_first
)
else:
res_diag = sandwich_categorical(
res_diag = sandwich_categorical_fast(
self.indices, d, rows, d.dtype, self.shape[1]
)

Expand Down Expand Up @@ -490,9 +558,9 @@ def getcol(self, i: int) -> SparseMatrix:

def tocsr(self) -> sps.csr_matrix:
"""Return scipy csr representation of matrix."""
if self.drop_first:
nnz, indices, indptr = subset_categorical_drop_first(
self.indices, self.shape[1]
if self.drop_first or self._has_missings:
nnz, indices, indptr = subset_categorical_complex(
self.indices, self.shape[1], self.drop_first
)
return sps.csr_matrix(
(np.ones(nnz, dtype=int), indices, indptr), shape=self.shape
Expand Down Expand Up @@ -549,6 +617,7 @@ def __getitem__(self, item):
dtype=self.dtype,
column_name=self._colname,
column_name_format=self._colname_format,
cat_missing_method=self._missing_method,
)
else:
# return a SparseMatrix if we subset columns
Expand Down Expand Up @@ -576,15 +645,17 @@ def _cross_dense(

res = sandwich_cat_dense(
self.indices,
self.shape[1] + self.drop_first,
self.shape[1],
d,
other,
rows,
R_cols,
is_c_contiguous,
has_missings=self._has_missings,
drop_first=self.drop_first,
)

res = _row_col_indexing(res[self.drop_first :], L_cols, None)
res = _row_col_indexing(res, L_cols, None)
return res

def _cross_categorical(
Expand Down Expand Up @@ -612,6 +683,8 @@ def _cross_categorical(
d.dtype,
self.drop_first,
other.drop_first,
self._has_missings,
other._has_missings,
)

res = _row_col_indexing(res, L_cols, R_cols)
Expand Down Expand Up @@ -642,14 +715,15 @@ def multiply(self, other) -> SparseMatrix:
f"Shapes do not match. Expected length of {self.shape[0]}. Got {len(other)}."
)

if self.drop_first:
if self.drop_first or self._has_missings:
return SparseMatrix(
sps.csr_matrix(
multiply_drop_first(
multiply_complex(
indices=self.indices,
d=np.squeeze(other),
ncols=self.shape[1],
dtype=other.dtype,
drop_first=self.drop_first,
),
shape=self.shape,
)
Expand Down
24 changes: 24 additions & 0 deletions src/tabmat/constructor.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@ def from_pandas(
cat_position: str = "expand",
drop_first: bool = False,
categorical_format: str = "{name}[{category}]",
cat_missing_method: str = "fail",
cat_missing_name: str = "(MISSING)",
) -> MatrixBase:
"""
Transform a pandas.DataFrame into an efficient SplitMatrix. For most users, this
Expand Down Expand Up @@ -58,6 +60,14 @@ def from_pandas(
If true, categoricals variables will have their first category dropped.
This allows multiple categorical variables to be included in an
unregularized model. If False, all categories are included.
cat_missing_method: str {'fail'|'zero'|'convert'}, default 'fail'
How to handle missing values in categorical columns:
- if 'fail', raise an error if there are missing values
- if 'zero', missing values will represent all-zero indicator columns.
- if 'convert', missing values will be converted to the ``cat_missing_name`` category.
cat_missing_name: str, default '(MISSING)'
Name of the category to which missing values will be converted if
``cat_missing_method='convert'``.
Returns
-------
Expand Down Expand Up @@ -87,6 +97,8 @@ def from_pandas(
column_name=colname,
term_name=colname,
column_name_format=categorical_format,
cat_missing_method=cat_missing_method,
cat_missing_name=cat_missing_name,
)
if len(coldata.cat.categories) < cat_threshold:
(
Expand Down Expand Up @@ -207,6 +219,8 @@ def from_formula(
cat_threshold: int = 4,
interaction_separator: str = ":",
categorical_format: str = "{name}[{category}]",
cat_missing_method: str = "fail",
cat_missing_name: str = "(MISSING)",
intercept_name: str = "Intercept",
include_intercept: bool = False,
add_column_for_intercept: bool = True,
Expand Down Expand Up @@ -237,6 +251,14 @@ def from_formula(
categorical_format: str, default "{name}[{category}]"
The format string used to generate the names of categorical variables.
Has to include the placeholders ``{name}`` and ``{category}``.
cat_missing_method: str {'fail'|'zero'|'convert'}, default 'fail'
How to handle missing values in categorical columns:
- if 'fail', raise an error if there are missing values
- if 'zero', missing values will represent all-zero indicator columns.
- if 'convert', missing values will be converted to the ``cat_missing_name`` category.
cat_missing_name: str, default '(MISSING)'
Name of the category to which missing values will be converted if
``cat_missing_method='convert'``.
intercept_name: str, default "Intercept"
The name of the intercept column.
include_intercept: bool, default False
Expand Down Expand Up @@ -274,6 +296,8 @@ def from_formula(
sparse_threshold=sparse_threshold,
cat_threshold=cat_threshold,
add_column_for_intercept=add_column_for_intercept,
cat_missing_method=cat_missing_method,
cat_missing_name=cat_missing_name,
)
result = materializer.get_model_matrix(spec)

Expand Down
Loading

0 comments on commit a830107

Please sign in to comment.