From a830107c145b882fe72e076d1b122df231dd9fca Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Thu, 17 Aug 2023 11:54:03 +0200 Subject: [PATCH] Allow missing values in `CategoricalMatrix` (#281) * Add missing support to categoricals * Rename functions * Parametrize missing behavior in constructors * Return a maskedarray from recover_orig * Propagate missing_method when indexing * Add tests * Template all the things! * Privatize has_missing attribute * Add changelog entry * Add option to treat missing values as a category * Update changelog * Raise if the missing category already exists * Add tests for missing name and raise on existing * Don't skip tests (they are fast) * Apply suggestions from review * Fix indxing * Fix intercept name in formulas * Add missing cateegorical functinoality to formulas * Much cooler handlong of missing categoricals --- CHANGELOG.rst | 1 + src/tabmat/categorical_matrix.py | 134 +++++++++++---- src/tabmat/constructor.py | 24 +++ src/tabmat/ext/cat_split_helpers-tmpl.cpp | 97 +++++++---- src/tabmat/ext/categorical.pyx | 70 ++++---- src/tabmat/ext/split.pyx | 50 ++++-- src/tabmat/formula.py | 43 ++++- tests/test_categorical_matrix.py | 188 ++++++++++++++++++---- tests/test_formula.py | 110 +++++++++++++ tests/test_split_matrix.py | 103 +++++++++--- 10 files changed, 657 insertions(+), 163 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index b3750c98..b948d334 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -14,6 +14,7 @@ Unreleased - Add column name and term name metadata to ``MatrixBase`` objects. These are automatically populated when initializing a ``MatrixBase`` from a ``pandas.DataFrame``. In addition, they can be accessed and modified via the ``column_names`` and ``term_names`` properties. - Add a formula interface for creating tabmat matrices from pandas data frames. See :func:`tabmat.from_formula` for details. +- Add support for missing values in ``CategoricalMatrix`` by either creating a separate category for them or treating them as all-zero rows. **Other changes:** diff --git a/src/tabmat/categorical_matrix.py b/src/tabmat/categorical_matrix.py index 1180646c..e981bb2e 100644 --- a/src/tabmat/categorical_matrix.py +++ b/src/tabmat/categorical_matrix.py @@ -169,14 +169,14 @@ def matvec(mat, vec): from scipy import sparse as sps from .ext.categorical import ( - matvec, - matvec_drop_first, - multiply_drop_first, - sandwich_categorical, - sandwich_categorical_drop_first, - subset_categorical_drop_first, - transpose_matvec, - transpose_matvec_drop_first, + matvec_complex, + matvec_fast, + multiply_complex, + sandwich_categorical_complex, + sandwich_categorical_fast, + subset_categorical_complex, + transpose_matvec_complex, + transpose_matvec_fast, ) from .ext.split import sandwich_cat_cat, sandwich_cat_dense from .matrix_base import MatrixBase @@ -237,6 +237,17 @@ class CategoricalMatrix(MatrixBase): drop the first level of the dummy encoding. This allows a CategoricalMatrix to be used in an unregularized setting. + cat_missing_method: str {'fail'|'zero'|'convert'}, default 'fail' + - if 'fail', raise an error if there are missing values. + - if 'zero', missing values will represent all-zero indicator columns. + - if 'convert', missing values will be converted to the ``cat_missing_name`` + category. + + cat_missing_name: str, default '(MISSING)' + Name of the category to which missing values will be converted if + ``cat_missing_method='convert'``. If this category already exists, an error + will be raised. + dtype: data type """ @@ -249,15 +260,46 @@ def __init__( column_name: Optional[str] = None, term_name: Optional[str] = None, column_name_format: str = "{name}[{category}]", + cat_missing_method: str = "fail", + cat_missing_name: str = "(MISSING)", ): - if pd.isnull(cat_vec).any(): - raise ValueError("Categorical data can't have missing values.") + if cat_missing_method not in ["fail", "zero", "convert"]: + raise ValueError( + "cat_missing_method must be one of 'fail' 'zero' or 'convert', " + f" got {cat_missing_method}" + ) + self._missing_method = cat_missing_method + self._missing_category = cat_missing_name if isinstance(cat_vec, pd.Categorical): self.cat = cat_vec else: self.cat = pd.Categorical(cat_vec) + if pd.isnull(self.cat).any(): + if self._missing_method == "fail": + raise ValueError( + "Categorical data can't have missing values " + "if cat_missing_method='fail'." + ) + + elif self._missing_method == "convert": + if self._missing_category in self.cat.categories: + raise ValueError( + f"Missing category {self._missing_category} already exists." + ) + + self.cat = self.cat.add_categories([self._missing_category]) + + self.cat[pd.isnull(self.cat)] = self._missing_category + self._has_missings = False + + else: + self._has_missings = True + + else: + self._has_missings = False + self.drop_first = drop_first self.shape = (len(self.cat), len(self.cat.categories) - int(drop_first)) self.indices = self.cat.codes.astype(np.int32) @@ -279,7 +321,20 @@ def recover_orig(self) -> np.ndarray: Test: matrix/test_categorical_matrix::test_recover_orig """ - return self.cat.categories[self.cat.codes] + orig = self.cat.categories[self.cat.codes].to_numpy() + + if self._has_missings: + orig = orig.view(np.ma.MaskedArray) + orig.mask = self.cat.codes == -1 + elif ( + self._missing_method == "convert" + and self._missing_category in self.cat.categories + ): + orig = orig.view(np.ma.MaskedArray) + missing_code = self.cat.categories.get_loc(self._missing_category) + orig.mask = self.cat.codes == missing_code + + return orig def _matvec_setup( self, @@ -335,12 +390,18 @@ def matvec( if out is None: out = np.zeros(self.shape[0], dtype=other_m.dtype) - if self.drop_first: - matvec_drop_first( - self.indices, other_m, self.shape[0], cols, self.shape[1], out + if self.drop_first or self._has_missings: + matvec_complex( + self.indices, + other_m, + self.shape[0], + cols, + self.shape[1], + out, + self.drop_first, ) else: - matvec(self.indices, other_m, self.shape[0], cols, self.shape[1], out) + matvec_fast(self.indices, other_m, self.shape[0], cols, self.shape[1], out) if is_int: return out.astype(int) @@ -402,12 +463,19 @@ def transpose_matvec( if cols is not None: cols = set_up_rows_or_cols(cols, self.shape[1]) - if self.drop_first: - transpose_matvec_drop_first( - self.indices, vec, self.shape[1], vec.dtype, rows, cols, out + if self.drop_first or self._has_missings: + transpose_matvec_complex( + self.indices, + vec, + self.shape[1], + vec.dtype, + rows, + cols, + out, + self.drop_first, ) else: - transpose_matvec( + transpose_matvec_fast( self.indices, vec, self.shape[1], vec.dtype, rows, cols, out ) @@ -438,12 +506,12 @@ def sandwich( """ d = np.asarray(d) rows = set_up_rows_or_cols(rows, self.shape[0]) - if self.drop_first: - res_diag = sandwich_categorical_drop_first( - self.indices, d, rows, d.dtype, self.shape[1] + if self.drop_first or self._has_missings: + res_diag = sandwich_categorical_complex( + self.indices, d, rows, d.dtype, self.shape[1], self.drop_first ) else: - res_diag = sandwich_categorical( + res_diag = sandwich_categorical_fast( self.indices, d, rows, d.dtype, self.shape[1] ) @@ -490,9 +558,9 @@ def getcol(self, i: int) -> SparseMatrix: def tocsr(self) -> sps.csr_matrix: """Return scipy csr representation of matrix.""" - if self.drop_first: - nnz, indices, indptr = subset_categorical_drop_first( - self.indices, self.shape[1] + if self.drop_first or self._has_missings: + nnz, indices, indptr = subset_categorical_complex( + self.indices, self.shape[1], self.drop_first ) return sps.csr_matrix( (np.ones(nnz, dtype=int), indices, indptr), shape=self.shape @@ -549,6 +617,7 @@ def __getitem__(self, item): dtype=self.dtype, column_name=self._colname, column_name_format=self._colname_format, + cat_missing_method=self._missing_method, ) else: # return a SparseMatrix if we subset columns @@ -576,15 +645,17 @@ def _cross_dense( res = sandwich_cat_dense( self.indices, - self.shape[1] + self.drop_first, + self.shape[1], d, other, rows, R_cols, is_c_contiguous, + has_missings=self._has_missings, + drop_first=self.drop_first, ) - res = _row_col_indexing(res[self.drop_first :], L_cols, None) + res = _row_col_indexing(res, L_cols, None) return res def _cross_categorical( @@ -612,6 +683,8 @@ def _cross_categorical( d.dtype, self.drop_first, other.drop_first, + self._has_missings, + other._has_missings, ) res = _row_col_indexing(res, L_cols, R_cols) @@ -642,14 +715,15 @@ def multiply(self, other) -> SparseMatrix: f"Shapes do not match. Expected length of {self.shape[0]}. Got {len(other)}." ) - if self.drop_first: + if self.drop_first or self._has_missings: return SparseMatrix( sps.csr_matrix( - multiply_drop_first( + multiply_complex( indices=self.indices, d=np.squeeze(other), ncols=self.shape[1], dtype=other.dtype, + drop_first=self.drop_first, ), shape=self.shape, ) diff --git a/src/tabmat/constructor.py b/src/tabmat/constructor.py index 3426a76b..9849cbde 100644 --- a/src/tabmat/constructor.py +++ b/src/tabmat/constructor.py @@ -29,6 +29,8 @@ def from_pandas( cat_position: str = "expand", drop_first: bool = False, categorical_format: str = "{name}[{category}]", + cat_missing_method: str = "fail", + cat_missing_name: str = "(MISSING)", ) -> MatrixBase: """ Transform a pandas.DataFrame into an efficient SplitMatrix. For most users, this @@ -58,6 +60,14 @@ def from_pandas( If true, categoricals variables will have their first category dropped. This allows multiple categorical variables to be included in an unregularized model. If False, all categories are included. + cat_missing_method: str {'fail'|'zero'|'convert'}, default 'fail' + How to handle missing values in categorical columns: + - if 'fail', raise an error if there are missing values + - if 'zero', missing values will represent all-zero indicator columns. + - if 'convert', missing values will be converted to the '(MISSING)' category. + cat_missing_name: str, default '(MISSING)' + Name of the category to which missing values will be converted if + ``cat_missing_method='convert'``. Returns ------- @@ -87,6 +97,8 @@ def from_pandas( column_name=colname, term_name=colname, column_name_format=categorical_format, + cat_missing_method=cat_missing_method, + cat_missing_name=cat_missing_name, ) if len(coldata.cat.categories) < cat_threshold: ( @@ -207,6 +219,8 @@ def from_formula( cat_threshold: int = 4, interaction_separator: str = ":", categorical_format: str = "{name}[{category}]", + cat_missing_method: str = "fail", + cat_missing_name: str = "(MISSING)", intercept_name: str = "Intercept", include_intercept: bool = False, add_column_for_intercept: bool = True, @@ -237,6 +251,14 @@ def from_formula( categorical_format: str, default "{name}[T.{category}]" The format string used to generate the names of categorical variables. Has to include the placeholders ``{name}`` and ``{category}``. + cat_missing_method: str {'fail'|'zero'|'convert'}, default 'fail' + How to handle missing values in categorical columns: + - if 'fail', raise an error if there are missing values + - if 'zero', missing values will represent all-zero indicator columns. + - if 'convert', missing values will be converted to the '(MISSING)' category. + cat_missing_name: str, default '(MISSING)' + Name of the category to which missing values will be converted if + ``cat_missing_method='convert'``. intercept_name: str, default "Intercept" The name of the intercept column. include_intercept: bool, default False @@ -274,6 +296,8 @@ def from_formula( sparse_threshold=sparse_threshold, cat_threshold=cat_threshold, add_column_for_intercept=add_column_for_intercept, + cat_missing_method=cat_missing_method, + cat_missing_name=cat_missing_name, ) result = materializer.get_model_matrix(spec) diff --git a/src/tabmat/ext/cat_split_helpers-tmpl.cpp b/src/tabmat/ext/cat_split_helpers-tmpl.cpp index c40f851c..5cd7ef5a 100644 --- a/src/tabmat/ext/cat_split_helpers-tmpl.cpp +++ b/src/tabmat/ext/cat_split_helpers-tmpl.cpp @@ -1,23 +1,26 @@ #include -<%def name="transpose_matvec(dropfirst)"> +<%def name="transpose_matvec(type)"> template -void _transpose_matvec_${dropfirst}( +void _transpose_matvec_${type}( Int n_rows, Int* indices, F* other, F* res, Int res_size + % if type == 'all_rows_complex': + , bool drop_first + % endif ) { #pragma omp parallel { std::vector restemp(res_size, 0.0); #pragma omp for for (Py_ssize_t i = 0; i < n_rows; i++) { - % if dropfirst == 'all_rows_drop_first': - Py_ssize_t col_idx = indices[i] - 1; - if (col_idx != -1) { + % if type == 'all_rows_complex': + Py_ssize_t col_idx = indices[i] - drop_first; + if (col_idx >= 0) { restemp[col_idx] += other[i]; } % else: @@ -33,8 +36,9 @@ void _transpose_matvec_${dropfirst}( +<%def name="sandwich_cat_cat(type)"> template -void _sandwich_cat_cat( +void _sandwich_cat_cat_${type}( F* d, const Int* i_indices, const Int* j_indices, @@ -42,9 +46,11 @@ void _sandwich_cat_cat( Int len_rows, F* res, Int res_n_col, - Int res_size, - bool i_drop_first, - bool j_drop_first + Int res_size + % if type == 'complex': + , bool i_drop_first + , bool j_drop_first + % endif ) { #pragma omp parallel @@ -53,14 +59,25 @@ void _sandwich_cat_cat( # pragma omp for for (Py_ssize_t k_idx = 0; k_idx < len_rows; k_idx++) { Int k = rows[k_idx]; - Int i = i_indices[k] - i_drop_first; - if (i == -1) { - continue; - } - Int j = j_indices[k] - j_drop_first; - if (j == -1) { - continue; - } + + % if type == 'complex': + Int i = i_indices[k] - i_drop_first; + if (i < 0) { + continue; + } + % else: + Int i = i_indices[k]; + % endif + + % if type == 'complex': + Int j = j_indices[k] - j_drop_first; + if (j < 0) { + continue; + } + % else: + Int j = j_indices[k]; + % endif + restemp[(Py_ssize_t) i * res_n_col + j] += d[k]; } for (Py_ssize_t i = 0; i < res_size; i++) { @@ -69,11 +86,12 @@ void _sandwich_cat_cat( } } } + -<%def name="sandwich_cat_dense_tmpl(order)"> +<%def name="sandwich_cat_dense_tmpl(order, type)"> template -void _sandwich_cat_dense${order}( +void _sandwich_cat_dense${order}_${type}( F* d, const Int* indices, Int* rows, @@ -85,6 +103,9 @@ void _sandwich_cat_dense${order}( F* mat_j, Int mat_j_nrow, Int mat_j_ncol + % if type == 'complex': + , bool drop_first + % endif ) { #pragma omp parallel @@ -93,20 +114,28 @@ void _sandwich_cat_dense${order}( #pragma omp for for (Py_ssize_t k_idx = 0; k_idx < len_rows; k_idx++) { Py_ssize_t k = rows[k_idx]; - Py_ssize_t i = indices[k]; // MAYBE TODO: explore whether the column restriction slows things down a // lot, particularly if not restricting the columns allows using SIMD // instructions // MAYBE TODO: explore whether swapping the loop order for F-ordered mat_j // is useful. - for (Py_ssize_t j_idx = 0; j_idx < len_j_cols; j_idx++) { - Py_ssize_t j = j_cols[j_idx]; - % if order == 'C': - restemp[i * len_j_cols + j_idx] += d[k] * mat_j[k * mat_j_ncol + j]; - % else: - restemp[i * len_j_cols + j_idx] += d[k] * mat_j[j * mat_j_nrow + k]; - % endif - } + % if type == 'complex': + Py_ssize_t i = indices[k] - drop_first; + if (i >= 0) { + % else: + Py_ssize_t i = indices[k]; + % endif + for (Py_ssize_t j_idx = 0; j_idx < len_j_cols; j_idx++) { + Py_ssize_t j = j_cols[j_idx]; + % if order == 'C': + restemp[i * len_j_cols + j_idx] += d[k] * mat_j[k * mat_j_ncol + j]; + % else: + restemp[i * len_j_cols + j_idx] += d[k] * mat_j[j * mat_j_nrow + k]; + % endif + } + % if type == 'complex': + } + % endif } for (Py_ssize_t i = 0; i < res_size; i++) { #pragma omp atomic @@ -116,7 +145,11 @@ void _sandwich_cat_dense${order}( } -${sandwich_cat_dense_tmpl('C')} -${sandwich_cat_dense_tmpl('F')} -${transpose_matvec('all_rows')} -${transpose_matvec('all_rows_drop_first')} +${sandwich_cat_dense_tmpl('C', 'fast')} +${sandwich_cat_dense_tmpl('F', 'fast')} +${sandwich_cat_dense_tmpl('C', 'complex')} +${sandwich_cat_dense_tmpl('F', 'complex')} +${sandwich_cat_cat('fast')} +${sandwich_cat_cat('complex')} +${transpose_matvec('all_rows_fast')} +${transpose_matvec('all_rows_complex')} diff --git a/src/tabmat/ext/categorical.pyx b/src/tabmat/ext/categorical.pyx index 59ea308a..0c90b2c7 100644 --- a/src/tabmat/ext/categorical.pyx +++ b/src/tabmat/ext/categorical.pyx @@ -11,11 +11,11 @@ from libcpp cimport bool cdef extern from "cat_split_helpers.cpp": - void _transpose_matvec_all_rows[Int, F](Int, Int*, F*, F*, Int) - void _transpose_matvec_all_rows_drop_first[Int, F](Int, Int*, F*, F*, Int) + void _transpose_matvec_all_rows_fast[Int, F](Int, Int*, F*, F*, Int) + void _transpose_matvec_all_rows_complex[Int, F](Int, Int*, F*, F*, Int, bool) -def transpose_matvec( +def transpose_matvec_fast( int[:] indices, floating[:] other, int n_cols, @@ -34,7 +34,7 @@ def transpose_matvec( # Case 1: No row or col restrictions if no_row_restrictions and no_col_restrictions: - _transpose_matvec_all_rows(n_rows, &indices[0], &other[0], &out[0], out_size) + _transpose_matvec_all_rows_fast(n_rows, &indices[0], &other[0], &out[0], out_size) # Case 2: row restrictions but no col restrictions elif no_col_restrictions: rows_view = rows @@ -62,14 +62,15 @@ def transpose_matvec( out[col] += other[row] -def transpose_matvec_drop_first( +def transpose_matvec_complex( int[:] indices, floating[:] other, int n_cols, dtype, rows, cols, - floating[:] out + floating[:] out, + bint drop_first ): cdef int row, row_idx, n_keep_rows, col_idx cdef int n_rows = len(indices) @@ -81,15 +82,15 @@ def transpose_matvec_drop_first( # Case 1: No row or col restrictions if no_row_restrictions and no_col_restrictions: - _transpose_matvec_all_rows_drop_first(n_rows, &indices[0], &other[0], &out[0], out_size) + _transpose_matvec_all_rows_complex(n_rows, &indices[0], &other[0], &out[0], out_size, drop_first) # Case 2: row restrictions but no col restrictions elif no_col_restrictions: rows_view = rows n_keep_rows = len(rows_view) for row_idx in range(n_keep_rows): row = rows_view[row_idx] - col_idx = indices[row] - 1 - if col_idx != -1: + col_idx = indices[row] - drop_first + if col_idx >= 0: out[col_idx] += other[row] # Cases 3 and 4: col restrictions else: @@ -97,8 +98,8 @@ def transpose_matvec_drop_first( # Case 3: Col restrictions but no row restrictions if no_row_restrictions: for row_idx in range(n_rows): - col_idx = indices[row_idx] - 1 - if (col_idx != -1) and (cols_included[col_idx]): + col_idx = indices[row_idx] - drop_first + if (col_idx >= 0) and (cols_included[col_idx]): out[col_idx] += other[row_idx] # Case 4: Both col restrictions and row restrictions else: @@ -106,8 +107,8 @@ def transpose_matvec_drop_first( n_keep_rows = len(rows_view) for row_idx in range(n_keep_rows): row = rows_view[row_idx] - col_idx = indices[row] - 1 - if (col_idx != -1) and (cols_included[col_idx]): + col_idx = indices[row] - drop_first + if (col_idx >= 0) and (cols_included[col_idx]): out[col_idx] += other[row] @@ -119,7 +120,7 @@ def get_col_included(int[:] cols, int n_cols): return col_included -def matvec( +def matvec_fast( const int[:] indices, floating[:] other, int n_rows, @@ -145,13 +146,14 @@ def matvec( return -def matvec_drop_first( +def matvec_complex( const int[:] indices, floating[:] other, int n_rows, int[:] cols, int n_cols, - floating[:] out_vec + floating[:] out_vec, + bint drop_first ): """See `matvec`. Here we drop the first category of the CategoricalMatrix so the indices refer to the column index + 1. @@ -161,19 +163,19 @@ def matvec_drop_first( if cols is None: for i in prange(n_rows, nogil=True): - col_idx = indices[i] - 1 # reference category is always 0. - if col_idx != -1: + col_idx = indices[i] - drop_first # reference category is always 0. + if col_idx >= 0: out_vec[i] += other[col_idx] else: col_included = get_col_included(cols, n_cols) for i in prange(n_rows, nogil=True): - col_idx = indices[i] - 1 - if (col_idx != -1) and (col_included[col_idx] == 1): + col_idx = indices[i] - drop_first + if (col_idx >= 0) and (col_included[col_idx] == 1): out_vec[i] += other[col_idx] return -def sandwich_categorical( +def sandwich_categorical_fast( const int[:] indices, floating[:] d, int[:] rows, @@ -191,12 +193,13 @@ def sandwich_categorical( return np.asarray(res) -def sandwich_categorical_drop_first( +def sandwich_categorical_complex( const int[:] indices, floating[:] d, int[:] rows, dtype, - int n_cols + int n_cols, + bint drop_first ): cdef floating[:] res = np.zeros(n_cols, dtype=dtype) cdef int col_idx, k, k_idx @@ -204,26 +207,24 @@ def sandwich_categorical_drop_first( for k_idx in range(n_rows): k = rows[k_idx] - col_idx = indices[k] - 1 # reference category is always 0. - if col_idx != -1: + col_idx = indices[k] - drop_first # reference category is always 0. + if col_idx >= 0: res[col_idx] += d[k] return np.asarray(res) -def multiply_drop_first( +def multiply_complex( int[:] indices, numeric[:] d, int ncols, dtype, + bint drop_first, ): """Multiply a CategoricalMatrix by a vector d. The output cannot be a CategoricalMatrix anymore. Here we return the inputs to transform to a csr_matrix. - Note that *_drop_first function assume the CategoricalMatrix - has its first category dropped. - Parameters ---------- indices: @@ -255,9 +256,9 @@ def multiply_drop_first( for i in range(nrows): vnew_indptr[i] = nonref_cnt - if indices[i] != 0: + if indices[i] >= drop_first: vnew_data[nonref_cnt] = d[i] - vnew_indices[nonref_cnt] = indices[i] - 1 + vnew_indices[nonref_cnt] = indices[i] - drop_first nonref_cnt += 1 vnew_indptr[i+1] = nonref_cnt @@ -265,9 +266,10 @@ def multiply_drop_first( return new_data[:nonref_cnt], new_indices[:nonref_cnt], new_indptr -def subset_categorical_drop_first( +def subset_categorical_complex( int[:] indices, int ncols, + bint drop_first ): """Construct the inputs to transform a CategoricalMatrix into a csr_matrix. @@ -299,8 +301,8 @@ def subset_categorical_drop_first( for i in range(nrows): vnew_indptr[i] = nonzero_cnt - if indices[i] != 0: - vnew_indices[nonzero_cnt] = indices[i] - 1 + if indices[i] >= drop_first: + vnew_indices[nonzero_cnt] = indices[i] - drop_first nonzero_cnt += 1 vnew_indptr[i+1] = nonzero_cnt diff --git a/src/tabmat/ext/split.pyx b/src/tabmat/ext/split.pyx index 11f5d3c0..21d2f52e 100644 --- a/src/tabmat/ext/split.pyx +++ b/src/tabmat/ext/split.pyx @@ -21,9 +21,12 @@ ctypedef fused win_integral: cdef extern from "cat_split_helpers.cpp": - void _sandwich_cat_denseC[Int, F](F*, Int*, Int*, Int, Int*, Int, F*, Int, F*, Int, Int) nogil - void _sandwich_cat_denseF[Int, F](F*, Int*, Int*, Int, Int*, Int, F*, Int, F*, Int, Int) nogil - void _sandwich_cat_cat[Int, F](F*, const Int*, const Int*, Int*, Int, F*, Int, Int, bool, bool) + void _sandwich_cat_denseC_fast[Int, F](F*, Int*, Int*, Int, Int*, Int, F*, Int, F*, Int, Int) nogil + void _sandwich_cat_denseF_fast[Int, F](F*, Int*, Int*, Int, Int*, Int, F*, Int, F*, Int, Int) nogil + void _sandwich_cat_denseC_complex[Int, F](F*, Int*, Int*, Int, Int*, Int, F*, Int, F*, Int, Int, bool) nogil + void _sandwich_cat_denseF_complex[Int, F](F*, Int*, Int*, Int, Int*, Int, F*, Int, F*, Int, Int, bool) nogil + void _sandwich_cat_cat_fast[Int, F](F*, const Int*, const Int*, Int*, Int, F*, Int, Int) + void _sandwich_cat_cat_complex[Int, F](F*, const Int*, const Int*, Int*, Int, F*, Int, Int, bool, bool) def sandwich_cat_dense( @@ -33,7 +36,9 @@ def sandwich_cat_dense( np.ndarray mat_j, int[:] rows, int[:] j_cols, - bool is_c_contiguous + bool is_c_contiguous, + bool has_missings, + bint drop_first ): cdef int nj_rows = mat_j.shape[0] cdef int nj_cols = mat_j.shape[1] @@ -53,14 +58,24 @@ def sandwich_cat_dense( cdef floating* mat_j_p = mat_j.data - if is_c_contiguous: - _sandwich_cat_denseC(d_p, i_indices_p, rows_p, n_active_rows, j_cols_p, - nj_active_cols, &res[0, 0], res_size, mat_j_p, - nj_rows, nj_cols) + if (not drop_first) and (not has_missings): + if is_c_contiguous: + _sandwich_cat_denseC_fast(d_p, i_indices_p, rows_p, n_active_rows, j_cols_p, + nj_active_cols, &res[0, 0], res_size, mat_j_p, + nj_rows, nj_cols) + else: + _sandwich_cat_denseF_fast(d_p, i_indices_p, rows_p, n_active_rows, j_cols_p, + nj_active_cols, &res[0, 0], res_size, mat_j_p, + nj_rows, nj_cols) else: - _sandwich_cat_denseF(d_p, i_indices_p, rows_p, n_active_rows, j_cols_p, - nj_active_cols, &res[0, 0], res_size, mat_j_p, - nj_rows, nj_cols) + if is_c_contiguous: + _sandwich_cat_denseC_complex(d_p, i_indices_p, rows_p, n_active_rows, j_cols_p, + nj_active_cols, &res[0, 0], res_size, mat_j_p, + nj_rows, nj_cols, drop_first) + else: + _sandwich_cat_denseF_complex(d_p, i_indices_p, rows_p, n_active_rows, j_cols_p, + nj_active_cols, &res[0, 0], res_size, mat_j_p, + nj_rows, nj_cols, drop_first) return np.asarray(res) @@ -74,7 +89,9 @@ def sandwich_cat_cat( int[:] rows, dtype, bint i_drop_first, - bint j_drop_first + bint j_drop_first, + bool i_has_missings, + bool j_has_missings ): """ (X1.T @ diag(d) @ X2)[i, j] = sum_k X1[k, i] d[k] X2[k, j] @@ -83,8 +100,13 @@ def sandwich_cat_cat( cdef int n_rows = len(rows) cdef int res_size = res.size - _sandwich_cat_cat(&d[0], &i_indices[0], &j_indices[0], &rows[0], n_rows, - &res[0, 0], j_ncol, res_size, i_drop_first, j_drop_first) + if i_drop_first or j_drop_first or i_has_missings or j_has_missings: + _sandwich_cat_cat_complex(&d[0], &i_indices[0], &j_indices[0], &rows[0], + n_rows, &res[0, 0], j_ncol, res_size, + i_drop_first, j_drop_first) + else: + _sandwich_cat_cat_fast(&d[0], &i_indices[0], &j_indices[0], &rows[0], + n_rows, &res[0, 0], j_ncol, res_size) return np.asarray(res) diff --git a/src/tabmat/formula.py b/src/tabmat/formula.py index b97d5617..61335af3 100644 --- a/src/tabmat/formula.py +++ b/src/tabmat/formula.py @@ -44,6 +44,8 @@ def _init(self): self.add_column_for_intercept = self.params.get( "add_column_for_intercept", True ) + self.cat_missing_method = self.params.get("cat_missing_method", "fail") + self.cat_missing_name = self.params.get("cat_missing_name", "(MISSING)") # We can override formulaic's C() function here self.context["C"] = _C @@ -100,6 +102,8 @@ def _encode_categorical( return encode_contrasts( values, reduced_rank=reduced_rank, + missing_method=self.cat_missing_method, + missing_name=self.cat_missing_name, _metadata=metadata, _state=encoder_state, _spec=spec, @@ -115,7 +119,11 @@ def _combine_columns(self, cols, spec, drop_rows): # Otherwise, concatenate columns into SplitMatrix return SplitMatrix( [ - col[1].to_tabmat(self.dtype, self.sparse_threshold, self.cat_threshold) + col[1].to_tabmat( + self.dtype, + self.sparse_threshold, + self.cat_threshold, + ) for col in cols ] ) @@ -145,7 +153,7 @@ def _build_model_matrix(self, spec: ModelSpec, drop_rows): if not self.add_column_for_intercept: continue scoped_cols[ - "Intercept" + self.intercept_name ] = scoped_term.scale * self._encode_constant( 1, None, {}, spec, drop_rows ) @@ -412,15 +420,31 @@ def __init__( @classmethod def from_categorical( - cls, cat: pandas.Categorical, reduced_rank: bool + cls, + cat: pandas.Categorical, + reduced_rank: bool, + missing_method: str = "fail", + missing_name: str = "(MISSING)", ) -> "_InteractableCategoricalVector": """Create an interactable categorical vector from a pandas categorical.""" categories = list(cat.categories) codes = cat.codes.copy().astype(numpy.int64) + if reduced_rank: codes[codes == 0] = -2 codes[codes > 0] -= 1 categories = categories[1:] + + if missing_method == "fail" and -1 in codes: + raise ValueError( + "Categorical data can't have missing values " + "if [cat_]missing_method='fail'." + ) + + if missing_method == "convert" and -1 in codes: + codes[codes == -1] = len(categories) + categories.append(missing_name) + return cls( codes=codes, categories=categories, @@ -464,6 +488,7 @@ def to_tabmat( dtype=dtype, column_name=self.name, column_name_format="{category}", + cat_missing_method="zero", # missing values are already handled ) if (self.codes == -2).all(): @@ -631,6 +656,8 @@ def _C( data, *, levels: Optional[Iterable[str]] = None, + missing_method: str = "fail", + missing_name: str = "(MISSING)", spans_intercept: bool = True, ): """ @@ -654,6 +681,8 @@ def encoder( values, levels=levels, reduced_rank=reduced_rank, + missing_method=missing_method, + missing_name=missing_name, _state=encoder_state, _spec=model_spec, ) @@ -671,6 +700,8 @@ def encode_contrasts( data, *, levels: Optional[Iterable[str]] = None, + missing_method: str = "fail", + missing_name: str = "(MISSING)", reduced_rank: bool = False, _state=None, _spec=None, @@ -689,8 +720,12 @@ def encode_contrasts( levels = levels if levels is not None else _state.get("categories") cat = pandas.Categorical(data._values, categories=levels) _state["categories"] = cat.categories + return _InteractableCategoricalVector.from_categorical( - cat, reduced_rank=reduced_rank + cat, + reduced_rank=reduced_rank, + missing_method=missing_method, + missing_name=missing_name, ) diff --git a/tests/test_categorical_matrix.py b/tests/test_categorical_matrix.py index f1be385a..9e7cb7a9 100644 --- a/tests/test_categorical_matrix.py +++ b/tests/test_categorical_matrix.py @@ -1,3 +1,5 @@ +import re + import numpy as np import pandas as pd import pytest @@ -6,55 +8,146 @@ @pytest.fixture -def cat_vec(): +def cat_vec(missing): m = 10 seed = 0 rng = np.random.default_rng(seed) - return rng.choice([0, 1, 2, np.inf, -np.inf], size=m) + vec = rng.choice([0, 1, 2, np.inf, -np.inf], size=m) + if missing: + vec[vec == 1] = np.nan + return vec @pytest.mark.parametrize("vec_dtype", [np.float64, np.float32, np.int64, np.int32]) -@pytest.mark.parametrize("drop_first", [True, False]) -def test_recover_orig(cat_vec, vec_dtype, drop_first): - orig_recovered = CategoricalMatrix(cat_vec, drop_first=drop_first).recover_orig() +@pytest.mark.parametrize("drop_first", [True, False], ids=["drop_first", "no_drop"]) +@pytest.mark.parametrize("missing", [True, False], ids=["missing", "no_missing"]) +@pytest.mark.parametrize("cat_missing_method", ["fail", "zero", "convert"]) +def test_recover_orig(cat_vec, vec_dtype, drop_first, missing, cat_missing_method): + if missing and cat_missing_method == "fail": + with pytest.raises( + ValueError, match="Categorical data can't have missing values" + ): + CategoricalMatrix( + cat_vec, drop_first=drop_first, cat_missing_method=cat_missing_method + ) + return + + orig_recovered = CategoricalMatrix( + cat_vec, drop_first=drop_first, cat_missing_method=cat_missing_method + ).recover_orig() np.testing.assert_equal(orig_recovered, cat_vec) @pytest.mark.parametrize("vec_dtype", [np.float64, np.float32, np.int64, np.int32]) -@pytest.mark.parametrize("drop_first", [True, False]) -def test_csr_matvec_categorical(cat_vec, vec_dtype, drop_first): - mat = pd.get_dummies(cat_vec, drop_first=drop_first, dtype="uint8") - cat_mat = CategoricalMatrix(cat_vec, drop_first=drop_first) +@pytest.mark.parametrize("drop_first", [True, False], ids=["drop_first", "no_drop"]) +@pytest.mark.parametrize("missing", [True, False], ids=["missing", "no_missing"]) +@pytest.mark.parametrize("cat_missing_method", ["fail", "zero", "convert"]) +def test_csr_matvec_categorical( + cat_vec, vec_dtype, drop_first, missing, cat_missing_method +): + if missing and cat_missing_method == "fail": + with pytest.raises( + ValueError, match="Categorical data can't have missing values" + ): + CategoricalMatrix( + cat_vec, drop_first=drop_first, cat_missing_method=cat_missing_method + ) + return + + mat = pd.get_dummies( + cat_vec, + drop_first=drop_first, + dtype="uint8", + dummy_na=(cat_missing_method == "convert" and missing), + ) + cat_mat = CategoricalMatrix( + cat_vec, drop_first=drop_first, cat_missing_method=cat_missing_method + ) vec = np.random.choice(np.arange(4, dtype=vec_dtype), mat.shape[1]) res = cat_mat.matvec(vec) np.testing.assert_allclose(res, cat_mat.A.dot(vec)) -@pytest.mark.parametrize("drop_first", [True, False]) -def test_tocsr(cat_vec, drop_first): - cat_mat = CategoricalMatrix(cat_vec, drop_first=drop_first) +@pytest.mark.parametrize("drop_first", [True, False], ids=["drop_first", "no_drop"]) +@pytest.mark.parametrize("missing", [True, False], ids=["missing", "no_missing"]) +@pytest.mark.parametrize("cat_missing_method", ["fail", "zero", "convert"]) +def test_tocsr(cat_vec, drop_first, missing, cat_missing_method): + if missing and cat_missing_method == "fail": + with pytest.raises( + ValueError, match="Categorical data can't have missing values" + ): + CategoricalMatrix( + cat_vec, drop_first=drop_first, cat_missing_method=cat_missing_method + ) + return + + cat_mat = CategoricalMatrix( + cat_vec, drop_first=drop_first, cat_missing_method=cat_missing_method + ) res = cat_mat.tocsr().A - expected = pd.get_dummies(cat_vec, drop_first=drop_first, dtype="uint8") + expected = pd.get_dummies( + cat_vec, + drop_first=drop_first, + dtype="uint8", + dummy_na=cat_missing_method == "convert" and missing, + ) np.testing.assert_allclose(res, expected) -@pytest.mark.parametrize("drop_first", [True, False]) -def test_transpose_matvec(cat_vec, drop_first): - cat_mat = CategoricalMatrix(cat_vec, drop_first=drop_first) +@pytest.mark.parametrize("drop_first", [True, False], ids=["drop_first", "no_drop"]) +@pytest.mark.parametrize("missing", [True, False], ids=["missing", "no_missing"]) +@pytest.mark.parametrize("cat_missing_method", ["fail", "zero", "convert"]) +def test_transpose_matvec(cat_vec, drop_first, missing, cat_missing_method): + if missing and cat_missing_method == "fail": + with pytest.raises( + ValueError, match="Categorical data can't have missing values" + ): + CategoricalMatrix( + cat_vec, drop_first=drop_first, cat_missing_method=cat_missing_method + ) + return + + cat_mat = CategoricalMatrix( + cat_vec, drop_first=drop_first, cat_missing_method=cat_missing_method + ) other = np.random.random(cat_mat.shape[0]) res = cat_mat.transpose_matvec(other) - expected = pd.get_dummies(cat_vec, drop_first=drop_first, dtype="uint8").T.dot( - other - ) + expected = pd.get_dummies( + cat_vec, + drop_first=drop_first, + dtype="uint8", + dummy_na=cat_missing_method == "convert" and missing, + ).T.dot(other) np.testing.assert_allclose(res, expected) -@pytest.mark.parametrize("drop_first", [True, False]) -def test_multiply(cat_vec, drop_first): - cat_mat = CategoricalMatrix(cat_vec, drop_first=drop_first) +@pytest.mark.parametrize("drop_first", [True, False], ids=["drop_first", "no_drop"]) +@pytest.mark.parametrize("missing", [True, False], ids=["missing", "no_missing"]) +@pytest.mark.parametrize("cat_missing_method", ["fail", "zero", "convert"]) +def test_multiply(cat_vec, drop_first, missing, cat_missing_method): + if missing and cat_missing_method == "fail": + with pytest.raises( + ValueError, match="Categorical data can't have missing values" + ): + CategoricalMatrix( + cat_vec, drop_first=drop_first, cat_missing_method=cat_missing_method + ) + return + + cat_mat = CategoricalMatrix( + cat_vec, drop_first=drop_first, cat_missing_method=cat_missing_method + ) other = np.arange(len(cat_vec))[:, None] actual = cat_mat.multiply(other) - expected = pd.get_dummies(cat_vec, drop_first=drop_first, dtype="uint8") * other + expected = ( + pd.get_dummies( + cat_vec, + drop_first=drop_first, + dtype="uint8", + dummy_na=cat_missing_method == "convert" and missing, + ) + * other + ) np.testing.assert_allclose(actual.A, expected) @@ -65,9 +158,48 @@ def test_nulls(mi_element): CategoricalMatrix(vec) -@pytest.mark.parametrize("drop_first", [True, False]) -def test_categorical_indexing(drop_first): - catvec = [0, 1, 2, 0, 1, 2, 0, 1, 2, 3, 3] - mat = CategoricalMatrix(catvec, drop_first=drop_first) - expected = pd.get_dummies(catvec, drop_first=drop_first).to_numpy()[:, [0, 1]] +@pytest.mark.parametrize("cat_missing_name", ["(MISSING)", "__None__", "[NULL]"]) +def test_cat_missing_name(cat_missing_name): + vec = [None, "(MISSING)", "__None__", "a", "b"] + if cat_missing_name in vec: + with pytest.raises( + ValueError, + match=re.escape(f"Missing category {cat_missing_name} already exists."), + ): + CategoricalMatrix( + vec, cat_missing_method="convert", cat_missing_name=cat_missing_name + ) + else: + cat = CategoricalMatrix( + vec, cat_missing_method="convert", cat_missing_name=cat_missing_name + ) + assert set(cat.cat.categories) == set(vec) - {None} | {cat_missing_name} + + +@pytest.mark.parametrize("drop_first", [True, False], ids=["drop_first", "no_drop"]) +@pytest.mark.parametrize("missing", [True, False], ids=["missing", "no_missing"]) +@pytest.mark.parametrize("cat_missing_method", ["fail", "zero", "convert"]) +def test_categorical_indexing(drop_first, missing, cat_missing_method): + if not missing: + cat_vec = [0, 1, 2, 0, 1, 2, 0, 1, 2, 3, 3] + else: + cat_vec = [0, None, 2, 0, None, 2, 0, None, 2, 3, 3] + + if missing and cat_missing_method == "fail": + with pytest.raises( + ValueError, match="Categorical data can't have missing values" + ): + CategoricalMatrix( + cat_vec, drop_first=drop_first, cat_missing_method=cat_missing_method + ) + return + + mat = CategoricalMatrix( + cat_vec, drop_first=drop_first, cat_missing_method=cat_missing_method + ) + expected = pd.get_dummies( + cat_vec, + drop_first=drop_first, + dummy_na=cat_missing_method == "convert" and missing, + ).to_numpy()[:, [0, 1]] np.testing.assert_allclose(mat[:, [0, 1]].A, expected) diff --git a/tests/test_formula.py b/tests/test_formula.py index 88a55c71..65930106 100644 --- a/tests/test_formula.py +++ b/tests/test_formula.py @@ -630,6 +630,116 @@ def test_interactable_vectors(left, right, reverse): assert result_vec.name == right.name + ":" + left.name +@pytest.mark.parametrize("cat_missing_method", ["zero", "convert"]) +@pytest.mark.parametrize( + "cat_missing_name", + ["__missing__", "(MISSING)"], +) +def test_cat_missing_handling(cat_missing_method, cat_missing_name): + df = pd.DataFrame( + { + "cat_1": pd.Categorical(["a", "b", None, "b", "a"]), + } + ) + + mat_from_pandas = tm.from_pandas( + df, + cat_threshold=0, + cat_missing_method=cat_missing_method, + cat_missing_name=cat_missing_name, + ) + + mat_from_formula = tm.from_formula( + "cat_1 - 1", + df, + cat_threshold=0, + cat_missing_method=cat_missing_method, + cat_missing_name=cat_missing_name, + ) + + assert mat_from_pandas.column_names == mat_from_formula.column_names + assert mat_from_pandas.term_names == mat_from_formula.term_names + np.testing.assert_array_equal(mat_from_pandas.A, mat_from_formula.A) + + mat_from_formula_new = mat_from_formula.model_spec.get_model_matrix(df) + assert mat_from_pandas.column_names == mat_from_formula_new.column_names + assert mat_from_pandas.term_names == mat_from_formula_new.term_names + np.testing.assert_array_equal(mat_from_pandas.A, mat_from_formula_new.A) + + +def test_cat_missing_C(): + df = pd.DataFrame( + { + "cat_1": pd.Categorical(["a", "b", None, "b", "a"]), + "cat_2": pd.Categorical(["1", "2", None, "1", "2"]), + } + ) + formula = ( + "C(cat_1, missing_method='convert', missing_name='M') " + "+ C(cat_2, missing_method='zero')" + ) + expected_names = [ + "C(cat_1, missing_method='convert', missing_name='M')[a]", + "C(cat_1, missing_method='convert', missing_name='M')[b]", + "C(cat_1, missing_method='convert', missing_name='M')[M]", + "C(cat_2, missing_method='zero')[1]", + "C(cat_2, missing_method='zero')[2]", + ] + + result = tm.from_formula(formula, df) + + assert result.column_names == expected_names + assert result.model_spec.get_model_matrix(df).column_names == expected_names + + +@pytest.mark.parametrize( + "cat_missing_method", ["zero", "convert"], ids=["zero", "convert"] +) +def test_cat_missing_unseen(cat_missing_method): + df = pd.DataFrame( + { + "cat_1": pd.Categorical(["a", "b", None, "b", "a"]), + } + ) + df_unseen = pd.DataFrame( + { + "cat_1": pd.Categorical(["a", None]), + } + ) + result_seen = tm.from_formula( + "cat_1 - 1", df, cat_missing_method=cat_missing_method + ) + result_unseen = result_seen.model_spec.get_model_matrix(df_unseen) + + assert result_seen.column_names == result_unseen.column_names + if cat_missing_method == "convert": + expected_array = np.array([[1, 0, 0], [0, 0, 1]], dtype=np.float64) + elif cat_missing_method == "zero": + expected_array = np.array([[1, 0], [0, 0]], dtype=np.float64) + + np.testing.assert_array_equal(result_unseen.A, expected_array) + + +def test_cat_missing_interactions(): + df = pd.DataFrame( + { + "cat_1": pd.Categorical(["a", "b", None, "b", "a"]), + "cat_2": pd.Categorical(["1", "2", None, "1", "2"]), + } + ) + formula = "C(cat_1, missing_method='convert') : C(cat_2, missing_method='zero') - 1" + expected_names = [ + "C(cat_1, missing_method='convert')[a]:C(cat_2, missing_method='zero')[1]", + "C(cat_1, missing_method='convert')[b]:C(cat_2, missing_method='zero')[1]", + "C(cat_1, missing_method='convert')[(MISSING)]:C(cat_2, missing_method='zero')[1]", + "C(cat_1, missing_method='convert')[a]:C(cat_2, missing_method='zero')[2]", + "C(cat_1, missing_method='convert')[b]:C(cat_2, missing_method='zero')[2]", + "C(cat_1, missing_method='convert')[(MISSING)]:C(cat_2, missing_method='zero')[2]", + ] + + assert tm.from_formula(formula, df).column_names == expected_names + + # Tests from formulaic's test suite # --------------------------------- diff --git a/tests/test_split_matrix.py b/tests/test_split_matrix.py index 5de01638..a102e8a6 100644 --- a/tests/test_split_matrix.py +++ b/tests/test_split_matrix.py @@ -61,27 +61,32 @@ def split_mat() -> SplitMatrix: return mat -def get_split_with_cat_components() -> ( - List[Union[tm.SparseMatrix, tm.DenseMatrix, tm.CategoricalMatrix]] -): +def get_split_with_cat_components( + missing, +) -> List[Union[tm.SparseMatrix, tm.DenseMatrix, tm.CategoricalMatrix]]: n_rows = 10 np.random.seed(0) dense_1 = tm.DenseMatrix(np.random.random((n_rows, 3))) sparse_1 = tm.SparseMatrix(sps.random(n_rows, 3).tocsc()) - cat = tm.CategoricalMatrix(np.random.choice(range(3), n_rows)) + if missing: + cat = tm.CategoricalMatrix( + np.random.choice([0, 1, 2, None], n_rows), cat_missing_method="zero" + ) + else: + cat = tm.CategoricalMatrix(np.random.choice(range(3), n_rows)) dense_2 = tm.DenseMatrix(np.random.random((n_rows, 3))) sparse_2 = tm.SparseMatrix(sps.random(n_rows, 3, density=0.5).tocsc()) cat_2 = tm.CategoricalMatrix(np.random.choice(range(3), n_rows), drop_first=True) return [dense_1, sparse_1, cat, dense_2, sparse_2, cat_2] -def split_with_cat() -> SplitMatrix: +def split_with_cat(missing) -> SplitMatrix: """Initialized with multiple sparse and dense parts and no indices.""" - return tm.SplitMatrix(get_split_with_cat_components()) + return tm.SplitMatrix(get_split_with_cat_components(missing)) -def split_with_cat_64() -> SplitMatrix: - mat = tm.SplitMatrix(get_split_with_cat_components()) +def split_with_cat_64(missing) -> SplitMatrix: + mat = tm.SplitMatrix(get_split_with_cat_components(missing)) matrices = mat.matrices for i, mat_ in enumerate(mat.matrices): @@ -99,7 +104,15 @@ def split_with_cat_64() -> SplitMatrix: return tm.SplitMatrix(matrices, mat.indices) -@pytest.mark.parametrize("mat", [split_with_cat(), split_with_cat_64()]) +@pytest.mark.parametrize( + "mat", + [ + split_with_cat(False), + split_with_cat_64(False), + split_with_cat(True), + split_with_cat_64(True), + ], +) def test_init(mat: SplitMatrix): assert len(mat.indices) == 4 assert len(mat.matrices) == 4 @@ -109,7 +122,15 @@ def test_init(mat: SplitMatrix): assert mat.matrices[2].shape == (10, 3) -@pytest.mark.parametrize("mat", [split_mat(), split_with_cat(), split_with_cat_64()]) +@pytest.mark.parametrize( + "mat", + [ + split_with_cat(False), + split_with_cat_64(False), + split_with_cat(True), + split_with_cat_64(True), + ], +) def test_init_from_split(mat): np.testing.assert_array_equal(mat.A, tm.SplitMatrix([mat]).A) np.testing.assert_array_equal( @@ -146,7 +167,15 @@ def test_sandwich_sparse_dense(X: np.ndarray, Acols, Bcols): # TODO: ensure cols are in order -@pytest.mark.parametrize("mat", [split_mat(), split_with_cat(), split_with_cat_64()]) +@pytest.mark.parametrize( + "mat", + [ + split_with_cat(False), + split_with_cat_64(False), + split_with_cat(True), + split_with_cat_64(True), + ], +) @pytest.mark.parametrize( "cols", [None, [0], [1, 2, 3], [1, 5]], @@ -160,7 +189,15 @@ def test_sandwich(mat: tm.SplitMatrix, cols): np.testing.assert_allclose(y1, y2, atol=1e-12) -@pytest.mark.parametrize("mat", [split_mat(), split_with_cat(), split_with_cat_64()]) +@pytest.mark.parametrize( + "mat", + [ + split_with_cat(False), + split_with_cat_64(False), + split_with_cat(True), + split_with_cat_64(True), + ], +) @pytest.mark.parametrize("cols", [None, [0], [1, 2, 3], [1, 5]]) def test_split_col_subsets(mat: tm.SplitMatrix, cols): subset_cols_indices, subset_cols, n_cols = mat._split_col_subsets(cols) @@ -189,56 +226,66 @@ def _get_lengths(vec_list: List[Optional[np.ndarray]]): assert (mat.indices[i] == subset_cols_indices[i]).all() -def random_split_matrix(seed=0, n_rows=10, n_cols_per=3): +def random_split_matrix(seed=0, n_rows=10, n_cols_per=3, missing=False): if seed is not None: np.random.seed(seed) dense_1 = tm.DenseMatrix(np.random.random((n_rows, n_cols_per))) sparse = tm.SparseMatrix(sps.random(n_rows, n_cols_per).tocsc()) - cat = tm.CategoricalMatrix(np.random.choice(range(n_cols_per), n_rows)) + if missing: + cat = tm.CategoricalMatrix( + np.random.choice(list(range(n_cols_per)) + [None], n_rows), + cat_missing_method="zero", + ) + else: + cat = tm.CategoricalMatrix(np.random.choice(range(n_cols_per), n_rows)) dense_2 = tm.DenseMatrix(np.random.random((n_rows, n_cols_per))) cat_2 = tm.CategoricalMatrix(np.random.choice(range(n_cols_per), n_rows)) mat = tm.SplitMatrix([dense_1, sparse, cat, dense_2, cat_2]) return mat -def many_random_tests(checker): +def many_random_tests(checker, missing): for i in range(10): mat = random_split_matrix( seed=(1 if i == 0 else None), n_rows=np.random.randint(130), n_cols_per=1 + np.random.randint(10), + missing=missing, ) checker(mat) -def test_sandwich_many_types(): +@pytest.mark.parametrize("missing", [False, True], ids=["no_missing", "missing"]) +def test_sandwich_many_types(missing): def check(mat): d = np.random.random(mat.shape[0]) res = mat.sandwich(d) expected = (mat.A.T * d[None, :]) @ mat.A np.testing.assert_allclose(res, expected) - many_random_tests(check) + many_random_tests(check, missing) -def test_transpose_matvec_many_types(): +@pytest.mark.parametrize("missing", [False, True], ids=["no_missing", "missing"]) +def test_transpose_matvec_many_types(missing): def check(mat): d = np.random.random(mat.shape[0]) res = mat.transpose_matvec(d) expected = mat.A.T.dot(d) np.testing.assert_almost_equal(res, expected) - many_random_tests(check) + many_random_tests(check, missing) -def test_matvec_many_types(): +@pytest.mark.parametrize("missing", [False, True], ids=["no_missing", "missing"]) +def test_matvec_many_types(missing): def check(mat): d = np.random.random(mat.shape[1]) res = mat.matvec(d) expected = mat.A.dot(d) np.testing.assert_almost_equal(res, expected) - many_random_tests(check) + many_random_tests(check, missing) def test_init_from_1d(): @@ -259,3 +306,17 @@ def test_matvec(n_rows): ) mat = from_pandas(X, cat_threshold=0) np.testing.assert_allclose(mat.matvec(np.array(mat.shape[1] * [1])), n_cols) + + +@pytest.mark.parametrize("cat_missing_method", ["fail", "zero", "convert"]) +def test_from_pandas_missing(cat_missing_method): + df = pd.DataFrame({"cat": pd.Categorical([1, 2, pd.NA, 1, 2, pd.NA])}) + if cat_missing_method == "fail": + with pytest.raises( + ValueError, match="Categorical data can't have missing values" + ): + from_pandas(df, cat_missing_method=cat_missing_method) + elif cat_missing_method == "zero": + assert from_pandas(df, cat_missing_method=cat_missing_method).shape == (6, 2) + elif cat_missing_method == "convert": + assert from_pandas(df, cat_missing_method=cat_missing_method).shape == (6, 3)