diff --git a/hed/tools/remodeling/operations/factor_hed_tags_op.py b/hed/tools/remodeling/operations/factor_hed_tags_op.py index be2ef760..69a1464d 100644 --- a/hed/tools/remodeling/operations/factor_hed_tags_op.py +++ b/hed/tools/remodeling/operations/factor_hed_tags_op.py @@ -2,13 +2,13 @@ import pandas as pd -import numpy as np from hed.tools.remodeling.operations.base_op import BaseOp from hed.models.tabular_input import TabularInput from hed.models.sidecar import Sidecar from hed.models import query_service from hed.tools.analysis.event_manager import EventManager from hed.tools.analysis.hed_tag_manager import HedTagManager +from hed.tools.util.data_util import replace_na class FactorHedTagsOp(BaseOp): @@ -126,7 +126,7 @@ def do_op(self, dispatcher, df, name, sidecar=None): if len(df_factors.columns) > 0: df_list.append(df_factors) df_new = pd.concat(df_list, axis=1) - df_new.replace('n/a', np.nan, inplace=True) + replace_na(df_new) return df_new @staticmethod diff --git a/hed/tools/remodeling/operations/factor_hed_type_op.py b/hed/tools/remodeling/operations/factor_hed_type_op.py index d8300c51..3d6f523f 100644 --- a/hed/tools/remodeling/operations/factor_hed_type_op.py +++ b/hed/tools/remodeling/operations/factor_hed_type_op.py @@ -1,11 +1,11 @@ """ Append to columnar file the factors computed from type variables. 
""" import pandas as pd -import numpy as np from hed.tools.remodeling.operations.base_op import BaseOp from hed.models.tabular_input import TabularInput from hed.tools.analysis.event_manager import EventManager from hed.tools.analysis.hed_type_manager import HedTypeManager +from hed.tools.util.data_util import replace_na class FactorHedTypeOp(BaseOp): @@ -82,7 +82,7 @@ def do_op(self, dispatcher, df, name, sidecar=None): if len(df_factors.columns) > 0: df_list.append(df_factors) df_new = pd.concat(df_list, axis=1) - df_new.replace('n/a', np.nan, inplace=True) + replace_na(df_new) return df_new @staticmethod diff --git a/hed/tools/util/data_util.py b/hed/tools/util/data_util.py index 758db5e1..d7465aba 100644 --- a/hed/tools/util/data_util.py +++ b/hed/tools/util/data_util.py @@ -211,6 +211,15 @@ def make_info_dataframe(col_info, selected_col): df = pd.DataFrame(sorted(list(col_values)), columns=[selected_col]) return df +def replace_na(df): + """ Replace (in place) the n/a with np.nan taking care of categorical columns. """ + for column in df.columns: + if df[column].dtype.name != 'category': + df[column] = df[column].replace('n/a', np.nan) + elif 'n/a' in df[column].cat.categories: + df[column] = df[column].astype('object') + df[column] = df[column].replace('n/a', np.nan) + df[column] = pd.Categorical(df[column]) def replace_values(df, values=None, replace_value='n/a', column_list=None): """ Replace string values in specified columns. 
def test_replace_na(self):
    """ Exercise replace_na on categorical and plain columns, with and without n/a entries. """
    # Categorical column and plain column that each hold an n/a entry.
    with_na = DataFrame({'A': Categorical(['apple', 'n/a', 'cherry']),
                         'B': ['n/a', 'pear', 'banana']})
    replace_na(with_na)
    for col_name in ('A', 'B'):
        self.assertTrue(with_na[col_name].isnull().any())

    # Categorical column and plain column without any n/a entries.
    without_na = DataFrame({'A': Categorical(['apple', 'orange', 'cherry']),
                            'B': ['pear', 'melon', 'banana']})
    replace_na(without_na)
    for col_name in ('A', 'B'):
        self.assertFalse(without_na[col_name].isnull().any())

    # A numeric column alongside n/a-bearing columns keeps its values.
    mixed = DataFrame({'A': Categorical(['apple', 'n/a', 'cherry']),
                       'B': ['n/a', 'pear', 'banana'],
                       'C': [1, 2, 3]})
    replace_na(mixed)
    self.assertEqual(list(mixed['C']), [1, 2, 3])

    # Only plain (non-categorical) columns holding n/a entries.
    plain = DataFrame({'A': ['apple', 'n/a', 'cherry'],
                       'B': ['n/a', 'pear', 'banana']})
    replace_na(plain)
    for col_name in ('A', 'B'):
        self.assertTrue(plain[col_name].isnull().any())
'Age': [np.nan, 10, '', 'n/a', '0', '10']}