Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merge NA #961

Merged
merged 3 commits into from
Jun 14, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions hed/tools/remodeling/operations/factor_hed_tags_op.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,13 @@


import pandas as pd
import numpy as np
from hed.tools.remodeling.operations.base_op import BaseOp
from hed.models.tabular_input import TabularInput
from hed.models.sidecar import Sidecar
from hed.models import query_service
from hed.tools.analysis.event_manager import EventManager
from hed.tools.analysis.hed_tag_manager import HedTagManager
from hed.tools.util.data_util import replace_na


class FactorHedTagsOp(BaseOp):
Expand Down Expand Up @@ -126,7 +126,7 @@ def do_op(self, dispatcher, df, name, sidecar=None):
if len(df_factors.columns) > 0:
df_list.append(df_factors)
df_new = pd.concat(df_list, axis=1)
df_new.replace('n/a', np.nan, inplace=True)
replace_na(df_new)
return df_new

@staticmethod
Expand Down
4 changes: 2 additions & 2 deletions hed/tools/remodeling/operations/factor_hed_type_op.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
""" Append to columnar file the factors computed from type variables. """

import pandas as pd
import numpy as np
from hed.tools.remodeling.operations.base_op import BaseOp
from hed.models.tabular_input import TabularInput
from hed.tools.analysis.event_manager import EventManager
from hed.tools.analysis.hed_type_manager import HedTypeManager
from hed.tools.util.data_util import replace_na


class FactorHedTypeOp(BaseOp):
Expand Down Expand Up @@ -82,7 +82,7 @@ def do_op(self, dispatcher, df, name, sidecar=None):
if len(df_factors.columns) > 0:
df_list.append(df_factors)
df_new = pd.concat(df_list, axis=1)
df_new.replace('n/a', np.nan, inplace=True)
replace_na(df_new)
return df_new

@staticmethod
Expand Down
9 changes: 9 additions & 0 deletions hed/tools/util/data_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -211,6 +211,15 @@ def make_info_dataframe(col_info, selected_col):
df = pd.DataFrame(sorted(list(col_values)), columns=[selected_col])
return df

def replace_na(df):
""" Replace (in place) the n/a with np.nan taking care of categorical columns. """
for column in df.columns:
if df[column].dtype.name != 'category':
df[column] = df[column].replace('n/a', np.nan)
elif 'n/a' in df[column].cat.categories:
df[column] = df[column].astype('object')
df[column] = df[column].replace('n/a', np.nan)
df[column] = pd.Categorical(df[column])

def replace_values(df, values=None, replace_value='n/a', column_list=None):
""" Replace string values in specified columns.
Expand Down
42 changes: 40 additions & 2 deletions tests/tools/util/test_data_util.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
import os
import unittest
import numpy as np
from pandas import DataFrame

from pandas import DataFrame, Categorical
from hed.errors.exceptions import HedFileError
from hed.tools.util.data_util import add_columns, check_match, delete_columns, delete_rows_by_column, \
get_key_hash, get_new_dataframe, get_row_hash, get_value_dict, \
make_info_dataframe, reorder_columns, replace_values, separate_values
make_info_dataframe, reorder_columns, replace_na, replace_values, separate_values


class Test(unittest.TestCase):
Expand Down Expand Up @@ -107,6 +108,43 @@ def test_make_info_dataframe(self):
df2 = make_info_dataframe(col_dict, "Baloney")
self.assertFalse(df2, "make_frame should return None if column name invalid")

def test_replace_na(self):
    """ Check that replace_na turns exactly the 'n/a' entries into missing values. """
    # Categorical column containing n/a: only the n/a positions become null
    # and the column remains categorical without an 'n/a' category.
    df = DataFrame({
        'A': Categorical(['apple', 'n/a', 'cherry']),
        'B': ['n/a', 'pear', 'banana']
    })
    replace_na(df)
    self.assertEqual(list(df['A'].isnull()), [False, True, False])
    self.assertEqual(list(df['B'].isnull()), [True, False, False])
    self.assertEqual(df['A'].dtype.name, 'category')
    self.assertNotIn('n/a', list(df['A'].cat.categories))

    # Categorical column not containing n/a: nothing changes.
    df = DataFrame({
        'A': Categorical(['apple', 'orange', 'cherry']),
        'B': ['pear', 'melon', 'banana']
    })
    replace_na(df)
    self.assertFalse(df['A'].isnull().any())
    self.assertFalse(df['B'].isnull().any())
    self.assertEqual(list(df['A']), ['apple', 'orange', 'cherry'])
    self.assertEqual(list(df['B']), ['pear', 'melon', 'banana'])

    # Values other than n/a are preserved in every column.
    df = DataFrame({
        'A': Categorical(['apple', 'n/a', 'cherry']),
        'B': ['n/a', 'pear', 'banana'],
        'C': [1, 2, 3]
    })
    replace_na(df)
    self.assertEqual(list(df['C']), [1, 2, 3])
    self.assertEqual(list(df['A'].dropna()), ['apple', 'cherry'])
    self.assertEqual(list(df['B'].dropna()), ['pear', 'banana'])

    # Non-categorical n/a replacement.
    df = DataFrame({
        'A': ['apple', 'n/a', 'cherry'],
        'B': ['n/a', 'pear', 'banana']
    })
    replace_na(df)
    self.assertEqual(list(df['A'].isnull()), [False, True, False])
    self.assertEqual(list(df['B'].isnull()), [True, False, False])
    self.assertEqual(list(df['A'].dropna()), ['apple', 'cherry'])
    self.assertEqual(list(df['B'].dropna()), ['pear', 'banana'])

def test_replace_values(self):
data = {'Name': ['n/a', '', 'tom', 'alice', 0, 1],
'Age': [np.nan, 10, '', 'n/a', '0', '10']}
Expand Down