diff --git a/data_analysis.py b/data_analysis.py
index 115fe6d..7f3ec5b 100644
--- a/data_analysis.py
+++ b/data_analysis.py
@@ -4,6 +4,7 @@
 import time
 from datetime import datetime

+import numpy as np
 import pandas as pd

 from utils.data_reader import (
@@ -155,8 +156,8 @@ def top_N(series, N=0, lower_limit=1):


 def summary(series):
-    # 1. count the number of missing (null) entries
-    missing = series.isna().sum()
+    # 1. count the number of missing (null or blank string) entries
+    missing = series.replace(r"^\s*$", np.nan, regex=True).isna().sum()

     # 2. basic descriptive statistics on the length of the values
     length = series.str.len().describe().to_dict()
diff --git a/households/matching.py b/households/matching.py
index 93c8e19..5ca4fa7 100644
--- a/households/matching.py
+++ b/households/matching.py
@@ -360,7 +360,7 @@ def get_household_matches(
                 pairs_writer.writerow(matching_pairs[i])
         print(f"[{datetime.now()}] Wrote matching pairs to {pairs_path}")

-    five_percent = int(len(matching_pairs) / 20)
+    five_percent = max(int(len(matching_pairs) / 20), 1)
     pos_to_pairs = {}
     # note: "for pair in matching_pairs:" had unexpectedly poor performance here
     for i in range(len(matching_pairs)):
@@ -407,23 +407,34 @@ def get_candidate_links(pii_lines, split_factor=4, exact_addresses=False, debug=
     # start with an empty index we can append to
     candidate_links = pd.MultiIndex.from_tuples([], names=[0, 1])

+    # only include lines with an address, since otherwise
+    # missing addresses will be considered a match ("" == "")
+    pii_lines_with_address = pii_lines[pii_lines.household_street_address != ""]
+
+    if len(pii_lines_with_address) == 0:
+        # essentially just a null check
+        # don't bother with the rest if we have no addresses
+        # this should never happen
+        return candidate_links
+
     # break up the dataframe into subframes,
     # and iterate over every pair of subframes.
     # we improve performance somewhat by only comparing looking forward,
     # that is, only comparing a given set of rows
     # against rows with higher indices.
-    for subset_A in np.array_split(pii_lines, split_factor):
+    for subset_A in np.array_split(pii_lines_with_address, split_factor):
         first_item_in_A = subset_A.index.min()
+
         # don't compare against earlier items
         # Note: this assumes that the index is the row number
         # (NOT the record_id/patid) and the df is sequential
         # this is currently the case in households.py#parse_source_file()
-        lines_to_compare = pii_lines[first_item_in_A:]
+        lines_to_compare = pii_lines_with_address[first_item_in_A:]

         # pick a sub split factor to give us ~same size subset_A and subset_B.
         # the idea is that there's some implicit overhead to splitting,
         # so don't split more tha necessary
-        sub_split_factor = int(len(lines_to_compare) / len(subset_A))
+        sub_split_factor = max(int(len(lines_to_compare) / len(subset_A)), 1)
         for subset_B in np.array_split(lines_to_compare, sub_split_factor):
             if debug:
                 print(
@@ -431,6 +442,7 @@ def get_candidate_links(pii_lines, split_factor=4, exact_addresses=False, debug=
                     f"[{subset_A.index.min()}..{subset_A.index.max()}]"
                     " against "
                     f"[{subset_B.index.min()}..{subset_B.index.max()}]"
+                    f". {len(candidate_links)} candidates so far"
                 )

             # note pairs_subset and candidate_links are MultiIndexes
@@ -452,13 +464,6 @@ def get_candidate_links(pii_lines, split_factor=4, exact_addresses=False, debug=

             gc.collect()

-    # rows with blank address match ("" == "") so drop those here
-    # TODO: ideally we wouldn't compare blank address lines in the first place
-    # but the indexing and splitting bits get complicated if we drop them earlier
-    blank_addresses = pii_lines[pii_lines["household_street_address"] == ""].index
-    candidate_links = candidate_links.drop(blank_addresses, level=0, errors="ignore")
-    candidate_links = candidate_links.drop(blank_addresses, level=1, errors="ignore")
-
     if debug:
         print(f"[{datetime.now()}] Found {len(candidate_links)} candidate pairs")

@@ -509,7 +514,7 @@ def get_matching_pairs(
     matching_pairs = pd.MultiIndex.from_tuples([], names=[0, 1])
     # we know that we could support len(subset_A) in memory above,
     # so use the same amount here
-    len_subset_A = int(len(pii_lines) / split_factor)
+    len_subset_A = max(int(len(pii_lines) / split_factor), 1)

     # note: np.array_split had unexpectedly poor performance here for very large indices
     for i in range(0, len(candidate_links), len_subset_A):
diff --git a/requirements.txt b/requirements.txt
index e5f93b4..25581df 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,7 +6,7 @@ clkhash>=0.16.0
 psycopg2>=2.8.3
 anonlink-client==0.1.5
 ijson>=3.1.2
-textdistance[extras]>=4.5.0
+textdistance>=4.5.0
 usaddress>=0.5.10
 pylint>=2.4.2
 tqdm>=4.36.1
diff --git a/utils/data_reader.py b/utils/data_reader.py
index c523983..7c77c37 100644
--- a/utils/data_reader.py
+++ b/utils/data_reader.py
@@ -124,14 +124,18 @@ def map_key(row, key):
     return row_key


-def empty_str_from_none(string):
-    if string is None:
+def empty_str_from_none(obj):
+    if obj is None:
         return ""
+    elif isinstance(obj, pd.Series):
+        return obj.fillna("")
     else:
-        return string
+        return obj


 def case_insensitive_lookup(row, key, version):
+    # IMPORTANT: this function gets called from extract.py and data_analysis.py
+    # with different types for `row`
     data_key = DATA_DICTIONARY[version][key]
     if isinstance(data_key, list):
         first_key = map_key(row, data_key[0])
@@ -141,6 +145,8 @@ def case_insensitive_lookup(row, key, version):
             if mapped_subkey:
                 subdata = empty_str_from_none(row[mapped_subkey])
                 data = data + " " + subdata
+    if isinstance(data, pd.Series):
+        data.name = key

     return data
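
Reviewer note (not part of the patch): a minimal standalone sketch of how the blank-string-aware missing count in summary() differs from a plain isna() count; the sample column below is invented for illustration.

    import numpy as np
    import pandas as pd

    # made-up sample column: one real address, one empty string,
    # one whitespace-only string, one true null
    series = pd.Series(["123 Main St", "", "   ", None])

    plain_missing = series.isna().sum()                                       # 1 (only the None)
    blank_aware = series.replace(r"^\s*$", np.nan, regex=True).isna().sum()   # 3 (None, "", "   ")

    print(plain_missing, blank_aware)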
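
Likewise, a toy example (values chosen to force the zero case, not taken from real data) of the failure the max(..., 1) guards prevent: the plain integer division returns 0 whenever the numerator is smaller than the divisor, and range() rejects a zero step, just as np.array_split() rejects zero sections.

    pii_line_count = 3        # fewer rows than the split factor
    split_factor = 4
    candidate_link_count = 5

    old_step = int(pii_line_count / split_factor)            # 0
    new_step = max(int(pii_line_count / split_factor), 1)    # 1

    # range(0, candidate_link_count, old_step) would raise
    # "ValueError: range() arg 3 must not be zero";
    # np.array_split(df, 0) fails the same way for zero sections.
    for i in range(0, candidate_link_count, new_step):
        print(i)  # 0 1 2 3 4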
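
Finally, a quick sketch (inputs invented) of the patched empty_str_from_none(), showing that it now handles a whole pandas column as well as a single scalar value, which is what the new comment in case_insensitive_lookup() alludes to.

    import pandas as pd

    def empty_str_from_none(obj):
        if obj is None:
            return ""
        elif isinstance(obj, pd.Series):
            return obj.fillna("")
        else:
            return obj

    print(repr(empty_str_from_none(None)))                          # ''
    print(repr(empty_str_from_none("foo")))                         # 'foo'
    print(empty_str_from_none(pd.Series(["123 Main St", None])).tolist())  # ['123 Main St', '']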