From 848e8dd8bbb7d2a0cfa704778c4d722f7b088cd9 Mon Sep 17 00:00:00 2001 From: Ignacio Reyes Date: Wed, 16 Jun 2021 00:12:42 -0400 Subject: [PATCH 01/12] features adapted to data from forced photometry service --- .../features/custom/ztf_feature_extractor.py | 30 +++--- .../sn_parametric_model_computer.py | 4 +- lc_classifier/features/preprocess/base.py | 1 - .../features/preprocess/preprocess_ztf.py | 100 ++++++++++++++++++ 4 files changed, 117 insertions(+), 18 deletions(-) diff --git a/lc_classifier/features/custom/ztf_feature_extractor.py b/lc_classifier/features/custom/ztf_feature_extractor.py index 55ab7ab..fd7a0e4 100644 --- a/lc_classifier/features/custom/ztf_feature_extractor.py +++ b/lc_classifier/features/custom/ztf_feature_extractor.py @@ -134,12 +134,12 @@ def filter_out_short_lightcurves(self, detections): class ZTFForcedPhotometryFeatureExtractor(FeatureExtractor): def __init__(self, bands=(1, 2), stream=False): self.bands = list(bands) - self.stream = stream + # self.stream = stream extractors = [ GalacticCoordinatesExtractor(), ZTFColorFeatureExtractor(), - RealBogusExtractor(), + # RealBogusExtractor(), MHPSExtractor(bands), IQRExtractor(bands), TurboFatsFeatureExtractor(bands), @@ -151,16 +151,16 @@ def __init__(self, bands=(1, 2), stream=False): HarmonicsExtractor(bands), GPDRWExtractor(bands) ] - if self.stream: - extractors += [ - StreamSGScoreExtractor(), - WiseStreamExtractor() - ] - else: - extractors += [ - SGScoreExtractor(), - WiseStaticExtractor() - ] + # if self.stream: + # extractors += [ + # StreamSGScoreExtractor(), + # WiseStreamExtractor() + # ] + # else: + # extractors += [ + # SGScoreExtractor(), + # WiseStaticExtractor() + # ] self.composed_feature_extractor = FeatureExtractorComposer(extractors) @lru_cache(1) @@ -201,8 +201,8 @@ def _compute_features(self, detections, **kwargs): """ required = [] - if self.stream: - required += ['metadata', 'xmatches'] + # if self.stream: + # required += ['metadata', 'xmatches'] for key in required: if key not in kwargs: raise Exception(f"HierarchicalFeaturesComputer requires {key} argument") @@ -226,4 +226,4 @@ def filter_out_short_lightcurves(self, detections): has_enough_alerts = self.get_enough_alerts_mask(detections) too_short_oids = has_enough_alerts[~has_enough_alerts] detections = detections.loc[has_enough_alerts] - return detections, too_short_oids.index.values \ No newline at end of file + return detections, too_short_oids.index.values diff --git a/lc_classifier/features/extractors/sn_parametric_model_computer.py b/lc_classifier/features/extractors/sn_parametric_model_computer.py index baf7d8c..0515550 100644 --- a/lc_classifier/features/extractors/sn_parametric_model_computer.py +++ b/lc_classifier/features/extractors/sn_parametric_model_computer.py @@ -122,8 +122,8 @@ def fit(self, times, fluxpsf, obs_errors): argmax_fluxpsf = np.argmax(fluxpsf) max_fluxpsf = fluxpsf[argmax_fluxpsf] A_bounds = [max_fluxpsf / 3.0, max_fluxpsf * 3.0] - t0_bounds = [-50.0, 70.0] - gamma_bounds = [1.0, 100.0] + t0_bounds = [-50.0, 90.0] + gamma_bounds = [1.0, 120.0] beta_bounds = [0.0, 1.0] trise_bounds = [1.0, 100.0] tfall_bounds = [1.0, 180.0] diff --git a/lc_classifier/features/preprocess/base.py b/lc_classifier/features/preprocess/base.py index c9635b6..8eb96bc 100644 --- a/lc_classifier/features/preprocess/base.py +++ b/lc_classifier/features/preprocess/base.py @@ -13,7 +13,6 @@ def verify_dataframe(self, dataframe): raise ValueError("Input isn't a Pandas DataFrame") return - @abstractmethod def preprocess(self, dataframe): 
""" diff --git a/lc_classifier/features/preprocess/preprocess_ztf.py b/lc_classifier/features/preprocess/preprocess_ztf.py index ddfd007..87152c8 100644 --- a/lc_classifier/features/preprocess/preprocess_ztf.py +++ b/lc_classifier/features/preprocess/preprocess_ztf.py @@ -148,3 +148,103 @@ def rename_columns_non_detections(self, non_detections): def rename_columns_detections(self, detections): return detections.rename( columns=self.column_translation, errors='ignore') + + +class ZTFForcedPhotometryLightcurvePreprocessor(GenericPreprocessor): + def __init__(self): + super().__init__() + + self.required_columns = [ + 'time', + 'band', + 'magnitude', + 'error', + 'magpsf', # TODO: rename to diff_magnitude + 'sigmapsf', # diff_error + 'diff_flux', + 'diff_err', + 'ra', + 'dec', + 'infobitssci' + ] + + self.column_translation = { + 'mjd': 'time', + 'fid': 'band', + } + self.max_sigma = 1.0 + + def has_necessary_columns(self, dataframe): + """ + :param dataframe: + :return: + """ + booleans = list(map(lambda x: x in dataframe.columns, self.required_columns)) + return reduce(lambda x, y: x & y, booleans) + + def discard_invalid_value_detections(self, detections): + """ + :param detections: + :return: + """ + detections = detections.replace([np.inf, -np.inf], np.nan) + valid_alerts = detections[self.required_columns].notna().all(axis=1) + detections = detections[valid_alerts.values] + detections[self.required_columns] = detections[self.required_columns].apply( + lambda x: pd.to_numeric(x, errors='coerce')) + return detections + + def drop_duplicates(self, detections): + """ + :param detections: + :return: + """ + assert detections.index.name == 'oid' + detections = detections.copy() + detections['oid'] = detections.index + detections = detections.drop_duplicates(['oid', 'time']) + detections = detections[[col for col in detections.columns if col != 'oid']] + return detections + + def discard_noisy_detections(self, detections): + """ + :param detections: + :return: + """ + detections = detections[((detections['error'] > 0.0) & + (detections['error'] < self.max_sigma)) + ] + return detections + + def discard_defectuous_detections(self, detections): + detections = detections[detections['infobitssci'] == 0.0] + return detections + + def enough_alerts(self, detections, min_dets=5): + objects = detections.groupby("oid") + indexes = [] + for oid, group in objects: + if len(group.band == 1) > min_dets or len(group.band == 2) > min_dets: + indexes.append(oid) + return detections.loc[indexes] + + def preprocess(self, dataframe, objects=None): + """ + :param dataframe: + :param objects: + :return: + """ + self.verify_dataframe(dataframe) + dataframe = self.rename_columns_detections(dataframe) + if not self.has_necessary_columns(dataframe): + raise Exception('dataframe does not have all the necessary columns') + dataframe = self.drop_duplicates(dataframe) + dataframe = self.discard_invalid_value_detections(dataframe) + dataframe = self.discard_noisy_detections(dataframe) + dataframe = self.discard_defectuous_detections(dataframe) + dataframe = self.enough_alerts(dataframe) + return dataframe + + def rename_columns_detections(self, detections): + return detections.rename( + columns=self.column_translation, errors='ignore') From b8c56a21d5d7f94be50c8c5cb057f4fa3635cf0a Mon Sep 17 00:00:00 2001 From: Ignacio Reyes Date: Wed, 16 Jun 2021 02:40:10 -0400 Subject: [PATCH 02/12] fix warnings in folded kim extractor --- .../extractors/folded_kim_extractor.py | 30 +++++++++++-------- 1 file changed, 18 insertions(+), 
12 deletions(-) diff --git a/lc_classifier/features/extractors/folded_kim_extractor.py b/lc_classifier/features/extractors/folded_kim_extractor.py index 80b146a..859de2c 100644 --- a/lc_classifier/features/extractors/folded_kim_extractor.py +++ b/lc_classifier/features/extractors/folded_kim_extractor.py @@ -55,20 +55,26 @@ def aux_function(oid_detections, band, **kwargs): return self.nan_series_in_band(band) oid_band_detections = oid_detections[oid_detections['band'] == band] + lc_len = len(oid_band_detections) + if lc_len <= 2: + psi_cumsum = psi_eta = np.nan + else: + time = oid_band_detections['time'].values + magnitude = oid_band_detections['magnitude'].values - time = oid_band_detections['time'].values - magnitude = oid_band_detections['magnitude'].values + folded_time = np.mod(time, 2 * oid_period) / (2 * oid_period) + sorted_mags = magnitude[np.argsort(folded_time)] + sigma = np.std(sorted_mags) + if sigma != 0.0: + m = np.mean(sorted_mags) + s = np.cumsum(sorted_mags - m) * 1.0 / (lc_len * sigma) + psi_cumsum = np.max(s) - np.min(s) + sigma_squared = sigma ** 2 + psi_eta = (1.0 / ((lc_len - 1) * sigma_squared) * + np.sum(np.power(sorted_mags[1:] - sorted_mags[:-1], 2))) + else: + psi_cumsum = psi_eta = np.nan - folded_time = np.mod(time, 2 * oid_period) / (2 * oid_period) - sorted_mags = magnitude[np.argsort(folded_time)] - sigma = np.std(sorted_mags) - m = np.mean(sorted_mags) - lc_len = len(sorted_mags) - s = np.cumsum(sorted_mags - m) * 1.0 / (lc_len * sigma) - psi_cumsum = np.max(s) - np.min(s) - sigma_squared = sigma ** 2 - psi_eta = (1.0 / ((lc_len - 1) * sigma_squared) * - np.sum(np.power(sorted_mags[1:] - sorted_mags[:-1], 2))) out = pd.Series( data=[psi_cumsum, psi_eta], index=columns) From d8770d99b0e8b9e34d005871d0329f542a571e1f Mon Sep 17 00:00:00 2001 From: Javier Date: Wed, 7 Jul 2021 10:34:02 -0400 Subject: [PATCH 03/12] fix bugs in preprocess --- .../features/custom/custom_hierarchical.py | 27 ++-- .../features/preprocess/preprocess_ztf.py | 125 ++---------------- 2 files changed, 22 insertions(+), 130 deletions(-) diff --git a/lc_classifier/features/custom/custom_hierarchical.py b/lc_classifier/features/custom/custom_hierarchical.py index df74a62..5bd1a22 100644 --- a/lc_classifier/features/custom/custom_hierarchical.py +++ b/lc_classifier/features/custom/custom_hierarchical.py @@ -9,7 +9,6 @@ from lc_classifier.features import MHPSExtractor from lc_classifier.features import IQRExtractor from lc_classifier.features import SNParametricModelExtractor -from lc_classifier.features import WiseStaticExtractor from lc_classifier.features import WiseStreamExtractor from lc_classifier.features import PeriodExtractor from lc_classifier.features import PowerRateExtractor @@ -18,7 +17,7 @@ from lc_classifier.features import GPDRWExtractor from ..core.base import FeatureExtractor, FeatureExtractorSingleBand -from ..preprocess import DetectionsPreprocessorZTF, StreamDetectionsPreprocessorZTF +from ..preprocess import DetectionsPreprocessorZTF import pandas as pd import logging @@ -78,8 +77,9 @@ def get_enough_alerts_mask(self, detections): ------- """ - n_detections = detections[["mjd"]].groupby(level=0).count() - has_enough_alerts = n_detections.mjd > 5 + n_detections_by_fid = detections[["mjd", "fid"]].groupby(["oid", "fid"]).count() + has_enough_alerts = n_detections_by_fid.mjd > 5 + has_enough_alerts = has_enough_alerts.groupby(level=0).sum() > 0 return has_enough_alerts def _compute_features(self, detections, **kwargs): @@ -105,7 +105,6 @@ def _compute_features(self, 
detections, **kwargs): too_short_oids = has_enough_alerts[~has_enough_alerts] too_short_features = pd.DataFrame(index=too_short_oids.index) detections = detections.loc[has_enough_alerts] - detections = detections.sort_values("mjd") non_detections = kwargs["non_detections"] if len(non_detections) == 0: @@ -146,7 +145,7 @@ def __init__(self, bands=None): HarmonicsExtractor(), GPDRWExtractor() ] - self.preprocessor = StreamDetectionsPreprocessorZTF() + self.preprocessor = DetectionsPreprocessorZTF() @lru_cache(1) def get_features_keys(self) -> List[str]: @@ -179,8 +178,9 @@ def get_enough_alerts_mask(self, detections): ------- """ - n_detections = detections[["mjd"]].groupby(level=0).count() - has_enough_alerts = n_detections.mjd > 5 + n_detections_by_fid = detections[["mjd", "fid"]].groupby(["oid", "fid"]).count() + has_enough_alerts = n_detections_by_fid.mjd > 5 + has_enough_alerts = has_enough_alerts.groupby(level=0).sum() > 0 return has_enough_alerts def _compute_features(self, detections, **kwargs): @@ -199,14 +199,15 @@ def _compute_features(self, detections, **kwargs): if not isinstance(detections, pd.core.frame.DataFrame): raise TypeError('detections has to be a DataFrame') - required = ["non_detections", "xmatches", "metadata"] + required = ["non_detections", "xmatches", "metadata", "objects"] for key in required: if key not in kwargs: raise Exception(f"HierarchicalFeaturesComputer requires {key} argument") - detections = self.preprocessor.preprocess(detections) + objects = kwargs["objects"] + detections = self.preprocessor.preprocess(detections, objects=objects) has_enough_alerts = self.get_enough_alerts_mask(detections) - too_short_oids = has_enough_alerts[~has_enough_alerts] - too_short_features = pd.DataFrame(index=too_short_oids.index) + # too_short_oids = has_enough_alerts[~has_enough_alerts] + # too_short_features = pd.DataFrame(index=too_short_oids.index) detections = detections.loc[has_enough_alerts] detections = detections.sort_values("mjd") if len(detections) == 0: @@ -231,5 +232,5 @@ def _compute_features(self, detections, **kwargs): logging.info(f"EXTRACTOR={ex}, FEATURE_SHAPE={df.shape}") features.append(df) df = pd.concat(features, axis=1, join="inner") - df = pd.concat([df, too_short_features], axis=0, join="outer", sort=True) + # df = pd.concat([df, too_short_features], axis=0, join="outer", sort=True) return df diff --git a/lc_classifier/features/preprocess/preprocess_ztf.py b/lc_classifier/features/preprocess/preprocess_ztf.py index 2aa7c7d..3994137 100644 --- a/lc_classifier/features/preprocess/preprocess_ztf.py +++ b/lc_classifier/features/preprocess/preprocess_ztf.py @@ -1,5 +1,4 @@ from .base import GenericPreprocessor -from functools import reduce import numpy as np import pandas as pd @@ -16,8 +15,7 @@ def __init__(self): 'sigmapsf_ml', 'ra', 'dec', - 'rb', - 'sgscore1' + 'rb' ] self.max_sigma = 1.0 self.rb_threshold = 0.55 @@ -27,8 +25,10 @@ def has_necessary_columns(self, dataframe): :param dataframe: :return: """ - booleans = list(map(lambda x: x in dataframe.columns, self.not_null_columns)) - return reduce(lambda x, y: x & y, booleans) + input_columns = set(dataframe.columns) + constraint = set(self.not_null_columns) + difference = constraint.difference(input_columns) + return len(difference) == 0 def discard_invalid_value_detections(self, detections): """ @@ -49,6 +49,7 @@ def drop_duplicates(self, detections): """ assert detections.index.name == 'oid' detections = detections.copy() + detections = detections.sort_values("mjd", ascending=True) 
detections['oid'] = detections.index detections = detections.drop_duplicates(['oid', 'mjd']) detections = detections[[col for col in detections.columns if col != 'oid']] @@ -110,120 +111,10 @@ def preprocess(self, dataframe, objects=None): dataframe = self.get_magpsf_ml(dataframe, objects) if not self.has_necessary_columns(dataframe): raise Exception('dataframe does not have all the necessary columns') - dataframe = self.drop_duplicates(dataframe) - dataframe = self.discard_invalid_value_detections(dataframe) - dataframe = self.discard_noisy_detections(dataframe) + dataframe.sort_values("mjd", inplace=True) dataframe = self.discard_bogus(dataframe) - dataframe = self.enough_alerts(dataframe) - return dataframe - - -class StreamDetectionsPreprocessorZTF(GenericPreprocessor): - def __init__(self): - super().__init__() - self.not_null_columns = [ - 'mjd', - 'fid', - 'magpsf', - 'sigmapsf', - 'magpsf_ml', - 'sigmapsf_ml', - 'ra', - 'dec', - 'rb', - ] - self.max_sigma = 1.0 - self.rb_threshold = 0.55 - - def has_necessary_columns(self, dataframe): - """ - :param dataframe: - :return: - """ - missing = set(self.not_null_columns).difference(set(dataframe.columns)) - return missing - - def discard_invalid_value_detections(self, detections): - """ - :param detections: - :return: - """ - detections = detections.replace([np.inf, -np.inf], np.nan) - valid_alerts = detections[self.not_null_columns].notna().all(axis=1) - detections = detections[valid_alerts.values] - detections[self.not_null_columns] = detections[self.not_null_columns].apply( - lambda x: pd.to_numeric(x, errors='coerce')) - return detections - - def drop_duplicates(self, detections): - """ - :param detections: - :return: - """ - assert detections.index.name == 'oid' - detections = detections.copy() - detections['oid'] = detections.index - detections = detections.drop_duplicates(['oid', 'mjd']) - detections = detections[[col for col in detections.columns if col != 'oid']] - return detections - - def discard_noisy_detections(self, detections): - """ - :param detections: - :return: - """ - detections = detections[((detections['sigmapsf_ml'] > 0.0) & - (detections['sigmapsf_ml'] < self.max_sigma)) - ] - return detections - - def discard_bogus(self, detections): - """ - - :param detections: - :return: - """ - detections = detections[detections['rb'] >= self.rb_threshold] - return detections - - def enough_alerts(self, detections, min_dets=5): - objects = detections.groupby("oid") - indexes = [] - for oid, group in objects: - if len(group.fid == 1) > min_dets or len(group.fid == 2) > min_dets: - indexes.append(oid) - return detections.loc[indexes] - - def get_magpsf_ml(self, detections): - def magpsf_ml(detections): - detections = detections.copy() - is_corrected = detections.corrected.all() - if is_corrected: - detections["magpsf_ml"] = detections["magpsf_corr"] - detections["sigmapsf_ml"] = detections["sigmapsf_corr_ext"] - else: - detections["magpsf_ml"] = detections["magpsf"] - detections["sigmapsf_ml"] = detections["sigmapsf"] - return detections - - detections = detections.groupby(level=0, sort=False)\ - .apply(magpsf_ml).droplevel(level=1) - return detections - - def preprocess(self, dataframe): - """ - :param dataframe: - :param objects: - :return: - """ - self.verify_dataframe(dataframe) - dataframe = self.get_magpsf_ml(dataframe) - missing = self.has_necessary_columns(dataframe) - if len(missing) > 0: - raise Exception(f'dataframe does not have all the necessary columns. 
Missing {missing}') - dataframe = self.drop_duplicates(dataframe) dataframe = self.discard_invalid_value_detections(dataframe) dataframe = self.discard_noisy_detections(dataframe) - dataframe = self.discard_bogus(dataframe) + dataframe = self.drop_duplicates(dataframe) dataframe = self.enough_alerts(dataframe) return dataframe From e0d09a4eda1e7274fab397c6eb5a977aa90e2437 Mon Sep 17 00:00:00 2001 From: Diego Rodriguez Date: Wed, 14 Jul 2021 12:18:10 -0400 Subject: [PATCH 04/12] update model to 1.1.0 --- lc_classifier/classifier/models.py | 200 ++++++++++++----------------- 1 file changed, 85 insertions(+), 115 deletions(-) diff --git a/lc_classifier/classifier/models.py b/lc_classifier/classifier/models.py index 0b50f72..78494dc 100644 --- a/lc_classifier/classifier/models.py +++ b/lc_classifier/classifier/models.py @@ -22,9 +22,8 @@ def predict(self, samples: pd.DataFrame) -> pd.DataFrame: probs = self.predict_proba(samples) predicted_class = probs.idxmax(axis=1) predicted_class_df = pd.DataFrame( - predicted_class, - columns=['classALeRCE'], - index=samples.index) + predicted_class, columns=["classALeRCE"], index=samples.index + ) predicted_class_df.index.name = samples.index.name return predicted_class_df @@ -57,16 +56,17 @@ class BaselineRandomForest(BaseClassifier): def __init__(self): self.random_forest_classifier = RandomForestClassifier( n_estimators=500, - max_features='auto', + max_features="auto", max_depth=None, n_jobs=1, class_weight=None, - criterion='entropy', + criterion="entropy", min_samples_split=2, - min_samples_leaf=1) + min_samples_leaf=1, + ) self.feature_preprocessor = FeaturePreprocessor() self.feature_list = None - self.model_filename = 'baseline_rf.pkl' + self.model_filename = "baseline_rf.pkl" def fit(self, samples: pd.DataFrame, labels: pd.DataFrame): samples = self.feature_preprocessor.preprocess_features(samples) @@ -77,7 +77,7 @@ def fit(self, samples: pd.DataFrame, labels: pd.DataFrame): self.feature_list = samples.columns samples_np_array = samples.values - labels_np_array = labels['classALeRCE'].loc[samples.index].values + labels_np_array = labels["classALeRCE"].loc[samples.index].values self.random_forest_classifier.fit(samples_np_array, labels_np_array) def predict_proba(self, samples: pd.DataFrame) -> pd.DataFrame: @@ -87,76 +87,63 @@ def predict_proba(self, samples: pd.DataFrame) -> pd.DataFrame: predicted_probs_df = pd.DataFrame( predicted_probs, columns=self.get_list_of_classes(), - index=samples.index.values + index=samples.index.values, ) - predicted_probs_df.index.name = 'oid' + predicted_probs_df.index.name = "oid" return predicted_probs_df def get_list_of_classes(self) -> list: return self.random_forest_classifier.classes_ def save_model(self, directory: str) -> None: - with open(os.path.join(directory, self.model_filename), 'wb') as f: - pickle.dump( - self.random_forest_classifier, - f, - pickle.HIGHEST_PROTOCOL) - with open(os.path.join(directory, 'feature_list.pkl'), 'wb') as f: - pickle.dump( - self.feature_list, - f, - pickle.HIGHEST_PROTOCOL) + with open(os.path.join(directory, self.model_filename), "wb") as f: + pickle.dump(self.random_forest_classifier, f, pickle.HIGHEST_PROTOCOL) + with open(os.path.join(directory, "feature_list.pkl"), "wb") as f: + pickle.dump(self.feature_list, f, pickle.HIGHEST_PROTOCOL) def load_model(self, directory: str) -> None: rf = pd.read_pickle(os.path.join(directory, self.model_filename)) self.random_forest_classifier = rf - self.feature_list = pd.read_pickle( - os.path.join(directory, 'feature_list.pkl')) 
+ self.feature_list = pd.read_pickle(os.path.join(directory, "feature_list.pkl")) class HierarchicalRandomForest(BaseClassifier): MODEL_NAME = "hierarchical_random_forest" - MODEL_VERSION = "1.0.0" + MODEL_VERSION = "1.1.0" MODEL_VERSION_NAME = f"{MODEL_NAME}_{MODEL_VERSION}" MODEL_PICKLE_PATH = os.path.join(PICKLE_PATH, f"{MODEL_VERSION_NAME}") def __init__(self, taxonomy_dictionary, non_used_features=None): n_trees = 500 self.top_classifier = RandomForestClassifier( - n_estimators=n_trees, - max_depth=None, - max_features='auto' + n_estimators=n_trees, max_depth=None, max_features="auto" ) self.stochastic_classifier = RandomForestClassifier( - n_estimators=n_trees, - max_depth=None, - max_features=0.2 + n_estimators=n_trees, max_depth=None, max_features=0.2 ) self.periodic_classifier = RandomForestClassifier( - n_estimators=n_trees, - max_depth=None, - max_features='auto' + n_estimators=n_trees, max_depth=None, max_features="auto" ) self.transient_classifier = RandomForestClassifier( - n_estimators=n_trees, - max_depth=None, - max_features='auto' + n_estimators=n_trees, max_depth=None, max_features="auto" ) - self.feature_preprocessor = FeaturePreprocessor(non_used_features=non_used_features) + self.feature_preprocessor = FeaturePreprocessor( + non_used_features=non_used_features + ) self.taxonomy_dictionary = taxonomy_dictionary self.feature_list = None self.inverted_dictionary = invert_dictionary(self.taxonomy_dictionary) self.pickles = { - "features_list":"features_RF_model.pkl", - "top_rf":"hierarchical_level_RF_model.pkl", - "periodic_rf":"periodic_level_RF_model.pkl", - "stochastic_rf":"stochastic_level_RF_model.pkl", - "transient_rf":"transient_level_RF_model.pkl" + "features_list": "features_RF_model.pkl", + "top_rf": "top_level_BRF_model.pkl", + "periodic_rf": "periodic_level_BRF_model.pkl", + "stochastic_rf": "stochastic_level_BRF_model.pkl", + "transient_rf": "transient_level_BRF_model.pkl", } self.url_model = f"https://assets.alerce.online/pipeline/hierarchical_rf_{self.MODEL_VERSION}/" @@ -169,10 +156,10 @@ def fit(self, samples: pd.DataFrame, labels: pd.DataFrame) -> None: for label in feeded_labels: if label not in expected_labels: - raise Exception(f'{label} is not in the taxonomy dictionary') + raise Exception(f"{label} is not in the taxonomy dictionary") # Create top class - labels['top_class'] = labels['classALeRCE'].map(self.inverted_dictionary) + labels["top_class"] = labels["classALeRCE"].map(self.inverted_dictionary) # Preprocessing samples = self.feature_preprocessor.preprocess_features(samples) @@ -183,28 +170,25 @@ def fit(self, samples: pd.DataFrame, labels: pd.DataFrame) -> None: self.feature_list = samples.columns # Train top classifier - self.top_classifier.fit(samples.values, labels['top_class'].values) + self.top_classifier.fit(samples.values, labels["top_class"].values) # Train specialized classifiers - is_stochastic = labels['top_class'] == 'Stochastic' + is_stochastic = labels["top_class"] == "Stochastic" self.stochastic_classifier.fit( - samples[is_stochastic].values, - labels[is_stochastic]['classALeRCE'].values + samples[is_stochastic].values, labels[is_stochastic]["classALeRCE"].values ) - is_periodic = labels['top_class'] == 'Periodic' + is_periodic = labels["top_class"] == "Periodic" self.periodic_classifier.fit( - samples[is_periodic].values, - labels[is_periodic]['classALeRCE'].values + samples[is_periodic].values, labels[is_periodic]["classALeRCE"].values ) - is_transient = labels['top_class'] == 'Transient' + is_transient = labels["top_class"] 
== "Transient" self.transient_classifier.fit( - samples[is_transient].values, - labels[is_transient]['classALeRCE'].values + samples[is_transient].values, labels[is_transient]["classALeRCE"].values ) - def check_missing_features(self,columns, feature_list): + def check_missing_features(self, columns, feature_list): missing = set(feature_list).difference(set(columns)) return missing @@ -222,76 +206,68 @@ def predict_proba(self, samples: pd.DataFrame) -> pd.DataFrame: periodic_probs = self.periodic_classifier.predict_proba(samples.values) transient_probs = self.transient_classifier.predict_proba(samples.values) - stochastic_index = self.top_classifier.classes_.tolist().index('Stochastic') - periodic_index = self.top_classifier.classes_.tolist().index('Periodic') - transient_index = self.top_classifier.classes_.tolist().index('Transient') + stochastic_index = self.top_classifier.classes_.tolist().index("Stochastic") + periodic_index = self.top_classifier.classes_.tolist().index("Periodic") + transient_index = self.top_classifier.classes_.tolist().index("Transient") - stochastic_probs = stochastic_probs * top_probs[:, stochastic_index].reshape([-1, 1]) + stochastic_probs = stochastic_probs * top_probs[:, stochastic_index].reshape( + [-1, 1] + ) periodic_probs = periodic_probs * top_probs[:, periodic_index].reshape([-1, 1]) - transient_probs = transient_probs * top_probs[:, transient_index].reshape([-1, 1]) + transient_probs = transient_probs * top_probs[:, transient_index].reshape( + [-1, 1] + ) final_probs = np.concatenate( - [stochastic_probs, periodic_probs, transient_probs], - axis=1 + [stochastic_probs, periodic_probs, transient_probs], axis=1 ) df = pd.DataFrame( - data=final_probs, - index=samples.index, - columns=self.get_list_of_classes() + data=final_probs, index=samples.index, columns=self.get_list_of_classes() ) df.index.name = samples.index.name return df def get_list_of_classes(self) -> list: final_columns = ( - self.stochastic_classifier.classes_.tolist() - + self.periodic_classifier.classes_.tolist() - + self.transient_classifier.classes_.tolist()) + self.stochastic_classifier.classes_.tolist() + + self.periodic_classifier.classes_.tolist() + + self.transient_classifier.classes_.tolist() + ) return final_columns def save_model(self, directory: str) -> None: - with open(os.path.join(directory, self.pickles['top_rf']), 'wb') as f: - pickle.dump( - self.top_classifier, - f, - pickle.HIGHEST_PROTOCOL) - - with open(os.path.join(directory, self.pickles['stochastic_rf']), 'wb') as f: - pickle.dump( - self.stochastic_classifier, - f, - pickle.HIGHEST_PROTOCOL) - - with open(os.path.join(directory, self.pickles['periodic_rf']), 'wb') as f: - pickle.dump( - self.periodic_classifier, - f, - pickle.HIGHEST_PROTOCOL) - - with open(os.path.join(directory, self.pickles['transient_rf']), 'wb') as f: - pickle.dump( - self.transient_classifier, - f, - pickle.HIGHEST_PROTOCOL) - - with open(os.path.join(directory, self.pickles['features_list']), 'wb') as f: - pickle.dump( - self.feature_list, - f, - pickle.HIGHEST_PROTOCOL) + with open(os.path.join(directory, self.pickles["top_rf"]), "wb") as f: + pickle.dump(self.top_classifier, f, pickle.HIGHEST_PROTOCOL) + + with open(os.path.join(directory, self.pickles["stochastic_rf"]), "wb") as f: + pickle.dump(self.stochastic_classifier, f, pickle.HIGHEST_PROTOCOL) + + with open(os.path.join(directory, self.pickles["periodic_rf"]), "wb") as f: + pickle.dump(self.periodic_classifier, f, pickle.HIGHEST_PROTOCOL) + + with open(os.path.join(directory, 
self.pickles["transient_rf"]), "wb") as f: + pickle.dump(self.transient_classifier, f, pickle.HIGHEST_PROTOCOL) + + with open(os.path.join(directory, self.pickles["features_list"]), "wb") as f: + pickle.dump(self.feature_list, f, pickle.HIGHEST_PROTOCOL) def load_model(self, directory: str) -> None: self.top_classifier = pd.read_pickle( - os.path.join(directory, self.pickles['top_rf'] )) + os.path.join(directory, self.pickles["top_rf"]) + ) self.stochastic_classifier = pd.read_pickle( - os.path.join(directory, self.pickles['stochastic_rf'])) + os.path.join(directory, self.pickles["stochastic_rf"]) + ) self.periodic_classifier = pd.read_pickle( - os.path.join(directory, self.pickles['periodic_rf'])) + os.path.join(directory, self.pickles["periodic_rf"]) + ) self.transient_classifier = pd.read_pickle( - os.path.join(directory, self.pickles['transient_rf'])) + os.path.join(directory, self.pickles["transient_rf"]) + ) self.feature_list = pd.read_pickle( - os.path.join(directory, self.pickles['features_list'])) + os.path.join(directory, self.pickles["features_list"]) + ) def download_model(self): if not os.path.exists(self.MODEL_PICKLE_PATH): @@ -304,8 +280,8 @@ def download_model(self): def predict_in_pipeline(self, input_features: pd.DataFrame) -> dict: if not isinstance(input_features, pd.core.frame.DataFrame): - raise TypeError('predict_in_pipeline expects a DataFrame.') - + raise TypeError("predict_in_pipeline expects a DataFrame.") + missing = self.check_missing_features(input_features.columns, self.feature_list) if len(missing) > 0: raise Exception(f"Missing features: {missing}") @@ -316,7 +292,7 @@ def predict_in_pipeline(self, input_features: pd.DataFrame) -> dict: prob_root = pd.DataFrame( self.top_classifier.predict_proba(input_features), columns=self.top_classifier.classes_, - index=input_features.index + index=input_features.index, ) prob_children = [] @@ -325,18 +301,14 @@ def predict_in_pipeline(self, input_features: pd.DataFrame) -> dict: child_models = [ self.stochastic_classifier, self.periodic_classifier, - self.transient_classifier - ] - child_names = [ - 'Stochastic', - 'Periodic', - 'Transient' + self.transient_classifier, ] + child_names = ["Stochastic", "Periodic", "Transient"] for name, model in zip(child_names, child_models): prob_child = pd.DataFrame( model.predict_proba(input_features), columns=model.classes_, - index=input_features.index + index=input_features.index, ) resp_children[name] = prob_child @@ -345,9 +317,7 @@ def predict_in_pipeline(self, input_features: pd.DataFrame) -> dict: prob_all = pd.concat(prob_children, axis=1, sort=False) return { - "hierarchical": { - "top": prob_root, - "children": resp_children}, + "hierarchical": {"top": prob_root, "children": resp_children}, "probabilities": prob_all, - "class": prob_all.idxmax(axis=1) + "class": prob_all.idxmax(axis=1), } From 67c1f14a67d3f454bf3a5ed4175efc9a1b2d9e16 Mon Sep 17 00:00:00 2001 From: Diego Rodriguez Date: Wed, 14 Jul 2021 13:07:24 -0400 Subject: [PATCH 05/12] fix bug when trying to use missing preprocessor --- .../custom/forced_photometry_extractor.py | 25 ++++++++++--------- 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/lc_classifier/features/custom/forced_photometry_extractor.py b/lc_classifier/features/custom/forced_photometry_extractor.py index 72d0d06..6ae442d 100644 --- a/lc_classifier/features/custom/forced_photometry_extractor.py +++ b/lc_classifier/features/custom/forced_photometry_extractor.py @@ -18,7 +18,7 @@ from lc_classifier.features import GPDRWExtractor from 
..core.base import FeatureExtractor, FeatureExtractorSingleBand
-from ..preprocess import DetectionsPreprocessorZTF, StreamDetectionsPreprocessorZTF
+from ..preprocess import DetectionsPreprocessorZTF
 
 import pandas as pd
 import logging
@@ -42,7 +42,7 @@ def __init__(self, bands=None):
             PowerRateExtractor(),
             FoldedKimExtractor(),
             HarmonicsExtractor(),
-            GPDRWExtractor()
+            GPDRWExtractor(),
         ]
         self.preprocessor = DetectionsPreprocessorZTF()
 
@@ -107,9 +107,7 @@ def _compute_features(self, detections, **kwargs):
         shared_data = dict()
         grouped_detections = detections.groupby(level=0)
         for ex in self.extractors:
-            df = ex.compute_features(
-                grouped_detections,
-                shared_data=shared_data)
+            df = ex.compute_features(grouped_detections, shared_data=shared_data)
             logging.info(f"FLAG={ex}")
             features.append(df)
         df = pd.concat(features, axis=1, join="inner")
@@ -135,9 +133,9 @@ def __init__(self, bands=None):
             PowerRateExtractor(),
             FoldedKimExtractor(),
             HarmonicsExtractor(),
-            GPDRWExtractor()
+            GPDRWExtractor(),
         ]
-        self.preprocessor = StreamDetectionsPreprocessorZTF()
+        self.preprocessor = DetectionsPreprocessorZTF()
 
     def get_features_keys(self) -> List[str]:
         features_keys = []
@@ -185,13 +183,16 @@ def _compute_features(self, detections, **kwargs):
         """
         if not isinstance(detections, pd.core.frame.DataFrame):
-            raise TypeError('detections has to be a DataFrame')
-
-        required = ["xmatches", "metadata"]
+            raise TypeError("detections has to be a DataFrame")
+
+        required = ["xmatches", "metadata", "objects"]
         for key in required:
             if key not in kwargs:
-                raise Exception(f"StreamedForcedPhotometryExtractor requires {key} argument")
-        detections = self.preprocessor.preprocess(detections)
+                raise Exception(
+                    f"StreamedForcedPhotometryExtractor requires {key} argument"
+                )
+        objects = kwargs["objects"]
+        detections = self.preprocessor.preprocess(detections, objects=objects)
         has_enough_alerts = self.get_enough_alerts_mask(detections)
         too_short_oids = has_enough_alerts[~has_enough_alerts]
         too_short_features = pd.DataFrame(index=too_short_oids.index)

From bac74c0da05cf7bbbb26377c7fdcf5b8e517aa40 Mon Sep 17 00:00:00 2001
From: Javier
Date: Thu, 22 Jul 2021 14:59:31 -0400
Subject: [PATCH 06/12] the script doesn't use sgscore in batch

---
 lc_classifier/features/custom/custom_hierarchical.py       | 8 ++++----
 .../features/extractors/sn_non_detections_extractor.py     | 1 +
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/lc_classifier/features/custom/custom_hierarchical.py b/lc_classifier/features/custom/custom_hierarchical.py
index 5bd1a22..195789b 100644
--- a/lc_classifier/features/custom/custom_hierarchical.py
+++ b/lc_classifier/features/custom/custom_hierarchical.py
@@ -29,7 +29,7 @@ def __init__(self, bands=None):
         self.bands = bands if bands is not None else [1, 2]
         self.extractors = [
             GalacticCoordinatesExtractor(),
-            SGScoreExtractor(),
+            # SGScoreExtractor(),
             ColorFeatureExtractor(),
             RealBogusExtractor(),
             MHPSExtractor(),
@@ -102,8 +102,8 @@ def _compute_features(self, detections, **kwargs):
         objects = kwargs["objects"]
         detections = self.preprocessor.preprocess(detections, objects=objects)
         has_enough_alerts = self.get_enough_alerts_mask(detections)
-        too_short_oids = has_enough_alerts[~has_enough_alerts]
-        too_short_features = pd.DataFrame(index=too_short_oids.index)
+        # too_short_oids = has_enough_alerts[~has_enough_alerts]
+        # too_short_features = pd.DataFrame(index=too_short_oids.index)
         detections = detections.loc[has_enough_alerts]
 
         non_detections = kwargs["non_detections"]
@@ -121,7 +121,7 @@ def _compute_features(self, detections, **kwargs):
             logging.info(f"FLAG={ex}")
             features.append(df)
         df = pd.concat(features, axis=1, join="inner")
-        df = pd.concat([df, too_short_features], axis=0, join="outer", sort=True)
+        # df = pd.concat([df, too_short_features], axis=0, join="outer", sort=True)
         return df
 
 
diff --git a/lc_classifier/features/extractors/sn_non_detections_extractor.py b/lc_classifier/features/extractors/sn_non_detections_extractor.py
index 395c30f..6b4c016 100644
--- a/lc_classifier/features/extractors/sn_non_detections_extractor.py
+++ b/lc_classifier/features/extractors/sn_non_detections_extractor.py
@@ -217,4 +217,5 @@ def aux_function(oid_detections, **kwargs):
 
         sn_features = detections.apply(aux_function)
         sn_features.index.name = 'oid'
+        sn_features = sn_features.astype(float)
         return sn_features
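
Note on the next patch: it corrects how each child classifier's conditional probabilities are combined with the top-level branch probabilities, and pairs each child model with its matching branch name. A minimal sketch of the weighting step, using toy object IDs and hypothetical class names (not taken from this repository):

    import pandas as pd

    # Toy numbers; in the pipeline these frames come from predict_proba calls.
    prob_root = pd.DataFrame(
        {"Stochastic": [0.2, 0.3], "Transient": [0.7, 0.1], "Periodic": [0.1, 0.6]},
        index=["oid1", "oid2"])
    prob_child = pd.DataFrame(  # e.g. the transient branch; class names are made up
        {"SNIa": [0.8, 0.5], "SNII": [0.2, 0.5]},
        index=["oid1", "oid2"])

    # Scale each row by the parent's probability for that branch; this is the
    # prob_child.mul(prob_root[name].values, axis=0) step in predict_in_pipeline.
    weighted = prob_child.mul(prob_root["Transient"].values, axis=0)
    #        SNIa  SNII
    # oid1   0.56  0.14
    # oid2   0.05  0.05

Concatenating the three weighted child frames column-wise then yields rows that sum to 1, which is a quick sanity check for the combined `probabilities` output.
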
From 4cca0eed0fbebe58086f0992064413ba8cc1d314 Mon Sep 17 00:00:00 2001
From: Javier
Date: Fri, 23 Jul 2021 12:07:44 -0400
Subject: [PATCH 07/12] fix probability combination between classifier levels

---
 lc_classifier/classifier/models.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/lc_classifier/classifier/models.py b/lc_classifier/classifier/models.py
index 78494dc..5f73e68 100644
--- a/lc_classifier/classifier/models.py
+++ b/lc_classifier/classifier/models.py
@@ -297,13 +297,13 @@ def predict_in_pipeline(self, input_features: pd.DataFrame) -> dict:
 
         prob_children = []
         resp_children = {}
-
         child_models = [
             self.stochastic_classifier,
-            self.periodic_classifier,
             self.transient_classifier,
+            self.periodic_classifier,
         ]
-        child_names = ["Stochastic", "Periodic", "Transient"]
+        child_names = ["Stochastic", "Transient", "Periodic"]
+
         for name, model in zip(child_names, child_models):
             prob_child = pd.DataFrame(
                 model.predict_proba(input_features),
@@ -312,7 +312,7 @@ def predict_in_pipeline(self, input_features: pd.DataFrame) -> dict:
             )
             resp_children[name] = prob_child
 
-            prob_child = prob_child.mul(prob_root[name].values, axis="rows")
+            prob_child = prob_child.mul(prob_root[name].values, axis=0)
             prob_children.append(prob_child)
 
         prob_all = pd.concat(prob_children, axis=1, sort=False)

From 2b3f5fefb99182cd46be8536a7ba393863dc5747 Mon Sep 17 00:00:00 2001
From: Ignacio Reyes
Date: Thu, 5 Aug 2021 22:20:32 -0400
Subject: [PATCH 08/12] fix tests

---
 tests/features/test_forced_photometry_extractor.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/tests/features/test_forced_photometry_extractor.py b/tests/features/test_forced_photometry_extractor.py
index 655276b..5d028dc 100644
--- a/tests/features/test_forced_photometry_extractor.py
+++ b/tests/features/test_forced_photometry_extractor.py
@@ -34,7 +34,9 @@ def test_forced_photometry_extractor(self):
             detections=detections)
 
         self.assertEqual(98, features_df.shape[0])
-        self.assertEqual(162, features_df.shape[1])
+        self.assertEqual(
+            len(self.forced_photometry_extractor.get_features_keys()),
+            features_df.shape[1])
         self.assertFalse(np.all(np.isnan(features_df.values), axis=(0, 1)))
 
     def test_forced_photometry_extractor_stream(self):
@@ -61,8 +63,8 @@ def test_forced_photometry_extractor_stream(self):
     def test_get_features_keys(self):
         keys = self.forced_photometry_extractor.get_features_keys()
         keys_stream = self.streamed_forced_photometry_extractor.get_features_keys()
-        self.assertEqual(162, len(keys))
-        self.assertEqual(162, len(keys_stream))
+        self.assertEqual(154, len(keys))
+        self.assertEqual(154, len(keys_stream))
 
 
 if __name__ == '__main__':

From 
13a2040fad04d9f1c048f63e2b42d34ed2f597ca Mon Sep 17 00:00:00 2001 From: Ignacio Reyes Date: Mon, 9 Aug 2021 11:11:29 -0400 Subject: [PATCH 09/12] external resources should be available now --- lc_classifier/classifier/models.py | 2 +- lc_classifier/features/extractors/wise_static_extractor.py | 2 +- tests/classifier/test_hierarchical_random_forest.py | 2 +- tests/classifier/test_metrics.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/lc_classifier/classifier/models.py b/lc_classifier/classifier/models.py index a55bfcb..4f82009 100644 --- a/lc_classifier/classifier/models.py +++ b/lc_classifier/classifier/models.py @@ -109,7 +109,7 @@ def load_model(self, directory: str) -> None: class HierarchicalRandomForest(BaseClassifier): MODEL_NAME = "hierarchical_random_forest" - MODEL_VERSION = "1.1.0" + MODEL_VERSION = "1.1.1" MODEL_VERSION_NAME = f"{MODEL_NAME}_{MODEL_VERSION}" MODEL_PICKLE_PATH = os.path.join(PICKLE_PATH, f"{MODEL_VERSION_NAME}") taxonomy_dictionary = { diff --git a/lc_classifier/features/extractors/wise_static_extractor.py b/lc_classifier/features/extractors/wise_static_extractor.py index e7ea695..1df3074 100644 --- a/lc_classifier/features/extractors/wise_static_extractor.py +++ b/lc_classifier/features/extractors/wise_static_extractor.py @@ -10,7 +10,7 @@ FILE_PATH = os.path.dirname(os.path.abspath(__file__)) WISE_CSV = os.path.abspath(os.path.join(FILE_PATH, "data/wise_bands.csv")) -WISE_CSV_URL = "https://droppy.alerce.online/$/5tZUY" +WISE_CSV_URL = "https://alerce-static.s3.amazonaws.com/datasets/wise_bands.csv" def compute_colors_from_bands(bands: pd.DataFrame) -> pd.DataFrame: diff --git a/tests/classifier/test_hierarchical_random_forest.py b/tests/classifier/test_hierarchical_random_forest.py index 4ca572c..9dd28e4 100644 --- a/tests/classifier/test_hierarchical_random_forest.py +++ b/tests/classifier/test_hierarchical_random_forest.py @@ -15,7 +15,7 @@ def setUp(self) -> None: del self.test_labels["objectId"] self.model_trained = HierarchicalRandomForest() - # self.model_trained.download_model() + self.model_trained.download_model() self.model_trained.load_model(self.model_trained.MODEL_PICKLE_PATH) self.taxonomy = { diff --git a/tests/classifier/test_metrics.py b/tests/classifier/test_metrics.py index 085be1d..4972478 100644 --- a/tests/classifier/test_metrics.py +++ b/tests/classifier/test_metrics.py @@ -26,7 +26,7 @@ def test_all_label_classes_in_prediction(self): class HRFMetricsTest(unittest.TestCase): def setUp(self): self.model_trained = HierarchicalRandomForest() - # self.model_trained.download_model() + self.model_trained.download_model() self.model_trained.load_model(self.model_trained.MODEL_PICKLE_PATH) self.train_features = pd.read_csv('data_examples/2000_features.csv') From 3198432ba0d170e5f4bd5dd2ca3182c117965be4 Mon Sep 17 00:00:00 2001 From: Ignacio Reyes Date: Mon, 9 Aug 2021 11:31:56 -0400 Subject: [PATCH 10/12] pickle5 for python 3.6 and 3.7 compatibility --- lc_classifier/classifier/models.py | 32 +++++++++++++++--------------- requirements.txt | 1 + 2 files changed, 17 insertions(+), 16 deletions(-) diff --git a/lc_classifier/classifier/models.py b/lc_classifier/classifier/models.py index 4f82009..3516f29 100644 --- a/lc_classifier/classifier/models.py +++ b/lc_classifier/classifier/models.py @@ -1,7 +1,7 @@ import numpy as np import pandas as pd import os -import pickle +import pickle5 as pickle import wget from imblearn.ensemble import BalancedRandomForestClassifier as RandomForestClassifier from 
lc_classifier.classifier.preprocessing import FeaturePreprocessor @@ -266,21 +266,21 @@ def save_model(self, directory: str) -> None: pickle.dump(self.feature_list, f, pickle.HIGHEST_PROTOCOL) def load_model(self, directory: str) -> None: - self.top_classifier = pd.read_pickle( - os.path.join(directory, self.pickles["top_rf"]) - ) - self.stochastic_classifier = pd.read_pickle( - os.path.join(directory, self.pickles["stochastic_rf"]) - ) - self.periodic_classifier = pd.read_pickle( - os.path.join(directory, self.pickles["periodic_rf"]) - ) - self.transient_classifier = pd.read_pickle( - os.path.join(directory, self.pickles["transient_rf"]) - ) - self.feature_list = pd.read_pickle( - os.path.join(directory, self.pickles["features_list"]) - ) + with open(os.path.join(directory, self.pickles["top_rf"]), 'rb') as f: + self.top_classifier = pickle.load(f) + + with open(os.path.join(directory, self.pickles["stochastic_rf"]), 'rb') as f: + self.stochastic_classifier = pickle.load(f) + + with open(os.path.join(directory, self.pickles["periodic_rf"]), 'rb') as f: + self.periodic_classifier = pickle.load(f) + + with open(os.path.join(directory, self.pickles["transient_rf"]), 'rb') as f: + self.transient_classifier = pickle.load(f) + + with open(os.path.join(directory, self.pickles["features_list"]), 'rb') as f: + self.feature_list = pickle.load(f) + self.check_loaded_models() def check_loaded_models(self): diff --git a/requirements.txt b/requirements.txt index ee9e3d7..6f13c9f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -16,6 +16,7 @@ tensorflow >=2.3 pandas >=1.1 wget >=3.2 celerite2 >=0.1 +pickle5 -e git+https://git@github.com/alercebroker/turbo-fats#egg=turbofats -e git+https://git@github.com/alercebroker/mhps#egg=mhps -e git+https://git@github.com/alercebroker/P4J#egg=P4J From cccaf18adec302ef4296f675fb6ee1b39801df09 Mon Sep 17 00:00:00 2001 From: Ignacio Reyes Date: Mon, 9 Aug 2021 11:44:57 -0400 Subject: [PATCH 11/12] [2nd attempt] pickle5 for python 3.6 and 3.7 compatibility --- lc_classifier/classifier/models.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lc_classifier/classifier/models.py b/lc_classifier/classifier/models.py index 3516f29..a0a7009 100644 --- a/lc_classifier/classifier/models.py +++ b/lc_classifier/classifier/models.py @@ -102,7 +102,9 @@ def save_model(self, directory: str) -> None: pickle.dump(self.feature_list, f, pickle.HIGHEST_PROTOCOL) def load_model(self, directory: str) -> None: - rf = pd.read_pickle(os.path.join(directory, self.model_filename)) + with open(os.path.join(directory, self.model_filename), 'rb') as f: + rf = pickle.load(f) + self.random_forest_classifier = rf self.feature_list = pd.read_pickle(os.path.join(directory, "feature_list.pkl")) From 014d9dc7e71d128bf0cb74448844a6cded61f24f Mon Sep 17 00:00:00 2001 From: Ignacio Reyes Date: Mon, 9 Aug 2021 11:56:01 -0400 Subject: [PATCH 12/12] [3rd attempt] pickle5 for python 3.6 and 3.7 compatibility --- lc_classifier/classifier/models.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lc_classifier/classifier/models.py b/lc_classifier/classifier/models.py index a0a7009..93f8271 100644 --- a/lc_classifier/classifier/models.py +++ b/lc_classifier/classifier/models.py @@ -106,7 +106,9 @@ def load_model(self, directory: str) -> None: rf = pickle.load(f) self.random_forest_classifier = rf - self.feature_list = pd.read_pickle(os.path.join(directory, "feature_list.pkl")) + + with open(os.path.join(directory, "feature_list.pkl"), 'rb') as f: + self.feature_list 
= pickle.load(f) class HierarchicalRandomForest(BaseClassifier):
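
Note on patches 10-12: model (de)serialization settles on a single pattern. A minimal standalone sketch of that round-trip, with a hypothetical file name (`pickle5` backports protocol-5 pickling to Python 3.6 and 3.7):

    import os
    import pickle5 as pickle  # on Python >= 3.8 the standard pickle module suffices

    def save_model(obj, directory: str, filename: str = "model.pkl") -> None:
        # Serialize with the highest protocol the backport supports (protocol 5).
        with open(os.path.join(directory, filename), "wb") as f:
            pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

    def load_model(directory: str, filename: str = "model.pkl"):
        # pickle5.load replaces pd.read_pickle so that protocol-5 pickles written
        # on newer Python versions can still be read on 3.6/3.7.
        with open(os.path.join(directory, filename), "rb") as f:
            return pickle.load(f)
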