From 696015d8f944083e977f66392714f6f327001e4c Mon Sep 17 00:00:00 2001
From: ladyson
Date: Thu, 2 Mar 2017 16:42:33 -0600
Subject: [PATCH] added back in todense and added dedupe option

---
 pipeline/model_loop.py         | 20 ++++++++++----------
 pipeline/run.py                |  4 ++++
 pipeline/transform_features.py |  8 ++++----
 3 files changed, 18 insertions(+), 14 deletions(-)

diff --git a/pipeline/model_loop.py b/pipeline/model_loop.py
index d27ec00..04c3f72 100644
--- a/pipeline/model_loop.py
+++ b/pipeline/model_loop.py
@@ -71,16 +71,16 @@ def define_clfs_params(self):
         '''
         # These are the classifiers
         self.clfs = {
-            'RF': RandomForestClassifier(n_estimators = 50, n_jobs = -1),
-            'ET': ExtraTreesClassifier(n_estimators = 10, n_jobs = -1, criterion = 'entropy'),
-            'AB': AdaBoostClassifier(DecisionTreeClassifier(max_depth = [1, 5, 10, 15]), algorithm = "SAMME", n_estimators = 200),
+            #'RF': RandomForestClassifier(n_estimators = 50, n_jobs = -1),
+            #'ET': ExtraTreesClassifier(n_estimators = 10, n_jobs = -1, criterion = 'entropy'),
+            #'AB': AdaBoostClassifier(DecisionTreeClassifier(max_depth = [1, 5, 10, 15]), algorithm = "SAMME", n_estimators = 200),
             'LR': LogisticRegression(penalty = 'l1', C = 1e5),
-            'SVM': svm.SVC(kernel = 'linear', probability = True, random_state = 0),
-            'GB': GradientBoostingClassifier(learning_rate = 0.05, subsample = 0.5, max_depth = 6, n_estimators = 10),
+            #'SVM': svm.SVC(kernel = 'linear', probability = True, random_state = 0),
+            #'GB': GradientBoostingClassifier(learning_rate = 0.05, subsample = 0.5, max_depth = 6, n_estimators = 10),
             'NB': GaussianNB(),
-            'DT': DecisionTreeClassifier(),
-            'SGD': SGDClassifier(loss = 'log', penalty = 'l2'),
-            'KNN': KNeighborsClassifier(n_neighbors = 3)
+            #'DT': DecisionTreeClassifier(),
+            #'SGD': SGDClassifier(loss = 'log', penalty = 'l2'),
+            #'KNN': KNeighborsClassifier(n_neighbors = 3)
             }
         # These are the parameters which will be run through
         self.params = {
@@ -214,8 +214,8 @@ def run(self):
         # Generate features
         parser = spacy.load('en')
         f = get_feature_transformer(parser)
-        self.X_train = f.fit_transform(self.raw_X_train)
-        self.X_test = f.transform(self.raw_X_test)
+        self.X_train = f.fit_transform(self.raw_X_train).todense()
+        self.X_test = f.transform(self.raw_X_test).todense()
         # Run the loop
         self.clf_loop(self.X_train, self.X_test, self.y_train, self.y_test)
diff --git a/pipeline/run.py b/pipeline/run.py
index 74f3150..f473fe7 100644
--- a/pipeline/run.py
+++ b/pipeline/run.py
@@ -23,12 +23,16 @@
                         help='Thresholds', default = [0.1, 0.2, 0.5])
     parser.add_argument('--output_dir', type=str,
                         help='Output directory', default = 'output/')
+    parser.add_argument('--dedupe', help="dedupe content column",
+                        action="store_true")
     args = parser.parse_args()
     print(args)

     df = pd.read_csv(args.filename)
+    if args.dedupe:
+        df = df.drop_duplicates(subset='content')
     # print(df.head())
     X = df[args.x_label]
     # print(X.head())
diff --git a/pipeline/transform_features.py b/pipeline/transform_features.py
index 166c4a7..6f11410 100644
--- a/pipeline/transform_features.py
+++ b/pipeline/transform_features.py
@@ -48,10 +48,10 @@ def countgrammar(self, texts):
         for text, i in lookup.items():
             try:
                 rv[i] = grammar_counts[text]
-            except:
+            except Exception as e:
                 # Occasionally the way spaCy processes unusual characters (bullet points, em dashes) will cause the lookup based on the original characters to fail.
                 # In that case, just set to None.
-                print("error")
+                print("Error in GrammarTransformer, setting to None")
                 # print(text)
                 rv[i] = {}
                 continue
@@ -93,7 +93,7 @@ def tokenizeText(self, texts):
                     lemmas.append(tok.text.lower().strip() if tok.ent_type_ == "" else "<{}>".format(tok.ent_type_))
                 except:
-                    print("error: {}").format(tok)
+                    print("Error when tokenizing, setting to Unknown")
                     lemmas.append("")
                     continue
             tokens = lemmas
@@ -119,7 +119,7 @@ def tokenizeText(self, texts):
             except Exception as e:
                 # Occasionally the way spaCy processes unusual characters (bullet points, em dashes) will cause the lookup based on the original characters to fail.
                 # In that case, just set to None.
-                print("Tokenize Text error: ", e)
+                print("Tokenize Text error, setting to None")
                 rv[i] = "None"
                 continue
             # rv = [x for x in rv if type(x) == str]
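
Note on the .todense() change in model_loop.py: the feature transformer presumably
returns a scipy sparse matrix, and GaussianNB (one of the two classifiers left
enabled above) rejects sparse input, so the matrices have to be densified before
clf_loop() runs. A minimal sketch of that failure mode, using a toy matrix rather
than this repo's get_feature_transformer output:

    # Sketch only: toy data stands in for the pipeline's real feature matrix.
    import numpy as np
    from scipy.sparse import csr_matrix
    from sklearn.naive_bayes import GaussianNB

    X_sparse = csr_matrix([[1.0, 0.0], [0.0, 2.0], [3.0, 0.0]])
    y = np.array([0, 1, 0])

    clf = GaussianNB()
    # clf.fit(X_sparse, y)  # raises TypeError: dense data is required
    X_dense = np.asarray(X_sparse.todense())  # what the .todense() calls achieve
    clf.fit(X_dense, y)
    print(clf.predict(X_dense))

One caveat worth noting: .todense() returns a numpy.matrix, while .toarray()
(or np.asarray(X.todense()) as above) yields a plain ndarray, which newer
scikit-learn releases handle more gracefully.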
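
Note on the --dedupe option in run.py: it is a plain argparse store_true flag
that, when passed, drops rows whose 'content' column repeats before features are
built. A minimal sketch of the same behaviour on a toy frame (the DataFrame here
is illustrative, not this repo's input format):

    import argparse
    import pandas as pd

    parser = argparse.ArgumentParser()
    parser.add_argument('--dedupe', help="dedupe content column",
                        action="store_true")
    args = parser.parse_args(['--dedupe'])  # as if run with --dedupe

    df = pd.DataFrame({'content': ['spam', 'spam', 'ham'],
                       'label':   [1, 0, 0]})
    if args.dedupe:
        df = df.drop_duplicates(subset='content')
    print(df)  # two rows remain

Since drop_duplicates() keeps the first occurrence by default, which label
survives for a duplicated content value depends on row order in the input CSV.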