Skip to content

Commit

Permalink
added back in todense and added dedupe option
Browse files Browse the repository at this point in the history
  • Loading branch information
ladyson committed Mar 2, 2017
1 parent e8d6582 commit 696015d
Show file tree
Hide file tree
Showing 3 changed files with 18 additions and 14 deletions.
20 changes: 10 additions & 10 deletions pipeline/model_loop.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,16 +71,16 @@ def define_clfs_params(self):
'''
# These are the classifiers
self.clfs = {
'RF': RandomForestClassifier(n_estimators = 50, n_jobs = -1),
'ET': ExtraTreesClassifier(n_estimators = 10, n_jobs = -1, criterion = 'entropy'),
'AB': AdaBoostClassifier(DecisionTreeClassifier(max_depth = [1, 5, 10, 15]), algorithm = "SAMME", n_estimators = 200),
#'RF': RandomForestClassifier(n_estimators = 50, n_jobs = -1),
#'ET': ExtraTreesClassifier(n_estimators = 10, n_jobs = -1, criterion = 'entropy'),
#'AB': AdaBoostClassifier(DecisionTreeClassifier(max_depth = [1, 5, 10, 15]), algorithm = "SAMME", n_estimators = 200),
'LR': LogisticRegression(penalty = 'l1', C = 1e5),
'SVM': svm.SVC(kernel = 'linear', probability = True, random_state = 0),
'GB': GradientBoostingClassifier(learning_rate = 0.05, subsample = 0.5, max_depth = 6, n_estimators = 10),
#'SVM': svm.SVC(kernel = 'linear', probability = True, random_state = 0),
#'GB': GradientBoostingClassifier(learning_rate = 0.05, subsample = 0.5, max_depth = 6, n_estimators = 10),
'NB': GaussianNB(),
'DT': DecisionTreeClassifier(),
'SGD': SGDClassifier(loss = 'log', penalty = 'l2'),
'KNN': KNeighborsClassifier(n_neighbors = 3)
#'DT': DecisionTreeClassifier(),
#'SGD': SGDClassifier(loss = 'log', penalty = 'l2'),
#'KNN': KNeighborsClassifier(n_neighbors = 3)
}
# These are the parameters which will be run through
self.params = {
Expand Down Expand Up @@ -214,8 +214,8 @@ def run(self):
# Generate features
parser = spacy.load('en')
f = get_feature_transformer(parser)
self.X_train = f.fit_transform(self.raw_X_train)
self.X_test = f.transform(self.raw_X_test)
self.X_train = f.fit_transform(self.raw_X_train).todense()
self.X_test = f.transform(self.raw_X_test).todense()

# Run the loop
self.clf_loop(self.X_train, self.X_test, self.y_train, self.y_test)
4 changes: 4 additions & 0 deletions pipeline/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,16 @@
help='Thresholds', default = [0.1, 0.2, 0.5])
parser.add_argument('--output_dir', type=str,
help='Output directory', default = 'output/')
parser.add_argument('--dedupe', help="dedupe content column",
action="store_true")


args = parser.parse_args()
print(args)

df = pd.read_csv(args.filename)
if args.dedupe:
df = df.drop_duplicates(subset='content')
# print(df.head())
X = df[args.x_label]
# print(X.head())
Expand Down
8 changes: 4 additions & 4 deletions pipeline/transform_features.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,10 +48,10 @@ def countgrammar(self, texts):
for text, i in lookup.items():
try:
rv[i] = grammar_counts[text]
except:
except Exception as e:
                # Occasionally the way spaCy processes unusual characters (bullet points, em dashes) will cause the lookup based on the original characters to fail.
# In that case, just set to None.
print("error")
print("Error in GrammarTransformer, setting to None")
# print(text)
rv[i] = {}
continue
Expand Down Expand Up @@ -93,7 +93,7 @@ def tokenizeText(self, texts):

lemmas.append(tok.text.lower().strip() if tok.ent_type_ == "" else "<{}>".format(tok.ent_type_))
except:
print("error: {}").format(tok)
print("Error when tokenizing, setting to Unknown")
lemmas.append("<UNK>")
continue
tokens = lemmas
Expand All @@ -119,7 +119,7 @@ def tokenizeText(self, texts):
except Exception as e:
                # Occasionally the way spaCy processes unusual characters (bullet points, em dashes) will cause the lookup based on the original characters to fail.
# In that case, just set to None.
print("Tokenize Text error: ", e)
print("Tokenize Text error, setting to None")
rv[i] = "None"
continue
# rv = [x for x in rv if type(x) == str]
Expand Down

0 comments on commit 696015d

Please sign in to comment.