Skip to content

Commit

Permalink
added back in todense and added dedupe option
Browse files Browse the repository at this point in the history
  • Loading branch information
ladyson committed Mar 2, 2017
1 parent e8d6582 commit 696015d
Show file tree
Hide file tree
Showing 3 changed files with 18 additions and 14 deletions.
20 changes: 10 additions & 10 deletions pipeline/model_loop.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,16 +71,16 @@ def define_clfs_params(self):
'''
# These are the classifiers
self.clfs = {
'RF': RandomForestClassifier(n_estimators = 50, n_jobs = -1),
'ET': ExtraTreesClassifier(n_estimators = 10, n_jobs = -1, criterion = 'entropy'),
'AB': AdaBoostClassifier(DecisionTreeClassifier(max_depth = [1, 5, 10, 15]), algorithm = "SAMME", n_estimators = 200),
#'RF': RandomForestClassifier(n_estimators = 50, n_jobs = -1),
#'ET': ExtraTreesClassifier(n_estimators = 10, n_jobs = -1, criterion = 'entropy'),
#'AB': AdaBoostClassifier(DecisionTreeClassifier(max_depth = [1, 5, 10, 15]), algorithm = "SAMME", n_estimators = 200),
'LR': LogisticRegression(penalty = 'l1', C = 1e5),
'SVM': svm.SVC(kernel = 'linear', probability = True, random_state = 0),
'GB': GradientBoostingClassifier(learning_rate = 0.05, subsample = 0.5, max_depth = 6, n_estimators = 10),
#'SVM': svm.SVC(kernel = 'linear', probability = True, random_state = 0),
#'GB': GradientBoostingClassifier(learning_rate = 0.05, subsample = 0.5, max_depth = 6, n_estimators = 10),
'NB': GaussianNB(),
'DT': DecisionTreeClassifier(),
'SGD': SGDClassifier(loss = 'log', penalty = 'l2'),
'KNN': KNeighborsClassifier(n_neighbors = 3)
#'DT': DecisionTreeClassifier(),
#'SGD': SGDClassifier(loss = 'log', penalty = 'l2'),
#'KNN': KNeighborsClassifier(n_neighbors = 3)
}
# These are the parameters which will be run through
self.params = {
Expand Down Expand Up @@ -214,8 +214,8 @@ def run(self):
# Generate features
parser = spacy.load('en')
f = get_feature_transformer(parser)
self.X_train = f.fit_transform(self.raw_X_train)
self.X_test = f.transform(self.raw_X_test)
self.X_train = f.fit_transform(self.raw_X_train).todense()
self.X_test = f.transform(self.raw_X_test).todense()

# Run the loop
self.clf_loop(self.X_train, self.X_test, self.y_train, self.y_test)
4 changes: 4 additions & 0 deletions pipeline/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,16 @@
help='Thresholds', default = [0.1, 0.2, 0.5])
parser.add_argument('--output_dir', type=str,
help='Output directory', default = 'output/')
parser.add_argument('--dedupe', help="dedupe content column",
action="store_true")


args = parser.parse_args()
print(args)

df = pd.read_csv(args.filename)
if args.dedupe:
df = df.drop_duplicates(subset='content')
# print(df.head())
X = df[args.x_label]
# print(X.head())
Expand Down
8 changes: 4 additions & 4 deletions pipeline/transform_features.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,10 +48,10 @@ def countgrammar(self, texts):
for text, i in lookup.items():
try:
rv[i] = grammar_counts[text]
except:
except Exception as e:
                # Occasionally the way spaCy processes unusual characters (bullet points, em dashes) will cause the lookup based on the original characters to fail.
# In that case, just set to None.
print("error")
print("Error in GrammarTransformer, setting to None")
# print(text)
rv[i] = {}
continue
Expand Down Expand Up @@ -93,7 +93,7 @@ def tokenizeText(self, texts):

lemmas.append(tok.text.lower().strip() if tok.ent_type_ == "" else "<{}>".format(tok.ent_type_))
except:
print("error: {}").format(tok)
print("Error when tokenizing, setting to Unknown")
lemmas.append("<UNK>")
continue
tokens = lemmas
Expand All @@ -119,7 +119,7 @@ def tokenizeText(self, texts):
except Exception as e:
                # Occasionally the way spaCy processes unusual characters (bullet points, em dashes) will cause the lookup based on the original characters to fail.
# In that case, just set to None.
print("Tokenize Text error: ", e)
print("Tokenize Text error, setting to None")
rv[i] = "None"
continue
# rv = [x for x in rv if type(x) == str]
Expand Down

0 comments on commit 696015d

Please sign in to comment.