Skip to content

Commit

Permalink
Reorganization and retrained classifiers with newer version of sklearn
Browse files Browse the repository at this point in the history
  • Loading branch information
dgarijo committed Jan 12, 2024
1 parent a2bd9f4 commit b01f09c
Show file tree
Hide file tree
Showing 77 changed files with 2,059 additions and 355 deletions.
12 changes: 6 additions & 6 deletions experiments/create_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,11 +40,11 @@
'tfdtc': make_pipeline(CountVectorizer(), DecisionTreeClassifier()),
'tflr': make_pipeline(TfidfVectorizer(), LogisticRegression(solver='liblinear')),
'tfnb': make_pipeline(TfidfVectorizer(), MultinomialNB()),
'tfper': make_pipeline(TfidfVectorizer(), Perceptron(tol=1e-3, random_state=0)),
'tfrfc': make_pipeline(TfidfVectorizer(), RandomForestClassifier()), #(max_depth=3, random_state=0))
'tfsgd': make_pipeline(TfidfVectorizer(), SGDClassifier(loss='log')),
'tfsgd': make_pipeline(TfidfVectorizer(), SGDClassifier(loss='log_loss')),
'tfxgb': make_pipeline(TfidfVectorizer(), XGBClassifier(use_label_encoder=False,eval_metric="logloss"))
}
#'tfper': make_pipeline(TfidfVectorizer(), Perceptron(tol=1e-3, random_state=0)),
evaluation_names = ('cvlr', 'tflr', 'tfnb', 'cvnb', 'cvbb', 'tfsgd', 'tfxgb', 'tfper', 'tfrfc', 'tfdtc', 'tfada')
evaluation_text = {
'cvbb': '"sklearnpipeline(CountVectorizer, BernoulliBayes)", -,Allen,',
Expand All @@ -54,24 +54,24 @@
'tfdtc': '"sklearnpipeline(TFIDFVectorizer, DecisionTreeClassifier)", -,Allen,',
'tflr': '"sklearnpipeline(TFIDFVectorizer, LogisticRegression)", - ,Allen,',
'tfnb': '"sklearnpipeline(TFIDFVectorizer, NaiveBayes)", - ,Allen,',
'tfper': '"sklearnpipeline(TFIDFVectorizer, Perceptron)", -,Allen,',
'tfrfc': '"sklearnpipeline(TFIDFVectorizer, RandomForestClassifier)", -,Allen,',
'tfsgd': '"sklearnpipeline(TFIDFVectorizer, StochasticGradientDescent)",loss = \'log\',Allen,',
'tfxgb': '"sklearnpipeline(TFIDFVectorizer, XGBClassifier)", -,Allen,'
}
#'tfper': '"sklearnpipeline(TFIDFVectorizer, Perceptron)", -,Allen,',

def evaluate_category(corpora,category):
dec = 3
cv = StratifiedKFold(n_splits = 5, shuffle=True)
file_content = "sklearn Primitive - Citation,Hyperparameters,Input Data Used,Accuracy,Precision,Recall,F-measure,Pickle ID"
file_content = f"sklearn Primitive - {category},Hyperparameters,Input Data Used,Accuracy,Precision,Recall,F-measure,Pickle ID"
limit = 0.0
file_to_copy = ""
for name in evaluation_text:
X = corpora[category].excerpt
Y = corpora[category][category]
#print(X)
for e in X:
print(e)
#for e in X:
# print(e)
#Y = Y.astype(int)
x_train, x_test, y_train, y_test = train_test_split(X, Y, stratify=Y, test_size=0.2)
pipeline = pipelines[name]
Expand Down

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
category,total correct,total incorrect,total missed,precision,recall
citation,33,7,3,0.825,0.917
run,20,3,1,0.870,0.952
install,72,16,10,0.818,0.878
download,2,3,0,0.400,1.000
requirements,29,2,2,0.935,0.935
contact,1,0,2,1.000,0.333
description,13,5,7,0.722,0.650
contributor,3,0,0,1.000,1.000
documentation,18,2,0,0.900,1.000
license,30,0,0,1.000,1.000
usage,55,32,20,0.632,0.733
faq,3,6,2,0.333,0.600
support,6,8,3,0.429,0.667
ack,7,0,3,1.000,0.700

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
category,total correct,total incorrect,total missed,precision,recall
citation,33,7,3,0.825,0.917
run,20,3,1,0.870,0.952
install,72,16,10,0.818,0.878
download,2,3,0,0.400,1.000
requirements,29,2,2,0.935,0.935
contact,1,0,2,1.000,0.333
description,13,5,7,0.722,0.650
contributor,3,0,0,1.000,1.000
documentation,18,2,0,0.900,1.000
license,29,1,0,0.967,1.000
usage,55,32,20,0.632,0.733
faq,3,6,2,0.333,0.600
support,6,8,3,0.429,0.667
ack,7,0,3,1.000,0.700
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
24 changes: 12 additions & 12 deletions experiments/ranking/citation_classifier.csv
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
sklearn Primitive - Citation,Hyperparameters,Input Data Used,Accuracy,Precision,Recall,F-measure,Pickle ID
"sklearnpipeline(CountVectorizer, BernoulliBayes)", -,Allen,0.732,0.687,0.984,0.808,citcvbb.p
"sklearnpipeline(CountVectorizer, LogisticRegression)", - ,Allen,0.877,0.838,0.975,0.901,citcvlr.p
"sklearnpipeline(CountVectorizer, NaiveBayes)", -,Allen,0.917,0.913,0.946,0.929,citcvnb.p
"sklearnpipeline(TFIDFVectorizer, AdaBoostClassifier)", -,Allen,0.828,0.913,0.785,0.837,cittfada.p
"sklearnpipeline(TFIDFVectorizer, DecisionTreeClassifier)", -,Allen,0.792,0.77,0.908,0.833,cittfdtc.p
"sklearnpipeline(TFIDFVectorizer, LogisticRegression)", - ,Allen,0.875,0.837,0.971,0.899,cittflr.p
"sklearnpipeline(TFIDFVectorizer, NaiveBayes)", - ,Allen,0.904,0.869,0.981,0.921,cittfnb.p
"sklearnpipeline(TFIDFVectorizer, Perceptron)", -,Allen,0.884,0.868,0.94,0.902,cittfper.p
"sklearnpipeline(TFIDFVectorizer, RandomForestClassifier)", -,Allen,0.814,0.782,0.937,0.852,cittfrfc.p
"sklearnpipeline(TFIDFVectorizer, StochasticGradientDescent)",loss = 'log',Allen,0.895,0.863,0.972,0.914,cittfsgd.p
"sklearnpipeline(TFIDFVectorizer, XGBClassifier)", -,Allen,0.783,0.783,0.867,0.817,cittfxgb.p
sklearn Primitive - citation,Hyperparameters,Input Data Used,Accuracy,Precision,Recall,F-measure,Pickle ID
"sklearnpipeline(CountVectorizer, BernoulliBayes)", -,Allen,0.754,0.675,0.993,0.803,citcvbb.p
"sklearnpipeline(CountVectorizer, LogisticRegression)", - ,Allen,0.922,0.912,0.938,0.924,citcvlr.p
"sklearnpipeline(CountVectorizer, NaiveBayes)", -,Allen,0.927,0.909,0.952,0.929,citcvnb.p
"sklearnpipeline(TFIDFVectorizer, AdaBoostClassifier)", -,Allen,0.886,0.948,0.821,0.877,cittfada.p
"sklearnpipeline(TFIDFVectorizer, DecisionTreeClassifier)", -,Allen,0.879,0.95,0.8,0.868,cittfdtc.p
"sklearnpipeline(TFIDFVectorizer, LogisticRegression)", - ,Allen,0.938,0.958,0.917,0.936,cittflr.p
"sklearnpipeline(TFIDFVectorizer, NaiveBayes)", - ,Allen,0.92,0.91,0.934,0.922,cittfnb.p
"sklearnpipeline(TFIDFVectorizer, Perceptron)", -,Allen,0.907,0.899,0.921,0.909,cittfper.p
"sklearnpipeline(TFIDFVectorizer, RandomForestClassifier)", -,Allen,0.879,0.864,0.907,0.882,cittfrfc.p
"sklearnpipeline(TFIDFVectorizer, StochasticGradientDescent)",loss = 'log',Allen,0.939,0.957,0.921,0.938,cittfsgd.p
"sklearnpipeline(TFIDFVectorizer, XGBClassifier)", -,Allen,0.839,0.906,0.759,0.825,cittfxgb.p
24 changes: 12 additions & 12 deletions experiments/ranking/description_classifier.csv
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
sklearn Primitive - Citation,Hyperparameters,Input Data Used,Accuracy,Precision,Recall,F-measure,Pickle ID
"sklearnpipeline(CountVectorizer, BernoulliBayes)", -,Allen,0.725,0.934,0.559,0.699,descvbb.p
"sklearnpipeline(CountVectorizer, LogisticRegression)", - ,Allen,0.818,0.876,0.795,0.833,descvlr.p
"sklearnpipeline(CountVectorizer, NaiveBayes)", -,Allen,0.816,0.776,0.955,0.856,descvnb.p
"sklearnpipeline(TFIDFVectorizer, AdaBoostClassifier)", -,Allen,0.798,0.838,0.8,0.818,destfada.p
"sklearnpipeline(TFIDFVectorizer, DecisionTreeClassifier)", -,Allen,0.767,0.823,0.759,0.787,destfdtc.p
"sklearnpipeline(TFIDFVectorizer, LogisticRegression)", - ,Allen,0.827,0.802,0.926,0.859,destflr.p
"sklearnpipeline(TFIDFVectorizer, NaiveBayes)", - ,Allen,0.779,0.728,0.982,0.836,destfnb.p
"sklearnpipeline(TFIDFVectorizer, Perceptron)", -,Allen,0.826,0.853,0.842,0.847,destfper.p
"sklearnpipeline(TFIDFVectorizer, RandomForestClassifier)", -,Allen,0.784,0.83,0.783,0.805,destfrfc.p
"sklearnpipeline(TFIDFVectorizer, StochasticGradientDescent)",loss = 'log',Allen,0.847,0.875,0.857,0.865,destfsgd.p
"sklearnpipeline(TFIDFVectorizer, XGBClassifier)", -,Allen,0.76,0.806,0.768,0.786,destfxgb.p
sklearn Primitive - description,Hyperparameters,Input Data Used,Accuracy,Precision,Recall,F-measure,Pickle ID
"sklearnpipeline(CountVectorizer, BernoulliBayes)", -,Allen,0.733,0.914,0.516,0.657,descvbb.p
"sklearnpipeline(CountVectorizer, LogisticRegression)", - ,Allen,0.813,0.854,0.758,0.801,descvlr.p
"sklearnpipeline(CountVectorizer, NaiveBayes)", -,Allen,0.784,0.716,0.954,0.816,descvnb.p
"sklearnpipeline(TFIDFVectorizer, AdaBoostClassifier)", -,Allen,0.731,0.73,0.737,0.733,destfada.p
"sklearnpipeline(TFIDFVectorizer, DecisionTreeClassifier)", -,Allen,0.686,0.701,0.648,0.67,destfdtc.p
"sklearnpipeline(TFIDFVectorizer, LogisticRegression)", - ,Allen,0.802,0.788,0.836,0.809,destflr.p
"sklearnpipeline(TFIDFVectorizer, NaiveBayes)", - ,Allen,0.775,0.695,0.982,0.814,destfnb.p
"sklearnpipeline(TFIDFVectorizer, Perceptron)", -,Allen,0.804,0.777,0.854,0.814,destfper.p
"sklearnpipeline(TFIDFVectorizer, RandomForestClassifier)", -,Allen,0.745,0.744,0.751,0.747,destfrfc.p
"sklearnpipeline(TFIDFVectorizer, StochasticGradientDescent)",loss = 'log',Allen,0.815,0.799,0.843,0.82,destfsgd.p
"sklearnpipeline(TFIDFVectorizer, XGBClassifier)", -,Allen,0.72,0.729,0.726,0.724,destfxgb.p
24 changes: 12 additions & 12 deletions experiments/ranking/installation_classifier.csv
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
sklearn Primitive - Citation,Hyperparameters,Input Data Used,Accuracy,Precision,Recall,F-measure,Pickle ID
"sklearnpipeline(CountVectorizer, BernoulliBayes)", -,Allen,0.75,0.7,0.986,0.819,inscvbb.p
"sklearnpipeline(CountVectorizer, LogisticRegression)", - ,Allen,0.871,0.87,0.912,0.89,inscvlr.p
"sklearnpipeline(CountVectorizer, NaiveBayes)", -,Allen,0.879,0.844,0.968,0.902,inscvnb.p
"sklearnpipeline(TFIDFVectorizer, AdaBoostClassifier)", -,Allen,0.79,0.899,0.714,0.794,instfada.p
"sklearnpipeline(TFIDFVectorizer, DecisionTreeClassifier)", -,Allen,0.816,0.877,0.789,0.83,instfdtc.p
"sklearnpipeline(TFIDFVectorizer, LogisticRegression)", - ,Allen,0.894,0.908,0.906,0.907,instflr.p
"sklearnpipeline(TFIDFVectorizer, NaiveBayes)", - ,Allen,0.842,0.791,0.986,0.877,instfnb.p
"sklearnpipeline(TFIDFVectorizer, Perceptron)", -,Allen,0.865,0.895,0.864,0.879,instfper.p
"sklearnpipeline(TFIDFVectorizer, RandomForestClassifier)", -,Allen,0.851,0.9,0.833,0.865,instfrfc.p
"sklearnpipeline(TFIDFVectorizer, StochasticGradientDescent)",loss = 'log',Allen,0.896,0.918,0.899,0.908,instfsgd.p
"sklearnpipeline(TFIDFVectorizer, XGBClassifier)", -,Allen,0.812,0.881,0.776,0.825,instfxgb.p
sklearn Primitive - installation,Hyperparameters,Input Data Used,Accuracy,Precision,Recall,F-measure,Pickle ID
"sklearnpipeline(CountVectorizer, BernoulliBayes)", -,Allen,0.76,0.678,0.991,0.805,inscvbb.p
"sklearnpipeline(CountVectorizer, LogisticRegression)", - ,Allen,0.894,0.874,0.923,0.897,inscvlr.p
"sklearnpipeline(CountVectorizer, NaiveBayes)", -,Allen,0.898,0.853,0.961,0.904,inscvnb.p
"sklearnpipeline(TFIDFVectorizer, AdaBoostClassifier)", -,Allen,0.797,0.882,0.685,0.771,instfada.p
"sklearnpipeline(TFIDFVectorizer, DecisionTreeClassifier)", -,Allen,0.825,0.841,0.807,0.822,instfdtc.p
"sklearnpipeline(TFIDFVectorizer, LogisticRegression)", - ,Allen,0.894,0.938,0.845,0.889,instflr.p
"sklearnpipeline(TFIDFVectorizer, NaiveBayes)", - ,Allen,0.897,0.851,0.965,0.904,instfnb.p
"sklearnpipeline(TFIDFVectorizer, Perceptron)", -,Allen,0.887,0.887,0.887,0.887,instfper.p
"sklearnpipeline(TFIDFVectorizer, RandomForestClassifier)", -,Allen,0.87,0.854,0.894,0.873,instfrfc.p
"sklearnpipeline(TFIDFVectorizer, StochasticGradientDescent)",loss = 'log',Allen,0.903,0.922,0.882,0.901,instfsgd.p
"sklearnpipeline(TFIDFVectorizer, XGBClassifier)", -,Allen,0.833,0.874,0.781,0.824,instfxgb.p
24 changes: 12 additions & 12 deletions experiments/ranking/invocation_classifier.csv
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
sklearn Primitive - Citation,Hyperparameters,Input Data Used,Accuracy,Precision,Recall,F-measure,Pickle ID
"sklearnpipeline(CountVectorizer, BernoulliBayes)", -,Allen,0.754,0.718,0.939,0.814,invcvbb.p
"sklearnpipeline(CountVectorizer, LogisticRegression)", - ,Allen,0.84,0.817,0.929,0.869,invcvlr.p
"sklearnpipeline(CountVectorizer, NaiveBayes)", -,Allen,0.865,0.875,0.891,0.883,invcvnb.p
"sklearnpipeline(TFIDFVectorizer, AdaBoostClassifier)", -,Allen,0.773,0.753,0.899,0.819,invtfada.p
"sklearnpipeline(TFIDFVectorizer, DecisionTreeClassifier)", -,Allen,0.763,0.847,0.716,0.776,invtfdtc.p
"sklearnpipeline(TFIDFVectorizer, LogisticRegression)", - ,Allen,0.841,0.822,0.923,0.869,invtflr.p
"sklearnpipeline(TFIDFVectorizer, NaiveBayes)", - ,Allen,0.869,0.848,0.94,0.892,invtfnb.p
"sklearnpipeline(TFIDFVectorizer, Perceptron)", -,Allen,0.841,0.862,0.861,0.861,invtfper.p
"sklearnpipeline(TFIDFVectorizer, RandomForestClassifier)", -,Allen,0.826,0.868,0.82,0.843,invtfrfc.p
"sklearnpipeline(TFIDFVectorizer, StochasticGradientDescent)",loss = 'log',Allen,0.867,0.859,0.919,0.888,invtfsgd.p
"sklearnpipeline(TFIDFVectorizer, XGBClassifier)", -,Allen,0.798,0.778,0.907,0.837,invtfxgb.p
sklearn Primitive - invocation,Hyperparameters,Input Data Used,Accuracy,Precision,Recall,F-measure,Pickle ID
"sklearnpipeline(CountVectorizer, BernoulliBayes)", -,Allen,0.755,0.681,0.962,0.797,invcvbb.p
"sklearnpipeline(CountVectorizer, LogisticRegression)", - ,Allen,0.858,0.828,0.907,0.865,invcvlr.p
"sklearnpipeline(CountVectorizer, NaiveBayes)", -,Allen,0.885,0.874,0.899,0.886,invcvnb.p
"sklearnpipeline(TFIDFVectorizer, AdaBoostClassifier)", -,Allen,0.771,0.74,0.836,0.785,invtfada.p
"sklearnpipeline(TFIDFVectorizer, DecisionTreeClassifier)", -,Allen,0.766,0.757,0.784,0.77,invtfdtc.p
"sklearnpipeline(TFIDFVectorizer, LogisticRegression)", - ,Allen,0.869,0.87,0.868,0.869,invtflr.p
"sklearnpipeline(TFIDFVectorizer, NaiveBayes)", - ,Allen,0.89,0.878,0.907,0.892,invtfnb.p
"sklearnpipeline(TFIDFVectorizer, Perceptron)", -,Allen,0.848,0.845,0.853,0.849,invtfper.p
"sklearnpipeline(TFIDFVectorizer, RandomForestClassifier)", -,Allen,0.823,0.807,0.85,0.827,invtfrfc.p
"sklearnpipeline(TFIDFVectorizer, StochasticGradientDescent)",loss = 'log',Allen,0.879,0.878,0.881,0.879,invtfsgd.p
"sklearnpipeline(TFIDFVectorizer, XGBClassifier)", -,Allen,0.803,0.771,0.863,0.815,invtfxgb.p
5 changes: 4 additions & 1 deletion experiments/setup_corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,10 +31,13 @@ def build_corpus(selected_category):
map(lambda sent: ' '.join(sent), random.sample(list(treebank.sents()), negative_sample_size)),
columns=["excerpt"]).assign(description=False)
# print("Treebank has {} samples.".format(len(treebank_background)))
# print("categories_df")
# Rename the column to match the corpus when merging
treebank_background = treebank_background.rename(columns={'description': selected_category})
# print(categories_df)
corpus = pd.concat(categories_df.values(), ignore_index=True, sort=False)
#corpus.append(treebank_background, ignore_index=True, sort=False)
corpus = pd.concat([corpus, treebank_background], ignore_index=True, sort=False)
corpus.fillna(value='', inplace=True)
# print(corpus)
return corpus

Binary file modified experiments/trained_models/citcvbb.p
Binary file not shown.
Binary file modified experiments/trained_models/citcvlr.p
Binary file not shown.
Binary file modified experiments/trained_models/citcvnb.p
Binary file not shown.
Binary file modified experiments/trained_models/cittfada.p
Binary file not shown.
Binary file modified experiments/trained_models/cittfdtc.p
Binary file not shown.
Binary file modified experiments/trained_models/cittflr.p
Binary file not shown.
Binary file modified experiments/trained_models/cittfnb.p
Binary file not shown.
Binary file modified experiments/trained_models/cittfper.p
Binary file not shown.
Binary file modified experiments/trained_models/cittfrfc.p
Binary file not shown.
Binary file modified experiments/trained_models/cittfsgd.p
Binary file not shown.
Binary file modified experiments/trained_models/cittfxgb.p
Binary file not shown.
Binary file modified experiments/trained_models/descvbb.p
Binary file not shown.
Binary file modified experiments/trained_models/descvlr.p
Binary file not shown.
Binary file modified experiments/trained_models/descvnb.p
Binary file not shown.
Binary file modified experiments/trained_models/destfada.p
Binary file not shown.
Binary file modified experiments/trained_models/destfdtc.p
Binary file not shown.
Binary file modified experiments/trained_models/destflr.p
Binary file not shown.
Binary file modified experiments/trained_models/destfnb.p
Binary file not shown.
Binary file modified experiments/trained_models/destfper.p
Binary file not shown.
Binary file modified experiments/trained_models/destfrfc.p
Binary file not shown.
Binary file modified experiments/trained_models/destfsgd.p
Binary file not shown.
Binary file modified experiments/trained_models/destfxgb.p
Binary file not shown.
Binary file modified experiments/trained_models/inscvbb.p
Binary file not shown.
Binary file modified experiments/trained_models/inscvlr.p
Binary file not shown.
Binary file modified experiments/trained_models/inscvnb.p
Binary file not shown.
Binary file modified experiments/trained_models/instfada.p
Binary file not shown.
Binary file modified experiments/trained_models/instfdtc.p
Binary file not shown.
Binary file modified experiments/trained_models/instflr.p
Binary file not shown.
Binary file modified experiments/trained_models/instfnb.p
Binary file not shown.
Binary file modified experiments/trained_models/instfper.p
Binary file not shown.
Binary file modified experiments/trained_models/instfrfc.p
Binary file not shown.
Binary file modified experiments/trained_models/instfsgd.p
Binary file not shown.
Binary file modified experiments/trained_models/instfxgb.p
Binary file not shown.
Binary file modified experiments/trained_models/invcvbb.p
Binary file not shown.
Binary file modified experiments/trained_models/invcvlr.p
Binary file not shown.
Binary file modified experiments/trained_models/invcvnb.p
Binary file not shown.
Binary file modified experiments/trained_models/invtfada.p
Binary file not shown.
Binary file modified experiments/trained_models/invtfdtc.p
Binary file not shown.
Binary file modified experiments/trained_models/invtflr.p
Binary file not shown.
Binary file modified experiments/trained_models/invtfnb.p
Binary file not shown.
Binary file modified experiments/trained_models/invtfper.p
Binary file not shown.
Binary file modified experiments/trained_models/invtfrfc.p
Binary file not shown.
Binary file modified experiments/trained_models/invtfsgd.p
Binary file not shown.
Binary file modified experiments/trained_models/invtfxgb.p
Binary file not shown.
Loading

0 comments on commit b01f09c

Please sign in to comment.