Skip to content

Commit

Permalink
Merge pull request #617 from KnowledgeCaptureAndDiscovery/dev
Browse files Browse the repository at this point in the history
Preparing for 0.9.5 (classifiers re-trained)
  • Loading branch information
dgarijo authored Jan 12, 2024
2 parents f74c3d8 + b01f09c commit 1f5b100
Show file tree
Hide file tree
Showing 86 changed files with 1,153 additions and 873 deletions.
12 changes: 8 additions & 4 deletions experiments/create_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,11 +40,11 @@
'tfdtc': make_pipeline(CountVectorizer(), DecisionTreeClassifier()),
'tflr': make_pipeline(TfidfVectorizer(), LogisticRegression(solver='liblinear')),
'tfnb': make_pipeline(TfidfVectorizer(), MultinomialNB()),
'tfper': make_pipeline(TfidfVectorizer(), Perceptron(tol=1e-3, random_state=0)),
'tfrfc': make_pipeline(TfidfVectorizer(), RandomForestClassifier()), #(max_depth=3, random_state=0))
'tfsgd': make_pipeline(TfidfVectorizer(), SGDClassifier(loss='log')),
'tfsgd': make_pipeline(TfidfVectorizer(), SGDClassifier(loss='log_loss')),
'tfxgb': make_pipeline(TfidfVectorizer(), XGBClassifier(use_label_encoder=False,eval_metric="logloss"))
}
#'tfper': make_pipeline(TfidfVectorizer(), Perceptron(tol=1e-3, random_state=0)),
evaluation_names = ('cvlr', 'tflr', 'tfnb', 'cvnb', 'cvbb', 'tfsgd', 'tfxgb', 'tfper', 'tfrfc', 'tfdtc', 'tfada')
evaluation_text = {
'cvbb': '"sklearnpipeline(CountVectorizer, BernoulliBayes)", -,Allen,',
Expand All @@ -54,21 +54,25 @@
'tfdtc': '"sklearnpipeline(TFIDFVectorizer, DecisionTreeClassifier)", -,Allen,',
'tflr': '"sklearnpipeline(TFIDFVectorizer, LogisticRegression)", - ,Allen,',
'tfnb': '"sklearnpipeline(TFIDFVectorizer, NaiveBayes)", - ,Allen,',
'tfper': '"sklearnpipeline(TFIDFVectorizer, Perceptron)", -,Allen,',
'tfrfc': '"sklearnpipeline(TFIDFVectorizer, RandomForestClassifier)", -,Allen,',
'tfsgd': '"sklearnpipeline(TFIDFVectorizer, StochasticGradientDescent)",loss = \'log\',Allen,',
'tfxgb': '"sklearnpipeline(TFIDFVectorizer, XGBClassifier)", -,Allen,'
}
#'tfper': '"sklearnpipeline(TFIDFVectorizer, Perceptron)", -,Allen,',

def evaluate_category(corpora,category):
dec = 3
cv = StratifiedKFold(n_splits = 5, shuffle=True)
file_content = "sklearn Primitive - Citation,Hyperparameters,Input Data Used,Accuracy,Precision,Recall,F-measure,Pickle ID"
file_content = f"sklearn Primitive - {category},Hyperparameters,Input Data Used,Accuracy,Precision,Recall,F-measure,Pickle ID"
limit = 0.0
file_to_copy = ""
for name in evaluation_text:
X = corpora[category].excerpt
Y = corpora[category][category]
#print(X)
#for e in X:
# print(e)
#Y = Y.astype(int)
x_train, x_test, y_train, y_test = train_test_split(X, Y, stratify=Y, test_size=0.2)
pipeline = pipelines[name]
pipeline.fit(x_train, y_train)
Expand Down

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
category,total correct,total incorrect,total missed,precision,recall
citation,33,7,3,0.825,0.917
run,20,3,1,0.870,0.952
install,72,16,10,0.818,0.878
download,2,3,0,0.400,1.000
requirements,29,2,2,0.935,0.935
contact,1,0,2,1.000,0.333
description,13,5,7,0.722,0.650
contributor,3,0,0,1.000,1.000
documentation,18,2,0,0.900,1.000
license,30,0,0,1.000,1.000
usage,55,32,20,0.632,0.733
faq,3,6,2,0.333,0.600
support,6,8,3,0.429,0.667
ack,7,0,3,1.000,0.700
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
24 changes: 12 additions & 12 deletions experiments/ranking/citation_classifier.csv
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
sklearn Primitive - Citation,Hyperparameters,Input Data Used,Accuracy,Precision,Recall,F-measure,Pickle ID
"sklearnpipeline(CountVectorizer, BernoulliBayes)", -,Allen,0.732,0.687,0.984,0.808,citcvbb.p
"sklearnpipeline(CountVectorizer, LogisticRegression)", - ,Allen,0.877,0.838,0.975,0.901,citcvlr.p
"sklearnpipeline(CountVectorizer, NaiveBayes)", -,Allen,0.917,0.913,0.946,0.929,citcvnb.p
"sklearnpipeline(TFIDFVectorizer, AdaBoostClassifier)", -,Allen,0.828,0.913,0.785,0.837,cittfada.p
"sklearnpipeline(TFIDFVectorizer, DecisionTreeClassifier)", -,Allen,0.792,0.77,0.908,0.833,cittfdtc.p
"sklearnpipeline(TFIDFVectorizer, LogisticRegression)", - ,Allen,0.875,0.837,0.971,0.899,cittflr.p
"sklearnpipeline(TFIDFVectorizer, NaiveBayes)", - ,Allen,0.904,0.869,0.981,0.921,cittfnb.p
"sklearnpipeline(TFIDFVectorizer, Perceptron)", -,Allen,0.884,0.868,0.94,0.902,cittfper.p
"sklearnpipeline(TFIDFVectorizer, RandomForestClassifier)", -,Allen,0.814,0.782,0.937,0.852,cittfrfc.p
"sklearnpipeline(TFIDFVectorizer, StochasticGradientDescent)",loss = 'log',Allen,0.895,0.863,0.972,0.914,cittfsgd.p
"sklearnpipeline(TFIDFVectorizer, XGBClassifier)", -,Allen,0.783,0.783,0.867,0.817,cittfxgb.p
sklearn Primitive - citation,Hyperparameters,Input Data Used,Accuracy,Precision,Recall,F-measure,Pickle ID
"sklearnpipeline(CountVectorizer, BernoulliBayes)", -,Allen,0.754,0.675,0.993,0.803,citcvbb.p
"sklearnpipeline(CountVectorizer, LogisticRegression)", - ,Allen,0.922,0.912,0.938,0.924,citcvlr.p
"sklearnpipeline(CountVectorizer, NaiveBayes)", -,Allen,0.927,0.909,0.952,0.929,citcvnb.p
"sklearnpipeline(TFIDFVectorizer, AdaBoostClassifier)", -,Allen,0.886,0.948,0.821,0.877,cittfada.p
"sklearnpipeline(TFIDFVectorizer, DecisionTreeClassifier)", -,Allen,0.879,0.95,0.8,0.868,cittfdtc.p
"sklearnpipeline(TFIDFVectorizer, LogisticRegression)", - ,Allen,0.938,0.958,0.917,0.936,cittflr.p
"sklearnpipeline(TFIDFVectorizer, NaiveBayes)", - ,Allen,0.92,0.91,0.934,0.922,cittfnb.p
"sklearnpipeline(TFIDFVectorizer, Perceptron)", -,Allen,0.907,0.899,0.921,0.909,cittfper.p
"sklearnpipeline(TFIDFVectorizer, RandomForestClassifier)", -,Allen,0.879,0.864,0.907,0.882,cittfrfc.p
"sklearnpipeline(TFIDFVectorizer, StochasticGradientDescent)",loss = 'log',Allen,0.939,0.957,0.921,0.938,cittfsgd.p
"sklearnpipeline(TFIDFVectorizer, XGBClassifier)", -,Allen,0.839,0.906,0.759,0.825,cittfxgb.p
24 changes: 12 additions & 12 deletions experiments/ranking/description_classifier.csv
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
sklearn Primitive - Citation,Hyperparameters,Input Data Used,Accuracy,Precision,Recall,F-measure,Pickle ID
"sklearnpipeline(CountVectorizer, BernoulliBayes)", -,Allen,0.725,0.934,0.559,0.699,descvbb.p
"sklearnpipeline(CountVectorizer, LogisticRegression)", - ,Allen,0.818,0.876,0.795,0.833,descvlr.p
"sklearnpipeline(CountVectorizer, NaiveBayes)", -,Allen,0.816,0.776,0.955,0.856,descvnb.p
"sklearnpipeline(TFIDFVectorizer, AdaBoostClassifier)", -,Allen,0.798,0.838,0.8,0.818,destfada.p
"sklearnpipeline(TFIDFVectorizer, DecisionTreeClassifier)", -,Allen,0.767,0.823,0.759,0.787,destfdtc.p
"sklearnpipeline(TFIDFVectorizer, LogisticRegression)", - ,Allen,0.827,0.802,0.926,0.859,destflr.p
"sklearnpipeline(TFIDFVectorizer, NaiveBayes)", - ,Allen,0.779,0.728,0.982,0.836,destfnb.p
"sklearnpipeline(TFIDFVectorizer, Perceptron)", -,Allen,0.826,0.853,0.842,0.847,destfper.p
"sklearnpipeline(TFIDFVectorizer, RandomForestClassifier)", -,Allen,0.784,0.83,0.783,0.805,destfrfc.p
"sklearnpipeline(TFIDFVectorizer, StochasticGradientDescent)",loss = 'log',Allen,0.847,0.875,0.857,0.865,destfsgd.p
"sklearnpipeline(TFIDFVectorizer, XGBClassifier)", -,Allen,0.76,0.806,0.768,0.786,destfxgb.p
sklearn Primitive - description,Hyperparameters,Input Data Used,Accuracy,Precision,Recall,F-measure,Pickle ID
"sklearnpipeline(CountVectorizer, BernoulliBayes)", -,Allen,0.733,0.914,0.516,0.657,descvbb.p
"sklearnpipeline(CountVectorizer, LogisticRegression)", - ,Allen,0.813,0.854,0.758,0.801,descvlr.p
"sklearnpipeline(CountVectorizer, NaiveBayes)", -,Allen,0.784,0.716,0.954,0.816,descvnb.p
"sklearnpipeline(TFIDFVectorizer, AdaBoostClassifier)", -,Allen,0.731,0.73,0.737,0.733,destfada.p
"sklearnpipeline(TFIDFVectorizer, DecisionTreeClassifier)", -,Allen,0.686,0.701,0.648,0.67,destfdtc.p
"sklearnpipeline(TFIDFVectorizer, LogisticRegression)", - ,Allen,0.802,0.788,0.836,0.809,destflr.p
"sklearnpipeline(TFIDFVectorizer, NaiveBayes)", - ,Allen,0.775,0.695,0.982,0.814,destfnb.p
"sklearnpipeline(TFIDFVectorizer, Perceptron)", -,Allen,0.804,0.777,0.854,0.814,destfper.p
"sklearnpipeline(TFIDFVectorizer, RandomForestClassifier)", -,Allen,0.745,0.744,0.751,0.747,destfrfc.p
"sklearnpipeline(TFIDFVectorizer, StochasticGradientDescent)",loss = 'log',Allen,0.815,0.799,0.843,0.82,destfsgd.p
"sklearnpipeline(TFIDFVectorizer, XGBClassifier)", -,Allen,0.72,0.729,0.726,0.724,destfxgb.p
24 changes: 12 additions & 12 deletions experiments/ranking/installation_classifier.csv
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
sklearn Primitive - Citation,Hyperparameters,Input Data Used,Accuracy,Precision,Recall,F-measure,Pickle ID
"sklearnpipeline(CountVectorizer, BernoulliBayes)", -,Allen,0.75,0.7,0.986,0.819,inscvbb.p
"sklearnpipeline(CountVectorizer, LogisticRegression)", - ,Allen,0.871,0.87,0.912,0.89,inscvlr.p
"sklearnpipeline(CountVectorizer, NaiveBayes)", -,Allen,0.879,0.844,0.968,0.902,inscvnb.p
"sklearnpipeline(TFIDFVectorizer, AdaBoostClassifier)", -,Allen,0.79,0.899,0.714,0.794,instfada.p
"sklearnpipeline(TFIDFVectorizer, DecisionTreeClassifier)", -,Allen,0.816,0.877,0.789,0.83,instfdtc.p
"sklearnpipeline(TFIDFVectorizer, LogisticRegression)", - ,Allen,0.894,0.908,0.906,0.907,instflr.p
"sklearnpipeline(TFIDFVectorizer, NaiveBayes)", - ,Allen,0.842,0.791,0.986,0.877,instfnb.p
"sklearnpipeline(TFIDFVectorizer, Perceptron)", -,Allen,0.865,0.895,0.864,0.879,instfper.p
"sklearnpipeline(TFIDFVectorizer, RandomForestClassifier)", -,Allen,0.851,0.9,0.833,0.865,instfrfc.p
"sklearnpipeline(TFIDFVectorizer, StochasticGradientDescent)",loss = 'log',Allen,0.896,0.918,0.899,0.908,instfsgd.p
"sklearnpipeline(TFIDFVectorizer, XGBClassifier)", -,Allen,0.812,0.881,0.776,0.825,instfxgb.p
sklearn Primitive - installation,Hyperparameters,Input Data Used,Accuracy,Precision,Recall,F-measure,Pickle ID
"sklearnpipeline(CountVectorizer, BernoulliBayes)", -,Allen,0.76,0.678,0.991,0.805,inscvbb.p
"sklearnpipeline(CountVectorizer, LogisticRegression)", - ,Allen,0.894,0.874,0.923,0.897,inscvlr.p
"sklearnpipeline(CountVectorizer, NaiveBayes)", -,Allen,0.898,0.853,0.961,0.904,inscvnb.p
"sklearnpipeline(TFIDFVectorizer, AdaBoostClassifier)", -,Allen,0.797,0.882,0.685,0.771,instfada.p
"sklearnpipeline(TFIDFVectorizer, DecisionTreeClassifier)", -,Allen,0.825,0.841,0.807,0.822,instfdtc.p
"sklearnpipeline(TFIDFVectorizer, LogisticRegression)", - ,Allen,0.894,0.938,0.845,0.889,instflr.p
"sklearnpipeline(TFIDFVectorizer, NaiveBayes)", - ,Allen,0.897,0.851,0.965,0.904,instfnb.p
"sklearnpipeline(TFIDFVectorizer, Perceptron)", -,Allen,0.887,0.887,0.887,0.887,instfper.p
"sklearnpipeline(TFIDFVectorizer, RandomForestClassifier)", -,Allen,0.87,0.854,0.894,0.873,instfrfc.p
"sklearnpipeline(TFIDFVectorizer, StochasticGradientDescent)",loss = 'log',Allen,0.903,0.922,0.882,0.901,instfsgd.p
"sklearnpipeline(TFIDFVectorizer, XGBClassifier)", -,Allen,0.833,0.874,0.781,0.824,instfxgb.p
24 changes: 12 additions & 12 deletions experiments/ranking/invocation_classifier.csv
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
sklearn Primitive - Citation,Hyperparameters,Input Data Used,Accuracy,Precision,Recall,F-measure,Pickle ID
"sklearnpipeline(CountVectorizer, BernoulliBayes)", -,Allen,0.754,0.718,0.939,0.814,invcvbb.p
"sklearnpipeline(CountVectorizer, LogisticRegression)", - ,Allen,0.84,0.817,0.929,0.869,invcvlr.p
"sklearnpipeline(CountVectorizer, NaiveBayes)", -,Allen,0.865,0.875,0.891,0.883,invcvnb.p
"sklearnpipeline(TFIDFVectorizer, AdaBoostClassifier)", -,Allen,0.773,0.753,0.899,0.819,invtfada.p
"sklearnpipeline(TFIDFVectorizer, DecisionTreeClassifier)", -,Allen,0.763,0.847,0.716,0.776,invtfdtc.p
"sklearnpipeline(TFIDFVectorizer, LogisticRegression)", - ,Allen,0.841,0.822,0.923,0.869,invtflr.p
"sklearnpipeline(TFIDFVectorizer, NaiveBayes)", - ,Allen,0.869,0.848,0.94,0.892,invtfnb.p
"sklearnpipeline(TFIDFVectorizer, Perceptron)", -,Allen,0.841,0.862,0.861,0.861,invtfper.p
"sklearnpipeline(TFIDFVectorizer, RandomForestClassifier)", -,Allen,0.826,0.868,0.82,0.843,invtfrfc.p
"sklearnpipeline(TFIDFVectorizer, StochasticGradientDescent)",loss = 'log',Allen,0.867,0.859,0.919,0.888,invtfsgd.p
"sklearnpipeline(TFIDFVectorizer, XGBClassifier)", -,Allen,0.798,0.778,0.907,0.837,invtfxgb.p
sklearn Primitive - invocation,Hyperparameters,Input Data Used,Accuracy,Precision,Recall,F-measure,Pickle ID
"sklearnpipeline(CountVectorizer, BernoulliBayes)", -,Allen,0.755,0.681,0.962,0.797,invcvbb.p
"sklearnpipeline(CountVectorizer, LogisticRegression)", - ,Allen,0.858,0.828,0.907,0.865,invcvlr.p
"sklearnpipeline(CountVectorizer, NaiveBayes)", -,Allen,0.885,0.874,0.899,0.886,invcvnb.p
"sklearnpipeline(TFIDFVectorizer, AdaBoostClassifier)", -,Allen,0.771,0.74,0.836,0.785,invtfada.p
"sklearnpipeline(TFIDFVectorizer, DecisionTreeClassifier)", -,Allen,0.766,0.757,0.784,0.77,invtfdtc.p
"sklearnpipeline(TFIDFVectorizer, LogisticRegression)", - ,Allen,0.869,0.87,0.868,0.869,invtflr.p
"sklearnpipeline(TFIDFVectorizer, NaiveBayes)", - ,Allen,0.89,0.878,0.907,0.892,invtfnb.p
"sklearnpipeline(TFIDFVectorizer, Perceptron)", -,Allen,0.848,0.845,0.853,0.849,invtfper.p
"sklearnpipeline(TFIDFVectorizer, RandomForestClassifier)", -,Allen,0.823,0.807,0.85,0.827,invtfrfc.p
"sklearnpipeline(TFIDFVectorizer, StochasticGradientDescent)",loss = 'log',Allen,0.879,0.878,0.881,0.879,invtfsgd.p
"sklearnpipeline(TFIDFVectorizer, XGBClassifier)", -,Allen,0.803,0.771,0.863,0.815,invtfxgb.p
13 changes: 9 additions & 4 deletions experiments/setup_corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,10 @@ def build_corpora():
def build_corpus(selected_category):
categories_df = {cat: pd.read_csv(f'./training_corpus/{cat}.csv') for cat in categories}
negative_sample_size = int(len(categories_df[selected_category]) / 4)
print(f"Selected Category: {selected_category}")
print(f"Selected Category: {selected_category}. Negative sample size for category: {negative_sample_size}")
for category in categories_df:
categories_df[category].drop('URL', 1, inplace=True)
categories_df[category].drop('URL', axis=1, inplace=True)
# add negative samples to a category from the other ones
if category != selected_category:
categories_df[category] = categories_df[category].sample(negative_sample_size)
categories_df[category] = categories_df[category].assign(**{selected_category: category == selected_category})
Expand All @@ -30,9 +31,13 @@ def build_corpus(selected_category):
map(lambda sent: ' '.join(sent), random.sample(list(treebank.sents()), negative_sample_size)),
columns=["excerpt"]).assign(description=False)
# print("Treebank has {} samples.".format(len(treebank_background)))
# print("categories_df")
# Rename the column to match the corpus when merging
treebank_background = treebank_background.rename(columns={'description': selected_category})
# print(categories_df)
corpus = pd.concat(categories_df.values(), ignore_index=True, sort=False)
corpus.append(treebank_background, ignore_index=True, sort=False)
#corpus.append(treebank_background, ignore_index=True, sort=False)
corpus = pd.concat([corpus, treebank_background], ignore_index=True, sort=False)
corpus.fillna(value='', inplace=True)
# print(corpus)
return corpus

Binary file modified experiments/trained_models/citcvbb.p
Binary file not shown.
Binary file modified experiments/trained_models/citcvlr.p
Binary file not shown.
Binary file modified experiments/trained_models/citcvnb.p
Binary file not shown.
Binary file modified experiments/trained_models/cittfada.p
Binary file not shown.
Binary file modified experiments/trained_models/cittfdtc.p
Binary file not shown.
Binary file modified experiments/trained_models/cittflr.p
Binary file not shown.
Binary file modified experiments/trained_models/cittfnb.p
Binary file not shown.
Binary file modified experiments/trained_models/cittfper.p
Binary file not shown.
Binary file modified experiments/trained_models/cittfrfc.p
Binary file not shown.
Binary file modified experiments/trained_models/cittfsgd.p
Binary file not shown.
Binary file modified experiments/trained_models/cittfxgb.p
Binary file not shown.
Binary file modified experiments/trained_models/descvbb.p
Binary file not shown.
Binary file modified experiments/trained_models/descvlr.p
Binary file not shown.
Binary file modified experiments/trained_models/descvnb.p
Binary file not shown.
Binary file modified experiments/trained_models/destfada.p
Binary file not shown.
Binary file modified experiments/trained_models/destfdtc.p
Binary file not shown.
Binary file modified experiments/trained_models/destflr.p
Binary file not shown.
Binary file modified experiments/trained_models/destfnb.p
Binary file not shown.
Binary file modified experiments/trained_models/destfper.p
Binary file not shown.
Binary file modified experiments/trained_models/destfrfc.p
Binary file not shown.
Binary file modified experiments/trained_models/destfsgd.p
Binary file not shown.
Binary file modified experiments/trained_models/destfxgb.p
Binary file not shown.
Binary file modified experiments/trained_models/inscvbb.p
Binary file not shown.
Binary file modified experiments/trained_models/inscvlr.p
Binary file not shown.
Binary file modified experiments/trained_models/inscvnb.p
Binary file not shown.
Binary file modified experiments/trained_models/instfada.p
Binary file not shown.
Binary file modified experiments/trained_models/instfdtc.p
Binary file not shown.
Binary file modified experiments/trained_models/instflr.p
Binary file not shown.
Binary file modified experiments/trained_models/instfnb.p
Binary file not shown.
Binary file modified experiments/trained_models/instfper.p
Binary file not shown.
Binary file modified experiments/trained_models/instfrfc.p
Binary file not shown.
Binary file modified experiments/trained_models/instfsgd.p
Binary file not shown.
Binary file modified experiments/trained_models/instfxgb.p
Binary file not shown.
Binary file modified experiments/trained_models/invcvbb.p
Binary file not shown.
Binary file modified experiments/trained_models/invcvlr.p
Binary file not shown.
Binary file modified experiments/trained_models/invcvnb.p
Binary file not shown.
Binary file modified experiments/trained_models/invtfada.p
Binary file not shown.
Binary file modified experiments/trained_models/invtfdtc.p
Binary file not shown.
Binary file modified experiments/trained_models/invtflr.p
Binary file not shown.
Binary file modified experiments/trained_models/invtfnb.p
Binary file not shown.
Binary file modified experiments/trained_models/invtfper.p
Binary file not shown.
Binary file modified experiments/trained_models/invtfrfc.p
Binary file not shown.
Binary file modified experiments/trained_models/invtfsgd.p
Binary file not shown.
Binary file modified experiments/trained_models/invtfxgb.p
Binary file not shown.
Loading

0 comments on commit 1f5b100

Please sign in to comment.