-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathclassification.py
86 lines (66 loc) · 2.36 KB
/
classification.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
""" Topic Classification -- SVM tutorial
Usage:
classification.py --C=<n> --kernel=[linear|poly|rbf] [--degree=<n>]
classification.py (-h | --help)
classification.py --version
Options:
-h --help Show this screen.
--version Show version.
--C=<n> Value of C parameter.
--kernel=[linear|poly|rbf] Type of kernel.
--degree=<n> Degree of kernel.
"""
import os
from sys import argv
from docopt import docopt
from numpy import concatenate
from sklearn.svm import SVC
from utils import *
# Parse the command line against the usage text in the module docstring.
args = docopt(__doc__, version='Topic Classification 1.0')

# C is SVC's regularization strength, which is real-valued: parse it as a
# float so fractional settings such as --C=0.5 are accepted (int() raised
# ValueError on them). Integer input like --C=10 still parses fine.
C = float(args["--C"])
kernel = args["--kernel"]
# --degree is optional; fall back to 3 when it is not given.
degree = int(args["--degree"]) if args["--degree"] else 3
# locations of the per-class vector files (CSV)
files = ["./data/class1/vecs.csv", "./data/class2/vecs.csv"]

# load the CSV data into a dict keyed by class
cl_dict = get_data(files)
topics = list(cl_dict.keys())
t1 = topics[0]
t2 = topics[1]

# ask for the training-set size of each topic (first topic, then second)
train1_size = get_train_size(t1, cl_dict[t1])
train2_size = get_train_size(t2, cl_dict[t2])
print()

# split every topic into train/test arrays plus the matching document lists
(t1_train, t1_test,
 t1_train_docs, t1_test_docs) = make_arrays(cl_dict[t1], train1_size)
(t2_train, t2_test,
 t2_train_docs, t2_test_docs) = make_arrays(cl_dict[t2], train2_size)
train_docs = t1_train_docs + t2_train_docs

# whatever was not taken for training is used for testing
test1_size = len(t1_test)
test2_size = len(t2_test)

# report the resulting split sizes, one line per topic
topic1_summary = 'Topic 1: Train size: {} | Test size: {}\n'.format(
    train1_size, test1_size)
topic2_summary = 'Topic 2: Train size: {} | Test size: {}\n'.format(
    train2_size, test2_size)
print(topic1_summary + topic2_summary)

# stack both topics into the final train/test inputs and their labels
x_train = concatenate((t1_train, t2_train))
y_train = make_labels(train1_size, train2_size)
x_test = concatenate((t1_test, t2_test))
y_test = make_labels(test1_size, test2_size)
# configure the SVM, train it, and report the results
print('SVC output:')
# verbose=True makes SVC print its own training output while fitting
svm_options = dict(C=C, kernel=kernel, degree=degree, verbose=True)
clf = SVC(**svm_options)
model = clf.fit(x_train, y_train)

# evaluate on the held-out test set
y_pred = clf.predict(x_test)
score = clf.score(x_test, y_test)
conf_matrix = confusion_matrix(y_test, y_pred)

# blank lines to separate our report from SVC's verbose output, which
# does not end cleanly
print('\n')
print('SVC Model:')
print(model)
print('Score: {}\n'.format(score))
print('Confusion matrix:')
print(conf_matrix)

# Optional: uncomment to list the training documents chosen as support vectors.
#print('Training docs:')
#print('\n'.join([train_docs[s] for s in clf.support_]))
#print()