-
Notifications
You must be signed in to change notification settings - Fork 0
/
product_categorization_model.py
76 lines (71 loc) · 3 KB
/
product_categorization_model.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
import pandas as pd
import numpy as np
import pickle
import random
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Activation,Dense,Dropout
from sklearn.svm import libsvm
from sklearn.preprocessing import MultiLabelBinarizer
from pathlib import Path
import LeclercDataLoader as dataLoader
class text_classification():
    """Train and evaluate a multi-label text classifier for Leclerc products.

    The whole pipeline (data loading, train/test split, TF-IDF featurization,
    model training, evaluation and a small prediction demo) runs in
    ``__init__``, so ``text_classification()`` performs one full training run
    — matching the original script, where the work executed when the class
    statement was evaluated and the subsequent instantiation did nothing.

    Trained artefacts are exposed on the instance as ``tokenizer``,
    ``encoder``, ``model``, ``history`` and ``score``.
    """

    def __init__(self):
        # Fixed seed so the shuffle (and hence the train/test split) is
        # reproducible across runs.
        np.random.seed(12379)
        path_train = "./base"
        products = dataLoader.read_leclerc_dataset(path_train)
        np.random.shuffle(products)
        train_size = int(len(products) * .7)

        # ---- transform to a DataFrame; 'text' / 'categories' columns are
        # ---- assumed to be present in each product dict — TODO confirm
        # ---- against LeclercDataLoader.
        df_product = pd.DataFrame(data=[product for product in products])
        train_text = df_product['text'][:train_size]
        train_categories = df_product['categories'][:train_size]
        test_text = df_product['text'][train_size:]
        test_categories = df_product['categories'][train_size:]

        vocab_size = 15000
        batch_size = 100

        # TF-IDF bag-of-words features; vocabulary is fit on train only.
        tokenizer = Tokenizer(num_words=vocab_size, lower=True)
        tokenizer.fit_on_texts(train_text)
        x_train = tokenizer.texts_to_matrix(train_text, mode='tfidf')
        x_test = tokenizer.texts_to_matrix(test_text, mode='tfidf')

        # Binarize the per-product category lists into multi-hot vectors.
        encoder = MultiLabelBinarizer()
        encoder.fit(train_categories)
        y_train = encoder.transform(train_categories)
        y_test = encoder.transform(test_categories)

        # Two ReLU hidden layers with dropout; sigmoid output + binary
        # cross-entropy because this is a multi-label problem.
        model = Sequential()
        model.add(Dense(3000, input_shape=(vocab_size,)))
        model.add(Activation('relu'))
        model.add(Dropout(0.3))
        model.add(Dense(3000))
        model.add(Activation('relu'))
        model.add(Dropout(0.3))
        model.add(Dense(len(y_train[0])))
        model.add(Activation('sigmoid'))
        model.summary()
        model.compile(loss='binary_crossentropy', optimizer='adam',
                      metrics=['accuracy'])
        history = model.fit(x_train, y_train, batch_size=batch_size,
                            epochs=5, verbose=1, validation_split=0.1)
        score = model.evaluate(x_test, y_test, batch_size=batch_size,
                               verbose=1)
        print("test accuracy : ", score[1])

        print("========== just for test ============")
        # BUG FIX (two defects in the original demo loop):
        #   1. The predicted label must be looked up in the encoder's class
        #      axis (encoder.classes_), not in the list of per-sample
        #      category lists — argmax indexes the class dimension.
        #   2. random.randint's upper bound is INCLUSIVE, so it must be
        #      len(x_test) - 1; the original wrote len(x_test - 1), which
        #      subtracts 1 elementwise and leaves the length unchanged,
        #      allowing an out-of-range index.
        text_labels = encoder.classes_
        for _ in range(10):
            index = random.randint(0, len(x_test) - 1)
            prediction = model.predict(np.array([x_test[index]]))
            predicted_label = text_labels[np.argmax(prediction[0])]
            print("Actual label", test_categories.iloc[index])
            print("predicted label : ", predicted_label)

        # ============= Saving the model ===========================
        # model.save("text_classification.h5")

        # Expose the trained artefacts for callers.
        self.tokenizer = tokenizer
        self.encoder = encoder
        self.model = model
        self.history = history
        self.score = score
# --------------- module-level smoke test ---------------
# Instantiating the class drives the full train/evaluate pipeline.
test = text_classification()