Bazinga.py
import json
from nltk import word_tokenize
from nltk.stem.porter import PorterStemmer
from collections import Counter
import tensorflow as tf
from keras import models
from keras import layers
import numpy as np
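# Load the Sarcasm Headlines dataset: one JSON record per line, each with a
# 'headline' string and an 'is_sarcastic' label (0 or 1).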
data = []
for line in open('Sarcasm_Headlines_Dataset.json', 'r'):
    data.append(json.loads(line))
titles = []
y_vals = []
for i in range(0, len(data)):
    titles.append(data[i]['headline'])
    y_vals.append(data[i]['is_sarcastic'])
# nltk.download('punkt')
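# Tokenize each headline into word tokens (run the download above once if the
# 'punkt' tokenizer data is missing).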
titles_tokenized = []
for title in titles:
    titles_tokenized.append(word_tokenize(title))
titles_an = []  # alphabetic tokens only (isalpha() drops punctuation and numbers)
for title in titles_tokenized:
    words = [word for word in title if word.isalpha()]
    titles_an.append(words)
# Let's now stem the words
porter = PorterStemmer()
titles_preprocessed = []
for title in titles_an:
    stemmed = [porter.stem(word) for word in title]
    titles_preprocessed.append(stemmed)
# Now, let's create a large list of all of the words and find the 10,000 most frequent ones
word_list = []
for title in titles_preprocessed:
    for word in title:
        word_list.append(word)
freq_list = Counter(word_list)
dictionary = freq_list.most_common(10000)
dictionary = list(zip(*dictionary))[0]
# We now have the 10,000 most common words. Map each word to an integer index and
# convert every headline to a list of indices so it can be fed into the neural network.
nums = range(0,10000)
word_int = dict(zip(dictionary, nums))
x_vals = []
for title in titles_preprocessed:
    x_vals.append([word_int[x] for x in title if x in word_int])
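# Note: words outside the 10,000-word vocabulary are simply dropped in the mapping above.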
# Now, let's format the data for the neural network and split it into training, validation, and test sets
x = np.array(x_vals, dtype=object)  # dtype=object because headlines vary in length
test_data = x[:5000]
train_data = x[5000:]
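# Multi-hot encode each headline: a 10,000-dimensional vector with a 1 at the index
# of every vocabulary word that appears in it.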
def vectorize_sequences(sequences, dimension=10000):
    results = np.zeros((len(sequences), dimension))
    for i, sequence in enumerate(sequences):
        results[i, sequence] = 1.
    return results
x_train = vectorize_sequences(train_data)
x_test = vectorize_sequences(test_data)
y = np.asarray(y_vals).astype('float32')
y_test = y[:5000]
y_train = y[5000:]
# Create the validation set
x_val = x_train[:5000]
x_partial_train = x_train[5000:]
y_val = y_train[:5000]
y_partial_train = y_train[5000:]
# Prevent TensorFlow from allocating the entire GPU (TensorFlow 1.x session API)
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
session = tf.Session(config=config)
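# Baseline classifier: two small fully connected ReLU layers on the 10,000-dimensional
# multi-hot input, with a single sigmoid unit for the binary (sarcastic / not sarcastic) output.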
model = models.Sequential()
model.add(layers.Dense(32, activation='relu', input_shape=(10000,)))
model.add(layers.Dense(4, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])
history = model.fit(x_partial_train, y_partial_train, epochs=20, batch_size=512, validation_data=(x_val, y_val))
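# history.history records per-epoch training and validation loss/accuracy; the
# validation curve is used to choose how many epochs to train the final model.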
# Retrain on the full training set for 4 epochs.
model = models.Sequential()
model.add(layers.Dense(32, activation='relu', input_shape=(10000,)))
model.add(layers.Dense(4, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(x_train, y_train, epochs=4, batch_size=512)
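# Evaluate on the held-out test set; results is [test loss, test accuracy].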
results = model.evaluate(x_test, y_test)
print(results)