From 890a0fa54ea99cd49c8094be945ab06f65315486 Mon Sep 17 00:00:00 2001 From: Alex Date: Fri, 15 Nov 2019 17:32:48 +0200 Subject: [PATCH] less intensive code and latex fixes --- Mission433Solutions.ipynb | 199 ++++++++++++++++++++------------------ 1 file changed, 106 insertions(+), 93 deletions(-) diff --git a/Mission433Solutions.ipynb b/Mission433Solutions.ipynb index 164a6f2..9ee4f3a 100644 --- a/Mission433Solutions.ipynb +++ b/Mission433Solutions.ipynb @@ -236,9 +236,8 @@ "\n", "Essentially, we want to bring data to this format:\n", "\n", - "
\n", "![img](https://dq-content.s3.amazonaws.com/433/cpgp_dataset_3.png)\n", - "
\n", + "\n", "\n", "### Letter Case and Punctuation\n", "\n", @@ -505,27 +504,27 @@ " \n", " \n", " \n", - " woke\n", - " gr8\n", - " forum\n", - " bettr\n", - " std\n", - " pobox334\n", - " wap\n", - " kalstiya\n", - " skillgame\n", - " slap\n", + " ticket\n", + " kappa\n", + " too\n", + " abdomen\n", + " unhappy\n", + " hoody\n", + " start\n", + " die\n", + " wild\n", + " 195\n", " ...\n", - " sterm\n", - " click\n", - " person2die\n", - " amused\n", - " box434sk38wp150ppm18\n", - " bcaz\n", - " lodging\n", - " lyf\n", - " officially\n", - " again\n", + " 09058095201\n", + " chase\n", + " thru\n", + " ru\n", + " xclusive\n", + " fellow\n", + " red\n", + " entitled\n", + " auto\n", + " bothering\n", " \n", " \n", " \n", @@ -655,26 +654,26 @@ "" ], "text/plain": [ - " woke gr8 forum bettr std pobox334 wap kalstiya skillgame slap \\\n", - "0 0 0 0 0 0 0 0 0 0 0 \n", - "1 0 0 0 0 0 0 0 0 0 0 \n", - "2 0 0 0 0 0 0 0 0 0 0 \n", - "3 0 0 0 0 0 0 0 0 0 0 \n", - "4 0 0 0 0 0 0 0 0 0 0 \n", + " ticket kappa too abdomen unhappy hoody start die wild 195 ... \\\n", + "0 0 0 0 0 0 0 0 0 0 0 ... \n", + "1 0 0 0 0 0 0 0 0 0 0 ... \n", + "2 0 0 0 0 0 0 0 0 0 0 ... \n", + "3 0 0 0 0 0 0 0 0 0 0 ... \n", + "4 0 0 0 0 0 0 0 0 0 0 ... \n", "\n", - " ... sterm click person2die amused box434sk38wp150ppm18 bcaz lodging \\\n", - "0 ... 0 0 0 0 0 0 0 \n", - "1 ... 0 0 0 0 0 0 0 \n", - "2 ... 0 0 0 0 0 0 0 \n", - "3 ... 0 0 0 0 0 0 0 \n", - "4 ... 0 0 0 0 0 0 0 \n", + " 09058095201 chase thru ru xclusive fellow red entitled auto \\\n", + "0 0 0 0 0 0 0 0 0 0 \n", + "1 0 0 0 0 0 0 0 0 0 \n", + "2 0 0 0 0 0 0 0 0 0 \n", + "3 0 0 0 0 0 0 0 0 0 \n", + "4 0 0 0 0 0 0 0 0 0 \n", "\n", - " lyf officially again \n", - "0 0 0 0 \n", - "1 0 0 0 \n", - "2 0 0 0 \n", - "3 0 0 0 \n", - "4 0 0 0 \n", + " bothering \n", + "0 0 \n", + "1 0 \n", + "2 0 \n", + "3 0 \n", + "4 0 \n", "\n", "[5 rows x 7783 columns]" ] @@ -717,25 +716,25 @@ " \n", " Label\n", " SMS\n", - " woke\n", - " gr8\n", - " forum\n", - " bettr\n", - " std\n", - " pobox334\n", - " wap\n", - " kalstiya\n", + " ticket\n", + " kappa\n", + " too\n", + " abdomen\n", + " unhappy\n", + " hoody\n", + " start\n", + " die\n", " ...\n", - " sterm\n", - " click\n", - " person2die\n", - " amused\n", - " box434sk38wp150ppm18\n", - " bcaz\n", - " lodging\n", - " lyf\n", - " officially\n", - " again\n", + " 09058095201\n", + " chase\n", + " thru\n", + " ru\n", + " xclusive\n", + " fellow\n", + " red\n", + " entitled\n", + " auto\n", + " bothering\n", " \n", " \n", " \n", @@ -865,26 +864,26 @@ "" ], "text/plain": [ - " Label SMS woke gr8 forum \\\n", - "0 ham [yep, by, the, pretty, sculpture] 0 0 0 \n", - "1 ham [yes, princess, are, you, going, to, make, me,... 0 0 0 \n", - "2 ham [welp, apparently, he, retired] 0 0 0 \n", - "3 ham [havent] 0 0 0 \n", - "4 ham [i, forgot, 2, ask, ü, all, smth, there, s, a,... 0 0 0 \n", + " Label SMS ticket kappa \\\n", + "0 ham [yep, by, the, pretty, sculpture] 0 0 \n", + "1 ham [yes, princess, are, you, going, to, make, me,... 0 0 \n", + "2 ham [welp, apparently, he, retired] 0 0 \n", + "3 ham [havent] 0 0 \n", + "4 ham [i, forgot, 2, ask, ü, all, smth, there, s, a,... 0 0 \n", "\n", - " bettr std pobox334 wap kalstiya ... sterm click person2die amused \\\n", - "0 0 0 0 0 0 ... 0 0 0 0 \n", - "1 0 0 0 0 0 ... 0 0 0 0 \n", - "2 0 0 0 0 0 ... 0 0 0 0 \n", - "3 0 0 0 0 0 ... 0 0 0 0 \n", - "4 0 0 0 0 0 ... 0 0 0 0 \n", + " too abdomen unhappy hoody start die ... 09058095201 chase thru \\\n", + "0 0 0 0 0 0 0 ... 0 0 0 \n", + "1 0 0 0 0 0 0 ... 0 0 0 \n", + "2 0 0 0 0 0 0 ... 0 0 0 \n", + "3 0 0 0 0 0 0 ... 0 0 0 \n", + "4 0 0 0 0 0 0 ... 0 0 0 \n", "\n", - " box434sk38wp150ppm18 bcaz lodging lyf officially again \n", - "0 0 0 0 0 0 0 \n", - "1 0 0 0 0 0 0 \n", - "2 0 0 0 0 0 0 \n", - "3 0 0 0 0 0 0 \n", - "4 0 0 0 0 0 0 \n", + " ru xclusive fellow red entitled auto bothering \n", + "0 0 0 0 0 0 0 0 \n", + "1 0 0 0 0 0 0 0 \n", + "2 0 0 0 0 0 0 0 \n", + "3 0 0 0 0 0 0 0 \n", + "4 0 0 0 0 0 0 0 \n", "\n", "[5 rows x 7785 columns]" ] @@ -908,17 +907,25 @@ "We're now done with cleaning the training set, and we can begin creating the spam filter. The Naive Bayes algorithm will need to answer these two probability questions to be able to classify new messages:\n", "\n", "\\begin{equation}\n", - "P(Spam | w_1,w_2, ..., w_n) \\propto P(Spam) \\cdot \\prod_{i=1}^{n}P(w_i|Spam) \\\\\\\n", + "P(Spam | w_1,w_2, ..., w_n) \\propto P(Spam) \\cdot \\prod_{i=1}^{n}P(w_i|Spam)\n", + "\\end{equation}\n", + "\n", + "\\begin{equation}\n", "P(Ham | w_1,w_2, ..., w_n) \\propto P(Ham) \\cdot \\prod_{i=1}^{n}P(w_i|Ham)\n", "\\end{equation}\n", "\n", + "\n", "Also, to calculate P(wi|Spam) and P(wi|Ham) inside the formulas above, we'll need to use these equations:\n", "\n", "\\begin{equation}\n", - "P(w_i|Spam) = \\frac{N_{w_i|Spam} + \\alpha}{N_{Spam} + \\alpha \\cdot N_{Vocabulary}} \\\\\\\n", + "P(w_i|Spam) = \\frac{N_{w_i|Spam} + \\alpha}{N_{Spam} + \\alpha \\cdot N_{Vocabulary}}\n", + "\\end{equation}\n", + "\n", + "\\begin{equation}\n", "P(w_i|Ham) = \\frac{N_{w_i|Ham} + \\alpha}{N_{Ham} + \\alpha \\cdot N_{Vocabulary}}\n", "\\end{equation}\n", "\n", + "\n", "Some of the terms in the four equations above will have the same value for every new message. We can calculate the value of these terms once and avoid doing the computations again when a new messages comes in. Below, we'll use our training set to calculate:\n", "\n", "- P(Spam) and P(Ham)\n", @@ -933,15 +940,23 @@ "metadata": {}, "outputs": [], "source": [ + "# Isolating spam and ham messages first\n", + "spam_messages = training_set_clean[training_set_clean['Label'] == 'spam']\n", + "ham_messages = training_set_clean[training_set_clean['Label'] == 'ham']\n", + "\n", "# P(Spam) and P(Ham)\n", - "p_spam = training_set_clean['Label'].value_counts(normalize=True)['spam']\n", - "p_ham = training_set_clean['Label'].value_counts(normalize=True)['ham']\n", + "p_spam = len(spam_messages) / len(training_set_clean)\n", + "p_ham = len(ham_messages) / len(training_set_clean)\n", + "\n", + "# N_Spam\n", + "n_words_per_spam_message = spam_messages['SMS'].apply(lambda x: len(x))\n", + "n_spam = n_words_per_spam_message.sum()\n", "\n", - "# N_Spam, N_Ham, and N_Vocabulary\n", - "n_spam = training_set_clean[training_set_clean['Label'] == 'spam'].sum(\n", - " axis=1).sum()\n", - "n_ham = training_set_clean[training_set_clean['Label'] == 'ham'].sum(\n", - " axis=1).sum()\n", + "# N_Ham\n", + "n_words_per_ham_message = ham_messages['SMS'].apply(lambda x: len(x))\n", + "n_ham = n_words_per_ham_message.sum()\n", + "\n", + "# N_Vocabulary\n", "n_vocabulary = len(vocabulary)\n", "\n", "# Laplace smoothing\n", @@ -959,7 +974,10 @@ "The parameters are calculated using the formulas:\n", "\n", "\\begin{equation}\n", - "P(w_i|Spam) = \\frac{N_{w_i|Spam} + \\alpha}{N_{Spam} + \\alpha \\cdot N_{Vocabulary}} \\\\\\\n", + "P(w_i|Spam) = \\frac{N_{w_i|Spam} + \\alpha}{N_{Spam} + \\alpha \\cdot N_{Vocabulary}}\n", + "\\end{equation}\n", + "\n", + "\\begin{equation}\n", "P(w_i|Ham) = \\frac{N_{w_i|Ham} + \\alpha}{N_{Ham} + \\alpha \\cdot N_{Vocabulary}}\n", "\\end{equation}" ] @@ -974,18 +992,13 @@ "parameters_spam = {unique_word:0 for unique_word in vocabulary}\n", "parameters_ham = {unique_word:0 for unique_word in vocabulary}\n", "\n", - "# Isolate spam and ham messages before starting the loop below\n", - "# Don't do this inside the loop, it'll add to code running time significantly\n", - "spam_messages = training_set_clean[training_set_clean['Label'] == 'spam']\n", - "ham_messages = training_set_clean[training_set_clean['Label'] == 'ham']\n", - "\n", "# Calculate parameters\n", "for word in vocabulary:\n", - " n_word_given_spam = spam_messages[word].sum()\n", + " n_word_given_spam = spam_messages[word].sum() # spam_messages already defined in a cell above\n", " p_word_given_spam = (n_word_given_spam + alpha) / (n_spam + alpha*n_vocabulary)\n", " parameters_spam[word] = p_word_given_spam\n", " \n", - " n_word_given_ham = ham_messages[word].sum()\n", + " n_word_given_ham = ham_messages[word].sum() # ham_messages already defined in a cell above\n", " p_word_given_ham = (n_word_given_ham + alpha) / (n_ham + alpha*n_vocabulary)\n", " parameters_ham[word] = p_word_given_ham" ]