From 890a0fa54ea99cd49c8094be945ab06f65315486 Mon Sep 17 00:00:00 2001
From: Alex <alex@dataquest.io>
Date: Fri, 15 Nov 2019 17:32:48 +0200
Subject: [PATCH] less intensive code and latex fixes

---
 Mission433Solutions.ipynb | 199 ++++++++++++++++++++------------------
 1 file changed, 106 insertions(+), 93 deletions(-)
diff --git a/Mission433Solutions.ipynb b/Mission433Solutions.ipynb
index 164a6f2..9ee4f3a 100644
--- a/Mission433Solutions.ipynb
+++ b/Mission433Solutions.ipynb
@@ -236,9 +236,8 @@
     "\n",
     "Essentially, we want to bring data to this format:\n",
     "\n",
-    "<center>\n",
     "![img](https://dq-content.s3.amazonaws.com/433/cpgp_dataset_3.png)\n",
-    "</center>\n",
+    "\n",
     "\n",
     "### Letter Case and Punctuation\n",
     "\n",
@@ -505,27 +504,27 @@
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
        "      <th></th>\n",
-       "      <th>woke</th>\n",
-       "      <th>gr8</th>\n",
-       "      <th>forum</th>\n",
-       "      <th>bettr</th>\n",
-       "      <th>std</th>\n",
-       "      <th>pobox334</th>\n",
-       "      <th>wap</th>\n",
-       "      <th>kalstiya</th>\n",
-       "      <th>skillgame</th>\n",
-       "      <th>slap</th>\n",
+       "      <th>ticket</th>\n",
+       "      <th>kappa</th>\n",
+       "      <th>too</th>\n",
+       "      <th>abdomen</th>\n",
+       "      <th>unhappy</th>\n",
+       "      <th>hoody</th>\n",
+       "      <th>start</th>\n",
+       "      <th>die</th>\n",
+       "      <th>wild</th>\n",
+       "      <th>195</th>\n",
        "      <th>...</th>\n",
-       "      <th>sterm</th>\n",
-       "      <th>click</th>\n",
-       "      <th>person2die</th>\n",
-       "      <th>amused</th>\n",
-       "      <th>box434sk38wp150ppm18</th>\n",
-       "      <th>bcaz</th>\n",
-       "      <th>lodging</th>\n",
-       "      <th>lyf</th>\n",
-       "      <th>officially</th>\n",
-       "      <th>again</th>\n",
+       "      <th>09058095201</th>\n",
+       "      <th>chase</th>\n",
+       "      <th>thru</th>\n",
+       "      <th>ru</th>\n",
+       "      <th>xclusive</th>\n",
+       "      <th>fellow</th>\n",
+       "      <th>red</th>\n",
+       "      <th>entitled</th>\n",
+       "      <th>auto</th>\n",
+       "      <th>bothering</th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
@@ -655,26 +654,26 @@
        "</div>"
       ],
       "text/plain": [
-       "   woke  gr8  forum  bettr  std  pobox334  wap  kalstiya  skillgame  slap  \\\n",
-       "0     0    0      0      0    0         0    0         0          0     0   \n",
-       "1     0    0      0      0    0         0    0         0          0     0   \n",
-       "2     0    0      0      0    0         0    0         0          0     0   \n",
-       "3     0    0      0      0    0         0    0         0          0     0   \n",
-       "4     0    0      0      0    0         0    0         0          0     0   \n",
+       "   ticket  kappa  too  abdomen  unhappy  hoody  start  die  wild  195  ...  \\\n",
+       "0       0      0    0        0        0      0      0    0     0    0  ...   \n",
+       "1       0      0    0        0        0      0      0    0     0    0  ...   \n",
+       "2       0      0    0        0        0      0      0    0     0    0  ...   \n",
+       "3       0      0    0        0        0      0      0    0     0    0  ...   \n",
+       "4       0      0    0        0        0      0      0    0     0    0  ...   \n",
        "\n",
-       "   ...  sterm  click  person2die  amused  box434sk38wp150ppm18  bcaz  lodging  \\\n",
-       "0  ...      0      0           0       0                     0     0        0   \n",
-       "1  ...      0      0           0       0                     0     0        0   \n",
-       "2  ...      0      0           0       0                     0     0        0   \n",
-       "3  ...      0      0           0       0                     0     0        0   \n",
-       "4  ...      0      0           0       0                     0     0        0   \n",
+       "   09058095201  chase  thru  ru  xclusive  fellow  red  entitled  auto  \\\n",
+       "0            0      0     0   0         0       0    0         0     0   \n",
+       "1            0      0     0   0         0       0    0         0     0   \n",
+       "2            0      0     0   0         0       0    0         0     0   \n",
+       "3            0      0     0   0         0       0    0         0     0   \n",
+       "4            0      0     0   0         0       0    0         0     0   \n",
        "\n",
-       "   lyf  officially  again  \n",
-       "0    0           0      0  \n",
-       "1    0           0      0  \n",
-       "2    0           0      0  \n",
-       "3    0           0      0  \n",
-       "4    0           0      0  \n",
+       "   bothering  \n",
+       "0          0  \n",
+       "1          0  \n",
+       "2          0  \n",
+       "3          0  \n",
+       "4          0  \n",
        "\n",
        "[5 rows x 7783 columns]"
       ]
@@ -717,25 +716,25 @@
        "      <th></th>\n",
        "      <th>Label</th>\n",
        "      <th>SMS</th>\n",
-       "      <th>woke</th>\n",
-       "      <th>gr8</th>\n",
-       "      <th>forum</th>\n",
-       "      <th>bettr</th>\n",
-       "      <th>std</th>\n",
-       "      <th>pobox334</th>\n",
-       "      <th>wap</th>\n",
-       "      <th>kalstiya</th>\n",
+       "      <th>ticket</th>\n",
+       "      <th>kappa</th>\n",
+       "      <th>too</th>\n",
+       "      <th>abdomen</th>\n",
+       "      <th>unhappy</th>\n",
+       "      <th>hoody</th>\n",
+       "      <th>start</th>\n",
+       "      <th>die</th>\n",
        "      <th>...</th>\n",
-       "      <th>sterm</th>\n",
-       "      <th>click</th>\n",
-       "      <th>person2die</th>\n",
-       "      <th>amused</th>\n",
-       "      <th>box434sk38wp150ppm18</th>\n",
-       "      <th>bcaz</th>\n",
-       "      <th>lodging</th>\n",
-       "      <th>lyf</th>\n",
-       "      <th>officially</th>\n",
-       "      <th>again</th>\n",
+       "      <th>09058095201</th>\n",
+       "      <th>chase</th>\n",
+       "      <th>thru</th>\n",
+       "      <th>ru</th>\n",
+       "      <th>xclusive</th>\n",
+       "      <th>fellow</th>\n",
+       "      <th>red</th>\n",
+       "      <th>entitled</th>\n",
+       "      <th>auto</th>\n",
+       "      <th>bothering</th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
@@ -865,26 +864,26 @@
        "</div>"
       ],
       "text/plain": [
-       "  Label                                                SMS  woke  gr8  forum  \\\n",
-       "0   ham                  [yep, by, the, pretty, sculpture]     0    0      0   \n",
-       "1   ham  [yes, princess, are, you, going, to, make, me,...     0    0      0   \n",
-       "2   ham                    [welp, apparently, he, retired]     0    0      0   \n",
-       "3   ham                                           [havent]     0    0      0   \n",
-       "4   ham  [i, forgot, 2, ask, ü, all, smth, there, s, a,...     0    0      0   \n",
+       "  Label                                                SMS  ticket  kappa  \\\n",
+       "0   ham                  [yep, by, the, pretty, sculpture]       0      0   \n",
+       "1   ham  [yes, princess, are, you, going, to, make, me,...       0      0   \n",
+       "2   ham                    [welp, apparently, he, retired]       0      0   \n",
+       "3   ham                                           [havent]       0      0   \n",
+       "4   ham  [i, forgot, 2, ask, ü, all, smth, there, s, a,...       0      0   \n",
        "\n",
-       "   bettr  std  pobox334  wap  kalstiya  ...  sterm  click  person2die  amused  \\\n",
-       "0      0    0         0    0         0  ...      0      0           0       0   \n",
-       "1      0    0         0    0         0  ...      0      0           0       0   \n",
-       "2      0    0         0    0         0  ...      0      0           0       0   \n",
-       "3      0    0         0    0         0  ...      0      0           0       0   \n",
-       "4      0    0         0    0         0  ...      0      0           0       0   \n",
+       "   too  abdomen  unhappy  hoody  start  die  ...  09058095201  chase  thru  \\\n",
+       "0    0        0        0      0      0    0  ...            0      0     0   \n",
+       "1    0        0        0      0      0    0  ...            0      0     0   \n",
+       "2    0        0        0      0      0    0  ...            0      0     0   \n",
+       "3    0        0        0      0      0    0  ...            0      0     0   \n",
+       "4    0        0        0      0      0    0  ...            0      0     0   \n",
        "\n",
-       "   box434sk38wp150ppm18  bcaz  lodging  lyf  officially  again  \n",
-       "0                     0     0        0    0           0      0  \n",
-       "1                     0     0        0    0           0      0  \n",
-       "2                     0     0        0    0           0      0  \n",
-       "3                     0     0        0    0           0      0  \n",
-       "4                     0     0        0    0           0      0  \n",
+       "   ru  xclusive  fellow  red  entitled  auto  bothering  \n",
+       "0   0         0       0    0         0     0          0  \n",
+       "1   0         0       0    0         0     0          0  \n",
+       "2   0         0       0    0         0     0          0  \n",
+       "3   0         0       0    0         0     0          0  \n",
+       "4   0         0       0    0         0     0          0  \n",
        "\n",
        "[5 rows x 7785 columns]"
       ]
@@ -908,17 +907,25 @@
     "We're now done with cleaning the training set, and we can begin creating the spam filter. The Naive Bayes algorithm will need to answer these two probability questions to be able to classify new messages:\n",
     "\n",
     "\\begin{equation}\n",
-    "P(Spam | w_1,w_2, ..., w_n) \\propto P(Spam) \\cdot \\prod_{i=1}^{n}P(w_i|Spam) \\\\\\\n",
+    "P(Spam | w_1,w_2, ..., w_n) \\propto P(Spam) \\cdot \\prod_{i=1}^{n}P(w_i|Spam)\n",
+    "\\end{equation}\n",
+    "\n",
+    "\\begin{equation}\n",
     "P(Ham | w_1,w_2, ..., w_n) \\propto P(Ham) \\cdot \\prod_{i=1}^{n}P(w_i|Ham)\n",
     "\\end{equation}\n",
     "\n",
+    "\n",
     "Also, to calculate P(w<sub>i</sub>|Spam) and P(w<sub>i</sub>|Ham) inside the formulas above, we'll need to use these equations:\n",
     "\n",
     "\\begin{equation}\n",
-    "P(w_i|Spam) = \\frac{N_{w_i|Spam} + \\alpha}{N_{Spam} + \\alpha \\cdot N_{Vocabulary}} \\\\\\\n",
+    "P(w_i|Spam) = \\frac{N_{w_i|Spam} + \\alpha}{N_{Spam} + \\alpha \\cdot N_{Vocabulary}}\n",
+    "\\end{equation}\n",
+    "\n",
+    "\\begin{equation}\n",
     "P(w_i|Ham) = \\frac{N_{w_i|Ham} + \\alpha}{N_{Ham} + \\alpha \\cdot N_{Vocabulary}}\n",
     "\\end{equation}\n",
     "\n",
+    "\n",
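-    "Some of the terms in the four equations above will have the same value for every new message. We can calculate the value of these terms once and avoid doing the computations again when a new messages comes in. Below, we'll use our training set to calculate:\n",
+    "Some of the terms in the four equations above will have the same value for every new message. We can calculate the value of these terms once and avoid doing the computations again when a new message comes in. Below, we'll use our training set to calculate:\n",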
     "\n",
     "- P(Spam) and P(Ham)\n",
@@ -933,15 +940,23 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "# Isolating spam and ham messages first\n",
+    "spam_messages = training_set_clean[training_set_clean['Label'] == 'spam']\n",
+    "ham_messages = training_set_clean[training_set_clean['Label'] == 'ham']\n",
+    "\n",
     "# P(Spam) and P(Ham)\n",
-    "p_spam = training_set_clean['Label'].value_counts(normalize=True)['spam']\n",
-    "p_ham = training_set_clean['Label'].value_counts(normalize=True)['ham']\n",
+    "p_spam = len(spam_messages) / len(training_set_clean)\n",
+    "p_ham = len(ham_messages) / len(training_set_clean)\n",
+    "\n",
+    "# N_Spam\n",
+    "n_words_per_spam_message = spam_messages['SMS'].apply(lambda x: len(x))\n",
+    "n_spam = n_words_per_spam_message.sum()\n",
     "\n",
-    "# N_Spam, N_Ham, and N_Vocabulary\n",
-    "n_spam = training_set_clean[training_set_clean['Label'] == 'spam'].sum(\n",
-    "                                                    axis=1).sum()\n",
-    "n_ham = training_set_clean[training_set_clean['Label'] == 'ham'].sum(\n",
-    "                                                    axis=1).sum()\n",
+    "# N_Ham\n",
+    "n_words_per_ham_message = ham_messages['SMS'].apply(lambda x: len(x))\n",
+    "n_ham = n_words_per_ham_message.sum()\n",
+    "\n",
+    "# N_Vocabulary\n",
     "n_vocabulary = len(vocabulary)\n",
     "\n",
     "# Laplace smoothing\n",
@@ -959,7 +974,10 @@
     "The parameters are calculated using the formulas:\n",
     "\n",
     "\\begin{equation}\n",
-    "P(w_i|Spam) = \\frac{N_{w_i|Spam} + \\alpha}{N_{Spam} + \\alpha \\cdot N_{Vocabulary}} \\\\\\\n",
+    "P(w_i|Spam) = \\frac{N_{w_i|Spam} + \\alpha}{N_{Spam} + \\alpha \\cdot N_{Vocabulary}}\n",
+    "\\end{equation}\n",
+    "\n",
+    "\\begin{equation}\n",
     "P(w_i|Ham) = \\frac{N_{w_i|Ham} + \\alpha}{N_{Ham} + \\alpha \\cdot N_{Vocabulary}}\n",
     "\\end{equation}"
    ]
@@ -974,18 +992,13 @@
     "parameters_spam = {unique_word:0 for unique_word in vocabulary}\n",
     "parameters_ham = {unique_word:0 for unique_word in vocabulary}\n",
     "\n",
-    "# Isolate spam and ham messages before starting the loop below\n",
-    "# Don't do this inside the loop, it'll add to code running time significantly\n",
-    "spam_messages = training_set_clean[training_set_clean['Label'] == 'spam']\n",
-    "ham_messages = training_set_clean[training_set_clean['Label'] == 'ham']\n",
-    "\n",
     "# Calculate parameters\n",
     "for word in vocabulary:\n",
-    "    n_word_given_spam = spam_messages[word].sum()\n",
+    "    n_word_given_spam = spam_messages[word].sum()   # spam_messages already defined in a cell above\n",
     "    p_word_given_spam = (n_word_given_spam + alpha) / (n_spam + alpha*n_vocabulary)\n",
     "    parameters_spam[word] = p_word_given_spam\n",
     "    \n",
-    "    n_word_given_ham = ham_messages[word].sum()\n",
+    "    n_word_given_ham = ham_messages[word].sum()   # ham_messages already defined in a cell above\n",
     "    p_word_given_ham = (n_word_given_ham + alpha) / (n_ham + alpha*n_vocabulary)\n",
     "    parameters_ham[word] = p_word_given_ham"
    ]