Commit
comments to classifier notebook
Signed-off-by: Krzysztof Nowak <[email protected]>
Krzysztof Nowak committed May 29, 2018
1 parent ba36fdc commit 2ae5834
Showing 1 changed file with 66 additions and 77 deletions.
143 changes: 66 additions & 77 deletions records-sklearn.ipynb
@@ -1,5 +1,14 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## This notebook trains SPAM classifier for Zenodo records.\n",
"\n",
"Run the cells in sequence to train the model on the data. Some steps are optional or used for experimentation."
]
},
{
"cell_type": "code",
"execution_count": 1,
@@ -24,79 +33,65 @@
]
},
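The imports cell above is collapsed in this view. Below is a minimal sketch of the imports the visible code appears to rely on; the TfidfTransformer, classifier, and train/test split imports are assumptions, since the corresponding code is truncated in this diff:

    import json
    from collections import Counter

    from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
    from sklearn.naive_bayes import MultinomialNB           # assumed classifier
    from sklearn.model_selection import train_test_split    # assumed split helper
    from sklearn.pipeline import Pipeline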
{
"cell_type": "code",
"execution_count": 2,
"cell_type": "markdown",
"metadata": {},
"outputs": [],
"source": [
"with open(\"./data/zenodo_open_metadata_17_05_2018.txt\", \"r\") as fp:\n",
" data = [json.loads(l) for l in fp.readlines()]"
"## Load the data\n",
"\n",
"Loads the previously dumped data (TXT). Each line in the file is single record's metadata."
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"413079"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"len(data)"
"# Update the filename accordingly\n",
"FILENAME = \"./data/zenodo_open_metadata_17_05_2018.txt\"\n",
"\n",
"with open(FILENAME, \"r\") as fp:\n",
" data = [json.loads(l) for l in fp.readlines()]\n",
"print(\"Loaded metadata of {} records\".format(len(data)))"
]
},
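For orientation, each parsed line is a dict with one record's metadata. The example below is hypothetical and only shows the fields the later cells rely on ('recid', 'owners', 'spam', 'description'); real records carry the full Zenodo metadata:

    # Hypothetical record; values are illustrative, not taken from the dump.
    example_record = {
        "recid": 123456,                     # record identifier
        "owners": [42],                      # user IDs of the record owners
        "spam": False,                       # existing SPAM label
        "description": "Some description text ...",
    }
    print(example_record["owners"][0], example_record["spam"])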
{
"cell_type": "code",
"execution_count": 7,
"cell_type": "markdown",
"metadata": {},
"outputs": [],
"source": [
"spam_owners = {}"
"## Optional step: Manually mark some spammers\n",
"\n",
"Next cell allows for manually marking some Users as spammers. Provide User IDs (int) of record owners, which records should be marked as SPAM."
]
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"spam_owners = {} # Manually mark some User IDs as SPAMmers\n",
"for d in data:\n",
" owner = d['owners'][0] if d['owners'] else None\n",
" if owner in spam_owners and not d['spam']:\n",
" d['spam'] = True"
" d['spam'] = True\n",
" \n",
"spamcnt = Counter([d['spam'] for d in data])\n",
"print(\"SPAM: {0}, Non-SPAM: {1}\".format(spamcnt[True], spamcnt[False]))"
]
},
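In the cell above, `spam_owners` is left empty. A short sketch of how it could be filled in; the IDs are hypothetical, and a set works just as well as a dict for the `in` membership test:

    spam_owners = {101, 202, 303}  # hypothetical user IDs of known spammers

    for d in data:
        owner = d['owners'][0] if d['owners'] else None
        if owner in spam_owners and not d['spam']:
            d['spam'] = True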
{
"cell_type": "code",
"execution_count": 8,
"cell_type": "markdown",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Counter({False: 404648, True: 8472})"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"Counter([d['spam'] for d in data])"
"## Train the model\n",
"\n",
"Train the model on the SPAM label. You can experiment with parameters here, and observe the accuracy on the test set (Spam->Spam, Ham->Ham values)."
]
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": 7,
"metadata": {},
"outputs": [
{
@@ -121,12 +116,11 @@
"\n",
"ngram_range=(1, 1)\n",
"\n",
"# Build spam vocabulary from the training dataset\n",
"X_train_spam = [feat_tr(d) for d in X_train_full if d['spam']]\n",
"count_vect = CountVectorizer(ngram_range=ngram_range, max_features=8000)\n",
"count_vect.fit_transform(X_train_spam)\n",
"vocabulary = count_vect.vocabulary_\n",
"vocabulary = None\n",
"## Alternatively you can experiment with building a spam vocabulary from the training dataset\n",
"# X_train_spam = [feat_tr(d) for d in X_train_full if d['spam']]\n",
"# count_vect = CountVectorizer(ngram_range=ngram_range, max_features=8000)\n",
"# count_vect.fit_transform(X_train_spam)\n",
"# vocabulary = count_vect.vocabulary_\n",
"\n",
"text_clf = Pipeline([\n",
" ('vect', CountVectorizer(max_features=8000, ngram_range=ngram_range)),\n",
@@ -146,18 +140,27 @@
"print(\"Accuracy: {0:.4f}\".format((c[(False, False)] + c[(True, True)] ) / (len(acc))))"
]
},
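Parts of the training cell are collapsed in this diff: the `feat_tr` feature transform, the train/test split, and the tail of the pipeline are not visible. The following is a self-contained sketch, building on the `data` list loaded earlier, of what such a step could look like; the TF-IDF transform, the MultinomialNB classifier, the split parameters, and the body of `feat_tr` are assumptions rather than the notebook's actual choices:

    from collections import Counter

    from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
    from sklearn.model_selection import train_test_split
    from sklearn.naive_bayes import MultinomialNB
    from sklearn.pipeline import Pipeline

    def feat_tr(d):
        # Assumed feature transform: concatenate the record's text fields.
        return " ".join(filter(None, [d.get("title"), d.get("description")]))

    # Illustrative split; the notebook's actual test_size/random_state are not visible.
    X_train_full, X_test_full = train_test_split(data, test_size=0.2, random_state=42)

    ngram_range = (1, 1)

    text_clf = Pipeline([
        ("vect", CountVectorizer(max_features=8000, ngram_range=ngram_range)),
        ("tfidf", TfidfTransformer()),   # assumed; the pipeline tail is truncated above
        ("clf", MultinomialNB()),        # assumed classifier
    ])

    text_clf.fit([feat_tr(d) for d in X_train_full],
                 [d["spam"] for d in X_train_full])

    # Evaluate on the held-out part, mirroring the metrics printed above.
    labels = [d["spam"] for d in X_test_full]
    y_pred = text_clf.predict([feat_tr(d) for d in X_test_full])
    acc = list(zip(labels, y_pred))
    c = Counter(acc)
    print("Spam->Spam: {0:.4f}".format(c[(True, True)] / (c[(True, True)] + c[(True, False)])))
    print("Ham -> Ham: {0:.4f}".format(c[(False, False)] / (c[(False, False)] + c[(False, True)])))
    print("Accuracy: {0:.4f}".format((c[(False, False)] + c[(True, True)]) / len(acc)))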
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Check accuracy again on the full data.\n",
"\n",
"This contains biased because some data was used for training the model."
]
},
{
"cell_type": "code",
"execution_count": 71,
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Counter({(False, False): 404185, (True, True): 8009, (True, False): 458, (False, True): 427})\n",
"Spam->Spam: 0.9459\n",
"Ham -> Ham: 0.9989\n",
"Counter({(False, False): 404241, (True, True): 8015, (True, False): 457, (False, True): 407})\n",
"Spam->Spam: 0.9461\n",
"Ham -> Ham: 0.9990\n",
"Accuracy: 0.9979\n"
]
}
@@ -170,9 +173,16 @@
"print(c)\n",
"print(\"Spam->Spam: {0:.4f}\".format(c[(True, True)] / (c[(True, True)] + c[(True, False)])))\n",
"print(\"Ham -> Ham: {0:.4f}\".format(c[(False, False)] / (c[(False, False)] + c[(False, True)])))\n",
"print(\"Accuracy: {0:.4f}\".format((c[(False, False)] + c[(True, True)] ) / (len(acc))))\n",
"acc = [idx for idx, (ref, pred) in enumerate(zip(labels, y_pred)) if (ref, pred) == (False, True)]\n",
"spammy_stuff = [(data[idx]['recid'], data[idx]['description']) for idx in acc]"
"print(\"Accuracy: {0:.4f}\".format((c[(False, False)] + c[(True, True)] ) / (len(acc))))\n"
]
},
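The top of the cell above is collapsed; it presumably computes predictions over the whole dataset before the prints shown. A sketch of that step, using the same names that appear in the visible code (for an unbiased estimate, sklearn's cross_val_predict could be used instead of predicting with the already-fitted pipeline):

    # Predict on every record, then tally (true label, prediction) pairs.
    labels = [d["spam"] for d in data]
    y_pred = text_clf.predict([feat_tr(d) for d in data])
    acc = list(zip(labels, y_pred))
    c = Counter(acc)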
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Investigate the records\n",
"\n",
"The next cell allows you to take a peek at false negatives (i.e.: SPAM which slipped through the filter)."
]
},
{
@@ -1898,35 +1908,14 @@
}
],
"source": [
"acc = [idx for idx, (ref, pred) in enumerate(zip(labels, y_pred)) if (ref, pred) == (False, True)]\n",
"spammy_stuff = [(data[idx]['recid'], data[idx]['description']) for idx in acc]\n",
"\n",
"for rec in spammy_stuff:\n",
" print(rec)\n",
" print(\"\\n\\n\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"Counter({(False, False): 133452, (True, True): 2603, (True, False): 158, (False, True): 104})\n",
"Spam->Spam: 0.9428\n",
"Ham -> Ham: 0.9992\n",
"Accuracy: 0.9981"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"Counter({(False, False): 58790, (True, True): 1808, (True, False): 63, (False, True): 59})\n",
"Spam->Spam: 0.9663\n",
"Ham -> Ham: 0.9990\n",
"Accuracy: 0.9980"
]
},
{
"cell_type": "code",
"execution_count": 10,
