From 1f6ad3f51cacd21bb57fef8425b39649d4febb01 Mon Sep 17 00:00:00 2001 From: chengjun Date: Mon, 22 Apr 2019 16:32:56 +0800 Subject: [PATCH] update --- ...ne-learning-with-sklearn-checkpoint.ipynb} | 2 +- .../09.08-Random-Forests-checkpoint.ipynb | 1143 ++++++ ...-machine-learning-summary-checkpoint.ipynb | 3055 +++++++++++++++++ ...ine_learning_with_sklearn-checkpoint.ipynb | 3015 ---------------- .../09.01-machine-learning-with-sklearn.ipynb | 2200 ++++++++++++ code/09.06-Linear-Regression.ipynb | 1886 ++++++++++ ...nb => 09.07-Support-Vector-Machines.ipynb} | 0 code/09.08-Random-Forests.ipynb | 21 +- code/09.09-machine-learning-summary.ipynb | 3055 +++++++++++++++++ code/09.machine_learning_with_sklearn.ipynb | 3016 ---------------- 10 files changed, 11341 insertions(+), 6052 deletions(-) rename code/{09.machine_learning_introduction.ipynb => .ipynb_checkpoints/09.01-machine-learning-with-sklearn-checkpoint.ipynb} (99%) create mode 100755 code/.ipynb_checkpoints/09.08-Random-Forests-checkpoint.ipynb create mode 100644 code/.ipynb_checkpoints/09.09-machine-learning-summary-checkpoint.ipynb delete mode 100644 code/.ipynb_checkpoints/09.machine_learning_with_sklearn-checkpoint.ipynb create mode 100755 code/09.01-machine-learning-with-sklearn.ipynb create mode 100755 code/09.06-Linear-Regression.ipynb rename code/{09.06-Support-Vector-Machines.ipynb => 09.07-Support-Vector-Machines.ipynb} (100%) create mode 100644 code/09.09-machine-learning-summary.ipynb delete mode 100644 code/09.machine_learning_with_sklearn.ipynb diff --git a/code/09.machine_learning_introduction.ipynb b/code/.ipynb_checkpoints/09.01-machine-learning-with-sklearn-checkpoint.ipynb similarity index 99% rename from code/09.machine_learning_introduction.ipynb rename to code/.ipynb_checkpoints/09.01-machine-learning-with-sklearn-checkpoint.ipynb index dbfa758..cbb64ac 100755 --- a/code/09.machine_learning_introduction.ipynb +++ 
b/code/.ipynb_checkpoints/09.01-machine-learning-with-sklearn-checkpoint.ipynb @@ -20,7 +20,7 @@ }, "source": [ "\n", - "\n", + "\n", "*This notebook contains an excerpt from the [Python Data Science Handbook](http://shop.oreilly.com/product/0636920034919.do) by Jake VanderPlas; the content is available [on GitHub](https://github.com/jakevdp/PythonDataScienceHandbook).*\n", "\n", "*The text is released under the [CC-BY-NC-ND license](https://creativecommons.org/licenses/by-nc-nd/3.0/us/legalcode), and code is released under the [MIT license](https://opensource.org/licenses/MIT). If you find this content useful, please consider supporting the work by [buying the book](http://shop.oreilly.com/product/0636920034919.do)!*" diff --git a/code/.ipynb_checkpoints/09.08-Random-Forests-checkpoint.ipynb b/code/.ipynb_checkpoints/09.08-Random-Forests-checkpoint.ipynb new file mode 100755 index 0000000..ac229a0 --- /dev/null +++ b/code/.ipynb_checkpoints/09.08-Random-Forests-checkpoint.ipynb @@ -0,0 +1,1143 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "# In-Depth: Decision Trees and Random Forests" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "source": [ + "\n", + "\n", + "*This notebook contains an excerpt from the [Python Data Science Handbook](http://shop.oreilly.com/product/0636920034919.do) by Jake VanderPlas; the content is available [on GitHub](https://github.com/jakevdp/PythonDataScienceHandbook).*\n", + "\n", + "*The text is released under the [CC-BY-NC-ND license](https://creativecommons.org/licenses/by-nc-nd/3.0/us/legalcode), and code is released under the [MIT license](https://opensource.org/licenses/MIT). 
If you find this content useful, please consider supporting the work by [buying the book](http://shop.oreilly.com/product/0636920034919.do)!*" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "source": [ + "\n", + "< [In-Depth: Support Vector Machines](05.07-Support-Vector-Machines.ipynb) | [Contents](Index.ipynb) | [In Depth: Principal Component Analysis](05.09-Principal-Component-Analysis.ipynb) >" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "Previously\n", + "\n", + "- simple generative classifier (naive Bayes; see [In Depth: Naive Bayes Classification](05.05-Naive-Bayes.ipynb)) \n", + "- powerful discriminative classifier (support vector machines; see [In-Depth: Support Vector Machines](05.07-Support-Vector-Machines.ipynb)).\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "\n", + "\n", + "\n", + "## *Random Forests*\n", + "- Another powerful & non-parametric algorithm \n", + "- Random forests are an example of an **ensemble method**, \n", + " - meaning that it relies on aggregating the results of an ensemble of simpler estimators.\n", + "\n", + "The sum can be greater than the parts: \n", + "- a majority vote among a number of estimators can end up being better than any of the individual estimators doing the voting!\n", + "\n", + "We will see examples of this in the following sections." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "## Motivating Random Forests: Decision Trees" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "ExecuteTime": { + "end_time": "2018-12-26T06:58:23.100831Z", + "start_time": "2018-12-26T06:58:21.786163Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [], + "source": [ + "%matplotlib inline\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns; sns.set()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "Random forests are an example of an *ensemble learner* built on decision trees.\n", + "- For this reason we'll start by discussing decision trees.\n", + "\n", + "Decision trees are extremely intuitive ways to classify or label objects: \n", + "- you simply ask a series of questions designed to zero-in on the classification.\n", + "\n", + "For example, if you wanted to build a decision tree to classify an animal you come across while on a hike, you might construct the one shown here:" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "Can you guess which animal I am thinking of?" 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "source": [ + "![](figures/05.08-decision-tree.png)\n", + "\n", + "[figure source in Appendix](06.00-Figure-Code.ipynb#Decision-Tree-Example)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "The binary splitting makes this extremely efficient: in a well-constructed tree, \n", + "- each question will cut the number of options by approximately half, \n", + "- very quickly narrowing the options even among a large number of classes.\n", + "\n", + "The trick comes in deciding which questions to ask at each step.\n", + "\n", + "Using axis-aligned splits in the data: \n", + "- each node in the tree splits the data into two groups using a cutoff value within one of the features.\n", + "\n", + "Let's now look at an example of this." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "### Creating a decision tree\n", + "\n", + "Consider the following two-dimensional data, which has one of four class labels:" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "ExecuteTime": { + "end_time": "2018-12-26T06:58:23.571323Z", + "start_time": "2018-12-26T06:58:23.103258Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [ + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAW8AAAD3CAYAAADSftWOAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4xLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvDW2N/gAAIABJREFUeJzsnXd4FVX6xz/TbkshAULv7dKrIkgVERurYi9rQbCvvevuWn6ua1vsFQTFriA2FBVREJQiICAl9N4CpN825fz+uJDkcueGJCSQwHyeJ8+TzJk5c2Zy73fOvOctkhACBwcHB4eahXy0B+Dg4ODgUH4c8XZwcHCogTji7eDg4FADccTbwcHBoQbiiLeDg4NDDUQ9UifKysqvEreW9HQf2dmBqui6xuPcm8Q49yYxzr1JzNG4NxkZKZLd9ho/81ZV5WgPodri3JvEOPcmMc69SUx1ujc1XrwdHBwcjkcc8XZwcHCogTji7eDg4FADccTbwcHBoQbiiPfximGgZK5C3rnjaI/EwcGhAjjifRzieedt0oYOJH1Ab9L79CT1sguQ16w+2sNycHAoB0fMz9uheuD6+kuSHvsncmEhAFKgEPdPPyJnZZHz3U+gaUd5hA4ODmXBmXkfZ3g+/bBIuEuiLf0T9ycfHoUROTg4VARHvI8z5B3bE7YpG9YfwZE4ODgcDo54H2dYDRombmve4sgNxMHB4bAos83b7/efBDydmZk52O/3twHeAQTwF3BLZmamVTVDdKhMQhdfhjb7V+RArOlE79qN0KVXHKVROTg4lJcyzbz9fv99wDjAs3/TGOCfmZmZAwAJOLdqhudQ2UTOGUHhvx5Db98RAMvrI3zKqeS9OhZcrqM8OgcHh7JS1pn3OuB84L39f/cCZu7//TtgGDClcofmUFWERl1P6KqRKKtWItLSsJo2O9pDcnBwKCdlEu/MzMzJfr+/RYlNUmZm5oEUr/lArUP1kZ7uq7KMXBkZKVXS77FAqfemUb8jN5BqiPO5SYxzbxJTXe5NRf28S9q3U4CcQx1QVTlwMzJSyMrKr5K+azrOvUmMc28S49ybxByNe5PoYVFRb5PFfr9/8P7fzwR+rWA/DscJUm4O7smfov72K4gqqcvh4HBcUdGZ993AWL/f7wJWApMqb0gOxxRC4HvycTyffoSyYztCVdF79KLwiacwevQ62qNzcKixlFm8MzMzNwJ99v++GhhURWNyOIbwTBiL75UXkEwTAMkwcC2Yh3zXrWR//4vj4eLgUEGcIB2HKsU19esi4S6Juvwv3J9+dBRG5OBwbOAkpnKwRwi0H77H/f1UME30vv0IX3QpKOXzGJL37EnYpmzberijdHA4bnHE2yEeIUi6/y6877+LZBgAeD56H/d335A3bmK5Mg9aTZvByuXxp5BljM5dK23IDg7HG47ZxCEObcZPeD+YWCTcEA2jdX83Fe+4N8vVV/DKq7FSU+O26337ETlr+OEOFXXGdFLPOYO0fieQdtogkh68Byl732H36+BQ3XHEu7oiBNLu3VBQcNj9EAiAVfbUM+5pU5F03bZN+312uU6vn34WBc88T+SkPli1amE2bkzowkvIe/s9kKRy9VUSKSeb1EtGkHbZBbjn/oa2ZjXaksX43n6L1L9fAqFQdMdgEEo8hBwcjhUcs0k1xP3xh3jfGRcNX09ORu/bj4InnkLUb1C+jl54gbR330Peshmrbl0iw84k8NC/D223tuIXGIuwWXw8FOHzLyI84kKk3ByExwsez6EPOgTJ996B++efbNtcC+aRfM/tKFu3oGSuit7Dk/tT+H//RaQeMhjYwaFG4Ih3NcP17dckP3wvcv7+KK5AIcqXnyPv3kXulKkgl+1lyfPqS/DkY2j7Z9DK3j1omauQ8nIpfPaFUo+NDDwFzwcTkWxm63qvE8t3QQeQJERaesWOPbirXbtwzfy51H3cUyYhH3h72LsHddNG5O3byPv0i8O
a8Ts4VBccs0k1w/Ph+8XCXQJt7m+4vv6ybJ2YJp5Jn4CN6cP97ddRc0wpRM45j/A5I+K3n9QXkZyMZ9ybSLmHzIhQZchbNiPnlH5+2ebaXXN+Rfvh+6oaloPDEcUR72qGvHWz7XZJCFQbrw3bffftQ9m4wbZNycpCmzvnEB1I5L8+jvynxxAafi7hM84mNOxMlC1bSPnnA6Q8dC/pg/riee3lMo2nsjHbd8Bs0jRhu0gws5YMA+3PhVU1LAeHI4pjNqlmWHUzEraVJlglEampiNp1oDB+sdPy+jDb+g/diaIQGjma0MjRqAsXUOuSEch5ecXN27eR9MyTGF27YfQfWKZx2WJZeJ9/Fvf33yLv2YPZvAWhy/5O+OLLDroogevzz3DN+BFJNzBatETeuoWDZdryerEy6qFu3mR/uoaNKj7WkoRCyNn7sOrUdaJEHY4KjnhXM8LnjMD1+5w4bw+9c5d4QUuE203klCF4J06Ia9JP7o/ZoWO5xuT5+IMY4T6AHCjEM+kTCg5DvJMevg/v228VibCydQvqooUQDhO+8proRiFIvuMWPB9/gFQiqZXRvAXC40XesQ3cHoz2Hch/6XXc33xJ8iMPx9ns9Q6dDr9akK6T9Ng/cX0/DXnXTqwmTQgPP5fAA/8s83qEg0Nl4Ih3NSN81UjkXTvxfvwBypbNCE1DP6E3Bf/333LN8AqeeBpvOID17bfI+fkIt5tIn5PJ/9+L5R6TlJubsE3OS9x2yH5378b91ZS42bMcDOD9YCLhv18NkoT2w/d4Pvs4RrgB1E0bKXjo34RG34DwJRWJZ+j6m5G3b8Mz+TOU3buiAUHdelDwn6cPe5ac9NC9+N4dXzzWtWtQXngOhCDw8COH1beDQ3k4fsRbCDyvv4LnqynIu3ZgNmxMeMQFhK676WiPLI7gvQ8SvOlWtF9nYtVvgNmjZ/k9JDwe+PhjsmcvwPX7HIxOnTFOPKlC4zHbtE3YZrRsU6E+AVw/T0fJyrJtU9atRcrPQ6TWwj39+5iAoZJoC+YRvOOe2I2SROCxJwn+407c077BbNgYfcjQw54ZS3v34v5uavx2wP3NFwTueQDc7sM6h4NDWTluxNv77JMkPf9cUZIkZds2tD8XIRUGCN5x91EenQ1JSUiRCL6xryOFghiduxK44RZITi5XN5a/PSF/+8MaSvCGm3F9+w3air9ituvt2hO86R/I27bi/uwTkGVCF1+KKKVCfUnMFi0RLhdSJBI/7lppCK/vsMYtMjIIXTnysPooifrXUpTdu2zblE2bomaUZs0r7XwODqVxfBjpAgE8n30al91OMgw8n34ENuJxtEn61wOk3jQKz+RPcU/9mqSn/0Oti85B2nfkQ79FrTTyJrxP8IKLMVq2xGjRkuAFF5E3/j0874wjbegAkp98jOQnHiF9SH+8L/6vTP0avfugn9Dbtk0fMKgoh0p46OkI1X6eoZ/Yp2IXVQHMNm2xaqXZtlkNGkQXLx0cjhDHhXgrK5ajbkrgOrd2NcqmjUd2QIdAWbIYz/sT40wFroV/4Hv+2aMyJqtlKwpeH0f2vCVkz19Cwetvo2zZQtJLY1D27i3aT9mThe/5Z1Fnzzp0p5JE/tNjiJzYG7HfpCE8XsJnnEXBE08V7aYPO53QxZfFuQCGTxlK8KZ/VM4FlgGrcRMig06xbYsMGQZJSUdsLA4Ox4XZRDRqhJWSYhv8YtWujVWnzlEYVWLcX3+JHCi0bVMX/XGER5MY95eTkQ7kECmBHAjgmfxZvBdKJILvqSdwzZyBlJeH2bYdwZHXkfv1D7imTUXZsB6990kYB8+mJYmC518hMnAwrp9+QNIN9N59CF018oi76RU8/3LUbXHmDOS8PMzadYgMHRZdDHVwOIIcF+JtNWpMpP9APDaLTXr/QVGf6OqEVMoLUTVyR5Nt/MgPINm0pdxyPZ4vPy/6W920EXXRQvJfeZPIWX8r/WSSROT8i4icf1H5BypE2Rd8hcAzfiz
ur6YgZ+3GbNyU8KWXE77g4mhzSir5b09E3rgRZdUKjG7dEZXlO+7gUA6OC/EGKHjmeeSCArS5vyHpOsLlItKvPwVPlc0+eyQJnX8h3rffQLbJKFjh3CIVRQiUNasRioLVqnWxCAqBtG1bwsPM9h1i/lYXL8T1w7S4/ZR9e/FOGIs+dBgAUkE+AqncC7N2aNN/wDv2ddT9Cb4i/QZS+OgT4Eu8EOp99kmSXvhfkclKXbsGbcE8pPx8QteMKtrPatECq0WLwx6jg0NFOW7EW9RvQO6kr9Bm/Yy6Yjl6124Y/QYi7diO739Pg64TPms4ZtfulXpeee0avOPeQNm2FatOXUKXXoHR5+RSj7E6dCQ46np8b7yKFA4XbY/0OZng3fdV6vhKw/XtN/heGoO6ZDFIEnqvEym85wGMQafg+vpLtKV/2h5ntGhF4PqbY7Zps2YiBwO2+ytr16AumIdvzDOoixeBJGH0OoHC+x6q8P9DnfkzKbfeiLK3uJKPumY1ytbN5H3wmf1BBQV4P/kobq1BDhTief/dqJmmGr35OBzfHDfiDUQFaNAQ9EFDAPC8+Sq+F8eg7In6GnvfeJXQxZdS+PSYSsk8p86ZTeo/ro8p9+We+hUFj/6H8BVXlXps4OFH0fsOwP3NF0jBIHr3HoSuHlUp6VTLgvLXMpLvuzPGNc4173eUO/9Bztff45o2NaHvtd6jZ9zM2apXL+G5hNtD6k2jUUqEtCs/TENZt5acr75HZCROGZAI7zvjYoS76Bpm/ow2Yzpcen5cmzb/d5Qt9rll1NWrkHfuwGrUuNxjcXCoCo7baYS8cgVJzz1VJNwQnWF5J07A/d47lXIO3wvPxdVplHNz8b3+cpncE/Uhp1Iw5mXyXx9H6IZbjphwA3gnjrf1aVa2bsEz7k1km4XKA0g2s9PwOSOwEo1f12OE+wDqurV433qt7IMuOc716+3HputoCxfYtlkNGyMSjNFKTUWkpFRoLA4OVcHxKd66TsptNyHbhH1LloX7px8O+xRSbg7an4ts29TVmWgJCgmUCV3H++IYUi8+j1rnnUXSvx5EOjhS0TRh06YKlwSTd+xI2Kbs2oFRymzY6BZv6nB/MTmh4MsF8V5ARedKMBM+FCLN3h8bSOiPbXboiN7b3m9cP7k/IiW+nJuDw9Hi+DKb7Cf5zltxLVmceIeAvW22XMgyIkHFGgGgVfDWC0HKDSPxfPNV0SbXb7PRfp9N7kefIzIy8Ewcj+fd8bBqJbU1DZGcjNm0OUa37gRvug2r+f4owIJ8kp55Em3+PDAMjG49KLzzHkSTplgNE0dJyuvX41q10rYt0rcfwWtGx21X12QmvqZSypRZ6bUTH1cKkdPOiC5OH5QPxWjbjtDlV5JoDp3/5LOk3noj6uKFSIBQFPQ+/Sj4z9Hxr3dwSESFFMTv92vAu0ALwASuy8zMXFWJ46oy5PXrcH8f7zJYkoM9JSqCSEnF6HUiyo/xHhZG5y5Fdvfy4vr2a9zffhO3XVu6BO8rz2P06EXSvx8u8hOXdR0CAZTdu3EtXIDr11nkTvwIq2kzav39Ely/zS7Rx59oC+eT89lXBK4eheu7b1B2xZpOzLp10f5ahhQJx2wXQPjUYeSPnQCShOftt1A2b4qmeL3iKsxSQubN1m2QTCOuwIJZty7Bq68t7y0CIHjLbchbN+OeMgklOxsBGJ06U/D4U+D1JjzOaucn59vpuL6cgrJhHWbnrkSGneFU33GodlR05n0WoGZmZp7s9/tPA/4DXFB5w6o63N98ZWsuOYDub0/g5tsq5VyF9z+EsmEd6to1RdvMeg0I3H3/oetIJkCb86tteTIAbdlS1NWZCQN8IDoD9r38PEa3HjHCXdS+Yjne114m8MjjFDzzAt6Xn4+af/Z7m1gpqXhsHkgSQFIS8ubN1LrlOtTlxXlQPO+9Q96Yl9Db+dFWx87AhcdD8NrrkIIhfK+9iLpmNQB6+w4Eb78bq6IPUkmi8Kn/Ebz
5NlzfTUWkpKAtXEDKA3cjBQPQvRvqNddjDBwcf6yiEDn/woqd18HhCFFR8V4NqH6/XwZSAftS49UJyyLp4ftwT/404S5maip5731SaUEXZtfu5Ez9Ee/YN5A3b0LUqUPwmtFYLVtVvNP9+T7sEG438vbEvtcHUBO4+BW1Z64AIHLm2UTOOAtl3dqon3eLliTfdnPC46RwiJRHH44RbgBt+TJSnnqCghdeJenRh9EWLUQyDMz02oQvuqQo8CZ80SVoP34PioJ+6mmlXmtZsZo1J3T9TaRePAL3zBnFDVu3kLpgAXlvv4eRwM592AiBsmoFUn4BRo+elXI9Dg5FCCHK/dOuXbum7dq1W9SuXbvV7dq129euXbuTD3WMrhviqPLII0JEY+0S/9xxx9Ed46FYs0aIyy8XQlXtx//cc0IMHXro6+zdW4hbb03cftFFsefNyxMiM1OIwkIhXn898XG33y6E12vfpihCLFwoxM8/C9G6dfH2lBQhrrpKCF2vuvs2ebIQsmw/rksvrZpz/vabEAMHCqFp0fN06hS9d5XJ778LMWqUEGefLcR11wmxaFHl9u9QXbDV1IrOvO8Evs/MzHzQ7/c3BWb4/f4umZmZCf3HsrMrYRHQhoyMFLKyEnsrHCDt8ykkmvdYXi+Rs88h/55/Qhn6OhpoM6aTctc/ULZvj2sTqkr47HPIv2IUbsVLyqxZtmlWD1DYqw+R4SOoNeGdOE8PoSjkDRpKJCsfIpHo28qP3yPv2I7VpBnh089AGTgY96xfYo6L9DqBgmHDSX/xxbjiCgCYJpFRo5GCIbR164q35+fDxIkUZjQgcP8/y3FHyo7vl9kkJTA16StXkVPJ/3OpIJ+0K69GXVdsLmP5cqx77yM/pU7Uhn6YuD//jKSH7kPZV5wUzPzyK/KfewH9jLMPq++yfqeOR47GvcnIsF9er6irYDZwwHC8D9CAihlxjwRCIO+JD9g4QGD0jeS/Nrb61iIUAt+YZ+yFW5Iw69XHbNQYhCB8xZUU3vMgxn7TjDho3/DgIQTufwijRy+Co6/HLLF4ZyWnELz2OiIXXIz3uaeo07UdvnfHo2zfhiQEypZN+Ma9idGuPYV33kOk3wAifU6m8KZbyftwEmaPXhgdOye8DG3Jn2iZ9l4qrhnTK3ZvSkPXkbdsRiQlDodPlOL1cPCMHxcr3PuRC/Jxf/rR4Z/ANPG++mKMcAMou3fhe+XF6DuFwzFPRWfezwPj/X7/r4ALeCgzMzPxKtnRRpIwmzVD2REvfpbPR+Rv5x2FQZUdecM6tATZBCUhULdvQ339ZeScbApefI3gHXcTvOFmMtYtJ2fXPlxzZiMVFqKf2JvIiAtBUfA+9xSe995FCQYBsJKSCP39agoff5Kk++/CO2Gc/QwacP84jeyZc21zhISuHY16751xLnoHxpqI0kqtHQp18ULckz6FcAi9dx8i51+E98X/4ZkyCWXDeqzadbCSkuMSaQlZJnL6mRU+byLknYl95OUs+2IO5UFZ+ifqsqW2bdqfi5C3bsFq2uywz+NQvamQeGdmZhYAF1fyWKqU0EWXo/25OCZXCIA+eAhm9x5HaVSVi2vaVORNG7Gat4i6w51yCkZWPsapw2L20378nqSXxsSkc5ULo/k7In374f7qi4TCDdEoS3nXTtuF19BV1+IZ9xbaqhVxbQIQmhZ1XzyIuFJrhhH1yDmEi553zDP4Xn6hSJjFxAkYLzyLumFDcdWkXTuB6JtFkZmoTh2CIy4iNPrGUvuvCGYpwmk1rITwes0FqmrrHy8UNdrmcMxz3ERYhq+6hoJH/4PerTtWSgpm02YE/341ea+OPdpDOyRWy9YYrVofcj8lOxvtIFu0He4vp9jn4S7Ix/vGK7Y5QWLG07ARVr36CdtDI0fbVr4xuvckcsbwuO1m7ToEr70uOrYPJlJr+DBqd+9A2qC++B7/d8IgHjlzFb7XXoqZUUuAtnZtXNWkKIL8hx+JFnP+4w8Kn3y
mSvy3Q1dfi9GhU9x2Kz2d4BVXH3b/ZqfO0fwxNugn9sZyUtQeFxxXj+jQqOsJjRyNlJ2NSE6uMcVipV07kQ8hqBA1A5jt/KXvFA6jrk0c7aiuzkTIckJfcoiWJSutakxo5GikrN14P/kIZcsmhNuNfmIf8p98BqtVa8xmzfYXZMjHbNuW4LXXow85Dfd775Dy8P1IoagpR9m9C23VCuS9eyh4MT7Hieezj5Hz8kq/3hLIBQWQnk7wqmtJzkipusVpn4/cN94m+fF/oc2fixQKYXTpRvD6mzAGDDz08YdCkgjc9xDKnbeibN1StNlo2YrA/Q8ffv8ONYLjSryBaNh6eSrnCIG8ZTOYJlaLlkcl0s779lsxpcYSIgTqsiUYJ/Ut2iRvWI9n4gSkgnykYBBt0R8xQUMHU9qs2/J4CV90CYVlqBoTvO8hgjffhjZ3DlaDRpiduxS1BR75PwL8X9zYPR+9XyTcJXF/N5XAXfvNQSWQzMRh9XYIjxe965ExkVkdOpL30WSk3buRQsGoDboSPzv6oCFkfzsd79g3kXfvxGrcmOCoGyuUgdGhZnL8iXc5UH+dRdJz/41mobMs9B69CNx+N3oluHrJG9bjmv5DtMrPmWdDMBgVr/w8IkNPx+zStXjfrN1l6lMSAs+nHxMadUNUKN5+m7T77y+b8NsgiJohhMuN3r0neWMnlC+AKTkZfejpZds3FELZYJ8JUM7JxjVzBqGrYkPlw0OH4R37hq1b5IGxlyTSr/8RX98Q9epRVb4fokFDAv96tIp6d6juOOKdAHnbVlLvuAllS/FrqWvBPJRbbyTno0lYPU+oWMemSfI9d+D+5kvk3Jyoq1/zFhAKo+6MesNYLz1P+G/nReslKgpms+ZlH/fGDUgHFuUee6zCwg1gpaQQvPlW9IGD4+tKVjZuNyItHWxm/kLTMFq3idtu9BtI6PyL8Hz8QYxQ6126onfpjnvGjyg7d2AlJWM1aIBwu0i+/y6Cl18JQyvBfOHgcBRxxDsBnnFvxgj3AZTsfaRfcA7BUdcTePiRcr8Ke8c8g/eDd4v+loRA3Rhb2V4uLMDz8fuYbdoQvO0uQtfdiHvKJLQy5P4StWsjvD68498Cm/GXByU/H9eMnwjecAQqtMsykSGn2vpH6yeehHHyANvDCl54FaNHL1w//wShEEbnLgRvvg1Rty6BnGy0b77E99ILaOvWoq5bC4B7yiR49lk495IqvSQHh6rkuPE2KS/Kznif8APIhQX4XnkB7ysvlLtf9/Sy5QqXKA5cESmp5L0xnvBpZ2Cm1456y6TWsj0ucsqpUVexg1wiS6O013rXgnnUHtS3coJLDkHhv/+P0PkXYe0veiBUlUifk8l/7sXED0lZJjRyNHkTPyLv0ykE/v04om40X7dIS8f98wy0jbHmGDknB556CoLx9nUHh5qCI94JMOs3KLVdsizcX39Z7n6l7Oyy75tf7ElhdepM3gefkj13EYHrb8Fq1BihaUXCa6WkEDr/Igof/Q8QrVxD7UPnwjZatUEvJSoSQNmyiaTH/hVduK1K3G7y33ib7KnTyX/yGXI/+JTcL7/DOtgHvKyYJtoi+6o5rFuH+4vJFR+rg8NRxjGbJCA06gY8X06JK2NWEnnXjmgocolZobJ0Ce6pXyIUlfDFl0U9VEpgtmqFutF+Ye5gjHbt47Z53nmbpBeejfFjFkD4zOEUvPJm0TareQsYPRoxZkxMrUnL60O4XaBpGD1PpPDeB7Dq1KX2gJNKr2iTtRvPxAlRU1E5kHKy8bw7ASkvF6N3nzLlxrbadyBUCTnVo50lfq9wTfsW9Y/5iIx6BEffWDRjd3CoCTjinQCraTPyn3+FpEceQl25wjbi0GrUpFiIhCDp4fvwfPh+UT5t39g3CNx0K8G77i06JnTVtWgLF5SaUxzAaNSY0HUHRf/pOu7Jn8UFoEiA69eZSLk5iJK5Op5+mvzmrXF/OxXy8zBbtyF4061YGfuLAXs8RXn
FQ1dcGfXcKMW/2/31FwRv/EeZXS1dU78i+d8PFq0dCEUhcsqp5L39XqkFESoNRUHvdQJKiapDRWganu+Ki1p4Jn1C3vOvVo4ftoPDEcAxm5SCPngIOb/8jt7n5Lg2oaqERhQn7Hd99jHeCeNiCiHIuTn4XhqDumBu0bbIWcPJ/9/LWEmx1dVjztvOT/7YdzAO8miRd+5A3bDO9hhlx3bUxfE1M8MXXUbehPfJm/QVhU+PwWrQkKQnHiF92CBqd2tPrRFn4/5iMoWP/5eCJ57GLKX8mbp+HanXXV22xEeBAEmP/ztm0VcyTdzTfyDpyccOfXwlEbj7fozWsWYXoapwUIi+snkTSU897iR1cqgxOOJ9KCSJvHc/JHTehZh16yJkGaOdn8L7HiJ0/U1Fu7l/mGYbki0HCvFMnhSzLXLOeYTPsU+GZfmSyH9tLMaJJ8W1ifT0hMVzrZQUzBaHLvKQetMofG+/hbpmNcruXbjm/EryPbfj+uYrQqNvIGfS1xil9KP9PgfXtNLLyAF4Pv0INYHftjYnvoJPVWF26kLOF1MpvOV2wsPPJXjBxYgE2SO1xYtQli45YmNzcDgcHLNJGRDptcl/azxSTjbSvn1YTZrGpY+1iwwswqagceC2u9D+mF9U9guiKVtDF16M2TW++jqASE4hMugUvJ98GNcW6dsf93ffoP02GywzOmt/JDZUWv1jPq6f4lOvynl5eN5/h8jfzsVq2468ce+QNuJs5Px4G7hkmijLlsKZ8TlKYvYrxSwklVKmrSoQ9RsQeCQa0Slv34Znqo0ZBcAwoiXSHBxqAI54lwORlh4NJLHBaN8R9w/xtR2BaAmsg7BatyH3w8l4X3sJNXMlIimJyKnDCI2Mr7xekoKn/odcUID2ywzkwgKEx0Okb38kwyD5kYeK9nP/+D3MmwPvflJkX9Z+n5PwIVMyutHs2h2zUWPkBH7lVrr9PShJ+PQz8b34XDSfyEEYHeOTNh0prIaN0Lt2wzV/Xlyb0aETxgm9j8KoHBzKjyPelUTw5ttw/fwT2rLY1+5IvwGErrjK9hireXMKn/5f+U6UlETehPdRVixHmz8Xo0tXlDWrSbntpvh9f/kF71uvE7z9LgBPEjhZAAAgAElEQVTMxk1sw8YhapKJoZS0onIZojYlPYLw+RAFBTHnMxo2InjjEQj6SYQkEbz5dlwb74DdxWkHrFq1CF5/s5NO1aHG4HxSKwlRuza5H36Gb8yz0WrrikLkxD4E7n2gSir0mB07Ye6fwXomTkiYf1tbuIADc+3IuedjvPYSmo1dN3JQzm+Rkprw3HJOTumDMwxS7vwHyu7YnCwCiAw/t+oK/paRyFnDoUNrgi++grx9G1ZGPUKXXYnR3z6K08GhOuKIdyUi6jco/0y6MlASV6CLyautKOQ/9yIp99+N+uciJCGw0tIJDz+HwN33xxxntmgJc3+z7dP0x/ufl8T1+We2DwiJQ1euP2L07k3BC68e7VE4OFQYR7yPAcKnnxVNzmRTtEAfMCjmb7N7T3K++wnth2koW7cQGTosLpAIornPXb/MQDmopJfeo1dCM9ABSuaYPpiy5CV3cHA4NI6r4DGAPuwMgldeg9C0om1CluGSSwhdNTL+AFlGP+MsQqNvsBVuAKNbD/JeeZPwkKGYGfUwmzQhNOIicie8f0gzkNGjV8xYSlKeDIkODg6JcWbeVYEQaD/PQP1jHqJefUKXXhGNZqwqJInCp/5HZNhZuL+fCqZJZNAQao28AvZW3C3PGDiYvIGDIRSKLuSVcTFPHzyESL8BuH+ZEbPdSk4hdLn9rF0I2L1YxghCg94Wir32l5ngXgkEeOs6QTcOxyaOeFc2hYWkXnc1rpk/I+2P4vOMe4OCp5/H6Ne/6s4rSeinDkU/dWjxNrmSXqzK++CRJPLHvYv1rwdw/forUn4eRjs/oauvJWITnLRlpsL8p1zs/lNBmBLp7U263RCh4xXlq5QDsPMPmQXPuti1SEF
YEvW6G/S6Q6fJALualrBrscyaySqWIdF4gEGrs8yjUSzpmCFrqcSOeSq1/SaNB1jOvaxCJHGEwoGzsvKr5EQZGSlkVVUtwgqQdN9d+N4ZF7dd79yFnB9nlbq4WNlU9r1Rf5+D963XUFevRiQnExl8CoF7HoQEJhIAIhGkUDDqvWLzTQ5kSUw6w0vBltj74qplccbbIZoMtBddOwp3SUw5x0vehti+khqbnDspSFrr4o9gRkYK39wdZskbLozA/nHJglZnGQx7K4RchmnNvtUSKz/UMAok6nazaH+pfthvDNWBinxu9AD89A8Pm2eoGAEJSRM07G0y5IUQqc2Pnbefo6E3GRkpto9AZ+ZdmQiBa/ZM2yb1r2W4pn5F5JwRR3hQlYP6+xxSrx+Jsmtn0TZt8UKUDRvJf2t84gNdroTh6ADLxmlxwg0QyZVZ9bFWLvFeNlaLE26Awm0Ky8ZpDPhvcbm0bQuIFW4AS2L9NxpL3jTpcYse10/MuSaozH/KTTi7+O1mzWSVMycGcSf2soxh958ya6aoWAY0G2LSbEjNnfXP+aeb9d+UWHPRJbbPUZl5r4e/ferkTa8KnAXLykQIpML4iEKIusnJO3fattUEvGNfjxHuA7inTUX9Y3789smfknrhOaSf2JW0M4bgfeE526RPwV0G9fiTVOJzhQeyyjfG/G2JP84Ht/31EbHCXYJts0t/Owplw8LnY4UbYPtvKvP+W/pirmXCmikqX17g4YtzfSx53c2ysW6+vcrL9Js9iMRJHastRihq+rJj+1yFrKWOzFQFzsy7MpFljHYdUHbsiGsy02tHg0NqKGqCUHkpFESb+XNMWLn7g4kkP3x/cYbFTRtRFy9EztpN4X+eKdrPM+4NTv/hHVJZQZgkNjOAaTzPPqJ+5MmNy/e67a2bWPl89WL7MuNrFhe3lT7pZuWHGoGd9oK0c74K2He+Y77MrAc97F0mc3Ccq9Al1kxWadBLo8voQwygmqEXSIRy7O+HGZLIWS+T0bUGPpWqORV+JPr9/gf9fv/vfr9/od/vH1WZg6rJBK+7AbN2bL5rIUmEzzs/mtCqhiKSUxK3pZeo2CME3vffjUmNC9Fane4vpyBlRafTrimTSfq/R0ndswIAN4W0ZRrncyUyOr56Fp2vKZ+Idb4metzBuNMtOlwR21fLU0FS7B8OhxIaM5zYtpFI+C0DZt3vYe8yBfsEBQASW2YduTWRysJTW1Crpf0982ZYND657KYvh7JTIfH2+/2DgZOBfsAgoOaqUiWjDzuT/DfeJnzmcPT2HYn0OZnCfz5G4VM2kZeWhffl56n1t2Gk9z+R1CsvQStjjcsjTeSUIbbbjdatCV3296K/pdwclLWrbfdVdu/C9ctPAHgmfYxsk8GvMX8woNUEBj0Xol738s3W0loLBj4Tom4XEyQBCOp0MOn3RJj6PWL7an8etDor3pulXg+DnreWMi0H2pyr46plP7aMrvZCteZzlb3LDy3Mpb0RVFckGdpfrCO74x+GrYcbcW89DpVDRc0mpwPLgClAKnBv6bsfX+iDh6APthe7kiQ9eA/eCeOK5mHq6ky0eXPJf+HVamdiCdz9AMqGDbimfVskukarNhQ8/t+YqjjCl4RIrQU2KWGFy4XZPBqkI9uYlg5wwt9WEzyjYrO1VmeZtDwjwI55MpYBjfpatp4jkgSnvRmiXk+TbbMVTD064+55awTPIZImprUWtL/UYNk4DWEWz6LT2pr0vM1efQsTmFkOpm4Xi9z1EloS+OrXHNHrer2OrAkyP9PI2ywjyYL0NhZdRpXtaSQEbJ2lsGGaApZEsyEGzYfV3AXcI0GFXAX9fv9YoDkwHGgJfAW0z8zMTNiZYZhCVWveK2GVsWkT9OoFdhn6hg6FH3888mMqC3PnwvTp0eLGI0falzMbNQrG23igDBwIv/wSVc7hw2FqgqIOb78N115bqcOubISAP9+F1V9BOB8yOkLfuyAtQQDptgXw7uCoS10
i0lqCOxV2/wWaD5oNgNP/B3VLTyVTrVj+Gfz0IGTvL/jkrgVdroCzXklculQI+O5WWDS2+M1DUqHL5XDehOjM/jinUl0F9wKrMjMzI0Cm3+8PARnA7kQHZGdXTZL76ubnXVY8H35GSoLUquayv9i3M+ewfcKr5N607hT9ASgwwK5o8T+fIHXr9migUjiMkCT0nieQ/8SzWHui3jiucy8k5edf4mzjevee5JxxHlTx/7Qy7k2Ts6M/B9CBrAQeMq4W0Hyoh7VfxTqCS6ogva1FakuTbbM0cjZEv6eRfFj7LeRsNbng2+h3Rw9ExbCqZ6MVvTfBvfDdHT4Ktxd/bsO58MfrAnfjMN2us18Q2DBN4Y83vQij+MKEAUsnCur0CtHhsvIHa1UVR8nP23Z7RcV7NnC73+8fAzQEkogKukMZsRo0TJxbOyWl8qIjjwbJyeS9/ynqb7PRFi7AbNGSyNnnxFxT5LwLKNy3F8+7E1BXLkckJaP3OZmC//tv6UE/NZhTXwuR1MRi60yVcC6kt7XodLVOq7NMvrvWg14Q/2nYs1ThixFeArskwrkSaa0tOlyu0+mq6iNoB1j+ritGuIsQEpt+VEoRbzVGuIuR2DxDrVbiXZ2okHhnZmZ+4/f7BwLziS563pKZmeksKZeDyJlnY3TrjrYkPkVqpP+gqp9eHQGMk/tjnJw4JUDo2usJXXUtyprV0fqcDRIXP66uCAvWT1XI2yRT/wSTRn0SL7IqLuj3aAQ7V8LC7Yn/37sWFH9Ndy+S2btCQXGFaH/pkRU1ywQjAFqSvSkjnJf42Ehe4uuz9FLaHN1OSIX9vDMzM++rzIEcdygKBU88Q/K9d6CtirrLCZeLyKBTKHz0iaM8uCOIqmJ26Hi0R1Eh9q6U+PlOD7sXKyAkFI+g6WCD014PoSWVr6+kcixOmiGJVR9rR0y8LQPm/sfFph9UAnskUpoI2p6v0/1mPWaOUb+nCbIAK16M09smfqg1PMlk9Wf2b1v1ezr+4YlwgnSOIsZJfciZPgvPJx8i7dqJceJJ6AMHHxOz7urOxpmwYJybSL5EWluLrjdE8NQq+/FCRP22dy8q/gqZIYmN0zTm/EsweEy4XONpd5HOlplqwqjPg8nbfOQ+I78+6Gb5u8WRo+Fs2LtSxjKh123FppDWw02aDjLZ8nOsrCQ3Mel6fWKvkw6X62z4TmHzT7EC3uhkg66ja6Dv5BHCEe+jjctF6MprjvYojllCubD0TRc562TcaYKOl+tsna3wx3OgFxYL0oZpKme+GyS1adlmwNt/l9m50H5BecuvUdfD8iSpaj3cpHBXiOXvuMjOVFA8grRWJntX2Af1eOuWve/DIbBbYv238TIhDIm1kzV63KIj778NkgxnjA8y9z8utv+uYgShTieL7jdFyOiS+L7KKpz5Toglb1hRF08LGpxg0f3mCKqNM5NDFEe8HY4J8rdK7JivULejSe32AgIBAmOn8/X4M9i7o3i1fs1kFWGBflCa871/KfzxnIshL5Ztxpy/WUYksNVG8iTMUPnEG6DrKINOVxnsWSYja4Kd81UWvypRsPXgh4Sg5elHxmSyc75MMMt+8Tx3k0xojxTjj64lwYAn7e36paG4oeftziy7PDji7VCjMcPw811uNv6oEsmRUX2CZs3Xc1rOaObsuJu9xKYqiOQl9uLZ9UfZXTObnmLizbBshS2ttYWWXPZrKImiQfYamQXPuMjfn21RUkU0YZUl4a1n0fpvBr3utBc6MxLNvZK7USK1iaDD33XUw6gDktbWQvUJW3OOt46Fq9bhBxIJC3YukDEjiYOqysKK91XWfa0S2ieT2tyi01V6ubJS1jQc8T7OCenriRhrEBiocj18ru5IUvV31XN99w3uLz4ne34erba1Yzd3EKE1RkBi/crWvMU0BKVn+DuY8shQUn1Bm3OjUZYlzRpakqDjlXqFly1yN0n8/rg75qEgDAkUQcerIvS+L5Iw3DxnrcSPN3rIWlr
8tV7+nsapLwfJ6Foxka3tFzTuZ7Dpx/jPRNNTzMN6MABsmq4w/xkXWUujkZXp7U263xihw+Xle7OY/7SLhS+5it6GspYobJutMHhMiFZnHZsC7oj3cUxBaBYBfREQXdEPk0nEWE+q91wUuQrLth0m3uefJen5Z5FCIRoCDfmBNnzHJD5lJz0BELjL3W/9XuX7kvd/IowvQ7BhmkJwr0RqC4sOl+nU7WLx22MujJBEw94mbc41yhwluPI9zd5MYUoE90il5gmZ80iscAPsW6nw22Mezp1c8Zzag8eE+fk2iW2/K5ghCVcti2ZDDPo/Ub5F2YPJ3ybxy71uCrcVv/Fkr1L47VE3tVpaNOpbNk+TUG70beNgM1Zon8zScS5anXVs5hN3xPs4RTd3E9CXcEC4i7Zb2whE5pHiGWR/4FFG2rcX7/ixSKFQzPY6rKM//2USnx2yD1ctCysSrZdZdHxHkxPvLp/NVZKh150Ret1ZvG3JmxpTzvYQzo0K8F/jBZmfGZw5IYRShudJODfxlP2Ar7QQ8Q5JgSyJHXMTpalVyF4rkd6mYrPvpPqC4Z8E2f2nzJ6/ZBr1NWOqElWUv8ZrMcJ9gHCOzKqPNBr1LdvDYeM0lcId9te+b7mMXkCFzVjVGUe8j1PCeibRgG6bttAO5FypXL7HRwr355Nsi0IANGYBJIxbjaIlC068J0Lrvh7mj48QKZBIb2PR7cYInrT4/XM2SGR+rO2vdmPQuF/i2WD+VomFL7iKhBsAIbF5usaC/1n0eejQD4e6paSjDefBp6f6CO6FWi0E7S/TaX9J1LygF4IetL9uMywR2idRPsNQPPW6W+XO9FgawazE/6dAKW0H46trRbNIivhjVJ9ALp/1rMbgiLdDHLv/VJh9TRL1ehqccGfi4r0HMMwcApE/MKw9SJKKS2mOz9ULqSoyCrkTT19NVBIJt1vOpuV5Kv6/yzTub5GR4SG5a+kzu8Wvaix6yVVUMWfpWy7anKMz5OWwrRlk5Ycaob3217z997Ithqa1MfHUMQntjd1fS7bYU8IkUrg9+n+yjBAdrzBIbSao28kia0n8edLbmdTrUf2CXZJLccssTyGOpqdYZHQ3yVocL2cN+5gox6h41+AEGg6Hg0trR6Jn997FjTECEttna8y43U3epsSzIMPMJjf0BSFjKYa1Hd3cTGHkV/JC06pk3KELLsZo0dK2bQv9bLdLGJxwYz5D3oDG/UsXsd1LZVZPVtn6qxydRZcodWaGJTI/01g6zn5Bt7Rc3FYZLDKbZij8eJ33IOEW1Gpl2j6TjEC0ALIQURNO52sjaCmxoqf6BJ2ujlTLwshdR0dIaxM/MfA1sOg8suwmLEmOph1Ia12iL0nQ8CSDfo8fnl2+OuPMvI9TXEp9vFpXgvqfgIVn2TaS56wnJ68Jq9+/vWi/gq0KS8dp9P8/+y9TIPIHprUvbnvYWEPE2IJLrVidDlMHWbHJoeHzEbjrfpIeexilRFbGbZzADP4T109qc5O2F5h0vb/0JN15myV+udvDjnnRRTnFKzBtzRASW35R6HZ9vMmpxTCDpW+5MEPxx9XtfOiZ79I3NAK7D75gieBeCT0/ga/1erko30iHywy8dYKs/EijcEd0cdN/oU7rc6qnt4UnHU57I8i8/3rY+YeMMCUyupn0uDVC3U7lM/E06mty0fQAy9/TCGZJ1O5g0XaEURRAdCziiPdxTIpnMJqoT63b78Y3bQFqKEx9YCQL+J4X2cBQAAoSLAYBGFaiKsEmYWNjucV74w8KS9/S2LtKQfMJGp1s0vu+MKs+cZG9WsaVLGh38d9p/PWJeN+bADl5LJrRiZm7/oGBr6gfySXodXuIHrcYaL5STkh0AfDnOzxsm10i1D2B/bi0toa9LdqO0Fn1Uaz7YHr7aJEGISB3vYQRkcjfLLFroYK7lqDT1TqKBnsSVNqJ5MoJc4a4awmUEo5BLYaZtBgWFWszAoU7pWq9YJfRVTD8oyChfWA
ZpXvTHAotCbrfWLPqfx4Ojngf59R+cRJJX8yO2VafFZzB7bzFIkzc+DJKmzUmntrIUvk+XltnKcy43VNkNw4CeRsV1n2pohcWP0BWT9Y48d6OdH/sSQDqrpBo8G8XO+YLzJBEWluTTlfqdLsxsa9w4R6Y84iLPcsUIoXY2ooTUadT4pnsKc+HqdvZYvMvCkZAok4Hk+7/0MldKzP9Hy52L1KwivQlKsZ/vaNx4v3huKjPYgRprU1y1sTfzyYDzbjZpRCw4FkXa79Qydsk482waHaKyYAnw9U23NxTGw53QfV4o0KVdCpCVlZ+lZyophZjOBKU5d6kDRuE9udi27avGEtmvWs5Z1IgGnJuQ0H4NwKRuXHbJbzU9v0dRUlcuPhgpl3rYf03ZTPO+upZXDKzEG+JAMqcdVETQ73ulu0iVcTcRiCyiMC+fGZdPYJ9S+uVeWwHqN3BZPjHQZIblv3jXLBdYsrfvEURk3bIqsCyzWkNtdubDH0jyMx7PexaGA1mkV0CV4rAXUuQ3MTCf6FRlGVwwXMuFjzrivO+aDsiwmlvHtoG7HynEnOUijFUaiUdh6OEEIKgvpiQvhpLBFCkZDxaB7yuLhXqT8pP/EHMaLCTek+GEgo3QJLrJAwzi4i5ngMzJwkvSe6+5RJugNwNZV8/D+yWyfxUo/tNxa/Jaa1FQv/jiLmdvMBULApY8fqpZRbupAYmqa0EVgQyupj0uFUvl3ADLHtbK1W4gYTCDZDS3KRuR8H53wTZ+L3C8okam39RCe2VCe2F3PUKO+erhHPDdBmts+5r1dZtbvMMldyNEWq1cGa4xwKOeNcwApG5FEbmckAoLZGDHt6BIILP1avc/Zlt2qGuWxu/3e3DP6E/5iGiDiVJoZb3HMLGenRzC5Kk4lG7oCrlyK+6H09ti9LMMAcjyuH9FowsxiJagi1nedmKPshuQc/bI3QZVbZQbcuM5lpRvbFBNIkCSMpKYNf+HCcSNDvF5LfH3HCQ2JuhqOdJ2xE6BdvsHwTh3GgmxFotnAoHxwKOeNcghNAJ6iuItw2aBPW/8Go9yu1bHRx9PerCBSh7Yhce9TPPxOxVtoeBJEl4tNZ4tNblOvfBtDzTZNsc1XZh7mC8GRbtLiy7CBlWsWeK7Er8QHKlWCBDreaCdhfrZRJuMwy/P+5i0wyVcE5xmPyBUmVJDQ/Px7rkQ2rPcpncdfYPuH0rZYL7JLz1BBG7qjayYOtMmeZDop4eDjUbR7xrELq5G0vk2raZ1l5MkYcq2YQJltbnoCHkvzYW79tvoqzJRKSkEhk0hMD9D1fGkMtFl1E6+VuivtShPTIgSPdbGEHI31wsWIpH0GWUXq4IUKlEkqp6fTew8+d2cfuoPsFZHwap3dbCnRZ1U7QMkJTS62PMuMPNmsnF/Yf2ymQtURBmiM4jDbpcp7PmC5WCQ5hOElG/Z/EDxJchUJMERmH8gFy1BL66UZfFJWttzmVJZH7sZuMPGoOeDtHm3OrpQuhQNhzxrkHIUhLRf1n8bFDCgyyVPxkTgD54CPrgIYc3uEpAkqDfYxF63KKz/lsVX4ZFizNMwjmw5A0X2WtkXCmCNucZND+1fMLjVltgRLYD0Paaeexb0pit33cAY7/IKYLOIyM0Oik6zV0/VWHZeI19mQquFEGTASYnPxKOK2+2L1Ni4w/xi6zCkJjzqJu6XS0a9LIY8mKYP551sXOhgmWCO1WAEIRzShf0ul0MTri72K6f0lTQqK/J5unxX93G/Uy8dQV9/xXBDEqs+FDDCseLfHifzO+Pu2l+aqDauhA6HBpHvGsQqpKGpjRFNzfEtbmUpshSNfUDKye+eoLO1xQLlrcO9Hn48BL1+1y9MaxswsYaJMXgpBcnk/FRTzLf7Edga20wJQq2RU1Om6Yr/HyXpyi6Mrgbctcp5G+VOPuDUMwsfNtsFT0/QU6RoMyvD7q5cFqQJv1NmvQPkrtRQphQa/8i6JcXeNk5P/ZrqHgE9U8waNT
XotsNEdypsf0OeDLE9FxPNP+4kEAWNOxt0v/JqCeJrMDJj4VZ+7VKyEa8AfK3KKz8UKOrTbCRQ83AEe8aRor7FPJDYXRr+/4tEprShGTP0Z85V2ckSaaW90xydvRkyTvbCGV52TipB2aoeNa89VeFUE7U77pkWHxR+yyVrbMUmg4qnvWntTFBFXELiAfIWqqwabpSFDhT0tNDccNZ7wWZ/5SLnQtUTAPqdTPpdUek1Kx9tVoIzv86yLqvFXLWKaS3M2l1thnzUBFleDGJFDi1UmsyjnjXMFQljTTfJYSNdZjWPlQ5A5faAukYK1osLDBC8Z4bh0twS0OWP9/Gti20L5p9b98q+0VfKyKxc0GseDcZaNGgl8nOeQm+SpZEwfbEF6C4Ia2twJ1mUK9nNDqyLNcryey3WceqdDgP9EKJpAaCet1MNv9kfy2qT9D0VJ35T7vY+KNKaB+ktojW+My4qXg/vRCWjdeiwT51omsNpUVBWgZF9UKrY1bKYwlHvGsgUe8OewGq6URC+cz5vzDbZtQlkuMmtblM+0t0Oo+sHPe2up0sUpqZMQugB0hrY6F6hU1+kWKSGsQKkiTBkBdCTDrdZ1tizVPHosXp9tPgrbMUZj3gJmf/4qKkRG3rp48P4iqnLTpvi8Rv/3azfa6CXihRp6NJ4wEmWctMgrvja2C2OUdn5UQXK94rXmgt2ApZixWSfND4LMheI/HDdd79RZCjZH6mMuiZMM2Hxl/TsgkqK95xsXeljJYEDfua9H88RFoF84g7lI6TVdABw9xDQXgOBeHZ6ObuozaOYGQFP96+j5VjW5K3LoXQXhe7F6nM+beHZRMqZ56hJUHbEUbcJ19SBe0u1Fn+rn1iKQB3moX/4lgbsRCwd5VMRncTST1IpKTo4qpdUI9lwJxHioUbQJgSW35R+e3R8i08Wyb8eIOH9VOjKWnNkMTuRSp/TXBxwl0RWg2P4Ktv4k6zqNPRpM+/wpxwT4R138TfU71QYtHY6HXN/Y87Rrghmqhs3lPuOB/7NV8o/P6Yh70ro9Xu9UKJzdNVpt/swXLcyquEw/pG+P3+esBC4LTMzMxVlTMkhyNJQXg2gcifHKj2HYgsxqt1IcUz+IiOwxJhdq7+i+0/XR7XZoYlMj/R6HyNUSkmlJMeipDeyM3SjwwCuySSGgnajjDodr3OjzclLv/mThcxWQ6FBdNv8bD2CxVh7h+YJHClCtLbRj1let5qv9C6fqrK3uX2c6dtcxTbajmJWD1JtS2erOdJbJ2pcua7obi25e+qtnZ9gL2rIbQXdi6w94TZs0xmy68yzQYVK/jqzzTbIsW7/1TJ/Eylw2WOglc2FRZvv9+vAW8SzR/kcAQxzQLC5noUOQWXUnF7d1jfSCCykFi7qU5QX4ymNMajta2U8ZaFoP4XO3+rh55n7zGTtylatswuQ+CB9DxlvQ2SBP3vA//IYJxIRqM87cnboPDNpV563Bpmz18KuRsk1kw+qACEkNAD0fJoLU5LvGoYrRRjP+CC7TIf9PUhSVC/p8kJ90RIa5nY9JC9Wk7YV/5We4FObWFFq9LbLLR60gC5lPzkQsI4yMOmtCjS3PXOC35VcDgz7+eAN4AHK2ksDodACEFB+GdCeiaCICChyg1I8QxBU+qXu79AZAkHL3jtPxNhY+0RFW+EQWqbPUiagdDjP5aedBFXqXzvCok/xrjZvVhGUqIFhE96IEJq83IE7xykXZ2u0lnxvith2tets9TozNg8UFYsfj+hS2ycppYq3i3PNPjjOYvQvnhhM4MSeeujs97cdQp7/lI49/NATBKukiQ3SXy9vnr2D6MmAy3q9zTj3BQBWp8G3tqQ0dVk26/x46vVyqTZQTbvpAYWe/6yn6mntqh+VXyOBSok3n6//xogKzMz83u/318m8U5P96GqVZMZPSOjfAmQqhNCCAqDG4gY2SR72+DSEucEycqeTbDgz5JHY1g7CBgzaF1/tG1ovN290Y0Ctmd9hW6
tT3gul0sc0fuaEu5K4MQF1O21hay58ZVy2v9NoV794vHkbYPpN8DezOJ98nW6SYoAACAASURBVDYq5K1zMXIWuEsMXTfy2JMzh2B4B5KkkORtgSX6k5GRghAQ2AOupOisPiMDajWFfasTj7XIRFJKrUyX6iIjI3H9rYwM6Pp3mP8yh8yEum+lwpr3Ujg1vtYEAANvh9UfwK6lsdsVN/S8SiMjwz5T43nj4OvrYdu86Bg0H7Q5E057DjRvCqf8E74cCfnbi4/RkqHv7QoNm8Z+Nk4YBdt/Az0Qe45GJ0D/W7zVspJPRakuelOhlLB+v38W0Y+cALoDq4FzMjMz7SvDcmynhBVCEIjMJWys35/pLw2P1hmvq0Opx+lmFvmhnzCsHYBAwoNba0eK+1RbU8i+wo/27xtPiucMvFrHmG2J7k1O4Iv9WQAT43P1Jdndt9R9Kpu80HT2rNvMwgfPYc/CpghdRUsN0XxYhFNflGIEYM4jLpa8br+w1/uBMCfcFX3nN61CcgKfY4rY3C3JvnZs/vxclo93sW+1gpYkaNzXpN8TIaZd42XngsN5KRUMeSlUlKI14V4Clr6lsfF7NeqmmAOF2+0nOK2G65wxPt52fYCspRJz/u1h5wIFS5dIbWnS4QqdXreVHoQjLNjwnULuRplGJ5vU72HFfG6ylkr8NcFF3hYJb21od7FOCxtPE4Cl41SWT3SRvUpB9Qka9jHp93iI2u2OHW+TGp8SNjMzc+CB3/1+/y/AjaUJ97FOfngGIX1J0d+WyEcP7wRMvK7OtscIIcgP/YhhFd82QYiQvhRZ8pHsPjnuGEskXl6wrLJ9oAxzLxFzc6n7qHKDCmUoPFxS3Keitv2ToR/NYte8dArWN6Hl4LrUaROfRak0O2pOibZAZEGccAMUFK5mzY8b2LWwEwB6vsSaKTLBfQZ1u4cPS7xbnG7Q7qJ44V73tcL6bzX0QqjTMVqxvtsNOt1uiArsL/e4WTHRXrxdqcUCGM6D1Z9Gn2RtL9TxpEUr0pw7JciepTLBfdCoj1WmwguSDK3OjvcXP0BGV8Epz5etDmTX0QadrjbYu1LGW1uQUoo5x+Hwcfy8DxPTzCes271j6wT1ZXi0Traz6LCxJka4SxIxNoCNeKtyGhEzx+YIGU0pTnMafZsyEDY5Uw1rH3a5UQ7gUTuT5B6ALB35ktuSJOFz9cDn6kH6EKCUoFFXrcTC4Eopvm7DTFCmTYLaXTex5YuoeCveyP+3d97hcl3V3X73qTNzqyTLsmw1W7bGRrZly924G3dsY4NNDzgQIARCAkkglCQQkgBfSCCACZBAIFSbYDCuuGHjXlUsWyNLVrPqVbtlyil77++PmVvmzjlzi+Y26bzPo+fRnZkzs+fMzG/vs/Za68dJf3c3h579KmZKse3Zd7L7hcOGNW7DVrTMU7TOL+dpn/iBoMbd5vF/cFj+PQcdlL8LG+6BjfdZXPnjfmOH7I0Br/zKJhhU+WhlNMdcX/7MVnzX5oWb7b4V+vPftDnpQwFLPhQgBMxcMnR8WYXlDJWuzQaHLJYcecXwCoOGg2nDoScmMe7xYL/FO5fLXdCAcdSgtaYUrMAL16G0j2W0k3aWYpsjdz8ZSzy5obJ5WItUeykLZW3AT6ro7oAAShcib0/ZJ+DLrfSm9fXimPNxrHlAOVe6GKxAqj3sK6UwmUNz6oI+MbaN2QjSkWM2jRm0pN4w4rayY4UvtxOE60E4pO0TqiaURdeHvHpHbXqa26Y47p39oQIh4vdZlN//9T/tq79izmW5yu0GZ37rx7x6yynseOBU/D0Z3OmazKGabY9bhFWbmRoVGHSuM5CeZP5Fsia+u2e1YNX/9gt3L7tWmjz3NZvzv1z+PGefrjj9kx7LvuWQ317+DDKzFCd+0GfueZJtTxk89WW3qpdKfqvJU182OGSJ5IizhhbNXS8JHvzzFLtWlN+7MDWHnym55HtFMocMeXjCJGLSrrx7vIcpBi/Qu5sTqq3
48jVa01fimIdP7OAGYBptlCs+an84QrjEmQvY5hHEdQg0jei2rin7GDQhJX8FodqDEA6OOZfmSk52MVhDt/cAUBavICwRsA9VzNOeua783GYzrrWQUvji4NGSsrKRwq21Htfye60VXaV78cK19L6Xov8CTe65pO0sAPMukpz21x4rvuv0pam1zJOc8jGfmcf3r8odaz6+3FDzGmHBZeOvlgDQtngbs85Zh9/lsPyLl9Px1AKCHpfWhR0s+uOnWfrO0/uOW32rxeqfWex7xaDQYVT1Hi8XsBjMOF5yxNn934e1t9uxzat2Pm8xcDJe8sGA7FsD1txiozUsuiHAaYFVP7JZ9u3o5wnzgjW32hxxVv3whtbw6Kf7hRvKm69bHrN47DMpLvlOfEw9YfIxKcU7lPsoRZgOKN1FwX8WJ33NuI3FD7cSql045jwss1ZUHXMutjGbQG2pvc+aH7uKdazDccwF+HKwi41Nyo6OkwOk7eNI28ehdQCYVc/vBSvpFbuq9yA34oeb+lbnLak3IDwHL1yP0nksow3XypJxTqs6ruC/QClYjdLdGKIJ1zqGjHPamAt5wX8GL3y56jalu8h7j+CY8zCNcjD35D8LOO5dAa/80sawYdGbg5qWrWn7ZAK5Ay9cQ/8E66JfO4fuV8tXcYcs3YSVCXn0pney4w/9bQd2PzefzjWzaGuXLLyqfOyxN4Qce0PIw3/tsuqHEYUxPYLcLTZHnN0vpPUuZIRRG/5JtdPX7S8owJ3vSPPaw/V/ql7X0J/JzmVGbOHNlsdNgjw15y9h8jIpxdsLX0ETvQqIjWE2mFB20e39jkBuobyZY2MZM2lxL8O2+jfQhBC0pC6mq3TfgEwQC8ecT4t7ft3XaEtfQbf3MH64Ca09TGMaafsE0nb9LJXy69aGYsLYUIwikNv6xFsIg5bUBTTr89D4CNwaQc57T5P3H6dX8JTuIfR3oPFpds8Zcnz7gxduiLxd6W5KwUqa3P6VcKqtbOIQhxAGrakr8MPF+HIDAoOUvZjDLpvPa3/ukbvFouuVmWy9/xh2PFGbohh2p3j5JwELr6r+PtYTS3/QfYtuCFjxvehOhbNOrd/+77mvOUMKN0D7UUOHTPLbBSqIHrffXS5pt5uSTcapwqQUb1Fns0yI8RlyV+keQvXagFsCQrWVvcUf4VqLaE1d0jcWyzyEaZm34YVrkWoftnkEjjV0aEcIm9bUGyobi5LyxyFRqogQqRGvcA2RRumoDU0wjdaa24QwENSWg2stKYUvERUKKgWryTinj3hD05fbKfmr0JQqk9RSTCOF1iEF//nKJAm2ORtNfO9urUfe11sIgWvPx7XnD7gNzviUz8l/5rP54cPY9Gy235hhEF0bowpV4sWy7cjq+9rma5Z8MOD5/3CqYvSHnR5y2t/Ufz/bnxq6NmL6sZIlHxz6vMw5V9I8R9LzWu1zTj9Wkp6ZCPdUYlKKd9p+HQX/2UjLr3KseGzxgs2DhHsgEi98mW6vLLy9lDv9ja4iUQgDqXx6vPsJ5ObKBu308ip8mK7wfrglUogBTONQXCs77PFI3Y1UeyLvU7qLUO7EsebUfw7ZQzFcgdYhSvt44WoGxna9cC2t7pXk/UeqYtK+XA/ETwy2Wf91R4rTAgvfKCl1HcnLMY9Jz6gVtSUf9Fl/l8Wel6uFcNoxkhM/WHslcOrHfWafGbL2NpugKJh5gmTxe4KaqtHBqDoLamFoZp0SctHXvdjqy4E4zZC9MeT5/zCqyuLtJs3iPwqqMk5e+4PJyv+y2bvGwG6BueeXJ5oDqdhmqjMpxVsImyb39eRLD6PI991um3Npcs4d89cvb5TWxw/Xo3UQGb4YKVprukp3EQwQsVBtp9vbDcImbR8be6xUJbpLd1dyt3svwXs3UAWWMZuW1IXDyiDRWqF0DyBiM1LAxhD1K8yK/ip6vEfRAz672nHvotu7h1BFdTGMW0WKihVc4zn2BpsX/0uye3CJt6E58orqTeU
9OcG2p0xO/YTHq3fa7HjOAA2zlipO+YRHU0y/6yPOVlWx8OFw6InxvcK1Emx/1uLF/1Gc88XhXZGc/kmfplmKdbfbFDoELXMVx7494Ohr+sM36x+C+z6UotjR/53pWGbStVFw6XdHNv6EsWNSijdA2j4Wx5xLKViB0h62OQs3Jhui0Sg99BdU6QJKlzAbIN5+uJEgsnAmwAtW1RXvHu/Bymq1anSYYgZzZl1JvnsGvlxPKViNEBYp63gss7YEv+A/TylYVckDt4gKmUB5gzbqeACtQ4p+jh7/YYjZsxhIGLO6j0dTCl/CturvJYyEQHZQDFaidYkLfzCdJz5xNlsfz6BDQeZQxTFvLudQQ3nz8MGPpdj0gEXQIzBT5arM6+4o0jRLN9Q0opelf+mz/RmTnctifqpa8OL/2Lzu3QHTs0OHPYSA428K6/ZHf+o/qBLuXjbca7PjuYBZpyR53JOBSSveAKbRRFODSrS1VuT9J/DD9ShdxDSmlUvYI4TRMJw47RowtjYMEdHibhSEagdxLxi/CVmeZPxwc+R9UndhW610lm4jkBv7bi/6K2hyzybjLOm7raf0LIXgD/Rn90RtopXt1lrciyNfr+CvqIS6omPuDUM3rrVo0V9Jj/cImspkPR3O+eEaSquupXv9DOZfEpKe3v/4Rz/jsu43/ZO1LAk2PWTx8F+5XPXjsUmzyxwCb7y1yG/fkqZjefTPVfkGa39jc/oQ8fPhsjumuXNYFGz6vcWsUxrzOgn7x6QW70bSXbq/KrdZyW4CuQ20rulB4pjz8MN1dZ/PtRbVLQAZCeVc8WjqTRBal2ILhCBg847/I5DVVZyaInnvcVzzKEyzhYL/AoXgUYbqjuRYC2lLXR25ieqHm+nxHiE+3BGNZcyM7dUShz1ErH0ofK35dVBkw/ZOrvOepI3qqyypdtF8wmMccfqVVbcHBdj8++jPe+vjFvteFbQfNTYbfqk2mHmSomN5/GMGV3TuD/XK6netbPyVb1CAF79vs2+9QWqa5vibAlqOSDZPh+KgEO9Q7sMLX4m4J6AUrKwR77S9BD98DV8OPsYor9itRWScMxs2PtdahGU8H1kuX8/uzBAtmMY0pNpdc58ggxdEl99ripTCF3E5jrz3BENeZgBK9cRmv5Rz8kcm3IJMX656XF+NwTjmkbjWohG9DpSbL637rcnudQa/PbqLRy/Ic6F4hTai+8EEcmtNYZLXKSjujhauoEewb51B+1HDex+jYe4FIS/90Caqk6GZUmTf2jgXeDe6RgyA0t7Gxob2rRfc+7501V7Dmltszv1SiaOuHLvzeSBwUIi3H27ovzQeRKj2oLWsWkWXncbfiBeuwQ83AgaOeSS2dRiGSDc87l7Ou76UHu+B8tUACoMmUvaxpO1ygyitNVKVQxKm0Y4QAiEMUtZi8v6jDBZg02gmVNFl9gBKh5TCF2Pz6SOOqPNcQ/txGDRhGbPAMNCqRKC2ImPaAAxG0EzaWUyTc8aI0yf3rRM88NEUO54zQQsOM13OP60F++vrIbYcvHbVl5mpaV2g2Lu6dombOVQx65SxFZqjrpQcdXXIq78dZP5gaE7+mE/LHE1Ba24PiuxTimNMiwstF2MUgfg5p8HGh6LvCyLccvaHp/7Jrdkkzm83eObLLgsuLWDEhfo1vPRji1fvtPD2CVrna064yWf2mQdPPP6gEO+oHOdeDMMlysqznPqXJWUPP8Vuf7DNQ2hP30ggt6F0N/aASkIvWEfef6oSGy9nkDQ7Z+HY82hyT0UIi1Lw8oBKyKMJZRch8X6UrjUfL6zfFnYglhFv9mAYLbGLZ0ETrrWAjHMKlnkIWmv2Fn7McFb7ZWza09djW6NrvPGHT6fY8Wz/19yQgllPptny+dPo+MbzzKR2ArHMw2omCcOCRdcHPP0Vo8Z95qg3VsfGxwIh4LLvlXjx+zYv/cSmtBdS0zUzjlMEnYJ7f6X54Rv2sFWUz6sI4LdmkS+kWmkdYUxl3vn
w+L9Fu+xMX9S4SUp6sP3p6LHtftlkw+/M2NX3U//s8MK3nL4x7nwetjxqctE3Ssy/6OBYsR8U4u1YR2IZsyPjq445f1z7dtRDCFFT3BPKDrpK96H7REYTqi10effSbrwNy2wh45xExjmp6lK/GLyEJ1dGvo4h2nCseSjtV/WPicM0ZtSUzg8kYy+tbAR3DTpuOu3pt2IaaaQqUvCXobUkVHvrn4gqArpKdzG96e0jTsvc84pg2xPR4jD9ySbu6jqeG1ufJz2gv4wh2mmKCYkt/ViAYcPa22y6twjSMzVHXhZy+qfGZwNPGHDC+wNOeH/A6lstnvy8y5oXyz9hbTgsfL3Fjm/vQKY1GlgmQ2728nwqHb94ieKYK2D+xSEb7q0+381zJCcOoxhouChZ/hdHkI/+XRZ3weqf2zWTS7HDYMV37ES8DyTKJewX0V26v7J6BbBxzAU0u+fVPXaiKfjLBwh3P0p3UwxeoMXsH//ASagc5plBENbGw5XuZm/hNpqd83HMo/Blvc1Zg2bn4si+Lr1Y5jRaU5dT8J+ubJAa2NZsmpyzMY00ee9piv4yFD3Des+DkXoXhWAZTXUmkCgKOwRhjBO80yW4u3sxG1tbOZ9XOc3QzDCnkbFPwTSj89iFKPdTOenDAWGhvLE3EQ0Y/W54+ksOhZ39Ly6UYPYfMhz/1Wks/2x/CuYKGaC0HlH4RAi49Hslnv6yYstjFmGht/+4x6EnNm4j0c7AzBMkmx6sPYmt8yVHXRWdWbTuTovCjugTv2uVifTBHP+OxuPOQSHeALY5i2mZd+CFryBVJ7Y1F8ccXr/mRuIHGymEy5GqE4MUrr2QtH1y7Oq/XDQTTdx9RX8lef+JOscqArmebq9Ae/pG8v5jFIPnYx+rdHyxTS+ONQfHmoPSHgKjb5VcCtaS95+kXg/x4aBkfMpkHLNOVrTOl3RtrF19dx0TUDws5CVmc6R9JOevaWX172zsJs3r3lXu5BeHEMNr4KS0pgdNBoHVwKu7l39m07M5+opi5tPVqSIlrVFEBQbrY6Xg7L/3GelG9Eg5+aMBe3IGPVv634+V0Rz/x0Gk2TRQqSaN9g+10zo2Tn6gcZC8zTLlOPbIsxUahReso6t0b98moQQCbzNSddFSaes6GEM0xz5fVLVhKPdWqhuH3kQM1Q46S7djiukIWtCx2Rc7SDO82L8hqq3Jyt389j83Wxgjz6m3m2DRDSHPfW1QnNrVpN7lcanrcobhwCfb+PWv+3uDr/gvmzM/47Ho+tFfft/qF/hd4LFVhbQJg9Mthw+7zTgNEPF6m4bmoH35habV0Imj0RzxeslVPymy8r/LVmup6eW9hQWX1jFvvjxkxmLF7lW1E9jsM+WEXA1NBAeVeA+XUHajdQ+WeUhDyt97KQTLIrM7SsHLZOxTMc1aoU47S/DCtTWhE0O0kLaX1jy+GKwclnD3EsiNBGys+5gwot3tcNF6/4tXBC5p+yS0lhSDlYRyJ0I4pO3jscz6G5mn/41PaoZm3e0WxZ2CpiMU2RtCzv1IKx0dghe+ZfPET6tT8Ho2mzzx+RRzz8uTHsU+6S/9At/x8n1TVl4rfh2U6Naaz40w/jwQT2vWyBD7Ekn4UwPjNQtjUNx333H9K+VpCN5sD8MLbYKZ8TrNBV8dftm9YcHZf+/x8N+4dG2oCLioeGZ+/uAp30/EewBSdtHtPYgvNwMBhmgnbR9Hxjlzvzc1tVbImHa2miJeuJaMeVLNfbY5k5bUxRT8ZwZkmxxGxj6TQG2iGOzBNDKk7RPLE80ouu4NRZyzz3AwjPbhpnHHkrJPBAT7CrcSqH4r82Kwimb3nKpq0ShOfF/AiTFtYzc9aBJ1+Z3fZrDqR06fkfFw0VpzX+BFXms8GXpsliFzzZH/7H7q5bnTL7IFDfPAvHMPmc0Wc+5p5vivT0NogTkvxPxAnqxhcYRhcI2d5iTrwAz+zr1AcuODBVb9j01xt2DGYsUx14UNLVaa7CTiXUF
rTZd3d19rUgCl95H3n0SIFBnn5P18BQHCjk3sMOqEBVL2MeX0P7UbgQHYdJfuHCRkK2lJXYJlHtqIKEUVVoyzz3DI2CdXMlGqY9aCJgzRhKaIKVrRGISqttTfFDNodl9Pt/dg1fst41HwnsS1FvWlVY4EraBna52+3KPYXy0B22JSKPLAMhmMWLzv9ov8wC9U2WzIFHQfE/Lywn00zVFc+XwrSz7kM+PYJuDgcFRwmuHkjzSuOGmqkYh3BT98lUAOFgcATSnI7bd4CyFwzHmUwtr0Pcs4FNeKr6TsPd6uhAg6C3fWCJlUe+gpPUx7+kaK4cuEsjrUIUgDLpqR9h6xce3htaWNwjKn0ZZ+I3n/acJKJopjHk6T83pMsz+EoHVAZ/EOfLmR3hxw05hOi1vuiBj92YAiTylYRZN76ojGtf4huPsvM3Suiw6QCksz+4yRXzK4QLsh6FK1s7QNHDWKpeEDoRfhj1TBgO3X93DOu52GxNMTpg6JeFcI1W7ilsXDybYYDk3uuUjdSSA3972WIdppds8bdtWm1kHV1cFAQrWDQG2jLXUtef8xtNhOGARY5qFknNOwjOmEchel4BWK4dORz2EZs1C6VHH2mU7aPpH0fm7y2uYs2tNXV1ztiQxBCWHTln5TucOi2oohMqTt4weYb9Qr6okyjXiZUrgWtI9pTK+kAJYnC78HfvsB2Ls2XkjnXRTW3TSLwxCCs0yXTap232GJabN4FGGM3bp+QdM2rditFbMb1GtnPNgiQ1bIgEWmzcJRhJEmE7uU5Cd+gTUyxASWWDbvcppwx3gyndpnrYFYxkzKsc9aAa+X8TESTCNFe/rNdJcerGxCFtE6oBiswjQOxTSG6MwPaCQqdh2m0TqPacylNXUxM2e20NHRn0EiZQ+efAWl8xiitaaoxjLnMC19PeXEsrKzTyMLmIZ6rrLjzQJcFtTcZxuzIg0iBKmafic9pUcoBM/TJ+pyI77cSFvqGixzOi/+wGbvYOvQCk6r4nXvDjj9k/6oW7z+idtEj1Y8Gvp0onEoC/dfufX7oMcxUxisr7NxMF0YtE2RFAtPa75S6ubJ0CMPpICTTZu/SbUwbQoGrPcqyacKnazV/Z/PCj8kJ0P+Jd2GOYYCnoh3BcdagG3OqayKB2KQqtNPe6SUwlzFYqwswJo8XvgyShdoT19fV+C01pSCdURtsEE5A8Wxjoq8zwtepdt7AKUHpgNa9GYAm6KNFueCASvdySUGGed0ArVjkIAblYyT/ph8KPdRCFYyeDUu1R4K/tO0pi+nuCv+HLcfrSr5zaPHEoK/TrfyXilZrgLmGybHRFjQ7FOSn/pFXpEBphAsMW3e5mSwB30HLrNTrJBBbBeaM0ybzBQJmXyj1M0DYX9GSAl4QgZ8tdTDFzPx3TUnKz/3i1XC3cvTMuC+0ONye+gF2WhJxLuCEILW1JX0lMrZJmWvxemkrMVDZjOMBC9YRZTDeyA344cbce0Fscf2eA9RDJYTHd4pC1mUt2S5l/njg4QbBu5sSt1BZ+lO2tPXYpnDa9ShtaIUvoSUezGMNtL24oa1yR2MZc6gPXU9heA5QrUHIRxS1tGkBpk1e2EOYpqQBZWujTOOjQ9DtM5vXAXhTNPkDWaML6aSfKrYyeoBm5vPyoCXZcgX061VFZEX2ym6teLXXoENAz77ZuAsy+WjqdGt6AFyMuB3QYmi1mRNm6vGUGyKWvO0jL5qfEH6bJEhR0yxEMo6FZ8dsCoMEvEeL0yjibbM1ShVROoCltHecDEKVVfMPYpQbY8MGQAEcifFYBWRYR2aaXLPJu0cH3msLzfG2I0NGoHeS8F/ltb0pUM+NpR76SrdNaDdABT95bSmL8c2Z6K1LFvFoXGtIxtiHG2arbSYFw7xqHpXDOXPctENIa/8AjY/Xn1v6hDF4veOT/bCz4NilXD38oT0eTD0eMOgH/2bnAxX22m2a0WXlOxEkzVMDtsPsfu5V+BHfr6vguCu0OOhsMR
35Nhkq3Rrxb6Y+H0e2KwlRwxDkqTW3B2UWC59QHCyZXOZlRrTEEUcbsxVMIAzxsMZ1SefzWZt4PvAAsob7F/M5XK3N3Bc+00oO/DDzZjGNBxrwYhit4aRxmBsihvKDu/RjZnqmTKU+5FHC4thZGKFG8r2ZMNlOCIP0OM9XCXcUF6995QeJuUcT8F/qq/PuCmmk3FOGbaZ8v6Qso+n6D9f5X3ai22Wm34ZFtzwS/jtR322PWESlgQzFkuWfDDgiLPGp6nRKzL6M9HA8jCoEW8AUwiOECZHGCbH1R46InZIyc/8Qk3XnGUy5Ot79/IB3Mjj9ofpwmC2YbIxYtKageC4YdS1S635fLGLR2R/aOu+0ONpy+dzqdZxF/DTLYfHpV+zpEoDl4zhqhtGv/J+F7A7l8u9O5vNTgeWAZNCvLUO6SrdW2l3GgAC2zi8kgM9xn07h0HKPpoerzbtzTJmDeHwXu9LWT8+7VoLMET7MC3Khv5KKFWoFDLVEqhNBKXtDOyJIfUeur1HMI0ZNV0TG41ppMm4p9PjPc7A8IltHEGTc3bf3y2z4ZJve0gflA92Y/akh41T5/O0x0F/7g1LdMZkV/26u5t3ZWwyRmP3PSwhuMhy+R+/UPPK59gubcPYsLwrKFUJdy+/D33ODEtcPs4VpdfYKXIy4P7Q6/vGNwNvczIcG7HP0UhGK963Ar+s/F/Q8LKQ0dPjPVKJe/aiCdQWur37aE/fOOHtX9P2KShVKPffJg8Y2MbhNA/h8J6yFlP0o8vrlS6xJ/9zDKOZtLUY1z6y7z6tNUp7pO3jyftPEbd678Wx5g75HhT+EM8TteHnUQpeHHPxBsg4J+OYcygGL6J1OVWyOu2wH9OZmA50p1g2j0WIUAq42Gr8qncw9a4vClrz06DA+93Gz2h/5GQwgYdCjx1KMl0YnG25/Ik7vFDNCxHnrJfnwmDcxVsIwd+kW7k89Hki9DEFXGqlmDcOsftRvUIul+sByGaz7qrMiQAAIABJREFULZRF/LNDHTNtWgbLGpvNrJkzyxs2Wmv2bY5ZEcptZJp305w5MvL+8eUqwvAi8qV1WFY7GfeIYUwqLVh7z6Fj3yPoqhJ4gdL7yqtqBaHcSFPzpUxrXcqezmfp9p/H8zswjBS21UwQxvXSFrRkFjFn1iUYQ8T5tW5iz/rh25f1YjtB32fVS9Hbzs49D1Lyd2IIm+bM0cxsPxfL2l9z5xYgOvOml8FjGSseyuf5dXc3O8OQmZbFtS0t/Mkhh/Lqzp3ck+/vgZIRgj9qa+OC6WN/hfgmz+EnWwqxq651hh6z8/OXtPIxrelRCgP4TU8PPw4D5to217W01C02cncUIIwWcNe1xuUzjXqNiyv/xpNRTw/ZbHYucBtwcy6X++lQj9+7d/T9MeoxMJdZ64AgjHsdxe69WynmR+fIMjbMxwcK3cOtwz6RVqeJbu8hFN2UI6TVF6BKe+zc8yRdXQF5/wF0ZYUsVQ8yJsnCNA6hyTkb11jI7l3D+5wM0iPuzx0Gmaq884K/kh7vAQam9e3p2sW+7jW0p6/HMkffxGkoBufAjxV3+EVu9nr6Y8u+z0OFAodjcIHt8g9uK8uUj4ngYttlkbTHZVzTgAXCjExzAwgDOebjWCMD/qXUzfoBMfCf79nL51KtsS0EjgsFd8c83+ukGPMxj9f3ZvBrRjGqoFY2m50F/A74ZC6X+/5+jKvBWJgxfTgELo45b5zH01i01hTC51F0Uc/9JlQdFINlfcI9HFL20SMKKbmxxshNGKJ249UQrVVdELUOKnHp2hlF6T0U/CeHPZbJiqw41Q+eDhXwGoofB0VuC4tcZ6U4z3KZN86NqD/iNhF3jbV4P+K13VqxU8m+itrBaK15NvD5+2JnlXADrFGSm734RcGVdopzI+Jc55kOl1lju0E42Rjtt+XTlCfvz2Wz2c9Vbrsil8sNvxfpGFDu1308PV4Hg8PwrnV0XTeYqUAgN8eWxldjIUdU0j9y09Y
m9xxCtYdAbuq7TZCh2T0HyzyEvPdExVVHYZuzybinY5n9ol4MXoaIjJBeenOyJyu+1phQN7thswpZV8/nC3hOBry3uI8AOFwYXGKneK+TGZe9mZNslzfJFL8JSlW/lrPTad5uDi9spStmD6YQbJeSb3k9LJcBJTRHGRbX2Skuc/rj0AWl+Hypi2dkEPutWy4D9irFtEEbpr2TYag184UBAuYJizMshyvsiUkVnEhGG/P+GPCxBo+lIWScExCU24Uq1YkwUrjmkTS550z00IaF1hIwan68WgcUvAEl33UopyMOv4+2bcwe4SjBEA7t6evxwhyB3I4QNmlrSZ+FWHvmTWgdoNGRhUOjmTAmA48HHv8XFHlVhqSE4ETT5sNuU2SmRJMwcCG2MrKX3uujrVrxv36BZiG4wdnfmP/w+GiqhaWmzaOhT0h5xf2ew2bSuau8+u1Skl8FJXYqySGGyXV2immGSUFrvl3q5gUZUNRwpGGwUys2DcjjXq1C/sProUUYnG2XN2G/6fXwVEyhTi8eUEQzbcBtWmv+qdTFgwPj3RrShuQ8yznohBsO0CKdtHMCaecEtFbDbvg00ZSCNRSD5YRyN0LYOOY8mlPnYwiHUO6ls3gHUkf3A6/GjqikjKdsLnz6qMYshEHKPq6myrH/fjs2Ic61svR4T0CMcURvTvZk4vnQ58ul7v4UO63ZFnrs0JJ/S7fX+ETONExONO3YqsIoFPBw4I2beAO83k7x+gE5yb0bhi9Jn38udvPaAEG+LyjxyVQzP/GLPDPgfe1W8cU3d4UlzrZdQq15YRjn4mjD4rBBv9snQo+HIzYqV1daDHwoNc65npOAA1K8e5kqwu0Fr9JVup/eNZrWUApXoordtKWvI+89NqRw28YRGEY7XvhS/GPMuTjmQkK1Ha19LGMaaedUTGP8+j9rrfHDjYRqG445B1+uZXD83hQzqnKyJwu3B8XI3OjlMuT3ocdFEUUZf+o2s6fYFbsxGMWuIboI1uP+oMTdQYkdSjJNGJxnu7zFTo8qDPPfXqFKuKHcwfDfS3k2j+D97KwIuw/kY+LgvTQB19mpmonwWRnE5jatiSl4OtA5oMV7qlAMVhB1ce3LTXjBqxEmBP2U7dBOosk9je7SI8RvZDoIkQZ8WlJviAll1FLedNINmQiV8ugq3YkvN9EbNjFEG0Kn0eRB2Ljm0TSnzmio/Vyj2BqzutSUKyYvihjykabFzU3TuMMvckdYYoOSQwaMZo7wXOdkwC/9IsvDgA5U3zfgNa140QvZpiR/PoL+J694Hvd5eVbGrJI3aVlnu7yW6ZX3kwbmGSYvRvQDMYHTTJtr7TRn2bV57vWSV61xiJhIrXko9Ni1J6A1CLnUSk24N2gi3pMAGVv5qAjVNnSdlVjGPouMWy6NH2z+W42PH67BZw1euI629JvqrrilytPj/aG8QapDLHMmaftUXHv0GTs93sP4ckPVbUp3YhlppmfeP+EFVEPRWmd80+tUIzpCcL2b4Xo3w/LQ57HQA61ZIUNWD1rBWsBFEeIVx8sy4B+KXeyI+Y5o4FdBideU5M/cZubXKR7pVJIvlbpZ1h3UdUEdiXA7wEV2eaEghOBaO82rXndVBo6gvNr+yIAJ5vmgxIOhzxLT5g12iottlzuCUuS4mhF0K0nLGLWU3SElXyh1sUqFfWusX4sin0q3ctQENtJKxHsSIOr0UTGMVmzz0BrRg0r6ndNfUp+yT6QYLEfp+vnXodpB3n+S1lR0WYHWiq7i7QRqW99tvswTyg6EcS2OedgQ7yj6OeNK6kO1HV9uwLUmQwFVPOdaLi9EXL7PFQZvjKjs2yUld4flTI5zLJdjTIsllsOSiiHDXiX5WqmHZTKgE818w+RSy+W6EcS7b/ULscI9kKdlwJ5iF99qmhZrEvCvpR6eGFZM2sTTis0RIZDDhUGHLnecnyMMrrRTXDrg3FzipHCE4M6gyFYlaRcGr7dc3lbJSNkZhvx5qZPtlfd
0R+jxTS/PF9OtvMVJc4tfrOkZ+aD0WZnfxxtslw+4TQ1fBHzT6ykL9wDW6HJmzVczE5fBloj3JCBlL6THq00BNMUM0vZiLGMaYWnXIFG2SdsnVYUXTCNNk3seee/RGqOFwZQtyaIpBS9VCXcvijwlfxlO+vKh31QNclBl6KB71fgWPgykQ4bs05oFplXTS3sg19gpdmjJvX6J3WgEcLQw+dNUM+lBx93mF/iRV2BvZZ16i1/gEjvFX7rNfeIyzTD5fKaNTq3o1opZwqz7+lEMzpOux1otuSMo8uaIyWGLDHm+Tul5LwYQajjDcnHCgHWVK4e2SpHRR91m1qqQfVpxoulEThTn2y7nx1xdfGKAcPfSieazxS5+2TyDM02HB4ISD1SMLnrpQPHzoMh0w2joZm+XVqyImdBWyoANMmTBBK2+E/GeBKTtU5Cqm1KwGl25MLSMWTSnLkAIE8eaS3v6egr+C0jdiSHSuNaxpOyFEc91LK51FKXgRRTrKXgbRzyeUMVvjoZqpB6YZYSwsYzpBKp2khJkcK3a9zLWbJMhX6/kJReBecLgcjvFO2L6bAgh+IDbzI12mj+EHtOEyVkRaWobZcj3vTwDp6Mi8NugxNGGxTVO9Sq9bT+ccFJ1G5bVEhe336xlnaz7smiryr8NWrI5kLzVTvNu02KPUpxjuRxa6V0eZTwxHFbLoGaDtJdONHcFRa51MmzVil+FtT3bNfCHBmfq5LUmHxMo8oG9WsU0cR57EvGeBAghaEldSMY+FS98FdNoxrGOqrr8s8xDaE1fMqznM4RDxllKa/sJrN38XaLMCSwzPrfbEHXCOGL0VWwpZwlBqYPBjatSdnZcM14AlNZ8sdRddTm8SSt+4BdoEwZXOfHnoN0wudrJ0K0V27ViFkbV5tXdQYmo6wgNPBn6NeK9P5xs2bzsDz/b4gXp8Wvf5I129YbbsYbFdAR7IoTKpfYbJIH7Ao+3O2la7OhYc6A1dwUl1suQVsPgWjvFjDpx6fUyrBtP316ZeLbVudrYux+ZOlHMEgZHGRZrIjZZ5wiD141x58B6JOI9iTDNFjJm41x7XHs6GeckCv6zDGwiZRmH0eScGXtcyl5CMVgZkS9u4NrHjHo8aftYBBalYCVSdSJECtdaSMYZmfN7I/hdPs9LET/IgLJbez3x7lSSr5XyvCB9utDMM0wut1K8zS2v+Ip10uGKWrEi9Ll1QKHPSabNn7jNpEYRq73JaWKTlDwp/b4qySix7eVVpfia18Ozoc8/pFv7BLzdMDnHdrk9qM16ins3HSieCAMudWoFeY+SfLbYVXWO7/ZLfDTVzHkxIZMlpk29dmeFimgvMqzYxx3W4E1LQwiutlPc7PVUbZbalEv1x9pkuB6JeB/gNLuvxzYOxwvXoAiwjZlknKV1U/FMI02LexE9/qN9hgoGTaSc40nbi/drPCn7aFKxfVHGj/V+bQP9XnbVWdnpyop9YIHKBiX5Lz9PSpQdb46zbH4TRtdVthoG/1jspqM3YVDDOiV5TUm+lG4b8WabAi6zXI43LTqVYqUM2aDDWPHu5VHpc1dQqroK+HO3mTSCp3VARxhyqGFynulwV+ixM2JFawAzYrJsvuPVTo4dKP7L7+Esy4mM7R9uWpxgWCyLsRb7g/R5r5KcYTmcZNo8NygWnQauGAMDhKudNC1CcG/gsceEVqm5yHK5ImaC97Xmd0GJ3VpxomlzsjU2PYcT8T4IcO0jq3p8D++YhTjWArzwFZT2cK1FmMb49koeS45yynYIUQJ+SJ3V2wsyiKwSDIEHA483ORkusVzuM0s14jLfMAmV7hfuATwjAx4PvapKx6H4hVfg9qDIFq0QwCwEO9DDTuVbJn2uGZDpZAnBn6aa+fQhzWzq6CKDwBSCLUXFfREx5nkYPB16PBP6XGC7feYDWuv4HHGleDD0uCzmfX4l086benbVNPMC2APcGZR4l9vE36da+KaXZ5kM6NGK+YbFNXYqslCqEVx
gp7jATg3ZVXBZ6PM1r4cNlQWADZxuOvxdurXhq/REvA9QpOqm5Hto7Yw6dUoIk5R9bINHNjm4pKmJ4w2LlYNWeQ5wSR0zhDUyjO2B3ZuyZwrBP6Xb+IGXZ4UMCNAsMmze4aT5Qin6h6+AVTLk9cMMoT4clPi+n+9bYWtg+4gysOO9mSwhaBmwgfpht4kOrVgug75XaEGwFcUvKmGW3wRFrrbTfDjVTKg1u+vEnot17nOE4HDDZG3M1U/vCFoNk0+nW/G0poSmFTHhdQKh1nyj1MOGAbn7AfCY9PmO1zOiQqnhkIj3AUYgd1SKa7ayOx9iGbNIO0tJx/QfOVgxhOAzqVa+4ZWbKxWABYbJlXYq9nIY4EjDjI23zhggeKnKKnYwzXUEpnkEGScPhN6QoZGhOGWYl/PTDJN/S7fxSOixVoXsVYp7Qq/qHBSBXwVFTjQtlsv4sE0bcOEQTkFHG1akeKeAswYd6wpR1wR4PHkk9PpSJwfz/Aj62wyXRLwPIJT26SrdjVR7+m4L1Q56Sg9h0oxjD21xdjBxmGnyT5l2ditFl5bMNawhS55PtxxOMG2WDfoxGhCbuzyQMy2HFwasYPvGIgyucYZ/yb83JuVvuFxo9fe/Xi0DtijJyabN9JiQkSFEOXQA/HOxK3LyCikL2Et1eo2YQOsQk9TbnQyrZMDmQSv0iyx3zH0h94e45lxQtpbTWjf06iAR7wOIov9ClXD3oilRDFcm4h3DDMNgxjB9SYQQ/K3bwte8bpZV8sMPEwZvsF1uHIZ/4g12mteU5IGg1JdXPUcYfNBtplMpbgmLSK0513Y4to655vAj29WkgPfbTVzvptmqJP/u9VRCOzANwfm2yz/q6iuGHiX5vl9gpQwJ0RRU/GsXNOypExbZR3nfYGmdVf980+Jf0m3c4hdZr0LSQnCG5XD9OPtTjpSzLYcf+vlIf6n5htnwsE4i3gcQqk6VolIjsyxLiGeWafIvmXa2ypAOrVlkWjUVlnEIIfh4qoU322keC32aheAyO8Uv/QJfLnX1/fB/ERS5wnL5eKql5kffXclOiWOhYbJNychNvxKwV5Q3OL9S6mGF6r+C2Ivm10GJ2Xv38lbKK9xQaz5b6mLZMDv3vc60eE3Lvg27wSjKJhVLqR+ymWNafDw9Ph6jjeII0+IC2+WOoDpo1IbgujGYeBLxPoAwjPgvu2EcfP2Ox5rDTYuBXccDrenRmhYhIsMvvtZ81+vhORmQ1+X88GvtNOtVyI/8QlWcOAR+G3oc5hm8c0DsvEtJPpjfR70611NNhw0ijDU92KMUz4Q+q1T0/Q8VCrzVLbse3R2Uhi3cJ5o2b3YyeFqzQUW3tmoCtivJl4pdZITgCjuFpzV/CD0EgvMth+PGKLVuPPi428JhwuTJ0KcbxRxhca2d4vQRNBsbLol4H0CknZMoBi+h9GCHeJe0dfyEjOlgINSa73h5ngg99mjFoYbJ+ZZbZWe2QYZ8sriPHQMKeHZKRU4GnGDasRt8/xeUqsT7f70C24ZoKjvPMCt9s6PFeZZhsknJ2GKYvVL2xWdzdYR7ljCYVYlfv860ebeTxhWC97pN5GTIkxGTg4ngZwMKgXqLgnpf5bZK1spHpqi5giEE73KbeFdMi4VGkoj3AYQhXFrTl5H3HiWQ2wCJZcwkbS/F2Y9Wrgn1+brXw28HCNIGJdnoF9Bo/thtZoMM+avCXnZFHNsNrK4jkPvQ7Faqrxjm2Qg3mYEcg8FldopjzJDHQ6+m3H2uMLjeTrFVK1J+tEXb4ZbVN+lk6kSDsqbFF9K1ZtNCCL7U1M73it38NizH9lOU0wsHpzMOfuce8OugyMmmNaKc94ORRLwPMBzzcJzMjYRyL+3THLo701PGUWgq0qkkjwXRTZJ+H3i822niF34hUrh7qc096UcB3Vr2bagaQtRtqH2B5WIJwSLT5q9SLfzML5BTIRZlf8qb3CZaDZNWTE4zHf4wqJOgA1zb0tJXX3+FnY7s1WJ
Q3qCL48nA457Qo7e3ZR4oDHOTNQT+1y9yquVOaPn5ZCcR7wMUy5xG2m2hR0xcq9X9ReuQYvASWhfLFm7W5PO1XKPCyGZOUDYU3q0Vrw7RtnW2YRIoGbkKnoHgm6Ue9qCZIQxmCyM2lxhAGf1id7btcpblsEsrLESNG/tn0q38XaGTZSrApywGC4TJ0lSK3cUC04XBkabFTW4TP/EK7K68zybKot6bahjFz/z+x/cykvyY1SrkT/N7+Uy6lYUTaHgwmUnOSsKkxAs20uM/NCD10cIxj6ItfQVCjI1jymiYK0yaILKd6vRKq9ehLv7PtFzOB/7bL1RFs03KGSC7K1WgryJxKP9oowItBnCcUZ0HLYRgZsz5WqsCXlFhX4/HkLLJwHWvvYYFLDQsrnPSXO9kuNByuSfwCNBcYLnMqyOou5WK7MI3Ul7Vkm97PfzrBBoeTGYS8U6YdGgdDhJugBBfriHvtdKcOm/CxjaYw0yLU02HhyOMDE43bdJCsNRyWB7TtvUi0+E9TgZTCA4RBveFHru04hAMNqiQXYPWqz5wCLCX2irPU02bU6zhF7H8xi/1mUUMRFb+vaRCNpW6aUFwpu3ydnd4fbItUd9zciBx/WV6WSkDOpRk5hhZnE1lEvFOmHSUgtWRxUZAxbx4cvGJVDOy1MPz0qdAeWPuDMvho5VeFu90MmxQIY+Gfl/+RwZ4h53mXQOyKi5z0lxWKc1fJwPeV4hOCNwF/LGd5mkZsElJMkKw1LT5sxFagG0dhgtPD3BnWOLMEaS6tQmDxaYdmaq40DA5xbDZrBVNAs4wbJ5TIfdENL6CXsd5xcxhTwcHD4l4J0w6lI4qL+m9b387ejSeVsPki5k2NsmQdSrkOMPisAFhBUsI/j7dxvOhz7OhT1oI3minmFZnNdkmDNIQabibAi63U7zbbaKIxiE6r3wohuves3MUpfjvc5vYVupi04BjDxUGf+I01UwEF2rNK4WQdRGTyTGGxTwjkakoRn1WstmsAdwMLKG8N/3+XC63tlEDSzh4cawF5P0niYrsWsaM8R/QMJlnWnVjwUstp25Z+EAOMUyWmDZPRqxeTzJtDq28TmY/mjJdaDs8I/2YbPB+po0iW2mRaXNzZhq/8otsV5J2w+A6K81Ms3bCsoTgTXaab3s9VVWhTZRd5Y0k4ySS/ZnS3gSkcrncWdls9kzgq8C1jRlWwsGMbR6Kax2NF66uul2QJm2fNEGjGn8+4jbTWerm5QGbf8cZFn/mNqaA5RI7zValuCMo0RHTj8QFLrZHV/HYLAz+aJjFKlc7aQ4RBveEJXYpxUzD4HIrNaJwzWRiu5T8xC+wppKmeYJp8163aVRuSXEIXceyqR7ZbPbfgKdzudzPK39vyeVyR8Q9PgyltqwkbpUwPLRWdOx9mO7CWpTycJwZTG89nZbM+BsVTyRSa+7s6WFjEDDftrmqubnG8Hh/6VaK+3t62Ccl9+fzrPR9JDDPsnhLSwvvmzatoa93oLMzDPng9u2s8as3sc9KpfjP2bNHE+KKPGB/Vt6tQOeAv2U2m7VyuVzktvrevfFxzP1hKGeLg5mpfm4Ep9Lq9vtblvJQyjfm/Uylc3MWcBY2SNhTGpsGY+cg+Knn0RlKDKAJQVabXBKYU+Y8jQfD+d7cXOpmTVCbffREqcRPtnVw5QibVM2cGd2zaH9K77qAgc9qxAl3QkLC5OaXfoHv+wXWK0kAdKG5T3r8c7FryGMTqllfJ4vn5bBxErk/4v0YcCVAJea9siEjSkg4AMhrxbLQZ/swO/JNNPcHXmThz7PSZ80YuMAcyKTqbCKnGhjx2p+wyW3AJdls9nHKMZmbGjOkhISpi9Kab3t5Hq44rmeApabDx1PNsS41w+UVGXB3UKKgNUeZFtfa6SF7f6wIfe4MSuzRipnC5GonxXGD3GhCrdkZs1osAi+GAYsmsYPNZONMy+Ex6dcUHzUBlzaw2daoxTuXyyngQw0bSULCAcCP/AK3Bv3
Z2QXgUenjl7r5yijKvLXWLJMB9wUeD4f97juEHg8FHv+Ubo2dFH7nF/mG1zOgqVTAE6HHx1MtnDsgi8MEphkGeyIE3Iakt8gIucpOkVMh9wWlvn41bQje7mQ4poGTYPKpJCQ0CF0xFYhimQxYFfosHoHRwLKw7Dr+csyq+GUV8gOvwCciHGek1vwiKNZ0A9yL5ud+gXMsp68aUwjBOZbLOr82qeBE02bJFDZHmAiEEHwi1cJVdopHQw8LweWWW1W41QgS8U5IaBA+8Sa0PvCKDGvEW2vN/wVFHgw8OrRipjA433a5ynL5SqmbrXX8IAFejHHDycnoikWAnArZrhWzBzSseo+TIa8Vvw98dqFIASebNh93G2tFFmqNjyaNaLin42TjWNMeU8PkRLwTEhqEAxxqmOyL6KiXBhZHNI36X7/AD/1CX5OpDq1Y7YU8GXhDCjeAimnr5FSaQ0XJtwUMHokhBB9JtfBHjmRLs01zt8/cBq4U80rylVIPT0sfj3JvlzNNh0+nW5MKylGSdOlPSGgQQggutJzIH9UpplMT7/S15v6gVCOwCobdUjVrRK/sFhoWx8X0BGkVRmw+RKthcl5TU0OFW2vN3xa7eFj6FCm/vx7gfunziZjmWwlDk4h3QkIDeZuT4T1OhnmGgUnZTOESy+XTqdrwwxYVsilmdR3VH3wwCwyTdzvRbVqFENzkNHFYRF+SnVrxqUInncPoKtgIngsDVsZMRstUyEtDWLslRJOETRISGogQgve4TbzDydChFW1C0BTT2GmaMGlB0B0R+sgAhwuTtYNccyxgkWFxgmlzg5PmkDrph6fYDtfJFN8OajciX9GSXwRFPtCgPin1eFnFG71p4Anp87pkU3TEJOKdkDAG2EJw+BCOP+2GwVLTjjRyOMV0+LDbxLe9PC/KAB/NQsPiLU6mKs1vKDbWsUxbN04FRIcOkd9u70dnxIOZRLwTEiaQv0g1Uyh1s0wGBJR/kEtMm79INTPDMPlCpo1iJUNjuP23B1KviKeRHe7qcbHl8p/0RLr2pIE3Ji7xoyIR74SECWSaYfL/Mu28EPq8IkMWmhZLTbsqjS4tBOlRrk7fYLncE5RqTB0EcMY4hSosIfjHdCt/W+yqChE5wAedzH5Xnh6sJOKdkDAJONlyOHkMxHSx5fA2J8OtfoHefoQucJmd4oo67u+N5njL4TdN0/mhX2CVDJlvGLzFyTB7nKo39ynJnYGHj+YsyxnT/OvxIhHvhIQDnPe4TVxkufwuLCE1nGs5HDcBG4SGYXBTauw3SAdzp1/kB16BXZQze27xC1xkuXwi1TKlc8wT8U5IOAiYa1q8zxx/4ZxotsuQ73l59g0I1xSBO0OPBYHFDTGpllOBRLwTEqYYJa35ntfDMhlQ0pqFpsVbnTSLzSTdbjB3hqUq4R7I06GfiHdCQsL4oLXmc8VOnhnQY3tL6LNahvxjupXsARDLbSTFOi6PpVFaQE4WkgrLhIQpxKOhx7MR5gg7teL//ME5JQmLTSs2T2dBTPuAqUIi3gkJU4hVMoytVtw0TuXuU4nzLZfTI65G5giDG52ReUlONqb21JOQcJDRXCc7omkKZ06MFYYQfCHdxg+8PMsrlarHGBZvczINbb41EUzt0SckHGRcY6e4PSixc1BDK0HZfiuhFlcIPjQBKYpjTRI2SUiYQrQaJh92mzl8QKl8E3C1neIt9tQOAySMjGTlnZAwxbjAdjnTcrg3KJHXmnMsh3lTPASQMHKSTzwhYQqSEoJrJ+GGm9YaSbmfScLYkoh3QkLCfpNXkm97eV6QAUWtWWBavNlO8/oRtK9NGBmJeCckJOwXWmv+odRdVTi0RwaskyGfBU5LBHxMSDYsExIS9osnQp/nIwqHOtHcHpYmYEQHB6NaeWez2Tbgx0Ar5ba8H8/lck80cmAJCQlTg9XHBb/PAAADpklEQVQqiHSpB9iSFA6NGaNdeX8ceCCXy50PvBf4VsNGlJCQMKWYXsf
urTWxOBszRhvz/nfAG/AcybVRQsJByhV2ituCIhsHrbIFJBuWY4jQQ3TWymaz7wP+ctDNN+VyuWey2exhwN3AX+RyuYfrPU8YSm1Zid1RQsKByLPFIl/avZuX/bKZ8nTD4KrmZj45Y0aVpVvCqIg8gUOKdxzZbPYE4OfAX+VyubuHenxHR/eY9F+cObOFjo7usXjqKU9ybuJJzk08oz03UmseCz12K8U5lstM88BbrE3E92bmzJZI8R7thuXrgFuBt+ZyueX7M7CEhIQDA1MIzkuc4MeN0ca8/wVIAV/PZrMAnblc7tqGjSohISEhoS6jEu9EqBMSEhImlqRIJyEhIWEKkoh3QkJCwhQkEe+EhISEKcioUwUTEhISEiaOZOWdkJCQMAVJxDshISFhCpKId0JCQsIUJBHvhISEhClIIt4JCQkJU5BEvBMSEhKmIIl4JyQkJExBDhgD4mw2eyzwFDArl8sl5hAkdnVRZLNZA7gZWELZUOT9uVxu7cSOanKQzWZt4PvAAsAFvpjL5W6f0EFNIrLZ7KHAc8AluVxu9USP54BYeWez2Vbgq/S7+ySUSezqankTkMrlcmcBn6L8vUko8y5gdy6XOxe4HPjmBI9n0lCZ2L4DFCd6LL1MefHOZrMC+C7waaAwwcOZbPw75S8cJHZ1vZwD3AOQy+WeBE6d2OFMKm4FPlf5vwDCCRzLZONfgf8Etk70QHqZUmGTGEu2jcDPc7nc8kpv8YOSYdjV/Rj4i/Ef2aSjFegc8LfMZrNWLpc76IUql8v1AGSz2Rbgl8BnJ3ZEk4NsNvteoCOXy92bzWb/dqLH08uU722SzWbXAq9V/jwTeDqXy503gUOaVIzUru5AJ5vN/hvwZC6Xu6Xy92u5XG7OBA9r0pDNZucCtwE353K570/0eCYD2Wz2EUBX/p0ErAGuyeVy2ydyXFNq5R1FLpc7uvf/2Wx2A3DphA1mkpHY1UXyGHA1cEs2mz0TWDnB45k0ZLPZWcDvgI/kcrkHJno8k4WBi8FsNvt74EMTLdxwAIh3Ql0Su7pabgMuyWazj1OO6940weOZTHwamAZ8LpvN9sa+r8jlcpNmky6hnykfNklISEg4GJny2SYJCQkJByOJeCckJCRMQRLxTkhISJiCJOKdkJCQMAVJxDshISFhCpKId0JCQsIUJBHvhISEhCnI/wci1vGoRc+OWgAAAABJRU5ErkJggg==\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from sklearn.datasets import make_blobs\n", + "\n", + "X, y = make_blobs(n_samples=300, centers=4,\n", + " random_state=0, cluster_std=1.0)\n", + "plt.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='rainbow');" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "A simple decision tree built on this data will iteratively split the data along one or the other axis \n", + "- according to some quantitative criterion, and \n", + "- at each level assign the label of the new region according to a majority vote of points within it.\n", + "\n", + "This figure presents a visualization of **the first four levels** of a decision tree classifier for this data:" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "![](figures/05.08-decision-tree-levels.png)\n", + "[figure source in Appendix](06.00-Figure-Code.ipynb#Decision-Tree-Levels)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "## Notice\n", + "\n", + "after the each split\n", + "\n", + "- Nodes that contain all of one color will not be splitted again. \n", + "- At each level *every* region is again split along one of the two features." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "This process of fitting a decision tree to our data can be done in Scikit-Learn with the ``DecisionTreeClassifier`` estimator:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "ExecuteTime": { + "end_time": "2018-12-26T06:58:23.626281Z", + "start_time": "2018-12-26T06:58:23.573622Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [], + "source": [ + "from sklearn.tree import DecisionTreeClassifier\n", + "tree = DecisionTreeClassifier().fit(X, y)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "Let's write a quick utility function to help us visualize the output of the classifier:" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "ExecuteTime": { + "end_time": "2018-12-26T07:00:13.511603Z", + "start_time": "2018-12-26T07:00:13.476742Z" + }, + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [], + "source": [ + "def visualize_classifier(model, X, y, ax=None, cmap='rainbow'):\n", + " ax = ax or plt.gca()\n", + " \n", + " # Plot the training points\n", + " ax.scatter(X[:, 0], X[:, 1], c=y, s=30, cmap=cmap,\n", + " clim=(y.min(), y.max()), zorder=3)\n", + " ax.axis('tight')\n", + " ax.axis('off')\n", + " xlim = ax.get_xlim()\n", + " ylim = ax.get_ylim()\n", + " \n", + " # fit the estimator\n", + " model.fit(X, y)\n", + " xx, yy = np.meshgrid(np.linspace(*xlim, num=200),\n", + " np.linspace(*ylim, num=200))\n", + " Z = model.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)\n", + "\n", + " # Create a color plot with the results\n", + " n_classes = len(np.unique(y))\n", + " contours = ax.contourf(xx, yy, Z, alpha=0.3,\n", + " levels=np.arange(n_classes + 1) - 0.5,\n", + " cmap=cmap, vmin = y.min(), vmax = y.max(),\n", + " zorder=1)\n", + "\n", + " ax.set(xlim=xlim, 
ylim=ylim)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "Now we can examine what the decision tree classification looks like:" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "ExecuteTime": { + "end_time": "2018-12-26T07:16:21.081356Z", + "start_time": "2018-12-26T07:16:20.926405Z" + }, + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "visualize_classifier(DecisionTreeClassifier(), X, y)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "If you're running this notebook live, you can use the helpers script included in [The Online Appendix](06.00-Figure-Code.ipynb#Helper-Code) to bring up an interactive visualization of the decision tree building process:" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "ExecuteTime": { + "end_time": "2018-12-26T07:16:48.072333Z", + "start_time": "2018-12-26T07:16:47.884276Z" + }, + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/datalab/Applications/anaconda/lib/python3.5/site-packages/matplotlib/contour.py:1000: UserWarning: The following kwargs were not used by contour: 'clim'\n", + " s)\n" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# helpers_05_08 is found in the online appendix\n", + "import helpers_05_08\n", + "helpers_05_08.plot_tree_interactive(X, y);" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "Notice that as the depth increases, we tend to get very strangely shaped classification regions; \n", + "- for example, at a depth of five, there is a tall and skinny purple region between the yellow and blue regions.\n", + "- It's clear that this is less a result of the true, intrinsic data distribution\n", + "- It's more a result of the particular sampling or noise properties of the data.\n", + "\n", + "That is, this decision tree, even at only five levels deep, is clearly **over-fitting** our data." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "### Decision trees and over-fitting\n", + "\n", + "Such over-fitting turns out to be a general property of decision trees: \n", + "- it is very easy to go too deep in the tree\n", + " - to fit details of the particular data rather than the overall properties of the distributions they are drawn from.\n", + "\n", + "Another way to see this over-fitting is \n", + "- to look at models trained on different subsets of the data\n", + "\n", + "for example, in this figure we train two different trees, each on half of the original data:" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "![](figures/05.08-decision-tree-overfitting.png)\n", + "[figure source in Appendix](06.00-Figure-Code.ipynb#Decision-Tree-Overfitting)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "It is clear that \n", + "- in some places, the two trees produce consistent results \n", + " - e.g., in 
the four corners\n", + "- while in other places, the two trees give very different classifications \n", + " - e.g., in the regions between any two clusters\n", + " \n", + "The key observation is that the inconsistencies tend to happen where the classification is less certain, \n", + "> ### by using information from *both* of these trees, we might come up with a better result!" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "If you are running this notebook live, the following function will allow you to interactively display the fits of trees trained on a random subset of the data:" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "ExecuteTime": { + "end_time": "2018-12-26T06:58:27.275825Z", + "start_time": "2018-12-26T06:58:27.101954Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/datalab/Applications/anaconda/lib/python3.5/site-packages/matplotlib/contour.py:1000: UserWarning: The following kwargs were not used by contour: 'clim'\n", + " s)\n" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# helpers_05_08 is found in the online appendix\n", + "import helpers_05_08\n", + "helpers_05_08.randomized_tree_interactive(X, y)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "Just as using information from two trees improves our results, we might expect that using information from many trees would improve our results even further." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "## Ensembles of Estimators: Random Forests\n", + "\n", + "Multiple overfitting estimators can be combined to reduce the effect of this overfitting.\n", + "This notion is called **bagging**.\n", + "- an ensemble method (集成学习)\n", + "\n", + "- Bagging makes use of an ensemble (a grab bag, perhaps) of parallel estimators, \n", + " - each of which over-fits the data, and \n", + " - averages the results to find a better classification.\n", + "\n", + "An ensemble of randomized decision trees is known as a **random forest**.\n", + "\n", + "This type of bagging classification can be done manually using Scikit-Learn's ``BaggingClassifier`` meta-estimator, as shown here:" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "ExecuteTime": { + "end_time": "2018-05-21T09:22:51.279359Z", + "start_time": "2018-05-21T09:22:50.701772Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/datalab/Applications/anaconda/lib/python3.5/site-packages/matplotlib/contour.py:960: UserWarning: The following kwargs were not used by contour: 'clim'\n", + " s)\n" + ] + }, + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAW8AAAD3CAYAAADSftWOAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAAIABJREFUeJzsvWd8XMd59n3NnLN9scCi9w4CBHsBG9hJkRRFSqSKZUuWbMWJHetJXpc4j2PncSwn8S+OHTtx4iROLFm2ZUuWLFmUqEZSYu9g7yB671hgezln5v0ALLANjQAJgDj/bzt7Zs69i8W1s/fchXDOoaCgoKAwvaCTbYCCgoKCwthRxFtBQUFhGqKIt4KCgsI0RBFvBQUFhWmIIt4KCgoK0xDxnt3p84t23rN7KSgoKNwv/Ori3kjDys5bQUFBYRqiiLeCgoLCNEQRbwUFBYVpiCLeCgoKCtMQRbwVFBQUpiGKeCsoKChMQ+5dqKDC1KKyKwUV3WuhFbuxMuMQtKI02SYpKCiMHkW8ZyIHq7eg0fo5cEQBANrsm/FA3t8jTu+YZMsUFBRGieI2mWl4ZQGt9ocHhBsAPPJsnGp4dBKtUlBQGCOKeM80Oh3R8LGksHGvnDgJ1igoKNwhinjPNBKNPVALjWHjOlX4mIKCwpRFEe+ZhkgZMqPfgEDa+0ck6MRzWJv19qTapaCgMCaUA8uZSGnmKRQnXMaFljUwadqwJPUSKJlsqxQUFMaAIt4zFbPOiU25+ybbDAUFhTtDcZsoKCgoTEMU8VZQUFCYhijiraCgoDANUXzeCveGs40L0WZfAlGwYXnae4jVOyfbJAWF6Ywi3gp3nw8rPoV2x6cAqAEA+6pKsS7ru0g19UyuYQoK0xfFbaJwd3F41eh2PQC/cAOAV87BhZZdk2eUgsL0RxFvhbtLl8sEicWGjftYzCRYo6Bw36C4TRQiwzhwoOpRWFzLwEFh0lzG1vzXIFI2pnXSorqgFmrhlQuCxo3qqok0V0FhpqHsvBUi83HVI2i1PwOPXAyvXIRO55PYV/H0mNcRKEe++RWoaB0AgMABk+YTbMh5f0LtbeiNR7PNPKFrKihMYZSdt0JkLO4SAELQmNW7GMArY16rJP0SFqR8FVdaixGnb0eOuW1CbHRLIg5V70KX60HI3AwCL/Sqy9ia/xNEaTwTcg8FhSmKsvOeqjTbzDhQtQNnGheD8Ttfp7E3FodqNqO6O3nijLsD1IKMpWlXJ0y4ZUawt/xbaHc+C5knABDBoYfDtxKHap4FAFg9Whyu2YhLLUUTck8FhSmEsvOeihyu2YQG67Ng3Ixmm4z63gvYMeufoFONrVXZRxVPoMPxMBii0dBrx42OA9hR+PKo5pq1Z9Fin4PB3TeHSX1hbC/kLnK6cQWcviURn3P68nCyfjlqer4AiSWjrteLyu4LeGjWD8f8HiooTFGUnfdUw+UT0WR9FIz7/bcCnL4SHK3bOaZ1qi1JaHfsAkM0AIDDiC7XdlxsmT2q+Zvz3kWy8TfQCDegFm4hQf86thX8dkw23E1snjQM9fkVqA11vZ+GxPy/NtRw+FbgSO0j98w+BYW7jLLznmo02xIh8bSwcacvfUzrVHWXBLU660ODVvs8ADdHnE8JsDX/bQCDdb4vNM9BQ+9aAByZMYexKOXWmGyaSLJjzqLd8Rg4DEHjBFYkGo6jtucvw+Y4fVn3yjwFhbuNsvOeaqREdUCkrWHjOjF8bDiSjbcAuENGZcRo6u7IrmN1pbjW/rfo8TyIHs92XG37Dk7Ur7ijtSLRZI3F2caFsHk0o7q+KKEeScY3QUln/4gLWuEaFqd8F8vSTkAcaDYxiFroDBtTUJimKDvvqYZe5UOS4V00257u3zlz6MQrWJ31zpjWmZdUiaruo+j1bEKf35rBqD6N5Rln7siuFttWcBgHHnNEocm6DcDpO1rPD+PA+7efQ49rExhMqOhuRbrpVazLPhx
23b7KJ9HjXgbORRjVV7Gt4GXYvR+h1pKNvNhqRGtdA9cnGj9Ai+3pgZ25WriNFel/HJetgVxty0eLvRgZpsuYnXBnX4gKCuNAEe+pyOa8D1DZdRGV3atgULdjZcaJMSfHAMDDRT/DmcYz6HYVwqSpxaqME3fcMUdiprAxmYe6ZcbO2cYl6HbthP+zKLFkNFqfhtVzGibN4C+Hj6seQbvjM/D/WrS4c/BRBcHDRb9ArO562LoP5L2H6+3XUdezEmrBglWZH0Ov8o3bXgB459afo8e9EYAWLTYXqro/wI7CX0/I2goKo2TmiXezzYyG3hwUxd8M2qlNNfLjWpAf99a41qAEWJlRBqBs3PboVVXo9eQGjenEynGv2+Gch9DPocSScK1tIVZlDu7qe9xLEOrms3vnD7v2nMQazEmsGbeNgZxvnose92YM1mrRodv1IMo7D6IwvmFC76WgMAwzS7zfK38WFtcDYIhGRVc7UqPewMbc/ZNt1rRgdebLOFQbDadvHgg49KrLWJv9q3GvqxM7wsYIHEiJChVdOcLssf8aGS8djmIEFtkCAA496nsXKOKtcC+ZOeJ9rmk+ulwPw/+PJ/NENNk+g07nCcTrHZNrXAQYB/ZXPgaLuxSca6BX3cDmvF/AqPZOij3xBjuemPOPaLaZQcCREjVYzvVk/XK02leBQEaa6SiWpV8a9bqrMvdj761SuOW5/SMM0dojYck88fqTaLDOw6BwckRrzo3vRd0BcfrbaHNICP7fcSEt6to9t0VhRjNzxLvNMR+hOybG43CtbTnW5xycHKOG4VDNNrQ5nob/b9TrycCBKgG7Z//7pNqVGmUJenywegsarH8KQAsAsHWuhJf9DKszT4xqPb3Kh51F38XxugfhlhIRq7uJVRHmbsw9gIPVFJ3OVeAQYdJcwpa8N8f9esbKktRLaLQegdWzFoAKgBdm7ccoTqy957YozGhmjnjrVS0RRl1INo7fb3s36HYtRejfx+GdC8Zxx4eOd4N2R9/BnR8OA1qsmwAMLd71vfHociZiTmI51IIMvcqHLfnvjnivjbn7AExux3tKgEeKforzzYfQ5SxAsvE6Fk5ivLvCjGXmiHdp5mF03FwHl7Swf4QhWnMMRQn1k2rXUBCEp3GTiH7fyYVxfdiYHGEM6KtH8m75/4HNUwoOA2521KIg9pcoGYOb5V5h9WjR2JuMbHNTWJQKJUBJ2hUAVybHOAWFmSTeakHGzsK/x4n6rXBJyYjR3kZp5rHJNmtIko1HUWVZBEA3MGbSnJtSu24AICQ8Ysegqoh47eHabbB6tgw89rFsVFqexeLUyxDoOKpvDYNbEnG5ZQHMulbMim8a1Zx9lbvR4XgIMk/ExdYmpEW9ifU5n9wV+xQU7pCZI94AoFNJ2Jw3sXWk7xars06CQUSbbR0YNDCpr+KB/D/cczva7CZcat0ESnxYnv5xUOz1/spdYU0WNPQGNuX9JuJaVk9B2JhXzkZ9byJyzG1wSyLONq4CJRKWp5+GShhfNElZ40JUdP8pfCwTgAtX28uws/BfoRaG/gVzoz0bbfYnwdH360FiaWiwPoN2x1kkGmzjskdBYQKZWeIdiZP1y9BiXwNwikTjaazJmtjd+MWW2ajoehJeORUqoQ05MW+NOhpjbdZRAEcn1J6xUNa0AOWdfwmZJwIA3ivfhiWpP0RhfJ+rqctVitDPEKXuIZNhVEJ4w2GBdCNO34sb7dm43PZVeOVcABwN1gqsSP8hcszhae6jQWYElZbP9gs3AOhg967FoZqq/potkanrXT4g3H4Yj8XVtlJsyv3ojmxRULgLzOzaJkdq16Gi+69g966D3bcG1Zav4JOqBydsfZtHgxvtfwmXtBgyT4ZbWoDbXX+BDodx5MlTgBrLowPCDQA+lonr7Y8NPOY8vA5JpDE/s2IPILTeil51AyaNGzc6PtMv3ABA4JVn4WLrU3dse6vdDK8cXojK4c0Zdp5OjPRl4YVZp8RwK0w
pZrZ4t9i2I9CnDKjR4Vw3YeuXNa+DxIOrAco8ERdbNk/I+m5JhNOnmpC1IuGTE8PGvAFjBvWNsOeNmqHjnW91bkdgZIp/PcYBrxxeSdErpY7B2mBidVaItCtsXC10DzuvNPMItGLwQaRBXYbFKeEp+AoKk8jMdJtIjOLdW1+BRy4Me45xXYQZd4ZII6ffC3R8Lbp8MsUHFV+EzbMEHCL0quvYkPOfiNUN3q+hNw43O1bDqGnBkpTz4CDQipEbEXQ6oiBziiRjb9C4WmyC5AsWVY3YDAA4UrseXikeAukA42oIxIEozXlsyXttSLs9EcTYx1LgkUSoaHuAi6MPlRCefTladCoJ8fp9aLV/Bv4vDLVQjcUpe4adpxIYdsz6exyv2wGXnAyjqg5rsxV3icKUY2aK95HarbB5N0R8zqCauJjdlekn0GR9BF551sCYitZgRfr4IhcOVH0aPe7tA4/t3rU4UuPD7uKfAgAO1mxBU+8zfY0Y7BwVXR4QOKFX3cbKjJ8jzdS3+7R5NNhf+RU4fAsBUOhUV7Eh+6eIN9gBAIVxr+Nae3K/qHKohQosTvl9f2LOFzGY9CQhwfCHEWO1NWIbnCHfHyJth0aUkGN+G7c6MwfcNCJpQmHc+KoAbs1/Gxeay9FsWwaR2rAi4wPEjKKejUHtxdaCiatAqKBwF5iZ4m0LKbDkR686i015L03YfVQCQ2nmD3Ch+Ul45BSohXbMT/oDDONMcbd5w7vhOH19vyJcPhEt1t0DHXQAAkALDi0cvhU41UDw+JzvAwAO1XwOdt/qgDWW41jdFwa+BOYnVyA/7ms427gGAvViRfpJqASG041rEJytKqLbVQqgT7wbrbE43/wEPFIStGIrlqe/jiRjL5akvomjdYUDESqUdCMjeg8oAZamXUW66eu41PoAKJFRkrYfZp1zXO8TACxOvYHFGHTvVHSlweoxY37SjXFHsygoTCIzS7wZB041rIRbygx7jqIDW/J+POG1QzKjO5EZ/Z8TuiaNEFtNad9BYLMtARJPGXKuy1cIm0eDKI0HTl9++PNSXtBjvcoXVj6A8+AyAwDA+8Xc5RNxrPZv4WUF/esBh2py8ficbyPN1I1dRd/EqYYN8LEozE08hDTTYLp9cpQV26LGV0lxKNySiPdvfx1271IAWtzuqsbs+J8r2ZEK05WZdWC5t/zLqOz+Jjxy6M7Vg3jD/ildIjaQ1KhDIAgspuVDnO5o/3MdEEmkUgB9UOqGqj/OWSDhccsCtY94/yh1+KGkQdV3oHeqYcOAcPvxyLNxqn4lgD5fdF7sJXjlGJxtfAaHazaB3Z38nCAO1TwKu3c1/P5vr5yLW12fu6v3tnk0OFC5Ex/efhLV3ckjT1BQGD0zZ+dd0ZWKXvdahH5h6cTzyI99E4tTp340wcWWYlR2PwqfnAS10ABKbBCIB3H6MqzPOQSgTxxTovagyfrZANeJHw6T5vTAwWWaaT8qu4sCel06kWT4OGgG48CF5vnocuUhzXQFcxOrsCX/d/iwQgObZwkAAqP6Mrbk/woA4GORwiAJGqyfhtN3Fh2OGByv+/uBPp1W70a8fzsTO0fZ1f5OcfrCQwS9UhZ63AbE6ia+qmStJQGnGv8fvHLffTvrH0Kj9UWszZ64uP2ypvmotjwBiSVDRVuRa34DS9OuTtj6ClOamSPezbb8sGa1AGBQ1UwL4e50GnCj46sBHdEBkdYjyfQmsmKCQ/Y25u5DXc8F3OpcDZmr4PKmgkOPaO01bModPFRclXkGOtX3UdezHgBFmuk4StIGE4iarTE4Xv8VuKT5AFRotbtQa+nrGvNw0YsAXgwrlLUg6SDa7DvBER9kk49l4Vjdw/DK0SENlil63GvQ6/79Xf3lI1JLhLFuRKnvzj0vtz02INwAwBCDJtvDYPzohJQ4sLj0uN31FwOfB4klobwrCflxXxnVoazCtGfmiPecxPOo7+0E44Gi4kOCcXoUF7rQvCVIuAFAYpmo7fk
66npsuKr7BDsLfznwXFZMB7Jihs4kBIBTDcvQalsBQmRkRR/EotS+L4G+vpJ/gm7XA0DQF54O3a5tqOzaj/y4PtdMqBAlR/UiNeoNNNmeD7uf0xcey913v2j0egwTLt6VXSmotixDrL4a8xL34HTj/IBwRBcSDPvu2qGlV0oIG5NYAiQmDJueP1rONW+M8HlIxrmmjdOmBITCuJg54h2vdyAt6nU02z4FmSeAwAaz7mMsTZ16Fe0iMVxFQY4odLu241zTOSxNG92X0SfVW9Fo/QL8PuCr7Svhlv8NKzPO4VTDSnS7diDS54PDgLreBQPiHYlVmZ/g7Zu7ILHguG6t2Aqt2A2rZyMC3VcaoRLppjvv7H6sbg1abRvAoIFRfQNb8l7DJ9W70e54DBxGtNh9iFKfwqbcb+FC83ZI3IC0qNNYnBqeZDRRaFVNcEolQWNqoXFChBsAVEPkEIjUHXFc4b5j5og30OdO6HEfxY32BUg3VSAzJjwDb6pSkn4A75U/BHnISBI12hzzMNoypR1hdbhNaLJuBXAOXc5iDPXZILAjO+bysGvrVT4kG95Bk+2z/f50Dp14FWuy9kCv8uL98iz0eFaD8WhohErMTngpaAff49bhevsCpJuqkBUzfKLOyfrlqLb8H6C/Holbmoe95WY4fEsDut2rYPOuQXnnWTw46/Vh15soSjNfw8dVOXBJcwEIEEkj8mN/P2Hrr8g4ghbbQ/CywYghNa3EiowjE3YPhSnNzBJvAIjRuoIa204X6ixZYEEhehx9MdyDj43q0ZU8BQA5wsGizPpcJBpxqBRyN+J0HyIvduhdt59NeR+iuvsibneVQq/uwKqM4xBpn4tiZ9FL6HG/il53FDKi24OE+2DNFjRbPwWZJ6Kq24YY7SHsKHxxSD9xi20tEFJIyu4tAYc55EoCqycX96rQV6zOhcfnfAfnmhbBJcVgaeqJccf3B6IVJazM+Cdcan0SHjkJGqENC5NfHzKLVuG+Y+aJ93SlonsXOOICRkLVzAvGhRHXKe/MQEXXBoCwPv0PwCNn4tUr/w6j+hI0QnlA+QAGjXALC5L/G7MT6kZtc25sK3JjI8dtx2hdYQdrFpcezdYnIfM+fzFHFCzu7TjTeBkrM8oirsMR6TXLoOgNibbhiNKM3vaJgBJgWfrFu7Z+trkD2eaf3bX1FaY0iniPRI9bh7Kmzf01pj+ZtAbAEosb4QoNmm1Pwu49FmTjmcbFaOjdCpkbQYkDbt/sAFHzgICBgwLQgMMEHzPB4s5GnO51xNDT8MpxMOtuojTz2F1vBHG9feGAcA8ioss5B0Bk8Y7Tl8HRW4K+fpJ9GNVXoVM1o93xKPoKj0kwqM+gNFNxKSjcNyjiPRyXWopwvf2rkHjfwVvrrYewOOVHY9p9RoJx4FjdevS4Z0NFLViR/i5a7Mmo7NoKBg3idBexNjtYaDRiQ8RCWoFILBm3OooGDi0vthThdufXIsR7D6wKnXgaPpYEHwuOg+5xb8Cn5n5pwg7YRkOqqQrVFkdYSKdO1TrknA05n2B/ZRS6XKvBuQZ6VTk25f4CURoPbnacRn3PUkRra7Es/fyU60KkoDAOFPEejoruxweEG+gLzbvY8lcg+DGKxiHge8u/hB73g/BHXHxYuQYyM4AjFgBg9ayFoyIRDxYMds4pSfstjtelwiMXASAQSGfYLpWiF+mmQbvqejYNI9x9yDwKhIT/mpB5It4r///waPG/3uGrHDu55jZcbTvU/970uUN04mWUZn487Lwt+XsAhFcLnJ1Qh9kJdajvjceem1+HW8qCinYjI/o9rMg4fxdegYLCPUMR7+HwyeEpzT6WjbNNP0CD9VU8kLd3zGs2WuPCMj0llhFylQiLaz1k9uZAb8d0UzeemPMtnG+ZD8ZEFCdew4cVfwe3NK9/jhex+gNIjhos6xrZHxyMVqyHVtWONnsBQrNPbd41+OMNCcUJ79yzRs07C/8XZxouo9M1G3pVG0ozD4z
rEI5x4GT9X8EjzwEA+FgOKrpzEKf/NgrimifKbAWFe40i3sOhFlrCakwDfbHO7Y6H4PR9NGTLr6FodyQFpKMPjcyN8DEBAh0ULoFyLEvrC9NrspqRZDwAt+8CJGZAkvEyStKCQ/iSjadh9QzW8wD6Qv362nzJ0InXUZr5CuL0drxxrQQeuTjEChE27wMoa14EH/snzEuK3Fh4IqEEWJl5BsCZCVnvatus/l8rgzAei/KuzSiIi9xrU0FhGqCI93AUxr+Oq22p8IXtjPvSkdvsscgxt41pzeKE27jVUR/xSyEQrVgx5I7zw4on0enYAYZoUPQi3vBemHADwMqMs3D6fol2xyYwZoROVYGVGb9Eiy0VKsGLOYmVA37g7Jg/oLzrWwgu9doH4/Go7N6OeUk/HdNrnQoQcISF1SA8VkdBYZqhiPdwzEuqRK75a/iw4mtw+EqDnlPRRqSFZAUyDhyu3YJu50JQ4kNG9MEwUdWKEnLMv0Z19+ch8fAvBUCCTryKpan/G9Gm8s4MdDh2DRzqMUSjw7EbtzpORXRt9DXNDe4EkxIV3gh4RcZ5WL2voNPxAHwsHaEuFLeUElbHZDRca8tDRfd2SCwKUepybM57eyDe+14wN6kCNztvwC3NHxijpAtJhvN4++ZfwiOlQi10YE7imwONlRUUpgHCCy+8cG/utOd/ho+UmKqoBRk55jOo6cmGjyUBECCQDmTEvIpcc23QtR9VPokW23PwsWx45Rx0OkvglmqRbgpOasmIboZRcx71vYMHc36i1Ifw+JwfwDxEpbvzzRth9y4PGVVB5m3Ijx19berbnWmo7JqFOH3HQH2PvNhyFCd8iIquWWGp7TKPRVV3HnJjT41afMs703Gh5TtwS/PhY+lw+BaivseMooTIYX93A0KAWO0FdDpN4JChEauQa34FVZbPwelbBpknwitno9U+F9kxB6FRklwUphi7/vx2pGFl5z0adCoJjxV/Hxdb5qDXnYYFyScQpw8XV4trNQLFmCMKjb2bsTz9Qti1ueY2lDVdh1taGDDqRppp+AzAWF01Wu0Sgv92EmJ11aN6LT6ZYm/5V2HzLgegQ5WlAXmxLw3YKFCOJam/wLkmFdzyfAw6GAQ4fCtwtHZHf3THyJR3PhjUfR4AbJ7l6Hb96q6UYR2KVFMPdpsGXT7H6lYHtaYD+iKJzjRuxgN5790zuxQUxsHMasYwHigBlqRex8bc/RGFm/HIzYsZ14aN+VmR/h8wqE5CIK1QC7eRbvollo+Qkbck9QqM6pMA/LtfBqP6FJakDl9vxM+hmp2wedejL3kF8LEMVFs+C5kN+kPyYluwLP0/EclX7PAN76sPRI7w2hl0cPqGfk/uBTIL9+sDAOOqiOMKClMQZec9UVAC6FTlsHsDd5ocUZqha4X3lW39wZjvs6voxzjRcBZWdzZM2lqUZhyDzCjeq/gT2L2L0Ncg4SK2FfwyLMnGEakpgZyNJms8MgOKQCUaukFhC4sTp2T0VesS9Bdg9axHYPajVihHumlyC4ItSz+GZtujQYfGAmnHopTh48kVFKYQinhPJGuyfo6jtRQuaTYIvIjWnMEDeRPfhVygHGuzjiKwyNKH5c/A4n544LHFnY79lT7sKPxV0FwVDS86JdJOxOl7g8Z0Ki+AcP+vLGtGbee8pAtotl2FRyoAA6BX3cbC5F+Mev7dQq/yYW7iT1He9Wl4pVSohDbkmN9GoiG8LZyCwhRFEe+JJNFgw+Nz/hlOnwoiZfc0tdzmnRs2Zo8wVpL2Dg5WL4KX+RsNexCn2x9W8c4tqQBCwzwnMkyjsqfFFo1DNS/AF3Afk+bqlEmMmZ9cgfnJ/zDZZigo3CmKeN8Nxpq4MxFQEn5PgvCxJGMvts/6Nk43PgifbEJKVBmWRGgDp1f5oBbq4JbMIeO1o7LnfPPuAOEGAA06nVvg9O2dlPdHQeE+QxHv+4U43Uk02Yow6F+WEKs/GfHaaK0LW/NHducUJ/wG19q
M/bt0DwyqC9iYM7pmBl45NmxMYvHodpmgV02fJhgKClMURbzvFzbnvY9DNV50OlYAIIjTn8HG3H3jWnNeUiVmJ3wDV9sKYVRbURA3+mYPRnVNf1TLIGqhDinGoRo9KCgojAFFvO8nNuQcAHBgQtcUKcOilJtjnrc+513sLc/tTyjSQkUbkGf+7UChrfuE5pWzSipi0wsm2w6F+5d1QMQCeIp43w1cPhGHax+F05cJNe3GktQ/ItUUnpJ+P6MWZDxW/GOUd2bA4krEguTL0KmGzF5st+TpPzr711tlJqrWzH/p4/y0U3e8Q3d7jcIHp79VyrggPLjsn48ZdJZhsyabV84qAYDUU7fLQseGoyI2vcDiNKf+VPd5pcmDwl3j0BDjhPN7tBH6/KKd9+ZGU4A3r38LDt/KgccqWoUdhd+CSTP9O3vLjKCuNxlmbS/MOudELHnm5pPZBy/8xf/1SsZ0ABCpu2tRwds/277ih2OuuX258qHUj8r++htub3Q+AGhU1pr1C//7X1cU/7420vXvdnz7c7erStZzSaBpCdUXPpX/1//evjpn6VntvHXn5q4YMTLmxH+te3WsNt63MGD2Ebre0EMKfRp0la+R97qj4Jlss6Y7h15SKTvve8LVtjw4fEuCxnwsD6cbto06rXyqcrGlGLc6vwCvnAdKehCjPYKHZr083g41Z29+ZrdfuAFAYtq42z1rn124cs+YM4BPHH12l1+4AcDjM+Wca33iy5lfuPBm6LXn33ts9qUDjzzMfWoVAFRUpW96xfmf2tyHf99wLndF80jCnHuWLlheS75JJRLlMfCqaw/Iv/XqI0T4zBCW7hG+bOghWwkIBYDF75Jl5x+R/p/HiMlpHXifo4j3RGP1JCJSWVUvG7mG91RGZqRfuPv8u4zHotv1CI7V1WJdduRfdk6fClXdWUg3NQ+3S3f7jImhYz1yXPx/Lf2zMfuSBTktNVTxO30pqZHWEv57yQraL9x9ENS1zZtbldvbcKKlaNja5RlXSH7aDfp1gREzAGjcZP6i9xF35gn5X0Zra+FRujKmhawnnKicMfzSlS3yu9O1YEV8NUnU95K1fuEGALWHFBUeF7Zd2Sa/O5m23a8o4j0hC5XOAAAgAElEQVTRLEo5h7qexv6yqn5cyIo+NWk2TQTVlhR45byQUQqLqxiR3HJHatejsfdJSDwNl1o7Ea//CA8WvBF2XX1v/Gb2TVUN1uEaPgPe/5HkyW7bkDvf3W0lpSm3CpZeO516bm7wDnl5t5CuA1YHXu6xactPR1hrRYWYrwUC/06QOzXO0bhCkqroJr9w+9HYyWJTO4myJvIhMzWj2hGVf0Z4RGPDLI2bFBMQNQBonFi86H0h6uJO+Xcj3XsqYm4lGZQTQ+i4yo2RGmcr3CGKeE80epUP+bG/QJXlaXjlDKhoJxIMH2JeUuVkmzYuYnW9oOgFQ3DSjkjDharLaUBD77OQeTyAvmYO7Y7Hcbn1EhYkD5a3PFyzEQ29zy1iL0UvwktYgv/lr+I9IkbLbfmP/OHqooKOb0S0pQswuyzN6w4d/Bez3vLU0rWnv/HTuM8fwdtJZU2z2VvZF0mOKJE0AJBE3tqaz96KtIzDzC9qnFgcuFv0GPiN0bwdhCG8iBWHKPgijPcT1Y6o+fvFf1R5SVh9GQJC9T1YAWBainf9fHY1qYK2iDJJ8Y9xcKk3kV+ZTLvuZxTxvhssS7+IpWkX0eGIhllnHzFNvsetg8VlQmZ0+5QNpYvTOxCjPYpu9074q1GqaD0WpYQfplxpWzEg3INo0WhdOiDePpmi2bYrsPBVFo6TZ/QPnY7dJv5Q39groXFks9a9f/DVIw9tfMr/uHEer+rOkL6ef1rYAg5SVSIfcMQjYvnZqw/Iexd+IBgNFqwgHGqPgd+4slX+n5HvCnSn8ZO6Xr6WggxUSPTqccOSxoeMksk/IzwcSbj9ED608E91PEZ4O3PYb+Lr6NOij6TLlFscsfyTylVMafR
8l1DE+25BSV8q+nAwDnxw+3OwuNeD8WiohSoUxb+MRSmj2v3dcx4qfAlHa2vQ4y6GSK1YlLIXaSZL2HXx+jrU93oABBex0qkGOw91OKPhY2ENntNpmRfa4nE1RHDGwHVlm/zOiBdS4NIO+TUAr431HlUr2AWdjbxkascmKsPk0/Gq2yvYS8PNEXzD14Xx6PnY4+mnELfWshMqFzuTeovmdafyZlvS0O4jhfGjiPdkcrxuLbpcu+Bv4OCVC3Gr8wuYn/SNKbkDpwRYn3MQwMFhr5uXVImKrtOwedcNjGmFa1iVMTgv0dALFW2Gj+UGzdWJU6Nw1Si49oC8D8Cos1gdsfy6oYdvDXTTMHAvE9DtMfDrV7fKPzd2wJBzQdjg03BrRSk7Lqtw71rGTQA+HSSniffMOkU/RWXo7XH8ys217MhoDmJnH6Lro9voGsIhOmP4pctb5ben6wHuvUAR78mk2zUPoW3QvHIeqrrTMCt+FE6DKczDRf+Ko7XXYPPmQiu0oTTrvaCGyiJlyIh+E7U9fwLG4wEwaMUrWJMV0T99P3BzLTuq7yV5egvWU4YYSY2allns5epl7DIAFJykJckV9EuCTBIBILaJ7ri2Sf6+NZkP/wtuCpFUQVILTgnfFaU+37e+l2/Q2kjGxZ3yb4ebN/sQXZdQQ5/3u6HULixY9L5gGGneTEYR78kk0mEfhQ2x+umfjSlSNmJtlTVZx1EYfxmXW9fBoG7H8rSy4X5xBPq2C7obKwIzIqcFFDj/iPyysQtvRHWQuLZ83sjE/p01AxJq6BN+4Qb6Qu0KTtMnzu+SX0y7TrKj22hu3Xz5zFA+/KlAxjW6wy/cAEBARIMFa9VOvD5cDHx0G10TeH7Qf4C7DIAi3kOgiPdksjjlXRyuXRbQ0YUhWnsU8Xr7pNp1L0k02EbqG9m8clbJWe28dXACP43rS0X/ivZX65atDE5pny7Y4+Cwx/EgAVZ5IAo+JIVeq3KTlJI3ha/orKSUgmjjGkhnVwb/3Y2N8if3zuLRQ6Vwvz6VYdJZid6rH/oXBOHhWhRpTGEQ5c2ZTFJNPVib/Xe41PIwfLIJMbobWJettOIKwF/4KTTj8dzzKwqWnbs6maYBAPTd0BWeFHaJHsS5jai5sVH+8E781D4NJFmFVsETHIpJOBf1VrKRgBAAEGQSH9uIJzV2HLvXmYtx9SQusYrMaS7kV3pTecRfh65o3NTZ+Bq/vQAgaVDdO4Lrx2Hml9QuLAw8D3Ab+bWJs/7+QxHvySbd1I10068m24ypRmBhqAO5D2KkjMfJQO2AeuGH4t+rPaQQAAy9wJI9KDr7hPzjMS9GgY5s9lZyJf1zQSbxAODV8HJJhQ6NiwTVHxAlkpxSTgtql7Ch+6NOMIveFT5r7CbbBEZM8XW8x5rI37m8XQ47n7i2Sf5gybtCuq4XqyiD0adFRd18+cWR1r+yRd6z6ANBp+/BcsKhchv4tSvbRp43k1HEW2HK4Bdsf7U+f2GoEy1FFXg7aULdI4XH6MrYRrqNyjB5dbz65lr5ZXsCxuSuKjwuPOAXbj9aG1medp1kN83htWO1qaKUnW3L57eyL9INkoZby1exo3MOCTsM1uDrZMotnRm8bqzr3ykZl0lBVCd52O+TFhiJiW7Do/HV5FhnLm8PvJYL4Od2y/+jt+AVvZUYOjN4x6giRihw8Q7DNmcq90y8R1NiU2Fy8fuPQ/9Wd8OvHHqPCIJ9BP81fsGObiHRs07Sz6ncJIsJsHRks/ckDXclVdK/oJxEAYDKS/LmfoKY05+Wx9TTUuVBWLcgCqKN6iTpwNjFGwCsSdwaGKN+a638wZJ3SInGReYDAAf32uP4Pnsiv2fnIrGNdGHgYSIAUE6iUiross5cOeJ5hdMMp9PMJ6TqpEJk7pl4Vz62fNodLM0k8t86U+L3LweK6NJrp1OxYd66Ze6rRyZKxI88tPGpwHv4OfFf6/4FR8e+HmdA5htRhfEfCrNdUbyhfA07sr3
yF2nbb728cX/vK2t65KKBCI7UWzTfY+BX/MLtR+0k82IbSFx3Bh91i7audHbO2El2BgqbJPLW2sVs3O9TcjlJT7tJtxIGTWcGe1flwWGNkyRZUvnF2iXsniZxOWN4fUwblwnIQFgrB/f2JrIp58qiEqjBQgz2WG7nQmj37PsLxW2iAKDvyzX/rTMl/f7lgV3vCaxD6fNHnsI1rMOGeesAwKy3NBd0Nw77jxtJ6P1RI+dyVzRP1M4aAK7+6LmSnPOmHAJCozo5z2xq2r7F+3FaPT5ltCI4G11gJFb08IyIC5Gx/bPXLeY3Y1r5G6YObKMyEmQV6juy2WvjrWGdeZkUZF0SvuX3fRt6+IbuNP6b0RSt0ndDV3RMeFLtJmmSmndULWVvWjKGTtkfDRUr2ZnYJnJaa8cqAkI4OHOa+LH6Rbx8POv60fVCV3BS2EJlaBvnsEOdOcGumNFSfFDYZG4iDwsSEmUVmttz2O8rSsf/RTpVUcR7hmN0OLeqmLwWHJr2rXNvCF2yN9S/fOK/1r16Av3JkrvbSnasOAWChiH/KfLfOjOsi2zptdOpJ+LGdwC59pevZie+/l6p1MO+vFn+f3H78S+kB/kgIMTjTS/8I95A38c7XI8lDRpEH08M3H179fxKd/rYRe7ydvlNrRXvRbeSuM4s3iprMHwdm1GQXEEf8gs3ABAQjakDm8Gwd1j/MQMW7BO/rXGRBQAAB0HxEVJUtlv6G69hHJEpFCh7VP5R4XG6RmclWY4YXlm+mkVubj1GEitJSsEp4dsqH8kCgKhOsr29gf1P+dqxrZ9QRZLj6slzAiMmAKBeFCRV0S82zmXXXNFwTYStUw1FvGcwRqdrvVqW/4z01x+njM1aZSivPIqNwyZGWOrMBWVZQEmIgJcho8RSZy7o1jpT/bv0u8HnL3xv9vpbry4R3R4VAMzGO9DBgl/jMACCvrpZfpULbhQhU97TOJe9aewkJ2Ob6DYq9dUlubFO/tWd2uM2we028YHmzLln6YKEWrKFyETvNvFbV7fIfxhL+CCVSMRYacpAGR16nfwzdLHahXmBYyovyS88IWy+ukX+YLT3jwQTwW6uZxPe7i3rMn3ML9wAIDBijmukDwNjE+/UW3SNX7j9iBJJyi0T1lzfLO+fKHunEop4z2BEWV5JQhpHxAn2ZEGQiCyLw7oQLHXmAmRF3n0P1z5sIiJHSuv3rlb3C7efdJxCAq6hI1i7BtCTFke3Lrm8K5O91zyb1wO8HmCnx2NHJLLP0+K0G/QbAiPRAKB1kSWL9iL23KPyf492DY+RV2qdZGngmE/Dq5kIFtVGomKbSEZLIasM3U1rnCQ2ME7aD/XBeKev524jeklo9UkIPiSCAWOpayJpYA0d4+Cyx8A7I11/P6CIt0IopOgTcQMDlW+vZMcjuQHMWZaK0F03ACxjdZd4Zv35MpK5JPS5AVrGb6BMhLCqgwwq+BDWCwAAoEeLt3F9zVeu5MSP2Zeae4YuiK8nmwiD2h6LS9c3yR8NJyoJtWSzX7j96GxkqdoB9WhdF1c3y28s3otUrY0sIoDOq8XN6iXspUV7haeNXWSLwIg54xpt7spgv7+5gR32z6taJh8zN5HHRYmk+sdkynubZg9eM9Xw6niTxkUWB45JajSOtSDV7VL5k+g28oDaQ2YNro0rlcvZhYmxdOqhiPcMRhKEU1SSlgTuvls/LBQTqlVfBYDYRrqrYrn8o/YCPmylP5XXl2DweL5EOS9wymoxX+qs+2nVjrd5gMuiNOWWvw3ZuCMU3p79/L6vnvrLNVrZPbBrq8Zm9GCwQCEHBwEBiORsTEn83Z0Id95pujjtJv0rv29c6+DLF34gxFzaIf9+qDlDNGnQiD4Io1HujMskP/2G8DnRg2ymgsVu5u9e2i6/nnuezjd1kN3+zjuiRFLj6+ln9T3sjDOmz6frjoKnpZD9PKmSflrwIY2JaO9OY++GxmJPJcpL5dfmHyDZKjfmEBAqqXh9yyw
25lhvnw7StU3yP+SfoY+qPEj06dB0Y5385v1clVAR7xmMXa87bHQ4Nf0HltrG94sNF76xa2DXpvKSvOzL9NPtBfJPAueF+rwNHs/zAudLAMAgeJFL24uew8moX7796EAJ2BNIKsPutpKJSLbZN+tzzcn22h8+UPXaTq3PEV3HNug/8r6YH3iNLKDFlsD2tczCqfZ8Nqr9vuiGWHhc2KRxIqk3kV+JryebAg81CYhgtGA1gCHF2xbPz+tsfBUBGRBxn47f8AvscFAJNPOq8LzKS/peC0O0qR2PZl0iV6JbyAK/cPsRZJKYeYmW3FrPBgIsq5azS1Ul7JLGCbVHD+9UFy97AuwnPy39bf4ZuljlRlTVcnbyTps4W5N574VH5Jcn2sapiiLeMxy7Qb8PEts3+09/sxn7bvxJlHQOp/FVONBXGI5qWAF2twVFj5xoKaooxa0CZDWUCbJsoJwHZRlSAtGc3bUZu9vCqyaGrBVEv7CLbohZl+lcRwzvaC0cPAgM5NeLv3vr14u/ewsAjB0wzt8vfkftwWygPwMxm/3u1jo26qhxlQvi0j3id/2RGlGd/BFZQJi/lDBoh/PH3tzADqtdJM7YhbWEweDT89vlpWxU3XkyL9PZohdBfUIpiC6+nq70GBD264eBu3qTIiQD0b7ONqO555SAApUr71/3xt1CEW8FLN70ky8ZbrQ8mICrFAAK8AF+g0/gRCLUs3qkHSsi9k6uAABOiI8DLoLgQzFTlMM7xLyIWOrMBSd2A7nfSnGnlNMvqnwki4G7M6/wszfXyj8vPC48q3aTXCagtyuTfVixip3zz7UnwH7mcenbs07StSo3iWmYx45YRhv21x+7UXRM2DoQYgeAgKiozM0cnAcWWfLoePlIu9n+mh9vhYq82gH1rFN0A5GJmkrQaJ3IkQXYGuewva2FvMlt5D0AvAjpQMREuMtXyx+b3iZr1W4yFwA4OHeZ+ImW2bx+KDvi6klc6k260prIKusW8Vujej8Upg2KeM9wTGdrTLqK9tUkQGaScA3L8e/YF/29TvpU3U1Lnbng+XO/qADCM2UZpV5G6WnC2A7SH5fHgS6tKP22BA1hTZf94YQAQotNVeCtpLLkCvojf+gYBdHqbWTtgv3IFn3UXzYXybfpLLeRf69hPh9YX9ZAvrmBhXexD4EQhnkFFQmW6gQp5fcJn1c7SDEofJzwMLcGBdG6DOy0yo0swqHx6vnNm+vln490j4AFBki6TVLzzwp/o/KS7NDL8s+QRZzK3828Qj8NBPvMJZE3Vy9h7/t0kM49Ir1QdFTYrnYj0RnNKyqWs+MJ1STJksq7JC2CDnHnfyQ8Et1KnhAYMcU2EW9SFT9Rtlv+t/s963AmoYj3NIUyplFJUrxXVLVxSu6456PxUmMs9clhccVxmgs3Lm5m33Odn+9CZlsJlv5Z34FjHQpCo016DfoXo5yudoGxYg44PGrVPrdGM6RwDxUuGNNKzKIHWaHjgq+vE/zAY0aiE6voxob5ctg9hmPDsjO5W1ad/KJB5yo487VHWZM1qW+HK/el8nBwFhhqJwu8/fKD8k98OnipBOrT4Y7f56zL9MlIwg0AokRSZp2g/yQwGhf6nCTylt6UvnKqXgO8Vx6U9wBA8SfC5uV/FP9N8CFVFtFiSeV7/PHMxnZijG4lu/xxzwRErbNiXdExev7m+tG7khSmNop4T0OiHM7doixvp0ASg6fBJ4qv2/W6O/qnbH1meX3WD/fVqHpdA6EajBBe+3xWz+Knj+yONCcsTJAQbjPo3wEwbNPfEjSU7Ye5oDTlVsGJ3UCogDvM3MZEdFEJaSFTw3aLJDT7ZgQIYVi9tuxrURpXFgD0XAm+BQGBJPAuyriBcqKXBd7Znc5fd5vgBoDx9pIUvSSs0UIgkYQbAFQeMsvYSYz2+MFCVOYmYo5rIM8KjMQAgCghPbYRz8TVk/Ndmbwr9TaZKzAStB4BobpekgfcSfUYhamIIt7TDK3bk6+
S5U8TQAcAFMhQSdKzgiyfkwVhzFXcmE7NWp5d+VLKK6c+L/a4cplG7HHMTjkU9X/zfrNFvDLh9m/JuvJqGTJKSnGrAM8PhA8OQFDQwd9MTyI+QQQAn5rXyBQ2rRvzB2wm3N6RNbZsv9z0xuhYlX2gW72gDQ9o8Gl5Re0i9pvYZpJft4Cdc5oxpvczvpokxjaT3IZ57HJoSrZPw1vVHlI8lvUAgHAQwoO/qDKu0rV+4fYjMBKddp2u6cqU97Tn8vKkSt4bGm/uMQ7tH1eYfijiPc1Qy9Iyv3D7oUCizuNZZdfr76gLT+3fPXS17m+2fiN+75Vkx7y0HuespLtaC6IEDWVDZWfi36/gxNK1jzW/NmeW0KCprFghvy9KEAuPCc+pXSSXibBa0thHdYvHdgDX2RPj8jLRpaJeDQCkbb0Ja2UCwPq8JIxwpyWNH26bxZvbZg0f1x4GA5a+I3xR30PWU06MSVW0vSuDv3pjozwQKlm3kL1ecJpkqbwkr28K9wCQAQgURDPU0h4Dv2pL4EFRO24jbw118XBw5jH0xXP3pHKLPZ5/FNWORyiIloNzDnjMDfSp5a+T5U3F7PeN83j1mF6jwpRDeOGFF+7JjeqcpwpHvkphJFSSlCIyvixwjAM+r6j6oySKoy5nGoZA4SxOsfvijHfs150o2hYw4/H8uOo2WX/Ap4fPY4SnuZifaVjAPmqcxw51ZvGGsa7p8Wrk7MU1BUma3nQCkITldWCUe7uvp/R4IbR05LBf3l4TnC4veCBkX6RzVW4QZ+zQTX9nHacrY5vo5/2lYSknBl0v5tlj2DGXuW+eIxb21gL2idaOVlcULl7dKv+4LZ+/H9VJRI0reEfOwD2ygHaXiZ+8tkn+71Bfe3cab065TWcHNvr16nD58oPyK/5D0tZZ/CoT+EVTG5kncGIiICLlxKDykfToVrJMVvHLtkRM/0bXM4DPPyLcjjSu7LynGQ6t7qDK4dhAOZ/jH2OEnHVpNRH/wAqDvNS46dBXF77TYfY5c9rfLcpqfHOhmdu0cWogLqaVrAPDKb/45Z6lC1LK6Z+qfCSLEe7MuchPndsl/8dAt/cAorrI7MBa10CfgM8+Kv7jhRjpq373i08H6cZGNtg4OIp7rm6Rf7fofcRp7GQxYVB5dbhRvlr+j+HKuHIB/PzD0veLDwuPqFxI9erQcmudvCc0koRRyIKMhND5AiOxKeV0W9Oc0ddbUZh6KOI9zeCUSL0G/feMLvcjhLEERmmdXa8bV8W4mQIHQYU26ZrnZ7Nl7z/PWUACXMlaO1mWe47Or17GrhAZJOU2fWYgZJETvd5KNs35BNVXt8p7Q9f16hAx/VyUSXLBSeGhyw/JfxjKJp8O0tnH5Z8YO4lR9ELsGaKxb9g9DfBeGmZdABB9RANAiPQclYNdbwrTjymePKsQCUap22rQv94bZfyZzaDfywkZdw3pmQSviooNTLwBAAIiGix9Yh3bROJEb0gXBwBaW3D2o5/yNfI+nyqyD1n0ISwMMxL2eG4frXCPlvr5rNynQdgvMg7OHLF84DRab4E+/SrNU7mUzdx0QvljKUxJom6qo5NP0GUN89gVf7jeRCE+XlfueSejVPDRgYPC/up7JwGA+iBwgIfGIsqqyD5irx6+Czulv1m6R/y5wMhAX0sO7u1J5sOmfZsbSGzuebpb9MHsNqDm2iZ5zx03dGBA7jk6X2clyTVL5WPOGLiqlsn/llNGv6h2kzkEEBmFzR7L91/fwD4GgIXvCU9GdZJtAiNx2RdpU1cmey0wFlzfDd3sI8Jn1W6SKat4d1MRe6dp7tCHncYOGHIuCBt8Wt5bsYqdGG+IpcLQKOKtMKXQuLwFxq8VfXnZRyaz7BCE5Cra2pnBXg0sfTpexE1tne1rXMcSj+gXCjKJlwXe3pPM3+7O7OtfmX2RPklDikAxwh3VJWzPUGu6YuBuLmL/llxJnxW9yGYCum1x/EDVCnZxqDmGThiKDwvf87tndDa
sXfIuss8+If94rK9J7YB68V7hbzROspCAiLHN5Im2HPbS7TXsdFuB/AIA6Hug82rh82djZp8ns6PbyeOkP9pFlEhafB19Vt/DyvyFtBbsF7+pcfaXbHUR5Fwgsx2x8jd7Urkl1IaCE7QkuZJ+SZBJIgfnsY10x7XN0vetSeG1thXGjyLeCjC4XCtVkrwKALyieMyp056dDDs0Hm921x/zv9u6p9gEuc9VK0gkOa6BPqW1sVPj7Q0ZSPnXLKcaotUvJtTQ7LZ8Vh24u1d5SHro9UxAd0S3BgPmHRB2GCxkCQDek8zfb89j12zxvGcke/PPCA8FdpEBAK2NLEuqIKltI5ThDaX4sLBb66QDDRwEiSQl1NEnK1eys/5D1tDKhuZmuoSEhCkKMknMukSX3VzPjqRfIXlqZ3B3C1EiydkX6LZLqXJw2VYGJNTSxwWZJAIAASFqD2YXnBKeOL9Lfmksr0VhdCjiPcMxOp2b1ZL8RQJoAUDr8y2jnP3crtePWCdkotH6fFstl9MGhNuPKJHkuGj7rqZHHME/18dZXtZphrPOHN6JXVbxbniCnSZUQvyivcLTV7bJrwW6AhZ8JDwW00o+64820dRjPkD/tyNn5NZbYoQONxREZ7CQhLg6eKkMoSOXt43mtaidCPvCEbxIN7WT6Ei7ZADwaREWWsrA3dYEXgcAKg/RI4JGUBbcfQkA1G6oBB9SQ8dFD0kOHVOYGBTxnoboXe5Foiwv4IRYHDrth4zSOy7/qZLlDX7hBgAC6ESZbQBwz8UbnGv1qb3h4zqfa/5nyjoWzQ9+zpJifupEU1FFyg+S25gIaSy71aFS9AGgtYDvybrEF1FOBiIyKIguuoM8ufhdRDtNqNLbkOvVoMtgIcsDwwQJiNrUgdUARhTv7jReZujm2/zx4QAgqXh9YjXZmX6NLiAA8Z7B9Zvr5J/0pPIIb8wgkgYdCCnAy1Ros8XxIV0W5avlA1Fvk7UaNxkIO3Wb+KmmOX1lZusWseupt2mFyoOB7jSMcFtrfrgLy6uFT1ahVfAgKKtT0kzdRhDTHUW8pxkmu+OzImO7/N1vRIejtNdg+DtG6ZgO9QjngsHlXg8evmMjnOsnyt6xIAnChdynzq9t3l+k6izL7hsUZG7M795fMv/awcBry5BRgkqDftnXk57RNQvpAOSc8/zKlS3yvzhjIzc+UBGJEs75UtJ4tiwLQ7Zkay5ilZmXqBMID6fT9ZL1hl6yDQAM6Nuphi3AR/d/VVPCrkZ1kVdN7dhKZcRKatTJAu/ROekK/zUaFxbNOkm/cPbx4IYYodxeKf9x/n4yV+0hRUCfyPYk873DHX76dJAu7pReKDwm7FC5kegy8aqb69lAli4TwWoWyT/LvCI8I3qQyUR0d6ex95uL+3bmQVCgM4u9lVRFvyjIfX0pvRpeXrWMvTGa90Jh7CjiPYEIsmxUSXK8W62qByGjOmWnjGkNLveDhPNonyicdWm1YT/jA9Y3iIw9ENi2jHIUGVzunTaDftiY30AI50K03fEdgfPFkZ5nlE5K7WeHXnfMxJ1Jpb/63frq3y6Ns1YlWEm854/Z3zm9L9L1wo8KF6iaVX6fsah1kpLiw/jcuUeDy7ZmJDcbv/Donuf1Jsd80cZ8hPI5xMCvAYDGDnVuGV3pMPPW+oW8HOirJwIKHkn2SEit7cBdsx+XiY+6KMyVbfIe0Y33DBZi7E3iPateE/859Bq1i4SFLYZiT4C97FHp27NOCJtUbsS0FrATrYW8caR57ih4+uuPR6S5mNc2F0v/MPIrAW6vZqdbC/iN7It0o6ThveWr2NE7jpxRGBFFvCcCzmFyOJ8TGVtPALPO46nxqFS/duq0w4aJCZJkMjldL1AgHwBEL9suyux1m0Ef8Z9JJUkpBDCHjlPOIlakGwqDy7UpVLh5X+U+i0zIRate9+uxrDeRWA36N6HnbyV89ZoQP0yp2xI0lB2q0305dFztItmhY8/t3vPn8eae1f7HnGH
XHFdz1MW3jJ3ZfxS/IEoklddwb2o5v3D+YfmHXj18HgO/preS9aFrhcaHA+9uWIYAACAASURBVIBPzaoIJxoAstPEz13ZJo9ptylpIfWm9B2GMhHW0GNOWRxdtIZPB+n6ZjniF929wprErVe2yUNG5ShMHIp4TwAGl7tUZOxh0p/NRoEcjc/3nEuruTxcAo3B7dnlF24AIIBWlOWtlLG9kfzYHpWqVufxNlIMujo4wGUqjKmpL2U8JXSMAMQjii/b9boxVeu7KxDCORm5djYxSQ7eifjAMbGoJ6r0+fNPDY5wRJnsQd3sCUCiZXdW+nvGOf76IARErXWQFZut1c+lPn2u5hPDmpfjX0lx6mxkGZFh4BRut4Ff1jrJbEEeLO/Kwb3NhfylmhL52kj2Fh8UNkW3kk1UhtGr45XXN8gvOuOCKxd2ZrIPU8ppob9qICPcbkllQYKcepNkppTTzQBIey47EtiUQmHmMPXFm3MAoKN1Q0wGImNzSUgaMgWyNB5vrlurGVJYKedhu2gCxAuyHM0o7Qh9jlMqeUXxd2pJepYCKRxwyJSesOu0Qf5gypha5/Es9Alig1etCvPsyoJwU2RMDrSZA50utepc6LVTmcQl9a+31Rd+GZLY1yTY6LXpPlt1WJVlqQX6qxdyDo1NWoU+F/UAklvkaoswKPyEY+kP3kHaQ9e3q3SSMCe/qvPGttxXX3zr8SAXTNERWhpfT58SfSRDprzLYeKnzM1kaew7QklrATvYFMkfDCD/FF0SX0u+SNF3CKrykex5H0N/5kn5B4HXVa5kF9xR/IXkCroJAO3IYkcCKyjmnaELU2/SrwmMmAFA30M26HvZf5avYaPuOaezQFd0XNgt+rusb5D33GnTX4XJY0qLt8nh+JQgs7UEMDBCbju02v/1qcZROe8uwUl45h0HrJIgDHvSLlNaK8jBG3MO1PlEMazxrR+H/v9v773D5KzOu//vfc7TZ2a7UEMSAiSa6AgQmF5jwLZcsMFO/CZ2XFJ+ceK8SdzivH4dpzjJm9hJbIdgGxccU0wzHUyVAQnTi0SRQEioIG2bmaefc//+mJ3V7Mxs1a60Kz2f69J1ac885cyW75znPvf9vd1VgdZr3Cg6PpHGxnpxzgXB6VZSEXc7Scsqjlf15bz/ANGgaVHZsVcbSt0lK2GeHAM7YymvV4YxrHPeHocZ+SA8Ryp1DIj80DJvjyxrSDbJkf9596O5w7q3vvnrQz+8rdimd1jmj7e+fOwmPD5n+elz11Y63NNba5QUvyalD6y2emOguNUpPJXkdYfdVxHwhe97Fove/yxIQPatm4XSmx1dS0958/LO1t5Hdva1DQYy1p6lVxmhfmL2a7TA66UD5rwmPlutqvR66FyvR3/71dN1Q558+2Y6vSrcVayAjq5vtAAAm5bx+k3LVNMqxllviEuqwg0AUlNLxybxbmBs4i0jyOPvNP7aqmaY9AMn3IojHr9cfS0zy5hZTFvxzvvBuYbSH6GBOUrmFbkwNHrN/Jg2T/YkZdu+rcUPThWoeF8woJUQD6WmMWJ6V9Fzb2stlQ+VzCcT4GpgU2SaP6kV2mZoIeKy6z5RP05aG1aSflQAA6EA5KTWF+SD8OWS5+5ysyNCXz73PSuObzVTtTS0zCeHE26htWMnycLINDeON6Nld2gp+x8ztP4AARLM8MJoOYD/E1nWkG7yi/5kzWvb/2TrPS88vgK4afYmrNy2/PS5a4e0auv3vGsLftArtT4WQBwbxgNvuh1yx4qgbe7duTMEU67tqC1gTVj9Zyvx9n2HQQUW8ot3zDlpefdld+fbbqi9Z+og3byMN5z6c3llbTm81NTatVFc0ky8QU3LxDULHrWnpEghjrlLXu720zIjwsH1r8sUbc3Oa8Zhq8SZVk1qIADYPh13yGpx/EjVoBnTj2kr3oZSx1Pd/ATz4VKpnJJyj6wQSWszH4QXEnNXKuWzvus80+w4ZRh+v+d+ORdF7yHmNiXk2pL
rjJ4nTaT7Cvl/tqN4gaH1HN+2nmEhJvz4aifJQbXxcKAS35VaLQVwf/3xsWVtia3hEuaAQtm/1FDqvQKY7UTx9lTKW4o5r8FVb7IhrU2p9Vm1YR0BzHHi5NLIsr434sk3zV6zaiVwOgZW3nhrDYhQzHl3AKh1X1z+6h/2rgm6nTtnbRCnhXle8No1p5z21m27CgpLG7qgtrWdKT7Iv2hmBSvSJpvHTcYAYMdCfsh9gVcIpkJ1LM7xM+URfMKrHP9L+YlCt7hsuNdjhzeMdo0qVkBd9WMEMtx+mgsgE+8ZxLQVbyaE9Z0LGYiYaI/E5oTWdmvZ/z+C+UgAMLR+j5Ukr0SWeW3gOM/XH68Mo9xvGD9rvNLoRLb1VgSMu8FAPYlhbOEo7iago3Zckxg2DDMcVpzMNpW6goACUOnWYyr1EStOVseWOaaqv4kiNTvV+9ZCzA1jTRkQ8EsXDY0krMGC5fX9Nzct4w2blqkNr29eYV780vbDUfe9E748sGOT6tpxUGOxSeLwm1ZES+vG3mg2pQ3L9fNmiG93bBYXCoV85PErz1+ofjTaWyEFcvvpxPpxBmsAKnbx4qun6h+Odp0qW5bqR1u20/tqP0SU5B1vHK/2/kZ1xriYtuIdmea9MopPrk2N00I8sTvVhOMh5wfvrQo3ABBgSuAoN06+aqbqtv58bkrS6ZwoWiK1nh3Y9urxvlclZTkV4j5D65UEmACggbVl1xn3atlOkhX1AkpAwU6S02LLvGm81/PC8DgrSS8h5lYtxIaSY1+jDMMHs2gp+x+XWh8PAEqIp/tz3jWa8LrkXb4alawaMWwO/EhUu9YDwOBqvI7+cj55/YXCrwrAB2vHlYF3+mc1t2pd9y51zVG/ollWgKMAUOzg5ZGEtNKpZ2i3ntEY6GHZ8HcaOfzMGyfoq7cePr6uQtuW8JauN/jH7VvwHpliTmpi086F+obSrCZPABrofIu6ghb2x9vPM2PqmbbiHdr2K8T4BztJLgKQU0K8VPTcPZY/amh9VrNxAixD6wuMJPllapqTtnlKWhutZf8vBPOJBJhWkm6OTPOq0XLFgUrRTT4ILhSaF2hBb4XS/Iap1NGaqLvsOndN5ANPCdrECvUZKUoJGrXwA6h8CNlxcjEBnibaYmh9PqESmxVaH14Iwlm9hfzXWsr+x0ytB7vUC60PaiuWDiGgc8CWtZpXHaeCJvx0smrL4a8CjavxWtadzL844Wn/XLHJ6wAq1ZPFLr4nzqHp969/NvofuyL963kv0SKhIDcdxesne9NPG9BRjl/0+iuGTwDAYFghHX3oE/Jvujby9S9cqO4azzVfvEDdZQa4r3U7tffM453NbFvnv0gHLXpWftoMsVQLFINWXvWb96j/ru/Wk7H3mLbiDQCBY78UOPaEVlu7BTMIw5voE9Bip+mSyRTvfBCulMyDZdECmG8nyZW+Yz894gYmM1pL5b8cPFcDivQTvfnc3zapJxkRmaYtXhSflkq5wXecJ620/BvJu/plaqKnfMcZ1QzKiaJDvCj+IgGdACC4rv15ZexoK07mSK2PqT9fAg1jBNh2qi4MgIaQVVNumr3ml1ixvGduezXne9Rc+NKnth2hcq/db121qG3r63Nypci69Y0T64yrNHDYKrEi101LozxvWnumfqBpufgk8txF6j+PvgeJU6QThUYngUCAKRRmdW7Cpw57WPSvO7PiRT5WEhfpjkXckI5aZeGz8verG5tSoyPXg0uPul9ufuFClXVtmiZMa/GeCqRSeSNV7ZFlbh4pd5yBMmGoyU7NaztC0xybiIx1Xlovqh8jYIGhVGtqGMN2WPHC6ETBvLx2TDCf5IXRSb7rPAlmKvjBe6TWRzBQjizzrtBuzD0fcBf8KAGdplKJE8e9DGxSwBomvKOF3Fxy3TtH+kAwk2SWG8fnSaVPqgr3wPtohgBYAOPKL86NfkgNN81eswo1xlMrty2vP+Sysx846qSjXrpMe3pBCDNa397xi+B
b70QD2SsNC4flN8k/8vrofAIJ7AQKN9Jpqz+o/rbZhuZkERYQrfmA+vdT/0d+2fHFkGpaAhmt22kFgHGJ90i0bqVWMxraNYhA5PXyERi66ZuxF9l/xJsZrWX/k1LrMwG0eFG0PrTMHzTbfAQRtBBPCq3fM3g6KiLEQE8s5Y2TnROtiXbKuqwxBnYoKYvDnAIAkFovqC8QIkBKrRcAeLKtVP6iYD6lKqAyio9n0Ncj2xrMI3aD8BgrVZ8kwBs43yRgFir/kJK4fbQsEy8IljtJ+kfNyvcH3kttCASa6KXYst62k/RxU6nD6jOLmp2vhBi1inE8HOq93X7OktV/YJlqUBDn+T3n3Vs4oqlAzX+BFrt9dAaBBoMjtk8nHfaoeFdt95kpg6lpP0pgcj84ggIHmlAWjCEGZUaM+ZN5n4zdY78R73wQnie1vrRaqEHAoU6c/G5g259vFpboy3nfb/H9ktR6GYNiRfQiCyqGlvV4ahiT3hnEd+xfFPzg6Gq5PAN+KuWdo/WnjEzzcVOpD9VuLjJQjEzj8ZzvnykqOeSDENDlJMlFkW19B6hsJDpJ8r+rwt0MofVy0vpqFmLYknU7Ve8bTrgH5rRBExWJuU0TbSg7zg+E1o4StAEK15vA+6iJix9X/vVoIR4teu7YN0oH8r1XbTn81Vrb15RJ/F35rI/vZO/Uyw98Lm8ZasjTlWA+pisprgbQkKHTtlUsrS+0IRDZJdojolbq5DX2JpxQ66+iwUn3gTypmSJxDrE2uAcxDek8L1Nqt0uwonzzPYCJ4hRhH3m/vNIOaLEyuHfrUn3bxmN5XJYP+yP7jXhLpY6oCncVAg62kmRuXFe5V3mRdH8u9z97an6pYfT15XNfyAXhRcTcEpvGY6Ftvz7aebFlbk2S5OcD+dizNPBOKuWtsWVt8cLo09QkakHMgwJkJekFzdLyhhxfWYlLxgh+I8wN5lgMxAACTfRq2ba/l9SkGObL/oWmUh8SwGwGwlpP8YFzwUBPKsWtJde9ZaQPjpE4fe7aJavPFeZRn58zn5+YteyVc0/v3PQHwUJtAanRNKAjJbjpCnfzkXp1xyb6aNV3pDJPTvrmjN1FcHd44Xx1x3F3yHxhBy4hDU9L9O9YqK957VQ96qb2eEltbLZiDEmDJI2820+5KM+TKt7H3SE/75SrNriERc/QkX5BfWnHWBpRaMD2YUUe4v2tQnS/EW8m6kdjMVuvknLEKsg9iRYiKua8W5u9JrR2vDA8G6Cw7DqP1K7IiznvVqnUvVaSHhybxnolZQBUMmPqr8MAUilri40aVtzVENHgvIhe1kKM2NKLiTaBeUgnFSXogaLnXV1fmSnTtMWq5JB3DsyzwVZ1IESVFD3vxvFuvAIYjHfLBOLk643/a4d0NLbZaH9xLsTLJfz6u9vxMC3GWViPlhobv77U27H27YUtABqEo3ce9/TO5evbtuD9UlOnElwst/O9G07SL45/ghNAAM9cqq4D0NS1MDok6Cyd3beSbd0hi/LN1hu7bhS+nNCHnt+GtW6Rz6oNEaUOXuud07wrz0SZ/SrNs8p0XO2YVHTAwhfERTsOHjkP/ogH5Dntm+kymWJWamHTtkP1tetP1pO6FzWd2W/E23fsWwt+sFwAi4BK2lsqxEN7qlpzd3DD6Egnjv9YoBJzNIuly0qO8/eJZQ5mCygpg0DKISKihFgrtF5WO8bA+lojKyXES1Kpete9qkVspIlWF11n5KpGAKFl/tSN4i4BHMyA1kQvlR3nJ81K6t0oPrV2Q3M4CDigrVT+WtF1/l4ZxoTyjJc+Ks6yQxrSh3HuQx4OeMzBm6d14iqcjPekL/cfmPb3lQLvjZvvP+eHz647YtgsjOcvUrflt9MD89fRMTsW8bqdA02L9wbzXqKD5r8sLpUpWspdesMDf7LzVJ3nxQCg29W7un9328Fd/zHv7yZy7RfOU3eedIs80O3Du4RGIbHx6saj1X9P9urWDOFUaxJqoSat1mq
Z8wrN73qTPiE1tQCADNE6b5347KZl+k/2F5Ot/Ua8U8PoLXrul70wuozArUrIF0quMyOqyuwkubwq3AAggCVeHH2kzzK/PdJ5xZz305ay3yK0PoEAqYFXFNHLXhgu9x2nUjbuuTe2lv0OofWpBHRUwywEEANGbBgPKMMYcdMUAELb3hBZ1ue9MDyZiULftp+uXTE7UXyIlSRnAkhSIV5lQNeHseohAJL5uHwYXdmXN/57tDk0w/aHxm0BQCYChfUmtp8W4gl9UPL0a4df03bjrHvHes3SAVxadwBPWnbHRJj1Os0++En5JSOt2NM6ZXHqiV/rwpp/3PW5o1vUieWT+xfnVreMuXx+EAE8uVJdlevGtV4f5d9ZxNumIiyx6Shev/B5rLNCDBbEaeLyjkX60ZHOm/uKOLMq3FWMhA48ZLU47eWz9Yz4u95d9hvxBipx5f688ZO9PQ8wQ2qdU0IEY7G6JW703xbMIzZ2taN4sZMk5wNIQ9P4ltS6y1L6Y5L5FE5SbaXlZ/pz3je0EHFfPve9Qtl/21Lq94fcFzAMrQ4F8JsxvS0iVXbdhiqYvB+cZaXppwmVhruG1sV64eZKxoRulnVCzA2t2sbKtkP0Yy3baaVgGkwzDNtVvOni4kYRwZRvOXeNR7inCwtfEJdUhbvKvPtd2DsFos6BXykBS3WmswCMX7wHKHegXO7gqXs6FcDrJ6lvLX5a/K4R0mJtoKd3Lt/15vE8YjcnZTRWhDI4jXLonrK5TjP2K/GeDnhheJwdJ1cScCADOxMpby/lvBEr5JhoK+oEXIOG3cxxw/BoN07+vJr9IbU+m4Gg6nlCgJDMJ+SD8H39Oe86AIhM40lTqSupJpeaAUA3eo6PFzNN310V7oH7N/qWAEIDPc3CKUyNXc7HypbD+a2uN/na1m24TKaYrUxs2n5u+anW7dZz2A7gpllrAOCwh8W7Wt6h47VEsOlIfee2pbx5lEuPStqZuOUV/SdYG5z17ou5YQ3AJgKpxn0CoyRh9u8SbwppY/7B1knfzJxsti3lt7ctVX87nnNeOV3dfeItdJ4Z72oRF3t4fv2JWcw7YwogrU07Tj5Vdf4jIG8p9TtOFK8LbWvY1VFsGjfacTJPALMBQAMbAtsatmelnaSX1KbtDaQBNqThCaVOaCuWDgagE0M+qoGNEjii5jxI5uNIa3N33A7RJIWwPmzCFeHuaHJcMbSsYXssjoXnL1K32SXc3baFZu1cwNtSx02B/uXVFMJjb5eXt22jDxPIBIClj9GpRqK+Xu2iPhF63//O+fEh4Udg4YDoKL/sn9b/UMd/z/ku8QQ2X5vQP5vX5Hr5XAINxob9OemW0hwVI0UXRbTReS73ExFMbMNyuhMWEK09Q3198VPi/UZMXbHLm146W/18f8o4ycS7HmaRC8IVgnlWYFkPpaYxabvrXhSd0sSyNW8lyWkjibfvOM9FhvE5L4rPZ0JYdpxfjSSmxNystF+h3mIXOIy4khYnkvQkDTQ0ACBglqF0ayLG70xYhYneRF2Yhyvl6l0EdGrg7UTKOy2lrkBd9ksq6K5qEwbS2ij4wUcF8yEAipFp3j5W+4Qoj3jbkoHVdLXScuW25fK62b8p7KSzq8INVLId5q0Vl2w+Sv3HRN5v2p448SHhh2Gh4kdiIKdmpRcXL+l+ruWXnROOlUcHBe3+acWzWEGtfr/wT/h6+03zfmOvIIWW1MaGtxfQ1V3/Pm+T6kg9Y5tZmqwPiunKzkX8zs5FatTN9H2VTLxrEEq5LX7wZcG8jAAyg2BlnMjvlzxvUjZAFIl3GIjrU/iYqDTcOYPnGka5aBi3DM5VazsfBB8kzfOZ6J2yY9+oBoqHtBDrm2SZbAAwm4CWgUySoLYwhwCnmZ9LpbOP3K2MisC2fuiFUSsBSwGkmui5fs/9JjELK1UHRpa5QQsRtxZLBxvM51TP08DGsuMMmpG1lP3PGcxnVr924/hITfQ3y+231vTMbb8Sf7B2yUj
zWPWfZ10LALhp9hqs3LYcN81eYwZwSDeGcXRnuhArtzeU04+FqEMdAgtD4tEgCH1wcDZWbpvQE0xpVrrM79AXQCJfzeV8/EeBb/fT6s519mNpi04BzBaVfxn7FAc2rW7OxLuGfBB+QDIPppUR0G6m6v1gfmQyemiGjr3OSZJnhpg9AevLrjO+jt+VbvVflMzHD37tB0f25nNfYCHSouf+qKXszxLMxwEwNNGLZcf5VwIbdpycwUTbzTT9IAGL665cVESPCuaTCHA0sDkyzWtH6+wzGpFlbY5M8y/cOD5Mg4LItjZWXwsMY131/3353L8V/OANqfXBTNTrW9at1UwXI01bJXN9SmOnkyQXRrZ11YWLnrt2pDmswYLlp//BQ1cOVlzeNHuN2wd36a/lxcrTWpaG1uWk8+PHaUXfqCZczTATsR6+dSkwNBsC7emzdKA/7msyQwT9zkcBquwbDHodwIva9fLi2X3XebYek9tjxkyk+X59Jt41NMvgIGCuoVTLSOZQ46E/5/1jPgg+QJoXgGinb9s3jbe9mBdGxwvmIc57AjgsF4bnljzvHi1E3FvI/52Zph2k2Ygtc7CRQGyaNwJAa6l8lNB6iHgrIV7sz+d+5ETxYqnV3MC2n5w0/3QiBLa9bpRjdDHnNS2BJ4aJJvnAADcZa2Q53lqzZhFQ7RvU9QbNOuwR+RUzoYMAgMEKALTFpfCg+PEXrt14yzCXGhXb1DsNoe9PtbgMIAMABOnnWt14fB/SAySKcgwMk11EuSg1TvTsOBPvvYxmCELFGmlP3C8T7xo00dYm5lBbUiknzctECxH353I/E1o7bhSdSKw9YHzpTVLrA+rNqABA8NBeholhDHvdoud+v+D7ltB8DACthXi6mPN+CgAD8fcJp5dNBYlp7NAhvSQrTxMAAAbCRBpNDbrzvn++maoLAOQ10esl17m69rf9oKfFyqpwAwCBZFpQbz53wxtfKJ4YjhrGGo2OfPyDYmg+l6R0tBC8tdVN7hNiBHuBETAllwh4mwd8b4bCsSH1a7s73z0JM1AMjeMSJZZJ4s0tXvKQ2EOCNxUoDbOnbH9GaToGAKTgZzty0Xcn+vMeK5l411BynRtb/OCIasybgZ7EkL+YjJBJLXk/ONNM098WwGw7SQMVJ6v6ct63xxqe8B37YaPsf7CafQJUutWHpjmis50dxQc5cfxeAjo00ZtF171KE0UACEKMaIA1HSjb9rdzUfQJqmxY9idS3uu7TkPfRS8IT7BS9ftVoyvBvKAQhDkUcNfpc9cuWbUS4AcOWIKeoUV8FFJb0qUmpTqPCGhxk99gjDnyo1yLbVPdGCbykwB11nhcain40YKT7pny/Emiu2R9ItXi3QCZKZh3FsUZnYXo6zNVwHvL1v9SWlxQ/VppurCnbPmdhfj7U3nfTLxr0FIGvfncVwayTboCy3p4MrNNgEq6oJmmV1SFlwBXan1ePghfHNLhfRgKZf8yQ6kzAJAG+ggwGdieGPLW2DK3DneekaatXhR9odpZXjIf31r2T0YlC8VhonVF1/mumgLHxMkiscx3ei3z70c7zkzT0+odCgXzUSvSDd9OF721pn1Rz/K32y95Dhusw2qPUa3qrXB+MqKHy2SiNKyesv1ppWkZAGUIvbojH19D1NitptVLVrlp+nwpNM7SmmYRITEkv9biJsO3BpqGBLGYnWpxHgaze4g000n9vnlOWy4Z9fd/OqK0OKJxjBrGJptMvOsh0mXPXTVVl3eSZHFtqTtQ7fCul6BJh/da8n5wjqnUx2uzVRTRo7353D+N9nTgRdFvVYW75r7zBpPJmGcVgpB6C/l/GOt7EUq5+SD8ADHPYaKtJde5UQ+YYu1VCKpJsy5Va+b18lWbrjv20sWL7a3GscRkpgW1ceuVvT/bk38RPWX7k7UrtlSL+T1lq9yRj5vm8FsG93fkk3H3Ix2JYmAcHSbyYgYKUvCr7V78s6l83I9TuWRw47UGpfeMre6UQBy
iPi2TMOWLgEy89zCxlJtdoEmHd2roTl6PodTJ9WmGxHzYQIhnZBh2/VCT1mRHkNbGmOxXmUWLH3x5MDunkvFyRG8+9xUQaSNNW9wovggAAtu6JzWMPebeGBnmr2Qcn1bbCUkTPV3b2CJclERPPP3K1xb906zDnU1m5/qvbnsimaX2aEGL0pU2Y7sgSitx02ELsCaTcigP8WPjzwFqB4BU4bjusj2rqxD9y1Td07XSp8NE7qyEf6qwMiSPvJldAzPQW7YuTTSdCkAagp9uz8XXN3ti2ROYkh+KU16KwYIpjiw59f4q+6Z4M1M+CM+QWi9RgjaVXfe+0Zoa7CmUYZRTIe4d6PBuAZWuMmXXuX0spzcZ0wN52yMSm+bDRhRdOJJ3NwMRjzG+PxBaGpJLLpiX5YJwhRai24njP6uGhgw/uCi0rH/ZU/1IQ8deJ5j/n5WmFxNzQQnxWslzf9xwoAG8+VfvjOihMZUQENf/4Gh8beF2iyAxzq8KdxWl6bQwoR845uRav1axDC5bUt8QK3F55d7sS6EfbHGTJ8Z6jV7fuiRW4veqmTyJ4iO7y5bZmY9/OhVzHo32XHx3b9kMEiVWAGDL0L9u9ZIRjbUmg31SvFvL5T+Rms8hgAwNGKXyyQMNeafFhkh/PvdTLwifN5Q6jol2llznnrGUn8eGfMRJ0pNqi2u0EE+BSIMZI/leh7a1wVDpVWaqLgHQwUCPqIRN8sBgm7HHx/o9Esyz6hs9EECGUseQUh21m6kCOMBOkpV7spm07zpP+cC09vUwpH4sUeIgoNrejEPT2KOOeE3SLMkqBtYnHDP6p6m6aXs+vj1O6RE/MpZbhlo33hz1RNEpVeGuQKSUOAnAXhFvAGjLJQ8DmPpWeDXsc+LthNFSqfn0WmERzCfmg/D0kuc+sjfnVovvOs8BGFcHFt91nxAc/Keh1DnE7CghXlKCXm3vL/49gAOYaHNkGtfV9uWUSnluFJ2aSmNDyfMe9IKwSXyJQwAAIABJREFU30rTiwic06CnwEwgspUQLxQ9d8y5zYmgl03VpE0Ps9XMXIqYG6xZ9zc0QxKgqp+x7bn4ul7fKqWKTgSQmoZ+tM1L9tjvqCX1E4Gm83Z9eFTnSYdrhhTU9Elvcu5tcL9lTHiDslkR6X5XWLrPibep1KGEofFdAoRgPW+4c2YSJc99GMyP5ILwNKn1YjtVnx00oWLucuKkK7Ksz2khorzvn22m6mMCOMBKVejE8cuC+eBdsWBGKuj+vnz+m+OdhzLN7UjSCHXfawhRYubNYB6Sk8xETVdXbhgeZcfJ7wpgHgClBD1ZdN3/mhYbn5NEf2CcECbycmaaS+Cttql+0eKmTxAB7bn4DuyljuwtXrIm6BNvAHRI3Ut7LHacpOT2h+aHtKa5gnh7wU2utwweMc/eEPxkongZarr8SKGfGemcfZF9TrwDy3rMDIIrajerBoo5GvKBZyJCa6ul7H9JMB/brJmBAOZ7YXRW2bEfMFP1YYGKORIBzmA5fQ1S8wlSKVeNUywTw+jRRE/IGq8RDewILfNuBtJcGM0lYAlQMaHybauhfN2O4wPdOPlCraeKofm8Fj9wewv5UVMCZwKpIjeIjc8CFe9tBrWHCf1VmBjbDaF/056Pr57KFe5oOKa+IUzocwANfghLwc/uiTkxg3p868vM4migsnnT69PhXYXoiyPdvz0X39xTtqxU0XIAQgp+rj0fN+5p7OPsc+KdmkZPHMv/MZX6gAC6GOhLhbgzcOxX9vbcJoNcEF7WTISrMMBMKNpJsrA+JXEYTDSp1hwLfTnvXwt+sG3A5a8vtMzbI8vaBACxaf5FLgyXA6Cy46xuFkt34uTCZmZYgvkYqVReSbnblY57m2JonF0V7l2QBDA31fLS7pIVWga/pBQd6FjpY66lR2+6O4m0eskqLsOLlTgLDEcKfrk9F1+zO9cshcbhqaJ5eSf9tSG5wfqBGejzzfPiVJzJoKEGaiyO6PfNM9tyyQP
DXZ8I6MjHw/by3F/Y58QbAEo573ap1AN2nBwZG8ZrqTk5viTTAaF1Q1edWpjwctlxnjDTtJWbpCQykNZ2q9FEL05UJFmItD+fa77iIdJl1x0tg2Ck37+9kvY1FpghypGxRArd51p62MIoABCEfoywm6y0uDiIsRIgGQfiQ2Gs/6c9H09qLvdotOWSewFMqJtQlIjWVFGnZ6s3mCF2luy/0EwnAGRFJfkRx0y/3+qlj9ees7NkfUZpcXFt2KMWzTRsA5DuknVZqugsBrlS8CutbvxfpsH7TIhtPOyT4g0ASkrfd+WTe3se44GYpRPFSxIptqWmOSRVS6ZpPh9GVwjmI+vPY6CsgVdYiLdLtvXz1rL/aan1KQDytU0PFNELKdETptZnAMgx0WtF17lqz7y7RmLDeETG8fn13eM10TPTtTF0KZRL/cj4LIMOBhCWQ17TkY/+dbjClhY3eSxKxcvM1PBzq1BbsEKFWIn3poruM+T0EKQ4pdb+wPyIZpovCO/k7OQG19JbmEE7S9anlRbvAlAoR+YGQfpVzeLUXWfTnCgxPsycrq6aNUWJaFdanDGccAPc49lp04yb3rJ5VqLEx6v51ErTgl7fsma1ROPes9kX2GfFe6bhBeGJdpL8LwEsYqCooviRvpz3XRABzCgE4V/JGifBQXcLoDeW8ielnHcPABTK/gcMrX+r9toa2JwY8uay497PglIAE3bMm0wCx35ZavWfZqo+RMAsAIkiWlPMed/Z23MbDj82Ps4Q1Q0+VzGd2etbGwce4xsggm7z4m8UA/PDqRbHAjhwl3Cxqs/0AOiAMJEL8zIdc9FKPcygOBWdptS9u1MtyQzq9a2/ZBbLAEAxUAyspaaM/ncxNM5SWvxW9YmCgYMVi4aMIgYWxqnosE29AwASJeYA1KTWgBMCNtqmutkxdVP/+ESJk1HTOQgANNNRSsOSApPjfjmDyMR7OsAs7CT5HQEsAio9HqXWF+eD8JWS5/5qwAJ2SGyQAKREj5cd5zu1/itS6wZPBQKckuveOy3y3JlFIQguFkofzIJ2lm3n5pInH9zb0xoLqSKPmQ6qH1eaDh7pPMvg/s5CfBUA9JbN8xMlTkHl89fXTOcOPZq3O6ba2HiV0VGarGIgj41TeQWDFgLYYUl9Z3s+ntCHdTEwl9c/MTBoUTE0L1CaFjcJBeXQyDZT6sGwpWenr/iR8SaDFtVcNbUM9e02L3lolCrJZh9EKWFmGlrtLpl4TwPsOFlAwEG1YwQIqdXhAH5FzIVmmSUsaH0T46wmXbWBtmLpqwS4Soi1Rc/90ZhK4AGYcTJbsvZCy9owUhHQWGktlT9nMJ8NAFCMFt8/oS+f+9Kk+YZPIUJwSEAP11WpEmHM1YhtueQ+APcBgNYwdpbsvGY6sbIC56Ip9a3jDZkEsZhdCs1PaaalAHI1BSzzYiV+u7tkcnsuuXUsPz5moD8wT001HcIMu1l4gxkOgZvtI/UQdDdDDHQ04rIl9R21q39BUI6V/jCMjd/hyu98jyH0r9q85MHR5mcb6qEgoZMBGvyQkIKfmmrrVaCSo99Xti5NNS0WgrsLTnKzZfBeNXHLxHsakBpyByL0oq5RLxP1AoDv2L+2SulGASwcfA3oDS3rV/XXCk3zTi+Oj61uVDKgCCgI4HgAEFof3uL7+b58/lsjzWkgJfHzopLZYrpR/Irv2N+KLGvCXdXtKF4kmU+tHRPAYbkgvLCY83450evuKQRBG1LfkyjxMYAG4vS8KWcnE1rZCoG0qxB9vT80lytFB7pW+rhr6XF3mS+F5mc1ixOav0pWouQn3imKU9u8+Bsj5VAzAztL9p8pTWdWRJsVwOVasQS427PT+wnQfb44iUEHDYynhtT3t3nxz/sD83zN1O6Y6rGcrRp84Vvc9Dd5J306jOWBpqF3mpKHLDiYQT1l89JUiRVEeKstF//AlBy2eOlT7NO34lScy4AniNdKwW93l6yPWIZ6Mu+oKfM17y7Zf6q
0OBMAtAJ6y3RcZyH+ghS8x1wo68nEexqgpCynQvzK0Pq91UwQDbxetu1bgUpWR2Bb/+7EyZXEfCCAd2LTuCU2zQYzq9Cx14LwdTtJLiSGDeY5tR3hAUBoPnY0A6q8H1whmVdUvybgCDeKfzeyrK9P9H0aWh1YvzkJAMTc0DV+utKRj28tBsaGKBWnEMEv2MkvLbP5CqwcyoODxLgAgGVKvabVSx6vP4YIaHWTNQAm1HItSSk/sOIeASJmWtYfmFd2FeL/Gu6oYmgepzS9a9dqmyTAFkG/wqB2It5uSX2HbeheIui2XPSlYmhexkwtptTPtnoVe9r2XHLnaPMWBO3ZjeEhZuCdfvubDFoKEJixrLtkn97qxp9zLL1j4B6PKQ2zu2R/JU7lhwAgUWJlnMobhnNk3B38SC5Smk6uHWOIQ/t887c68vHNw5031WTiPU3oz3nX5IJwnaH10UzUX7at25Rh+NXXQ9teG9r2X4/mYTJw7Guhbb8GAG3F0p+DeYh4D1gHjHgRwdwQxyXmxWO5/3AEtv2klaRbaq1pGYgSY+9lBVW6upgnKUVzcnb6iGXqUd0PC276fAF4fqRjioE83I/Nv6y65ylNZ+kS/Xii8efhEIJjAAEGPGpGQvPItqupokOGeoYAAJmmoR8xZbrOj43fiVL5mXf65eWm1He35+NfdubjEXuHjpc+3zylKtw1c2gphubHHSv65+pIv2+9W7M4ruYYN1Hi3YmiO+pX8rvLwCZrw6KDeVch4N5gv/MDmLYQoey5j/flc1f157yf1wp3/XHjITHkY4yh3sKK6IUxGGE1jWnuTtxbCxHFpvFDDbzBAGtgeyrFdYHj7DHDqlqUhrWjaP9NEMsvxUp+qse3/q23bJ4z+pmjEybGu4fanpKVKHHu8GdMDCkQG0I/VvkYqsI+0BgeEYQdI13LMtSzQH1RDZctmT7tR+ZnKlknVGDQoliJ3y6GxqQ3HFCaDm22rtBMC+u+btLTkzqjRDbv1rsb5J3kKYDrwoUcWqZePdn3Gg/Zynsfp+y6q0j7eVOpswF4urJhefVo54WWeZsbxUdVy+sZCBIp75mE+TzmO7zaSpI5iWHsHG/z5cmkz7feNzRWTB1xKt+vOXl4LOXhpdBYmihxUM5Oft0kltywEmYgX/vgEiWiUwgOd3el2JGPr+71re2poiOJUHbM9C6lRVeYyM/ssnzlXqVx0PY++x8MyU+35+Kf12d25B31WpTK21MlLqrkn3PRkPp2zaKFgcVD70punIgVcPDycPPSDNlbtj6oNB1KhH7XSm/L2eqNkd6LKfXzqRYfqt8oFcRDiqGE0Bug69eevNW1GmPsu4sUSFwzvTpMjI9yZd9phyn13QUnHfa97wky8d4PKOW8uwGMq3N5aNuvaRJfcuL4IgLs2DAeDRx7Un5ZmUjtzsbnZKE0NZiVMTA/TkWnY+phm2NohuiubOydCpAVp+IK21A/HcgkAQBIwa9ohZNqzxPErxMBQSznFUPjD7kSqw6k4Mc789F3J9p1nAjcnotvBXArABQD46hE0dEC+nESiDXTXGY6ERBtDCBRfHh32ZLN/K878/E1QSzuCRPjaNdKn3FMvT2IxHxUnt6cuvs2fzocoLtk/7HSA08bDJRC8xhB/OWRLABavPS5qE88oSFOrX3M1ExHFAPjqIJb6dfZ5iX37SyJY6o/A4B32oa6XlbCSJNOi5c+WeD0N1Equkyp+6ZDXnkm3hnDElvmttgyf7S35zFVSMFbdeP6eqsldfdI5/X55gXVzIMK1Bml8oNKJw9X/6jbcvH13SX7AKVpOQBbEL9ccJKrAKAUGp+qmjEBsJWmi3vK1paOfHzTeN9DlIj2cmScR8RBi5veWwyMs6JUfhKgSg9PzdsJ3D+0GIhIKToBw/hfu5be4lrxYNaLa+vNpYif1Ezv2nUUv5V3kmHdEONUtNVv8gE024+Md7tW/IOR3lNXS/wP2/ud/8LAU9/AuW1hIi+oijc
RdFch+qdSaByWKLF44OlnSlP3iMCOqd8Z7bgkJbcYmu/VTF1S6A2tXnLnVDRXzsQ7Y7+l1Utu6i7RMs10dGWVx/2WoW8ZLW9Ya7G4cZTmBbFcWE1XEwTVVYi+lSjKaSbLNnQPAChNtuYGC1YoTYfVj41Gn2+cHCbGZ6ux9Z1FeeFA7W1N82U6gOta502Eznz0L72+tVFpOogIPTk7uSWM5ZI+X54JQFiGfqzVS35d8348NMks4iYbfw3HMASaa5NbP5B30nUAJlyNOtkoDbPHt/6GudKUWGmB7pI4vKuwa7N1ssjEe1+EWeSCcAUBju/Yj8yEApi9gRQcdRWiv+4PzHcpTbNcK33EtYYPl1Qh4iaP/bzDMfXb9aOVePaumDYRJwCKwNBMBSKM2xwsSuXK2k1RBi1GJfOkfm4+gJYh/teSx2WRLATSjnz8P9Wv+3zz9DCRfwyQBwBhQqfqMhXac/HdABDE8nQ07DxybBlq1E0+IZAK4lc0U01NAGtD8ogZPtOB/sA8vyrcVZSmU8qRPDhnq/WTea9MvPcxzCTtygfBXxKwlAAyS+nKwLa+Fdr7hiXuZEME3eol42pf1erFt3eX7BM0i2MrIxyYUt9pSB4xBgxU8ptNqR9MlPgwQANtyHiba6Zj6WE6BGY6oMlwjMYV6nYi3gjmdgAkJT/dkYt/Nt77DblJKs6vCncFshNF5wC4O06pJVHi0kbfFgQ5W70wluu3uMl3+gMTmukwAKEh9GNtXjztC7k0U0MXKYCcRIkFQCbeGSPgReFHBDD4CC6AhU6cfCS07a/tzXntS0iBuLMQfbXfN89WTLNtQ60ZT3VfRz6+rs83306UOB7g0LPUnZ6txtXHsQKXAKozg+IXiLiNK6I3EP4RxzEDBH4z7yTfrC2OYYYYePKYZxn66YEwRFOUJkszTFNymbkxhAEeWIXHxlKgma0rtfb55mUd+fiG0d6ZbeqeWWb0DaVhESGdipjxVGAb+slUifcMzQvnbXk7HXOD5bGSifc+htDckEFB3DiWsXsIgm7LJQ32BGNloLv4kA7jpVAeFibG+cwwLUOvafWSVcOdX8nqbljZQgr4nfnoH4JYLixFxu/z4NNBxVSqHBkf8mz1z0AllW9n0f6iZjoJIEqUeH+Uilvqs1AqPt3Wp5QWJwOwBPE6IfhNpTHEtEoIXgcAjpWuDRLZB1BDEYtm6hrL96fm/cyokF/eSdfGqbguUeJigA4g8EbbVD9r1pRid8nEex9DE20XPCR9F0w0ahw3Y8+RKMolqWh1LbWlmmvdHxjHBrHxeYDaACBM6My0SAd0FhozULSGsaNo/SmAhoIUZrQRAZ6tNpZCs2H1y4zB4pY+3zxfs1i+61VyUiUujhJxh23qQbOtnrL1AaXluwfvz7ScWD8mhb5PaToOgBTEL7R68dUAYBlcMqW+OVHit4fmazMbQr86vu/WzKMjH9+QKvplnIpOx1Jbp6qlXCbe+xihbd3ghdGhVRMrDWyPTeMXe3te+wNRItrDRB7hWunzlsHF+teZge6S9XupFmcBaC2FxnrXUj8ouOnzYSwvqgp3BTJSLT7CjJvri2l2FO3PMsQZzeYgiAdT/Ih4C9dVJhJh8HXdJM8doNYokYfYph60LFCaDm94L6AlnfnwE5rJBED1Bk0d+fjG7pIZJUpeUfHv5kAKXlVwk4d2lqyPKU1LCSjbprobDCNK5RkApGXoxweeSmY0huTQkGpKaxky8d7HiCxrU2IYn88F4QUArMC27ksNY69aV+4P7CxZH00rj8qtYSK7TalvrjUtihLR0eubX0SNbweDDg1i+r28k36ea/xeanD7A/O0+vAJoz5/ujIK8FsFNxnMCHGt9LpyZM4H6EAAIPCGnJP+vPq6Ifj1VLOuWx3vcK203q6gYSOWKtbDPFJRTEc++WWq0vvKkXGiKdWbnq037Sja/5/S4vzqjIOYjgcgqjHimqyVUc2t9ncy8d4H0UJEM8FidV+hFMp
DUyXeu2uTijoSJT4YJmJVtaijPzA/C4gG9z8GDg5iuYAIfXXRLgAEpeubFw8HgcCba8v084561bX05/oD4zwCkhY3ebA2h73FSx5JSuIEpXFaZe7ca0n9C9MYmjXjmOpuP6ZjakrtU0Pqh0ezuQliMa8Umh9jpgVRKnYmSt+lNJ1YN2+v7msrUXQ2gEy8RyET74yM3SRO5QmNrnPUEkTGKY4Z/7KSpTFsEU6/KXV3i5P8e19gfXdomy8uenb6KHOl6EUKDojABP0UQzYz0GrYFJOC4+EsWonAXYXoX0uBvCNM5QpL6idavHRt/XEFN32BiL8RJsYFzLBMqZ9qyyUPDvsNQcWTuxiYf8oQlffNtChMaDHGUDDETIt2Fq3fK7jJjZbBo7o87q9k4p0xbZFp2poLo8sIXEiFfK7sOqsmo5vPZCMFb0xUQ/ghNg39GgAQsUITYR3YwHvYNLhogothor4bpfIDAOYB2GwZ+pYoEYv7ffMvGDSPwFttU9/cVYi//U7ROam+F6Qh+c3xzt2P5EI/MT7JTEsDLS6J+uUzjpXepLXoyDvJaimQAEDeUevyjhpzJWMxNI5j0JKho9QO6M11zR2a9fH0Ui3f11umI7sK0V/tiU45M5FMvDOmJUaathX84P9W+3pKnV4gy/4h/fnctPNaaXGTx+NUrN5VEchaCl6VdyqrWEFQhtCPpVq8d5fAc2wIfUtHPv5x9TptueQ+rZMHo1TOsgy1Q2uyusv2vwE04OxIh4YJfcI21QuOqb4dJvITAM0GOBbET7V54/cKL0fGx5lFdUPS0Uyn+pG5HCAZJnKza6VXtbjpU+O9LlVyzBsCQVLwE8y6SzMtJXDJEPyQBi1Umk5BpQp08FiGWNoXmOe255LddrPcF8nEO2Na4oXRe6rCDQAEGIbWZ0ulrldSjqvH41RDBHQWor/v981zlKYFUvLrre7QjImOfPyDXt/akSo6GkDgmOqeqslSLUIgdS21BQD6ffOCqnDX3K3dj4yzO/LxL3IqfbYUGqcZkt/OT9CeVDdpqLxrJUzzw1h+tOCkT4/SGLiBvJM+HyRyLTMdtWuUtxec9JbaNMQq3SXr9UTJz9SPMzfmimdUyMQ7Y1pCzM3+aNuk1q3TTbyBwaKd+4d7nQhoz8W3AbhtzNcU3I2GcAwgqBIHNiQHI91zLBC4hxuqNHfBoAWJEq2WoZs15xj+upX2bt8shubHtMYCIuxwrebCDQAtbvLAzpJYWXmSGLx7n2enD43nvvsTmXhnTEuUFOtkqs6jmm5PDGyIDWNYL+h9jRY3eSJKxYvMVLWPBZFe2+KNvFk4HixD3xWlNH9oHHoIOwyhx22aBQC2qbttMxqx0XUVQ3LgmOl/RYn8cCW+j22WoW4dyVd9fycT74xpScl172kt+wdLrU8H0MLA+tAyfwiicT2+z2SIwO1e/PX+wLxcM80VxNsLbnLDZFbsteWS+0ohvxUl4nTNIGZxHIMGwlUcWFLfuac2DFu9dA1zuibVlDME++MN1UxHdMXeFpmfd8b+AxH68rnvGqm61lCqI7TMjSCaEeZEk4lpcNBZiK+ZynvknXRd3ql4YitNdr9vXqyBFtvQj+eddI+WsxNVbXRnNkrD7Cnbn1WajkGlmOm5jlz0ncn8IMzEO2NakxqyPzVkViG6h2CGEIK7bamf8Ww17tTDjAq9Zevj1UpSAFCaLugpW+XOQvz9ybpHJt4ZGRkAgF7fPDNK5G8DNDtMOPBj49ed+ejbE+2tuT+jNB3RZOzIZsdOlPr2yxkZGfshWsOIEnnFrmwPcpUW5/X55nl7d2YzFGpSlEXNuhxNnEy8MzKmgCAWc3rK1vv6A/O4Rs+S6UcQy4MAml8/nmpx6F6YzozHlPphoNa0iyNL6klNe8zCJhkZk0x3yXpvosTlA1aoaZSI1Z2F6B93N+OgFMrDg9i4hIFWSfx6Wy6+tlq+PsI5S4LYuJyZ5hDxVtdKr8s7qmET0jL1FkS8s7Y
nJgAI4lG7pWc00p5L7u714SepWAEAlqF/PdlWt5l4Z2RMIklKuUSJ9+3yHSFDM53W55vnT6TMW2sYfYF5nlI0X7E4E6AOAEgZx3WX7NmzWqJ/HO5cpckuR+afVi1hmWlROTLnu5b+03r/bVNy2ZD6nlSJD1TNsYj0S61eMuaiooyhtHnJIwAemarrZ+KdkTGJBIlcUr96BQClhzZFqEdrGFVPk+pqOkpEW59vfoUhljQ9h+mEMBEdjqm7m73eHxjnVoV7F3Rgf2Cc28xpsDMf/6wYGM/FqThBCN7R6ib3ZqZQ05dMvDMyJhHHVK8GsdEQfpCC3xrunN6yee6Am+B8wNxqGfrW9lx8Ryk0PjiccA9gMZM93IuEYYV32PDNgN9Kg+fKZBEloiNOxdycna7LPhh2j0y8MzImEcvgsin1bYkSH6qUnLMSxGtaveTeZsfHiWiNUvk71XAIgHlxKj4axOJpzcN7jgAAEb/smGrLcK8X3OTBuCgvY9BBg+eA32xxkwfG/852D2ZgZ8n+Q6XpXIBMPzZiQXpVVyH+f9PQ5XdGkIl3RsYk05GPf+FHYnWYGCcbkjcWnOTJ4QSqHBln1Aj3AFQIYuN0QbxFN2SqcAigXxCvzzvJ90cSPimQ5J3km+XIuFxX/FEcBvI9ZfuPWt34O6bBe8zgq983z1KaLtpl+UqWZnF2T9l8qSOf3L2n5rEvkYl3RsYU4Nl6k2fHm0Y7Tkp+G4pTgGr+Fpml4O2eld7e69OhmmlZxVmQt7pW+r0WN/3N2Oeh3goT+YZWdGal+Bye0nR2X2DprkL0rxN6cxMg0eLIWq/uCkSJEu8CkIn3BMjEOyNjL1JwkqeiRDytmZZXxwTx8y1u8igRdFch+koxNE9SmtoLTvKIIce/Wk41LUNdC6JmHeGnEkHcPYyb1oQcCzMy8c7I2KsQAZ356O96feu9WtN8IXhrq5fcUi1JJwK3uMma3boHENdHXwgYtuv7VFBwklu7y+ISgNp2jXLsmuq+PTmPfYlMvDMy9jJCIO3IxzdO1fUtQz0YJnTsrk7trA2pH5uq+zXDNDjoyEWf6Slbf8nAfAA9tqGub/HGHgLKGEom3hkZ+zitXvoYfKg4lWcDsAzJT7V58R17eh6mwf4BrdFX9/R991Uy8c7I2A9o9dLVQLp6b89jbxHGYlY5Mi5hUM6U+tnJLlXfG2TinZGRsU8TxGJef2D9NUDzAEBpOi8t0YLOfPyzvT233SFzFczImKEoTXaciqy7+iiUI+PSqnBXICNV4hylYe69We0+2co7I2OGwQx0l6xPpFqcBiBHxK/mnfR7nqVGzSvfH2GumoQNoUVp4UqhR3RlnM5kK++MjBlGr2++O9XiMoBmAeQxi2NLofGpvT2v6YoheV3lI28XRLzelHpGt9fLxDsjY4aRKnFkpeJyF8x0aJJSfm/NaTrT5sW3S6HvArgP4JRIr/Ws9OqZ7qmShU0yMmYYRPDR2J2nJOo8ujMqEIG7CvF3kpR+kmqRd0y1ZaYLN5CtvDMyZhyuld4B8PZdI6wNqR8dravO/o5pcNG19g3hBrKVd0bGjCNnqzcI+JofG+9mhmdI/XzbMJazGfsumXhnZMxAPFtt9Gz13b09j4y9RxY2ycjIyJiBZOKdkZExaSQp5f1IHsjcYN6dMclkYZOMjIzdZqBw6PdTLc4EUCiFxnrXUlcP9MTMmAKylXdGRsZu0+eb56VaXAJQK0CCIQ4NYvl72Qp86sjEOyMjY7dJtTiioXAIdEiYyDl7a077Opl4Z2Rk7DYEblZq3mNK3bfHJ7OfkIl3RkbGbpN30tsI/OauEVaG0A8bkv29N6t9m2zDMiMjY7exTd3T5sVfKUbGZczUYgr9fIuXPLy357Uvk4l3RkbGpGCZurfTjH+8t+exv5CFTTIyMjJmIJl4Z2RkZMxAMvGPIyXeAAAA40lEQVTOyMjImIFk4p2RkZExA8nEOyMjI2MGkol3RkZGxgyEmBv7KWV
kZGRkTG+ylXdGRkbGDCQT74yMjIwZSCbeGRkZGTOQTLwzMjIyZiCZeGdkZGTMQDLxzsjIyJiBZOKdkZGRMQPJxDsjIyNjBpKJd0ZGRsYMJBPvjIyMjBlIJt4ZGRkZM5BMvDMyMjJmIJl4Z2RkZMxAMvHOyMjImIFk4p2RkZExA8nEOyMjI2MGkol3RkZGxgwkE++MjIyMGUgm3hkZGRkzkEy8MzIyMmYgmXhnZGRkzEAy8c7IyMiYgfz/Rd2m/XSSxDMAAAAASUVORK5CYII=\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from sklearn.tree import DecisionTreeClassifier\n", + "from sklearn.ensemble import BaggingClassifier\n", + "\n", + "tree = DecisionTreeClassifier()\n", + "bag = BaggingClassifier(tree, n_estimators=100, max_samples=0.8,\n", + " random_state=1)\n", + "\n", + "bag.fit(X, y)\n", + "visualize_classifier(bag, X, y)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "In this example, we have randomized the data by fitting each estimator with a random subset of 80% of the training points.\n", + "\n", + "In practice, decision trees are more effectively randomized by **injecting some stochasticity in how the splits are chosen**: \n", + "- this way **all the data contributes to the fit each time**\n", + "- but the results of the fit still have the desired randomness.\n", + "- when determining which feature to split on, the randomized tree might select from among the **top several features**." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "You can read more technical details about these randomization strategies in the [Scikit-Learn documentation](http://scikit-learn.org/stable/modules/ensemble.html#forest) and references within.\n", + "\n", + "In Scikit-Learn, such an optimized ensemble of randomized decision trees is implemented in the ``RandomForestClassifier`` estimator, which takes care of all the randomization automatically.\n", + "\n", + "All you need to do is select a number of estimators, and it will very quickly (in parallel, if desired) fit the ensemble of trees:" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "ExecuteTime": { + "end_time": "2018-05-21T09:28:30.271183Z", + "start_time": "2018-05-21T09:28:29.792594Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/datalab/Applications/anaconda/lib/python3.5/site-packages/matplotlib/contour.py:960: UserWarning: The following kwargs were not used by contour: 'clim'\n", + " s)\n" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from sklearn.ensemble import RandomForestClassifier\n", + "\n", + "model = RandomForestClassifier(n_estimators=100, random_state=0)\n", + "visualize_classifier(model, X, y);" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "We see that by averaging over 100 randomly perturbed models, we end up with an overall model that is much closer to our intuition about how the parameter space should be split." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "## Random Forest Regression\n", + "\n", + "In the previous section we considered random forests within the context of classification.\n", + "\n", + "Random forests can also be made to work in the case of regression (that is, continuous rather than categorical variables). \n", + "- The estimator to use for this is the ``RandomForestRegressor``, and \n", + "- the syntax is very similar to what we saw earlier.\n", + "\n", + "Consider the following data, drawn from the combination of a fast and slow oscillation:" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "ExecuteTime": { + "end_time": "2018-05-21T09:31:40.938055Z", + "start_time": "2018-05-21T09:31:40.742169Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "rng = np.random.RandomState(42)\n", + "x = 10 * rng.rand(200)\n", + "\n", + "def model(x, sigma=0.3):\n", + " fast_oscillation = np.sin(5 * x)\n", + " slow_oscillation = np.sin(0.5 * x)\n", + " noise = sigma * rng.randn(len(x))\n", + "\n", + " return slow_oscillation + fast_oscillation + noise\n", + "\n", + "y = model(x)\n", + "plt.errorbar(x, y, 0.3, fmt='o');" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "Using the random forest regressor, we can find the best fit curve as follows:" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "ExecuteTime": { + "end_time": "2018-05-21T09:31:50.952109Z", + "start_time": "2018-05-21T09:31:50.598789Z" + }, + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from sklearn.ensemble import RandomForestRegressor\n", + "forest = RandomForestRegressor(200)\n", + "forest.fit(x[:, None], y)\n", + "\n", + "xfit = np.linspace(0, 10, 1000)\n", + "yfit = forest.predict(xfit[:, None])\n", + "ytrue = model(xfit, sigma=0)\n", + "\n", + "plt.errorbar(x, y, 0.3, fmt='o', alpha=0.5)\n", + "plt.plot(xfit, yfit, '-r');\n", + "plt.plot(xfit, ytrue, '-k', alpha=0.5);" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "Here the true model is shown in the smooth gray curve, while the random forest model is shown by the jagged red curve.\n", + "\n", + "As you can see, the non-parametric random forest model is flexible enough to fit the multi-period data, without us needing to specify a multi-period model!" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "## Example: Random Forest for Classifying Digits\n", + "\n", + "Earlier we took a quick look at the hand-written digits data (see [Introducing Scikit-Learn](05.02-Introducing-Scikit-Learn.ipynb)).\n", + "\n", + "Let's use that again here to see how the random forest classifier can be used in this context."
+ ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "ExecuteTime": { + "end_time": "2018-05-21T09:33:28.977941Z", + "start_time": "2018-05-21T09:33:28.852002Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['images', 'data', 'DESCR', 'target_names', 'target'])" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn.datasets import load_digits\n", + "digits = load_digits()\n", + "digits.keys()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "To remind us what we're looking at, we'll visualize the first few data points:" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "ExecuteTime": { + "end_time": "2018-05-21T09:33:38.130046Z", + "start_time": "2018-05-21T09:33:36.539685Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# set up the figure\n", + "fig = plt.figure(figsize=(6, 6)) # figure size in inches\n", + "fig.subplots_adjust(left=0, right=1, bottom=0, top=1, hspace=0.05, wspace=0.05)\n", + "\n", + "# plot the digits: each image is 8x8 pixels\n", + "for i in range(64):\n", + " ax = fig.add_subplot(8, 8, i + 1, xticks=[], yticks=[])\n", + " ax.imshow(digits.images[i], cmap=plt.cm.binary, interpolation='nearest')\n", + " \n", + " # label the image with the target value\n", + " ax.text(0, 7, str(digits.target[i]))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "We can quickly classify the digits using a random forest as follows:" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "ExecuteTime": { + "end_time": "2018-05-21T09:33:55.884986Z", + "start_time": "2018-05-21T09:33:53.297093Z" + }, + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [], + "source": [ + "from sklearn.cross_validation import train_test_split\n", + "\n", + "Xtrain, Xtest, ytrain, ytest = train_test_split(digits.data, digits.target,\n", + " random_state=0)\n", + "model = RandomForestClassifier(n_estimators=1000)\n", + "model.fit(Xtrain, ytrain)\n", + "ypred = model.predict(Xtest)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "We can take a look at the classification report for this classifier:" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "ExecuteTime": { + "end_time": "2018-05-21T09:34:14.349847Z", + "start_time": "2018-05-21T09:34:14.344599Z" + }, + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " 0 1.00 0.97 0.99 38\n", + " 1 0.98 0.95 0.97 44\n", + " 2 0.95 
1.00 0.98 42\n", + " 3 0.98 0.98 0.98 45\n", + " 4 0.97 1.00 0.99 37\n", + " 5 0.98 0.96 0.97 49\n", + " 6 1.00 1.00 1.00 52\n", + " 7 1.00 0.96 0.98 50\n", + " 8 0.94 0.98 0.96 46\n", + " 9 0.98 0.98 0.98 47\n", + "\n", + "avg / total 0.98 0.98 0.98 450\n", + "\n" + ] + } + ], + "source": [ + "from sklearn import metrics\n", + "print(metrics.classification_report(ypred, ytest))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "And for good measure, plot the confusion matrix:" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "ExecuteTime": { + "end_time": "2018-05-21T09:34:33.359550Z", + "start_time": "2018-05-21T09:34:33.001144Z" + }, + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from sklearn.metrics import confusion_matrix\n", + "mat = confusion_matrix(ytest, ypred)\n", + "sns.heatmap(mat.T, square=True, annot=True, fmt='d', cbar=False)\n", + "plt.xlabel('true label')\n", + "plt.ylabel('predicted label');" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "We find that a simple, untuned random forest results in a very accurate classification of the digits data." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "## Summary of Random Forests\n", + "\n", + "This section contained a brief introduction to the concept of *ensemble estimators*, and in particular the random forest – an ensemble of randomized decision trees.\n", + "Random forests are a powerful method with several advantages:\n", + "\n", + "- Both training and prediction are very fast, because of the simplicity of the underlying decision trees. \n", + " - In addition, both tasks can be straightforwardly parallelized, because the individual trees are entirely independent entities.\n", + "- The multiple trees allow for a probabilistic classification: \n", + " - a majority vote among estimators gives an estimate of the probability (accessed in Scikit-Learn with the ``predict_proba()`` method).\n", + "- The nonparametric model is extremely flexible, and can thus perform well on tasks that are under-fit by other estimators.\n", + "\n", + "A primary disadvantage of random forests is that the results are not easily interpretable: \n", + "- if you would like to draw conclusions about the *meaning* of the classification model, random forests may not be the best choice." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "\n", + "< [In-Depth: Support Vector Machines](05.07-Support-Vector-Machines.ipynb) | [Contents](Index.ipynb) | [In Depth: Principal Component Analysis](05.09-Principal-Component-Analysis.ipynb) >" + ] + } + ], + "metadata": { + "anaconda-cloud": {}, + "celltoolbar": "Slideshow", + "kernelspec": { + "display_name": "Python [default]", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.4" + }, + "latex_envs": { + "LaTeX_envs_menu_present": true, + "autoclose": false, + "autocomplete": true, + "bibliofile": "biblio.bib", + "cite_by": "apalike", + "current_citInitial": 1, + "eqLabelWithNumbers": true, + "eqNumInitial": 1, + "hotkeys": { + "equation": "Ctrl-E", + "itemize": "Ctrl-I" + }, + "labels_anchors": false, + "latex_user_defs": false, + "report_style_numbering": false, + "user_envs_cfg": false + }, + "toc": { + "base_numbering": 1, + "nav_menu": {}, + "number_sections": false, + "sideBar": true, + "skip_h1_title": false, + "title_cell": "Table of Contents", + "title_sidebar": "Contents", + "toc_cell": false, + "toc_position": {}, + "toc_section_display": true, + "toc_window_display": false + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/code/.ipynb_checkpoints/09.09-machine-learning-summary-checkpoint.ipynb b/code/.ipynb_checkpoints/09.09-machine-learning-summary-checkpoint.ipynb new file mode 100644 index 0000000..285bad0 --- /dev/null +++ b/code/.ipynb_checkpoints/09.09-machine-learning-summary-checkpoint.ipynb @@ -0,0 +1,3055 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "***\n", + "***\n", + "# 
计算传播与机器学习\n", + "\n", + "***\n", + "***\n", + "\n", + "王成军\n", + "\n", + "wangchengjun@nju.edu.cn\n", + "\n", + "计算传播网 http://computational-communication.com" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "![](./img/machine.jpg)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "## 1、 监督式学习\n", + "\n", + "工作机制:\n", + "- 这个算法由一个目标变量或结果变量(或因变量)组成。\n", + "- 这些变量由已知的一系列预示变量(自变量)预测而来。\n", + "- 利用这一系列变量,我们生成一个将输入值映射到期望输出值的函数。\n", + "- 这个训练过程会一直持续,直到模型在训练数据上获得期望的精确度。\n", + "- 监督式学习的例子有:回归、决策树、随机森林、K – 近邻算法、逻辑回归等。" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "## 2、非监督式学习\n", + "\n", + "工作机制:\n", + "- 在这个算法中,没有任何目标变量或结果变量要预测或估计。\n", + "- 这个算法用在不同的组内聚类分析。\n", + "- 这种分析方式被广泛地用来细分客户,根据干预的方式分为不同的用户组。\n", + "- 非监督式学习的例子有:关联算法和 K–均值算法。" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "## 3、强化学习\n", + "\n", + "工作机制:\n", + "- 这个算法训练机器进行决策。\n", + "- 它是这样工作的:机器被放在一个能让它通过反复试错来训练自己的环境中。\n", + "- 机器从过去的经验中进行学习,并且尝试利用了解最透彻的知识作出精确的商业判断。 \n", + "- 强化学习的例子有马尔可夫决策过程。alphago\n", + "\n", + "> Chess. 
Here, the agent decides upon a series of moves depending on the state of the board (the environment), and the\n", + "reward can be defined as win or lose at the end of the game:" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "- 线性回归\n", + "- 逻辑回归\n", + "- 决策树\n", + "- SVM\n", + "- 朴素贝叶斯\n", + "---\n", + "- K最近邻算法\n", + "- K均值算法\n", + "- 随机森林算法\n", + "- 降维算法\n", + "- Gradient Boost 和 Adaboost 算法\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "> # 使用sklearn做线性回归\n", + "***\n", + "\n", + "王成军\n", + "\n", + "wangchengjun@nju.edu.cn\n", + "\n", + "计算传播网 http://computational-communication.com" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "# 线性回归\n", + "- 通常用于估计连续性变量的实际数值(房价、呼叫次数、总销售额等)。\n", + "- 通过拟合最佳直线来建立自变量X和因变量Y的关系。\n", + "- 这条最佳直线叫做回归线,并且用 $Y= \\beta *X + C$ 这条线性等式来表示。\n", + "- 系数 $\\beta$ 和 C 可以通过最小二乘法获得" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "ExecuteTime": { + "end_time": "2019-04-22T08:22:22.109042Z", + "start_time": "2019-04-22T08:22:20.811040Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [], + "source": [ + "%matplotlib inline\n", + "import sklearn\n", + "from sklearn import datasets\n", + "from sklearn import linear_model\n", + "import matplotlib.pyplot as plt\n", + "from sklearn.metrics import classification_report\n", + "from sklearn.preprocessing import scale" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "ExecuteTime": { + "end_time": "2019-04-22T08:22:24.400103Z", + "start_time": "2019-04-22T08:22:24.390296Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [], + "source": [ + "# 
boston data\n", + "boston = datasets.load_boston()\n", + "y = boston.target\n", + "X = boston.data" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "ExecuteTime": { + "end_time": "2019-04-22T08:22:25.362696Z", + "start_time": "2019-04-22T08:22:25.356162Z" + }, + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD',\n", + " 'TAX', 'PTRATIO', 'B', 'LSTAT'], dtype='|t| [0.025 0.975]\n", + "-----------------------------------------------------------------------------------\n", + "Intercept 36.4595 5.103 7.144 0.000 26.432 46.487\n", + "boston.data[0] -0.1080 0.033 -3.287 0.001 -0.173 -0.043\n", + "boston.data[1] 0.0464 0.014 3.382 0.001 0.019 0.073\n", + "boston.data[2] 0.0206 0.061 0.334 0.738 -0.100 0.141\n", + "boston.data[3] 2.6867 0.862 3.118 0.002 0.994 4.380\n", + "boston.data[4] -17.7666 3.820 -4.651 0.000 -25.272 -10.262\n", + "boston.data[5] 3.8099 0.418 9.116 0.000 2.989 4.631\n", + "boston.data[6] 0.0007 0.013 0.052 0.958 -0.025 0.027\n", + "boston.data[7] -1.4756 0.199 -7.398 0.000 -1.867 -1.084\n", + "boston.data[8] 0.3060 0.066 4.613 0.000 0.176 0.436\n", + "boston.data[9] -0.0123 0.004 -3.280 0.001 -0.020 -0.005\n", + "boston.data[10] -0.9527 0.131 -7.283 0.000 -1.210 -0.696\n", + "boston.data[11] 0.0093 0.003 3.467 0.001 0.004 0.015\n", + "boston.data[12] -0.5248 0.051 -10.347 0.000 -0.624 -0.425\n", + "==============================================================================\n", + "Omnibus: 178.041 Durbin-Watson: 1.078\n", + "Prob(Omnibus): 0.000 Jarque-Bera (JB): 783.126\n", + "Skew: 1.521 Prob(JB): 8.84e-171\n", + "Kurtosis: 8.281 Cond. No. 
1.51e+04\n", + "==============================================================================\n", + "\n", + "Warnings:\n", + "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n", + "[2] The condition number is large, 1.51e+04. This might indicate that there are\n", + "strong multicollinearity or other numerical problems.\n" + ] + } + ], + "source": [ + "import numpy as np\n", + "import statsmodels.api as sm\n", + "import statsmodels.formula.api as smf\n", + "\n", + "# Fit an OLS regression of boston.target on all columns of boston.data\n", + "results = smf.ols('boston.target ~ boston.data', data=boston).fit()\n", + "\n", + "print(results.summary())" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "ExecuteTime": { + "end_time": "2019-04-22T08:22:29.198868Z", + "start_time": "2019-04-22T08:22:29.179869Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [], + "source": [ + "regr = linear_model.LinearRegression()\n", + "lm = regr.fit(boston.data, y)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "ExecuteTime": { + "end_time": "2019-04-22T08:22:30.210025Z", + "start_time": "2019-04-22T08:22:30.203639Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "(36.45948838508965,\n", + " array([-1.08011358e-01, 4.64204584e-02, 2.05586264e-02, 2.68673382e+00,\n", + " -1.77666112e+01, 3.80986521e+00, 6.92224640e-04, -1.47556685e+00,\n", + " 3.06049479e-01, -1.23345939e-02, -9.52747232e-01, 9.31168327e-03,\n", + " -5.24758378e-01]),\n", + " 0.7406426641094095)" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "lm.intercept_, lm.coef_, lm.score(boston.data, y)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "ExecuteTime": { + "end_time": "2019-04-22T08:22:31.110418Z", + "start_time":
"2019-04-22T08:22:31.107129Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [], + "source": [ + "predicted = regr.predict(boston.data)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "ExecuteTime": { + "end_time": "2019-04-22T08:22:32.479326Z", + "start_time": "2019-04-22T08:22:31.916490Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "fig, ax = plt.subplots()\n", + "ax.scatter(y, predicted)\n", + "ax.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=4)\n", + "ax.set_xlabel('$Measured$', fontsize = 20)\n", + "ax.set_ylabel('$Predicted$', fontsize = 20)\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "## 训练集和测试集" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "ExecuteTime": { + "end_time": "2019-04-22T08:22:36.365683Z", + "start_time": "2019-04-22T08:22:36.360788Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[6.3200e-03, 1.8000e+01, 2.3100e+00, ..., 1.5300e+01, 3.9690e+02,\n", + " 4.9800e+00],\n", + " [2.7310e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9690e+02,\n", + " 9.1400e+00],\n", + " [2.7290e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9283e+02,\n", + " 4.0300e+00],\n", + " ...,\n", + " [6.0760e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,\n", + " 5.6400e+00],\n", + " [1.0959e-01, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9345e+02,\n", + " 6.4800e+00],\n", + " [4.7410e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,\n", + " 7.8800e+00]])" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "boston.data" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "ExecuteTime": { + "end_time": "2019-04-22T08:22:48.265456Z", + "start_time": "2019-04-22T08:22:48.261247Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [], + "source": [ + "from sklearn.model_selection import train_test_split\n", + "Xs_train, Xs_test, y_train, y_test = train_test_split(boston.data,\n", + " boston.target, \n", + " test_size=0.2, \n", + " random_state=42)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "ExecuteTime": { + 
"end_time": "2019-04-22T08:22:51.873960Z", + "start_time": "2019-04-22T08:22:51.869286Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [], + "source": [ + "regr = linear_model.LinearRegression()\n", + "lm = regr.fit(Xs_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "ExecuteTime": { + "end_time": "2019-04-22T08:22:52.561738Z", + "start_time": "2019-04-22T08:22:52.555669Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "(30.24675099392396,\n", + " array([-1.13055924e-01, 3.01104641e-02, 4.03807204e-02, 2.78443820e+00,\n", + " -1.72026334e+01, 4.43883520e+00, -6.29636221e-03, -1.44786537e+00,\n", + " 2.62429736e-01, -1.06467863e-02, -9.15456240e-01, 1.23513347e-02,\n", + " -5.08571424e-01]),\n", + " 0.7508856358979673)" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "lm.intercept_, lm.coef_, lm.score(Xs_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "ExecuteTime": { + "end_time": "2019-04-22T08:22:53.518402Z", + "start_time": "2019-04-22T08:22:53.515220Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [], + "source": [ + "predicted = regr.predict(Xs_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "ExecuteTime": { + "end_time": "2019-04-22T08:22:54.585839Z", + "start_time": "2019-04-22T08:22:54.380438Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "fig, ax = plt.subplots()\n", + "ax.scatter(y_test, predicted)\n", + "ax.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=4)\n", + "ax.set_xlabel('$Measured$', fontsize = 20)\n", + "ax.set_ylabel('$Predicted$', fontsize = 20)\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "# 交叉验证" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "# cross-validation \n", + " \n", + "In the basic approach, called k-fold CV, the training set is split into k smaller sets (other approaches are described below, but generally follow the same principles). The following procedure is followed for each of the k “folds”:\n", + "- A model is trained using k-1 of the folds as training data;\n", + "- the resulting model is validated on the remaining part of the data (i.e., it is used as a test set to compute a performance measure such as accuracy)."
+ ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "ExecuteTime": { + "end_time": "2019-04-22T08:23:06.421218Z", + "start_time": "2019-04-22T08:23:06.407755Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "-1.5841985220997412" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn.model_selection import cross_val_score\n", + "\n", + "regr = linear_model.LinearRegression()\n", + "scores = cross_val_score(regr, boston.data , boston.target, cv = 3)\n", + "scores.mean() " + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "ExecuteTime": { + "end_time": "2019-04-22T08:24:03.323654Z", + "start_time": "2019-04-22T08:24:01.612164Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYYAAAD8CAYAAABzTgP2AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4xLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvDW2N/gAAIABJREFUeJzt3Xu8VHW9//HXh5uKaYBcRK7mJYPSbW5Qyk4JaEonL5llUZFHH+Q5lZl2UTk/TYsuVl6qkye8pB12qGkUJkpIntQ8qWAoAipIXkAUxGuAKO7P74/vmvbM7LXmumb2Hub9fDzmMTPfdd1LWZ/5fj/r+/2auyMiIpLRo6tPQEREuhcFBhERyaHAICIiORQYREQkhwKDiIjkUGAQEZEcCgwiIpJDgUFERHIoMIiISI5eXX0ClRg4cKCPHj26q09DRKShLFmy5AV3H1RsvYYMDKNHj2bx4sVdfRoiIg3FzJ4qZb1UmpLM7Ggze8zMVpvZOTHLP29mG81safQ6LWvZNDNbFb2mpXE+IiJSuaprDGbWE/gv4EhgLfCAmc1z9xV5q97g7l/K23YAcAHQCjiwJNr2pWrPS0REKpNGjWE8sNrd17j7G8D1wHElbvthYKG7vxgFg4XA0Smck4iIVCiNwDAMeCbr+9qoLN+JZvawmd1kZiPK3BYzm25mi81s8caNG1M4bRERiVOvx1VvAUa7+4GEWsF15e7A3We5e6u7tw4aVDSpLiIiFUojMKwDRmR9Hx6V/ZO7b3L3bdHXq4BDSt22IbW1wejR0KNHeG9r6+ozEhEpWRqB4QFgPzPb28z6ACcD87JXMLOhWV+PBVZGnxcAR5lZfzPrDxwVlTWutjaYPh2eegrcw/v06QoOItIwqg4M7r4d+BLhhr4SuNHdl5vZRWZ2bLTaGWa23MweAs4APh9t+yLwbUJweQC4KCprXDNmwJYtuWVbtoRyEZEGYI0453Nra6t32w5uPXqEmkI+M2hvr//5iIhEzGyJu7cWW09jJaXpugI59ZEj63ceIiJVUGCoVHaCeeRIOPxw+Pzn4Y
ADYJddctfdeWeYObMrzrI8SpqLCAoMlclPMD/zDPzlL/Cxj8GyZXDllTBqVGg+6tEDBg6EE0/s6rMuTElzEYkoMFQiLsEMsGQJ9OwJU6fCk0+GnMItt8DatXDuuYX32dW/1pU0F5GIAkMlnn669PIpU+CLX4TLLoOFC+O36w6/1sv5m0Rkh6bAUA53mDMneXlSgvnii+Fd7wo5iE2bOi8/77yu/7WedO5Kmos0HQWGQrKbd0aMgAkT4NOfhn32CQnlbH37JieY+/YN+3ruubCfTHPRz34G3/529/i1fuGFISeSrdDfJCI7LAWGJPnNO2vXwn33wUknwcqVcNVVHQnmUaNg1qyQW0iyYkXIP2zd2tFc9OUvw/nndw4yGfX8tf7GG+G8+vUL3wcOLP43icgOSR3ckoweHW7e+UaNConltPa3116hqWn69NzmpL5963djfvNN2H9/GDQoPF319rfD6afDJZfU/tgiUjfq4FattJt3krZbvz7c/GfNCkEHQi3k8svr92v9f/4nBLvzz4feveHgg+GBB+pzbBHpdhQYkqSdjC22v8wjrn/9a2jSqVdN7s03Qx7hkEPgIx8JZePGwYMPwvbt9TkHEelWFBiSzJxZXoK5lP317Vt8f+PHw5gx8MtfVnaccv3617BmTagtZJLP48aFZq0V+bOzikgzUGBIMnUqHBfNUFpqgrnY/jLNRYX2ZwannAL/938hyV1L27fDd74DLS3w0Y92lI8bF94bpTmpqzsHiuxgFBgKee45OPDA0IP5ySerb/PP7hFdaH+f/Wx4gunaa6s7XjFz5sDq1bm1BYB99w1PJzVCYOgOnQNFdjAKDElefTU8oXPMMfU/9pAhob3/V7+qXTv/W2+F2sKBB3bUjDJ69IDW1tICQ1f/WtdQHiKpSyUwmNnRZvaYma02s3Nilp9lZivM7GEzW2Rmo7KWvWVmS6PXvPxtu8wdd4SbclcEBoB/+7dQY7n99nT3m7mR9+oFjz8OH/pQuKnnGzcOHn4YXn+98L66+td6d+gcKLKDqTowmFlP4L+AY4AxwKfMbEzean8DWt39QOAm4OKsZVvdvSV6HUt3cdttsPvu8L73dc3xp0yBwYPhmmvS22f2jTzjqqvib+TjxoXAuHRp8v66w691DeUhkro0agzjgdXuvsbd3wCuB3LaJtz9TnfP3EH+CgxP4bi14x4Cw+TJ4bn+rtC7d8g13HILbNyYzj7LuZGXkoDuDr/WZ86EPn1yy3bZRUN5iFQhjcAwDHgm6/vaqCzJqcBtWd93NrPFZvZXMzs+hfOp3vLlsG5d1zUjZZxySvjVPnt29fvati2+5zXE38iHDYM99ywcGJJ+le+yS8jRJEkzLzF1ajiP3r07Euif+YyG8hCpQl2Tz2b2GaAV+GFW8aioi/angcvMbJ+EbadHAWTxxrR+QSe5LYpbRx9d2+MUM3Zs6Nfwy1+W3+Et++Y7aBAML1BJi7vBm4VjFwoMM2eGXEW23r3DeFDvfW8YIDA/AKSdl1i9Ory+/e2QUH/nO9X/QqRa7l7VC5gALMj6fi5wbsx6k4GVwOAC+7oW+HixYx5yyCFeU0cc4f7ud9f2GKU65ZTQD9rMfdQo99mzi28ze7Z7376Z/tMd23/0o53L+/ZN3ue3vx3Wefnl+OXt7e6DB7vvvHPu+d19t/uAAbnHAfeddnLfbbfO5RC2rcQFF4RjP/NM+P6DH4T9PfpoZfsT2YEBi72U+3opKxXcAfQC1gB7A32Ah4CxeescDDwB7JdX3h/YKfo8EFgFjCl2zJoGhldfde/d2/3rX6/dMUo1e7b7LruUfiPPGDUq+eY7e3Z4LyXQ3H572G7RovjlS5aE5bNmdV42fHj8OSS9zEq7Jtna29332cd90qSOsvXr3Xv2dP/GN8rfn8gOrm6BIRyLKcDj0c1/RlR2EXBs9PkO4HlgafSaF5W/D1gWBZNlwKmlHK+mgWHu3HBZ/vSn2h2jVIVu8IWYpXPzfe
GFsN33vx+//BvfcO/VK6xX6jkUep11lvvPf1564PrLX8J2116bW37sse5Dhri/8UZ5f6/IDq6ugaHer5oGhunT3d/2Nvdt22p3jFJVeoMfObKygBLnHe9wP/HEzuXt7e6jR7sffXT8dklBbY89Ojdn7bKL+wc+EL9+oRrS6aeHbV99Nbf8d78L286bV/7fK7IDKzUwqOdzNs96TDX/EciuUOkz+p/4ROeySgcATEpA339/GNbjk5+M3y5p0MDLL+88ZtSVV8Jdd4W5KfIlPU67bRvccAOccALstlvusilTQu/xq68u6U8UkVwKDNlWrIBnnun6x1Qz4m6uO+1U/Ab/+OPhZjlyZPUDAI4bFx5nff753PIbbgjB8/iEJ4wLDRqYNGbU+vXx+4p7nHb+fHjppdDXI1/v3vC5z8Ef/hB6j9dDVw8NIpKmUqoV3e1Vs6akH/4wNEE8/XRt9l+J7GRxz57Fn5Zau9a9Rw/3c85J5/h33RWuyS23dJS99Zb7sGGhLT9N5eRUTjgh5BHefDN+XytXhm0vvjjdc4wT9xRYKQ8JiNQZakqqwG23hb4DI0Z09Zl0yP51/Z3vwCOPhDGMklx9dVj3tNPSOf573xt+BWc3J917b+gAmNSMVKm4GhKEcaOyvfhiqA18+tOd+1FkHHBAGM7kmmtqP+lRdxgaRCRFCgwZ//gH3H1392lGivOFL8CuuybPxfzWW2Hso8mTYZ/YfoLl23XXMHFQdmC4/vowiVH2HA5pyG9+Gj4c9tgj3NxffLFjvRtvDDPPxTUjZTv1VHj00TC3RS11h6FBRFKkwAChPfgd7wg3m1/9qvu2D/fvH349//rX8OyznZcvWBByJF/4QrrHHTcuJJvdQ/C56aYwLHh+0jcN2TWkZ54JuYRnn4Vp00IZhP9GY8eGCYYKOemkkAc56qjatv1rID/ZwSgwZIZoyAyzsWFD957o5cwzw835Zz/rvOwXvwgjsh6b8iC148fDpk3hhv3nP4dEdNrNSIWOfcklHU1Hw4aFGsC6dSFAFjJvXrhWmzeXN/xGuYnkmTM7D13es2dpT4EpaS3dUSmJiO72SjX5XGknsq504onu/fu7v/ZaR1naSedsixeHa3LDDaGfx667um/enP5xkrS3ux96aOf/RsUSvJX8t60kkXz33WG9fv3CQwKZYT+WLi38dylpLXWGOriVKK1ewvV0773hHH/yk46yCy8MZatXp3+8bdvc+/RxP/PM0EHt5JPTP0YxI0aUf5Ov5L9tJcFk8uQwZlQmWL74YhgratKkENTSPJZIFUoNDGpKasT24QkTwhM3l10WmkpqkXTO1qdPSARffnloUlq0qP5NHmvXxpcXSvBW8t+23ETyPfeE2f6++c2OJ6r694dvfStcp1tvTe9YInWiwDBzZpg/IFulvYTr6eyzYc0a+N3vapd0zmhrCzcrjx773Lix/nmYSm7ySb2vC/23Lfc4F14Y8jqnn55bfvrpsP/+8LWvhYca8q1Y0TF/RKnHEqmXUqoV3e2Vege3yy/PrcY3Qhvv9u2h+aJPn3DePXp0HkwuLd2hyaPS9vhMB8HMNpdcUnz9Hj06Nz1deWXndTO5hR//OH5f8+aF5T/9aW75E0+477WX++67hyHL86/r1KmFz1GkQijHUIZHHw2Xoq0t3f3W0uzZYXjweiQuu0seppwhw/OtWxeu1xlnFF7v/vs9J5E8eHB4nzIlBONskyaF3tdJifj2dveJE0Ne5sUXQ9natWHwwQED3Jcty/2bRo50HzcuHP/kkyv/W0USKDCU48EHw6WYOzfd/dZSPX/Fd4caQxo++9nwRNVLLyWvc/zxISi88kpH2RVXhL/3zDM7yjJDhRSrgSxdGtbbbbdwk+/VK0xY9MAD8eu/+ab7YYd1vtZ6WklSUGpgUI4BOoYz2HXXrj2PctQzcVlJW3139NWvhj4NV10Vv3z58pCzOeMM2H33jvLTTw/9Ry67LHQwHD0a/uVfQt+Dfv
0KH/ORR0KfhtdeC7f47dtD+WOPxa/fq1d858VGGmJDfTMaXynRo5QXcDTwGLAaOCdm+U7ADdHy+4DRWcvOjcofAz5c7Fip1xgWLAi/yu65J9391lK9f8VX04zTnRxxRHj0NW4Sn6lTQ40ibuKh7dvdW1rK/yVfyX+n7tJ0Vwn1zejWqPMMbj0Js7e9g47pPcfkrfMfwH9Hn08Gbog+j4nW34kwPegTQM9Cx0s9MGRmbXvwwXT3W0v6B1iZTEJ4zpzc8ieeCEnns89O3rY796XoLhr53JtAqYEhraak8cBqd1/j7m8A1wPH5a1zHHBd9PkmYJKZWVR+vbtvc/e/RzWH8SmdV2kyTUlxI3t2V4XmO5BkH/kI7LdfGGbDvaP8Bz8IzThnn528bb36UjRy0536ZuwQ0goMw4Bnsr6vjcpi13H37cArwB4lbltbjZhjgOQJbyRZjx4hX/DAA2H4cAjjLl17bcgfDB2avG29bvKZoJ/Z79ve1jhBvxE7jEonDZN8NrPpZrbYzBZvzAx4l5bNm8N7I9UYpHLTpoXeyZdeGr7/+Meh9/g3vlF4u2pu8uXW7KZODYP+HX44HHRQYwQFCD2+8zVKbUf+Ka3AsA7Int1meFQWu46Z9QLeDmwqcVvcfZa7t7p766BBg1I67UgjNiVJ5XbdNfQSv/nmUEO49NIwv0SmBpGkmpt8pTW7lhZ46KGOIce7u0yNa6edwvvw4Y1T25F/SiswPADsZ2Z7m1kfQnJ5Xt4684Bp0eePA3+KkiHzgJPNbCcz2xvYD7g/pfMqzZYt4R965n9m2fENi1orM3NCb95c2jAf9W6+a2kJk0itWVPb46Rl/vwwxMzNN4fvV16poNCAUgkMUc7gS8ACYCVwo7svN7OLzCwzOcDVwB5mtho4Czgn2nY5cCOwArgd+KK7v5XGeZVsy5bwKzJp7BrZ8fzoR53LumNfgYMPDu9/+1vXnkep5s+HiRNDPw8z+Otfu/qMpAKp5Rjcfb677+/u+7j7zKjsfHefF31+3d1Pcvd93X28u6/J2nZmtN073f22tM6pZJs3qxmp2TTK0zNjxoSnpZYurd0x0uqQtmoVrF4NU6aE2f3e/e7aT6sqNdEwyeea2rJFgaHZNMrTMzvvDO96V+0CQ2YGw6eeCo/vljrLXZz588N7Zt70CRPgvvsaJz8i/6TAAAoMzaiR+gocfHDtmpJmzOh4+CKj0ia1+fNDENt77/B9wgR45RV49NHqz1PqSoEBOnIM0jwaqYNgSwusXx/m2k5bWk1qmzfD//5vaEbKOOyw8K7mpIajwADKMTSrRukg2NIS3h96KN39PvFEyCvEKbdJ7U9/gjfeyA0M++8f+osoMDQcBQZQU5J0b5nAUKw5qZwk8qpV8MEPhhzGzjvnLqukSW3+/NBD+/DDO8p69Ai1Bj2Z1HAUGEBNSdK99e8fmroKJaCLJZGzg8awYTBuHGzbFjr1XXVVR8e0PfYov0nNPQSGI48M84NnmzAhTGP6yitl/cnStRQYQE1J0v21tBQODElJ5PPO6xw0nn023Ki/9jU48MAQBNatgz33hKOOKr9JbcWKkJPIbkbKOOywcMz77itvn9KlFBhATUnS/bW0hMl9MuN65SuURD7llM5BA+CKKzo+m8GkSbBoUe6os6W49dbwnnlMNduhh6qjWwNSYAAFBun+Dj443LCXLYtfnpQs3n13ePPN+GX5wWTSJNiwIcw6V47588NAf5lhRvKPP3asEtANRoHBXTkG6f4yCeik5qSZM+OTyD//echPxMkPJpMmhfdFi0o/r1degXvuiW9GysgkoNXRrWEoMLz+eggOqjFIdzZyZEhCJz2ZNHVqR1NOfr+MUjvzjRwZJjG6447Sz2vhwjBkeaHAMGECvPxy8jzX0u0oMGjIbWkEZoUT0O6hCWjixM79MsrpzDd5Mvz5z8nNT/nmz4d+/To6s8WZMCG8K8/QMBQYFBikUbS0wMMPw/btnZc9+GDom/
CpT8VvW2pnvsmTwzDf95cw8n17O9x2G3z4w2GgvyTvfGcIHsozNAwFhkad1lOaT0tLaPp8/PHOy+bMgd694WMfq+4YH/pQqFUUa05qawvJ5ueeC+sW6kzXo0d4OkmBoWEoMGhaT2kUmbkZ8puT2tvhhhvCL/cBA6o7xoABcMghhRPQmX4RmUmONm0qPiLrhAmwfDm8+mp15yd1ocCgpiRpFAccEHoW5weGv/wF1q5NbkYq16RJ4df9P/4Rv7ySEVkzHd2SmqjSmhNCUlFVYDCzAWa20MxWRe/9Y9ZpMbP/M7PlZvawmX0ya9m1ZvZ3M1savVqqOZ+KqClJGkXv3mHym/wnk+bMCdNpHnts/Hblmjw55DHuvjt+eSUjsh56aHiPa05Kc04ISUW1NYZzgEXuvh+wKPqebwvwOXcfCxwNXGZm/bKWf93dW6JXDaepSqAagzSSgw8ONYZM7+Tt2+E3v4GPfjQMYpeG978/zH+elGcYPjy+vNCIrP36hdno4p5MSnNOCElFtYHhOOC66PN1wPH5K7j74+6+Kvr8LLABGFTlcdOjHIM0kpYWeOGFMN4RhFzACy+k14wEofbx/vcn5xkOOqhzWSkjsk6YEAJD/pAbjTLNahOpNjAMcff10efngCGFVjaz8UAf4Ims4plRE9OlZrZTgW2nm9liM1u8cePGKk87i2oM0kjyh+CeMwfe/vb4cYqqMXlymP9hw4bc8kcfhdtvD0N2lzvJ0WGHwYsvdjxVtW0bfPnLyWMz1XuaVeU5/qnAw8eBmd0B7BmzKKee5+5uZomjb5nZUOB/gGnunukbfy4hoPQBZgHfBC6K297dZ0Xr0NraWuYoXwUoxyCNJPNrfenScPOeOxdOPDE0/aQpMzzGnXfCJ6O0oHu4ke+6K9x4IwweXN4+N20K7wccAHvtFXImTz0Vgtqf/9y5Oelzn6vubyhHJs+ROYdMngO67wROteTuFb+Ax4Ch0eehwGMJ6+0OPAh8vMC+PgT8oZTjHnLIIZ6a737XHdxffz29fYrU0r77up94ovtvfxv+312wIP1jbN/u/va3u592WkfZzTeH4/3kJ+Xvb/Zs9759w/bZr7PO6lg+apS7mfvw4e577um+++7uS5ak8ucUNWpU53ODUL4DARZ7CffYapuS5gHTos/TgN/nr2BmfYC5wK/c/aa8ZUOjdyPkJ8oc1jEFmzeHqmP+BCMi3VVLS2hKmjMn/GqfODH9Y/TsCUcc0ZFn2LIFvvrVMH/Dv/97+fuLSzAD3HxzeM/umf3MM+Gx1n79QpPVXnvVvnlHeY4c1QaG7wNHmtkqYHL0HTNrNbOronU+AfwL8PmYx1LbzGwZsAwYCHynyvMpX2bIbbO6H1qkYmvWhKeRtmwJndtqYfJk+Pvfw7G+971wk/zZzwoPf5Gk3BvviBFwxhmhL8X69bV/jHXEiPjyeuc5uokK/gt3cPdNwKSY8sXAadHn2cDshO1r8FOnTBpyWxpJWxvcckvH93/8o3Zt4Zlf+PvsE97f9z74wAcq29fIkeHGHlee5Kc/jT+nGTPS/1unTIH//u/cskrmvt5BqOezpvWURjJjRniaJ1stnvlva4NvfSu37G9/q/zXeqlDf2erV/POxo2h9rXvvmFocwjjQJU79/UORIFBs7dJI6nXzTIuJ7B1a+UBqJyhvzOSahNpN++cdVYYw+l3v+vIefzyl00bFECBQYFBGku9bpa1CEClDv2dUUkto1wLFsDs2XDOOWEK0rFjQ/ny5ekdowEpMCjHII2kHjdLqF8AKiS7lpFxwQXp/ZLfvBlOPz3MF3HeeaFs8GAYOFCBoatPoMspxyCNpJImmUrUKwAVk6llZGoqPVK8ZV1wQdj3lVfmzpc9dqwCQ1efQJdTU5I0mnKbZCo9Rj0CUKlGjID3vCdMJVqN7GEvfvzj0Fcj/0mrTGBIGqqjCSgwqClJJF49AlA5pkwJQ4FXOtlP/vDeEAb1y3/SauzYcIx166o73w
amwKAag0hjmDIlDDNebNrRJKU+aaUEtAKDcgwiDWLChDCSbKXNSaU+aaXA0OSBwV01BpFG0bs3HHUU3HZbZe3/pQ57MXBgeDpJgaFJvf56eFeOQaQxHHNMmKTo4YfL3/aQQzqXJT1p1eRPJjV3YNAkPSKN5eijw3u5zUkPPADz5sHhh5f2pNXYsbBiRdM+mdTcgUHTeoo0lqFD4b3vLS8wbN0K06aFbW+5pbQnrcaMgddeC0OAN6HmDgyqMYg0nilT4N574aWXSlv/P/8TVq6Ea64JczyUoskT0AoMoByDSCOZMiX84v/jH4uve9ddcOml8B//AUceWfoxFBiqY2YDzGyhma2K3vsnrPdW1kQ987LK9zaz+8xstZndEM34Vh+qMYg0nvHjYcCAws1JbW3haaMPfjDMRheXeC5kjz1gyBAFhiqcAyxy9/2ARdH3OFvdvSV6HZtV/gPgUnffF3gJODWFcyqNcgwijadnz5CEvu22UHPIl+nhnMkPbN8OX/5y+XNJNPGTSWkEhuOA66LP1xHmbi5JNNfzRCAzF3RZ21dNTUkijWnKlDDBzpIlnZedd17nHs6VTGaUeTIpLvjs4NIIDEPcfX30+TlgSMJ6O5vZYjP7q5llbv57AC+7+/bo+1pgWArnVBo1JYk0pg9/ODxymt+c9NJL6c0lMXZsaFVIexKkBlBSYDCzO8zskZjXcdnrubsDSQ/+jnL3VuDTwGVmtk85J2pm06PAsnjjxo3lbJpMTUkijWngQDj00NzAsGJFyD8kKXcuiSZOQJcUGNx9sru/O+b1e+B5MxsKEL1vSNjHuuh9DfC/wMHAJqCfmfWKVhsOxA5p6O6z3L3V3VsHDRpUxp9YgGoMIo1rr73g/vvDENqDB4f+Da+9Buefn85cEgoMVZkHTIs+TwN+n7+CmfU3s52izwOB9wMrohrGncDHC21fM8oxiDSmtraO2oJ7yDe8+WbII1x4YTpzSfTvHzrF1TMwZM8XMXp0+QnzlKQRGL4PHGlmq4DJ0XfMrNXMrorWeRew2MweIgSC77v7imjZN4GzzGw1IedwdQrnVJotW8ITDr171+2QIpKCGTM6xjrLaG8Pk+9AenNJ1PPJpPz5Ip56KnzPBIc6Bo1exVcpzN03AZNiyhcDp0Wf7wXek7D9GqBAw2ANZYbcNuuSw4tIhdJKMBczdmyY+rO9Pd1pRePEzRexZQucdVYIgmec0bE8EzSgJhMoqeez8gsijScpkVxugrmYsWPDfeLJJ9Pdb5ykoLZhA5x2WjqP4JZIgUH5BZHGM3NmOgnmYuqZgE4KaoMHJ29To0dpFRhUYxBpPFOnppNgLmbMmPBeSWAoNydw2mmdy/r2hUsuCX9fnLRrSJGqcwwNTdN6ijSuqVNr0r6eo18/GDas/MCQSSSXkxNYvhx22inUENauDTf9mTM71s/eH9SmhhRRjUGBQUQKyQyNUY6kRHJSTuDJJ+E3vwljOj39dOenqepVQ4ooMCjHICKFjB0b5nMoZ8ykcp+auvzycMM/44zkfab1CG4JFBhUYxCRQsaODbPA/f3vpW8zJGHIuLicwEsvhUdiTz4ZRoyo7BxT1tyBQTkGESmm3CeTXngB3nijc/+onXeOzwn84hfhXvS1r1V3nilq7sCgpiQRKaacJ5O2bw+//DdvDkNzZHICPXuGe80xx+Suv20b/OQnYXa5gw5K/9wrpMCgGoOIFLL77qGJp5TAcN55sGgRXHEF/L//15ETuPdeePVVOOWUMNxFxpw5sH59t6otQDMHBncFBhEprq0tDNKX6ZeQ1B/hxhvhhz8M80ufckrusvHj4eKLYd68kGiGcA/60Y/gwAPLm4+6Dpo3MGzdGt4VGEQkSaY/QmbAvkID2518Muy3H1x6afy+vvIVOO44OPvsMGprz56hFvK+93W78dqaNzBoyG0RKSapP8IXvxhqBqee2jEaqnvomPab38TvyyxMSeoOzz3X0aT0q1912fDaSRQYVG
MQkSRJ/Q5eeSXkEbZtyy3furXwwHbf/W5ujgFqOhhepZoTG+DZAAALv0lEQVQ3MGhaTxEpJmksohEjkpt/Cg1sV6/hwqtUVWAwswFmttDMVkXv/WPWOcLMlma9Xjez46Nl15rZ37OWtVRzPmVRjUFEikkaxfV736ts6O96DRdepWprDOcAi9x9P2BR9D2Hu9/p7i3u3gJMBLYAf8xa5euZ5e6+tMrzKZ1yDCJSTKExiioZ+rtew4VXqdrAcBxwXfT5OuD4Iut/HLjN3bcUWa/2VGMQkVIkjVFUycB2dR4Mr1LVDrs9xN3XR5+fAxIGCPmnk4FL8spmmtn5RDUOd9/WebMaUI5BRKpVydDf9RguvEpFA4OZ3QHsGbMoJ43u7m5mHrNeZj9DCfM+L8gqPpcQUPoAs4BvAhclbD8dmA4wMo32ODUliYjEKhoY3H1y0jIze97Mhrr7+ujGv6HArj4BzHX3N7P2naltbDOzXwKJ/cLdfRYheNDa2poYgEqmpiQRkVjV5hjmAdOiz9OA3xdY91PAnOyCKJhgZkbITzxS5fmUToFBRCRWtYHh+8CRZrYKmBx9x8xazeyqzEpmNhoYAfw5b/s2M1sGLAMGAt+p8nxKpxyDiEisqpLP7r4JmBRTvhg4Lev7k8CwmPUmVnP8qmzZAr16QZ8+XXYKIiLdUfP2fNbIqiIisRQYREQkR/MGBk3rKSISq3kDg6b1FBGJ1dyBQTUGEZFOmjcwqClJRCRW8wYG1RhERGI1d2BQjkFEpJPmDgyqMYiIdNK8gUE5BhGRWM0bGNSUJCISqzkDQ3s7bN2qGoOISIzmDAyvvx7eFRhERDppzsCgIbdFRBI1Z2DQtJ4iIomaOzCoxiAi0knVgcHMTjKz5WbWbmatBdY72sweM7PVZnZOVvneZnZfVH6DmdV+5hwFBhGRRGnUGB4BPgbclbSCmfUE/gs4BhgDfMrMxkSLfwBc6u77Ai8Bp6ZwToUpxyAikqjqwODuK939sSKrjQdWu/sad38DuB44zswMmAjcFK13HXB8tedUlHIMIiKJ6pVjGAY8k/V9bVS2B/Cyu2/PK+/EzKab2WIzW7xx48bqzkZNSSIiiXqVspKZ3QHsGbNohrv/Pt1Tiufus4BZAK2trV7VzhQYREQSlRQY3H1ylcdZB4zI+j48KtsE9DOzXlGtIVNeW5kcg5qSREQ6qVdT0gPAftETSH2Ak4F57u7AncDHo/WmAbWvgajGICKSKI3HVU8ws7XABOBWM1sQle9lZvMBotrAl4AFwErgRndfHu3im8BZZraakHO4utpzKkqBQUQkUUlNSYW4+1xgbkz5s8CUrO/zgfkx660hPLVUP5s3Q69e0Lt3XQ8rItIImrfns/ILIiKxmjcwqBlJRCSWAoOIiORozsCgaT1FRBI1Z2BQjkFEJFHzBgbVGEREYikwiIhIjuYMDJs3qylJRCRBcwYG1RhERBIpMIiISA4FBhERydF8gaG9HbZuVY5BRCRB8wWGrVvDu2oMIiKxmi8waMhtEZGCFBhERCRHVYHBzE4ys+Vm1m5mrQnrjDCzO81sRbTuV7KWfcvM1pnZ0ug1JW4fqdK0niIiBVU7Uc8jwMeAXxRYZztwtrs/aGa7AUvMbKG7r4iWX+ruP6ryPEqnGoOISEFVBQZ3XwlgZoXWWQ+sjz6/ZmYrgWHAisSNakmBQUSkoLrmGMxsNHAwcF9W8ZfM7GEzu8bM+hfYdrqZLTazxRs3bqz8JDJNSQoMIiKxigYGM7vDzB6JeR1XzoHM7G3AzcCZ7v5qVHwFsA/QQqhV/Dhpe3ef5e6t7t46aNCgcg6dK1NjUI5BRCRW0aYkd59c7UHMrDchKLS5+2+z9v181jpXAn+o9lhFqSlJRKSgmjclWUhAXA2sdPdL8pYNzfp6AiGZXVsKDCIiBVX7uOoJZrYWmADcamYLovK9zGx+tNr7gc8CE2MeS73YzJaZ2cPAEcBXqzmfkuhxVRGRgqp9Km
kuMDem/FlgSvT5HiD2sSV3/2w1x69Ipsawyy51P7SISCNozp7PvXuHl4iIdNKcgUH5BRGRRM0XGDStp4hIQc0XGFRjEBEpSIFBRERyKDCIiEiO5gsMyjGIiBTUfIFBNQYRkYIUGEREJEdzBgY1JYmIJGq+wLB5s2oMIiIFNF9gUFOSiEhBzRUY2tvh9dcVGERECmiuwKDZ20REimrOwKAag4hIomon6jnJzJabWbuZtRZY78loQp6lZrY4q3yAmS00s1XRe/9qzqcoBQYRkaKqrTE8AnwMuKuEdY9w9xZ3zw4g5wCL3H0/YFH0vXYUGEREiqoqMLj7Snd/rIpdHAdcF32+Dji+mvMpStN6iogUVa8cgwN/NLMlZjY9q3yIu6+PPj8HDKnpWajGICJSVNE5n83sDmDPmEUz3P33JR7ncHdfZ2aDgYVm9qi75zQ/ububmRc4j+nAdICRI0eWeNg8CgwiIkUVDQzuPrnag7j7uuh9g5nNBcYT8hLPm9lQd19vZkOBDQX2MQuYBdDa2poYQApSYBARKarmTUlmtquZ7Zb5DBxFSFoDzAOmRZ+nAaXWQCqjHIOISFHVPq56gpmtBSYAt5rZgqh8LzObH602BLjHzB4C7gdudffbo2XfB440s1XA5Oh77ajGICJSVNGmpELcfS4wN6b8WWBK9HkNcFDC9puASdWcQ1kUGEREimqens9tbXDRReHze94TvouISCdV1RgaRlsbTJ/eUWN4+unwHWDq1K47LxGRbqg5agwzZnQEhYwtW0K5iIjkaI7A8PTT5ZWLiDSx5ggMSR3iKu0oJyKyA2uOwDBzZucnkfr2DeUiIpKjOQLD1KkwaxaMGgVm4X3WLCWeRURiNMdTSRCCgAKBiEhRzVFjEBGRkikwiIhIDgUGERHJocAgIiI5FBhERCSHuVc2501XMrONwFNdfR51NhB4oatPopvQtQh0HQJdh6CU6zDK3QcV21FDBoZmZGaL3b21q8+jO9C1CHQdAl2HIM3roKYkERHJocAgIiI5FBgax6yuPoFuRNci0HUIdB2C1K6DcgwiIpJDNQYREcmhwNANmdk1ZrbBzB7JKhtgZgvNbFX03r8rz7EezGyEmd1pZivMbLmZfSUqb6prYWY7m9n9ZvZQdB0ujMr3NrP7zGy1md1gZn26+lzrwcx6mtnfzOwP0fdmvQ5PmtkyM1tqZoujslT+bSgwdE/XAkfnlZ0DLHL3/YBF0fcd3XbgbHcfAxwGfNHMxtB812IbMNHdDwJagKPN7DDgB8Cl7r4v8BJwaheeYz19BViZ9b1ZrwPAEe7ekvWYair/NhQYuiF3vwt4Ma/4OOC66PN1wPF1Paku4O7r3f3B6PNrhJvBMJrsWnjwj+hr7+jlwETgpqh8h78OAGY2HPgIcFX03WjC61BAKv82FBgaxxB3Xx99fg4Y0pUnU29mNho4GLiPJrwWUfPJUmADsBB4AnjZ3bdHq6wlBM0d3WXAN4D26PseNOd1gPDj4I9mtsTMpkdlqfzbaJ6JenYg7u5m1jSPk5nZ24CbgTPd/dXwIzFolmvh7m8BLWbWD5gLHNDFp1R3ZvavwAZ3X2JmH+rq8+kGDnf3dWY2GFhoZo9mL6zm34ZqDI3jeTMbChC9b+ji86kLM+tNCApt7v7bqLgprwWAu78M3AlMAPqZWebH3XBgXZedWH28HzjWzJ4Eric0IV1O810HANx9XfS+gfBjYTwp/dtQYGgc84Bp0edpwO+78FzqImo/vhpY6e6XZC1qqmthZoOimgJmtgtwJCHfcifw8Wi1Hf46uPu57j7c3UcDJwN/cvepNNl1ADCzXc1st8xn4CjgEVL6t6EObt2Qmc0BPkQYLfF54ALgd8CNwEjCyLKfcPf8BPUOxcwOB+4GltHRpnweIc/QNNfCzA4kJBJ7En7M3ejuF5nZOwi/nAcAfwM+4+7buu5M6ydqSvqau/9rM16H6G+eG33tBfza3Wea2R6k8G9DgUFERH
KoKUlERHIoMIiISA4FBhERyaHAICIiORQYREQkhwKDiIjkUGAQEZEcCgwiIpLj/wPiXBik9DBJmQAAAABJRU5ErkJggg==\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "scores = [cross_val_score(regr, boston.data,\\\n", + " boston.target,\\\n", + " cv = int(i)).mean() \\\n", + " for i in range(3, 50)]\n", + "plt.plot(range(3, 50), scores,'r-o')\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "ExecuteTime": { + "end_time": "2019-04-22T08:24:34.174960Z", + "start_time": "2019-04-22T08:24:34.155764Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.45059442471362826" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data_X_scale = scale(boston.data)\n", + "scores = cross_val_score(regr,data_X_scale, boston.target,\\\n", + " cv = 7)\n", + "scores.mean() " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "# 使用天涯bbs数据" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "ExecuteTime": { + "end_time": "2019-04-22T08:24:46.198546Z", + "start_time": "2019-04-22T08:24:46.171912Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
titlelinkauthorauthor_pageclickreplytime
0【民间语文第161期】宁波px启示:船进港湾人应上岸/post-free-2849477-1.shtml贾也http://www.tianya.cn/5049945019467527032012-10-29 07:59
1宁波镇海PX项目引发群体上访 当地政府发布说明(转载)/post-free-2839539-1.shtml无上卫士ABChttp://www.tianya.cn/743418358824410412012-10-24 12:41
\n", + "
" + ], + "text/plain": [ + " title link author \\\n", + "0 【民间语文第161期】宁波px启示:船进港湾人应上岸 /post-free-2849477-1.shtml 贾也 \n", + "1 宁波镇海PX项目引发群体上访 当地政府发布说明(转载) /post-free-2839539-1.shtml 无上卫士ABC \n", + "\n", + " author_page click reply time \n", + "0 http://www.tianya.cn/50499450 194675 2703 2012-10-29 07:59 \n", + "1 http://www.tianya.cn/74341835 88244 1041 2012-10-24 12:41 " + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd\n", + "\n", + "df = pd.read_csv('../data/tianya_bbs_threads_list.txt', sep = \"\\t\", header=None)\n", + "df=df.rename(columns = {0:'title', 1:'link', 2:'author',3:'author_page', 4:'click', 5:'reply', 6:'time'})\n", + "df[:2]" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "ExecuteTime": { + "end_time": "2019-04-22T08:24:47.185301Z", + "start_time": "2019-04-22T08:24:47.169337Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [], + "source": [ + "# This helper exists so readers can see for themselves that drawing a\n", + "# different random sample produces a noticeably different result.\n", + "def randomSplit(dataX, dataY, num):\n", + "    \"\"\"Randomly hold out `num` observations as a test set.\n", + "\n", + "    Every X value is wrapped in a one-element list, giving a list of\n", + "    single-feature rows; Y values are copied unchanged. The draw is not\n", + "    seeded, so each call returns a different split.\n", + "\n", + "    Returns (dataX_train, dataX_test, dataY_train, dataY_test) as lists.\n", + "    random.sample raises ValueError when num exceeds len(dataX).\n", + "    \"\"\"\n", + "    import random\n", + "    # Draw positions from the data that was passed in rather than from the\n", + "    # notebook-global `df`; a set keeps the membership test below O(1).\n", + "    test_index = set(random.sample(range(len(dataX)), num))\n", + "    dataX_train, dataX_test = [], []\n", + "    dataY_train, dataY_test = [], []\n", + "    for k in range(len(dataX)):\n", + "        if k in test_index:\n", + "            dataX_test.append([dataX[k]])\n", + "            dataY_test.append(dataY[k])\n", + "        else:\n", + "            dataX_train.append([dataX[k]])\n", + "            dataY_train.append(dataY[k])\n", + "    return dataX_train, dataX_test, dataY_train, dataY_test" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": { + "ExecuteTime": { + "end_time": "2019-04-22T08:24:48.122580Z", + "start_time": "2019-04-22T08:24:48.081523Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Variance score: 0.42\n" + ] + } + ], + "source": [ + "import numpy as np\n", + "\n", + "# Use only
one feature\n", + "data_X = df.reply\n", + "# Split the data into training/testing sets\n", + "data_X_train, data_X_test, data_y_train, data_y_test = randomSplit(np.log(df.click+1), \n", + " np.log(df.reply+1), 20)\n", + "# Create linear regression object\n", + "regr = linear_model.LinearRegression()\n", + "# Train the model using the training sets\n", + "regr.fit(data_X_train, data_y_train)\n", + "# Explained variance score: 1 is perfect prediction\n", + "print('Variance score: %.2f' % regr.score(data_X_test, data_y_test))" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": { + "ExecuteTime": { + "end_time": "2019-04-22T08:24:49.133689Z", + "start_time": "2019-04-22T08:24:49.129343Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[[12.179091917198399], [11.387872315966666], [11.323941765302724]]" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data_X_train[:3]\n" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": { + "ExecuteTime": { + "end_time": "2019-04-22T08:24:50.276495Z", + "start_time": "2019-04-22T08:24:50.273286Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [], + "source": [ + "y_true, y_pred = data_y_test, regr.predict(data_X_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": { + "ExecuteTime": { + "end_time": "2019-04-22T08:24:51.151351Z", + "start_time": "2019-04-22T08:24:50.992991Z" + }, + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAW4AAAD8CAYAAABXe05zAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4xLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvDW2N/gAADT1JREFUeJzt3UGIo/d5x/HfT7MbEjWBwM4cjNfzvimUQJtDzAqXYujBEFhMaHvoIUH1KUVgCDi0UGp0ykHXkLMgpinzkhBwDsWkBEM3BEPiROPaIfamJQ3W1CWwE0JIFkFLsk8PO7vZXc+MXs3onVeP9P2AYEd6X73P/r3+8vJKIzkiBADIo9P2AACAxRBuAEiGcANAMoQbAJIh3ACQDOEGgGQINwAkQ7gBIBnCDQDJXGriSbe3t6MsyyaeGgDW0v7+/i8iYqfOto2EuyxLTSaTJp4aANaS7WndbblUAgDJEG4ASIZwA0AyhBsAkiHcAJAM4QaAZAg3AJxDVVUqy1KdTkdlWaqqqsaP2cj7uAFgE1RVpcFgoNlsJkmaTqcaDAaSpH6/39hxOeMGgDMaDof3o33PbDbTcDhs9LiEGwDO6ODgYKH7l4VwA8AZ7e7uLnT/stQOt+0t2/9u+5UmBwKALEajkbrd7kP3dbtdjUajRo+7yBn3C5JuNjUIAGTT7/c1Ho9VFIVsqygKjcfjRl+YlCRHxPyN7KuSvippJOnvIuLTp23f6/WCTwcEgPps70dEr862dc+4vyzpHyTdOfNUAIClmBtu25+WdCsi9udsN7A9sT05PDxc2oAAgIfVOeN+WtJf2H5X0tclPWN779GNImIcEb2I6O3s1PoSBwDAGcwNd0S8GBFXI6KU9BlJ/xYRf9P4ZACAY/E+bgBIZqHPKomI70j6TiOTAABq4YwbAJIh3ACQDOEGgGQIN9CwNj5oH+uNL1IAGtTWB+1jvXHGDTSorQ/ax3oj3ECD2vqgfaw3wg00qK0P2sd6I9xAg9r6oH2sN8INNKitD9rHeqv1RQqL4osUAGAxTXyRAgBgRRBuAEiGcANAMoQbAJIh3ACQDOEGgGQINwAkQ7gBIBnCDQDJEG4ASIZwA0AyhBsAkiHcAJAM4QaAZAg3ACRDuAEgGcINAMkQbgBIhnADQDKEGwCSIdwAkAzhBoBkCDcAJEO4ASAZwg0AyRBuAEiGcANAMoQbAJKZG27bH7T9A9tv2X7b9hcvYjAAwPEu1djmfyU9ExG3bV+W9Jrtf42I7zc8GwDgGHPDHREh6fbRj5ePbtHkUACAk9W6xm17y/abkm5JejUiXm92LADASWqFOyJ+FxGflHRV0lO2P/HoNrYHtie2J4eHh8ueEwBwZKF3lUTEryTdkHT9mMfGEdGLiN7Ozs6y5gMAPKLOu0p2bH/06M8fkvQpST9pejAAwPHqvKvkMUlftb2lu6H/RkS80uxYAICT1HlXyY8kPXkBswAAauA3JwEgGcINAMkQbgBIhnADQDKEGwCSIdwAkAzhBoBkCDcAJEO4ASAZwg0AyRBuAEiGcANAMoQbG6mqKpVlqU6no7IsVVVV2yMBtdX5WFdgrVRVpcFgoNlsJkmaTqcaDAaSpH6/3+ZoQC2ccWPjDIfD+9G+ZzabaTgctjQRsBjCjY1zcHCw0P3AqiHc2Di7u7sL3Q+sGsKNjTMajdTtdh+6r9vtajQatTQRsBjCjY3T7/c1Ho9VFIVsqygKjcdjXphEGo6IpT9pr9eLyWSy9OcFgHVlez8ienW25YwbAJIh3ACQDOEGgGQINwAkQ7gBIBnCDQDJEG4ASIZwA0AyhBsAkiHcAJAM4QaAZAg3ACRDuAEgGcINAMkQbgBIhnADQDKEGwCSIdwAkMzccNt+wvYN2+/Yftv2CxcxGADgeJdqbPNbSX8fEW/Y/oikfduvRsQ7Dc8GADjG3DPuiPh5RLxx9OffSLop6fGmBwMAHG+ha9y2S0lPSnq9iWE
AAPPVDrftD0t6WdIXIuLXxzw+sD2xPTk8PFzmjACAB9QKt+3LuhvtKiK+edw2ETGOiF5E9HZ2dpY5IwDgAXXeVWJJX5F0MyK+1PxIAIDT1DnjflrSc5Kesf3m0e3ZhucCAJxg7tsBI+I1Sb6AWQAANfCbkwCQDOEGgGQI94aoqkplWarT6agsS1VV1fj+5z0mgBNExNJv165dC6yOvb296Ha7Ien+rdvtxt7eXmP7n/eYwKaRNImajfXd7Zer1+vFZDJZ+vPibMqy1HQ6fd/9RVHo3XffbWT/8x4T2DS29yOiV2tbwr3+Op2OjvvvbFt37txpZP/zHhPYNIuEm2vcG2B3d3eh+5ex/3mPCeBkhHsDjEYjdbvdh+7rdrsajUaN7X/eYwI4Rd2L4YvceHFy9ezt7UVRFGE7iqJY+EXCs+x/3mMCm0S8OAkAuXCNGwDWGOEGgGQINwAkQ7gBIBnCDQDJEG4ASIZwA0AyhBsAkiHcAJAM4QaAZAg3ACRDuJPh68AAXGp7ANRXVZUGg4Fms5kkaTqdajAYSJL6/X6bowG4QJxxJzIcDu9H+57ZbKbhcNjSRADaQLgTOTg4WOh+AOuJcCfC14EBkAh3KnwdGACJcKfS7/c1Ho9VFIVsqygKjcdjXpgENgxfXQYAK4CvLgOANUa4ASAZwg0AyRBuAEiGcANAMoQbAJIh3ACQDOEGgGQINwAkQ7gBIJm54bb9ku1btn98EQMBAE5X54z7nyRdb3gOAEBNc8MdEd+V9MsLmAUAUAPXuAEgmaWF2/bA9sT25PDwcFlPCwB4xNLCHRHjiOhFRG9nZ2dZTwsAeASXSgAgmTpvB/yapO9J+rjt92x/rvmxAAAnuTRvg4j47EUMAgCoh0slAJAM4QaAZAg3ACRDuAEgGcINAMkQbgBIhnBvgKqqVJalOp2OyrJUVVVtjwTgHOa+jxu5VVWlwWCg2WwmSZpOpxoMBpKkfr/f5mgAzogz7jU3HA7vR/ue2Wym4XDY0kQAzotwr7mDg4OF7gew+gj3mtvd3V3ofgCrj3CvudFopG63+9B93W5Xo9GopYkAnBfhXnP9fl/j8VhFUci2iqLQeDzmhUkgMUfE0p+01+vFZDJZ+vMCwLqyvR8RvTrbcsYNAMkQbgBIhnADQDKEGwCSIdwAkAzhBoBkCDcAJEO4ASAZwg0AyRBuAEiGcANAMoQbAJIh3ACQDOEGgGQINwAkQ7gBIBnCDQDJEG4ASIZwA0AyhBsAkiHcAJAM4QaAZAg3ACRDuAEgGcINAMnUCrft67b/w/ZPbf9jE4NUVaWyLNXpdFSWpaqqWujxZRzjtO23t7e1vb39vn2XMVfTf486+7Sxvpug6TVhzTdURJx6k7Ql6b8k/aGkD0h6S9Ifn7bPtWvXYhF7e3vR7XZD0v1bt9uNvb29Wo8v4xh1tn903+eff/7ccy3qLGuxiuu7CZpeE9Z8vUiaxJwe37vVCfefSfr2Az+/KOnF0/ZZNNxFURwbx6Ioaj2+jGPU3f7B29bW1rnnWtRZ1mIV13cTNL0mrPl6WSTcvrv9yWz/taTrEfG3Rz8/J+lPI+Lzj2w3kDSQpN3d3WvT6fTU531Qp9PRcXPY1p07d+Y+voxj1N2+jkXmWtRZ1mIV13cTNL0mrPl6sb0fEb062y7txcmIGEdELyJ6Ozs7C+27u7t76v3zHl/GMc7y3FtbW+eea1FnmXcV13cTNL0mrPkGm3dKrgu4VLKK12C5xs017vPiGjcWoSVf474k6WeSPqbfvzj5J6fts2i4I+7+IyyKImxHURTv+8c37/FlHOO07a9cuRJXrlx5377LmKvpv0edfdpY303Q9Jqw5utjkXDPvcYtSbaflfRl3X2HyUsRMTpt+16vF5PJZO7zAgDuWuQa96U6G0XEtyR961xTAQCWgt+cBIBkCDcAJEO4ASAZwg0AyRBuAEim1tsBF35S+1BS/d95X23bkn7R9hArjjWqh3WqZ1PXqYiIWr9
23ki414ntSd33Vm4q1qge1qke1mk+LpUAQDKEGwCSIdzzjdseIAHWqB7WqR7WaQ6ucQNAMpxxA0AyhPsEF/EFydnZfsn2Lds/bnuWVWb7Cds3bL9j+23bL7Q906qx/UHbP7D91tEafbHtmVYZl0qOYXtL0n9K+pSk9yT9UNJnI+KdVgdbMbb/XNJtSf8cEZ9oe55VZfsxSY9FxBu2PyJpX9Jf8e/p92xb0h9ExG3blyW9JumFiPh+y6OtJM64j/eUpJ9GxM8i4v8kfV3SX7Y808qJiO9K+mXbc6y6iPh5RLxx9OffSLop6fF2p1otR98lcPvox8tHN84qT0C4j/e4pP9+4Of3xP9oWALbpaQnJb3e7iSrx/aW7Tcl3ZL0akSwRicg3MAFsf1hSS9L+kJE/LrteVZNRPwuIj4p6aqkp2xz+e0EhPt4/yPpiQd+vnp0H3AmR9dtX5ZURcQ3255nlUXEryTdkHS97VlWFeE+3g8l/ZHtj9n+gKTPSPqXlmdCUkcvvH1F0s2I+FLb86wi2zu2P3r05w/p7hsDftLuVKuLcB8jIn4r6fOSvq27LyR9IyLebneq1WP7a5K+J+njtt+z/bm2Z1pRT0t6TtIztt88uj3b9lAr5jFJN2z/SHdPnF6NiFdanmll8XZAAEiGM24ASIZwA0AyhBsAkiHcAJAM4QaAZAg3ACRDuAEgGcINAMn8P7Lcj2jEg96EAAAAAElFTkSuQmCC\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "plt.scatter(y_pred, y_true, color='black')\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": { + "ExecuteTime": { + "end_time": "2019-04-22T08:24:52.301659Z", + "start_time": "2019-04-22T08:24:52.130224Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Plot outputs\n", + "plt.scatter(data_X_test, data_y_test, color='black')\n", + "plt.plot(data_X_test, regr.predict(data_X_test), color='blue', linewidth=3)\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": { + "ExecuteTime": { + "end_time": "2019-04-22T08:24:53.326537Z", + "start_time": "2019-04-22T08:24:53.321437Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "('Coefficients: \\n', array([0.68623605]))" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# The coefficients\n", + "'Coefficients: \\n', regr.coef_" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": { + "ExecuteTime": { + "end_time": "2019-04-22T08:24:55.007412Z", + "start_time": "2019-04-22T08:24:55.002637Z" + }, + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'Mean squared error: 0.98'" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Mean squared error on the held-out test sample: np.mean averages the\n", + "# squared residuals, so this is an MSE, not a residual *sum* of squares.\n", + "\"Mean squared error: %.2f\" % np.mean((regr.predict(data_X_test) - data_y_test) ** 2)" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": { + "ExecuteTime": { + "end_time": "2019-04-22T08:24:55.875656Z", + "start_time": "2019-04-22T08:24:55.846855Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ +
"/Users/datalab/Applications/anaconda/lib/python3.5/site-packages/ipykernel/__main__.py:2: UserWarning: Pandas doesn't allow columns to be created via a new attribute name - see https://pandas.pydata.org/pandas-docs/stable/indexing.html#attribute-access\n", + " from ipykernel import kernelapp as app\n" + ] + } + ], + "source": [ + "df.click_log = [[np.log(df.click[i]+1)] for i in range(len(df))]\n", + "df.reply_log = [[np.log(df.reply[i]+1)] for i in range(len(df))]" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": { + "ExecuteTime": { + "end_time": "2019-04-22T08:25:13.823742Z", + "start_time": "2019-04-22T08:25:13.811227Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'Variance score: 0.62'" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn.model_selection import train_test_split\n", + "Xs_train, Xs_test, y_train, y_test = train_test_split(df.click_log, df.reply_log,test_size=0.2, random_state=0)\n", + "\n", + "# Create linear regression object\n", + "regr = linear_model.LinearRegression()\n", + "# Train the model using the training sets\n", + "regr.fit(Xs_train, y_train)\n", + "# Explained variance score: 1 is perfect prediction\n", + "'Variance score: %.2f' % regr.score(Xs_test, y_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": { + "ExecuteTime": { + "end_time": "2019-04-22T08:25:18.210290Z", + "start_time": "2019-04-22T08:25:18.010690Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Plot outputs\n", + "plt.scatter(Xs_test, y_test, color='black')\n", + "plt.plot(Xs_test, regr.predict(Xs_test), color='blue', linewidth=3)\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": { + "ExecuteTime": { + "end_time": "2019-04-22T08:25:26.241798Z", + "start_time": "2019-04-22T08:25:26.227633Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "-0.6837007391943056" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn.model_selection import cross_val_score\n", + "\n", + "regr = linear_model.LinearRegression()\n", + "scores = cross_val_score(regr, df.click_log, \\\n", + " df.reply_log, cv = 3)\n", + "scores.mean() " + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": { + "ExecuteTime": { + "end_time": "2019-04-22T08:25:30.245410Z", + "start_time": "2019-04-22T08:25:30.227128Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "-0.7188149722820985" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "regr = linear_model.LinearRegression()\n", + "scores = cross_val_score(regr, df.click_log, \n", + " df.reply_log, cv =5)\n", + "scores.mean() " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "> # 使用sklearn做logistic回归\n", + "***\n", + "\n", + "王成军\n", + "\n", + "wangchengjun@nju.edu.cn\n", + "\n", + "计算传播网 http://computational-communication.com" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "- logistic回归是一个分类算法而不是一个回归算法。\n", + "- 可根据已知的一系列因变量估计离散数值(比方说二进制数值 0 或 1 ,是或否,真或假)。\n", + "- 简单来说,它通过将数据拟合进一个逻辑函数(logistic 
function)来预估一个事件出现的概率。\n", + "- 因此,它也被叫做逻辑回归。因为它预估的是概率,所以它的输出值大小在 0 和 1 之间(正如所预计的一样)。" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "$$odds= \\frac{p}{1-p} = \\frac{probability\\: of\\: event\\: occurrence} {probability \\:of \\:not\\: event\\: occurrence}$$" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "source": [ + "$$ln(odds)= ln(\\frac{p}{1-p})$$" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "source": [ + "$$logit(x) = ln(\\frac{p}{1-p}) = b_0+b_1X_1+b_2X_2+b_3X_3....+b_kX_k$$" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "![](./img/logistic.jpg)" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-29T07:46:50.277195Z", + "start_time": "2018-04-29T07:46:50.272229Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [], + "source": [ + "repost = []\n", + "for i in df.title:\n", + " if u'转载' in i:\n", + " repost.append(1)\n", + " else:\n", + " repost.append(0)" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-29T07:47:06.292994Z", + "start_time": "2018-04-29T07:47:06.270715Z" + }, + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[[194675, 2703], [88244, 1041], [82779, 625]]" + ] + }, + "execution_count": 51, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data_X = [[df.click[i], df.reply[i]] for i in range(len(df))]\n", + "data_X[:3]" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-29T07:47:45.269303Z", + "start_time": "2018-04-29T07:47:45.259792Z" + }, + "slideshow": 
{ + "slide_type": "subslide" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.61241970021413272" + ] + }, + "execution_count": 52, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn.linear_model import LogisticRegression\n", + "df['repost'] = repost\n", + "model = LogisticRegression()\n", + "model.fit(data_X,df.repost)\n", + "model.score(data_X,df.repost)" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-29T07:47:59.648431Z", + "start_time": "2018-04-29T07:47:59.633936Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [], + "source": [ + "def randomSplitLogistic(dataX, dataY, num):\n", + " dataX_train = []\n", + " dataX_test = []\n", + " dataY_train = []\n", + " dataY_test = []\n", + " import random\n", + " test_index = random.sample(range(len(df)), num)\n", + " for k in range(len(dataX)):\n", + " if k in test_index:\n", + " dataX_test.append(dataX[k])\n", + " dataY_test.append(dataY[k])\n", + " else:\n", + " dataX_train.append(dataX[k])\n", + " dataY_train.append(dataY[k])\n", + " return dataX_train, dataX_test, dataY_train, dataY_test, " + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-29T07:48:27.726443Z", + "start_time": "2018-04-29T07:48:27.710922Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'Variance score: 0.45'" + ] + }, + "execution_count": 54, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Split the data into training/testing sets\n", + "data_X_train, data_X_test, data_y_train, data_y_test = randomSplitLogistic(data_X, df.repost, 20)\n", + "# Create logistic regression object\n", + "log_regr = LogisticRegression()\n", + "# Train the model using the training sets\n", + "log_regr.fit(data_X_train, data_y_train)\n", + "# Explained variance score: 
1 is perfect prediction\n", + "'Variance score: %.2f' % log_regr.score(data_X_test, data_y_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-29T07:48:56.873331Z", + "start_time": "2018-04-29T07:48:56.870219Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [], + "source": [ + "y_true, y_pred = data_y_test, log_regr.predict(data_X_test)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-29T07:39:12.344043Z", + "start_time": "2018-04-29T07:39:12.338223Z" + }, + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "([1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],\n", + " array([0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]))" + ] + }, + "execution_count": 43, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "y_true, y_pred" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-29T07:39:13.175680Z", + "start_time": "2018-04-29T07:39:13.171386Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " 0 0.50 0.17 0.25 6\n", + " 1 0.72 0.93 0.81 14\n", + "\n", + "avg / total 0.66 0.70 0.64 20\n", + "\n" + ] + } + ], + "source": [ + "print(classification_report(y_true, y_pred))" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-29T07:51:43.039620Z", + "start_time": "2018-04-29T07:51:43.034812Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [], + "source": [ + "from sklearn.cross_validation import train_test_split\n", + "Xs_train, Xs_test, y_train, y_test = train_test_split(data_X, df.repost, test_size=0.2, 
random_state=42)" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-29T07:51:47.690742Z", + "start_time": "2018-04-29T07:51:47.683127Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'Variance score: 0.60'" + ] + }, + "execution_count": 57, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Create logistic regression object\n", + "log_regr = LogisticRegression()\n", + "# Train the model using the training sets\n", + "log_regr.fit(Xs_train, y_train)\n", + "# Explained variance score: 1 is perfect prediction\n", + "'Variance score: %.2f' % log_regr.score(Xs_test, y_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-29T07:51:55.780061Z", + "start_time": "2018-04-29T07:51:55.771924Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Logistic score for test set: 0.595745\n", + "Logistic score for training set: 0.613941\n", + " precision recall f1-score support\n", + "\n", + " 0 1.00 0.03 0.05 39\n", + " 1 0.59 1.00 0.74 55\n", + "\n", + "avg / total 0.76 0.60 0.46 94\n", + "\n" + ] + } + ], + "source": [ + "print('Logistic score for test set: %f' % log_regr.score(Xs_test, y_test))\n", + "print('Logistic score for training set: %f' % log_regr.score(Xs_train, y_train))\n", + "y_true, y_pred = y_test, log_regr.predict(Xs_test)\n", + "print(classification_report(y_true, y_pred))" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-29T07:52:53.880925Z", + "start_time": "2018-04-29T07:52:53.866672Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.53333333333333333" + ] + }, + "execution_count": 59, + "metadata": {}, + 
"output_type": "execute_result" + } + ], + "source": [ + "logre = LogisticRegression()\n", + "scores = cross_val_score(logre, data_X, df.repost, cv = 3)\n", + "scores.mean() " + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-29T07:53:26.825100Z", + "start_time": "2018-04-29T07:53:26.810871Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.62948717948717947" + ] + }, + "execution_count": 60, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "logre = LogisticRegression()\n", + "data_X_scale = scale(data_X)\n", + "# The importance of preprocessing in data science and the machine learning pipeline I: \n", + "scores = cross_val_score(logre, data_X_scale, df.repost, cv = 3)\n", + "scores.mean() " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "> # 使用sklearn实现贝叶斯预测\n", + "***\n", + "\n", + "王成军\n", + "\n", + "wangchengjun@nju.edu.cn\n", + "\n", + "计算传播网 http://computational-communication.com" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "# Naive Bayes algorithm\n", + "\n", + "It is a classification technique based on Bayes’ Theorem with an assumption of independence among predictors. \n", + "\n", + "In simple terms, a Naive Bayes classifier assumes that the presence of a particular feature in a class is unrelated to the presence of any other feature. \n", + "\n", + "why it is known as ‘Naive’? For example, a fruit may be considered to be an apple if it is red, round, and about 3 inches in diameter. Even if these features depend on each other or upon the existence of the other features, all of these properties independently contribute to the probability that this fruit is an apple." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "贝叶斯定理为使用$p(c)$, $p(x)$, $p(x|c)$ 计算后验概率$P(c|x)$提供了方法:" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "source": [ + "$$\n", + "p(c|x) = \\frac{p(x|c) p(c)}{p(x)}\n", + "$$" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "source": [ + "- P(c|x) is the posterior probability of class (c, target) given predictor (x, attributes).\n", + "- P(c) is the prior probability of class.\n", + "- P(x|c) is the likelihood which is the probability of predictor given class.\n", + "- P(x) is the prior probability of predictor." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "![](./img/Bayes_41.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "Step 1: Convert the data set into a frequency table\n", + "\n", + "Step 2: Create Likelihood table by finding the probabilities like:\n", + "- p(Overcast) = 0.29, p(rainy) = 0.36, p(sunny) = 0.36\n", + "- p(playing) = 0.64, p(rest) = 0.36\n", + "\n", + "Step 3: Now, use Naive Bayesian equation to calculate the posterior probability for each class. The class with the highest posterior probability is the outcome of prediction." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "## Problem: Players will play if weather is sunny. 
Is this statement correct?\n", + "\n", + "We can solve it using above discussed method of posterior probability.\n", + "\n", + "$P(Yes | Sunny) = \\frac{P( Sunny | Yes) * P(Yes) } {P (Sunny)}$\n", + "\n", + "Here we have P (Sunny |Yes) = 3/9 = 0.33, P(Sunny) = 5/14 = 0.36, P( Yes)= 9/14 = 0.64\n", + "\n", + "Now, $P (Yes | Sunny) = \\frac{0.33 * 0.64}{0.36} = 0.60$, which has higher probability.\n", + "\n", + "$P(No | Sunny) = \\frac{P( Sunny | No) * P(No) } {P (Sunny)}$\n", + "\n", + "Here we have P (Sunny |No) = 2/5 = 0.4, P(Sunny) = 5/14 = 0.36, P( No)= 5/14 = 0.36\n", + "\n", + "Now, $P (No | Sunny) = \\frac{0.4 * 0.36}{0.36} = 0.4$, which has lower probability.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'ABCMeta BaseDiscreteNB BaseEstimator BaseNB BernoulliNB ClassifierMixin GaussianNB LabelBinarizer MultinomialNB __all__ __builtins__ __doc__ __file__ __name__ __package__ _check_partial_fit_first_call abstractmethod binarize check_X_y check_array check_is_fitted in1d issparse label_binarize logsumexp np safe_sparse_dot six'" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn import naive_bayes\n", + "' '.join(dir(naive_bayes)) " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "source": [ + "- naive_bayes.GaussianNB\tGaussian Naive Bayes (GaussianNB)\n", + "- naive_bayes.MultinomialNB([alpha, ...])\tNaive Bayes classifier for multinomial models\n", + "- naive_bayes.BernoulliNB([alpha, binarize, ...])\tNaive Bayes classifier for multivariate Bernoulli models."
+ ] + }, + { + "cell_type": "code", + "execution_count": 61, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-29T08:02:37.644606Z", + "start_time": "2018-04-29T08:02:37.635952Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [], + "source": [ + "#Import Library of Gaussian Naive Bayes model\n", + "from sklearn.naive_bayes import GaussianNB\n", + "import numpy as np\n", + "\n", + "#assigning predictor and target variables\n", + "x= np.array([[-3,7],[1,5], [1,2], [-2,0], [2,3], [-4,0], [-1,1], [1,1], [-2,2], [2,7], [-4,1], [-2,7]])\n", + "Y = np.array([3, 3, 3, 3, 4, 3, 3, 4, 3, 4, 4, 4])" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-29T08:02:52.828101Z", + "start_time": "2018-04-29T08:02:52.818463Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "array([4, 3])" + ] + }, + "execution_count": 62, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#Create a Gaussian Classifier\n", + "model = GaussianNB()\n", + "\n", + "# Train the model using the training sets \n", + "model.fit(x[:8], Y[:8])\n", + "\n", + "#Predict Output \n", + "predicted= model.predict([[1,2],[3,4]])\n", + "predicted" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "# cross-validation \n", + " \n", + "In k-fold CV, the training set is split into k smaller sets (other approaches exist, but they generally follow the same principles). The following procedure is followed for each of the k “folds”:\n", + "- A model is trained using k-1 of the folds as training data;\n", + "- the resulting model is validated on the remaining part of the data (i.e., it is used as a test set to compute a performance measure such as accuracy)."
+ ] + }, + { + "cell_type": "code", + "execution_count": 63, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-29T08:04:04.297675Z", + "start_time": "2018-04-29T08:04:04.273413Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "array([41, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0, 0])" + ] + }, + "execution_count": 63, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data_X_train, data_X_test, data_y_train, data_y_test = randomSplit(df.click, df.reply, 20)\n", + "# Train the model using the training sets \n", + "model.fit(data_X_train, data_y_train)\n", + "\n", + "#Predict Output \n", + "predicted= model.predict(data_X_test)\n", + "predicted" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-29T08:04:34.184513Z", + "start_time": "2018-04-29T08:04:34.178511Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.65000000000000002" + ] + }, + "execution_count": 64, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model.score(data_X_test, data_y_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-29T08:05:04.297453Z", + "start_time": "2018-04-29T08:05:04.249311Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/datalab/Applications/anaconda/lib/python3.5/site-packages/sklearn/cross_validation.py:516: Warning: The least populated class in y has only 1 members, which is too few. 
The minimum number of labels for any class cannot be less than n_folds=7.\n", + " % (min_labels, self.n_folds)), Warning)\n" + ] + }, + { + "data": { + "text/plain": [ + "0.53413410073295453" + ] + }, + "execution_count": 66, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn.cross_validation import cross_val_score\n", + "\n", + "model = GaussianNB()\n", + "scores = cross_val_score(model, [[c] for c in df.click],\\\n", + " df.reply, cv = 7)\n", + "scores.mean() " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "> # 使用sklearn实现决策树\n", + "***\n", + "\n", + "王成军\n", + "\n", + "wangchengjun@nju.edu.cn\n", + "\n", + "计算传播网 http://computational-communication.com" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "# 决策树\n", + "- 这个监督式学习算法通常被用于分类问题。\n", + "- 它同时适用于分类变量和连续因变量。\n", + "- 在这个算法中,我们将总体分成两个或更多的同类群。\n", + "- 这是根据最重要的属性或者自变量来分成尽可能不同的组别。\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "![](./img/tree.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "![](./img/playtree.jpg)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "## 在上图中你可以看到,根据多种属性,人群被分成了不同的四个小组,来判断 “他们会不会去玩”。\n", + "### 为了把总体分成不同组别,需要用到许多技术,比如说 Gini、Information Gain、Chi-square、entropy。" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-29T08:10:20.871345Z", + "start_time": "2018-04-29T08:10:20.855125Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [], + "source": [ + "from sklearn import tree\n", + "model = tree.DecisionTreeClassifier(criterion='gini')" + ] + }, + { + "cell_type": "code", + 
"execution_count": 68, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-29T08:10:49.988277Z", + "start_time": "2018-04-29T08:10:49.973060Z" + }, + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.91275167785234901" + ] + }, + "execution_count": 68, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data_X_train, data_X_test, data_y_train, data_y_test = randomSplitLogistic(data_X, df.repost, 20)\n", + "model.fit(data_X_train,data_y_train)\n", + "model.score(data_X_train,data_y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-29T08:11:12.730866Z", + "start_time": "2018-04-29T08:11:12.725782Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "array([0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0])" + ] + }, + "execution_count": 69, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Predict\n", + "model.predict(data_X_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-29T08:11:28.411441Z", + "start_time": "2018-04-29T08:11:28.397481Z" + }, + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.33461538461538459" + ] + }, + "execution_count": 70, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# crossvalidation\n", + "scores = cross_val_score(model, data_X, df.repost, cv = 3)\n", + "scores.mean() " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "> # 使用sklearn实现SVM支持向量机\n", + "***\n", + "\n", + "王成军\n", + "\n", + "wangchengjun@nju.edu.cn\n", + "\n", + "计算传播网 http://computational-communication.com" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + 
"slide_type": "subslide" + } + }, + "source": [ + "![](./img/svm.jpg)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "- 将每个数据在N维空间中用点标出(N是你所有的特征总数),每个特征的值是一个坐标的值。\n", + " - 举个例子,如果我们只有身高和头发长度两个特征,我们会在二维空间中标出这两个变量,每个点有两个坐标(这些坐标叫做支持向量)。" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "![](./img/xyplot.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "- 现在,我们会找到将两组不同数据分开的一条直线。\n", + " - 两个分组中距离最近的两个点到这条线的距离同时最优化。" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "![](./img/sumintro.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "## 上面示例中的黑线将数据分类优化成两个小组\n", + "- 两组中距离最近的点(图中A、B点)到达黑线的距离满足最优条件。\n", + " - 这条直线就是我们的分割线。接下来,测试数据落到直线的哪一边,我们就将它分到哪一类去。" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-29T08:17:29.788250Z", + "start_time": "2018-04-29T08:17:29.785022Z" + } + }, + "outputs": [], + "source": [ + "from sklearn import svm\n", + "# Create SVM classification object \n", + "model=svm.SVC() " + ] + }, + { + "cell_type": "code", + "execution_count": 72, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-29T08:17:31.035310Z", + "start_time": "2018-04-29T08:17:31.030713Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'LinearSVC LinearSVR NuSVC NuSVR OneClassSVM SVC SVR __all__ __builtins__ __cached__ __doc__ __file__ __loader__ __name__ __package__ __path__ __spec__ base bounds classes l1_min_c liblinear libsvm libsvm_sparse'" + ] + }, + "execution_count": 72, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "' '.join(dir(svm))" + ] + }, + { + "cell_type": 
"code", + "execution_count": 73, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-29T08:17:41.872379Z", + "start_time": "2018-04-29T08:17:41.849759Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.90380313199105144" + ] + }, + "execution_count": 73, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data_X_train, data_X_test, data_y_train, data_y_test = randomSplitLogistic(data_X, df.repost, 20)\n", + "model.fit(data_X_train,data_y_train)\n", + "model.score(data_X_train,data_y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 74, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-29T08:17:47.661313Z", + "start_time": "2018-04-29T08:17:47.655841Z" + }, + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1])" + ] + }, + "execution_count": 74, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Predict\n", + "model.predict(data_X_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 75, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-29T08:18:00.419986Z", + "start_time": "2018-04-29T08:17:58.671257Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [], + "source": [ + "# crossvalidation\n", + "scores = []\n", + "cvs = [3, 5, 10, 25, 50, 75, 100]\n", + "for i in cvs:\n", + " score = cross_val_score(model, data_X, df.repost,\n", + " cv = i)\n", + " scores.append(score.mean() ) # Try to tune cv\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 76, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-29T08:18:05.493658Z", + "start_time": "2018-04-29T08:18:05.359658Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "plt.plot(cvs, scores, 'b-o')\n", + "plt.xlabel('$cv$', fontsize = 20)\n", + "plt.ylabel('$Score$', fontsize = 20)\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "\n", + "\n", + "> # 泰坦尼克号数据分析\n", + "\n", + "王成军\n", + "\n", + "wangchengjun@nju.edu.cn\n", + "\n", + "计算传播网 http://computational-communication.com" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": { + "ExecuteTime": { + "end_time": "2018-05-29T07:31:28.492497Z", + "start_time": "2018-05-29T07:31:28.488728Z" + }, + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "from sklearn import tree\n", + "import warnings \n", + "warnings.filterwarnings(\"ignore\") \n" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": { + "ExecuteTime": { + "end_time": "2018-06-06T07:02:49.855926Z", + "start_time": "2018-06-06T07:02:49.705773Z" + }, + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "train = pd.read_csv('../data/tatanic_train.csv', \n", + " sep = \",\")" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": { + "ExecuteTime": { + "end_time": "2018-06-06T07:02:52.803564Z", + "start_time": "2018-06-06T07:02:52.759733Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Unnamed: 0PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarked
00103Braund, Mr. Owen Harrismale22.010A/5 211717.2500NaNS
11211Cumings, Mrs. John Bradley (Florence Briggs Th...female38.010PC 1759971.2833C85C
22313Heikkinen, Miss. Lainafemale26.000STON/O2. 31012827.9250NaNS
33411Futrelle, Mrs. Jacques Heath (Lily May Peel)female35.01011380353.1000C123S
44503Allen, Mr. William Henrymale35.0003734508.0500NaNS
\n", + "
" + ], + "text/plain": [ + " Unnamed: 0 PassengerId Survived Pclass \\\n", + "0 0 1 0 3 \n", + "1 1 2 1 1 \n", + "2 2 3 1 3 \n", + "3 3 4 1 1 \n", + "4 4 5 0 3 \n", + "\n", + " Name Sex Age SibSp \\\n", + "0 Braund, Mr. Owen Harris male 22.0 1 \n", + "1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 \n", + "2 Heikkinen, Miss. Laina female 26.0 0 \n", + "3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 \n", + "4 Allen, Mr. William Henry male 35.0 0 \n", + "\n", + " Parch Ticket Fare Cabin Embarked \n", + "0 0 A/5 21171 7.2500 NaN S \n", + "1 0 PC 17599 71.2833 C85 C \n", + "2 0 STON/O2. 3101282 7.9250 NaN S \n", + "3 0 113803 53.1000 C123 S \n", + "4 0 373450 8.0500 NaN S " + ] + }, + "execution_count": 44, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train.head() " + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": { + "ExecuteTime": { + "end_time": "2018-05-29T07:28:58.070575Z", + "start_time": "2018-05-29T07:28:57.897862Z" + }, + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [], + "source": [ + "train[\"Age\"] = train[\"Age\"].fillna(train[\"Age\"].median())\n", + "train[\"Fare\"] = train[\"Fare\"].fillna(train[\"Fare\"].median())\n", + "#Convert the male and female groups to integer form\n", + "train[\"Sex\"][train[\"Sex\"] == \"male\"] = 0\n", + "train[\"Sex\"][train[\"Sex\"] == \"female\"] = 1\n", + "#Impute the Embarked variable\n", + "train[\"Embarked\"] = train[\"Embarked\"].fillna('S')\n", + "#Convert the Embarked classes to integer form\n", + "train[\"Embarked\"][train[\"Embarked\"] == \"S\"] = 0\n", + "train[\"Embarked\"][train[\"Embarked\"] == \"C\"] = 1\n", + "train[\"Embarked\"][train[\"Embarked\"] == \"Q\"] = 2" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": { + "ExecuteTime": { + "end_time": "2018-05-29T07:28:08.358884Z", + "start_time": "2018-05-29T07:28:08.346226Z" + }, + "slideshow": { + "slide_type": "slide" + } + }, + 
"outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[ 0.12294397 0.31274009 0.23680307 0.32751287]\n", + "0.977553310887\n" + ] + } + ], + "source": [ + "#Create the target and features numpy arrays: target, features_one\n", + "target = train['Survived'].values\n", + "features_one = train[[\"Pclass\", \"Sex\", \"Age\", \"Fare\"]].values\n", + "\n", + "#Fit your first decision tree: my_tree_one\n", + "my_tree_one = tree.DecisionTreeClassifier()\n", + "my_tree_one = my_tree_one.fit(features_one, target)\n", + "#Look at the importance of the included features and print the score\n", + "print(my_tree_one.feature_importances_)\n", + "print(my_tree_one.score(features_one, target))" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": { + "ExecuteTime": { + "end_time": "2018-05-29T07:28:15.915998Z", + "start_time": "2018-05-29T07:28:15.705994Z" + }, + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [], + "source": [ + "test = pd.read_csv('../data/tatanic_test.csv', sep = \",\")\n", + "# Impute the missing value with the median\n", + "test.Fare[152] = test.Fare.median()\n", + "test[\"Age\"] = test[\"Age\"].fillna(test[\"Age\"].median())\n", + "#Convert the male and female groups to integer form\n", + "test[\"Sex\"][test[\"Sex\"] == \"male\"] = 0\n", + "test[\"Sex\"][test[\"Sex\"] == \"female\"] = 1\n", + "\n", + "#Impute the Embarked variable\n", + "test[\"Embarked\"] = test[\"Embarked\"].fillna('S')\n", + "#Convert the Embarked classes to integer form\n", + "test[\"Embarked\"][test[\"Embarked\"] == \"S\"] = 0\n", + "test[\"Embarked\"][test[\"Embarked\"] == \"C\"] = 1\n", + "test[\"Embarked\"][test[\"Embarked\"] == \"Q\"] = 2\n", + "\n", + "# Extract the features from the test set: Pclass, Sex, Age, and Fare.\n", + "test_features = test[[\"Pclass\",\"Sex\", \"Age\", \"Fare\"]].values\n", + "\n", + "# Make your prediction using the test set\n", + "my_prediction = my_tree_one.predict(test_features)\n", + "\n", + 
"# Create a data frame with two columns: PassengerId & Survived. Survived contains your predictions\n", + "PassengerId =np.array(test['PassengerId']).astype(int)\n", + "my_solution = pd.DataFrame(my_prediction, PassengerId, columns = [\"Survived\"])\n" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": { + "ExecuteTime": { + "end_time": "2018-05-29T07:28:18.081288Z", + "start_time": "2018-05-29T07:28:18.074414Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Survived
8920
8930
8941
\n", + "
" + ], + "text/plain": [ + " Survived\n", + "892 0\n", + "893 0\n", + "894 1" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "my_solution[:3]" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "ExecuteTime": { + "end_time": "2018-05-29T07:25:44.488717Z", + "start_time": "2018-05-29T07:25:44.484381Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "(418, 1)" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Check that your data frame has 418 entries\n", + "my_solution.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [], + "source": [ + "# Write your solution to a csv file with the name my_solution.csv \n", + "my_solution.to_csv(\"../data/tatanic_solution_one.csv\", \n", + " index_label = [\"PassengerId\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": { + "ExecuteTime": { + "end_time": "2018-05-29T07:28:26.996353Z", + "start_time": "2018-05-29T07:28:26.982601Z" + }, + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.905723905724\n" + ] + } + ], + "source": [ + "# Create a new array with the added features: features_two\n", + "features_two = train[[\"Pclass\",\"Age\",\"Sex\",\"Fare\",\\\n", + " \"SibSp\", \"Parch\", \"Embarked\"]].values\n", + "\n", + "#Control overfitting by setting \"max_depth\" to 10 and \"min_samples_split\" to 5 : my_tree_two\n", + "max_depth = 10\n", + "min_samples_split = 5\n", + "my_tree_two = tree.DecisionTreeClassifier(max_depth = max_depth, \n", + " min_samples_split = min_samples_split, \n", + " random_state = 1)\n", + "my_tree_two = my_tree_two.fit(features_two, target)\n", + "\n", + "#Print the score of the new 
decison tree\n", + "print(my_tree_two.score(features_two, target))" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": { + "ExecuteTime": { + "end_time": "2018-05-29T07:28:28.033226Z", + "start_time": "2018-05-29T07:28:28.018293Z" + }, + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.979797979798\n" + ] + } + ], + "source": [ + "# create a new train set with the new variable\n", + "train_two = train\n", + "train_two['family_size'] = train.SibSp + train.Parch + 1\n", + "\n", + "# Create a new decision tree my_tree_three\n", + "features_three = train[[\"Pclass\", \"Sex\", \"Age\", \\\n", + " \"Fare\", \"SibSp\", \"Parch\", \"family_size\"]].values\n", + "\n", + "my_tree_three = tree.DecisionTreeClassifier()\n", + "my_tree_three = my_tree_three.fit(features_three, target)\n", + "\n", + "# Print the score of this decision tree\n", + "print(my_tree_three.score(features_three, target))\n" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": { + "ExecuteTime": { + "end_time": "2018-05-29T07:28:32.678968Z", + "start_time": "2018-05-29T07:28:32.465958Z" + }, + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.939393939394\n", + "418\n", + "[0 0 0]\n" + ] + } + ], + "source": [ + "#Import the `RandomForestClassifier`\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "\n", + "#We want the Pclass, Age, Sex, Fare,SibSp, Parch, and Embarked variables\n", + "features_forest = train[[\"Pclass\", \"Age\", \"Sex\", \"Fare\", \"SibSp\", \"Parch\", \"Embarked\"]].values\n", + "\n", + "#Building the Forest: my_forest\n", + "n_estimators = 100\n", + "forest = RandomForestClassifier(max_depth = 10, min_samples_split=2, \n", + " n_estimators = n_estimators, random_state = 1)\n", + "my_forest = forest.fit(features_forest, target)\n", + "\n", + "#Print the score of the 
random forest\n", + "print(my_forest.score(features_forest, target))\n", + "\n", + "#Compute predictions and print the length of the prediction vector:test_features, pred_forest\n", + "test_features = test[[\"Pclass\", \"Age\", \"Sex\", \"Fare\", \"SibSp\", \"Parch\", \"Embarked\"]].values\n", + "pred_forest = my_forest.predict(test_features)\n", + "print(len(test_features))\n", + "print(pred_forest[:3])" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": { + "ExecuteTime": { + "end_time": "2018-05-29T07:26:25.602062Z", + "start_time": "2018-05-29T07:26:25.572689Z" + }, + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[ 0.14130255 0.17906027 0.41616727 0.17938711 0.05039699 0.01923751\n", + " 0.0144483 ]\n", + "[ 0.10384741 0.20139027 0.31989322 0.24602858 0.05272693 0.04159232\n", + " 0.03452128]\n", + "0.905723905724\n", + "0.939393939394\n" + ] + } + ], + "source": [ + "#Request and print the `.feature_importances_` attribute\n", + "print(my_tree_two.feature_importances_)\n", + "print(my_forest.feature_importances_)\n", + "\n", + "#Compute and print the mean accuracy score for both models\n", + "print(my_tree_two.score(features_two, target))\n", + "print(my_forest.score(features_two, target))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": true, + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "# 阅读材料\n", + "机器学习算法的要点(附 Python 和 R 代码)http://blog.csdn.net/a6225301/article/details/50479672\n", + "\n", + "The \"Python Machine Learning\" book code repository and info resource https://github.com/rasbt/python-machine-learning-book\n", + "\n", + "An Introduction to Statistical Learning (James, Witten, Hastie, Tibshirani, 2013) : Python code https://github.com/JWarmenhoven/ISLR-python\n", + "\n", + "BuildingMachineLearningSystemsWithPython https://github.com/luispedro/BuildingMachineLearningSystemsWithPython" + ] + }, + { + 
"cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "# 作业\n", + "https://www.datacamp.com/community/tutorials/the-importance-of-preprocessing-in-data-science-and-the-machine-learning-pipeline-i-centering-scaling-and-k-nearest-neighbours" + ] + } + ], + "metadata": { + "celltoolbar": "Slideshow", + "kernelspec": { + "display_name": "Python [conda env:anaconda]", + "language": "python", + "name": "conda-env-anaconda-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.4" + }, + "latex_envs": { + "LaTeX_envs_menu_present": true, + "autoclose": false, + "autocomplete": true, + "bibliofile": "biblio.bib", + "cite_by": "apalike", + "current_citInitial": 1, + "eqLabelWithNumbers": true, + "eqNumInitial": 0, + "hotkeys": { + "equation": "Ctrl-E", + "itemize": "Ctrl-I" + }, + "labels_anchors": false, + "latex_user_defs": false, + "report_style_numbering": false, + "user_envs_cfg": false + }, + "toc": { + "base_numbering": 1, + "nav_menu": {}, + "number_sections": false, + "sideBar": false, + "skip_h1_title": false, + "title_cell": "Table of Contents", + "title_sidebar": "Contents", + "toc_cell": false, + "toc_position": { + "height": "780px", + "left": "1279px", + "top": "168.667px", + "width": "341px" + }, + "toc_section_display": true, + "toc_window_display": false + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/code/.ipynb_checkpoints/09.machine_learning_with_sklearn-checkpoint.ipynb b/code/.ipynb_checkpoints/09.machine_learning_with_sklearn-checkpoint.ipynb deleted file mode 100644 index a03a9b0..0000000 --- a/code/.ipynb_checkpoints/09.machine_learning_with_sklearn-checkpoint.ipynb +++ /dev/null @@ -1,3015 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - 
"slide_type": "slide" - } - }, - "source": [ - "***\n", - "***\n", - "# 计算传播与机器学习\n", - "\n", - "***\n", - "***\n", - "\n", - "王成军\n", - "\n", - "wangchengjun@nju.edu.cn\n", - "\n", - "计算传播网 http://computational-communication.com" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "slide" - } - }, - "source": [ - "![](./img/machine.jpg)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "subslide" - } - }, - "source": [ - "## 1、 监督式学习\n", - "\n", - "工作机制:\n", - "- 这个算法由一个目标变量或结果变量(或因变量)组成。\n", - "- 这些变量由已知的一系列预示变量(自变量)预测而来。\n", - "- 利用这一系列变量,我们生成一个将输入值映射到期望输出值的函数。\n", - "- 这个训练过程会一直持续,直到模型在训练数据上获得期望的精确度。\n", - "- 监督式学习的例子有:回归、决策树、随机森林、K – 近邻算法、逻辑回归等。" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "subslide" - } - }, - "source": [ - "## 2、非监督式学习\n", - "\n", - "工作机制:\n", - "- 在这个算法中,没有任何目标变量或结果变量要预测或估计。\n", - "- 这个算法用在不同的组内聚类分析。\n", - "- 这种分析方式被广泛地用来细分客户,根据干预的方式分为不同的用户组。\n", - "- 非监督式学习的例子有:关联算法和 K–均值算法。" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "subslide" - } - }, - "source": [ - "## 3、强化学习\n", - "\n", - "工作机制:\n", - "- 这个算法训练机器进行决策。\n", - "- 它是这样工作的:机器被放在一个能让它通过反复试错来训练自己的环境中。\n", - "- 机器从过去的经验中进行学习,并且尝试利用了解最透彻的知识作出精确的商业判断。 \n", - "- 强化学习的例子有马尔可夫决策过程。alphago\n", - "\n", - "> Chess. 
Here, the agent decides upon a series of moves depending on the state of the board (the environment), and the\n", - "reward can be defined as win or lose at the end of the game:" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "slide" - } - }, - "source": [ - "" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "slide" - } - }, - "source": [ - "- 线性回归\n", - "- 逻辑回归\n", - "- 决策树\n", - "- SVM\n", - "- 朴素贝叶斯\n", - "---\n", - "- K最近邻算法\n", - "- K均值算法\n", - "- 随机森林算法\n", - "- 降维算法\n", - "- Gradient Boost 和 Adaboost 算法\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "slide" - } - }, - "source": [ - "> # 使用sklearn做线性回归\n", - "***\n", - "\n", - "王成军\n", - "\n", - "wangchengjun@nju.edu.cn\n", - "\n", - "计算传播网 http://computational-communication.com" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "subslide" - } - }, - "source": [ - "# 线性回归\n", - "- 通常用于估计连续性变量的实际数值(房价、呼叫次数、总销售额等)。\n", - "- 通过拟合最佳直线来建立自变量X和因变量Y的关系。\n", - "- 这条最佳直线叫做回归线,并且用 $Y= \\beta *X + C$ 这条线性等式来表示。\n", - "- 系数 $\\beta$ 和 C 可以通过最小二乘法获得" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "ExecuteTime": { - "end_time": "2018-04-29T07:10:39.010055Z", - "start_time": "2018-04-29T07:10:39.002664Z" - }, - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [], - "source": [ - "%matplotlib inline\n", - "import sklearn\n", - "from sklearn import datasets\n", - "from sklearn import linear_model\n", - "import matplotlib.pyplot as plt\n", - "from sklearn.metrics import classification_report\n", - "from sklearn.preprocessing import scale" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "ExecuteTime": { - "end_time": "2018-04-29T07:11:24.244682Z", - "start_time": "2018-04-29T07:11:24.234905Z" - }, - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [], - "source": [ - "# 
boston data\n", - "boston = datasets.load_boston()\n", - "y = boston.target\n", - "X = boston.data" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "ExecuteTime": { - "end_time": "2018-04-29T07:11:45.142201Z", - "start_time": "2018-04-29T07:11:45.137656Z" - }, - "slideshow": { - "slide_type": "fragment" - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD',\n", - " 'TAX', 'PTRATIO', 'B', 'LSTAT'], \n", - " dtype='|t| [95.0% Conf. Int.]\n", - "-----------------------------------------------------------------------------------\n", - "Intercept 36.4911 5.104 7.149 0.000 26.462 46.520\n", - "boston.data[0] -0.1072 0.033 -3.276 0.001 -0.171 -0.043\n", - "boston.data[1] 0.0464 0.014 3.380 0.001 0.019 0.073\n", - "boston.data[2] 0.0209 0.061 0.339 0.735 -0.100 0.142\n", - "boston.data[3] 2.6886 0.862 3.120 0.002 0.996 4.381\n", - "boston.data[4] -17.7958 3.821 -4.658 0.000 -25.302 -10.289\n", - "boston.data[5] 3.8048 0.418 9.102 0.000 2.983 4.626\n", - "boston.data[6] 0.0008 0.013 0.057 0.955 -0.025 0.027\n", - "boston.data[7] -1.4758 0.199 -7.398 0.000 -1.868 -1.084\n", - "boston.data[8] 0.3057 0.066 4.608 0.000 0.175 0.436\n", - "boston.data[9] -0.0123 0.004 -3.278 0.001 -0.020 -0.005\n", - "boston.data[10] -0.9535 0.131 -7.287 0.000 -1.211 -0.696\n", - "boston.data[11] 0.0094 0.003 3.500 0.001 0.004 0.015\n", - "boston.data[12] -0.5255 0.051 -10.366 0.000 -0.625 -0.426\n", - "==============================================================================\n", - "Omnibus: 178.029 Durbin-Watson: 1.078\n", - "Prob(Omnibus): 0.000 Jarque-Bera (JB): 782.015\n", - "Skew: 1.521 Prob(JB): 1.54e-170\n", - "Kurtosis: 8.276 Cond. No. 
1.51e+04\n", - "==============================================================================\n", - "\n", - "Warnings:\n", - "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n", - "[2] The condition number is large, 1.51e+04. This might indicate that there are\n", - "strong multicollinearity or other numerical problems.\n" - ] - } - ], - "source": [ - "import numpy as np\n", - "import statsmodels.api as sm\n", - "import statsmodels.formula.api as smf\n", - "\n", - "# Fit regression model (using the natural log of one of the regressors)\n", - "results = smf.ols('boston.target ~ boston.data', data=boston).fit()\n", - "\n", - "print(results.summary())" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "ExecuteTime": { - "end_time": "2018-04-29T07:13:21.823618Z", - "start_time": "2018-04-29T07:13:21.812795Z" - }, - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [], - "source": [ - "regr = linear_model.LinearRegression()\n", - "lm = regr.fit(boston.data, y)" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "ExecuteTime": { - "end_time": "2018-04-29T07:13:29.286705Z", - "start_time": "2018-04-29T07:13:29.280511Z" - }, - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "(36.491103280363603,\n", - " array([ -1.07170557e-01, 4.63952195e-02, 2.08602395e-02,\n", - " 2.68856140e+00, -1.77957587e+01, 3.80475246e+00,\n", - " 7.51061703e-04, -1.47575880e+00, 3.05655038e-01,\n", - " -1.23293463e-02, -9.53463555e-01, 9.39251272e-03,\n", - " -5.25466633e-01]),\n", - " 0.74060774286494269)" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "lm.intercept_, lm.coef_, lm.score(boston.data, y)" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": { - "ExecuteTime": { - "end_time": "2018-04-29T07:14:24.251725Z", - "start_time": 
"2018-04-29T07:14:24.248401Z" - }, - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [], - "source": [ - "predicted = regr.predict(boston.data)" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": { - "ExecuteTime": { - "end_time": "2018-04-29T07:14:33.380349Z", - "start_time": "2018-04-29T07:14:32.952670Z" - }, - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [ - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "fig, ax = plt.subplots()\n", - "ax.scatter(y, predicted)\n", - "ax.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=4)\n", - "ax.set_xlabel('$Measured$', fontsize = 20)\n", - "ax.set_ylabel('$Predicted$', fontsize = 20)\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "slide" - } - }, - "source": [ - "## 训练集和测试集" - ] - }, - { - "cell_type": "code", - "execution_count": 190, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([[ 6.32000000e-03, 1.80000000e+01, 2.31000000e+00, ...,\n", - " 1.53000000e+01, 3.96900000e+02, 4.98000000e+00],\n", - " [ 2.73100000e-02, 0.00000000e+00, 7.07000000e+00, ...,\n", - " 1.78000000e+01, 3.96900000e+02, 9.14000000e+00],\n", - " [ 2.72900000e-02, 0.00000000e+00, 7.07000000e+00, ...,\n", - " 1.78000000e+01, 3.92830000e+02, 4.03000000e+00],\n", - " ..., \n", - " [ 6.07600000e-02, 0.00000000e+00, 1.19300000e+01, ...,\n", - " 2.10000000e+01, 3.96900000e+02, 5.64000000e+00],\n", - " [ 1.09590000e-01, 0.00000000e+00, 1.19300000e+01, ...,\n", - " 2.10000000e+01, 3.93450000e+02, 6.48000000e+00],\n", - " [ 4.74100000e-02, 0.00000000e+00, 1.19300000e+01, ...,\n", - " 2.10000000e+01, 3.96900000e+02, 7.88000000e+00]])" - ] - }, - "execution_count": 190, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "boston.data" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": { - "ExecuteTime": { - "end_time": "2018-04-29T07:16:27.403480Z", - "start_time": "2018-04-29T07:16:27.398197Z" - }, - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [], - "source": [ - "from sklearn.cross_validation import train_test_split\n", - "Xs_train, Xs_test, y_train, y_test = train_test_split(boston.data,\n", - " boston.target, \n", - " test_size=0.2, \n", - " random_state=42)" - ] - }, - { - "cell_type": "code", - "execution_count": 
13, - "metadata": { - "ExecuteTime": { - "end_time": "2018-04-29T07:16:43.427978Z", - "start_time": "2018-04-29T07:16:43.423656Z" - }, - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [], - "source": [ - "regr = linear_model.LinearRegression()\n", - "lm = regr.fit(Xs_train, y_train)" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": { - "ExecuteTime": { - "end_time": "2018-04-29T07:16:47.859814Z", - "start_time": "2018-04-29T07:16:47.854257Z" - }, - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "(30.288948339369036,\n", - " array([ -1.12463481e-01, 3.00810168e-02, 4.07309919e-02,\n", - " 2.78676719e+00, -1.72406347e+01, 4.43248784e+00,\n", - " -6.23998173e-03, -1.44848504e+00, 2.62113793e-01,\n", - " -1.06390978e-02, -9.16398679e-01, 1.24516469e-02,\n", - " -5.09349120e-01]),\n", - " 0.75088377867329148)" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "lm.intercept_, lm.coef_, lm.score(Xs_train, y_train)" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": { - "ExecuteTime": { - "end_time": "2018-04-29T07:17:35.601265Z", - "start_time": "2018-04-29T07:17:35.598315Z" - }, - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [], - "source": [ - "predicted = regr.predict(Xs_test)" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": { - "ExecuteTime": { - "end_time": "2018-04-29T07:17:43.752187Z", - "start_time": "2018-04-29T07:17:43.605493Z" - }, - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [ - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "fig, ax = plt.subplots()\n", - "ax.scatter(y_test, predicted)\n", - "ax.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=4)\n", - "ax.set_xlabel('$Measured$', fontsize = 20)\n", - "ax.set_ylabel('$Predicted$', fontsize = 20)\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "slide" - } - }, - "source": [ - "# 交叉验证" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "subslide" - } - }, - "source": [ - "# cross-validation \n", - " \n", - "k-fold CV, the training set is split into k smaller sets (other approaches are described below, but generally follow the same principles). The following procedure is followed for each of the k “folds”:\n", - "- A model is trained using k-1 of the folds as training data;\n", - "- the resulting model is validated on the remaining part of the data (i.e., it is used as a test set to compute a performance measure such as accuracy)." 
- ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": { - "ExecuteTime": { - "end_time": "2018-04-29T07:21:10.344979Z", - "start_time": "2018-04-29T07:21:10.333153Z" - }, - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "-1.5787701857180245" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from sklearn.cross_validation import cross_val_score\n", - "\n", - "regr = linear_model.LinearRegression()\n", - "scores = cross_val_score(regr, boston.data , boston.target, cv = 3)\n", - "scores.mean() " - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": { - "ExecuteTime": { - "end_time": "2018-04-29T07:25:40.617010Z", - "start_time": "2018-04-29T07:25:39.304291Z" - }, - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [ - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "scores = [cross_val_score(regr, data_X_scale,\\\n", - " boston.target,\\\n", - " cv = int(i)).mean() \\\n", - " for i in range(3, 50)]\n", - "plt.plot(range(3, 50), scores,'r-o')\n", - "plt.show()" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": { - "ExecuteTime": { - "end_time": "2018-04-29T07:25:34.856887Z", - "start_time": "2018-04-29T07:25:34.840623Z" - }, - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "0.45384871359695633" - ] - }, - "execution_count": 20, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "data_X_scale = scale(boston.data)\n", - "scores = cross_val_score(regr,data_X_scale, boston.target,\\\n", - " cv = 7)\n", - "scores.mean() " - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "slide" - } - }, - "source": [ - "# 使用天涯bbs数据" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "ExecuteTime": { - "end_time": "2018-05-29T07:23:08.949140Z", - "start_time": "2018-05-29T07:23:08.554345Z" - }, - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
titlelinkauthorauthor_pageclickreplytime
0【民间语文第161期】宁波px启示:船进港湾人应上岸/post-free-2849477-1.shtml贾也http://www.tianya.cn/5049945019467527032012-10-29 07:59
1宁波镇海PX项目引发群体上访 当地政府发布说明(转载)/post-free-2839539-1.shtml无上卫士ABChttp://www.tianya.cn/743418358824410412012-10-24 12:41
\n", - "
" - ], - "text/plain": [ - " title link author \\\n", - "0 【民间语文第161期】宁波px启示:船进港湾人应上岸 /post-free-2849477-1.shtml 贾也 \n", - "1 宁波镇海PX项目引发群体上访 当地政府发布说明(转载) /post-free-2839539-1.shtml 无上卫士ABC \n", - "\n", - " author_page click reply time \n", - "0 http://www.tianya.cn/50499450 194675 2703 2012-10-29 07:59 \n", - "1 http://www.tianya.cn/74341835 88244 1041 2012-10-24 12:41 " - ] - }, - "execution_count": 1, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import pandas as pd\n", - "\n", - "df = pd.read_csv('../data/tianya_bbs_threads_list.txt', sep = \"\\t\", header=None)\n", - "df=df.rename(columns = {0:'title', 1:'link', 2:'author',3:'author_page', 4:'click', 5:'reply', 6:'time'})\n", - "df[:2]" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "ExecuteTime": { - "end_time": "2018-05-29T07:23:27.984100Z", - "start_time": "2018-05-29T07:23:27.969145Z" - }, - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [], - "source": [ - "# 定义这个函数的目的是让读者感受到:\n", - "# 抽取不同的样本,得到的结果完全不同。\n", - "def randomSplit(dataX, dataY, num):\n", - " dataX_train = []\n", - " dataX_test = []\n", - " dataY_train = []\n", - " dataY_test = []\n", - " import random\n", - " test_index = random.sample(range(len(df)), num)\n", - " for k in range(len(dataX)):\n", - " if k in test_index:\n", - " dataX_test.append([dataX[k]])\n", - " dataY_test.append(dataY[k])\n", - " else:\n", - " dataX_train.append([dataX[k]])\n", - " dataY_train.append(dataY[k])\n", - " return dataX_train, dataX_test, dataY_train, dataY_test, " - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "ExecuteTime": { - "end_time": "2018-05-29T07:23:28.537926Z", - "start_time": "2018-05-29T07:23:28.509765Z" - }, - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [ - { - "ename": "NameError", - "evalue": "name 'linear_model' is not defined", - "output_type": "error", - "traceback": [ - 
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 7\u001b[0m np.log(df.reply+1), 20)\n\u001b[1;32m 8\u001b[0m \u001b[0;31m# Create linear regression object\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 9\u001b[0;31m \u001b[0mregr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlinear_model\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mLinearRegression\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 10\u001b[0m \u001b[0;31m# Train the model using the training sets\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 11\u001b[0m \u001b[0mregr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata_X_train\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdata_y_train\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mNameError\u001b[0m: name 'linear_model' is not defined" - ] - } - ], - "source": [ - "import numpy as np\n", - "\n", - "# Use only one feature\n", - "data_X = df.reply\n", - "# Split the data into training/testing sets\n", - "data_X_train, data_X_test, data_y_train, data_y_test = randomSplit(np.log(df.click+1), \n", - " np.log(df.reply+1), 20)\n", - "# Create linear regression object\n", - "regr = linear_model.LinearRegression()\n", - "# Train the model using the training sets\n", - "regr.fit(data_X_train, data_y_train)\n", - "# Explained variance score: 1 is perfect prediction\n", - "print('Variance score: %.2f' % regr.score(data_X_test, data_y_test))" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "ExecuteTime": { - "end_time": "2018-05-29T07:23:16.208659Z", - "start_time": "2018-05-29T07:23:16.054583Z" - } - }, - "outputs": [ - { - "ename": "NameError", - "evalue": "name 
'data_X_train' is not defined", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mdata_X_train\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;36m3\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[0;31mNameError\u001b[0m: name 'data_X_train' is not defined" - ] - } - ], - "source": [ - "data_X_train[:3]\n" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": { - "ExecuteTime": { - "end_time": "2018-04-29T07:26:38.754002Z", - "start_time": "2018-04-29T07:26:38.751117Z" - }, - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [], - "source": [ - "y_true, y_pred = data_y_test, regr.predict(data_X_test)" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "metadata": { - "ExecuteTime": { - "end_time": "2018-04-29T07:26:41.635527Z", - "start_time": "2018-04-29T07:26:41.541620Z" - }, - "slideshow": { - "slide_type": "fragment" - } - }, - "outputs": [ - { - "data": { - "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAW0AAAD+CAYAAADxhFR7AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAADiFJREFUeJzt3cGLJId1x/Hf690d1iWtkDMzjkLiroowIgHn4u0Q2/ggCET5A5I40CRohV1kdbBxEuSwTW6ui3KISG5NkATeyiFgxdgHWzGOk1MO7o1jcAwyQlGPY9by7p7iNEZy9uUwM8vM7PR0dU9VV7/u7wcGNNVF1Zse9quiqqfK3F0AgBg6bQ8AAKiOaANAIEQbAAIh2gAQCNEGgECINgAEQrQBIBCiDQCBEG0ACORi3Rvc2dnxLMvq3iwArLVbt27ddffdWevVHu0syzQajereLACsNTMbV1mP0yMAEAjRBoBAiDYABEK0ASAQog0AgRBtAAiEaAPYeGVZKssydTodZVmmsizbHmmq2j+nDQCRlGWpPM81mUwkSePxWHmeS5L6/X6bo52KI20AG20wGDwI9qHJZKLBYNDSRGcj2gA22t7e3lzL20a0AWy0brc71/K2EW0AG60oCiVJcmxZkiQqiqKlic5GtAFstH6/r+FwqDRNZWZK01TD4XAlL0JKkrl7rRvs9XrOXf4AYD5mdsvde7PW40gbAAIh2gAQCNEGgECINgAEQrQBIBCiDQCBEG0ACIRoA0AgRBsAAiHaABAI0QaAQIg2AARSKdpmtmVm3zezv2t6IADAdFWPtG9IervBOQAAFcyMtpn9uqTflPQPzY8DADjLmdE2M5P0N5I+O2O93MxGZja6c+dOnfMBAI6YdaT9J5L+xd3fPGsldx+6e8/de7u7u/VNBwA45uKM1/9I0hUz+31JvyDpETN7w93/qvnRAAAnnRltd//44X+b2bOSPkGwAaA9fE4bAAKZdXrkAXd/VdKrjU0CAJiJI20ACIRoA0AgRBsAAiHaAHAOZVkqyzJ1Oh1lWaayLBvdX+ULkQCA48qyVJ7nmkwmkqTxeKw8zyVJ/X6/kX1ypA0ACxoMBg+CfWgymWgwGDS2T6INAAva29uba3kdiDYALKjb7c61vA5EGwAWVBSFkiQ5tixJEhVF0dg+iTYALKjf72s4HCpNU5mZ0jTVcDhs7CKkJJm717rBXq/no9Go1m0CwLozs1vu3pu1HkfaABAI0QaAQIg2AARCtAEgEKINAIEQbQAIhGgDQCBEGwACIdoAEAjRBoBAiDYABEK0ASAQog0AgRBtAAiEaANAIEQbAAIh2gAQCNEGgECINgAEQrQBIBCiDQCBEG0ACIRoA0AgRBsAApkZbTPrmNk3zOwHZvaGmT2zjMEAAA+rcqTtkv7Y3Z+S9FlJRbMjAQCmuThrBXd3SbcPvk0lfbfRiQAAU82MtiSZ2QuSPi/pjqSHTo+YWS4pl6Rut1vnfACAIypdiHT3F919W9INSa+bmZ14fejuPXfv7e7uNjEnAEBzfnrE3V+T9Kik7WbGAQCcpcqnR540sycO/vtjkn7m7ncbnwwA8JAq57Qfl/R1M7sg6SeSPtnsSACAaap8euTfJT21hFkAADPwF5EAEAjRBoBAiDYABEK0ASAQog0AgRBtAAiEaANAIEQbAAIh2gAQCNEGgECINgAEQrQBIBCiDQCBEG0ACIRoA0AgRBsAAiHaABAI0QaAQIg2AARCtAEgEKINAIEQbQAIhGgDQCBEGwACIdoAEAjRBoBAiDYABEK0ASAQog0AgRBtAAiEaANAIEQbAAIh2gAQCNEGgECINgAEMjPaZnbZzIZm9oaZjc3sc8sYDADwsCpH2o9Iel3Sr0m6KukvzOyDjU4FADjVzGi7+z13/5Lvuyvph5Ieb340AMBJc53TNrMPS7os6XsnludmNjKz0Z07d+qcD3hIWZbKskydTkdZlqksy7Z
HApamcrTNbEfSFyVdc3c/+pq7D9295+693d3dumcEHijLUnmeazwey901Ho+V5znhxsaoFG0ze7+kr0q64e7fbnYkYLrBYKDJZHJs2WQy0WAwaGkiYLmqfHrkMUlfkVS4+9eaHwmYbm9vb67lwLqpcqT9GUkfkfSSmb158PVkw3MBp+p2u3MtB9ZNlU+PfMHdH3H3Dx35emsZwwEnFUWhJEmOLUuSREVRtDQRsFz8RSRC6ff7Gg6HStNUZqY0TTUcDtXv99seDVgKO/FBkHPr9Xo+Go1q3SYArDszu+XuvVnrcaQNAIEQbQAIhGgDQCBEGwACIdoAEAjRBoBAiDYABEK0ASAQog0AgRBtAAiEaANAIEQbAAIh2gAQCNEGgECINgAEQrQBIBCi3bCyLJVlmTqdjrIsU1mWbY8EILCLbQ+wzsqyVJ7nmkwmkqTxeKw8zyWJx2MBWAhH2g0aDAYPgn1oMploMBi0NBGA6Ih2g/b29uZaDgCzEO0GdbvduZYDwCxEu0FFUShJkmPLkiRRURQtTQQgOqLdoH6/r+FwqDRNZWZK01TD4ZCLkAAWZu5e6wZ7vZ6PRqNatwkA687Mbrl7b9Z6HGkDQCBEGwACIdoAEAjRBoBAiDYABEK0G1SWpXZ2dmRmMjPt7OxwwygA58INoxpSlqWee+45vfvuuw+W3bt3T9euXZPEDaMALIYj7YYMBoNjwT703nvvccMoAAurHG0ze5+ZPdXkMOvkrJtCccMoAIuaGW0ze8zMvizpHUkvND/SejjrplDcMArAoqocad+X9LeS/rThWdZKURTa2tp6aPmlS5e4YRSAhc2Mtrv/1N2/KennS5hnbfT7fb388sva3t5+sGx7e1uvvPIKFyEBLKzyDaPM7FlJn3D3T53yWi4pl6Rut3t1PB7XOSMArL2l3jDK3Yfu3nP33u7ubh2bBACcgo/8AUAgRBsAApn5F5FmdkXSdyRdkXTZzJ6W9Gl3/1bDswEATpgZbXf/H0kfWsIsAIAZOD0CAIEQbQAIhGgDQCBEGwACIdoAEAjRxlRlWSrLMnU6HWVZttSn7rS5b2CV8eQanKosS+V5rslkIkkaj8fK81xS80/daXPfwKqrfMOoqnq9no9Go1q3ieXLskyn3fgrTVO9/fbba7tvoC1LvWEU1s+0p+ss46k7be4bWHVEG6ea9nSdZTx1p819A6uOaONURVEoSZJjy5IkWcpTd9rcN7DqiDZO1e/3NRwOlaapzExpmmo4HC7lQmCb+wZWHRciAWAFcCESANYQ0QaAQIg2AARCtAEgEKINAIEQbQAIhGgDQCBEGwACIdoAEAjRBoBAiDYABEK0ASAQog0AgRBtAAiEaANAIEQbAAIh2gAQCNEGgECINgAEQrQBIBCiDQCBEG0ACKRStM3sD8zsv8zsTTN7rolByrJUlmXqdDrKskxlWU5dx8x08eJFmdmDdc967Tz7XGT2559/vpbtzrvfo/uZ9lpdP3PToswJLJ27n/kl6YqkH0r6ZUlPSPqxpN1p61+9etXndfPmTU+SxCU9+EqSxG/evHnmOodfW1tbfunSpVNfO7mdefa56OxVZziPs+af9tr169dr+ZmbVtfvBohE0shn9NjdK0X79yTdPPL930v6w2nrLxLtNE1PjV2apjPXqfJ1dDvz7PM8s593u4vuN03Tqa9duHBhKbOdV12/GyCSqtG2/XWnM7PPSdpx98HB9y9Kuu3uf31knVxSLkndbvfqeDw+c5sndTodnTaHmen+/ftnrlPF0e3Ms88qqs4173YX3a+ZSdJc71Xds51XXb8bIBIzu+XuvVnrVTmnvSXp6L+U+5L+7+gK7j50956793Z3d+ebVFK32525fNo6i26/yj4X3XYd2110e91ud+prFy5cmGtbbanrdwOsoyrRvq3989mHfkX757hrUxSFkiQ5tixJEhVFceY6h7a2tnTp0qVTXzu5nXn2uejsVWc4j7Pmn/Zanue1/MxNq+t
3A6ylWedPJP2ipB9J+oD2L0S+JemRaesvck7bff/iU5qmbmaepunUi4eH5zsPz88ernvWa+fZ5yKzX79+vZbtzrvfkxduT3utrp+5aVHmBOqius5pS5KZPSvpLw++/XN3/8dp6/Z6PR+NRov/XwQANlDVc9oXq2zM3V+V9Oo5ZwIAnBN/EQkAgRBtAAiEaANAIEQbAAIh2gAQSKWP/M21QbM7kub7O/b27Ei62/YQAfA+VcP7NBvv0XSpu8/8k/Laox2JmY2qfC5y0/E+VcP7NBvv0flxegQAAiHaABDIpkd72PYAQfA+VcP7NBvv0Tlt9DltAIhm04+0ASAUoo2ZzOx9ZvZU23MA2NBoL+Pp8uvAzB4zsy9LekfSC23Ps4rM7LKZDc3sDTMbHzyeDyeYWcfMvmFmPzh4r55pe6aoNu6ctpldkfR9SR/V/mPT/kPSb7j7nVYHW0Fm9qik35L0q5I+6u6fanmklWNm25KelvSapG1J/ymp5+61Pt0pOtt/eOkT7n7bzH5X0hf4vPZiNvFI+xlJ/+ruP3L3H0v6Z0m/3fJMK8ndf+ru35T087ZnWVXufs/dv3Tw8JG72n8U3+Ntz7VqDt6f2wffppK+2+Y8kVV6CMKa+aCO/5n9f0v6pZZmwRoxsw9Luizpe23PsorM7AVJn5d0R/sHT1jAJh5pz3y6PDAvM9uR9EVJ13zTzjlW5O4vuvu2pBuSXj84ZYI5bWK0G3+6PDaLmb1f0lcl3XD3b7c9z6pz99ckPar9awCY0yZG+3VJz5jZB8zsCUkfl/RPLc+EoMzsMUlfkVS4+9fanmdVmdmTB//eZGYfk/Szg2sAmNPGndN293fMbCDp3w4W/Zm7/2+bM62qg0/afEfSFUmXzexpSZ9292+1Othq+Yykj0h6ycxeOlj2O+7+VoszraLHJX3dzC5I+omkT7Y8T1gb95E/AIhsE0+PAEBYRBsAAiHaABAI0QaAQIg2AARCtAEgEKINAIEQbQAIhGgDQCD/D/+oO+KxGV+rAAAAAElFTkSuQmCC\n", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "plt.scatter(y_pred, y_true, color='black')\n", - "plt.show()" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "metadata": { - "ExecuteTime": { - "end_time": "2018-04-29T07:27:00.422795Z", - "start_time": "2018-04-29T07:27:00.326748Z" - }, - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [ - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "# Plot outputs\n", - "plt.scatter(data_X_test, data_y_test, color='black')\n", - "plt.plot(data_X_test, regr.predict(data_X_test), color='blue', linewidth=3)\n", - "plt.show()" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "metadata": { - "ExecuteTime": { - "end_time": "2018-04-29T07:27:36.147084Z", - "start_time": "2018-04-29T07:27:36.142088Z" - }, - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "('Coefficients: \\n', array([ 0.68334304]))" - ] - }, - "execution_count": 29, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# The coefficients\n", - "'Coefficients: \\n', regr.coef_" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "metadata": { - "ExecuteTime": { - "end_time": "2018-04-29T07:27:48.770254Z", - "start_time": "2018-04-29T07:27:48.765411Z" - }, - "slideshow": { - "slide_type": "fragment" - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "'Residual sum of squares: 0.40'" - ] - }, - "execution_count": 30, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# The mean square error\n", - "\"Residual sum of squares: %.2f\" % np.mean((regr.predict(data_X_test) - data_y_test) ** 2)" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "metadata": { - "ExecuteTime": { - "end_time": "2018-04-29T07:27:56.521151Z", - "start_time": "2018-04-29T07:27:56.496715Z" - }, - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [], - "source": [ - "df.click_log = [[np.log(df.click[i]+1)] for i in range(len(df))]\n", - "df.reply_log = [[np.log(df.reply[i]+1)] for i in range(len(df))]" - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "metadata": { - "ExecuteTime": { - "end_time": "2018-04-29T07:28:02.712616Z", - "start_time": "2018-04-29T07:28:02.701169Z" - }, - "slideshow": { - "slide_type": "subslide" - } 
- }, - "outputs": [ - { - "data": { - "text/plain": [ - "'Variance score: 0.62'" - ] - }, - "execution_count": 32, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from sklearn.cross_validation import train_test_split\n", - "Xs_train, Xs_test, y_train, y_test = train_test_split(df.click_log, df.reply_log,test_size=0.2, random_state=0)\n", - "\n", - "# Create linear regression object\n", - "regr = linear_model.LinearRegression()\n", - "# Train the model using the training sets\n", - "regr.fit(Xs_train, y_train)\n", - "# Explained variance score: 1 is perfect prediction\n", - "'Variance score: %.2f' % regr.score(Xs_test, y_test)" - ] - }, - { - "cell_type": "code", - "execution_count": 33, - "metadata": { - "ExecuteTime": { - "end_time": "2018-04-29T07:28:16.645996Z", - "start_time": "2018-04-29T07:28:16.549017Z" - }, - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [ - { - "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXQAAAD+CAYAAAAqP/5ZAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAAHKJJREFUeJzt3XuQXFWdB/DvryfThJ4EQqbDQ3BuG6hEiAqaQQWyJRrWlGhQKUHYFiq1YOtEQQQWlXG3oKzJLouFgmXJtrKATlOIG1FQyRqBUMhLJpKVQBlMyMwY5DGAvDIJkPRv/ziZMD3T99F9b99Xfz9VXZXpe/ue39xJfefOOefeI6oKIiJKvkzUBRARUTAY6EREKcFAJyJKCQY6EVFKMNCJiFKCgU5ElBIMdCKilGCgExGlBAOdiCglZoTZWD6f10KhEGaTRESJt379+udVdZ7bfqEGeqFQwNDQUJhNEhElnoiMeNmPXS5ERCnBQCciSgkGOhFRSjDQiYhSwlegi8jXRWTzpNdOETk5qOKIiMg7X4Guqv+hqkeo6hEAFgP4G4DfBlIZERE1JMgulyKA/1HVXQEek4go0SqVCgqFAjKZDAqFAiqVSsvaCnIe+jkwoV5DREoASgDQ09MTYHNERPFWqVRQKpUwPj4OABgZGUGpVAIAFIvT4tI3CWJNURFZDOAaVT3Bab/e3l7ljUVE1C4KhQJGRqbfE2RZFoaHhz0fR0TWq2qv235Bdbl8HsB1AR2LiCgVRkdHG3rfL9+BLiJdAJYDuMV/OURE6WHXzdyq7ucgrtA/C2CNqr4WwLGIiFJjYGAAuVyu5r1cLoeBgYGWtOc70FX1v1X1nCCKISJKk2KxiHK5DMuyICKwLAvlcrklA6JAQIOiXnFQlIiocWEPihIRUcQY6EREKcFAJyJKCQY6EVFKMNCJiFKCgU5ElBIMdCKilGCgExGlBAOdiCglGOhERCnBQCciSgkGOhFRSjDQiYhSgoFORJQSDHQiopRgoBMRpUQQa4ruLyI
3i8hTIrJFRLJBFEZERI0J4gr9ewA2AjgMwCIAbwZwTCIiatAMPx8WkYMBHA9ghZq17HYGUhURETXM7xX6IgBbAawWkU0i8m0Rkck7iEhJRIZEZGhsbMxnc0REZMdvoB8I4CgA5wF4H4ATACyfvIOqllW1V1V7582b57M5IiKy46vLBcBzANar6jYAEJG1ABb6roqIiBrm9wr9QQBHicjbRGQfACcBGPJfFhERNcrXFbqqbheR8wCsBbAPgBtU9e5AKiMioob47XKBqt4B4I4AaiEiIh94pygRUUow0ImIUoKBTkSUEgx0IqKUYKATEaUEA52IKCUY6EREKcFAJyJKCQY6EVFKMNCJiFKCgU5E1CKqwAUXAKecAvzkJ61vz/ezXIiIqJYq8M1vAqtWvfXe7bcDH/0ocNBBrWuXgU5EFKBVq4D+/unvd3YCuVxr22agExEF4LvfBb76VfvtDz8MzJ7d2hrYh05E5MMPfwiI2If5vfeaLpijj259LbxCJyJqwuAgcNZZ9tvXrgVOOim8egAGOhFRQ1avBj7zGfvtt98OfOIT4dUzme9AF5FhALv2fPm0qv6D32MSEcXNr3/tHNS33AKcdlp49dQTyBW6qh4RxHGIiOLmrruApUvtt994I3D22eHV44SDokREddx3nxnstAvzH/zADHbGJcyBYAJ9h4hsEZEHRWTZ1I0iUhKRIREZGhsbC6A5IqLWGRoyQb5kSf3tV11lgvyLXwy3Li98B7qqHqmqhwP4FwAVEZkzZXtZVXtVtXfevHl+myMiaolHHzVBfuyx9bd/61smyJ3mmkctsC4XVb0XwDCAQlDHJCJqtU2bTJC/5z31t3/jG0C1am7ljztfgS4iXSJyyJ5/vxfAIQD+EkRhRERTVSoVFAoFZDIZFAoFVCqVpo+1dasJ8ne+s/728883Qb5qldkvCfzOcskBuEdEOgC8DOBzqrrdf1lERLUqlQpKpRLGx8cBACMjIyiVSgCAYrHo+TjbtgHveAewa1f97eecA5TLQCaBU0ZEVUNrrLe3V4eGhkJrj4jSo1AoYGRkZNr7lmVheHjY9fPPPgssXAi8/HL97WecYe7+7OjwWWgLiMh6Ve112493ihJRIoyOjjb0/oQXXjD943/7W/3ty5ebuz87O/1WGL0E/lFBRO2op6enofdffhlYsADI5+uH+Uc+AuzcCdx2WzrCHGCgE1FCDAwMIDflgeK5XA4DAwM1723fDrz3vcCcOcBf6kzR+MAHzD533gnss08rKw4fA52IEqFYLKJcLsOyLIgILMtCuVzeOyC6YwdwwgnArFnAhg3TP/+udwGvvAI8+GDrF5qICgOdqE0FOQUwLMViEcPDw6hWqxgeHkaxWMQbbwDLlpmQvv/+6Z+ZPx/4+9/NjUOtXmAiahwUJWpDQU0BjNKuXcDppwO33lp/+4EHAhs3Au10gzqv0CnRkniVGQf9/f17w3zC+Pg4+usthhkz1apZWKKzs36Yd3WZQdBnn22vMAd4hU4JloarzKg0OwUwStWq+xzxkRHAZtJLW+AVOiVWHK4yk/oXQqNTAKOkam69dwrzzZvNfjEsP1QMdEqsqK8yJ/5CGBkZgaru/QshCaHudQpg1EScb8F/7DET5IcfHl5NccZAp8SK+iozDn8hNMttCmDUurudH4h1/fUmyI86KryakoCBTokV9VVm1H8h+FVvCmDUFi40Qf7ii/W3X3ONCfIVK0ItKzEY6JRYUV9lRv0XQposWWKC/Ikn6m+fWFzivPPCrStpGOiUaFFeZUb9F0LQohjgXb7cBPl999XfftFFJsiTsLhELKhqaK/FixcrUZoMDg6qZVkqImpZlg4ODkZdUlMGBwc1l8spgL2vXC7Xsu/n7LNVTVTXf51zTkuaTSwAQ+ohY/k8dCLy/axxry64ALj6avvtp55qHmVLtbw+Dz2QLhcRyYrI4yLyoyCOR0ThavUA72WXma4VuzD/0IfMtTnD3J+g+tAvhVk
gmogSqFUDvFddZYL88svrb1+0yAT5unW+mqE9fAe6iBwJ4FgAt/gvh4gmhDlIGfQA73XXmSC/6KL62w8+2AT5xo1NHZ7seOlot3sBEABrARwBYAWAH9XZpwRgCMBQT09PGOMHRIkX9iDlRJt+B3hvvtl5sLOjowWFtwGEMSgqIn0A5qrqgIisALBEVc+125+DokTehDVIGZTf/Ab4+Med96lWne/+JHthLRJ9FoDZInIagLkAukRkk6pe6fO4RG0tKXeh3nMPcOKJzvvs3u38PBYKjq9AV9XjJ/496QqdYU7kU09PT90r9LjchTo0BBx7rPM+b74JzOADukPF35tEMRTXu1A3bjTdJk5h/vrrpsecYR6+wAJdVW9w6j8nIu+ifk7NVFu2mCB/97vt99m+3QR5NhteXVSLd4oSka2nngIOO8x5n5deAvbfP5x62lVYg6JElEJjY2aRZbd98vlw6iFvGOhEtNfLLwNz5jjvs20bcOih4dRDjeGgKBHhpZdMH7lTmG/ZYvrIGebxxUAnamPj4ybIDzjAfp+NG02Qz58fXl3UHAY6URt6/XUT5F1d9vs8/LAJ8kWLwquL/GGgE7WR3btNkM+cab/Pj39sgrzXdU4FxQ0DnagNqJogd7rZZ2IB5rPOCq8uChZnuRClnNsDsU4/HfjpT8OphVqLgU6UUm5BvmQJcO+94dRC4WCgE6WMW5AXCsDWraGUQiFjoBOlhFuQd3QAu3aFUwtFg4FOlHBeFo0I8ZFNFCEGOlFCMchpKgY6UcIwyMkO56ETJURnp3uYTyzHTO2JgU4Uc4cfboLcaUCTQU6Az0AXkYyIrBWRJ0Rkk4gsC6owona3334myJ980n4fyypAJINCoYBKpRJecRRLfq/QFcDZqroAwFcARLvgIVEKHH20CfJXX7XfZ3CwglyuCyMjI1BVjIyMoFQqMdTbnK9AV+PpPV9aAP7Pf0lE7iqVCgqFAjKZ9FydLl9ugvxPf7Lfp1o1XSv9/f0YHx+v2TY+Po7+/v4WV0lx5nuWi4hcAuBrAMYATOtyEZESgBIA9PT0+G2OCJVKBaVSaW+gTVydAohsEWU/+vqAa6913mfXLnNj0ITR0dG6+9m9T+0hsEWiReRUAKsAHKk2B+Ui0RSEQqGAkZGRae9bloXh4eHwC2rS5ZcDl13mvM/4OLDvvtPfT8s5IG+8LhId2CwXVf05gFkAuoM6JlE9Sb86vfZa07XiFOYvvmi6VuqFOQAMDAwgl8vVvJfL5TAwwGGsduZ3lst8ETl4z7+PA7BTVZ8PpDIiG3Zdd3Hv0lu92gR5X5/9Ptu2mSB3WhIOMF1L5XIZlmVBRGBZFsrlciK7nCg4fvvQ5wBYIyIdAJ4D8Fn/JRE5GxgYqOlDB+J9dbpuHfDhDzvvs2kTsGBBY8ctFosMcKrhd5bLH1V1gaoerqrHqer6oAqj9tHojJWkXJ0+8oi5IncK8z/8wVyRNxrmRHWpamivxYsXK9Fkg4ODmsvlFOaeBgWguVxOBwcHQ2nbsiwVEbUsK7A2N2+euG/T/vXb3wbSFLUJAEPqIWMDm+XiBWe50FRRzdaYOvURMN02fq70n3kGOOQQ531uugk488ymDk9tzOssFwY6RSqTyaDe/0ERQbVabVm7Qf4ieeUVYP/9nfe5+mrg/PMbOizRXqFPWyRqRlQzVoKY+rhjh+kjdwrzSy81nSwMcwoDA50iFcZ86nqDrn5+kezebYJ8Stk1VqwwQR7TiTeUUgx0ilSrZ6xM9JVPfYjVySef3PAvElUT5DMcJvsuXWr2u/76QMonagj70CnVnPrKBwYG0N/fj9HRUfT09GBgYMD2FwlXCaIosQ89hdzma6fxCYSNmnoO6oU5YPrKi8UihoeHUa1WMTw8XDfMRbhKECWIl7mNQb04D715bvO1o5zPHRf1zoGI1Hw98bIsy/FYbvPI9z48migE4Dz0dHGbZsen79mfIxGpmRrpNN+cXSsUR+xySRm3aXZ
JfwJhEOy+V1V1HXRl1wqlge8FLigcPT09da8+J6bZuW1vB3bnwOmvFF6RU5rwCj0h3OZr8/nYjZ0DXpFTKnnpaA/qxUFRf9weJtWqh00lids54GAnJRE4KEr0FnatUJJxUJRCF/Y8eC/tsWuF2gkHRSkQUx9HO3GLPYCWLDzh1h6vyKkd+epyEZGZAK4B8CEAMwF8V1W/Y7c/u1zSK+x58PZ3gbr/f969G8jwb1NKkLC6XLoA/C+AdwJYDODrIvJ2n8ekBHK6xb4Vph934iZQezt2mKtyhjmlld81RV9Q1dV7BmKfB/BXmIWjqY1UKhWITR9Hq+bBv3Vc9yB/4QUT5DNntqQUotgI7FpFRN4F0+2yccr7JREZEpGhsbGxoJqjGOnv77dddahV8+Cff34T3IJ861YT5HPntqQEotgJZNqiiOQBrAVQUtWH7fZjH3o62S0jB8D2/WYtXQrcdZfzPvffDxx3XKDNEkUqtGmLInIAgNsBXOoU5uRPGFMCJ7eRz+eRz+c9tWfXrWJZVmC1n3++mX7oFOY//7m5ImeYU9vycveR3QvAfgDuBfAJL/vzTtHmhPFo3HpteG3PqT6/tV9xhfudnZdeGthpIIoleLxT1G+gfxPAdgCbJ73m2+3PQG+OZVlNPdM7iDa8tmd3y32ztf/sZ+5BfuqpgX37RLHmNdB5638C2PVRiwiq1WpL2/DbXqO1P/QQ8MEPOh9z/nxgy5aGyiBKNN76nyJ+Vqj324bf9rzWvmWL6SN3C3NVhjmRHQZ6yJoZIAzj0bj12nBrz8v34lb7iy+aID/iCOf6gn7eCtdfpVTy0i8T1Kvd+9D9DBCG8WjcyW10d3drd3e346N6vX4v9WrfudO9j7xVj7Ll+quUNGAfevykad3PZr8X9XjrfSv/W6bp50DtwWsfOgM9RGEMboalme8lLk9ATNPPgdoDB0VjKIzBzbA08r3E7Znkafo5EE3GQA9Rmtb99PK9xC3IJ6Tp50BUw0tHe1Cvdh8UVQ1mcLORwctW1mL3+SSs28n1VylJEMadoo2+GOj++blF3+04fmd6JCHIiZLIa6BzUDRh7FfqeYuX2RpBzvSIy2AnUVp5HRTlmqIJ42UFID/7NLLCEIOcKF44KJowQd2i72emR1wHO4naHQM9YZq5Rd/rcdw+yyAnijcGesIUi0WUy2VYlgURQXd3N7q7uyEisCwL5XIZxWKx4eM4fZZBTpQMHBRNqEqlgv7+foyOjqKnpwcDAwOegrwRXvrIq1Vv+xFR80IdFBWRfQG8XVWfCOJ45KxSqaBUKmF8fBwAMDIyglKpBACBhLqXgH79dSCb9d0UEQXIV5eLiOwnIr8A8CyAS4Ipidz09/fvDfMJ4+Pj6O/v93VcL10rL7xgulYY5kTx47cPvQrgewAuDKAW8iiIKYeTeQnyP//ZBPncuU01QUQh8BXoqvqaqt4JYFdA9ZAHQT1cykuQr1ljgnzhwoYOTUQR4CyXBPL7cCkvQX7FFSbIly1rtkoiClvLA11ESiIyJCJDY2NjrW6uLTQy5XCyfN49yE8/3QT5JRwRIUqcQKYtisgKAEtU9Vyn/ThtMRqf+hTwy18673PoocC2beHUQ0SN4QIXhFWrzBW5W5irMsyJ0sDXPHQRmQ3gEQCzAcwUkRMBfF5V7w6gNmrSbbcBn/yk+368s5MoXfzOcnlVVY9Q1YNUdf89/26bMK9UKigUChARzJgxAyKCQqGAlStX1n2/UqkE3nYmk9l77EcfNVfkbmFuWQWIZAKvqdF6vWx3+xwRTeLloelBvdK0wIXbQhP1Xn4XkLBvO+9pcYlWLGrRXL217dpt7+vri6ReorgBF7hoLS8LTdTTzAIS9m13AnjDdf+JH3GQi1o0wq1du+0dHR3YvXu37eeI2oXXQVEGepMymQyaOXcigmq16qttkQzMTbrOppZnV3MQNTlxa7fRc9nqeonihrNcWqzRuzL9fm6CmUfuHGZ2j7IN6g7TRrm
1a7e9o6OjoeMRtTsGepPcFpqop5G7Oafycnfn4GDFceaK3ztMm+XWrt32UqkUSb1EieWloz2oV5oGRVXNYJ5lWQpAOzo6FIBalqV9fX11329mMM/LYGcjx56oWUSarqkZbu3abY+qXqI4AQdFk40LMBPRhFAXuKDgMMiJqFkM9JhgkBORXwz0iDHIiSgoDPSIMMiJKGgM9JAxyImoVRjoIWGQE1GrMdBbzEuQV6ve9iMicsI7RVvEy52d4+PmqpxhTkRBYKAHzEuQP/OMCfJ99w2nJiJqDwz0gHgJ8g0bTJAfdFA4NRFRe/Ed6CJyuohsFZHNIvLPQRQ1VbOr1tT7nN17s2bNgohMe2UymZp/r1y5subYIhtcgzyXK6K7O49jjpl+/HqvWbNm1a0rk8mgs7OzZt/Zs2dPOy8rV67cu1LSjBkzsHLlymnf98qVK5HP52vazOfzyGQyyOfze//t9Xy7HT+fz9uuUhTEyk7NrIjE1ZAodbw88MXuBbOW6F8BHArgYADPAJhnt38zD+dqdpWdep/r7OzUbDZb8142m1URaWjloaVLl+qMGd/28OCsMxs67uSXiDRcVy6X06VLl9bdlslkmq7F7Xx7Xb0pm806rlLUyM+3kf8j9bZns1nt7Oz01S5RWODx4Vx+A/0zAAYnfX0TgDPs9m8m0CeeWjj1ZVlWU5/z//qChyC/qEVtR/dyOt+NnOuJ47h9xu3n28j/kWbqI4oTr4Hu62mLIvJVAHlV7d/z9X8CeFpVvzNpnxKAEgD09PQsbnTZtmZX2Wl2RSF7HwfwK5d9ygC+EGCb8eF0vhs5115XKWpkVaIgV0TiakgUR2GtWJRF7fI5VQA1i0CqallVe1W1d968eQ030OwqO8GtanMkzMWbU5jfCECQ1jAHnM9nI+fabZUiP8dstq1m2yWKG7+B/jRM//mEw2D61APT7Co79T7X2dmJbDZb8142m4XUHdU8BCbIH3do5W6YIF/hWEszJgYTG5HL5bB06dK62zKZ5n/Ubufb6+pN2WzWcZUir+15ad9tRaRsNovOzk5f7RLFjpd+GbsXgIMAPAXgQJhB0ScBdNnt3+yKRc2uWlPvc3bvdXV17elHneXaR37ggbWrFbm95syZo93d3Z77cbu6uurUZQZKZ8yYUbPvrFmzpp2Xvr6+vSsldXR0aF9f37Tvu6+vr6amrq4u7e7uVhHR7u7uvf/2er7djt/d3W27StFEnUDzKzs1syISV0OipEBYKxaJyAoA/7rny4tV9Va7feO8YtGuXcCUC7a6Au2WJyLyILQVi1T1BgA3+D1OVFQBL70RDHIiiru2fjgXn4BIRGnSloHOICeiNGqrZ7ksXOgc5u9//1tDn0RESdMWgX788SbIn3ii/vZly0yIP/RQuHUREQUp1YFeLpsgf+CB+tuPPtoE+Zo14dZFRNQKqQz0W281Qf4Fmxs3zz3XBPmGDeHWRUTUSqkaFF2zBvjYx+y3n3oqsHp1ePUQEYUpFVfo69aZK3K7ML/wQnNFzjAnojRL9BX6Aw+YAU873/8+sGc9CiKi1EvkFfof/2iuyO3C/MorzRU5w5yI2kmiAv2xx0yQL15cf/vll5sgv/jicOsiIoqDRHS5qAKFAjA6Wn/7174G/Pu/e7sDlIgorRIR6DfdVD/Mv/xl4JprGOREREBCAv2AA2q/XrECuO46b09JJCJqF4kI9JNPBn7/e+C554BTTgE6OqKuiIgofhIR6ABwwglRV0BEFG++Oy1E5OggCiEiIn+aDnQRuUhEtgBYH2A9RETUJD9X6EMA3h9UIURE5E/Tfeiqeg8ACOcMEhHFQssn/olISUSGRGRobGys1c0REbUt10AXkf+aCORJL88DoapaVtVeVe2dN2+ev2qJiMiWa5eLqtosE0FERHEi6nNFZBHZpaqe+uJFZAzAiK8G0yUP4Pmoi4gRno9aPB+12vl8WKrq2sXR9KCoiPwAwD8C6BCRzQDuUNXznD7jpaB
2IiJDqtobdR1xwfNRi+ejFs+HOz+zXPqCLISIiPzh462IiFKCgR6tctQFxAzPRy2ej1o8Hy58D4oSEVE88AqdiCglGOhEFFsisq+ILIi6jqRgoEdERLIi8riI/CjqWuJARPYXkZtF5CkR2SIi2ahripKIXCgifxGRrSLypajrCZuI7CcivwDwLIBLJr3/FREZFZFNIvKx6CqMp8QscJFClwIYjrqIGPkegI0AzgSwD4A3oy0nOiJSAHA+gEUAZgJ4UkRuUNXtUdYVsirM/4lfAfggAIjI4QC+BHNe3g7gdyJiqWrb/l+ZilfoERCRIwEcC+CWqGuJAxE5GMDxAFapsVPbe7R+IqCqMBddrwF4I7pywqeqr6nqnQB2TXr70wBuUdVXVfVxmAuixVHUF1cM9JCJed7wNQC+EnUtMbIIwFYAq/f8Kf1taePnMqvqUwAuA/AggN8BOJNXoQDMVfnkR4dsA3BIRLXEEgM9fF8EsE5VN0ddSIwcCOAoAOcBeB+AEwAsj7SiCInIfgD+CeaX/lUALhYRdo8CWZi/WiZUAeyOqJZY4n+S8J0FYLaInAZgLoAuEdmkqldGXFeUngOwXlW3AYCIrAWwMNqSIvU5AH9S1XUA1onIp2Gem3RHpFVF72kAh076+jAAf42olljiFXrIVPV4VX23qh4D4N8A3NrmYQ6YroWjRORtIrIPgJNgljhsVzsBHCMinSIyG8ACAH+PuKY4+DWAM0Qkt2ccai6ADRHXFCu8QqfIqep2ETkPwFqYGS43qOrdEZcVpUEAHwHwJIAdAG5U1QejLSlce36RPQJgNoCZInIigM/DnJvHYH7pndvmg+fT8NZ/IqKUYJcLEVFKMNCJiFKCgU5ElBIMdCKilGCgExGlBAOdiCglGOhERCnBQCciSgkGOhFRSvw/1Z5KVW1nE4AAAAAASUVORK5CYII=\n", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "# Plot outputs\n", - "plt.scatter(Xs_test, y_test, color='black')\n", - "plt.plot(Xs_test, regr.predict(Xs_test), color='blue', linewidth=3)\n", - "plt.show()" - ] - }, - { - "cell_type": "code", - "execution_count": 34, - "metadata": { - "ExecuteTime": { - "end_time": "2018-04-29T07:28:41.441426Z", - "start_time": "2018-04-29T07:28:41.428476Z" - }, - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "-0.68370073919430563" - ] - }, - "execution_count": 34, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from sklearn.cross_validation import cross_val_score\n", - "\n", - "regr = linear_model.LinearRegression()\n", - "scores = cross_val_score(regr, df.click_log, \\\n", - " df.reply_log, cv = 3)\n", - "scores.mean() " - ] - }, - { - "cell_type": "code", - "execution_count": 35, - "metadata": { - "ExecuteTime": { - "end_time": "2018-04-29T07:29:00.237224Z", - "start_time": "2018-04-29T07:29:00.220565Z" - }, - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "-0.71881497228209845" - ] - }, - "execution_count": 35, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "regr = linear_model.LinearRegression()\n", - "scores = cross_val_score(regr, df.click_log, \n", - " df.reply_log, cv =5)\n", - "scores.mean() " - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "slide" - } - }, - "source": [ - "> # 使用sklearn做logistic回归\n", - "***\n", - "\n", - "王成军\n", - "\n", - "wangchengjun@nju.edu.cn\n", - "\n", - "计算传播网 http://computational-communication.com" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "subslide" - } - }, - "source": [ - "- logistic回归是一个分类算法而不是一个回归算法。\n", - "- 可根据已知的一系列因变量估计离散数值(比方说二进制数值 0 或 1 ,是或否,真或假)。\n", - "- 简单来说,它通过将数据拟合进一个逻辑函数(logistic 
function)来预估一个事件出现的概率。\n", - "- 因此,它也被叫做逻辑回归。因为它预估的是概率,所以它的输出值大小在 0 和 1 之间(正如所预计的一样)。" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "subslide" - } - }, - "source": [ - "$$odds= \\frac{p}{1-p} = \\frac{probability\\: of\\: event\\: occurrence} {probability \\:of \\:not\\: event\\: occurrence}$$" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "fragment" - } - }, - "source": [ - "$$ln(odds)= ln(\\frac{p}{1-p})$$" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "fragment" - } - }, - "source": [ - "$$logit(x) = ln(\\frac{p}{1-p}) = b_0+b_1X_1+b_2X_2+b_3X_3....+b_kX_k$$" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "subslide" - } - }, - "source": [ - "![](./img/logistic.jpg)" - ] - }, - { - "cell_type": "code", - "execution_count": 50, - "metadata": { - "ExecuteTime": { - "end_time": "2018-04-29T07:46:50.277195Z", - "start_time": "2018-04-29T07:46:50.272229Z" - }, - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [], - "source": [ - "repost = []\n", - "for i in df.title:\n", - " if u'转载' in i:\n", - " repost.append(1)\n", - " else:\n", - " repost.append(0)" - ] - }, - { - "cell_type": "code", - "execution_count": 51, - "metadata": { - "ExecuteTime": { - "end_time": "2018-04-29T07:47:06.292994Z", - "start_time": "2018-04-29T07:47:06.270715Z" - }, - "slideshow": { - "slide_type": "fragment" - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "[[194675, 2703], [88244, 1041], [82779, 625]]" - ] - }, - "execution_count": 51, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "data_X = [[df.click[i], df.reply[i]] for i in range(len(df))]\n", - "data_X[:3]" - ] - }, - { - "cell_type": "code", - "execution_count": 52, - "metadata": { - "ExecuteTime": { - "end_time": "2018-04-29T07:47:45.269303Z", - "start_time": "2018-04-29T07:47:45.259792Z" - }, - "slideshow": 
{ - "slide_type": "subslide" - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "0.61241970021413272" - ] - }, - "execution_count": 52, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from sklearn.linear_model import LogisticRegression\n", - "df['repost'] = repost\n", - "model = LogisticRegression()\n", - "model.fit(data_X,df.repost)\n", - "model.score(data_X,df.repost)" - ] - }, - { - "cell_type": "code", - "execution_count": 53, - "metadata": { - "ExecuteTime": { - "end_time": "2018-04-29T07:47:59.648431Z", - "start_time": "2018-04-29T07:47:59.633936Z" - }, - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [], - "source": [ - "def randomSplitLogistic(dataX, dataY, num):\n", - " dataX_train = []\n", - " dataX_test = []\n", - " dataY_train = []\n", - " dataY_test = []\n", - " import random\n", - " test_index = random.sample(range(len(df)), num)\n", - " for k in range(len(dataX)):\n", - " if k in test_index:\n", - " dataX_test.append(dataX[k])\n", - " dataY_test.append(dataY[k])\n", - " else:\n", - " dataX_train.append(dataX[k])\n", - " dataY_train.append(dataY[k])\n", - " return dataX_train, dataX_test, dataY_train, dataY_test, " - ] - }, - { - "cell_type": "code", - "execution_count": 54, - "metadata": { - "ExecuteTime": { - "end_time": "2018-04-29T07:48:27.726443Z", - "start_time": "2018-04-29T07:48:27.710922Z" - }, - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "'Variance score: 0.45'" - ] - }, - "execution_count": 54, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Split the data into training/testing sets\n", - "data_X_train, data_X_test, data_y_train, data_y_test = randomSplitLogistic(data_X, df.repost, 20)\n", - "# Create logistic regression object\n", - "log_regr = LogisticRegression()\n", - "# Train the model using the training sets\n", - "log_regr.fit(data_X_train, data_y_train)\n", - "# Explained variance score: 
1 is perfect prediction\n", - "'Variance score: %.2f' % log_regr.score(data_X_test, data_y_test)" - ] - }, - { - "cell_type": "code", - "execution_count": 55, - "metadata": { - "ExecuteTime": { - "end_time": "2018-04-29T07:48:56.873331Z", - "start_time": "2018-04-29T07:48:56.870219Z" - }, - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [], - "source": [ - "y_true, y_pred = data_y_test, log_regr.predict(data_X_test)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 43, - "metadata": { - "ExecuteTime": { - "end_time": "2018-04-29T07:39:12.344043Z", - "start_time": "2018-04-29T07:39:12.338223Z" - }, - "slideshow": { - "slide_type": "fragment" - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "([1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],\n", - " array([0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]))" - ] - }, - "execution_count": 43, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "y_true, y_pred" - ] - }, - { - "cell_type": "code", - "execution_count": 44, - "metadata": { - "ExecuteTime": { - "end_time": "2018-04-29T07:39:13.175680Z", - "start_time": "2018-04-29T07:39:13.171386Z" - }, - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " precision recall f1-score support\n", - "\n", - " 0 0.50 0.17 0.25 6\n", - " 1 0.72 0.93 0.81 14\n", - "\n", - "avg / total 0.66 0.70 0.64 20\n", - "\n" - ] - } - ], - "source": [ - "print(classification_report(y_true, y_pred))" - ] - }, - { - "cell_type": "code", - "execution_count": 56, - "metadata": { - "ExecuteTime": { - "end_time": "2018-04-29T07:51:43.039620Z", - "start_time": "2018-04-29T07:51:43.034812Z" - }, - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [], - "source": [ - "from sklearn.cross_validation import train_test_split\n", - "Xs_train, Xs_test, y_train, y_test = train_test_split(data_X, df.repost, test_size=0.2, 
random_state=42)" - ] - }, - { - "cell_type": "code", - "execution_count": 57, - "metadata": { - "ExecuteTime": { - "end_time": "2018-04-29T07:51:47.690742Z", - "start_time": "2018-04-29T07:51:47.683127Z" - }, - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "'Variance score: 0.60'" - ] - }, - "execution_count": 57, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Create logistic regression object\n", - "log_regr = LogisticRegression()\n", - "# Train the model using the training sets\n", - "log_regr.fit(Xs_train, y_train)\n", - "# Explained variance score: 1 is perfect prediction\n", - "'Variance score: %.2f' % log_regr.score(Xs_test, y_test)" - ] - }, - { - "cell_type": "code", - "execution_count": 58, - "metadata": { - "ExecuteTime": { - "end_time": "2018-04-29T07:51:55.780061Z", - "start_time": "2018-04-29T07:51:55.771924Z" - }, - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Logistic score for test set: 0.595745\n", - "Logistic score for training set: 0.613941\n", - " precision recall f1-score support\n", - "\n", - " 0 1.00 0.03 0.05 39\n", - " 1 0.59 1.00 0.74 55\n", - "\n", - "avg / total 0.76 0.60 0.46 94\n", - "\n" - ] - } - ], - "source": [ - "print('Logistic score for test set: %f' % log_regr.score(Xs_test, y_test))\n", - "print('Logistic score for training set: %f' % log_regr.score(Xs_train, y_train))\n", - "y_true, y_pred = y_test, log_regr.predict(Xs_test)\n", - "print(classification_report(y_true, y_pred))" - ] - }, - { - "cell_type": "code", - "execution_count": 59, - "metadata": { - "ExecuteTime": { - "end_time": "2018-04-29T07:52:53.880925Z", - "start_time": "2018-04-29T07:52:53.866672Z" - }, - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "0.53333333333333333" - ] - }, - "execution_count": 59, - "metadata": {}, - 
"output_type": "execute_result" - } - ], - "source": [ - "logre = LogisticRegression()\n", - "scores = cross_val_score(logre, data_X, df.repost, cv = 3)\n", - "scores.mean() " - ] - }, - { - "cell_type": "code", - "execution_count": 60, - "metadata": { - "ExecuteTime": { - "end_time": "2018-04-29T07:53:26.825100Z", - "start_time": "2018-04-29T07:53:26.810871Z" - }, - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "0.62948717948717947" - ] - }, - "execution_count": 60, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "logre = LogisticRegression()\n", - "data_X_scale = scale(data_X)\n", - "# The importance of preprocessing in data science and the machine learning pipeline I: \n", - "scores = cross_val_score(logre, data_X_scale, df.repost, cv = 3)\n", - "scores.mean() " - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "slide" - } - }, - "source": [ - "> # 使用sklearn实现贝叶斯预测\n", - "***\n", - "\n", - "王成军\n", - "\n", - "wangchengjun@nju.edu.cn\n", - "\n", - "计算传播网 http://computational-communication.com" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "subslide" - } - }, - "source": [ - "# Naive Bayes algorithm\n", - "\n", - "It is a classification technique based on Bayes’ Theorem with an assumption of independence among predictors. \n", - "\n", - "In simple terms, a Naive Bayes classifier assumes that the presence of a particular feature in a class is unrelated to the presence of any other feature. \n", - "\n", - "why it is known as ‘Naive’? For example, a fruit may be considered to be an apple if it is red, round, and about 3 inches in diameter. Even if these features depend on each other or upon the existence of the other features, all of these properties independently contribute to the probability that this fruit is an apple." 
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "subslide" - } - }, - "source": [ - "贝叶斯定理为使用$p(c)$, $p(x)$, $p(x|c)$ 计算后验概率$P(c|x)$提供了方法:" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "fragment" - } - }, - "source": [ - "$$\n", - "p(c|x) = \\frac{p(x|c) p(c)}{p(x)}\n", - "$$" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "fragment" - } - }, - "source": [ - "- P(c|x) is the posterior probability of class (c, target) given predictor (x, attributes).\n", - "- P(c) is the prior probability of class.\n", - "- P(x|c) is the likelihood which is the probability of predictor given class.\n", - "- P(x) is the prior probability of predictor." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "subslide" - } - }, - "source": [ - "![](./img/Bayes_41.png)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "subslide" - } - }, - "source": [ - "Step 1: Convert the data set into a frequency table\n", - "\n", - "Step 2: Create Likelihood table by finding the probabilities like:\n", - "- p(Overcast) = 0.29, p(rainy) = 0.36, p(sunny) = 0.36\n", - "- p(playing) = 0.64, p(rest) = 0.36\n", - "\n", - "Step 3: Now, use Naive Bayesian equation to calculate the posterior probability for each class. The class with the highest posterior probability is the outcome of prediction." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "subslide" - } - }, - "source": [ - "## Problem: Players will play if weather is sunny. 
Is this statement is correct?\n", - "\n", - "We can solve it using above discussed method of posterior probability.\n", - "\n", - "$P(Yes | Sunny) = \\frac{P( Sunny | Yes) * P(Yes) } {P (Sunny)}$\n", - "\n", - "Here we have P (Sunny |Yes) = 3/9 = 0.33, P(Sunny) = 5/14 = 0.36, P( Yes)= 9/14 = 0.64\n", - "\n", - "Now, $P (Yes | Sunny) = \\frac{0.33 * 0.64}{0.36} = 0.60$, which has higher probability." - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": { - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "'ABCMeta BaseDiscreteNB BaseEstimator BaseNB BernoulliNB ClassifierMixin GaussianNB LabelBinarizer MultinomialNB __all__ __builtins__ __doc__ __file__ __name__ __package__ _check_partial_fit_first_call abstractmethod binarize check_X_y check_array check_is_fitted in1d issparse label_binarize logsumexp np safe_sparse_dot six'" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from sklearn import naive_bayes\n", - "' '.join(dir(naive_bayes)) " - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "fragment" - } - }, - "source": [ - "- naive_bayes.GaussianNB\tGaussian Naive Bayes (GaussianNB)\n", - "- naive_bayes.MultinomialNB([alpha, ...])\tNaive Bayes classifier for multinomial models\n", - "- naive_bayes.BernoulliNB([alpha, binarize, ...])\tNaive Bayes classifier for multivariate Bernoulli models." 
- ] - }, - { - "cell_type": "code", - "execution_count": 61, - "metadata": { - "ExecuteTime": { - "end_time": "2018-04-29T08:02:37.644606Z", - "start_time": "2018-04-29T08:02:37.635952Z" - }, - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [], - "source": [ - "#Import Library of Gaussian Naive Bayes model\n", - "from sklearn.naive_bayes import GaussianNB\n", - "import numpy as np\n", - "\n", - "#assigning predictor and target variables\n", - "x= np.array([[-3,7],[1,5], [1,2], [-2,0], [2,3], [-4,0], [-1,1], [1,1], [-2,2], [2,7], [-4,1], [-2,7]])\n", - "Y = np.array([3, 3, 3, 3, 4, 3, 3, 4, 3, 4, 4, 4])" - ] - }, - { - "cell_type": "code", - "execution_count": 62, - "metadata": { - "ExecuteTime": { - "end_time": "2018-04-29T08:02:52.828101Z", - "start_time": "2018-04-29T08:02:52.818463Z" - }, - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "array([4, 3])" - ] - }, - "execution_count": 62, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "#Create a Gaussian Classifier\n", - "model = GaussianNB()\n", - "\n", - "# Train the model using the training sets \n", - "model.fit(x[:8], Y[:8])\n", - "\n", - "#Predict Output \n", - "predicted= model.predict([[1,2],[3,4]])\n", - "predicted" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "subslide" - } - }, - "source": [ - "# cross-validation \n", - " \n", - "k-fold CV, the training set is split into k smaller sets (other approaches are described below, but generally follow the same principles). The following procedure is followed for each of the k “folds”:\n", - "- A model is trained using k-1 of the folds as training data;\n", - "- the resulting model is validated on the remaining part of the data (i.e., it is used as a test set to compute a performance measure such as accuracy)." 
- ] - }, - { - "cell_type": "code", - "execution_count": 63, - "metadata": { - "ExecuteTime": { - "end_time": "2018-04-29T08:04:04.297675Z", - "start_time": "2018-04-29T08:04:04.273413Z" - }, - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "array([41, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", - " 0, 0, 0])" - ] - }, - "execution_count": 63, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "data_X_train, data_X_test, data_y_train, data_y_test = randomSplit(df.click, df.reply, 20)\n", - "# Train the model using the training sets \n", - "model.fit(data_X_train, data_y_train)\n", - "\n", - "#Predict Output \n", - "predicted= model.predict(data_X_test)\n", - "predicted" - ] - }, - { - "cell_type": "code", - "execution_count": 64, - "metadata": { - "ExecuteTime": { - "end_time": "2018-04-29T08:04:34.184513Z", - "start_time": "2018-04-29T08:04:34.178511Z" - }, - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "0.65000000000000002" - ] - }, - "execution_count": 64, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "model.score(data_X_test, data_y_test)" - ] - }, - { - "cell_type": "code", - "execution_count": 66, - "metadata": { - "ExecuteTime": { - "end_time": "2018-04-29T08:05:04.297453Z", - "start_time": "2018-04-29T08:05:04.249311Z" - }, - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/datalab/Applications/anaconda/lib/python3.5/site-packages/sklearn/cross_validation.py:516: Warning: The least populated class in y has only 1 members, which is too few. 
The minimum number of labels for any class cannot be less than n_folds=7.\n", - " % (min_labels, self.n_folds)), Warning)\n" - ] - }, - { - "data": { - "text/plain": [ - "0.53413410073295453" - ] - }, - "execution_count": 66, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from sklearn.cross_validation import cross_val_score\n", - "\n", - "model = GaussianNB()\n", - "scores = cross_val_score(model, [[c] for c in df.click],\\\n", - " df.reply, cv = 7)\n", - "scores.mean() " - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "slide" - } - }, - "source": [ - "> # 使用sklearn实现决策树\n", - "***\n", - "\n", - "王成军\n", - "\n", - "wangchengjun@nju.edu.cn\n", - "\n", - "计算传播网 http://computational-communication.com" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "subslide" - } - }, - "source": [ - "# 决策树\n", - "- 这个监督式学习算法通常被用于分类问题。\n", - "- 它同时适用于分类变量和连续因变量。\n", - "- 在这个算法中,我们将总体分成两个或更多的同类群。\n", - "- 这是根据最重要的属性或者自变量来分成尽可能不同的组别。\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "subslide" - } - }, - "source": [ - "![](./img/tree.png)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "subslide" - } - }, - "source": [ - "![](./img/playtree.jpg)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "subslide" - } - }, - "source": [ - "## 在上图中你可以看到,根据多种属性,人群被分成了不同的四个小组,来判断 “他们会不会去玩”。\n", - "### 为了把总体分成不同组别,需要用到许多技术,比如说 Gini、Information Gain、Chi-square、entropy。" - ] - }, - { - "cell_type": "code", - "execution_count": 67, - "metadata": { - "ExecuteTime": { - "end_time": "2018-04-29T08:10:20.871345Z", - "start_time": "2018-04-29T08:10:20.855125Z" - }, - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [], - "source": [ - "from sklearn import tree\n", - "model = tree.DecisionTreeClassifier(criterion='gini')" - ] - }, - { - "cell_type": "code", - 
"execution_count": 68, - "metadata": { - "ExecuteTime": { - "end_time": "2018-04-29T08:10:49.988277Z", - "start_time": "2018-04-29T08:10:49.973060Z" - }, - "slideshow": { - "slide_type": "fragment" - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "0.91275167785234901" - ] - }, - "execution_count": 68, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "data_X_train, data_X_test, data_y_train, data_y_test = randomSplitLogistic(data_X, df.repost, 20)\n", - "model.fit(data_X_train,data_y_train)\n", - "model.score(data_X_train,data_y_train)" - ] - }, - { - "cell_type": "code", - "execution_count": 69, - "metadata": { - "ExecuteTime": { - "end_time": "2018-04-29T08:11:12.730866Z", - "start_time": "2018-04-29T08:11:12.725782Z" - }, - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "array([0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0])" - ] - }, - "execution_count": 69, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Predict\n", - "model.predict(data_X_test)" - ] - }, - { - "cell_type": "code", - "execution_count": 70, - "metadata": { - "ExecuteTime": { - "end_time": "2018-04-29T08:11:28.411441Z", - "start_time": "2018-04-29T08:11:28.397481Z" - }, - "slideshow": { - "slide_type": "fragment" - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "0.33461538461538459" - ] - }, - "execution_count": 70, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# crossvalidation\n", - "scores = cross_val_score(model, data_X, df.repost, cv = 3)\n", - "scores.mean() " - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "slide" - } - }, - "source": [ - "> # 使用sklearn实现SVM支持向量机\n", - "***\n", - "\n", - "王成军\n", - "\n", - "wangchengjun@nju.edu.cn\n", - "\n", - "计算传播网 http://computational-communication.com" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - 
"slide_type": "subslide" - } - }, - "source": [ - "![](./img/svm.jpg)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "subslide" - } - }, - "source": [ - "- 将每个数据在N维空间中用点标出(N是你所有的特征总数),每个特征的值是一个坐标的值。\n", - " - 举个例子,如果我们只有身高和头发长度两个特征,我们会在二维空间中标出这两个变量,每个点有两个坐标(这些坐标叫做支持向量)。" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "subslide" - } - }, - "source": [ - "![](./img/xyplot.png)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "subslide" - } - }, - "source": [ - "- 现在,我们会找到将两组不同数据分开的一条直线。\n", - " - 两个分组中距离最近的两个点到这条线的距离同时最优化。" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "subslide" - } - }, - "source": [ - "![](./img/sumintro.png)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "subslide" - } - }, - "source": [ - "## 上面示例中的黑线将数据分类优化成两个小组\n", - "- 两组中距离最近的点(图中A、B点)到达黑线的距离满足最优条件。\n", - " - 这条直线就是我们的分割线。接下来,测试数据落到直线的哪一边,我们就将它分到哪一类去。" - ] - }, - { - "cell_type": "code", - "execution_count": 71, - "metadata": { - "ExecuteTime": { - "end_time": "2018-04-29T08:17:29.788250Z", - "start_time": "2018-04-29T08:17:29.785022Z" - } - }, - "outputs": [], - "source": [ - "from sklearn import svm\n", - "# Create SVM classification object \n", - "model=svm.SVC() " - ] - }, - { - "cell_type": "code", - "execution_count": 72, - "metadata": { - "ExecuteTime": { - "end_time": "2018-04-29T08:17:31.035310Z", - "start_time": "2018-04-29T08:17:31.030713Z" - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "'LinearSVC LinearSVR NuSVC NuSVR OneClassSVM SVC SVR __all__ __builtins__ __cached__ __doc__ __file__ __loader__ __name__ __package__ __path__ __spec__ base bounds classes l1_min_c liblinear libsvm libsvm_sparse'" - ] - }, - "execution_count": 72, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "' '.join(dir(svm))" - ] - }, - { - "cell_type": 
"code", - "execution_count": 73, - "metadata": { - "ExecuteTime": { - "end_time": "2018-04-29T08:17:41.872379Z", - "start_time": "2018-04-29T08:17:41.849759Z" - }, - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "0.90380313199105144" - ] - }, - "execution_count": 73, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "data_X_train, data_X_test, data_y_train, data_y_test = randomSplitLogistic(data_X, df.repost, 20)\n", - "model.fit(data_X_train,data_y_train)\n", - "model.score(data_X_train,data_y_train)" - ] - }, - { - "cell_type": "code", - "execution_count": 74, - "metadata": { - "ExecuteTime": { - "end_time": "2018-04-29T08:17:47.661313Z", - "start_time": "2018-04-29T08:17:47.655841Z" - }, - "slideshow": { - "slide_type": "fragment" - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1])" - ] - }, - "execution_count": 74, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Predict\n", - "model.predict(data_X_test)" - ] - }, - { - "cell_type": "code", - "execution_count": 75, - "metadata": { - "ExecuteTime": { - "end_time": "2018-04-29T08:18:00.419986Z", - "start_time": "2018-04-29T08:17:58.671257Z" - }, - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [], - "source": [ - "# crossvalidation\n", - "scores = []\n", - "cvs = [3, 5, 10, 25, 50, 75, 100]\n", - "for i in cvs:\n", - " score = cross_val_score(model, data_X, df.repost,\n", - " cv = i)\n", - " scores.append(score.mean() ) # Try to tune cv\n", - " " - ] - }, - { - "cell_type": "code", - "execution_count": 76, - "metadata": { - "ExecuteTime": { - "end_time": "2018-04-29T08:18:05.493658Z", - "start_time": "2018-04-29T08:18:05.359658Z" - }, - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [ - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "plt.plot(cvs, scores, 'b-o')\n", - "plt.xlabel('$cv$', fontsize = 20)\n", - "plt.ylabel('$Score$', fontsize = 20)\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "slide" - } - }, - "source": [ - "\n", - "\n", - "> # 泰坦尼克号数据分析\n", - "\n", - "王成军\n", - "\n", - "wangchengjun@nju.edu.cn\n", - "\n", - "计算传播网 http://computational-communication.com" - ] - }, - { - "cell_type": "code", - "execution_count": 40, - "metadata": { - "ExecuteTime": { - "end_time": "2018-05-29T07:31:28.492497Z", - "start_time": "2018-05-29T07:31:28.488728Z" - }, - "slideshow": { - "slide_type": "slide" - } - }, - "outputs": [], - "source": [ - "import numpy as np\n", - "from sklearn import tree\n", - "import warnings \n", - "warnings.filterwarnings(\"ignore\") \n" - ] - }, - { - "cell_type": "code", - "execution_count": 41, - "metadata": { - "ExecuteTime": { - "end_time": "2018-05-29T07:31:48.879245Z", - "start_time": "2018-05-29T07:31:48.872163Z" - }, - "slideshow": { - "slide_type": "slide" - } - }, - "outputs": [], - "source": [ - "import pandas as pd\n", - "train = pd.read_csv('../data/tatanic_train.csv', sep = \",\")" - ] - }, - { - "cell_type": "code", - "execution_count": 42, - "metadata": { - "ExecuteTime": { - "end_time": "2018-05-29T07:31:52.234171Z", - "start_time": "2018-05-29T07:31:52.216747Z" - }, - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Unnamed: 0PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarked
00103Braund, Mr. Owen Harrismale22.010A/5 211717.2500NaNS
11211Cumings, Mrs. John Bradley (Florence Briggs Th...female38.010PC 1759971.2833C85C
22313Heikkinen, Miss. Lainafemale26.000STON/O2. 31012827.9250NaNS
33411Futrelle, Mrs. Jacques Heath (Lily May Peel)female35.01011380353.1000C123S
44503Allen, Mr. William Henrymale35.0003734508.0500NaNS
\n", - "
" - ], - "text/plain": [ - " Unnamed: 0 PassengerId Survived Pclass \\\n", - "0 0 1 0 3 \n", - "1 1 2 1 1 \n", - "2 2 3 1 3 \n", - "3 3 4 1 1 \n", - "4 4 5 0 3 \n", - "\n", - " Name Sex Age SibSp \\\n", - "0 Braund, Mr. Owen Harris male 22.0 1 \n", - "1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 \n", - "2 Heikkinen, Miss. Laina female 26.0 0 \n", - "3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 \n", - "4 Allen, Mr. William Henry male 35.0 0 \n", - "\n", - " Parch Ticket Fare Cabin Embarked \n", - "0 0 A/5 21171 7.2500 NaN S \n", - "1 0 PC 17599 71.2833 C85 C \n", - "2 0 STON/O2. 3101282 7.9250 NaN S \n", - "3 0 113803 53.1000 C123 S \n", - "4 0 373450 8.0500 NaN S " - ] - }, - "execution_count": 42, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "train.head() " - ] - }, - { - "cell_type": "code", - "execution_count": 39, - "metadata": { - "ExecuteTime": { - "end_time": "2018-05-29T07:28:58.070575Z", - "start_time": "2018-05-29T07:28:57.897862Z" - }, - "slideshow": { - "slide_type": "slide" - } - }, - "outputs": [], - "source": [ - "train[\"Age\"] = train[\"Age\"].fillna(train[\"Age\"].median())\n", - "train[\"Fare\"] = train[\"Fare\"].fillna(train[\"Fare\"].median())\n", - "#Convert the male and female groups to integer form\n", - "train[\"Sex\"][train[\"Sex\"] == \"male\"] = 0\n", - "train[\"Sex\"][train[\"Sex\"] == \"female\"] = 1\n", - "#Impute the Embarked variable\n", - "train[\"Embarked\"] = train[\"Embarked\"].fillna('S')\n", - "#Convert the Embarked classes to integer form\n", - "train[\"Embarked\"][train[\"Embarked\"] == \"S\"] = 0\n", - "train[\"Embarked\"][train[\"Embarked\"] == \"C\"] = 1\n", - "train[\"Embarked\"][train[\"Embarked\"] == \"Q\"] = 2" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "metadata": { - "ExecuteTime": { - "end_time": "2018-05-29T07:28:08.358884Z", - "start_time": "2018-05-29T07:28:08.346226Z" - }, - "slideshow": { - "slide_type": "slide" - } - }, - 
"outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[ 0.12294397 0.31274009 0.23680307 0.32751287]\n", - "0.977553310887\n" - ] - } - ], - "source": [ - "#Create the target and features numpy arrays: target, features_one\n", - "target = train['Survived'].values\n", - "features_one = train[[\"Pclass\", \"Sex\", \"Age\", \"Fare\"]].values\n", - "\n", - "#Fit your first decision tree: my_tree_one\n", - "my_tree_one = tree.DecisionTreeClassifier()\n", - "my_tree_one = my_tree_one.fit(features_one, target)\n", - "#Look at the importance of the included features and print the score\n", - "print(my_tree_one.feature_importances_)\n", - "print(my_tree_one.score(features_one, target))" - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "metadata": { - "ExecuteTime": { - "end_time": "2018-05-29T07:28:15.915998Z", - "start_time": "2018-05-29T07:28:15.705994Z" - }, - "slideshow": { - "slide_type": "slide" - } - }, - "outputs": [], - "source": [ - "test = pd.read_csv('../data/tatanic_test.csv', sep = \",\")\n", - "# Impute the missing value with the median\n", - "test.Fare[152] = test.Fare.median()\n", - "test[\"Age\"] = test[\"Age\"].fillna(test[\"Age\"].median())\n", - "#Convert the male and female groups to integer form\n", - "test[\"Sex\"][test[\"Sex\"] == \"male\"] = 0\n", - "test[\"Sex\"][test[\"Sex\"] == \"female\"] = 1\n", - "\n", - "#Impute the Embarked variable\n", - "test[\"Embarked\"] = test[\"Embarked\"].fillna('S')\n", - "#Convert the Embarked classes to integer form\n", - "test[\"Embarked\"][test[\"Embarked\"] == \"S\"] = 0\n", - "test[\"Embarked\"][test[\"Embarked\"] == \"C\"] = 1\n", - "test[\"Embarked\"][test[\"Embarked\"] == \"Q\"] = 2\n", - "\n", - "# Extract the features from the test set: Pclass, Sex, Age, and Fare.\n", - "test_features = test[[\"Pclass\",\"Sex\", \"Age\", \"Fare\"]].values\n", - "\n", - "# Make your prediction using the test set\n", - "my_prediction = my_tree_one.predict(test_features)\n", - "\n", - 
"# Create a data frame with two columns: PassengerId & Survived. Survived contains your predictions\n", - "PassengerId =np.array(test['PassengerId']).astype(int)\n", - "my_solution = pd.DataFrame(my_prediction, PassengerId, columns = [\"Survived\"])\n" - ] - }, - { - "cell_type": "code", - "execution_count": 33, - "metadata": { - "ExecuteTime": { - "end_time": "2018-05-29T07:28:18.081288Z", - "start_time": "2018-05-29T07:28:18.074414Z" - }, - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Survived
8920
8930
8941
\n", - "
" - ], - "text/plain": [ - " Survived\n", - "892 0\n", - "893 0\n", - "894 1" - ] - }, - "execution_count": 33, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "my_solution[:3]" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": { - "ExecuteTime": { - "end_time": "2018-05-29T07:25:44.488717Z", - "start_time": "2018-05-29T07:25:44.484381Z" - }, - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "(418, 1)" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Check that your data frame has 418 entries\n", - "my_solution.shape" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "metadata": { - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [], - "source": [ - "# Write your solution to a csv file with the name my_solution.csv \n", - "my_solution.to_csv(\"../data/tatanic_solution_one.csv\", \n", - " index_label = [\"PassengerId\"])" - ] - }, - { - "cell_type": "code", - "execution_count": 34, - "metadata": { - "ExecuteTime": { - "end_time": "2018-05-29T07:28:26.996353Z", - "start_time": "2018-05-29T07:28:26.982601Z" - }, - "slideshow": { - "slide_type": "slide" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "0.905723905724\n" - ] - } - ], - "source": [ - "# Create a new array with the added features: features_two\n", - "features_two = train[[\"Pclass\",\"Age\",\"Sex\",\"Fare\",\\\n", - " \"SibSp\", \"Parch\", \"Embarked\"]].values\n", - "\n", - "#Control overfitting by setting \"max_depth\" to 10 and \"min_samples_split\" to 5 : my_tree_two\n", - "max_depth = 10\n", - "min_samples_split = 5\n", - "my_tree_two = tree.DecisionTreeClassifier(max_depth = max_depth, \n", - " min_samples_split = min_samples_split, \n", - " random_state = 1)\n", - "my_tree_two = my_tree_two.fit(features_two, target)\n", - "\n", - "#Print the score of the new 
decison tree\n", - "print(my_tree_two.score(features_two, target))" - ] - }, - { - "cell_type": "code", - "execution_count": 35, - "metadata": { - "ExecuteTime": { - "end_time": "2018-05-29T07:28:28.033226Z", - "start_time": "2018-05-29T07:28:28.018293Z" - }, - "slideshow": { - "slide_type": "slide" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "0.979797979798\n" - ] - } - ], - "source": [ - "# create a new train set with the new variable\n", - "train_two = train\n", - "train_two['family_size'] = train.SibSp + train.Parch + 1\n", - "\n", - "# Create a new decision tree my_tree_three\n", - "features_three = train[[\"Pclass\", \"Sex\", \"Age\", \\\n", - " \"Fare\", \"SibSp\", \"Parch\", \"family_size\"]].values\n", - "\n", - "my_tree_three = tree.DecisionTreeClassifier()\n", - "my_tree_three = my_tree_three.fit(features_three, target)\n", - "\n", - "# Print the score of this decision tree\n", - "print(my_tree_three.score(features_three, target))\n" - ] - }, - { - "cell_type": "code", - "execution_count": 36, - "metadata": { - "ExecuteTime": { - "end_time": "2018-05-29T07:28:32.678968Z", - "start_time": "2018-05-29T07:28:32.465958Z" - }, - "slideshow": { - "slide_type": "slide" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "0.939393939394\n", - "418\n", - "[0 0 0]\n" - ] - } - ], - "source": [ - "#Import the `RandomForestClassifier`\n", - "from sklearn.ensemble import RandomForestClassifier\n", - "\n", - "#We want the Pclass, Age, Sex, Fare,SibSp, Parch, and Embarked variables\n", - "features_forest = train[[\"Pclass\", \"Age\", \"Sex\", \"Fare\", \"SibSp\", \"Parch\", \"Embarked\"]].values\n", - "\n", - "#Building the Forest: my_forest\n", - "n_estimators = 100\n", - "forest = RandomForestClassifier(max_depth = 10, min_samples_split=2, \n", - " n_estimators = n_estimators, random_state = 1)\n", - "my_forest = forest.fit(features_forest, target)\n", - "\n", - "#Print the score of the 
random forest\n", - "print(my_forest.score(features_forest, target))\n", - "\n", - "#Compute predictions and print the length of the prediction vector:test_features, pred_forest\n", - "test_features = test[[\"Pclass\", \"Age\", \"Sex\", \"Fare\", \"SibSp\", \"Parch\", \"Embarked\"]].values\n", - "pred_forest = my_forest.predict(test_features)\n", - "print(len(test_features))\n", - "print(pred_forest[:3])" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": { - "ExecuteTime": { - "end_time": "2018-05-29T07:26:25.602062Z", - "start_time": "2018-05-29T07:26:25.572689Z" - }, - "slideshow": { - "slide_type": "slide" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[ 0.14130255 0.17906027 0.41616727 0.17938711 0.05039699 0.01923751\n", - " 0.0144483 ]\n", - "[ 0.10384741 0.20139027 0.31989322 0.24602858 0.05272693 0.04159232\n", - " 0.03452128]\n", - "0.905723905724\n", - "0.939393939394\n" - ] - } - ], - "source": [ - "#Request and print the `.feature_importances_` attribute\n", - "print(my_tree_two.feature_importances_)\n", - "print(my_forest.feature_importances_)\n", - "\n", - "#Compute and print the mean accuracy score for both models\n", - "print(my_tree_two.score(features_two, target))\n", - "print(my_forest.score(features_two, target))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "collapsed": true, - "slideshow": { - "slide_type": "slide" - } - }, - "source": [ - "# 阅读材料\n", - "机器学习算法的要点(附 Python 和 R 代码)http://blog.csdn.net/a6225301/article/details/50479672\n", - "\n", - "The \"Python Machine Learning\" book code repository and info resource https://github.com/rasbt/python-machine-learning-book\n", - "\n", - "An Introduction to Statistical Learning (James, Witten, Hastie, Tibshirani, 2013) : Python code https://github.com/JWarmenhoven/ISLR-python\n", - "\n", - "BuildingMachineLearningSystemsWithPython https://github.com/luispedro/BuildingMachineLearningSystemsWithPython" - ] - }, - { - 
"cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "slide" - } - }, - "source": [ - "# 作业\n", - "https://www.datacamp.com/community/tutorials/the-importance-of-preprocessing-in-data-science-and-the-machine-learning-pipeline-i-centering-scaling-and-k-nearest-neighbours" - ] - } - ], - "metadata": { - "celltoolbar": "Slideshow", - "kernelspec": { - "display_name": "Python [conda env:anaconda]", - "language": "python", - "name": "conda-env-anaconda-py" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.5.4" - }, - "latex_envs": { - "bibliofile": "biblio.bib", - "cite_by": "apalike", - "current_citInitial": 1, - "eqLabelWithNumbers": true, - "eqNumInitial": 0 - }, - "toc": { - "base_numbering": 1, - "nav_menu": {}, - "number_sections": false, - "sideBar": false, - "skip_h1_title": false, - "title_cell": "Table of Contents", - "title_sidebar": "Contents", - "toc_cell": false, - "toc_position": { - "height": "780px", - "left": "1279px", - "top": "168.667px", - "width": "341px" - }, - "toc_section_display": false, - "toc_window_display": true - } - }, - "nbformat": 4, - "nbformat_minor": 1 -} diff --git a/code/09.01-machine-learning-with-sklearn.ipynb b/code/09.01-machine-learning-with-sklearn.ipynb new file mode 100755 index 0000000..cbb64ac --- /dev/null +++ b/code/09.01-machine-learning-with-sklearn.ipynb @@ -0,0 +1,2200 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "# Introducing Scikit-Learn" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "source": [ + "\n", + "\n", + "*This notebook contains an excerpt from the [Python Data Science Handbook](http://shop.oreilly.com/product/0636920034919.do) by Jake 
VanderPlas; the content is available [on GitHub](https://github.com/jakevdp/PythonDataScienceHandbook).*\n", + "\n", + "*The text is released under the [CC-BY-NC-ND license](https://creativecommons.org/licenses/by-nc-nd/3.0/us/legalcode), and code is released under the [MIT license](https://opensource.org/licenses/MIT). If you find this content useful, please consider supporting the work by [buying the book](http://shop.oreilly.com/product/0636920034919.do)!*" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "source": [ + "\n", + "< [What Is Machine Learning?](05.01-What-Is-Machine-Learning.ipynb) | [Contents](Index.ipynb) | [Hyperparameters and Model Validation](05.03-Hyperparameters-and-Model-Validation.ipynb) >" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "Python machine learning \n", + "\n", + "- [Scikit-Learn](http://scikit-learn.org) provides efficient versions of a large number of common algorithms.\n", + " - Scikit-Learn is characterized by a clean, uniform, and streamlined API, as well as by very useful and complete online documentation.\n", + "\n", + "> # Once you understand the basic use and syntax of Scikit-Learn for one type of model, switching to a new model or algorithm is very straightforward.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "Python machine learning \n", + "\n", + "A solid understanding of these API elements will form the foundation for understanding the deeper practical discussion of machine learning algorithms and approaches.\n", + "\n", + "This section provides an overview of the Scikit-Learn API \n", + "\n", + "- The *data representation* in Scikit-Learn\n", + "- The *Estimator* API\n", + " - a more interesting example of using these tools for exploring a set of images of hand-written digits." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "## Data Representation in Scikit-Learn" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "source": [ + "Machine learning is about creating models from data: \n", + "- How data can be represented in order to be understood by the computer.\n", + " - Tables of data." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "### Data as table\n", + "\n", + "A basic table is a two-dimensional grid of data\n", + "- the rows represent individual elements of the dataset\n", + "- the columns represent quantities related to each of these elements.\n", + "\n", + "For example, consider the [Iris dataset](https://en.wikipedia.org/wiki/Iris_flower_data_set), famously analyzed by Ronald Fisher in 1936.\n", + "\n", + "We can download this dataset in the form of a Pandas ``DataFrame`` using the [seaborn](http://seaborn.pydata.org/) library:" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": { + "ExecuteTime": { + "end_time": "2018-05-15T07:26:46.002170Z", + "start_time": "2018-05-15T07:26:45.988082Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sepal_lengthsepal_widthpetal_lengthpetal_widthspecies
05.13.51.40.2setosa
14.93.01.40.2setosa
24.73.21.30.2setosa
34.63.11.50.2setosa
45.03.61.40.2setosa
\n", + "
" + ], + "text/plain": [ + " sepal_length sepal_width petal_length petal_width species\n", + "0 5.1 3.5 1.4 0.2 setosa\n", + "1 4.9 3.0 1.4 0.2 setosa\n", + "2 4.7 3.2 1.3 0.2 setosa\n", + "3 4.6 3.1 1.5 0.2 setosa\n", + "4 5.0 3.6 1.4 0.2 setosa" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import seaborn as sns\n", + "sns.set_context(\"talk\", font_scale=1.5)\n", + "\n", + "iris = sns.load_dataset('iris')\n", + "iris.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "- Each row of the data refers to a single observed flower\n", + "- The number of rows is the total number of flowers in the dataset.\n", + " - the rows of the matrix as *samples*\n", + " - the number of rows as ``n_samples``.\n", + "- each column of the data refers to a particular quantitative piece of information that describes each sample.\n", + " - the columns of the matrix as *features*\n", + " - the number of columns as ``n_features``." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "#### Features matrix\n", + "\n", + "This table layout of the information can be thought of as a ``two-dimensional numerical array or matrix``, which we will call the **features matrix**.\n", + "\n", + "- The features matrix is often stored in a variable named ``X``.\n", + "- The features matrix is assumed to be two-dimensional, with shape ``[n_samples, n_features]``, \n", + "- The features matrix is most often contained in a NumPy array or a Pandas ``DataFrame``\n", + "- some Scikit-Learn models also accept SciPy sparse matrices.\n", + "\n", + "The samples (i.e., rows) always refer to the individual objects described by the dataset.\n", + "- For example, the sample might be a flower, a person, a document, an image, a sound file, a video, an astronomical object, or anything else you can describe with a set of quantitative measurements.\n", + "\n", + "The features (i.e., columns) always refer to the distinct observations that describe each sample in a quantitative manner.\n", + "- Features are generally real-valued, but may be Boolean or discrete-valued in some cases." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "#### Target array\n", + "\n", + "In addition to the feature matrix ``X``, we also generally work with a *label* or *target* array, which by convention we will usually call ``y``.\n", + "- The target array is usually one dimensional, with length ``n_samples``\n", + "- The target array is generally contained in a NumPy array or Pandas ``Series``.\n", + "- The target array may have continuous numerical values, or discrete classes/labels.\n", + "\n", + "While some Scikit-Learn estimators do handle multiple target values in the form of a two-dimensional, ``[n_samples, n_targets]`` target array, we will primarily be working with the common case of a one-dimensional target array." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "#### Target array\n", + "\n", + "The target array is usually the quantity we want to *predict from the data*: in statistical terms, it is the dependent variable.\n", + "> For example, in the preceding data we may wish to construct a model that can predict the species of flower based on the other measurements; in this case, the ``species`` column would be considered the target array.\n", + "\n", + "With this target array in mind, we can use Seaborn (see [Visualization With Seaborn](04.14-Visualization-With-Seaborn.ipynb)) to conveniently visualize the data:" + ] + }, + { + "cell_type": "code", + "execution_count": 95, + "metadata": { + "ExecuteTime": { + "end_time": "2018-05-15T12:51:31.747920Z", + "start_time": "2018-05-15T12:51:26.463393Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%matplotlib inline\n", + "import seaborn as sns; \n", + "sns.set()\n", + "sns.set_context(\"talk\", font_scale=1)\n", + "sns.pairplot(iris, hue='species', size=1.5);" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "For use in Scikit-Learn, we will extract the features matrix and target array from the ``DataFrame``\n", + "- we can use some of the Pandas ``DataFrame`` operations discussed in the [Chapter 3](03.00-Introduction-to-Pandas.ipynb):" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": { + "ExecuteTime": { + "end_time": "2018-05-15T07:31:02.519569Z", + "start_time": "2018-05-15T07:31:02.513989Z" + }, + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "(150, 4)" + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X_iris = iris.drop('species', axis=1)\n", + "X_iris.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": { + "ExecuteTime": { + "end_time": "2018-05-15T07:31:11.307830Z", + "start_time": "2018-05-15T07:31:11.298124Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sepal_lengthsepal_widthpetal_lengthpetal_width
05.13.51.40.2
14.93.01.40.2
24.73.21.30.2
\n", + "
" + ], + "text/plain": [ + " sepal_length sepal_width petal_length petal_width\n", + "0 5.1 3.5 1.4 0.2\n", + "1 4.9 3.0 1.4 0.2\n", + "2 4.7 3.2 1.3 0.2" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X_iris[:3]" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": { + "ExecuteTime": { + "end_time": "2018-05-15T07:31:27.159081Z", + "start_time": "2018-05-15T07:31:27.153904Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "(150,)" + ] + }, + "execution_count": 42, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "y_iris = iris['species']\n", + "y_iris.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "ExecuteTime": { + "end_time": "2018-05-15T07:06:05.380136Z", + "start_time": "2018-05-15T07:06:05.374539Z" + }, + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0 setosa\n", + "1 setosa\n", + "2 setosa\n", + "Name: species, dtype: object" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "y_iris[:3]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "To summarize, the expected layout of features and target values is visualized in the following diagram:" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "source": [ + "![](figures/05.02-samples-features.png)\n", + "[figure source in Appendix](06.00-Figure-Code.ipynb#Features-and-Labels-Grid)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "## Scikit-Learn's Estimator API" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "source": 
[ + "With this data properly formatted, we can move on to consider the *estimator* API of Scikit-Learn:" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "Guiding principles outlined in the [Scikit-Learn API paper](http://arxiv.org/abs/1309.0238):\n", + "\n", + "- *Consistency*: All objects share a common interface drawn from a limited set of methods, with consistent documentation.\n", + " - Every machine learning algorithm in Scikit-Learn is implemented via the Estimator API, which provides a consistent interface for a wide range of machine learning applications.\n", + "\n", + "- *Inspection*: All specified parameter values are exposed as public attributes.\n", + "\n", + "- *Limited object hierarchy*: \n", + " - Only algorithms are represented by Python classes; \n", + " - datasets are represented in standard formats (NumPy arrays, Pandas ``DataFrame``s, SciPy sparse matrices) and \n", + " - parameter names use standard Python strings.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "Guiding principles outlined in the [Scikit-Learn API paper](http://arxiv.org/abs/1309.0238):\n", + "\n", + "- *Composition*: Many machine learning tasks can be expressed as sequences of more fundamental algorithms,\n", + " and Scikit-Learn makes use of this wherever possible.\n", + "\n", + "- *Sensible defaults*: When models require user-specified parameters, the library defines an appropriate default value.\n", + "\n", + "> In practice, these principles make Scikit-Learn very easy to use, once the basic principles are understood.\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "### Basics of the API\n", + "\n", + "Most commonly, the steps in using the Scikit-Learn estimator API are as follows\n", + "(we will step through a handful of detailed 
examples in the sections that follow).\n", + "\n", + "1. Choose a class of model by importing the appropriate estimator class from Scikit-Learn.\n", + "2. Choose model hyperparameters by instantiating this class with desired values.\n", + "3. Arrange data into a features matrix and target vector following the discussion above.\n", + "4. Fit the model to your data by calling the ``fit()`` method of the model instance.\n", + "5. Apply the model to new data:\n", + " - For supervised learning, often we predict labels for unknown data using the ``predict()`` method.\n", + " - For unsupervised learning, we often transform or infer properties of the data using the ``transform()`` or ``predict()`` method.\n", + "\n", + "We will now step through several simple examples of applying supervised and unsupervised learning methods." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "### Supervised learning example: Simple linear regression\n", + "\n", + "As an example of this process, let's consider a simple linear regression—that is, the common case of fitting a line to $(x, y)$ data.\n", + "We will use the following simple data for our regression example:" + ] + }, + { + "cell_type": "code", + "execution_count": 98, + "metadata": { + "ExecuteTime": { + "end_time": "2018-05-15T12:53:05.100098Z", + "start_time": "2018-05-15T12:53:04.956259Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [ + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAZUAAAEcCAYAAAAP5CkrAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAAIABJREFUeJzt3Xt0U1W+B/Bv0leSvtLUIjDWUl6lFBAoQkGktToXGVuQguOA81gqQpc4zFLBkTUyOuNyDS6WAypXcHB0ROYO9xa4ykNUYNCrgvJwoCMtglpaxoJNmzZgk7S0OfePTiKF5OQkPcnJyfl+1prV6UlysneL+XXv/du/rRMEQQAREZEM9Eo3gIiIYgeDChERyYZBhYiIZMOgQkREsmFQISIi2cQr3QAlWa0XRR/X6XTIzExGS0s7tJgkx/6z/+w/+++r/1lZqX5fx5GKCL2+5wer1+hPif1n/9l/9j/Y/qvix3XkyBHcddddKCwsxG233YbNmzcDAOx2OxYvXozCwkKUlJSgqqpK4ZYSEWlb1E9/2e12PPjgg3jiiSdQVlaG2tpa3Hvvvbj++uuxefNmmEwmHDhwAF988QUeeOABjB49GiNGjFC62UREmhT1I5XGxkYUFxdj5syZ0Ov1KCgowKRJk/DZZ59h7969WLJkCZKSkjBmzBiUlZVxtEJEpKCoH6nk5+dj1apV3u/tdjuOHDmCvLw8xMfHIzs72/tYbm4u3nvvPcn3DjRfqNfren3VGvaf/b/8q9aouf/WNieaWh3ol2FCltkY0j1C7X/UB5XLXbx4EZWVld7RysaNG3s9bjAY4HK5JN8vMzMZOl3gH5jZnBx0W2MJ+8/+a5ma+n/8tBVV+07h9Nk2OFxdMBniMSzbjB/fOhxjhmWFdM9g+6+aoHL27FlUVlYiOzsba9aswVdffXVVAHG5XDCZTJLv2dLSHnCkYjYno62tHW639lIK2X/2n/1XT/9r6mz4044atF7s8F5zuLpw/HQzGs5fxMLykRiZa5F8P7H+Wywpfl+niqBy4sQJLFiwADNnzsSvf/1r6PV65OTkoKurC42NjRg4cCAAoK6uDkOHDpV8X0EQ0N0d+Hlut4Du7uj/RxUu7D/7z/5Hf/+3f3ymV0C5XOvFDmw/cAZ512cEfd9g+x/1C/XNzc1YsGAB7r33Xixfvhz6fw8tUlJScOutt+K5556D0+lEdXU1du7cifLycoVbTEQUWU1tTtSduyD6nDPnLsDa5gx7W6J+pLJlyxbYbDasW7cO69at817/+c9/jqeffhpPPvkkiouLYTKZsGzZMtxwww0KtpaIKPKaW51wdYpPuzg7utFsd4a8cC9V1AeVyspKVFZW+n38+eefj2BriIiizzUZRhgS40QDizEpDtekhzegACqY/iIiInH9zEbkDkgTfc6gAWlhH6UADCpERDGhbEoOMlKTfD6WkZqEssk5EWkHgwoRUQzIz7FgQVk+8gdlwJgUB6Bnyit/UEbP9Rzp6cR9EfVrKkREJE1+jgX5ORZY25xotjtxTboxIlNel2NQISKKMVnmyAcTD05/ERGRbBhUiIhINgwqREQxyNrmRE2dLSK76C/HNRUiohhSW2/DzgP1OHP+Apwd3TAmxWFQ/zSUTcnxLuJbW53IygjPuguDChFRjKitt+GVnbW9Cks6O7pRW9+KhqaLyEwzwNrm9Bls5MLpLyKiGLHzQL3fSsXtzi40fPsdnB09pVw8weaVnbWorbfJ1gYGFSKiGCClUrEvrRc7sPNgvWztYFAhIooBUioV+yNnWXwGFSKiGOCpVBwKT1l8OTCoEBFFSDjTfKVUKvZHzrL4zP4iIgqzQGm+cimbkoPzNoffxXp/5CyLz5EKEVEYedJ8a+tbw5555a9S8fXXpiDVmODzNXKXxedIhYgojMTSfD2ZV3KOVvxVKq6tt2HnwXqcOXfZaGlAGsomyztaYlAhIgoTKWm+nswruXe
3X1mpOFJl8RlUiIjCREqaryfzKlKl6sNdFp9rKkREYSIlzVfOzKtowKBCRBQmUtJ8pWReKVVxOBSc/iIiCiOxNN9AmVeRSkWWE0cqRERh5C/NN39QRs91P8EhkqnIcuJIhYgozELJvIp0KrJcGFSIiCJEauaVkqnIfcXpLyKiKBNMKnK0YVAhIs2LtuwqNacic/qLiDRLLLtq1OBMAD0B53yzI2xnuvviSUWurW/1+xw5i0DKiUGFiDRJ7Dz38zYHpt94HWrP2nGqoVWRdN6+pCIridNfRKRJgbKrtnzwNY6fblYsnTfUVGSlcaRCRJojJbuqq1vweT2S6bxyF4G0tjlhbXWGdSqPQYWINKcv57kDkU/n7WsRyEjuzOf0FxFpTl/OcweiN53Xl0jvzGdQISLN6ct57kD0pvP6ImVnvpwYVIhIk8qm5CAjNcnnY3FxOtHXRms675WC2ZkvFwYVItIkseyqucWD/QacaE7nvZISO/O5UE9EmiWWXTWofxreOXy29z6VMJzpHk6etSOxwCL3VB6DChFpnq/sqpG5FkwtvB5ffG3FtzaH33TeSKTphkqJnfmqCirV1dV48MEH8dFHH3m/v/vuu2EwGLzPWbRoESorK5VqIhHFmCyzEZZUw1XX1XKAVqR35qsiqAiCgK1bt2LlypWIi/s+DfDkyZOYNm0aXn75ZQVbR0RaE6jESzTtePesHe08WI8z5y6EfSpPFUFl/fr12L17NyorK7Fhwwbv9ZqaGowYMULBlhGRFqntAC25d+aLUUVQmTNnDiorK3Ho0KFe12tra5GYmIjS0lK43W7MmDEDDz/8MBITEyXdV6fTQS+S/6bX63p91Rr2n/2//KvW+Ot/U6sTZ84HTtO1XXRF3RpL/0wT+meaJD031N+/KoJKv379fF7PyMjApEmTcPfdd6OlpQW/+tWv8MILL2Dp0qWS7puZmQydLvAPzGxODqq9sYb9Z/+17Mr+N7Q4vTvT/XF2dKOjG7BYUsLZtIgI9veviqDiz/r1673/32QyYdGiRfjjH/8oOai0tLQHHKmYzcloa2uH2+27uFwsY//Zf/b/6v4b9D1puGKBxZgUh6Q4wGb7LhJNDQux379YsFRtULHb7Vi/fj0WL16MlJSeDnZ0dCApyfeGJV8EQUC3hJpybreAbj8VS7WA/Wf/2f/v+5+ZZsCg/oHTdC2phpj4uQX7+1ftjvrU1FTs2bMHa9euxaVLl1BfX4/169ejoqJC6aYRUYwTK/Giph334aDaoKLX67F+/XqcPHkSRUVFmD9/Pm6//Xb84he/ULppRBTj1HqAViToBEFQ//gsRFbrRdHH4+J0sFhSYLN9FxPD2GCx/+w/+x+4/5FI01WCWP+zslL9vk61aypERNGgrwdoxRrVTn8REVH0YVAhIiLZMKgQkapZ25yoqbPJetAUhY5rKkSkSmqpEqw1DCpEpDpqqhKsNZz+IiLVkVIlmJTBoEJEqtLU5kTducBVgrnGogwGFSJSleZWp+iZ60DPVFiznUFFCQwqRKQq12QYYUiME32OMSkO16RzQ6ISGFSISHHBpAX3MxuROyBN9DmDBqRxl7tCmP1FRIoJNS24bEoOztscPhfrtV4lWGkcqRCRIjxpwbX1rd4DrzxpwT3XbX5fyyrB0YsjFSJShJS0YLHgkJ9jQX6OJWarBKsVgwoRRVwwacGBAgWrBEcXTn8RUcQxLTh2MagQUcQxLTh2MagQUcQxLTh2MagQkSLKpuQgIzXJ52NMC1YvBhUiUgTTgmMTs7+IKCKsbU5YW53Iyvg+W4tpwbGHQYWIwkrKrnmmBccOBhUiChsepqU9XFMhIlF9OQOeh2lpD0cqROTTgX+ew56j/8I31u/Q0ekO+gx4uXbN+1qLoejFoEJEXtY2J4580YSPqs/hXIuj12PBTlsFs2veV7AItYIxKYtBhYi8H+Bff2NHR5db9LlSij0C3++aFwss/nbNcy1GvbimQqRxl5egDxR
QPKScAd+XXfNci1EvBhUijRP7APdHarHHUHbNB7MWQ9GHQYVIw6R8gPsitdhjKLvmWcFY3bimQqRhUj7AfQmm2GOwu+b7shZDyuNIhUjDpJSgv1J8nA5jBge/SJ5lNiI/xxIwGLGCsboxqBBpmJQP8Ct1dQt47/C/RM+Q7ytWMFYvBhUijRP7APcn3BlYrGCsXlxTIVKJcO0s93yA/+f/fg6Hq0vy66SeId+XdrGCsfowqBBFuUjsLM9MN8LtFoJ6jdhueDmxgrG6MKgQRbFI7SwPJQuMGVjkC9dUiKJYpHaWh5IFxgws8oVBhShKRXJnebBZYMzAIn9UFVSqq6sxdepU7/d2ux2LFy9GYWEhSkpKUFVVpWDriOQV6Z3lUrLAmIFFgahiTUUQBGzduhUrV65EXNz3Q/QVK1bAZDLhwIED+OKLL/DAAw9g9OjRGDFihIKtJZJHpHeWe7LAdh6sx5lzPUkBJkM8BvVPxU2j+8OcksQMLApIFUFl/fr12L17NyorK7FhwwYAQHt7O/bu3Yt3330XSUlJGDNmDMrKylBVVYUVK1ZIuq9Op4NeZKym1+t6fdUa9l/Z/g/INGHwwDTUnGn1+xydTofWiy70zzTJ8p6jBmdi1ODMnjTeC04Myc6EIQ5BZ4bFAqV//0oLtf8hBZVVq1ahoqICQ4YMCeXlQZszZw4qKytx6NAh77X6+nrEx8cjOzvbey03Nxfvvfee5PtmZiZDpwv8AzObk4NrcIxh/5Xr/z235+OPf/sMLXaXz8cdri5s2FmLh+eNxw3DsmR7X4slBXmy3U3d+O8/uP6HFFT+/Oc/49VXX8XIkSNRUVGBO+64A2azOZRbSdKvX7+rrjkcDhgMhl7XDAYDXC7f//H50tLSHnCkYjYno62tXbN/qbH/yvb/ukwjFtyRjxe3VsPR4XsarMXuwn+9U4vszO+npaxtTjS1OtAvwxTydFU09F9J7L///lssKX5fF1JQ0ev1cLvdOHHiBGpqarBy5UrccsstmDVrFkpKSnqte4SL0Wi8KoC4XC6YTNKnAQRBQLeE1Hy3W0B3t/b+UXmw/5Ht/5U75zPSDAj0mVZ37gLOtzjQbHfKvlGSv3/2P5j+hxRUPvjgA+zYsQPbt2/HyZMncenSJezZswd79uyB2WxGeXk57rzzTowcOTKU20uSk5ODrq4uNDY2YuDAgQCAuro6DB06NGzvSRRO/nbOjxpskZQFdvRUE/Yc/heP4CVFhZRSnJWVhfvuuw9vvvkmtm/fjvvuuw/9+vWDIAhobW3FG2+8gTlz5mDmzJn4y1/+gubmZrnbjZSUFNx666147rnn4HQ6UV1djZ07d6K8vFz29yIKt8uP9HX+e5rLExDePXQWifHi/6nqdcBHx8/xCF5SXJ/3qQwfPhyPPfYY3n//fbz22mu48847YTQaIQgCTp06hWeffRYlJSVYtGgR3n33XXR2dsrRbgDA008/ja6uLhQXF2PJkiVYtmwZbrjhBtnuTxQpYjvnL7R3Ij5OPKHELQCNLQ7R5/AIXooEnSAIsk8Wulwu7N+/H/v27cOHH34Iu93uzbJKS0vDj370I1RUVGD06NFyv3VQrNaLoo/HxelgsaTAZvtOk3Oq7L88/Q9UXbipzYmnXj0kOsWVGK+DMSke9vZLIbcDAJbNGyt5Coy/f/bfX/+zslL9vi4s+1QMBgNmzJiB0tJS7N+/H6tXr0ZDQwOAnl3wmzdvxubNmzFq1Cg89NBDKC4uDkcziBQltbqwlJ3znV0CbisciN2H6hHqn4EsAEmRIHtQcblc2LNnD3bv3o0DBw6go6NnSC8IAhISEjB58mTU1tbCarXin//8JyorK1FRUYFnnnlG7qYQKSaY6sJSd84PvMYUckABWACSIkOWoOJ2u/HRRx9hx44d2Lt3rzfV1zOzNnLkSMyePRvl5eUwm81wu9348MMPsWbNGtTW1mLbtm3IycnBwoUL5WgOkeKkVBf2BBVPMcfaev8
75wcNSMPQbHPA4OMPC0BSpPQpqFRXV2P79u3YvXs3bLae86o9gcRisaC8vBwVFRXIy+u9N1ev16O4uBjjxo3DnXfeicbGRlRVVTGoUEwIprqwZ+RQNiUH520On4HIExCkBJ/rr01GsjHRW7vLmBSHQQPSUDZZvgO9iMSEFFTWrl2LHTt2eNdJPIEkPj4e06ZNQ0VFBUpKShAfL377tLQ0FBUVYdu2bfj2229DaQpR1AmmurAnqPgq5ugrIAQKPneXDuMRvKSokIOKTqfzBpPhw4ejoqICM2fOhMUS3F9DDkdPGqRnAyOR2oVaXVjKmexSgw+P4CWlhDz9lZaWhvLycsyePRsFBQUhN2DWrFmYM2cOBg8eHPI9iKKJ1DUSfx/6gQKClOBDpJSQgsrzzz+P0tJSJCQk9LkBt9xyS5/vQRRtpKyR9BVHIxSNQtpRP336dFkCClEssLY5UVNn67Vb3TNNlT8oA8akngKrPDWRtEAVh3QRRaNAmxs5TUVaxKBCFIJgNjdymoq0pM8FJYm0SMrmRiItYlAhClIwmxuJtIZBhShIwWxuJNIaBhWiIHk2N4phRWDSKgYV0ixfqcBSeDY3imFFYNIqZn+R5kg950RMJDY3EqkRRyqkKWJnwfdct0m6Dzc3EvnGkQppSjDnnATCzY1EV2NQIc0I5ZwTKbi5keh7nP4izWAqMFH4MaiQZjAVmCj8GFRIM5gKTBR+DCqkKWVTcpCRmuTzMaYCE/UdgwppClOBicKL2V+kOUwFJgofBhVSDWubE9ZWJ7Iy5AkCTAUmkh+DCkU9OcqqEFFkMKhQVAvmhEUiUh4X6imqXFk5WG0nLIZa+ZgoVnCkQlHB1xTXwMxknG36TvR1oZRVCQdO0RH1YFAhxfmb4vqqUbxOl+d5zXZlgwqn6Ii+x+kvUpzYFFcg0VBWRW1TdEThxKBCipJSOViM0mVVgql8TKQFDCqkKCmVg/2JhrIqrHxM1BuDCilKSuXgOD2Q0z8lKsuqsPIxUW9cqCdFeSoH19a3+n1OtxtoanViYGYyCkdkoXB4v5CnvOTelS+l/UpP0RFFEoMKKa5sSg7O2xyii/WebDDbxQ7kXJsa9Id0OFN+xdofDVN0RJGk+umvV155BaNGjcK4ceO8/zty5IjSzaIg+Koc7E8o2VSelN/a+lY4O3rWPzwpvz3XbSG3HWDlY6LLqX6kUltbi4cffhj333+/0k2hPvBUDj7Z0Io1/3McnV1uv88NdsOjlJTfvn7ws/IxUQ/Vj1Rqa2uRn5+vdDNIJu5uQTSgAMFlU0U65TfLbER+joUBhTRL1SMVp9OJM2fOYOPGjVi2bBnS0tJw//33Y+7cuZJer9PpoBcJq3q9rtdXrVGi/9dmmmBMivNOU/liTIrDtRYT4uICt8t2QVrKr+2iC/0zTb2u8/fP/l/+VWtC7b+qg0pzczPGjx+PefPm4YUXXkB1dTUqKyuRlZWF4uLigK/PzEyGThf4B2Y2J8vRXNWKZP8tlhQMvz4Dx083+33O8OszkDc4S9L9hgk6GJPi4ezo8vsckyEeQ3MyYbH47id//+y/lgXbf1UHlezsbGzatMn7/YQJEzBr1izs27dPUlBpaWkPOFIxm5PR1tYOt1uQo8mqolT/Z0zMRsP5i36zqWZMzIbNJl5o0iNRB+QOSEXNGZGU3/6pSNQJV92Tv3/2n/333X+LJcXv61QdVE6cOIGPP/4YCxcu9F7r6OiAwWCQ9HpBENAtYTO32y2gu1t7/6g8It3/4dk9WVM7D9bj62/s6LjkRlKCHoN/kI6yyTkYnp0RVHvumJyDcy3+U37vmJwjej/+/tl/9l96/1W9UG8ymbB27Vq88847cLvdOHjwIHbt2oXZs2cr3TSSgwDAMzup+/f3IWDKL1HkqHqkkpubizVr1mD16tV4/PHHce211+IPf/gDCgoKlG4a9YGvUvIdne4+lZJnyi9RZKg6qABAaWkpSktLlW6GqshdqkRu4dx
XkmWOzj4TxQrVBxWSrq+lSiIRjILZV8LgQBR9GFQ0oi+nE9bU2bD94zMROSr3y7NtkkvJM6gQRR8GFY0INKX03/u+xOKK0Vd9UB8/bcWfdtSE/ahczyjq60Z7wOeylDxR9FJ19hdJI2VKqaHpOzyx4SCe2XikV4HFqn2nwn5U7uUFHzsuiZdoAVhKniiaMahogNTTFS91A181XsDq/zmOdw/Vo6nViVMNbaKvkaNuVjBn1LOUPFF04/SXBnhOJ5R6bG9Xt4CtH3wNAKLlTXoel7a+4W+RX+oZ9UmJegwe2LP5kftKiKIXg4oGSDmd8Epd3QI+rWkKWDcr0PpGoIwzqaOon//HCEwe1V9y+4lIGZz+0oiyKTnISE0K6jXnmtuRMyBV9Dli6xtSDseSesb70OvSg2o7ESmDQUUjLi9VkpQg7dfe0eXGlFED/QajQOsbUjYxekZRYrgwT6QenP7SkMtLlaypOoZzLeIL7DodkJ6aiB/fMgT/V30OZ85dNoU1IE10fSOYTYw8450odjCoaFCW2YiyKYOwYUet6PMEAVj9t39410F++h95MKckSqqbJWWtxLPI7xlF7TxYH1TgIqLow6CiUekm6esrV252lDIVJSXj7PJFfhZ8JIoNXFPRKCkL5FcKZrNjqGslPOOdSN0YVDRKyoe+L8FsdhTLOONaCVFsYlDRsFDSjD3rIFLwcCwi7eGaiob5WiAPJNhijlwrIdIWBhWNu/JDf9sHX+OrRv+pwKHuGeHhWETawOkvAvD9AnlF8WCugxBRyBhUqBeugxBRX3D6i67imRKzXXShoxtIigMsqQalm0VEKsCgQn5lmY2wWFJgs32H7m5B6eYQkQpw+ouIiGTDoBJDrG1O1NTZ+nwSIxFRqDj9FQMCHYRFRBQpDCoq5zkI6/Ky8VcWgLwm3ejzKF8iIrkxqKhcoIOw/vN/P4cgCBzBEFFEMKjIxNrmjOhowNrmxOl/teGrRrvo8xyu78+Xv3IEw8BCRHJjUOmjSK9nXPl+ofCUsGdQISK5Maj0gZT1DDk/uH29X6g8Jey5xkJEcmJKcR8EWs+QeqCVHO8XrGBK2BMRScWgEqKmNifqzvmv5gsEd6CVHO8XjGBL2BMRScGgEqLmVqfo+etA30YDV25klPJ+HsakOJgM4jOboZawJyISwzWVEHnOeBf7oA9lNOBv4X/qmP4B3y8pQY+f356HoT8wo9nuxPo3T+Ci89JVz0s1JrCEPRGFBYNKiDxnvNfWt/p9TrCjgUAL//0yjGj49ju/rx/8g3RMLhgAAGi2O+GG7yKQgp/rRER9xemvPhA74z2UA60CLfx77ivl/XYeqEe7s8vnc79zdsmeREBEBDCo9EkoB1r5K/ooZSHe2ubE3JIhAd8v0kkEREQenP7qoyvPeL8m3feO+kCbJKUu/JtTErHsJ+NE3y+YJAIu1hORnBhUZJJl9l+eRVLRxyAX/sXeL1xJBEREgXD6KwKkbJL0LPyLkbrwL+e9iIiCofqgUlNTg7lz52Ls2LGYNWsWjh07pnSTeglmfUPOhX+5kwiIiKRQdVDp6OhAZWUlKioqcPjwYfzsZz/DQw89hM7OzrC+bzAnLAazvhHKwr8/ct6LiEgqVa+pfPLJJ9Dr9Zg/fz4AYO7cuXj99dexf/9+TJ8+Xfb3C6Ui8TUZRiQl6NFxye33vpevb0hd+JdCznsREUmh6qBSV1eHIUOG9LqWm5uL06dPSwoqOp0OepGxml6v836tqRNfbF9YPhIjc3sHlpo6G3YcOINLXf4DCgDkDkhD/0xTr2v9M01XXQtVqPe6vP9axP6z/5d/1ZpQ+6/qoOJwOGA09v7L22AwwOVySXp9ZmYydLrAPzCzORnvHD4mutj+zuGzmFp4vffa8dNWvLKrFi128bZkphtwz+35sFhSJLVZCWZzstJNUBT7z/5rWbD9V3VQMRqNVwUQl8sFk0naX+UtLe0BRypmczJO1VlxqsF/ORYAONXQii+
+tnqnl/7rHfGAotcBI3IyMPOmQbgu0wibzX/5FaV4+t/W1g63W3ulXdh/9p/9991/sT+CVR1UBg8ejE2bNvW6VldXh7KyMkmvFwQB3RIK/35rcwQ8ZdHZ0Y1vbQ5YUg1oanPiy2/Ej/lNSNDjF7ePQJbZiO7u6P4H63YLUd/GcGL/2X/2X3r/VZ39NXnyZHR2duKNN97ApUuXsGXLFjQ3N2Pq1Kmyvk+W2QRDYpzoczyL7bX1Nry07Z/oFFmYB4COTjcPySKimKPqoJKYmIgNGzZg165dmDhxIjZt2oR169ZJnv6Sql+GtM2EzXYnXtlZi4amwFNZ3NFORLFI1dNfADBixAhs3rw57O9TNiUH520On4v1ns2EwRz3yx3tRBSLVD1SiaRAmwkz042Sj/vljnYiilWqH6lEkthmwpo6m6TjfrOvTcFPSodyRzsRxSQGlRD4qhAspTJwUoIeD80ezWkvIopZnP6SiZTKwIN/kM6AQkQxjUFFRqwMTERax6AiI1YGJiKt45qKzFgZmIi0jEElTMSO+yUiilWc/iIiItkwqBARkWwYVIiISDY6QRC0W9OZiIhkxZEKERHJhkGFiIhkw6BCRESyYVAhIiLZMKgQEZFsGFSIiEg2DCpERCQbBhUiIpINgwoREcmGQcWPmpoazJ07F2PHjsWsWbNw7NgxpZsUUUeOHMFdd92FwsJC3Hbbbdi8ebPSTYq45uZmTJ48Gfv371e6KRF3/vx5LFq0COPHj8e0adOwceNGpZsUUZ999hkqKiowfvx4TJ8+HTt27FC6SRFRXV2NqVOner+32+1YvHgxCgsLUVJSgqqqqsA3EegqLpdLuPnmm4W//vWvQmdnp1BVVSXcdNNNQkdHh9JNi4i2tjbhxhtvFN566y2hu7tb+Pzzz4Ubb7xR+Pjjj5VuWkQtXLhQGDFihPD3v/9d6aZElNvtFmbPni2sXLlS6OzsFE6dOiXceOONwtGjR5VuWkR0dXUJRUVFwu7duwVBEITDhw8LI0eOFM6ePatwy8LH7XYLVVVVQmFhoTBx4kTv9V/+8pfC0qVLBZfLJRw/flyYOHGiUFtbK3ovjlR8+OSTT6DX6zF//nwkJCRg7ty5yMjKL6QqAAAGqklEQVTI0MxfrI2NjSguLsbMmTOh1+tRUFCASZMm4bPPPlO6aRHzt7/9DUajEQMGDFC6KRF3/PhxNDU1YenSpUhISMCwYcOwefNm5ObmKt20iLhw4QJsNhu6u7shCAJ0Oh0SEhIQFxendNPCZv369di4cSMqKyu919rb27F3714sWbIESUlJGDNmDMrKygKOVhhUfKirq8OQIUN6XcvNzcXp06cValFk5efnY9WqVd7v7XY7jhw5ghEjRijYqsg5c+YMXnvtNTz11FNKN0URJ06cwLBhw7Bq1SrcdNNNmD59Oo4fP46MjAylmxYRGRkZmD9/Ph555BEUFBTgnnvuwYoVK2L6D4w5c+bgrbfewujRo73X6uvrER8fj+zsbO81KZ+DPPnRB4fDAaOx96mNBoMBLpdLoRYp5+LFi6isrERBQQFKS0uVbk7YdXV1YdmyZfjNb34Ds9msdHMUYbfb8emnn6KoqAj79+/H559/jgULFiA7OxsTJkxQunlh53a7YTAY8Pzzz6O0tBQHDhzAo48+ioKCgpj9w6pfv35XXXM4HDAYDL2uSfkc5EjFB6PReNUPzuVywWQyKdQiZZw9exY/+clPkJ6ejrVr10Kvj/1/Li+99BLy8/NRXFysdFMUk5iYiPT0dCxatAiJiYnexep9+/Yp3bSIeO+991BdXY3bb78diYmJKCkpQUlJCd58802lmxZRoX4Oxv6nRAgGDx6Murq6Xtfq6uowdOhQhVoUeSdOnMCPf/xjTJ06FS+99NJVf7HEqrfffhu7du3ChAkTMGHCBDQ2NuKRRx7Bn/70J6WbFjG5ublwOp3o6uryXvOsL2jBuXPn0NnZ2etafHw84uO1NbGTk5ODrq4
uNDY2eq9J+hwMb06BOnV0dAhTp04VNm7c6M3+KioqEtrb25VuWkRYrVahqKhIePnll5VuiuJuueUWzWV/OZ1O4eabbxZWrlwpXLp0STh69KgwduxY4R//+IfSTYuIkydPCgUFBcKWLVsEt9stfPrpp8K4ceOE6upqpZsWdp988kmv7K+HHnpIeOSRRwSHw+HN/jp27JjoPThS8SExMREbNmzArl27MHHiRGzatAnr1q3TzPTXli1bYLPZsG7dOowbN877v9WrVyvdNIoAg8GAN954A6dOncKUKVOwdOlSPPHEExg7dqzSTYuIvLw8vPDCC9i4cSMKCwvx+9//Hs8++2yvRWytePrpp9HV1YXi4mIsWbIEy5Ytww033CD6Gh4nTEREsuFIhYiIZMOgQkREsmFQISIi2TCoEBGRbBhUiIhINgwqREQkGwYVIiKSDYMKERHJhkGFiIhkw6BCRESyYVAhIiLZMKgQEZFsGFSIFGK1WjFp0iTk5eUhLy8PBw8e9Pm8L7/8EmPGjEFeXh6KiorQ1NQU4ZYSScegQqSQrKwsPPXUU97vf/vb31510t6lS5ewbNkydHR0AACeeeYZn0e/EkULBhUiBc2YMQNlZWUAgIaGBrz44ou9Hn/xxRdRU1MDAJg3bx5uvfXWiLeRKBg8T4VIYXa7HWVlZWhqakJ8fDyqqqowcuRIHD16FD/96U/hdrsxZMgQbNu2TTPHOpN6caRCpLD09HQ888wzAICuri48+eSTcDgcWL58OdxuNxISEvDcc88xoJAqMKgQRYFp06bh7rvvBgBUV1dj3rx5qK+vBwA8+uijyM/PV7J5RJJx+osoSjgcDsyaNQsNDQ3ea1OmTMGrr74KnU6nYMuIpONIhShKmEwmPPHEE72uLV++nAGFVIVBhSiKbN++vdf369atU6glRKFhUCGKErt378bOnTsBAKmpqQCAt99+G2+//baSzSIKCoMKURSwWq3ejZBZWVnYunUrLBYLAOB3v/sdd9GTajCoEEWBFStWoK2tzfv/c3JysHz5cgBAW1sbVqxYoWTziCRjUCFSWFVVFfbv3w8A+OEPf4jp06cDAGbOnIni4mIAwPvvv4+qqirF2kgkFVOKiRT0zTffoLy8HO3t7UhLS8OuXbt61fZqbGzEHXfcAYfDgeTkZGzfvh3XXXedgi0mEseRCpFCBEHA448/jvb2dgDAY489dlWxyIEDB+LRRx8FALS3t2P58uXg34EUzRhUiBTy+uuv49ChQwCAoqIi3HXXXT6fd88992D8+PEAgEOHDuH111+PWBuJgsXpLyIikg1HKkREJBsGFSIikg2DChERyYZBhYiIZMOgQkREsmFQISIi2TCoEBGRbBhUiIhINgwqREQkGwYVIiKSDYMKERHJ5v8BR41I3Ujk6Q0AAAAASUVORK5CYII=\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "\n", + "rng = np.random.RandomState(42)\n", + "x = 10 * rng.rand(50)\n", + "y = 2 * x - 1 + rng.randn(50)\n", + "plt.scatter(x, y)\n", + "plt.xlabel('x', fontsize = 30)\n", + "plt.ylabel('y', fontsize = 30);" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "With this data in place, we can use the recipe outlined earlier. Let's walk through the process: " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "#### 1. Choose a class of model\n", + "\n", + "In Scikit-Learn, every class of model is represented by a Python class.\n", + "So, for example, if we would like to compute a simple linear regression model, we can import the linear regression class:" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": { + "ExecuteTime": { + "end_time": "2018-05-15T07:35:54.202165Z", + "start_time": "2018-05-15T07:35:54.199317Z" + }, + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [], + "source": [ + "from sklearn.linear_model import LinearRegression" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "source": [ + "Note that other more general linear regression models exist as well; you can read more about them in the [``sklearn.linear_model`` module documentation](http://Scikit-Learn.org/stable/modules/linear_model.html)." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "#### 2. 
Choose model hyperparameters\n", + "\n", + "An important point is that *a class of model is not the same as an instance of a model*.\n", + "\n", + "Once we have decided on our model class, there are still some options open to us.\n", + "Depending on the model class we are working with, we might need to answer one or more questions like the following:\n", + "\n", + "- Would we like to fit for the offset (i.e., *y*-intercept)?\n", + "- Would we like the model to be normalized?\n", + "- Would we like to preprocess our features to add model flexibility?\n", + "- What degree of regularization would we like to use in our model?\n", + "- How many model components would we like to use?\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "#### 2. Choose model hyperparameters\n", + "\n", + "\n", + "These are examples of the important choices that must be made *once the model class is selected*.\n", + "- These choices are often represented as *hyperparameters*, or parameters that must be set before the model is fit to data.\n", + "- In Scikit-Learn, hyperparameters are chosen by passing values at model instantiation.\n", + "\n", + "We will explore how you can quantitatively motivate the choice of hyperparameters in [Hyperparameters and Model Validation](05.03-Hyperparameters-and-Model-Validation.ipynb).\n", + "\n", + "For our linear regression example, we can instantiate the ``LinearRegression`` class and specify that we would like to fit the intercept using the ``fit_intercept`` hyperparameter:" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": { + "ExecuteTime": { + "end_time": "2018-05-15T07:40:02.740297Z", + "start_time": "2018-05-15T07:40:02.735314Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)" + ] + }, + "execution_count": 47, + 
"metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model = LinearRegression(fit_intercept=True)\n", + "model\n", + "#help(LinearRegression)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "source": [ + "Keep in mind that when the model is instantiated, the only action is the storing of these hyperparameter values.\n", + "- In particular, we have not yet applied the model to any data: \n", + "- the Scikit-Learn API makes very clear the distinction between *choice of model* and *application of model to data*." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "#### 3. Arrange data into a features matrix and target vector\n", + "\n", + "Previously we detailed the Scikit-Learn data representation, which requires a two-dimensional features matrix and a one-dimensional target array.\n", + "\n", + "- The target variable ``y`` is already in the correct form (a length-``n_samples`` array)\n", + "- The feature matrix ``x`` should be transformed to a matrix of size ``[n_samples, n_features]``.\n", + "\n", + "In this case, this amounts to a simple reshaping of the one-dimensional array:" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": { + "ExecuteTime": { + "end_time": "2018-05-15T07:41:32.840436Z", + "start_time": "2018-05-15T07:41:32.835772Z" + }, + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "(50, 1)" + ] + }, + "execution_count": 49, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X = x[:, np.newaxis]\n", + "X.shape" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "#### 4. 
Fit the model to your data\n", + "\n", + "Now it is time to apply our model to data.\n", + "This can be done with the ``fit()`` method of the model:" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": { + "ExecuteTime": { + "end_time": "2018-05-15T07:41:45.689495Z", + "start_time": "2018-05-15T07:41:45.684155Z" + }, + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)" + ] + }, + "execution_count": 50, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model.fit(X, y)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "This ``fit()`` command causes a number of model-dependent internal computations to take place, and the results of these computations are stored in model-specific attributes that the user can explore.\n", + "\n", + "In Scikit-Learn, by convention all model parameters that were learned during the ``fit()`` process have trailing underscores; \n", + "- for example in this linear model, we have the following:" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "ExecuteTime": { + "end_time": "2018-05-15T07:07:27.190067Z", + "start_time": "2018-05-15T07:07:27.185547Z" + }, + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "array([1.9776566])" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# The coefficient represents the slope of the simple linear fit to the data.\n", + "model.coef_" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "ExecuteTime": { + "end_time": "2018-05-15T07:07:28.062749Z", + "start_time": "2018-05-15T07:07:28.058403Z" + }, + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "data": { + 
"text/plain": [ + "-0.9033107255311164" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# The parameter represents the intercept of the simple linear fit to the data.\n", + "model.intercept_" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "Comparing to the data definition, we see that the fitted slope and intercept are very close to the input slope of 2 and intercept of -1.\n", + "\n", + "One question that frequently comes up regards the uncertainty in such internal model parameters.\n", + "\n", + "In general, Scikit-Learn does not provide tools to draw conclusions from internal model parameters themselves:\n", + "- interpreting model parameters is much more a *statistical modeling* question than a *machine learning* question.\n", + "- Machine learning rather focuses on what the model *predicts*.\n", + "\n", + "If you would like to dive into the meaning of fit parameters within the model, other tools are available, including the [Statsmodels Python package](http://statsmodels.sourceforge.net/)." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "#### 5. Predict labels for unknown data\n", + "\n", + "Once the model is trained, the main task of supervised machine learning is to evaluate it based on what it says about new data that was not part of the training set.\n", + "In Scikit-Learn, this can be done using the ``predict()`` method.\n", + "For the sake of this example, our \"new data\" will be a grid of *x* values, and we will ask what *y* values the model predicts:" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": { + "ExecuteTime": { + "end_time": "2018-05-15T07:43:57.243102Z", + "start_time": "2018-05-15T07:43:57.237670Z" + }, + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "array([-1. 
, -0.75510204, -0.51020408, -0.26530612, -0.02040816,\n", + " 0.2244898 , 0.46938776, 0.71428571, 0.95918367, 1.20408163,\n", + " 1.44897959, 1.69387755, 1.93877551, 2.18367347, 2.42857143,\n", + " 2.67346939, 2.91836735, 3.16326531, 3.40816327, 3.65306122,\n", + " 3.89795918, 4.14285714, 4.3877551 , 4.63265306, 4.87755102,\n", + " 5.12244898, 5.36734694, 5.6122449 , 5.85714286, 6.10204082,\n", + " 6.34693878, 6.59183673, 6.83673469, 7.08163265, 7.32653061,\n", + " 7.57142857, 7.81632653, 8.06122449, 8.30612245, 8.55102041,\n", + " 8.79591837, 9.04081633, 9.28571429, 9.53061224, 9.7755102 ,\n", + " 10.02040816, 10.26530612, 10.51020408, 10.75510204, 11. ])" + ] + }, + "execution_count": 51, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "xfit = np.linspace(-1, 11)\n", + "xfit" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "As before, we need to coerce these *x* values into a ``[n_samples, n_features]`` features matrix, after which we can feed it to the model:" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": { + "ExecuteTime": { + "end_time": "2018-05-15T07:44:17.061898Z", + "start_time": "2018-05-15T07:44:17.058567Z" + }, + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [], + "source": [ + "Xfit = xfit[:, np.newaxis]\n", + "yfit = model.predict(Xfit)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "Finally, let's visualize the results by plotting first the raw data, and then this model fit:" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "ExecuteTime": { + "end_time": "2018-05-15T07:07:37.931538Z", + "start_time": "2018-05-15T07:07:37.811851Z" + }, + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "plt.scatter(x, y)\n", + "plt.plot(xfit, yfit);" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "Typically the efficacy of the model is evaluated by comparing its results to some known baseline, as we will see in the next example." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "### Supervised learning example: Iris classification\n", + "\n", + "> ### Question: given a model trained on a portion of the Iris data, how well can we predict the remaining labels?\n", + "\n", + "For this task, we will use an extremely simple generative model known as **Gaussian naive Bayes** \n", + "- which proceeds by assuming each class is drawn from an axis-aligned Gaussian distribution\n", + "- see [In Depth: Naive Bayes Classification](05.05-Naive-Bayes.ipynb) for more details.\n", + "- it is very fast \n", + "- it has no hyperparameters to choose\n", + "\n", + "Gaussian naive Bayes is often a good model to use as a baseline classification, before exploring whether improvements can be found through more sophisticated models." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "# To evaluate the model on data it has not seen before\n", + "\n", + "- we will split the data into a *training set* and a *testing set*.\n", + " - Using the ``train_test_split`` utility function:" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": { + "ExecuteTime": { + "end_time": "2018-05-15T07:46:59.245968Z", + "start_time": "2018-05-15T07:46:59.240652Z" + }, + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [], + "source": [ + "from sklearn.cross_validation import train_test_split\n", + "Xtrain, Xtest, ytrain, ytest = train_test_split(X_iris, y_iris,\n", + " random_state=1)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "With the data arranged, we can follow our recipe to predict the labels:" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "metadata": { + "ExecuteTime": { + "end_time": "2018-05-15T07:50:17.006283Z", + "start_time": "2018-05-15T07:50:17.000757Z" + }, + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [], + "source": [ + "from sklearn.naive_bayes import GaussianNB # 1. choose model class\n", + "model = GaussianNB() # 2. instantiate model\n", + "model.fit(Xtrain, ytrain) # 3. fit model to data\n", + "y_model = model.predict(Xtest) # 4. 
predict on new data" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "Finally, we can use the ``accuracy_score`` utility to see the fraction of predicted labels that match their true value:" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "metadata": { + "ExecuteTime": { + "end_time": "2018-05-15T07:50:19.472731Z", + "start_time": "2018-05-15T07:50:19.468189Z" + }, + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "('setosa', 'setosa') ('versicolor', 'versicolor') ('versicolor', 'versicolor') ('setosa', 'setosa') ('virginica', 'virginica') ('versicolor', 'versicolor') ('virginica', 'virginica') ('setosa', 'setosa') ('setosa', 'setosa') ('virginica', 'virginica') ('versicolor', 'versicolor') ('setosa', 'setosa') ('virginica', 'virginica') ('versicolor', 'versicolor') ('versicolor', 'versicolor') ('setosa', 'setosa') ('versicolor', 'versicolor') ('versicolor', 'versicolor') ('setosa', 'setosa') ('setosa', 'setosa') ('versicolor', 'versicolor') ('versicolor', 'versicolor') ('versicolor', 'virginica') ('setosa', 'setosa') ('virginica', 'virginica') ('versicolor', 'versicolor') ('setosa', 'setosa') ('setosa', 'setosa') ('versicolor', 'versicolor') ('virginica', 'virginica') ('versicolor', 'versicolor') ('virginica', 'virginica') ('versicolor', 'versicolor') ('virginica', 'virginica') ('virginica', 'virginica') ('setosa', 'setosa') ('versicolor', 'versicolor') ('setosa', 'setosa')\n" + ] + } + ], + "source": [ + "print(*zip(ytest, y_model))" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "metadata": { + "ExecuteTime": { + "end_time": "2018-05-15T07:50:48.890800Z", + "start_time": "2018-05-15T07:50:48.885683Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.9736842105263158" + ] + }, + "execution_count": 61, + "metadata": 
{}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn.metrics import accuracy_score\n", + "accuracy_score(ytest, y_model)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "source": [ + "With an accuracy topping 97%, we see that even this very naive classification algorithm is effective for this particular dataset!" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "### Unsupervised learning example: Iris dimensionality reduction\n", + "\n", + "Reducing the dimensionality of the Iris data to more easily visualize it:\n", + "- Iris data is four dimensional: \n", + " - there are four features recorded for each sample.\n", + "\n", + "The task of dimensionality reduction is to ask:\n", + "\n", + "> ## whether there is a suitable lower-dimensional representation that retains the essential features of the data." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "### Unsupervised learning example: Iris dimensionality reduction\n", + "\n", + "Dimensionality reduction is often used as an aid to visualizing data: \n", + "- it is much easier to plot data in two dimensions than in four dimensions or higher!\n", + "\n", + "Here we will use ``principal component analysis`` (PCA; see [In Depth: Principal Component Analysis](05.09-Principal-Component-Analysis.ipynb))\n", + "- It is a fast linear dimensionality reduction technique.\n", + "\n", + "We will ask the model to return \n", + "- two components\n", + " - a two-dimensional representation of the data.\n", + "# Following the sequence of steps outlined earlier:" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "metadata": { + "ExecuteTime": { + "end_time": "2018-05-15T07:52:04.614637Z", + "start_time": "2018-05-15T07:52:04.609500Z" + }, + "slideshow": { + "slide_type": "subslide" + } + 
}, + "outputs": [], + "source": [ + "from sklearn.decomposition import PCA # 1. Choose the model class\n", + "model = PCA(n_components=2) # 2. Instantiate the model with hyperparameters\n", + "model.fit(X_iris) # 3. Fit to data. Notice y is not specified!\n", + "X_2D = model.transform(X_iris) # 4. Transform the data to two dimensions" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "To plot the results:\n", + "- A quick way to do this is to insert the results into the original Iris ``DataFrame``, \n", + "- use Seaborn's ``lmplot`` to show the results:" + ] + }, + { + "cell_type": "code", + "execution_count": 100, + "metadata": { + "ExecuteTime": { + "end_time": "2018-05-15T12:53:24.796325Z", + "start_time": "2018-05-15T12:53:24.563624Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "sns.set_context(\"talk\", font_scale=1.5)\n", + "iris['PCA1'] = X_2D[:, 0]\n", + "iris['PCA2'] = X_2D[:, 1]\n", + "sns.lmplot(\"PCA1\", \"PCA2\", hue='species', data=iris, fit_reg=False);" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "In the two-dimensional representation, the species are fairly well separated, even though the PCA algorithm had no knowledge of the species labels!\n", + "\n", + "A relatively straightforward classification will probably be effective on the dataset." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "### Unsupervised learning: Iris clustering\n", + "\n", + "Let's next look at applying clustering to the Iris data.\n", + "\n", + "> ### A clustering algorithm attempts to find distinct groups of data without reference to any labels.\n", + "\n", + "We will use a powerful clustering method called a ``Gaussian mixture model (GMM)``\n", + "- more detail in [In Depth: Gaussian Mixture Models](05.12-Gaussian-Mixtures.ipynb).\n", + "- A GMM attempts to model the data as a collection of Gaussian blobs.\n", + "\n", + "We can fit the Gaussian mixture model as follows:" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "metadata": { + "ExecuteTime": { + "end_time": "2018-05-15T07:53:16.203796Z", + "start_time": "2018-05-15T07:53:16.170761Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [], + "source": [ + "from sklearn.mixture import GMM # 1. Choose the model class\n", + "model = GMM(n_components=3,\n", + " covariance_type='full') # 2. Instantiate the model with hyperparameters\n", + "model.fit(X_iris) # 3. Fit to data. Notice y is not specified!\n", + "y_gmm = model.predict(X_iris) # 4. 
Determine cluster labels" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "As before, we will \n", + "- add the cluster label to the Iris ``DataFrame`` and \n", + "- use Seaborn to plot the results:" + ] + }, + { + "cell_type": "code", + "execution_count": 78, + "metadata": { + "ExecuteTime": { + "end_time": "2018-05-15T10:05:36.294395Z", + "start_time": "2018-05-15T10:05:36.289669Z" + }, + "slideshow": { + "slide_type": "skip" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "{'axes.labelsize': 11.0,\n", + " 'axes.titlesize': 12.0,\n", + " 'font.size': 50.0,\n", + " 'grid.linewidth': 1.0,\n", + " 'legend.fontsize': 10.0,\n", + " 'lines.linewidth': 1.75,\n", + " 'lines.markeredgewidth': 0.0,\n", + " 'lines.markersize': 7.0,\n", + " 'patch.linewidth': 0.3,\n", + " 'xtick.labelsize': 10.0,\n", + " 'xtick.major.pad': 7.0,\n", + " 'xtick.major.width': 1.0,\n", + " 'xtick.minor.width': 0.5,\n", + " 'ytick.labelsize': 10.0,\n", + " 'ytick.major.pad': 7.0,\n", + " 'ytick.major.width': 1.0,\n", + " 'ytick.minor.width': 0.5}" + ] + }, + "execution_count": 78, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sns.plotting_context()" + ] + }, + { + "cell_type": "code", + "execution_count": 101, + "metadata": { + "ExecuteTime": { + "end_time": "2018-05-15T12:53:39.330142Z", + "start_time": "2018-05-15T12:53:38.835696Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "sns.set_context(\"talk\", font_scale=1.5)\n", + "iris['cluster'] = y_gmm\n", + "sns.lmplot(\"PCA1\", \"PCA2\", data=iris, hue='species',\n", + " col='cluster', fit_reg=False);" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "Splitting the data by cluster number shows that the GMM algorithm recovered the underlying labels without an expert: \n", + "- the measurements of these flowers are distinct enough\n", + "- we could *automatically* identify the presence of these different groups of species \n", + " - with a simple clustering algorithm!\n", + "- this might further give experts in the field clues as to the relationship between the samples they are observing." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "## Application: Exploring Hand-written Digits" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "source": [ + "In the wild, this problem involves \n", + "- locating characters in an image. \n", + "- identifying characters in an image. \n", + "\n", + "Here we'll take a shortcut and use Scikit-Learn's set of pre-formatted digits, which is built into the library." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "### Loading and visualizing the digits data\n", + "\n", + "We'll use Scikit-Learn's data access interface and take a look at this data:" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": { + "ExecuteTime": { + "end_time": "2018-05-15T07:09:24.416431Z", + "start_time": "2018-05-15T07:09:24.256577Z" + }, + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "(1797, 8, 8)" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn.datasets import load_digits\n", + "digits = load_digits()\n", + "digits.images.shape" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "The images data is a three-dimensional array: \n", + "- 1,797 samples \n", + "- each consisting of an 8 × 8 grid of pixels.\n", + "\n", + "Let's visualize the first hundred of these:" + ] + }, + { + "cell_type": "code", + "execution_count": 102, + "metadata": { + "ExecuteTime": { + "end_time": "2018-05-15T12:59:40.709794Z", + "start_time": "2018-05-15T12:59:38.045852Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import matplotlib.pyplot as plt\n", + "\n", + "fig, axes = plt.subplots(10, 10, figsize=(8, 8),\n", + " subplot_kw={'xticks':[], 'yticks':[]},\n", + " gridspec_kw=dict(hspace=0.1, wspace=0.1))\n", + "\n", + "for i, ax in enumerate(axes.flat):\n", + " ax.imshow(digits.images[i], cmap='binary', interpolation='nearest')\n", + " ax.text(0.05, 0.05, str(digits.target[i]),\n", + " transform=ax.transAxes, color='green')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "In order to work with this data within Scikit-Learn, \n", + "- we need a two-dimensional, ``[n_samples, n_features]`` representation.\n", + "- treating each pixel in the image as a feature: \n", + " - so that we have a length-64 array of pixel values representing each digit.\n", + "- target array gives the previously determined label for each digit.\n", + "\n", + "Features and targets are represented as the ``data`` and ``target`` attributes in the `digits` dataset respectively:" + ] + }, + { + "cell_type": "code", + "execution_count": 107, + "metadata": { + "ExecuteTime": { + "end_time": "2018-05-15T13:02:52.799145Z", + "start_time": "2018-05-15T13:02:52.794763Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "(1797, 64)" + ] + }, + "execution_count": 107, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X = digits.data\n", + "X.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 108, + "metadata": { + "ExecuteTime": { + "end_time": "2018-05-15T13:02:53.880019Z", + "start_time": "2018-05-15T13:02:53.875168Z" + }, + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[ 0., 0., 5., ..., 0., 0., 0.],\n", + " [ 0., 0., 0., ..., 10., 0., 0.],\n", + " [ 0., 0., 0., ..., 16., 9., 0.],\n", + " 
...,\n", + " [ 0., 0., 1., ..., 6., 0., 0.],\n", + " [ 0., 0., 2., ..., 12., 0., 0.],\n", + " [ 0., 0., 10., ..., 12., 1., 0.]])" + ] + }, + "execution_count": 108, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X" + ] + }, + { + "cell_type": "code", + "execution_count": 110, + "metadata": { + "ExecuteTime": { + "end_time": "2018-05-15T13:03:30.604467Z", + "start_time": "2018-05-15T13:03:30.600002Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "(1797,)" + ] + }, + "execution_count": 110, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "y = digits.target\n", + "y.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 111, + "metadata": { + "ExecuteTime": { + "end_time": "2018-05-15T13:03:31.333742Z", + "start_time": "2018-05-15T13:03:31.329466Z" + }, + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "array([0, 1, 2, ..., 8, 9, 8])" + ] + }, + "execution_count": 111, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "y" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "source": [ + "We see here that there are 1,797 samples and 64 features." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "### Unsupervised learning: Dimensionality reduction\n", + "\n", + "We'd like to visualize our points within the 64-dimensional parameter space\n", + "- it's difficult to effectively visualize points in such a high-dimensional space.\n", + "- Instead we'll reduce the dimensions to 2, using an unsupervised method.\n", + "\n", + "Here, we'll make use of a manifold learning algorithm called *Isomap* (see [In-Depth: Manifold Learning](05.10-Manifold-Learning.ipynb)), and transform the data to two dimensions:" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": { + "ExecuteTime": { + "end_time": "2018-05-15T07:09:47.457538Z", + "start_time": "2018-05-15T07:09:45.592768Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "(1797, 2)" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn.manifold import Isomap\n", + "iso = Isomap(n_components=2)\n", + "iso.fit(digits.data)\n", + "data_projected = iso.transform(digits.data)\n", + "data_projected.shape" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "source": [ + "We see that the projected data is now two-dimensional.\n", + "Let's plot this data to see if we can learn anything from its structure:" + ] + }, + { + "cell_type": "code", + "execution_count": 112, + "metadata": { + "ExecuteTime": { + "end_time": "2018-05-15T13:10:41.495284Z", + "start_time": "2018-05-15T13:10:41.273837Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "plt.scatter(data_projected[:, 0], data_projected[:, 1], c=digits.target,\n", + " edgecolor='none', alpha=0.5,\n", + " cmap=plt.cm.get_cmap('nipy_spectral', 10))\n", + "plt.colorbar(label='digit label', ticks=range(10))\n", + "plt.clim(-0.5, 9.5);" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "This plot gives us some good intuition into how well various numbers are separated in the larger 64-dimensional space. \n", + "\n", + "- zeros (in black) and ones (in purple) have very little overlap in parameter space.\n", + " - Intuitively, this makes sense: a zero is empty in the middle of the image, while a one will generally have ink in the middle.\n", + "- There seems to be a more or less continuous spectrum between ones and fours: \n", + " - we can understand this by realizing that some people draw ones with \"hats\" on them, which cause them to look similar to fours.\n", + "\n", + "Overall, however, the different groups appear to be fairly well separated in the parameter space: \n", + "- this tells us that even a very straightforward supervised classification algorithm should perform suitably on this data.\n", + "\n", + "Let's give it a try." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "### Classification on digits\n", + "\n", + "Let's apply a classification algorithm to the digits.\n", + "\n", + "- split the data into a training and testing set\n", + "- fit a Gaussian naive Bayes model" + ] + }, + { + "cell_type": "code", + "execution_count": 117, + "metadata": { + "ExecuteTime": { + "end_time": "2018-05-15T13:15:12.114546Z", + "start_time": "2018-05-15T13:15:12.110268Z" + }, + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [], + "source": [ + "Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, random_state=0)" + ] + }, + { + "cell_type": "code", + "execution_count": 118, + "metadata": { + "ExecuteTime": { + "end_time": "2018-05-15T13:15:12.845573Z", + "start_time": "2018-05-15T13:15:12.836504Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [], + "source": [ + "from sklearn.naive_bayes import GaussianNB\n", + "model = GaussianNB()\n", + "model.fit(Xtrain, ytrain)\n", + "y_model = model.predict(Xtest)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "Now that we have the model's predictions, we can gauge its accuracy by comparing the true values of the test set to the predictions:" + ] + }, + { + "cell_type": "code", + "execution_count": 119, + "metadata": { + "ExecuteTime": { + "end_time": "2018-05-15T13:15:16.281720Z", + "start_time": "2018-05-15T13:15:16.276351Z" + }, + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.8333333333333334" + ] + }, + "execution_count": 119, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn.metrics import accuracy_score\n", + "accuracy_score(ytest, y_model)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + 
"With even this extremely simple model, we find about 83% accuracy for classification of the digits!\n", + "\n", + "However, this single number doesn't tell us *where* we've gone wrong\n", + "- one nice way to do this is to use the *confusion matrix*, \n", + " - which we can compute with Scikit-Learn and plot with Seaborn:" + ] + }, + { + "cell_type": "code", + "execution_count": 123, + "metadata": { + "ExecuteTime": { + "end_time": "2018-05-15T13:15:41.399490Z", + "start_time": "2018-05-15T13:15:41.036420Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from sklearn.metrics import confusion_matrix\n", + "sns.set_context(\"notebook\", font_scale=1.7)\n", + "\n", + "mat = confusion_matrix(ytest, y_model)\n", + "\n", + "sns.heatmap(mat, square=True, annot=True, cbar=False)\n", + "plt.xlabel('predicted value')\n", + "plt.ylabel('true value');" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "This shows us where the mis-labeled points tend to be: \n", + "- a large number of twos here are mis-classified as either ones or eights.\n", + "\n", + "Another way to gain intuition into the characteristics of the model:\n", + "- to plot the inputs again, with their predicted labels.\n", + "- using green for correct labels, and red for incorrect labels:" + ] + }, + { + "cell_type": "code", + "execution_count": 124, + "metadata": { + "ExecuteTime": { + "end_time": "2018-05-15T13:17:10.256934Z", + "start_time": "2018-05-15T13:17:07.529831Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "fig, axes = plt.subplots(10, 10, figsize=(8, 8),\n", + " subplot_kw={'xticks':[], 'yticks':[]},\n", + " gridspec_kw=dict(hspace=0.1, wspace=0.1))\n", + "\n", + "test_images = Xtest.reshape(-1, 8, 8)\n", + "\n", + "for i, ax in enumerate(axes.flat):\n", + " ax.imshow(test_images[i], cmap='binary', interpolation='nearest')\n", + " ax.text(0.05, 0.05, str(y_model[i]),\n", + " transform=ax.transAxes,\n", + " color='green' if (ytest[i] == y_model[i]) else 'red')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "Examining this subset of the data, we can gain insight regarding where the algorithm might not be performing optimally.\n", + "\n", + "To go beyond our 83% classification rate, we might move to a more sophisticated algorithm such as \n", + "- support vector machines (see [In-Depth: Support Vector Machines](05.07-Support-Vector-Machines.ipynb)), \n", + "- random forests (see [In-Depth: Decision Trees and Random Forests](05.08-Random-Forests.ipynb)) \n", + "- other classification approaches." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "## Summary" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "source": [ + "In this section we have covered the essential features of the Scikit-Learn \n", + "- data representation\n", + "- the estimator API.\n", + "\n", + "Regardless of the type of estimator, the same import/instantiate/fit/predict pattern holds.\n", + "\n", + "Armed with this information about the estimator API, you can explore the Scikit-Learn documentation and begin trying out various models on your data.\n", + "\n", + "In the next section, we will explore perhaps the most important topic in machine learning: how to select and validate your model." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "\n", + "< [What Is Machine Learning?](05.01-What-Is-Machine-Learning.ipynb) | [Contents](Index.ipynb) | [Hyperparameters and Model Validation](05.03-Hyperparameters-and-Model-Validation.ipynb) >" + ] + } + ], + "metadata": { + "anaconda-cloud": {}, + "celltoolbar": "Slideshow", + "kernelspec": { + "display_name": "Python [default]", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.4" + }, + "toc": { + "base_numbering": 1, + "nav_menu": {}, + "number_sections": false, + "sideBar": true, + "skip_h1_title": false, + "title_cell": "Table of Contents", + "title_sidebar": "Contents", + "toc_cell": false, + "toc_position": {}, + "toc_section_display": true, + "toc_window_display": false + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/code/09.06-Linear-Regression.ipynb b/code/09.06-Linear-Regression.ipynb new file mode 100755 index 0000000..6147821 --- /dev/null +++ b/code/09.06-Linear-Regression.ipynb @@ -0,0 +1,1886 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "# In Depth: Linear Regression" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "source": [ + "\n", + "\n", + "*This notebook contains an excerpt from the [Python Data Science Handbook](http://shop.oreilly.com/product/0636920034919.do) by Jake VanderPlas; the content is available [on GitHub](https://github.com/jakevdp/PythonDataScienceHandbook).*\n", + "\n", + "*The text is released under the [CC-BY-NC-ND license](https://creativecommons.org/licenses/by-nc-nd/3.0/us/legalcode), and code is 
released under the [MIT license](https://opensource.org/licenses/MIT). If you find this content useful, please consider supporting the work by [buying the book](http://shop.oreilly.com/product/0636920034919.do)!*" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "source": [ + "\n", + "< [In Depth: Naive Bayes Classification](05.05-Naive-Bayes.ipynb) | [Contents](Index.ipynb) | [In-Depth: Support Vector Machines](05.07-Support-Vector-Machines.ipynb) >" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "- Naive Bayes (discussed earlier in [In Depth: Naive Bayes Classification](05.05-Naive-Bayes.ipynb)) is a good starting point for classification tasks\n", + "- linear regression models are a good starting point for regression tasks.\n", + " - can be fit very quickly, and \n", + " - are very interpretable.\n", + " \n", + "The simplest form of a linear regression model (i.e., fitting a straight line to data) \n", + "- Extended to model more complicated data behavior.\n", + "- We will see how linear models can be generalized to account for more complicated patterns in data.\n", + "\n", + "We begin with the standard imports:" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "ExecuteTime": { + "end_time": "2018-12-26T01:48:28.419326Z", + "start_time": "2018-12-26T01:48:26.363658Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [], + "source": [ + "%matplotlib inline\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns; sns.set()\n", + "import numpy as np" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "## Simple Linear Regression\n", + "\n", + "We will start with the most familiar linear regression, a straight-line fit to data.\n", + "A straight-line fit is a model of the form\n", + "$$\n", + "y = ax 
+ b\n", + "$$\n", + "where $a$ is commonly known as the *slope*, and $b$ is commonly known as the *intercept*.\n", + "\n", + "Consider the following data, which is scattered about a line with a slope of 2 and an intercept of -5:" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "ExecuteTime": { + "end_time": "2018-12-26T01:59:07.712292Z", + "start_time": "2018-12-26T01:59:07.534304Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "rng = np.random.RandomState(1)\n", + "x = 10 * rng.rand(50)\n", + "y = 2 * x - 5 + rng.randn(50)\n", + "plt.scatter(x, y);" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "We can use Scikit-Learn's ``LinearRegression`` estimator to fit this data and construct the best-fit line:" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "ExecuteTime": { + "end_time": "2018-12-26T01:59:08.889365Z", + "start_time": "2018-12-26T01:59:08.703013Z" + }, + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAW8AAAD3CAYAAADSftWOAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4xLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvDW2N/gAAIABJREFUeJzt3Wl0XOWd5/Fv7aXdkiXvu2xdsxmMWcxqY4yNSUhIHCAYcBK6M72k52Ryek56OtN9enp6us/09HZmeibdPSSZtJ2whBA6QDfY4AWwgdh4xcS+krxbtlFJLu1SrXdeSDKyXFUqlUq1qH6fN0iqW7eei6Sfrv/1PM/fZlkWIiKSX+zZHoCIiIyewltEJA8pvEVE8pDCW0QkDym8RUTykDNTL+TzdaY8raWyshi/vyedw8l5uubCUGjXXGjXC2O75pqaMlu8x/LiztvpdGR7CBmnay4MhXbNhXa9MH7XnBfhLSIiV1J4i4jkIYW3iEgeUniLiOQhhbeISB5SeIuI5CGFt4hIHsrYIh0RkUIQCEVo7wpQVuLmYH0LlqOZO6+ZkvbXSTq8DcO4HfhL0zRXGoaxFHgdaBh4+B9M03wx7aMTEckTkWiUF7c3cqDeR2tHAKfDRjhiUT3Jyx2La7DZ4i6WTElS4W0YxneBp4HugS8tA/7WNM2/SetoRETy1IvbG3n7o3OXPw9H+ncEWVo3Je3BDcnXvI8DXx7y+TLgc4ZhvGsYxg8NwyhL+8hERHJIIBSh2d9DIBSJ+divPrkY83mHGnwxnzNWSd15m6b5smEY84Z8aQ/wA9M09xmG8Z+BPwH+Y6JzVFYWj2mNf01N4f190DUXhkK75ny73kgkyo9e+4QPj1zA19ZLzaQill8/nWcevg6Hw47P38v//cXHdPaGYz6/pa0Xh9tFTXVJWseV6huWr5im2Tb4MfD3Iz1hLDuJ1dSU4fN1pvz8fKRrLgyFds35eL3PvV1/RTmk2d/Lq++doKs7QHVFEb/cdZJAKHK5xj1c9aQiIsFQSted6A9dqlMFtxiGcdvAx/cD+1I8j4hIzgqEIhyo98V87J2D5/nZjkZcTjvPPHQNK26aEfO45ddPx+NK/86Cqd55/w7w94ZhhICLwL9L35BERHJDe1eASx2BmI9Foha3LK5h49rFlBa5uCM6FZvNxoH6FvydfVSWeVlaV80zD1/HpUvdMc8xFkmHt2map4DlAx/vB+5K+2hERHJIRamHqnIPrTE
CvLzYzW987trLd9UOu50Nq+tYv6KW9q4AFaUePC4HDsf4rIXUCksRkTg8Lgd1syfFfGzZ4hrauwJXzSTxuBxMqSwel1LJUFphKSISQzAU4fUPTrPnaDMALqedcDhKVbmHYq+LQw0+du5voqrcw9K6Gh5ftRCHPXP3wwpvEZkQBpelD5Yr4n0tGR+faOUnW018bX1UlXt4cnUd186vor0rwJY9Z9hx4PzlY1s7Apdno2xYXZfei0pA4S0ieW3osvRLHQGqyj3ctKgaCzjU0HL5a8ncHfs7Azy/rYGPjjVjt9l48LY5fOHueXjd/VFZUerh8PHWmM89UN/C+hW1414uGaTwFpG8NnxZemtHgG37mq44ZqS742jUYtv+c7zy7gn6ghFqZ5azce1iZk8pveK4RLNP/J19tHcFmFJZPNZLSorCW0TyVqJ52LHEujs+eaGDTW+anP60kxKvk6+vW8zdS6Zjj7EfSaLZJ5VlXipKPaldSAoU3iKStxLdCcdyqaOPE03tLJhZQSRi8Yt3j7NjfxMWcNf103h01ULKi91xn+9xOVhaV3PFnf6gpXXVGSuZgMJbRPJYojvhWGw2+KsXDlJa5CQYjhIMRZk+uZin1xgsnluZ1DkeX7UQ4KrFOINfzxSFt4jkLY/LQbHXlXR4Rwe2Huka2ESqdmY5f7DhZpyjWEgTbzFOpmmRjojkrUAoQndvMOZjHpedyeUebEC83bTbOoNEoldvJpWMTC3GiUfhLSJ5q70rgL8zdniHwlG+/ZUlfGVlLfHieXCGSD5S2URE8laimndFiYfX3j/N3mPNcZ+f6Rki6aQ7bxHJW4OzP2Lp7A2y91gz86eXcdu1sRsAZ3qGSDrpzltE8trQ2R+XOvpw2G2EoxYup50n7q9lxU0zsbAoL3ZnfYZIOim8RSSvOex2vnTPAiJRi50HmghHLZZfO5XHVy0cUhKx5cQMkXRSeItI3rIsi32mj+ferqetK8jUyiKeWmtw3byqmMcPzhCZCBTeIpKXmtt6+enWej4+0YrTYeeRu+ezbvkcXGNodJ5PFN4ikldC4Shv7jnD6++fIhSOct28Sp5aYzC1amLcUSdL4S0ieePYaT+bt5pcaO2hosTNVx9axG3XTMEWYxOpiU7hLSI5r6M7yM92NPL+kYvYgFU3z+TL9y6g2Ou6fEyqjRfylcJbRHJW1LJ479B5fr7zON19YeZOLWPjgwbzp5dfPiZWM4ZstCXLNIW3iOSks81dbNpyjONNHXjdDjasXsSqm2dht19ZIonVjCEbbckyTeEtIjmlLxjm1V2n2Lr3LFHL4tbFU/jq/YuoLLt6GXuiZgyZbkuWaQpvEckZB+p9/PTtei51BKiZ5OWpNQY3LJgc9/hcakuWaQpvEcm6lvZennurgYONLTjsNj5/5zw+f8dc3CPcNedSW7JMU3iLSNaEI1He2nuWX+4+STAUZfGcSTy91mD65JKknp9LbckyTeEtIllRf7aNzVtNmnzdlBW7+NraxSy/buqo52znSluyTFN4i0hGdfWGeGlHI+8dvgDAyptmsH5lLSVD5myPRq60Jcs0hbeIZIRlWby95zQ/fPUTunpDzKopZeODBgtnVqTl/BNp06lkKLxFJrh0rjxM9VxNvi42bzGpP9eOx+Xg8VULWX3LrAm9iGa8KbxFJqh0rjxM9VyBUITXdp9iy54zRKIWd9wwnfX3zKeq3DvWyyt4Cm+RCSqdKw9TOdehxhZ++lY9Le19TC738uSaOh64Yz4+X+doL0ViUHiLTCCDZY0ijzNtKw9Hu4rxUkcfz7/dwL56Hw67jYeWz+XhO+fhcU/8NxEzSeEtMgEML2tUlLpp6wrGPHa0Kw+TXcUYiUbZ9tE5Xtl1kkAwwqJZFWxcazCzpjTl65L4FN4iE8Dwska84IbRrzxMZhXj8aZ2Nm0xOdvcRWmRiw0PLeKuG6ZjT3Gf7ULb3jUVCm+RPJeorBHLaFceJlrFeP2CKl7c1sA7B89
jAXcvmc6jK2spK3Ynff6hCnV711QovEXyXKKyxnBet4NH7lkw6tcYvopxUqmH6dXF7K/30dkTYmZ1CU+vNaibPWnU5x6qULd3TYXCWyTPJSprDBcIRujqCVLsGd2v/tBVjI3n2njt/dN8ctKP22nn0ZW1PHDrbJyOsd0ZF/L2rqlI+jtoGMbtwF+aprnSMIyFwI8BCzgCfMs0zej4DFFEEklU1hjOZoMte8+yYfWiUZchgqEI//rBad781WnCEYsbayfz5AN1VE8qSvocfcEwzf6emLXsQt7eNRVJhbdhGN8Fnga6B770t8Afmaa50zCMfwS+CLwyPkMUkZEML2u4XQ76gpGrjotasGN/Ew67bVRliCMnWtm81cTX1kdlmYcnH6hj6aLqpDeRGqxlHz7eis/fG7OWXcjbu6Yi2Tvv48CXgc0Dny8D3hn4+A1gDQpvkawZvjlTabGLl3ce552D54laVx+fbBnC3xnghW0N7D3WjN1mY+1ts/ni3fPxukdXdkmmll3I27umIqnvgGmaLxuGMW/Il2ymaQ7+SHQCI+4sU1lZjNOZ+v/8mpqylJ+br3TNhSHd1zxr4L9PPOhh58HzMY/xd/bhcLuoqY69b3YkavFvu0+y+Y2j9AbCLJ5bye9+5Ubmzxj9JlJ9wTCHj7fGfOzw8VZ+a33R5T8Gv/fYUoqL3Hx45AItbb1UTypi+fXTeebh63CMsaaeTePxc53qG5ZD69tlQNtIT/D7e1J8qf4LL7QltbrmwjCe1xwJRagqi1+GiARDMV/75IUONm0xOX2xkxKvk689aHDPjTOw22ycO9826vnXzf4efP7emI+1tPVy/FTrFbXsR+6ax7rbZl/xOpcudcd8fj4Yy/c4UeinGt4HDMNYaZrmTmAdsCPF84jIOElUhij2OnE6rqxX9/SFeeXdE2zffw4LuPP6aTx230LKS9xEolGe29aQ0vzrVGrZhba9aypSDe/fB541DMMNHAV+nr4hiUi6PL5qIeaZNs42d13x9bPNXby4vZENq+uwLIs9R5t5YVsD7d1BplUV8/Rag2vmVl4+fizzr1XLHh9Jh7dpmqeA5QMf1wMrxmlMIpIm4YhFT18o5mMH6lu4Z8kMfra9gU9O+XE57Xzp3gU8eNscXM7P7qbTMf96cDbM4eOttLT1FkyrsvGkRToiE1iiudOXOvr4s3/eSzhicf2CKp56oC5mqSId868HZ8P81voijp9q1Z4laaDwFpnAEtWbLaCkyMWTq+tYZtTEnbOdzvnXXrdTtew0yd+5NyIyosF6cyyzp5TyF99czi2LpyRcbJPoHCPVrAOhCM3+HgKhqxcMydjozltkgnt0ZS3nmrswz7RhAQ67jWVGDd98+Nqkl8gPX8E5Us063u6Av/fY0nRdVsFTeItMYKcvdrJpi8nJCx143Q7W3jabNbfOoSjGxlSJ9tAevoJzpJp1vNkpxUVuHrlrXtqur5ApvEUmoN5AmH957yRv7zuLZcHya6fy+KqFMevTo9lDO5n514lmp3x45ALrbputNyvTQOEtMoFYlsU+08fz2xrwdwaYWlnEU2sNrptXFfc56d5DO9HslJa2Xu0OmCYKb5EJwtfWy0/fqufw8VacDhtfvHs+Dy2fgyvBnkLjsYd2otkp1ZOKtDtgmii8RfJcOBJly54zvLr7FKFwlGvnVfLUGoNpVSPf3Y7HHtqJVlQuv366SiZpovAWyWPmGT+btphcaO2hvMTNNx5ayO3XTE16n+3x2kM73uyUZx6+Lq83mcolCm+RPNTRE+Sl7Y3sPnIRG3DfzTNZf+8Cir2uUZ1nvPYdiTc7JZ+3dc01Cm+RPBK1LHYdvsBLOxrp7gszZ2opG9cuZsGM8pTPOdo53KOh3QHHj8JbJE+ca+5i0xaTxqZ2vG4HT6xexKqbZ466F+Vwo53DLblB4S2S4/qCYV7ddYqte88StSxuWTyFJ+5fRGVZemdt6C45vyi8RXLYgXofP327nksdAao
rvDy1xmBJ7eRsD0tygMJbJAe1tPfy3FsNHGxswWG38fk75/L5O+bhVjlDBii8RXJIOBLlrY/O8stdJwmGoiyeM4mn1hjMiNMoWAqXwlskRzSca2PTFpMmXzdlxS42rjW447ppSc/ZlsKi8BbJsq7eED/f2ci7hy4AsOKmGaxfUUtp0ejmbEthUXiLZIllWby95ww/fPUIXb0hZtWUsHHtYhbOqsj20CQPKLxFsqCppZvNW0zqz7bhcTl47L6FrL5lFk6tQJQkKbxF0iBRI4Phx73+/ine/NUZIlGL5ddPY/09C5hc4c3gaGUiUHiLjMFoGhkcPt7CT7bW09Lex+RyL08+UMcDd87H5+vM0uglnym8RcYgmUYGlzr6eH5bA/tMHw67jXXL5/CFO+fjcWvOtqRO4S2SopEaGTxyzwJ2HTrPK7tOEghGWDSrgqfXGsyqKc3wSGUiUniLpChRI4NLHX38+aaPuNDaQ4nXyYZ1i7lryXTsmrMtaaLwFklRokYGFnChtYe7l0zn0ZW1lBW7Mz9AmdAU3iIpStTIoMTr5N+vX0Ld7ElZGJkUAoW3yBg8vmoh3X0h9h71EY5EAVg4s5zf/+pNeFz69ZLxo58ukRSFwhH+9YPT7D3aTDhiaRMpySiFt0gKjpxs5Sdb6mlu66WyzMOG1XXcXFetTaQkYxTeIqPQ1hXghW0N7DnajN1mY82ts/ni3fMp8uhXSTJLP3EiSegNhNmy5wxb956lLxihdkY5T681mDO1LNtDkwKl8BZJIBKN8uxrv2af6SMStbABi+dO4juP3YjLoRWSkj3awkwkjp6+MH/6473sOdpMJGoB/fO3j51u46Udx7M7OCl4Cm+RYSzLYs/RT/nesx9yrrk75jEH6lsIhCIZHpnIZ1Q2ERniU38PP9lazycnL+F0xJ854u/so70rwJTK4gyOTuQzCm8RIBSO8savTvP6+6cJR6JcP7+Kx1Yt5H++dCjm8vfKMi8VpZ4sjFSkn8JbCt7RU5fYtLWeTy/1UFHqZsPqOm4xarDZbHGXvy9ZODlh0wWR8Tam8DYMYz/QMfDpSdM0vzH2IYkkL9kONrG0dwf52fYGPvjkU2w2WL1sFl+6d8EVc7YfX7UQgAP1Plo7AthtELXgUEP/3tyxmi6IZELK4W0Yhhewmaa5Mn3DEUnOaDrYDBe1LN45eJ6Xdx6nJxBm3rQyNj5oMG9a+VXHOux2NqyuIxKJsuPAeQYmnXCpM3hV0wWRTLJZlpXSEw3DuB3YBJym/4/A90zT/DDe8eFwxHI69c9MSY9n/+VjXn3vxFVf/8I9C/jmIzfEfd6Jpna+//NDmGf8FHudbFx3DQ/eOR+HPf6bk33BMN/6H9tp9vde9diUyiL+z3dX4XWrAinjIu4P5lh+4nqAvwZ+ACwC3jAMwzBNMxzrYL+/J+UXqqkpK7g+f7rm+AKhCLsPNcV8bPeh86y7bfZVJZTeQJhf7jrJWx+dxbLgtmum8NX7FzGp1MOl1q6Er9fs78EXI7gBWtp6OX6qNeVZJ4X2fS6064WxXXNNTfwVvGMJ73qg0TRNC6g3DKMVmA6cHcM5RUaUqIPN8Cl8lmWxv97Hc2834O8MMKWyiKfXGFw3vyrp10vUdEGzTiRbxhLezwA3AL9rGMYMoBy4kJZRiSSQbJj62nr56Vv1HD7eitNh4wt3zeNzd8zFNcryXaKmC0vrqjXrRLJiLOH9Q+DHhmHson/V8DPxSiYi6TRSmDrsNv71g1O8tvsUwXCUa+ZW8vRag2lVqS+o+WzWSQv+zj4qy7wsrau+/HWRTEs5vE3TDAIb0jgWkaTFC9Oli6r5kx/t4UJrD+XFLr6+bjG3Xzt1zPtsD846Wb+iNuWpiSLppLfIJS8ND1O73cYvd53kr54/iA247+aZrL93AcVeV1pf1+NyaEm85ASFt+Q1l9POsTNtvLSjke6+MHOmlLLxwcUsmHH1nG2RiUThLXnrXHMXm7aaNJ5rx+N28MT
9i1i1bKZWPEpBUHhL3gkEI7y6+yRb954lErW4xajhidV1VJZpyp4UDoW35KxY+5YcaPDx3Fv1tHYEqK7w8tSaOpbUVmd5pCKZp/CWnBNr35Jr5lbS1RviYGMrDruNz90xl8/fOU8zPqRgKbwl57y4vfGKOdytHQF2fXwRAGP2JJ5eazCjuiRbwxPJCQpvySmBUIQD9b6Yj5V6XXz70SXaBEoE9bCUHNPeFYi57B2gJxCiozuY4RGJ5CaFt+QMy7L49Wk/8RZDul0OSovdmR2USI5SeEtCgVCEZn/PuHdKP9/Szff+YTeb3jTjLmXvC0b4lxh7eIsUIhUPJaaxdKoZjUAo0r/P9sCc7aWLqvnSvfP5i8376Qte/QfjQH0L61fUapaJFDyFt8QUa8ZHvLZfqfaRPNjg49nXfk3vQEhXlLiZXOHFYbcTiBHccPV+3SKFSuEtV0k042PonW+qd+f+zgDPv13PR+aVr9He3d8XMhyJ4HbbCQSjVz23ssyj5gciKLwlhmQ71Yzm7hz6SzHb9jXxynsnCAQjOB02wpGre6h++ElzzOAGKPa6VDIRQW9YSgyDnWpiGexUM9Ld+fA3OE+c7+DPfvwRL2xrwGm3sX7FgpjBDcSsdQ/q7g2O+5unIvlAd95ylWTafjX7e5K6O+/pC/HyOyfYeaAJC7jrhmk8et9CPC4HOw80xZ3THY+/K6iatwgKb4ljpLZfI/WRLC9x8+EnF3lheyMd3UFmVJfw9Jo6jDmVl4+L9wfC47IRCMW+K69SzVsEUHhLHCO1/Up0d143p4K/f/ljjp7243baWb9iAWtvm4PTcWWVbvgfiOpJRSypnUzUsti+rynmuJbW1ajmLYLCW0aQqO3X8PCdVOqhotTN3qPNhCMWS2on8+QDddRMKor5/OF/IGrnTaazvZdItP/Nyvc/vni5/u11O7jrhmlq+CsywGZZsf95mm4+X2fKL1RTU4bP15nO4eS8fLrmQCjC3qOf8tr7p/G19VJZ5mHD6kXcXFczqsa/w685EIrg8/eAzUbNpKIJecedT9/ndCi064WxXXNNTVncXyDdeUvKi2wA2roCvLCtgT1Hm7HbbKy5dTZfvHs+RZ6x/2h5XA5mTSkb83lEJiKFdwEbyxL4aNRix4EmfvHucXoDERbMKGfjWoM5UxW2Ipmg8C5go11kM+jUxQ42vWly6mInxR4nG9ca3HvTDOyjKJGIyNgovAtUskvgh+rpC/PKeyfYvv8clgV3XDeVx1YtoqJE27SKZJrCu0AluwQe+vfZ3nusmee3NdDeFWRqVTEb19RxzbyqTA5ZRIZQeBeokRbZDC6Eafb38JOt9Rw5eQmnw84j98xn3e1zcTm1s4JINim8C1SiRTaL50wiFI7ybx+e5o0PTxOOWFw3v4qn1tQxVcvSRXKCwrsADU4NfOSeBcBni2zcLgdgsfvIRd4/cpHBifmlRU6mVRVRXeHN2phF5EoK7wISb2rgn/7GrTz3VgPvH7l4+dihK6q6esNs29eEzWZLOAtFRDJH4V1A4k0NDEeiHGpsGfH5akEmkjv0rlOBSDQ18L1DF+juC494jsFZKCKSfQrvApFoamAkauFyjLzAZugsFBHJLoV3DgiEIjT7e8a1Q0yi7jgAdvvI4T3YiEFEsk817yxKtLdIunlcDhbPrWT3xxdjPh4I9W/D6nU7CAQjeNz9IR0MRa5qxCAi2afwzqJEe4t8+4llaXudcCTKlj1n2Hu0GSBu41+AEq+T7z11MzUD87lT3W1QRMaXwjtLRtpbpC848huIyag/28amLSbnW7opL3bxtXWLmVVTwn/50V5ixbe/M4Db5bgc1uoVKZKbFN5ZMtLeIv6OwJi+OZ09QV7acZxdH1/ABqxcOpP1KxZQ4nURCEWSWhovIrkrpXwwDMMOfB+4EQgAv2maZmM6BzbRjbS3SGW5h8723lGfN2pZ7D58gZ/taKS7L8ycKaU8/aBB7YyKy8ck0x1eRHJbqjd
3jwBe0zTvMAxjOfA3wBfTN6yJb6QA9bqdjLZx0jlfF5u3mDSca8fjdvDV+xdx/7KZMRsrjNQdXkRyW6rhfTfwJoBpmh8ahnFL+oZUONIVoIFghFd3n2Tr3rNEohbLjBqeuH8RVeXx9yIZqTu8iOS2lBoQG4bxA+Bl0zTfGPj8DLDANM2477KFwxHL6VQ4xNIXDOPvCFBZ7sHrHt3f0z2fXOQfXzmMz9/LlKpifufLS7jlmqnjNFIRybC0NyDuAIY2K7QnCm4Av78nxZcqjI7TTqCzvfdyqWSka25t7+O5t+s50NCCw27jc3fM5fN3zsPjcuTt/6tC+D4PV2jXXGjXC2PuHh/3sVTDezfwMPCzgZr3xymeR0YpHIny9kfn+OWukwRCEepmT+LptQYzq0uyPTQRyaBUw/sV4AHDMN6n/7b+G+kbkkB/KaXZ33NFLbqxqZ1Nb5qc83VRWuTiqTV13Hn9NGxq/CtScFIKb9M0o8Bvp3kswmdL5g8fb8Xn76Wq3MP1CyYTjVq8d/gCAPfeOJ2vrFxIaZEry6MVkWzRIp0cE2vJ/DsHzwMws6aEjWsNFs2alK3hiUiOUHjnkERL5os9Tv7wyZsp9upuW0S0JWxOae8KxFxxCf018K7eUIZHJCK5SuGdQ875uom3rbb2HBGRoVQ2yQH+zgDPvV3PPjN2yQS054iIXEnhnUWRaJTt+5r4xXsnCARjd9Hxuh3cvWS69hwRkSsovLPkxPkONm05xplPuyjxOnF6nTGbAJd4naxfURtzcykRKVwK7wzr6Qvx8rsn2Lm/CQu464Zp3Ld0Jn++aV/M4/2dAdq7AmqKICJXUHinUSAUibtDn2VZ/Orop7ywrZGO7iDTJxezca2BMadSzRFEZNQU3mmQqJGww27n00s9bN5q8utTflxOO+tXLGDtbXNwOvpLIWqOICKjpfBOg3iNhCNRi7IiF//24WnCEYsbFkzmyTV1TJlUdNU5hu/tXT2piCW1k/VGpYjEpPAeo0SrIt85eJ5o1GJSqZsNq+tYZtTE3URqeHOE2nmTU2qDJiKFQeE9RokaCUejFnfdMI0Nq+so8vT/r05UF4f+EsqUyuKU2qCJSOFQeI9RokbCFSVunlpj4HE5RqyLi4iMhlIjjkAoQrO/h0Ao9uKZQR6Xg9qZFTEfu/WaKZfvrgfr4q0dASw+q4u/uL0x3UMXkQKgO+9hRnOH3BsI8/Odx9l7rBkAt8tOKBSlqvzKRsKJ6uIH6ltYv6JWM0pEZFQU3sPEmzkCsGF1HdA/Z3vP0U/5f28cIxiKAlBe7GLZ4imsXjaLqnLvFWGcqC7u7+zTIhwRGTWVTYYY6Q55sJTydy8d4p9e/fXl4Abo6AmxY38TOw40XXUXPVgXj0WLcEQkFQrvAYFQhBNN7XHvkC919PHKuyf44x/u4ciJS7gcsaf8DYb8UIOLcGLRIhwRSUXBl02G1rjjNUIAsNttbN17looSNw+tmMPz22K/0RivDDJ8EU5l2ZV1cRGR0Sj48B5e444nErW4/+ZZfOneBTgc/UEeby+SIo/zqs7vwxfhxJvnLSKSjAkX3iMtghl+bLwa91ClRU6+/ZUl1M78rPFvvL1Iir1O/uuP98adqTK4CEdEZCwmTHinsggm0SyQoZx2O7862sy86eWXzxWrDFLsdXK2uevy82LNVBERSYcJE97JTPEbLtHqyKHauoNXnWt4GaTI03/HHYvmcotIuk2I2SbJTPGLxeNycNOi6qRfJ95MkimVxfQGwiPO5RYRSZcJEd7JLIKJpaWtF19bX9Kvk+hcmsstIpk0IcomicofsYIzHImyde9ZXt11kmA4yuI5k3h81SKKPA4cdhto/Sw1AAAGyklEQVR/sXk//hghnSiE1VBBRDJpQoT3aIKz/mwbm7eYNLV0U17s4mvrFrP82qlX7LO9bHFqIay53CKSKRMivGHk4OzsCfLSzuPsOnwBG7By6UzWr1hAidc16nPFo7ncIpIpNsu
yMvJCPl9nyi9UU1OGz5dca4Lh87yjlsXujy/w0o7jdPWGmD2llI1rjbjbuCY6VyaN5ponCl3zxFdo1wtju+aamrLY+3AwAe68hwfs0EUwTb4uNm8xqT/Xjsfl4KurFnL/LbOSbn6gBTUikqvyNrwTLcoJhy1eff8kW/ecJRK1WFZXwxOrF1FV7s32sEVE0iJvwzveohxfWy/nmrtp7ehjcrmXJ9fUcdPC5Odyi4jkg7wM70SLcg41tmK323ho+Vwevmue3jAUkQkpL8N7pD1Jvr3+Bm6o1d22iExcebnCMtFqxqoyD3VzKjM8IhGRzMrL8A6Fo3jdsf/RcLNRo1KJiEx4eVU2sSyLDz65yIvbG+nsCVFS5MRhs9PVG9RqRhEpKCmFt2EYNuAc0DDwpQ9M0/zDtI0qhgut3WzeYnLsTBtul51H76vlgVtmE4laWs0oIgUn1TvvWmC/aZoPp3MwsQRDEX7yxlF+vr2BSNTipoXVbHhgEdUVRQA4HWghjYgUnFTDexkw0zCMHUAv8B3TNM30DeszP37zGB9+8ilV5R6eXF0Xtwu7iEghGXFvE8MwfgP4zrAvfwuYYprmS4Zh3A38nWmatyY6TzgcsZzO0Zc13j98nrPNnXzhnlqKPHlVohcRGau4e5uktDGVYRjFQNg0zeDA503ALNM0454sUxtTTRS65sJQaNdcaNcL47cxVapTBf8E+A8AhmHcCJxNFNxjEQhFuNDSHbeVmYhIIUq1DvHfgZ8YhvE5IAx8PW0jGnDFxlOdAarKRu4GLyJSKFIKb9M0/cDn0jyWK6TSDV5EpFDk5C1sqt3gRUQKRU6GdzLd4AOhCM3+HgW5iBSknJx7l6gb/KRSD1v2nuVwY8tVTRhUCxeRQpGTaTfYDT6WkiIXO/Y30doRwOKzWviL2xszO0gRkSzKyfCG/g7uq2+ZxeRyL3YbTC73ct/SGfT0hWIer1q4iBSSnCybADjsdjasrmP9ilocbheRYIj2rgA7D5yPefxgLVz7nIhIIcjZO+9BHpeD6dUleFyOhE0YKsu8VJTGfkxEZKLJ+fAeKlEtfGldtbaEFZGCkbNlk3gGmy0cqG/B39mnJgwiUpDyLryH1sLVhEFEClXehfcgj8uhNydFpGDlVc1bRET6KbxFRPKQwltEJA8pvEVE8lBKbdBERCS7dOctIpKHFN4iInlI4S0ikocU3iIieUjhLSKShxTeIiJ5SOEtIpKHcnpjKsMw7MD3gRuBAPCbpmlO2GaVhmG4gB8B8wAP8N9M03w1q4PKEMMwpgD7gAdM0zyW7fGMN8Mw/hD4AuAGvm+a5g+zPKRxNfCz/c/0/2xHgG9O5O+zYRi3A39pmuZKwzAWAj8GLOAI8C3TNKNjfY1cv/N+BPCapnkH8J+Av8nyeMbbU0CraZr3AA8C/zvL48mIgV/sfwJ6sz2WTDAMYyVwJ3AXsAKYndUBZcZDgNM0zTuB/wr8eZbHM24Mw/gu8APAO/ClvwX+aOD32gZ8MR2vk+vhfTfwJoBpmh8Ct2R3OOPuJeCPBz62AeEsjiWT/hr4RyB2g9KJZy3wMfAK8BrwenaHkxH1gHPgX9PlQOxO4hPDceDLQz5fBrwz8PEbwOp0vEiuh3c50D7k84hhGDld6hkL0zS7TNPsNAyjDPg58EfZHtN4Mwzj64DPNM0t2R5LBlXTfyPyKPDbwE8Nw7Bld0jjrov+kskx4Fngf2V1NOPINM2XufKPk800zcF9SDqBinS8Tq6HdwdQNuRzu2maE/pu1DCM2cAOYLNpms9lezwZ8AzwgGEYO4GbgE2GYUzL7pDGXSuwxTTNoGmaJtAHxG7OOnF8h/5rrqP/Pax/NgzDO8JzJoqh9e0yoC0dJ8318N5Nf60MwzCW0/9PzQnLMIypwFbgD0zT/FG2x5MJpmnea5rmCtM0VwIHgY2maV7M8rDG2y7gQcMwbIZhzABK6A/0iczPZ/+KvgS4gELpX3hg4H0OgHXAe+k4aa6
XIF6h/67sffprwN/I8njG2/eASuCPDcMYrH2vM02zIN7IKxSmab5uGMa9wB76b6C+ZZpmJMvDGm9/B/zIMIz36J9h8z3TNLuzPKZM+X3gWcMw3MBR+kuiY6YtYUVE8lCul01ERCQGhbeISB5SeIuI5CGFt4hIHlJ4i4jkIYW3iEgeUniLiOSh/w9RSaPnY90jAAAAAABJRU5ErkJggg==\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from sklearn.linear_model import LinearRegression\n", + "model = LinearRegression(fit_intercept=True)\n", + "\n", + "model.fit(x[:, np.newaxis], y)\n", + "\n", + "xfit = np.linspace(0, 10, 1000)\n", + "ytest = 2*xfit -5\n", + "yfit = model.predict(xfit[:, np.newaxis])\n", + "\n", + "plt.scatter(x, y)\n", + "plt.plot(xfit, yfit);" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "The slope and intercept of the data are contained in the model's fit parameters, which in Scikit-Learn are always marked by a trailing underscore.\n", + "Here the relevant parameters are ``coef_`` and ``intercept_``:" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "ExecuteTime": { + "end_time": "2018-12-26T01:59:13.031371Z", + "start_time": "2018-12-26T01:59:13.027600Z" + }, + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Model slope: 2.02720881036\n", + "Model intercept: -4.99857708555\n" + ] + } + ], + "source": [ + "print(\"Model slope: \", model.coef_[0])\n", + "print(\"Model intercept:\", model.intercept_)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "We see that the results are very close to the inputs, as we might hope." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "**Model evaluation for regression**\n", + "\n", + "- RMSE\n", + "- R Square\n", + "\n", + "https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": { + "ExecuteTime": { + "end_time": "2018-12-26T02:32:43.445443Z", + "start_time": "2018-12-26T02:32:43.441417Z" + }, + "code_folding": [], + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [], + "source": [ + "# Root mean square error 均方根误差,亦称标准误差\n", + "# https://en.wikipedia.org/wiki/Root-mean-square_deviation\n", + "def rmse(y_test, y_pred): \n", + " mse = np.mean((y_test - y_pred) ** 2)\n", + " return mse ** 0.5" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": { + "ExecuteTime": { + "end_time": "2018-12-26T02:32:47.989491Z", + "start_time": "2018-12-26T02:32:47.983715Z" + }, + "code_folding": [], + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [], + "source": [ + "# R square\n", + "def R2(y_test, y_pred): \n", + " residuals_sum_of_squares = np.sum((y_pred - y_test)**2)\n", + " total_sum_of_squares = np.sum((y_test - np.mean(y_test))**2)\n", + " return 1 - residuals_sum_of_squares/total_sum_of_squares\n", + "# https://en.wikipedia.org/wiki/Coefficient_of_determination" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": { + "ExecuteTime": { + "end_time": "2018-12-26T02:35:37.441154Z", + "start_time": "2018-12-26T02:35:37.436570Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "RMSE: 0.1584\n", + "R2 score: 0.9992\n" + ] + } + ], + "source": [ + "print('RMSE: %.4f' % rmse(ytest, yfit))\n", + "print('R2 score: %.4f' % R2(ytest, yfit))" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": { + 
"ExecuteTime": { + "end_time": "2018-12-26T06:10:19.758587Z", + "start_time": "2018-12-26T06:10:19.755636Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [], + "source": [ + "from sklearn.metrics import mean_squared_error, r2_score, explained_variance_score" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": { + "ExecuteTime": { + "end_time": "2018-12-26T02:35:47.008382Z", + "start_time": "2018-12-26T02:35:47.002317Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "RMSE: 0.1584\n", + "R2 score: 0.9992\n", + "Variance score: 0.9998\n" + ] + } + ], + "source": [ + "print('RMSE: %.4f' % mean_squared_error(ytest, yfit) ** 0.5)\n", + "print('R2 score: %.4f' % r2_score(ytest, yfit))\n", + "print('Variance score: %.4f' % explained_variance_score(ytest, yfit))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "source": [ + "The ``LinearRegression`` estimator is much more capable than this, however—in addition to simple straight-line fits, it can also handle multidimensional linear models of the form\n", + "$$\n", + "y = a_0 + a_1 x_1 + a_2 x_2 + \\cdots\n", + "$$\n", + "where there are multiple $x$ values.\n", + "Geometrically, this is akin to fitting a plane to points in three dimensions, or fitting a hyper-plane to points in higher dimensions." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "**Building some example data using NumPy**\n", + "\n", + "The multidimensional nature of such regressions makes them more difficult to visualize" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "ExecuteTime": { + "end_time": "2018-12-26T01:53:32.046192Z", + "start_time": "2018-12-26T01:53:32.040784Z" + }, + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [], + "source": [ + "rng = np.random.RandomState(1)\n", + "X = 10 * rng.rand(100, 3)\n", + "y = 0.5 + np.dot(X, [1.5, -2., 1.])\n", + "# $y$ is constructed from three random $x$ values" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "we can use the single ``LinearRegression`` estimator to fit lines, planes, or hyperplanes to our data." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "ExecuteTime": { + "end_time": "2018-12-26T01:52:01.908289Z", + "start_time": "2018-12-26T01:52:01.891454Z" + }, + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.5\n", + "[ 1.5 -2. 1. 
]\n" + ] + } + ], + "source": [ + "model.fit(X, y)\n", + "print(model.intercept_)\n", + "print(model.coef_)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "## Basis Function Regression 基函数回归\n", + "\n", + "One trick you can use to adapt linear regression to nonlinear relationships between variables\n", + "- to transform the data according to *basis functions*.\n", + "\n", + "We have seen one version of this before, in the ``PolynomialRegression`` pipeline used in [Hyperparameters and Model Validation](05.03-Hyperparameters-and-Model-Validation.ipynb) and [Feature Engineering](05.04-Feature-Engineering.ipynb).\n", + "\n", + "The idea is to take our multidimensional linear model:\n", + "$$\n", + "y = a_0 + a_1 x_1 + a_2 x_2 + a_3 x_3 + \\cdots\n", + "$$\n", + "and build the $x_1, x_2, x_3,$ and so on, from our single-dimensional input $x$." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "That is, we let $x_n = f_n(x)$, where $f_n()$ is some function that transforms our data.\n", + "\n", + "For example, if $f_n(x) = x^n$, our model becomes a polynomial regression:\n", + "$$\n", + "y = a_0 + a_1 x + a_2 x^2 + a_3 x^3 + \\cdots\n", + "$$\n", + "\n", + "Notice that this is *still a linear model*\n", + "- the linearity refers to the fact that the coefficients $a_n$ never multiply or divide each other.\n", + "- What we have effectively done is taken our one-dimensional $x$ values and projected them into a higher dimension, so that a linear fit can fit more complicated relationships between $x$ and $y$." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "### Polynomial basis functions 多项式基函数\n", + "\n", + "> polynomial, Synonym: multinomial, 多项式\n", + "\n", + "This polynomial projection is useful enough that it is built into Scikit-Learn, using the ``PolynomialFeatures`` transformer:" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "ExecuteTime": { + "end_time": "2018-05-20T09:39:14.565054Z", + "start_time": "2018-05-20T09:39:14.558498Z" + }, + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[ 2., 4., 8.],\n", + " [ 3., 9., 27.],\n", + " [ 4., 16., 64.]])" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn.preprocessing import PolynomialFeatures\n", + "x = np.array([2, 3, 4])\n", + "poly = PolynomialFeatures(3, include_bias=False)\n", + "poly.fit_transform(x[:, None])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "We see here that the transformer has converted our one-dimensional array into a three-column array (one row per sample, one column per feature) by raising each value to the powers 1, 2, and 3.\n", + "- This new, higher-dimensional data representation can then be plugged into a linear regression.\n", + "- As we saw in [Feature Engineering](05.04-Feature-Engineering.ipynb), the cleanest way to accomplish this is to use a pipeline.\n", + "\n", + "Let's make a 7th-degree polynomial model in this way:" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "ExecuteTime": { + "end_time": "2018-05-20T09:40:06.921714Z", + "start_time": "2018-05-20T09:40:06.917263Z" + }, + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [], + "source": [ + "from sklearn.pipeline import make_pipeline\n", + "poly_model = make_pipeline(PolynomialFeatures(7),\n", + " 
LinearRegression())" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "With this transform in place, we can use the linear model to fit much more complicated relationships between $x$ and $y$. \n", + "\n", + "For example, here is a sine wave with noise:" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "source": [ + "Our linear model, through the use of 7th-order polynomial basis functions, can provide an excellent fit to this non-linear data!" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": { + "ExecuteTime": { + "end_time": "2018-05-20T09:56:35.280127Z", + "start_time": "2018-05-20T09:56:35.146469Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "rng = np.random.RandomState(1)\n", + "x = 10 * rng.rand(50)\n", + "y = np.sin(x) + 0.1 * rng.randn(50)\n", + "xfit = np.linspace(0, 10, 1000)\n", + "\n", + "poly_model.fit(x[:, np.newaxis], y)\n", + "yfit = poly_model.predict(xfit[:, np.newaxis])\n", + "\n", + "plt.scatter(x, y)\n", + "plt.plot(xfit, yfit);" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "### Gaussian basis functions 高斯基函数\n", + "\n", + "Of course, other basis functions are possible.\n", + "For example, one useful pattern is to fit a model that is not a sum of polynomial bases, but a sum of Gaussian bases.\n", + "The result might look something like the following figure:" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "\n", + "\n", + "\n", + "
[figure source in Appendix](#Gaussian-Basis)
\n", + "\n", + "The shaded regions in the plot are the scaled basis functions, and when added together they reproduce the smooth curve through the data.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "\n", + "These Gaussian basis functions are not built into Scikit-Learn, \n", + "- but we can write a custom transformer that will create them\n", + "- Scikit-Learn transformers are implemented as Python classes; \n", + " - reading Scikit-Learn's source is a good way to see how they can be created:" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "The probability density of a normal (Gaussian) distribution with mean $\\mu$ and variance $\\sigma^2$ is shown below; the special case $\\mu = 0$, $\\sigma = 1$ is known as the *standard normal distribution*. Dropping the normalizing constant leaves the unnormalized Gaussian that serves as a basis function in the code that follows.\n", + "\n", + "$$\n", + "f(x \\mid \\mu, \\sigma^2) = \\frac{1}{\\sqrt{2\\pi\\sigma^2} } e^{ -\\frac{(x-\\mu)^2}{2\\sigma^2} } \\propto e^{ -0.5 \\left(\\frac{x-\\mu}{\\sigma}\\right)^2}\n", + "$$" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": { + "ExecuteTime": { + "end_time": "2018-05-20T15:22:00.734658Z", + "start_time": "2018-05-20T15:22:00.710792Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [], + "source": [ + "from sklearn.base import BaseEstimator, TransformerMixin\n", + "\n", + "class GaussianFeatures(BaseEstimator, TransformerMixin):\n", + " \"\"\"Uniformly spaced Gaussian features for one-dimensional input\"\"\"\n", + " def __init__(self, N, sigma_factor=2.0):\n", + " self.N = N\n", + " self.sigma_factor = sigma_factor\n", + " \n", + " @staticmethod\n", + " def _gauss_basis(x, mu, sigma, axis=None):\n", + " arg = (x - mu) / sigma\n", + " return np.exp(-0.5 * np.sum(arg ** 2, axis))\n", + " \n", + " def fit(self, X, y=None):\n", + " # create N centers spread along the data range\n", + " self.mu_ = np.linspace(X.min(), X.max(), self.N)\n", + " self.sigma_ = self.sigma_factor * (self.mu_[1] - self.mu_[0])\n", + " return self\n", + " 
\n", + " def transform(self, X):\n", + " return self._gauss_basis(X[:, :, np.newaxis], self.mu_,\n", + " self.sigma_, axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": { + "ExecuteTime": { + "end_time": "2018-05-20T15:22:01.648815Z", + "start_time": "2018-05-20T15:22:01.503183Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "rng = np.random.RandomState(1)\n", + "x = 10 * rng.rand(50)\n", + "y = np.sin(x) + 0.1 * rng.randn(50)\n", + "xfit = np.linspace(0, 10, 1000)\n", + "\n", + "gauss_model = make_pipeline(GaussianFeatures(20),\n", + " LinearRegression())\n", + "gauss_model.fit(x[:, np.newaxis], y)\n", + "yfit = gauss_model.predict(xfit[:, np.newaxis])\n", + "\n", + "plt.scatter(x, y)\n", + "plt.plot(xfit, yfit)\n", + "plt.xlim(0, 10);" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "There is nothing magic about polynomial basis functions: \n", + "- You should have some sort of intuition about **the generating process of your data**; \n", + "- If you think one basis or another might be appropriate, you can use them as well." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "## Regularization 正则化\n", + "\n", + "The introduction of basis functions into our linear regression makes the model much more flexible, \n", + "- but it also can very quickly lead to over-fitting (refer back to [Hyperparameters and Model Validation](05.03-Hyperparameters-and-Model-Validation.ipynb) for a discussion of this).\n", + "\n", + "For example, if we choose too many Gaussian basis functions, we end up with results that don't look so good:" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": { + "ExecuteTime": { + "end_time": "2018-05-20T15:33:09.422378Z", + "start_time": "2018-05-20T15:33:09.258227Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "model = make_pipeline(GaussianFeatures(30),\n", + " LinearRegression())\n", + "model.fit(x[:, np.newaxis], y)\n", + "\n", + "plt.scatter(x, y)\n", + "plt.plot(xfit, model.predict(xfit[:, np.newaxis]))\n", + "\n", + "plt.xlim(0, 10)\n", + "plt.ylim(-5, 1.5);" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "With the data projected to the 30-dimensional basis, the model has far too much flexibility and goes to extreme values between locations where it is constrained by data.\n", + "\n", + "We can see the reason for this if we plot the coefficients of the Gaussian bases with respect to their locations:" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "metadata": { + "ExecuteTime": { + "end_time": "2018-05-20T15:35:27.115635Z", + "start_time": "2018-05-20T15:35:26.843888Z" + }, + "code_folding": [ + 0 + ], + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "def basis_plot(model, title=None):\n", + " fig, ax = plt.subplots(2, sharex=True)\n", + " model.fit(x[:, np.newaxis], y)\n", + " ax[0].scatter(x, y)\n", + " ax[0].plot(xfit, model.predict(xfit[:, np.newaxis]))\n", + " ax[0].set(xlabel='x', ylabel='y', ylim=(-5, 1.5))\n", + " \n", + " if title:\n", + " ax[0].set_title(title)\n", + "\n", + " ax[1].plot(model.steps[0][1].mu_,\n", + " model.steps[1][1].coef_)\n", + " ax[1].set(xlabel='basis location',\n", + " ylabel='coefficient',\n", + " xlim=(0, 10))\n", + " \n", + "model = make_pipeline(GaussianFeatures(30), LinearRegression())\n", + "basis_plot(model)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "This is typical over-fitting behavior when basis functions overlap: \n", + "- the coefficients of adjacent basis functions blow up and cancel each other out.\n", + "\n", + "We know that such behavior is problematic." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "It would be nice if we could limit such spikes explicitly in the model \n", + "- by **penalizing large values of the model parameters**.\n", + "\n", + "Such a penalty is known as *regularization*, and comes in several forms." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "### Ridge regression ($L_2$ Regularization) 岭回归\n", + "\n", + "\n", + "*ridge regression* or $L_2$ *regularization*, sometimes also called *Tikhonov regularization*.\n", + "- Perhaps the most common form of regularization\n", + "\n", + "This proceeds by penalizing the **sum of squares** (2-norms) of the model coefficients; \n", + "- The penalty on the model fit would be \n", + "$$\n", + "P = \\alpha\\sum_{n=1}^N \\theta_n^2\n", + "$$\n", + "\n", + "where $\\alpha$ is a free parameter that controls the strength of the penalty.\n", + "\n", + "This type of penalized model is built into Scikit-Learn with the ``Ridge`` estimator:" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "metadata": { + "ExecuteTime": { + "end_time": "2018-05-20T15:44:24.582352Z", + "start_time": "2018-05-20T15:44:24.362999Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from sklearn.linear_model import Ridge\n", + "model = make_pipeline(GaussianFeatures(30), Ridge(alpha=0.1))\n", + "basis_plot(model, title='Ridge Regression')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "The $\\alpha$ parameter is essentially a knob controlling the complexity of the resulting model.\n", + "- In the limit $\\alpha \\to 0$, we recover the standard linear regression result; \n", + "- in the limit $\\alpha \\to \\infty$, all model responses will be suppressed.\n", + "\n", + "One advantage of ridge regression in particular is that it can be computed very efficiently\n", + "- at hardly more computational cost than the original linear regression model." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "### Lasso regression ($L_1$ regularization) 套索回归\n", + "\n", + "Lasso regression involves penalizing the **sum of absolute values** (1-norms) of regression coefficients:\n", + "$$\n", + "P = \\alpha\\sum_{n=1}^N |\\theta_n|\n", + "$$\n", + "Though this is conceptually very similar to ridge regression, the results can differ surprisingly: \n", + "- for example, due to geometric reasons lasso regression tends to favor *sparse models* where possible: \n", + " - it preferentially sets model coefficients to exactly zero.\n", + "\n", + "We can see this behavior in duplicating the ridge regression figure, but using L1-normalized coefficients:" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "metadata": { + "ExecuteTime": { + "end_time": "2018-05-20T15:47:04.365611Z", + "start_time": "2018-05-20T15:47:04.159181Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + 
"/Users/datalab/Applications/anaconda/lib/python3.5/site-packages/sklearn/linear_model/coordinate_descent.py:466: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations\n", + " ConvergenceWarning)\n" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from sklearn.linear_model import Lasso\n", + "model = make_pipeline(GaussianFeatures(30), Lasso(alpha=0.001))\n", + "basis_plot(model, title='Lasso Regression')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "With the lasso regression penalty, **the majority of the coefficients are exactly zero**, \n", + "- with the functional behavior being modeled by a small subset of the available basis functions.\n", + "\n", + "As with ridge regularization, the $\\alpha$ parameter tunes the strength of the penalty, and should be determined via, for example, cross-validation (refer back to [Hyperparameters and Model Validation](05.03-Hyperparameters-and-Model-Validation.ipynb) for a discussion of this)." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "## Example: Predicting Bicycle Traffic" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": true, + "slideshow": { + "slide_type": "fragment" + } + }, + "source": [ + "As an example, let's try to predict the number of bicycle trips across Seattle's Fremont Bridge based on weather, season, and other factors.\n", + "\n", + "We have seen this data already in [Working With Time Series](03.11-Working-with-Time-Series.ipynb).\n", + "\n", + "- we will join the bike data with another dataset, and \n", + "- try to determine the extent to which weather and seasonal factors—temperature, precipitation, and daylight hours—affect the volume of bicycle traffic through this corridor.\n", + "\n", + "- the NOAA makes available their daily [weather station data](http://www.ncdc.noaa.gov/cdo-web/search?datasetid=GHCND) (I used station ID USW00024233) \n", + "- we can easily use Pandas to join the two data sources.\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": 
true, + "slideshow": { + 
"slide_type": "subslide" + } + }, + "source": [ + "We will perform a simple linear regression to relate weather and other information to bicycle counts, in order to estimate how a change in any one of these parameters affects the number of riders on a given day.\n", + "\n", + "In particular, this is an example of how the tools of Scikit-Learn can be used in a statistical modeling framework, in which the parameters of the model are assumed to have interpretable meaning.\n", + "\n", + "Let's start by loading the two datasets, indexing by date:" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "collapsed": true, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [], + "source": [ + "# !curl -o FremontBridge.csv https://data.seattle.gov/api/views/65db-xm6k/rows.csv?accessType=DOWNLOAD" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "metadata": { + "ExecuteTime": { + "end_time": "2018-05-20T15:53:35.910674Z", + "start_time": "2018-05-20T15:53:20.663864Z" + }, + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "counts = pd.read_csv('data/Fremont_Bridge.csv', index_col='Date', parse_dates=True)\n", + "weather = pd.read_csv('data/BicycleWeather.csv', index_col='DATE', parse_dates=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "Next we will compute the total daily bicycle traffic, and put this in its own dataframe:" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "metadata": { + "ExecuteTime": { + "end_time": "2018-05-20T15:53:53.996710Z", + "start_time": "2018-05-20T15:53:53.981379Z" + }, + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [], + "source": [ + "daily = counts.resample('d').sum()\n", + "daily['Total'] = daily.sum(axis=1)\n", + "daily = daily[['Total']] # remove other columns" + ] + }, + { + "cell_type": "markdown", + 
"metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "We saw previously that the patterns of use generally vary from day to day; let's account for this in our data by adding binary columns that indicate the day of the week:" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "metadata": { + "ExecuteTime": { + "end_time": "2018-05-20T15:54:09.994337Z", + "start_time": "2018-05-20T15:54:09.942189Z" + }, + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [], + "source": [ + "days = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']\n", + "for i in range(7):\n", + " daily[days[i]] = (daily.index.dayofweek == i).astype(float)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "Similarly, we might expect riders to behave differently on holidays; let's add an indicator of this as well:" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "metadata": { + "ExecuteTime": { + "end_time": "2018-05-20T15:54:34.478168Z", + "start_time": "2018-05-20T15:54:34.445100Z" + }, + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [], + "source": [ + "from pandas.tseries.holiday import USFederalHolidayCalendar\n", + "cal = USFederalHolidayCalendar()\n", + "holidays = cal.holidays('2012', '2016')\n", + "daily = daily.join(pd.Series(1, index=holidays, name='holiday'))\n", + "daily['holiday'].fillna(0, inplace=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "We also might suspect that the hours of daylight would affect how many people ride; let's use the standard astronomical calculation to add this information:" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "metadata": { + "ExecuteTime": { + "end_time": "2018-05-20T15:55:20.848224Z", + "start_time": "2018-05-20T15:55:20.530107Z" + }, + "slideshow": { + "slide_type": "fragment" + } + }, + 
"outputs": [ + { + "data": { + "text/plain": [ + "(8, 17)" + ] + }, + "execution_count": 68, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "def hours_of_daylight(date, axis=23.44, latitude=47.61):\n", + " \"\"\"Compute the hours of daylight for the given date\"\"\"\n", + " days = (date - pd.datetime(2000, 12, 21)).days\n", + " m = (1. - np.tan(np.radians(latitude))\n", + " * np.tan(np.radians(axis) * np.cos(days * 2 * np.pi / 365.25)))\n", + " return 24. * np.degrees(np.arccos(1 - np.clip(m, 0, 2))) / 180.\n", + "\n", + "daily['daylight_hrs'] = list(map(hours_of_daylight, daily.index))\n", + "daily[['daylight_hrs']].plot()\n", + "plt.ylim(8, 17)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "We can also add the average temperature and total precipitation to the data.\n", + "In addition to the inches of precipitation, let's add a flag that indicates whether a day is dry (has zero precipitation):" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "metadata": { + "ExecuteTime": { + "end_time": "2018-05-20T15:55:35.967003Z", + "start_time": "2018-05-20T15:55:35.952760Z" + }, + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [], + "source": [ + "# temperatures are in 1/10 deg C; convert to C\n", + "weather['TMIN'] /= 10\n", + "weather['TMAX'] /= 10\n", + "weather['Temp (C)'] = 0.5 * (weather['TMIN'] + weather['TMAX'])\n", + "\n", + "# precip is in 1/10 mm; convert to inches\n", + "weather['PRCP'] /= 254\n", + "weather['dry day'] = (weather['PRCP'] == 0).astype(int)\n", + "\n", + "daily = daily.join(weather[['PRCP', 'Temp (C)', 'dry day']])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "Finally, let's add a counter that increases from day 1, and measures how many years have passed.\n", + "This will let us measure any observed annual increase or decrease in daily crossings:" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + 
"metadata": { + "ExecuteTime": { + "end_time": "2018-05-20T15:55:51.546978Z", + "start_time": "2018-05-20T15:55:51.528230Z" + }, + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [], + "source": [ + "daily['annual'] = (daily.index - daily.index[0]).days / 365." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "Now our data is in order, and we can take a look at it:" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "metadata": { + "ExecuteTime": { + "end_time": "2018-05-20T15:56:04.238306Z", + "start_time": "2018-05-20T15:56:04.217949Z" + }, + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
TotalMonTueWedThuFriSatSunholidaydaylight_hrsPRCPTemp (C)dry dayannual
Date
2012-10-033521.00.00.01.00.00.00.00.00.011.2773590.013.351.00.000000
2012-10-043475.00.00.00.01.00.00.00.00.011.2191420.013.601.00.002740
2012-10-053148.00.00.00.00.01.00.00.00.011.1610380.015.301.00.005479
2012-10-062006.00.00.00.00.00.01.00.00.011.1030560.015.851.00.008219
2012-10-072142.00.00.00.00.00.00.01.00.011.0452080.015.851.00.010959
\n", + "
" + ], + "text/plain": [ + " Total Mon Tue Wed Thu Fri Sat Sun holiday daylight_hrs \\\n", + "Date \n", + "2012-10-03 3521.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 11.277359 \n", + "2012-10-04 3475.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 11.219142 \n", + "2012-10-05 3148.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 11.161038 \n", + "2012-10-06 2006.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 11.103056 \n", + "2012-10-07 2142.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 11.045208 \n", + "\n", + " PRCP Temp (C) dry day annual \n", + "Date \n", + "2012-10-03 0.0 13.35 1.0 0.000000 \n", + "2012-10-04 0.0 13.60 1.0 0.002740 \n", + "2012-10-05 0.0 15.30 1.0 0.005479 \n", + "2012-10-06 0.0 15.85 1.0 0.008219 \n", + "2012-10-07 0.0 15.85 1.0 0.010959 " + ] + }, + "execution_count": 71, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "daily.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "With this in place, we can choose the columns to use, and fit a linear regression model to our data.\n", + "We will set ``fit_intercept = False``, because the daily flags essentially operate as their own day-specific intercepts:" + ] + }, + { + "cell_type": "code", + "execution_count": 72, + "metadata": { + "ExecuteTime": { + "end_time": "2018-05-20T15:56:39.750887Z", + "start_time": "2018-05-20T15:56:39.734285Z" + }, + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [], + "source": [ + "# Drop any rows with null values\n", + "daily.dropna(axis=0, how='any', inplace=True)\n", + "\n", + "column_names = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun', 'holiday',\n", + " 'daylight_hrs', 'PRCP', 'dry day', 'Temp (C)', 'annual']\n", + "X = daily[column_names]\n", + "y = daily['Total']\n", + "\n", + "model = LinearRegression(fit_intercept=False)\n", + "model.fit(X, y)\n", + "daily['predicted'] = model.predict(X)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" 
+ } + }, + "source": [ + "Finally, we can compare the total and predicted bicycle traffic visually:" + ] + }, + { + "cell_type": "code", + "execution_count": 73, + "metadata": { + "ExecuteTime": { + "end_time": "2018-05-20T15:56:48.137178Z", + "start_time": "2018-05-20T15:56:47.862115Z" + }, + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "daily[['Total', 'predicted']].plot(alpha=0.5);" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "It is evident that we have missed some key features, especially during the summer time.\n", + "\n", + "- Either our features are not complete\n", + " - i.e., people decide whether to ride to work based on more than just these\n", + "- or there are some nonlinear relationships that we have failed to take into account \n", + " - e.g., perhaps people ride less at both high and low temperatures\n", + "\n", + "Nevertheless, our rough approximation is enough to give us some insights, and we can take a look at the coefficients of the linear model to estimate how much each feature contributes to the daily bicycle count:" + ] + }, + { + "cell_type": "code", + "execution_count": 74, + "metadata": { + "ExecuteTime": { + "end_time": "2018-05-20T15:58:02.441342Z", + "start_time": "2018-05-20T15:58:02.435225Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "Mon 504.882756\n", + "Tue 610.233936\n", + "Wed 592.673642\n", + "Thu 482.358115\n", + "Fri 177.980345\n", + "Sat -1103.301710\n", + "Sun -1133.567246\n", + "holiday -1187.401381\n", + "daylight_hrs 128.851511\n", + "PRCP -664.834882\n", + "dry day 547.698592\n", + "Temp (C) 65.162791\n", + "annual 26.942713\n", + "dtype: float64" + ] + }, + "execution_count": 74, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "params = pd.Series(model.coef_, index=X.columns)\n", + "params" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "These numbers are difficult to interpret without some measure of their uncertainty.\n", + "We can compute these uncertainties quickly using bootstrap resamplings of the data:" + ] + }, 
+ { + "cell_type": "code", + "execution_count": 75, + "metadata": { + "ExecuteTime": { + "end_time": "2018-05-20T15:58:23.047893Z", + "start_time": "2018-05-20T15:58:20.770355Z" + }, + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [], + "source": [ + "from sklearn.utils import resample\n", + "np.random.seed(1)\n", + "err = np.std([model.fit(*resample(X, y)).coef_\n", + " for i in range(1000)], 0)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "With these errors estimated, let's again look at the results:" + ] + }, + { + "cell_type": "code", + "execution_count": 76, + "metadata": { + "ExecuteTime": { + "end_time": "2018-05-20T15:58:37.008473Z", + "start_time": "2018-05-20T15:58:37.001643Z" + }, + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " effect error\n", + "Mon 505.0 86.0\n", + "Tue 610.0 83.0\n", + "Wed 593.0 83.0\n", + "Thu 482.0 85.0\n", + "Fri 178.0 81.0\n", + "Sat -1103.0 80.0\n", + "Sun -1134.0 83.0\n", + "holiday -1187.0 163.0\n", + "daylight_hrs 129.0 9.0\n", + "PRCP -665.0 62.0\n", + "dry day 548.0 33.0\n", + "Temp (C) 65.0 4.0\n", + "annual 27.0 18.0\n" + ] + } + ], + "source": [ + "print(pd.DataFrame({'effect': params.round(0),\n", + " 'error': err.round(0)}))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "- We first see that there is a relatively stable trend in the weekly baseline: \n", + " - there are many more riders on weekdays than on weekends and holidays.\n", + "- We see that for each additional hour of daylight, 129 ± 9 more people choose to ride; \n", + "- a temperature increase of one degree Celsius encourages 65 ± 4 people to grab their bicycle; \n", + "- a dry day means an average of 548 ± 33 more riders, and each inch of precipitation means 665 ± 62 more people leave their bike 
at home.\n", + "\n", + "Once all these effects are accounted for, we see a modest increase of 27 ± 18 new daily riders each year.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "- Our model is almost certainly missing some relevant information. \n", + " - For example, nonlinear effects \n", + " - such as effects of precipitation *and* cold temperature \n", + " - nonlinear trends within each variable \n", + " - such as disinclination to ride at very cold and very hot temperatures\n", + "- Additionally, we have thrown away some of the finer-grained information\n", + " - such as the difference between a rainy morning and a rainy afternoon, \n", + "- and we have ignored correlations between days\n", + " - such as the possible effect of a rainy Tuesday on Wednesday's numbers, \n", + " - or the effect of an unexpected sunny day after a streak of rainy days.\n", + " \n", + "These are all potentially interesting effects, and you now have the tools to begin exploring them if you wish!" 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "\n", + "< [In Depth: Naive Bayes Classification](05.05-Naive-Bayes.ipynb) | [Contents](Index.ipynb) | [In-Depth: Support Vector Machines](05.07-Support-Vector-Machines.ipynb) >" + ] + } + ], + "metadata": { + "anaconda-cloud": {}, + "celltoolbar": "Slideshow", + "kernelspec": { + "display_name": "Python [default]", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.4" + }, + "latex_envs": { + "LaTeX_envs_menu_present": true, + "autoclose": false, + "autocomplete": true, + "bibliofile": "biblio.bib", + "cite_by": "apalike", + "current_citInitial": 1, + "eqLabelWithNumbers": true, + "eqNumInitial": 1, + "hotkeys": { + "equation": "Ctrl-E", + "itemize": "Ctrl-I" + }, + "labels_anchors": false, + "latex_user_defs": false, + "report_style_numbering": false, + "user_envs_cfg": false + }, + "toc": { + "base_numbering": 1, + "nav_menu": {}, + "number_sections": false, + "sideBar": false, + "skip_h1_title": false, + "title_cell": "Table of Contents", + "title_sidebar": "Contents", + "toc_cell": false, + "toc_position": { + "height": "268px", + "left": "1058px", + "top": "113px", + "width": "180px" + }, + "toc_section_display": true, + "toc_window_display": true + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/code/09.06-Support-Vector-Machines.ipynb b/code/09.07-Support-Vector-Machines.ipynb similarity index 100% rename from code/09.06-Support-Vector-Machines.ipynb rename to code/09.07-Support-Vector-Machines.ipynb diff --git a/code/09.08-Random-Forests.ipynb b/code/09.08-Random-Forests.ipynb index 6f7a629..ac229a0 100755 --- a/code/09.08-Random-Forests.ipynb +++ b/code/09.08-Random-Forests.ipynb 
@@ -20,7 +20,7 @@ }, "source": [ "\n", - "\n", + "\n", "*This notebook contains an excerpt from the [Python Data Science Handbook](http://shop.oreilly.com/product/0636920034919.do) by Jake VanderPlas; the content is available [on GitHub](https://github.com/jakevdp/PythonDataScienceHandbook).*\n", "\n", "*The text is released under the [CC-BY-NC-ND license](https://creativecommons.org/licenses/by-nc-nd/3.0/us/legalcode), and code is released under the [MIT license](https://opensource.org/licenses/MIT). If you find this content useful, please consider supporting the work by [buying the book](http://shop.oreilly.com/product/0636920034919.do)!*" @@ -1136,25 +1136,6 @@ "toc_position": {}, "toc_section_display": true, "toc_window_display": false - }, - "widgets": { - "state": { - "1db86dba92ab4806b92380c277d1ab05": { - "views": [ - { - "cell_index": 29 - } - ] - }, - "bb9f05972cf34f0d9784403cf321e070": { - "views": [ - { - "cell_index": 23 - } - ] - } - }, - "version": "1.2.0" } }, "nbformat": 4, diff --git a/code/09.09-machine-learning-summary.ipynb b/code/09.09-machine-learning-summary.ipynb new file mode 100644 index 0000000..285bad0 --- /dev/null +++ b/code/09.09-machine-learning-summary.ipynb @@ -0,0 +1,3055 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "***\n", + "***\n", + "# 计算传播与机器学习\n", + "\n", + "***\n", + "***\n", + "\n", + "王成军\n", + "\n", + "wangchengjun@nju.edu.cn\n", + "\n", + "计算传播网 http://computational-communication.com" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "![](./img/machine.jpg)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "## 1、 监督式学习\n", + "\n", + "工作机制:\n", + "- 这个算法由一个目标变量或结果变量(或因变量)组成。\n", + "- 这些变量由已知的一系列预示变量(自变量)预测而来。\n", + "- 利用这一系列变量,我们生成一个将输入值映射到期望输出值的函数。\n", + "- 
这个训练过程会一直持续,直到模型在训练数据上获得期望的精确度。\n", + "- 监督式学习的例子有:回归、决策树、随机森林、K – 近邻算法、逻辑回归等。" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "## 2、非监督式学习\n", + "\n", + "工作机制:\n", + "- 在这个算法中,没有任何目标变量或结果变量要预测或估计。\n", + "- 这个算法用在不同的组内聚类分析。\n", + "- 这种分析方式被广泛地用来细分客户,根据干预的方式分为不同的用户组。\n", + "- 非监督式学习的例子有:关联算法和 K–均值算法。" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "## 3、强化学习\n", + "\n", + "工作机制:\n", + "- 这个算法训练机器进行决策。\n", + "- 它是这样工作的:机器被放在一个能让它通过反复试错来训练自己的环境中。\n", + "- 机器从过去的经验中进行学习,并且尝试利用了解最透彻的知识作出精确的商业判断。 \n", + "- 强化学习的例子有马尔可夫决策过程和 AlphaGo。\n", + "\n", + "> Chess. Here, the agent decides upon a series of moves depending on the state of the board (the environment), and the\n", + "reward can be defined as win or lose at the end of the game:" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "- 线性回归\n", + "- 逻辑回归\n", + "- 决策树\n", + "- SVM\n", + "- 朴素贝叶斯\n", + "---\n", + "- K最近邻算法\n", + "- K均值算法\n", + "- 随机森林算法\n", + "- 降维算法\n", + "- Gradient Boost 和 Adaboost 算法\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "> # 使用sklearn做线性回归\n", + "***\n", + "\n", + "王成军\n", + "\n", + "wangchengjun@nju.edu.cn\n", + "\n", + "计算传播网 http://computational-communication.com" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "# 线性回归\n", + "- 通常用于估计连续性变量的实际数值(房价、呼叫次数、总销售额等)。\n", + "- 通过拟合最佳直线来建立自变量X和因变量Y的关系。\n", + "- 这条最佳直线叫做回归线,并且用 $Y= \\beta *X + C$ 这条线性等式来表示。\n", + "- 系数 $\\beta$ 和 C 可以通过最小二乘法获得" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "ExecuteTime": { + "end_time": 
"2019-04-22T08:22:22.109042Z", + "start_time": "2019-04-22T08:22:20.811040Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [], + "source": [ + "%matplotlib inline\n", + "import sklearn\n", + "from sklearn import datasets\n", + "from sklearn import linear_model\n", + "import matplotlib.pyplot as plt\n", + "from sklearn.metrics import classification_report\n", + "from sklearn.preprocessing import scale" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "ExecuteTime": { + "end_time": "2019-04-22T08:22:24.400103Z", + "start_time": "2019-04-22T08:22:24.390296Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [], + "source": [ + "# boston data\n", + "boston = datasets.load_boston()\n", + "y = boston.target\n", + "X = boston.data" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "ExecuteTime": { + "end_time": "2019-04-22T08:22:25.362696Z", + "start_time": "2019-04-22T08:22:25.356162Z" + }, + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD',\n", + " 'TAX', 'PTRATIO', 'B', 'LSTAT'], dtype='|t| [0.025 0.975]\n", + "-----------------------------------------------------------------------------------\n", + "Intercept 36.4595 5.103 7.144 0.000 26.432 46.487\n", + "boston.data[0] -0.1080 0.033 -3.287 0.001 -0.173 -0.043\n", + "boston.data[1] 0.0464 0.014 3.382 0.001 0.019 0.073\n", + "boston.data[2] 0.0206 0.061 0.334 0.738 -0.100 0.141\n", + "boston.data[3] 2.6867 0.862 3.118 0.002 0.994 4.380\n", + "boston.data[4] -17.7666 3.820 -4.651 0.000 -25.272 -10.262\n", + "boston.data[5] 3.8099 0.418 9.116 0.000 2.989 4.631\n", + "boston.data[6] 0.0007 0.013 0.052 0.958 -0.025 0.027\n", + "boston.data[7] -1.4756 0.199 -7.398 0.000 -1.867 -1.084\n", + "boston.data[8] 0.3060 0.066 4.613 0.000 0.176 0.436\n", + "boston.data[9] -0.0123 0.004 -3.280 0.001 -0.020 
-0.005\n", + "boston.data[10] -0.9527 0.131 -7.283 0.000 -1.210 -0.696\n", + "boston.data[11] 0.0093 0.003 3.467 0.001 0.004 0.015\n", + "boston.data[12] -0.5248 0.051 -10.347 0.000 -0.624 -0.425\n", + "==============================================================================\n", + "Omnibus: 178.041 Durbin-Watson: 1.078\n", + "Prob(Omnibus): 0.000 Jarque-Bera (JB): 783.126\n", + "Skew: 1.521 Prob(JB): 8.84e-171\n", + "Kurtosis: 8.281 Cond. No. 1.51e+04\n", + "==============================================================================\n", + "\n", + "Warnings:\n", + "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n", + "[2] The condition number is large, 1.51e+04. This might indicate that there are\n", + "strong multicollinearity or other numerical problems.\n" + ] + } + ], + "source": [ + "import numpy as np\n", + "import statsmodels.api as sm\n", + "import statsmodels.formula.api as smf\n", + "\n", + "# Fit an OLS regression of boston.target on all columns of boston.data\n", + "results = smf.ols('boston.target ~ boston.data', data=boston).fit()\n", + "\n", + "print(results.summary())" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "ExecuteTime": { + "end_time": "2019-04-22T08:22:29.198868Z", + "start_time": "2019-04-22T08:22:29.179869Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [], + "source": [ + "regr = linear_model.LinearRegression()\n", + "lm = regr.fit(boston.data, y)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "ExecuteTime": { + "end_time": "2019-04-22T08:22:30.210025Z", + "start_time": "2019-04-22T08:22:30.203639Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "(36.45948838508965,\n", + " array([-1.08011358e-01, 4.64204584e-02, 2.05586264e-02, 2.68673382e+00,\n", + " -1.77666112e+01, 3.80986521e+00, 6.92224640e-04, -1.47556685e+00,\n", + " 
3.06049479e-01, -1.23345939e-02, -9.52747232e-01, 9.31168327e-03,\n", + " -5.24758378e-01]),\n", + " 0.7406426641094095)" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "lm.intercept_, lm.coef_, lm.score(boston.data, y)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "ExecuteTime": { + "end_time": "2019-04-22T08:22:31.110418Z", + "start_time": "2019-04-22T08:22:31.107129Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [], + "source": [ + "predicted = regr.predict(boston.data)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "ExecuteTime": { + "end_time": "2019-04-22T08:22:32.479326Z", + "start_time": "2019-04-22T08:22:31.916490Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "fig, ax = plt.subplots()\n", + "ax.scatter(y, predicted)\n", + "ax.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=4)\n", + "ax.set_xlabel('$Measured$', fontsize = 20)\n", + "ax.set_ylabel('$Predicted$', fontsize = 20)\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "## 训练集和测试集" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "ExecuteTime": { + "end_time": "2019-04-22T08:22:36.365683Z", + "start_time": "2019-04-22T08:22:36.360788Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[6.3200e-03, 1.8000e+01, 2.3100e+00, ..., 1.5300e+01, 3.9690e+02,\n", + " 4.9800e+00],\n", + " [2.7310e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9690e+02,\n", + " 9.1400e+00],\n", + " [2.7290e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9283e+02,\n", + " 4.0300e+00],\n", + " ...,\n", + " [6.0760e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,\n", + " 5.6400e+00],\n", + " [1.0959e-01, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9345e+02,\n", + " 6.4800e+00],\n", + " [4.7410e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,\n", + " 7.8800e+00]])" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "boston.data" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "ExecuteTime": { + "end_time": "2019-04-22T08:22:48.265456Z", + "start_time": "2019-04-22T08:22:48.261247Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [], + "source": [ + "from sklearn.model_selection import train_test_split\n", + "Xs_train, Xs_test, y_train, y_test = train_test_split(boston.data,\n", + " boston.target, \n", + " test_size=0.2, \n", + " random_state=42)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "ExecuteTime": { + 
"end_time": "2019-04-22T08:22:51.873960Z", + "start_time": "2019-04-22T08:22:51.869286Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [], + "source": [ + "regr = linear_model.LinearRegression()\n", + "lm = regr.fit(Xs_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "ExecuteTime": { + "end_time": "2019-04-22T08:22:52.561738Z", + "start_time": "2019-04-22T08:22:52.555669Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "(30.24675099392396,\n", + " array([-1.13055924e-01, 3.01104641e-02, 4.03807204e-02, 2.78443820e+00,\n", + " -1.72026334e+01, 4.43883520e+00, -6.29636221e-03, -1.44786537e+00,\n", + " 2.62429736e-01, -1.06467863e-02, -9.15456240e-01, 1.23513347e-02,\n", + " -5.08571424e-01]),\n", + " 0.7508856358979673)" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "lm.intercept_, lm.coef_, lm.score(Xs_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "ExecuteTime": { + "end_time": "2019-04-22T08:22:53.518402Z", + "start_time": "2019-04-22T08:22:53.515220Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [], + "source": [ + "predicted = regr.predict(Xs_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "ExecuteTime": { + "end_time": "2019-04-22T08:22:54.585839Z", + "start_time": "2019-04-22T08:22:54.380438Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "fig, ax = plt.subplots()\n", + "ax.scatter(y_test, predicted)\n", + "ax.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=4)\n", + "ax.set_xlabel('$Measured$', fontsize = 20)\n", + "ax.set_ylabel('$Predicted$', fontsize = 20)\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "# 交叉验证" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "# cross-validation \n", + " \n", + "k-fold CV, the training set is split into k smaller sets (other approaches are described below, but generally follow the same principles). The following procedure is followed for each of the k “folds”:\n", + "- A model is trained using k-1 of the folds as training data;\n", + "- the resulting model is validated on the remaining part of the data (i.e., it is used as a test set to compute a performance measure such as accuracy)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "ExecuteTime": { + "end_time": "2019-04-22T08:23:06.421218Z", + "start_time": "2019-04-22T08:23:06.407755Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "-1.5841985220997412" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn.model_selection import cross_val_score\n", + "\n", + "regr = linear_model.LinearRegression()\n", + "scores = cross_val_score(regr, boston.data , boston.target, cv = 3)\n", + "scores.mean() " + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "ExecuteTime": { + "end_time": "2019-04-22T08:24:03.323654Z", + "start_time": "2019-04-22T08:24:01.612164Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "scores = [cross_val_score(regr, boston.data,\\\n", + " boston.target,\\\n", + " cv = int(i)).mean() \\\n", + " for i in range(3, 50)]\n", + "plt.plot(range(3, 50), scores,'r-o')\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "ExecuteTime": { + "end_time": "2019-04-22T08:24:34.174960Z", + "start_time": "2019-04-22T08:24:34.155764Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.45059442471362826" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data_X_scale = scale(boston.data)\n", + "scores = cross_val_score(regr,data_X_scale, boston.target,\\\n", + " cv = 7)\n", + "scores.mean() " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "# 使用天涯bbs数据" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "ExecuteTime": { + "end_time": "2019-04-22T08:24:46.198546Z", + "start_time": "2019-04-22T08:24:46.171912Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
titlelinkauthorauthor_pageclickreplytime
0【民间语文第161期】宁波px启示:船进港湾人应上岸/post-free-2849477-1.shtml贾也http://www.tianya.cn/5049945019467527032012-10-29 07:59
1宁波镇海PX项目引发群体上访 当地政府发布说明(转载)/post-free-2839539-1.shtml无上卫士ABChttp://www.tianya.cn/743418358824410412012-10-24 12:41
\n", + "
" + ], + "text/plain": [ + " title link author \\\n", + "0 【民间语文第161期】宁波px启示:船进港湾人应上岸 /post-free-2849477-1.shtml 贾也 \n", + "1 宁波镇海PX项目引发群体上访 当地政府发布说明(转载) /post-free-2839539-1.shtml 无上卫士ABC \n", + "\n", + " author_page click reply time \n", + "0 http://www.tianya.cn/50499450 194675 2703 2012-10-29 07:59 \n", + "1 http://www.tianya.cn/74341835 88244 1041 2012-10-24 12:41 " + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd\n", + "\n", + "df = pd.read_csv('../data/tianya_bbs_threads_list.txt', sep = \"\\t\", header=None)\n", + "df=df.rename(columns = {0:'title', 1:'link', 2:'author',3:'author_page', 4:'click', 5:'reply', 6:'time'})\n", + "df[:2]" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "ExecuteTime": { + "end_time": "2019-04-22T08:24:47.185301Z", + "start_time": "2019-04-22T08:24:47.169337Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [], + "source": [ + "# 定义这个函数的目的是让读者感受到:\n", + "# 抽取不同的样本,得到的结果完全不同。\n", + "def randomSplit(dataX, dataY, num):\n", + " dataX_train = []\n", + " dataX_test = []\n", + " dataY_train = []\n", + " dataY_test = []\n", + " import random\n", + " test_index = random.sample(range(len(df)), num)\n", + " for k in range(len(dataX)):\n", + " if k in test_index:\n", + " dataX_test.append([dataX[k]])\n", + " dataY_test.append(dataY[k])\n", + " else:\n", + " dataX_train.append([dataX[k]])\n", + " dataY_train.append(dataY[k])\n", + " return dataX_train, dataX_test, dataY_train, dataY_test, " + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": { + "ExecuteTime": { + "end_time": "2019-04-22T08:24:48.122580Z", + "start_time": "2019-04-22T08:24:48.081523Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Variance score: 0.42\n" + ] + } + ], + "source": [ + "import numpy as np\n", + "\n", + "# Use only 
one feature\n", + "data_X = df.reply\n", + "# Split the data into training/testing sets\n", + "data_X_train, data_X_test, data_y_train, data_y_test = randomSplit(np.log(df.click+1), \n", + " np.log(df.reply+1), 20)\n", + "# Create linear regression object\n", + "regr = linear_model.LinearRegression()\n", + "# Train the model using the training sets\n", + "regr.fit(data_X_train, data_y_train)\n", + "# Explained variance score: 1 is perfect prediction\n", + "print('Variance score: %.2f' % regr.score(data_X_test, data_y_test))" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": { + "ExecuteTime": { + "end_time": "2019-04-22T08:24:49.133689Z", + "start_time": "2019-04-22T08:24:49.129343Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[[12.179091917198399], [11.387872315966666], [11.323941765302724]]" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data_X_train[:3]\n" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": { + "ExecuteTime": { + "end_time": "2019-04-22T08:24:50.276495Z", + "start_time": "2019-04-22T08:24:50.273286Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [], + "source": [ + "y_true, y_pred = data_y_test, regr.predict(data_X_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": { + "ExecuteTime": { + "end_time": "2019-04-22T08:24:51.151351Z", + "start_time": "2019-04-22T08:24:50.992991Z" + }, + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAW4AAAD8CAYAAABXe05zAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4xLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvDW2N/gAADT1JREFUeJzt3UGIo/d5x/HfT7MbEjWBwM4cjNfzvimUQJtDzAqXYujBEFhMaHvoIUH1KUVgCDi0UGp0ykHXkLMgpinzkhBwDsWkBEM3BEPiROPaIfamJQ3W1CWwE0JIFkFLsk8PO7vZXc+MXs3onVeP9P2AYEd6X73P/r3+8vJKIzkiBADIo9P2AACAxRBuAEiGcANAMoQbAJIh3ACQDOEGgGQINwAkQ7gBIBnCDQDJXGriSbe3t6MsyyaeGgDW0v7+/i8iYqfOto2EuyxLTSaTJp4aANaS7WndbblUAgDJEG4ASIZwA0AyhBsAkiHcAJAM4QaAZAg3AJxDVVUqy1KdTkdlWaqqqsaP2cj7uAFgE1RVpcFgoNlsJkmaTqcaDAaSpH6/39hxOeMGgDMaDof3o33PbDbTcDhs9LiEGwDO6ODgYKH7l4VwA8AZ7e7uLnT/stQOt+0t2/9u+5UmBwKALEajkbrd7kP3dbtdjUajRo+7yBn3C5JuNjUIAGTT7/c1Ho9VFIVsqygKjcfjRl+YlCRHxPyN7KuSvippJOnvIuLTp23f6/WCTwcEgPps70dEr862dc+4vyzpHyTdOfNUAIClmBtu25+WdCsi9udsN7A9sT05PDxc2oAAgIfVOeN+WtJf2H5X0tclPWN779GNImIcEb2I6O3s1PoSBwDAGcwNd0S8GBFXI6KU9BlJ/xYRf9P4ZACAY/E+bgBIZqHPKomI70j6TiOTAABq4YwbAJIh3ACQDOEGgGQIN9CwNj5oH+uNL1IAGtTWB+1jvXHGDTSorQ/ax3oj3ECD2vqgfaw3wg00qK0P2sd6I9xAg9r6oH2sN8INNKitD9rHeqv1RQqL4osUAGAxTXyRAgBgRRBuAEiGcANAMoQbAJIh3ACQDOEGgGQINwAkQ7gBIBnCDQDJEG4ASIZwA0AyhBsAkiHcAJAM4QaAZAg3ACRDuAEgGcINAMkQbgBIhnADQDKEGwCSIdwAkAzhBoBkCDcAJEO4ASAZwg0AyRBuAEiGcANAMoQbAJKZG27bH7T9A9tv2X7b9hcvYjAAwPEu1djmfyU9ExG3bV+W9Jrtf42I7zc8GwDgGHPDHREh6fbRj5ePbtHkUACAk9W6xm17y/abkm5JejUiXm92LADASWqFOyJ+FxGflHRV0lO2P/HoNrYHtie2J4eHh8ueEwBwZKF3lUTEryTdkHT9mMfGEdGLiN7Ozs6y5gMAPKLOu0p2bH/06M8fkvQpST9pejAAwPHqvKvkMUlftb2lu6H/RkS80uxYAICT1HlXyY8kPXkBswAAauA3JwEgGcINAMkQbgBIhnADQDKEGwCSIdwAkAzhBoBkCDcAJEO4ASAZwg0AyRBuAEiGcANAMoQbG6mqKpVlqU6no7IsVVVV2yMBtdX5WFdgrVRVpcFgoNlsJkmaTqcaDAaSpH6/3+ZoQC2ccWPjDIfD+9G+ZzabaTgctjQRsBjCjY1zcHCw0P3AqiHc2Di7u7sL3Q+sGsKNjTMajdTtdh+6r9vtajQatTQRsBjCjY3T7/c1Ho9VFIVsqygKjcdjXphEGo6IpT9pr9eLyWSy9OcFgHVlez8ienW25YwbAJIh3ACQDOEGgGQINwAkQ7gBIBnCDQDJEG4ASIZwA0AyhBsAkiHcAJAM4QaAZAg3ACRDuAEgGcINAMkQbgBIhnADQDKEGwCSIdwAkMzccNt+wvYN2+/Yftv2CxcxGADgeJdqbPNbSX8fEW/Y/oikfduvRsQ7Dc8GADjG3DPuiPh5RLxx9OffSLop6fGmBwMAHG+ha9y2S0lPSnq9iWE
AAPPVDrftD0t6WdIXIuLXxzw+sD2xPTk8PFzmjACAB9QKt+3LuhvtKiK+edw2ETGOiF5E9HZ2dpY5IwDgAXXeVWJJX5F0MyK+1PxIAIDT1DnjflrSc5Kesf3m0e3ZhucCAJxg7tsBI+I1Sb6AWQAANfCbkwCQDOEGgGQI94aoqkplWarT6agsS1VV1fj+5z0mgBNExNJv165dC6yOvb296Ha7Ien+rdvtxt7eXmP7n/eYwKaRNImajfXd7Zer1+vFZDJZ+vPibMqy1HQ6fd/9RVHo3XffbWT/8x4T2DS29yOiV2tbwr3+Op2OjvvvbFt37txpZP/zHhPYNIuEm2vcG2B3d3eh+5ex/3mPCeBkhHsDjEYjdbvdh+7rdrsajUaN7X/eYwI4Rd2L4YvceHFy9ezt7UVRFGE7iqJY+EXCs+x/3mMCm0S8OAkAuXCNGwDWGOEGgGQINwAkQ7gBIBnCDQDJEG4ASIZwA0AyhBsAkiHcAJAM4QaAZAg3ACRDuJPh68AAXGp7ANRXVZUGg4Fms5kkaTqdajAYSJL6/X6bowG4QJxxJzIcDu9H+57ZbKbhcNjSRADaQLgTOTg4WOh+AOuJcCfC14EBkAh3KnwdGACJcKfS7/c1Ho9VFIVsqygKjcdjXpgENgxfXQYAK4CvLgOANUa4ASAZwg0AyRBuAEiGcANAMoQbAJIh3ACQDOEGgGQINwAkQ7gBIJm54bb9ku1btn98EQMBAE5X54z7nyRdb3gOAEBNc8MdEd+V9MsLmAUAUAPXuAEgmaWF2/bA9sT25PDwcFlPCwB4xNLCHRHjiOhFRG9nZ2dZTwsAeASXSgAgmTpvB/yapO9J+rjt92x/rvmxAAAnuTRvg4j47EUMAgCoh0slAJAM4QaAZAg3ACRDuAEgGcINAMkQbgBIhnBvgKqqVJalOp2OyrJUVVVtjwTgHOa+jxu5VVWlwWCg2WwmSZpOpxoMBpKkfr/f5mgAzogz7jU3HA7vR/ue2Wym4XDY0kQAzotwr7mDg4OF7gew+gj3mtvd3V3ofgCrj3CvudFopG63+9B93W5Xo9GopYkAnBfhXnP9fl/j8VhFUci2iqLQeDzmhUkgMUfE0p+01+vFZDJZ+vMCwLqyvR8RvTrbcsYNAMkQbgBIhnADQDKEGwCSIdwAkAzhBoBkCDcAJEO4ASAZwg0AyRBuAEiGcANAMoQbAJIh3ACQDOEGgGQINwAkQ7gBIBnCDQDJEG4ASIZwA0AyhBsAkiHcAJAM4QaAZAg3ACRDuAEgGcINAMnUCrft67b/w/ZPbf9jE4NUVaWyLNXpdFSWpaqqWujxZRzjtO23t7e1vb39vn2XMVfTf486+7Sxvpug6TVhzTdURJx6k7Ql6b8k/aGkD0h6S9Ifn7bPtWvXYhF7e3vR7XZD0v1bt9uNvb29Wo8v4xh1tn903+eff/7ccy3qLGuxiuu7CZpeE9Z8vUiaxJwe37vVCfefSfr2Az+/KOnF0/ZZNNxFURwbx6Ioaj2+jGPU3f7B29bW1rnnWtRZ1mIV13cTNL0mrPl6WSTcvrv9yWz/taTrEfG3Rz8/J+lPI+Lzj2w3kDSQpN3d3WvT6fTU531Qp9PRcXPY1p07d+Y+voxj1N2+jkXmWtRZ1mIV13cTNL0mrPl6sb0fEb062y7txcmIGEdELyJ6Ozs7C+27u7t76v3zHl/GMc7y3FtbW+eea1FnmXcV13cTNL0mrPkGm3dKrgu4VLKK12C5xs017vPiGjcWoSVf474k6WeSPqbfvzj5J6fts2i4I+7+IyyKImxHURTv+8c37/FlHOO07a9cuRJXrlx5377LmKvpv0edfdpY303Q9Jqw5utjkXDPvcYtSbaflfRl3X2HyUsRMTpt+16vF5PJZO7zAgDuWuQa96U6G0XEtyR961xTAQCWgt+cBIBkCDcAJEO4ASAZwg0AyRBuAEim1tsBF35S+1BS/d95X23bkn7R9hArjjWqh3WqZ1PXqYiIWr9
23ki414ntSd33Vm4q1qge1qke1mk+LpUAQDKEGwCSIdzzjdseIAHWqB7WqR7WaQ6ucQNAMpxxA0AyhPsEF/EFydnZfsn2Lds/bnuWVWb7Cds3bL9j+23bL7Q906qx/UHbP7D91tEafbHtmVYZl0qOYXtL0n9K+pSk9yT9UNJnI+KdVgdbMbb/XNJtSf8cEZ9oe55VZfsxSY9FxBu2PyJpX9Jf8e/p92xb0h9ExG3blyW9JumFiPh+y6OtJM64j/eUpJ9GxM8i4v8kfV3SX7Y808qJiO9K+mXbc6y6iPh5RLxx9OffSLop6fF2p1otR98lcPvox8tHN84qT0C4j/e4pP9+4Of3xP9oWALbpaQnJb3e7iSrx/aW7Tcl3ZL0akSwRicg3MAFsf1hSS9L+kJE/LrteVZNRPwuIj4p6aqkp2xz+e0EhPt4/yPpiQd+vnp0H3AmR9dtX5ZURcQ3255nlUXEryTdkHS97VlWFeE+3g8l/ZHtj9n+gKTPSPqXlmdCUkcvvH1F0s2I+FLb86wi2zu2P3r05w/p7hsDftLuVKuLcB8jIn4r6fOSvq27LyR9IyLebneq1WP7a5K+J+njtt+z/bm2Z1pRT0t6TtIztt88uj3b9lAr5jFJN2z/SHdPnF6NiFdanmll8XZAAEiGM24ASIZwA0AyhBsAkiHcAJAM4QaAZAg3ACRDuAEgGcINAMn8P7Lcj2jEg96EAAAAAElFTkSuQmCC\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "plt.scatter(y_pred, y_true, color='black')\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": { + "ExecuteTime": { + "end_time": "2019-04-22T08:24:52.301659Z", + "start_time": "2019-04-22T08:24:52.130224Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Plot outputs\n", + "plt.scatter(data_X_test, data_y_test, color='black')\n", + "plt.plot(data_X_test, regr.predict(data_X_test), color='blue', linewidth=3)\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": { + "ExecuteTime": { + "end_time": "2019-04-22T08:24:53.326537Z", + "start_time": "2019-04-22T08:24:53.321437Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "('Coefficients: \\n', array([0.68623605]))" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# The coefficients\n", + "'Coefficients: \\n', regr.coef_" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": { + "ExecuteTime": { + "end_time": "2019-04-22T08:24:55.007412Z", + "start_time": "2019-04-22T08:24:55.002637Z" + }, + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'Residual sum of squares: 0.98'" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# The mean square error\n", + "\"Residual sum of squares: %.2f\" % np.mean((regr.predict(data_X_test) - data_y_test) ** 2)" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": { + "ExecuteTime": { + "end_time": "2019-04-22T08:24:55.875656Z", + "start_time": "2019-04-22T08:24:55.846855Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/datalab/Applications/anaconda/lib/python3.5/site-packages/ipykernel/__main__.py:1: UserWarning: Pandas doesn't allow columns to be created via a new attribute name - see https://pandas.pydata.org/pandas-docs/stable/indexing.html#attribute-access\n", + " if __name__ == '__main__':\n", + 
"/Users/datalab/Applications/anaconda/lib/python3.5/site-packages/ipykernel/__main__.py:2: UserWarning: Pandas doesn't allow columns to be created via a new attribute name - see https://pandas.pydata.org/pandas-docs/stable/indexing.html#attribute-access\n", + " from ipykernel import kernelapp as app\n" + ] + } + ], + "source": [ + "df.click_log = [[np.log(df.click[i]+1)] for i in range(len(df))]\n", + "df.reply_log = [[np.log(df.reply[i]+1)] for i in range(len(df))]" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": { + "ExecuteTime": { + "end_time": "2019-04-22T08:25:13.823742Z", + "start_time": "2019-04-22T08:25:13.811227Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'Variance score: 0.62'" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn.model_selection import train_test_split\n", + "Xs_train, Xs_test, y_train, y_test = train_test_split(df.click_log, df.reply_log,test_size=0.2, random_state=0)\n", + "\n", + "# Create linear regression object\n", + "regr = linear_model.LinearRegression()\n", + "# Train the model using the training sets\n", + "regr.fit(Xs_train, y_train)\n", + "# Explained variance score: 1 is perfect prediction\n", + "'Variance score: %.2f' % regr.score(Xs_test, y_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": { + "ExecuteTime": { + "end_time": "2019-04-22T08:25:18.210290Z", + "start_time": "2019-04-22T08:25:18.010690Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Plot outputs\n", + "plt.scatter(Xs_test, y_test, color='black')\n", + "plt.plot(Xs_test, regr.predict(Xs_test), color='blue', linewidth=3)\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": { + "ExecuteTime": { + "end_time": "2019-04-22T08:25:26.241798Z", + "start_time": "2019-04-22T08:25:26.227633Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "-0.6837007391943056" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn.model_selection import cross_val_score\n", + "\n", + "regr = linear_model.LinearRegression()\n", + "scores = cross_val_score(regr, df.click_log, \\\n", + " df.reply_log, cv = 3)\n", + "scores.mean() " + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": { + "ExecuteTime": { + "end_time": "2019-04-22T08:25:30.245410Z", + "start_time": "2019-04-22T08:25:30.227128Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "-0.7188149722820985" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "regr = linear_model.LinearRegression()\n", + "scores = cross_val_score(regr, df.click_log, \n", + " df.reply_log, cv =5)\n", + "scores.mean() " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "> # 使用sklearn做logistic回归\n", + "***\n", + "\n", + "王成军\n", + "\n", + "wangchengjun@nju.edu.cn\n", + "\n", + "计算传播网 http://computational-communication.com" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "- logistic回归是一个分类算法而不是一个回归算法。\n", + "- 可根据已知的一系列因变量估计离散数值(比方说二进制数值 0 或 1 ,是或否,真或假)。\n", + "- 简单来说,它通过将数据拟合进一个逻辑函数(logistic 
function)来预估一个事件出现的概率。\n", + "- 因此,它也被叫做逻辑回归。因为它预估的是概率,所以它的输出值大小在 0 和 1 之间(正如所预计的一样)。" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "$$odds= \\frac{p}{1-p} = \\frac{probability\\: of\\: event\\: occurrence} {probability \\:of \\:not\\: event\\: occurrence}$$" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "source": [ + "$$ln(odds)= ln(\\frac{p}{1-p})$$" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "source": [ + "$$logit(p) = ln(\\frac{p}{1-p}) = b_0+b_1X_1+b_2X_2+b_3X_3+\\cdots+b_kX_k$$" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "![](./img/logistic.jpg)" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-29T07:46:50.277195Z", + "start_time": "2018-04-29T07:46:50.272229Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [], + "source": [ + "repost = []\n", + "for i in df.title:\n", + " if u'转载' in i:\n", + " repost.append(1)\n", + " else:\n", + " repost.append(0)" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-29T07:47:06.292994Z", + "start_time": "2018-04-29T07:47:06.270715Z" + }, + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[[194675, 2703], [88244, 1041], [82779, 625]]" + ] + }, + "execution_count": 51, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data_X = [[df.click[i], df.reply[i]] for i in range(len(df))]\n", + "data_X[:3]" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-29T07:47:45.269303Z", + "start_time": "2018-04-29T07:47:45.259792Z" + }, + "slideshow":
{ + "slide_type": "subslide" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.61241970021413272" + ] + }, + "execution_count": 52, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn.linear_model import LogisticRegression\n", + "df['repost'] = repost\n", + "model = LogisticRegression()\n", + "model.fit(data_X,df.repost)\n", + "model.score(data_X,df.repost)" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-29T07:47:59.648431Z", + "start_time": "2018-04-29T07:47:59.633936Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [], + "source": [ + "def randomSplitLogistic(dataX, dataY, num):\n", + " dataX_train = []\n", + " dataX_test = []\n", + " dataY_train = []\n", + " dataY_test = []\n", + " import random\n", + " test_index = random.sample(range(len(df)), num)\n", + " for k in range(len(dataX)):\n", + " if k in test_index:\n", + " dataX_test.append(dataX[k])\n", + " dataY_test.append(dataY[k])\n", + " else:\n", + " dataX_train.append(dataX[k])\n", + " dataY_train.append(dataY[k])\n", + " return dataX_train, dataX_test, dataY_train, dataY_test, " + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-29T07:48:27.726443Z", + "start_time": "2018-04-29T07:48:27.710922Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'Variance score: 0.45'" + ] + }, + "execution_count": 54, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Split the data into training/testing sets\n", + "data_X_train, data_X_test, data_y_train, data_y_test = randomSplitLogistic(data_X, df.repost, 20)\n", + "# Create logistic regression object\n", + "log_regr = LogisticRegression()\n", + "# Train the model using the training sets\n", + "log_regr.fit(data_X_train, data_y_train)\n", + "# Explained variance score: 
1 is perfect prediction\n", + "'Variance score: %.2f' % log_regr.score(data_X_test, data_y_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-29T07:48:56.873331Z", + "start_time": "2018-04-29T07:48:56.870219Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [], + "source": [ + "y_true, y_pred = data_y_test, log_regr.predict(data_X_test)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-29T07:39:12.344043Z", + "start_time": "2018-04-29T07:39:12.338223Z" + }, + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "([1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],\n", + " array([0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]))" + ] + }, + "execution_count": 43, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "y_true, y_pred" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-29T07:39:13.175680Z", + "start_time": "2018-04-29T07:39:13.171386Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " 0 0.50 0.17 0.25 6\n", + " 1 0.72 0.93 0.81 14\n", + "\n", + "avg / total 0.66 0.70 0.64 20\n", + "\n" + ] + } + ], + "source": [ + "print(classification_report(y_true, y_pred))" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-29T07:51:43.039620Z", + "start_time": "2018-04-29T07:51:43.034812Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [], + "source": [ + "from sklearn.cross_validation import train_test_split\n", + "Xs_train, Xs_test, y_train, y_test = train_test_split(data_X, df.repost, test_size=0.2, 
random_state=42)" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-29T07:51:47.690742Z", + "start_time": "2018-04-29T07:51:47.683127Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'Variance score: 0.60'" + ] + }, + "execution_count": 57, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Create logistic regression object\n", + "log_regr = LogisticRegression()\n", + "# Train the model using the training sets\n", + "log_regr.fit(Xs_train, y_train)\n", + "# Explained variance score: 1 is perfect prediction\n", + "'Variance score: %.2f' % log_regr.score(Xs_test, y_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-29T07:51:55.780061Z", + "start_time": "2018-04-29T07:51:55.771924Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Logistic score for test set: 0.595745\n", + "Logistic score for training set: 0.613941\n", + " precision recall f1-score support\n", + "\n", + " 0 1.00 0.03 0.05 39\n", + " 1 0.59 1.00 0.74 55\n", + "\n", + "avg / total 0.76 0.60 0.46 94\n", + "\n" + ] + } + ], + "source": [ + "print('Logistic score for test set: %f' % log_regr.score(Xs_test, y_test))\n", + "print('Logistic score for training set: %f' % log_regr.score(Xs_train, y_train))\n", + "y_true, y_pred = y_test, log_regr.predict(Xs_test)\n", + "print(classification_report(y_true, y_pred))" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-29T07:52:53.880925Z", + "start_time": "2018-04-29T07:52:53.866672Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.53333333333333333" + ] + }, + "execution_count": 59, + "metadata": {}, + 
"output_type": "execute_result" + } + ], + "source": [ + "logre = LogisticRegression()\n", + "scores = cross_val_score(logre, data_X, df.repost, cv = 3)\n", + "scores.mean() " + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-29T07:53:26.825100Z", + "start_time": "2018-04-29T07:53:26.810871Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.62948717948717947" + ] + }, + "execution_count": 60, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "logre = LogisticRegression()\n", + "data_X_scale = scale(data_X)\n", + "# The importance of preprocessing in data science and the machine learning pipeline I: \n", + "scores = cross_val_score(logre, data_X_scale, df.repost, cv = 3)\n", + "scores.mean() " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "> # 使用sklearn实现贝叶斯预测\n", + "***\n", + "\n", + "王成军\n", + "\n", + "wangchengjun@nju.edu.cn\n", + "\n", + "计算传播网 http://computational-communication.com" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "# Naive Bayes algorithm\n", + "\n", + "It is a classification technique based on Bayes’ Theorem with an assumption of independence among predictors. \n", + "\n", + "In simple terms, a Naive Bayes classifier assumes that the presence of a particular feature in a class is unrelated to the presence of any other feature. \n", + "\n", + "why it is known as ‘Naive’? For example, a fruit may be considered to be an apple if it is red, round, and about 3 inches in diameter. Even if these features depend on each other or upon the existence of the other features, all of these properties independently contribute to the probability that this fruit is an apple." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "贝叶斯定理为使用$p(c)$, $p(x)$, $p(x|c)$ 计算后验概率$P(c|x)$提供了方法:" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "source": [ + "$$\n", + "p(c|x) = \\frac{p(x|c) p(c)}{p(x)}\n", + "$$" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "source": [ + "- P(c|x) is the posterior probability of class (c, target) given predictor (x, attributes).\n", + "- P(c) is the prior probability of class.\n", + "- P(x|c) is the likelihood which is the probability of predictor given class.\n", + "- P(x) is the prior probability of predictor." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "![](./img/Bayes_41.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "Step 1: Convert the data set into a frequency table\n", + "\n", + "Step 2: Create Likelihood table by finding the probabilities like:\n", + "- p(Overcast) = 0.29, p(rainy) = 0.36, p(sunny) = 0.36\n", + "- p(playing) = 0.64, p(rest) = 0.36\n", + "\n", + "Step 3: Now, use Naive Bayesian equation to calculate the posterior probability for each class. The class with the highest posterior probability is the outcome of prediction." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "## Problem: Players will play if weather is sunny. 
Is this statement correct?\n", + "\n", + "We can solve it using above discussed method of posterior probability.\n", + "\n", + "$P(Yes | Sunny) = \\frac{P( Sunny | Yes) * P(Yes) } {P (Sunny)}$\n", + "\n", + "Here we have P (Sunny |Yes) = 3/9 = 0.33, P(Sunny) = 5/14 = 0.36, P( Yes)= 9/14 = 0.64\n", + "\n", + "Now, $P (Yes | Sunny) = \\frac{0.33 * 0.64}{0.36} = 0.60$, which has higher probability.\n", + "\n", + "$P(No | Sunny) = \\frac{P( Sunny | No) * P(No) } {P (Sunny)}$\n", + "\n", + "Here we have P (Sunny |No) = 2/5 = 0.4, P(Sunny) = 5/14 = 0.36, P( No)= 5/14 = 0.36\n", + "\n", + "Now, $P (No | Sunny) = \\frac{0.4 * 0.36}{0.36} = 0.4$, which has lower probability.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'ABCMeta BaseDiscreteNB BaseEstimator BaseNB BernoulliNB ClassifierMixin GaussianNB LabelBinarizer MultinomialNB __all__ __builtins__ __doc__ __file__ __name__ __package__ _check_partial_fit_first_call abstractmethod binarize check_X_y check_array check_is_fitted in1d issparse label_binarize logsumexp np safe_sparse_dot six'" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn import naive_bayes\n", + "' '.join(dir(naive_bayes)) " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "source": [ + "- naive_bayes.GaussianNB\tGaussian Naive Bayes (GaussianNB)\n", + "- naive_bayes.MultinomialNB([alpha, ...])\tNaive Bayes classifier for multinomial models\n", + "- naive_bayes.BernoulliNB([alpha, binarize, ...])\tNaive Bayes classifier for multivariate Bernoulli models."
+ ] + }, + { + "cell_type": "code", + "execution_count": 61, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-29T08:02:37.644606Z", + "start_time": "2018-04-29T08:02:37.635952Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [], + "source": [ + "#Import Library of Gaussian Naive Bayes model\n", + "from sklearn.naive_bayes import GaussianNB\n", + "import numpy as np\n", + "\n", + "#assigning predictor and target variables\n", + "x= np.array([[-3,7],[1,5], [1,2], [-2,0], [2,3], [-4,0], [-1,1], [1,1], [-2,2], [2,7], [-4,1], [-2,7]])\n", + "Y = np.array([3, 3, 3, 3, 4, 3, 3, 4, 3, 4, 4, 4])" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-29T08:02:52.828101Z", + "start_time": "2018-04-29T08:02:52.818463Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "array([4, 3])" + ] + }, + "execution_count": 62, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#Create a Gaussian Classifier\n", + "model = GaussianNB()\n", + "\n", + "# Train the model using the training sets \n", + "model.fit(x[:8], Y[:8])\n", + "\n", + "#Predict Output \n", + "predicted= model.predict([[1,2],[3,4]])\n", + "predicted" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "# cross-validation \n", + " \n", + "k-fold CV, the training set is split into k smaller sets (other approaches are described below, but generally follow the same principles). The following procedure is followed for each of the k “folds”:\n", + "- A model is trained using k-1 of the folds as training data;\n", + "- the resulting model is validated on the remaining part of the data (i.e., it is used as a test set to compute a performance measure such as accuracy)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 63, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-29T08:04:04.297675Z", + "start_time": "2018-04-29T08:04:04.273413Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "array([41, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0, 0])" + ] + }, + "execution_count": 63, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data_X_train, data_X_test, data_y_train, data_y_test = randomSplit(df.click, df.reply, 20)\n", + "# Train the model using the training sets \n", + "model.fit(data_X_train, data_y_train)\n", + "\n", + "#Predict Output \n", + "predicted= model.predict(data_X_test)\n", + "predicted" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-29T08:04:34.184513Z", + "start_time": "2018-04-29T08:04:34.178511Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.65000000000000002" + ] + }, + "execution_count": 64, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model.score(data_X_test, data_y_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-29T08:05:04.297453Z", + "start_time": "2018-04-29T08:05:04.249311Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/datalab/Applications/anaconda/lib/python3.5/site-packages/sklearn/cross_validation.py:516: Warning: The least populated class in y has only 1 members, which is too few. 
The minimum number of labels for any class cannot be less than n_folds=7.\n", + " % (min_labels, self.n_folds)), Warning)\n" + ] + }, + { + "data": { + "text/plain": [ + "0.53413410073295453" + ] + }, + "execution_count": 66, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn.cross_validation import cross_val_score\n", + "\n", + "model = GaussianNB()\n", + "scores = cross_val_score(model, [[c] for c in df.click],\\\n", + " df.reply, cv = 7)\n", + "scores.mean() " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "> # 使用sklearn实现决策树\n", + "***\n", + "\n", + "王成军\n", + "\n", + "wangchengjun@nju.edu.cn\n", + "\n", + "计算传播网 http://computational-communication.com" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "# 决策树\n", + "- 这个监督式学习算法通常被用于分类问题。\n", + "- 它同时适用于分类变量和连续因变量。\n", + "- 在这个算法中,我们将总体分成两个或更多的同类群。\n", + "- 这是根据最重要的属性或者自变量来分成尽可能不同的组别。\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "![](./img/tree.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "![](./img/playtree.jpg)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "## 在上图中你可以看到,根据多种属性,人群被分成了不同的四个小组,来判断 “他们会不会去玩”。\n", + "### 为了把总体分成不同组别,需要用到许多技术,比如说 Gini、Information Gain、Chi-square、entropy。" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-29T08:10:20.871345Z", + "start_time": "2018-04-29T08:10:20.855125Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [], + "source": [ + "from sklearn import tree\n", + "model = tree.DecisionTreeClassifier(criterion='gini')" + ] + }, + { + "cell_type": "code", + 
"execution_count": 68, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-29T08:10:49.988277Z", + "start_time": "2018-04-29T08:10:49.973060Z" + }, + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.91275167785234901" + ] + }, + "execution_count": 68, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data_X_train, data_X_test, data_y_train, data_y_test = randomSplitLogistic(data_X, df.repost, 20)\n", + "model.fit(data_X_train,data_y_train)\n", + "model.score(data_X_train,data_y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-29T08:11:12.730866Z", + "start_time": "2018-04-29T08:11:12.725782Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "array([0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0])" + ] + }, + "execution_count": 69, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Predict\n", + "model.predict(data_X_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-29T08:11:28.411441Z", + "start_time": "2018-04-29T08:11:28.397481Z" + }, + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.33461538461538459" + ] + }, + "execution_count": 70, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# crossvalidation\n", + "scores = cross_val_score(model, data_X, df.repost, cv = 3)\n", + "scores.mean() " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "> # 使用sklearn实现SVM支持向量机\n", + "***\n", + "\n", + "王成军\n", + "\n", + "wangchengjun@nju.edu.cn\n", + "\n", + "计算传播网 http://computational-communication.com" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + 
"slide_type": "subslide" + } + }, + "source": [ + "![](./img/svm.jpg)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "- 将每个数据在N维空间中用点标出(N是你所有的特征总数),每个特征的值是一个坐标的值。\n", + " - 举个例子,如果我们只有身高和头发长度两个特征,我们会在二维空间中标出这两个变量,每个点有两个坐标(这些坐标叫做支持向量)。" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "![](./img/xyplot.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "- 现在,我们会找到将两组不同数据分开的一条直线。\n", + " - 两个分组中距离最近的两个点到这条线的距离同时最优化。" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "![](./img/sumintro.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "## 上面示例中的黑线将数据分类优化成两个小组\n", + "- 两组中距离最近的点(图中A、B点)到达黑线的距离满足最优条件。\n", + " - 这条直线就是我们的分割线。接下来,测试数据落到直线的哪一边,我们就将它分到哪一类去。" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-29T08:17:29.788250Z", + "start_time": "2018-04-29T08:17:29.785022Z" + } + }, + "outputs": [], + "source": [ + "from sklearn import svm\n", + "# Create SVM classification object \n", + "model=svm.SVC() " + ] + }, + { + "cell_type": "code", + "execution_count": 72, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-29T08:17:31.035310Z", + "start_time": "2018-04-29T08:17:31.030713Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'LinearSVC LinearSVR NuSVC NuSVR OneClassSVM SVC SVR __all__ __builtins__ __cached__ __doc__ __file__ __loader__ __name__ __package__ __path__ __spec__ base bounds classes l1_min_c liblinear libsvm libsvm_sparse'" + ] + }, + "execution_count": 72, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "' '.join(dir(svm))" + ] + }, + { + "cell_type": 
"code", + "execution_count": 73, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-29T08:17:41.872379Z", + "start_time": "2018-04-29T08:17:41.849759Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.90380313199105144" + ] + }, + "execution_count": 73, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data_X_train, data_X_test, data_y_train, data_y_test = randomSplitLogistic(data_X, df.repost, 20)\n", + "model.fit(data_X_train,data_y_train)\n", + "model.score(data_X_train,data_y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 74, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-29T08:17:47.661313Z", + "start_time": "2018-04-29T08:17:47.655841Z" + }, + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1])" + ] + }, + "execution_count": 74, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Predict\n", + "model.predict(data_X_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 75, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-29T08:18:00.419986Z", + "start_time": "2018-04-29T08:17:58.671257Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [], + "source": [ + "# crossvalidation\n", + "scores = []\n", + "cvs = [3, 5, 10, 25, 50, 75, 100]\n", + "for i in cvs:\n", + " score = cross_val_score(model, data_X, df.repost,\n", + " cv = i)\n", + " scores.append(score.mean() ) # Try to tune cv\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 76, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-29T08:18:05.493658Z", + "start_time": "2018-04-29T08:18:05.359658Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "plt.plot(cvs, scores, 'b-o')\n", + "plt.xlabel('$cv$', fontsize = 20)\n", + "plt.ylabel('$Score$', fontsize = 20)\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "\n", + "\n", + "> # 泰坦尼克号数据分析\n", + "\n", + "王成军\n", + "\n", + "wangchengjun@nju.edu.cn\n", + "\n", + "计算传播网 http://computational-communication.com" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": { + "ExecuteTime": { + "end_time": "2018-05-29T07:31:28.492497Z", + "start_time": "2018-05-29T07:31:28.488728Z" + }, + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "from sklearn import tree\n", + "import warnings \n", + "warnings.filterwarnings(\"ignore\") \n" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": { + "ExecuteTime": { + "end_time": "2018-06-06T07:02:49.855926Z", + "start_time": "2018-06-06T07:02:49.705773Z" + }, + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "train = pd.read_csv('../data/tatanic_train.csv', \n", + " sep = \",\")" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": { + "ExecuteTime": { + "end_time": "2018-06-06T07:02:52.803564Z", + "start_time": "2018-06-06T07:02:52.759733Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Unnamed: 0PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarked
00103Braund, Mr. Owen Harrismale22.010A/5 211717.2500NaNS
11211Cumings, Mrs. John Bradley (Florence Briggs Th...female38.010PC 1759971.2833C85C
22313Heikkinen, Miss. Lainafemale26.000STON/O2. 31012827.9250NaNS
33411Futrelle, Mrs. Jacques Heath (Lily May Peel)female35.01011380353.1000C123S
44503Allen, Mr. William Henrymale35.0003734508.0500NaNS
\n", + "
" + ], + "text/plain": [ + " Unnamed: 0 PassengerId Survived Pclass \\\n", + "0 0 1 0 3 \n", + "1 1 2 1 1 \n", + "2 2 3 1 3 \n", + "3 3 4 1 1 \n", + "4 4 5 0 3 \n", + "\n", + " Name Sex Age SibSp \\\n", + "0 Braund, Mr. Owen Harris male 22.0 1 \n", + "1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 \n", + "2 Heikkinen, Miss. Laina female 26.0 0 \n", + "3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 \n", + "4 Allen, Mr. William Henry male 35.0 0 \n", + "\n", + " Parch Ticket Fare Cabin Embarked \n", + "0 0 A/5 21171 7.2500 NaN S \n", + "1 0 PC 17599 71.2833 C85 C \n", + "2 0 STON/O2. 3101282 7.9250 NaN S \n", + "3 0 113803 53.1000 C123 S \n", + "4 0 373450 8.0500 NaN S " + ] + }, + "execution_count": 44, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train.head() " + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": { + "ExecuteTime": { + "end_time": "2018-05-29T07:28:58.070575Z", + "start_time": "2018-05-29T07:28:57.897862Z" + }, + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [], + "source": [ + "train[\"Age\"] = train[\"Age\"].fillna(train[\"Age\"].median())\n", + "train[\"Fare\"] = train[\"Fare\"].fillna(train[\"Fare\"].median())\n", + "#Convert the male and female groups to integer form\n", + "train[\"Sex\"][train[\"Sex\"] == \"male\"] = 0\n", + "train[\"Sex\"][train[\"Sex\"] == \"female\"] = 1\n", + "#Impute the Embarked variable\n", + "train[\"Embarked\"] = train[\"Embarked\"].fillna('S')\n", + "#Convert the Embarked classes to integer form\n", + "train[\"Embarked\"][train[\"Embarked\"] == \"S\"] = 0\n", + "train[\"Embarked\"][train[\"Embarked\"] == \"C\"] = 1\n", + "train[\"Embarked\"][train[\"Embarked\"] == \"Q\"] = 2" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": { + "ExecuteTime": { + "end_time": "2018-05-29T07:28:08.358884Z", + "start_time": "2018-05-29T07:28:08.346226Z" + }, + "slideshow": { + "slide_type": "slide" + } + }, + 
"outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[ 0.12294397 0.31274009 0.23680307 0.32751287]\n", + "0.977553310887\n" + ] + } + ], + "source": [ + "#Create the target and features numpy arrays: target, features_one\n", + "target = train['Survived'].values\n", + "features_one = train[[\"Pclass\", \"Sex\", \"Age\", \"Fare\"]].values\n", + "\n", + "#Fit your first decision tree: my_tree_one\n", + "my_tree_one = tree.DecisionTreeClassifier()\n", + "my_tree_one = my_tree_one.fit(features_one, target)\n", + "#Look at the importance of the included features and print the score\n", + "print(my_tree_one.feature_importances_)\n", + "print(my_tree_one.score(features_one, target))" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": { + "ExecuteTime": { + "end_time": "2018-05-29T07:28:15.915998Z", + "start_time": "2018-05-29T07:28:15.705994Z" + }, + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [], + "source": [ + "test = pd.read_csv('../data/tatanic_test.csv', sep = \",\")\n", + "# Impute the missing value with the median\n", + "test.Fare[152] = test.Fare.median()\n", + "test[\"Age\"] = test[\"Age\"].fillna(test[\"Age\"].median())\n", + "#Convert the male and female groups to integer form\n", + "test[\"Sex\"][test[\"Sex\"] == \"male\"] = 0\n", + "test[\"Sex\"][test[\"Sex\"] == \"female\"] = 1\n", + "\n", + "#Impute the Embarked variable\n", + "test[\"Embarked\"] = test[\"Embarked\"].fillna('S')\n", + "#Convert the Embarked classes to integer form\n", + "test[\"Embarked\"][test[\"Embarked\"] == \"S\"] = 0\n", + "test[\"Embarked\"][test[\"Embarked\"] == \"C\"] = 1\n", + "test[\"Embarked\"][test[\"Embarked\"] == \"Q\"] = 2\n", + "\n", + "# Extract the features from the test set: Pclass, Sex, Age, and Fare.\n", + "test_features = test[[\"Pclass\",\"Sex\", \"Age\", \"Fare\"]].values\n", + "\n", + "# Make your prediction using the test set\n", + "my_prediction = my_tree_one.predict(test_features)\n", + "\n", + 
"# Create a data frame with two columns: PassengerId & Survived. Survived contains your predictions\n", + "PassengerId =np.array(test['PassengerId']).astype(int)\n", + "my_solution = pd.DataFrame(my_prediction, PassengerId, columns = [\"Survived\"])\n" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": { + "ExecuteTime": { + "end_time": "2018-05-29T07:28:18.081288Z", + "start_time": "2018-05-29T07:28:18.074414Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Survived
8920
8930
8941
\n", + "
" + ], + "text/plain": [ + " Survived\n", + "892 0\n", + "893 0\n", + "894 1" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "my_solution[:3]" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "ExecuteTime": { + "end_time": "2018-05-29T07:25:44.488717Z", + "start_time": "2018-05-29T07:25:44.484381Z" + }, + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "(418, 1)" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Check that your data frame has 418 entries\n", + "my_solution.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [], + "source": [ + "# Write your solution to a csv file with the name my_solution.csv \n", + "my_solution.to_csv(\"../data/tatanic_solution_one.csv\", \n", + " index_label = [\"PassengerId\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": { + "ExecuteTime": { + "end_time": "2018-05-29T07:28:26.996353Z", + "start_time": "2018-05-29T07:28:26.982601Z" + }, + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.905723905724\n" + ] + } + ], + "source": [ + "# Create a new array with the added features: features_two\n", + "features_two = train[[\"Pclass\",\"Age\",\"Sex\",\"Fare\",\\\n", + " \"SibSp\", \"Parch\", \"Embarked\"]].values\n", + "\n", + "#Control overfitting by setting \"max_depth\" to 10 and \"min_samples_split\" to 5 : my_tree_two\n", + "max_depth = 10\n", + "min_samples_split = 5\n", + "my_tree_two = tree.DecisionTreeClassifier(max_depth = max_depth, \n", + " min_samples_split = min_samples_split, \n", + " random_state = 1)\n", + "my_tree_two = my_tree_two.fit(features_two, target)\n", + "\n", + "#Print the score of the new 
decison tree\n", + "print(my_tree_two.score(features_two, target))" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": { + "ExecuteTime": { + "end_time": "2018-05-29T07:28:28.033226Z", + "start_time": "2018-05-29T07:28:28.018293Z" + }, + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.979797979798\n" + ] + } + ], + "source": [ + "# create a new train set with the new variable\n", + "train_two = train\n", + "train_two['family_size'] = train.SibSp + train.Parch + 1\n", + "\n", + "# Create a new decision tree my_tree_three\n", + "features_three = train[[\"Pclass\", \"Sex\", \"Age\", \\\n", + " \"Fare\", \"SibSp\", \"Parch\", \"family_size\"]].values\n", + "\n", + "my_tree_three = tree.DecisionTreeClassifier()\n", + "my_tree_three = my_tree_three.fit(features_three, target)\n", + "\n", + "# Print the score of this decision tree\n", + "print(my_tree_three.score(features_three, target))\n" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": { + "ExecuteTime": { + "end_time": "2018-05-29T07:28:32.678968Z", + "start_time": "2018-05-29T07:28:32.465958Z" + }, + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.939393939394\n", + "418\n", + "[0 0 0]\n" + ] + } + ], + "source": [ + "#Import the `RandomForestClassifier`\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "\n", + "#We want the Pclass, Age, Sex, Fare,SibSp, Parch, and Embarked variables\n", + "features_forest = train[[\"Pclass\", \"Age\", \"Sex\", \"Fare\", \"SibSp\", \"Parch\", \"Embarked\"]].values\n", + "\n", + "#Building the Forest: my_forest\n", + "n_estimators = 100\n", + "forest = RandomForestClassifier(max_depth = 10, min_samples_split=2, \n", + " n_estimators = n_estimators, random_state = 1)\n", + "my_forest = forest.fit(features_forest, target)\n", + "\n", + "#Print the score of the 
random forest\n", + "print(my_forest.score(features_forest, target))\n", + "\n", + "#Compute predictions and print the length of the prediction vector:test_features, pred_forest\n", + "test_features = test[[\"Pclass\", \"Age\", \"Sex\", \"Fare\", \"SibSp\", \"Parch\", \"Embarked\"]].values\n", + "pred_forest = my_forest.predict(test_features)\n", + "print(len(test_features))\n", + "print(pred_forest[:3])" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": { + "ExecuteTime": { + "end_time": "2018-05-29T07:26:25.602062Z", + "start_time": "2018-05-29T07:26:25.572689Z" + }, + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[ 0.14130255 0.17906027 0.41616727 0.17938711 0.05039699 0.01923751\n", + " 0.0144483 ]\n", + "[ 0.10384741 0.20139027 0.31989322 0.24602858 0.05272693 0.04159232\n", + " 0.03452128]\n", + "0.905723905724\n", + "0.939393939394\n" + ] + } + ], + "source": [ + "#Request and print the `.feature_importances_` attribute\n", + "print(my_tree_two.feature_importances_)\n", + "print(my_forest.feature_importances_)\n", + "\n", + "#Compute and print the mean accuracy score for both models\n", + "print(my_tree_two.score(features_two, target))\n", + "print(my_forest.score(features_two, target))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": true, + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "# 阅读材料\n", + "机器学习算法的要点(附 Python 和 R 代码)http://blog.csdn.net/a6225301/article/details/50479672\n", + "\n", + "The \"Python Machine Learning\" book code repository and info resource https://github.com/rasbt/python-machine-learning-book\n", + "\n", + "An Introduction to Statistical Learning (James, Witten, Hastie, Tibshirani, 2013) : Python code https://github.com/JWarmenhoven/ISLR-python\n", + "\n", + "BuildingMachineLearningSystemsWithPython https://github.com/luispedro/BuildingMachineLearningSystemsWithPython" + ] + }, + { + 
"cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "# 作业\n", + "https://www.datacamp.com/community/tutorials/the-importance-of-preprocessing-in-data-science-and-the-machine-learning-pipeline-i-centering-scaling-and-k-nearest-neighbours" + ] + } + ], + "metadata": { + "celltoolbar": "Slideshow", + "kernelspec": { + "display_name": "Python [conda env:anaconda]", + "language": "python", + "name": "conda-env-anaconda-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.4" + }, + "latex_envs": { + "LaTeX_envs_menu_present": true, + "autoclose": false, + "autocomplete": true, + "bibliofile": "biblio.bib", + "cite_by": "apalike", + "current_citInitial": 1, + "eqLabelWithNumbers": true, + "eqNumInitial": 0, + "hotkeys": { + "equation": "Ctrl-E", + "itemize": "Ctrl-I" + }, + "labels_anchors": false, + "latex_user_defs": false, + "report_style_numbering": false, + "user_envs_cfg": false + }, + "toc": { + "base_numbering": 1, + "nav_menu": {}, + "number_sections": false, + "sideBar": false, + "skip_h1_title": false, + "title_cell": "Table of Contents", + "title_sidebar": "Contents", + "toc_cell": false, + "toc_position": { + "height": "780px", + "left": "1279px", + "top": "168.667px", + "width": "341px" + }, + "toc_section_display": true, + "toc_window_display": false + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/code/09.machine_learning_with_sklearn.ipynb b/code/09.machine_learning_with_sklearn.ipynb deleted file mode 100644 index 5a89315..0000000 --- a/code/09.machine_learning_with_sklearn.ipynb +++ /dev/null @@ -1,3016 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "slide" - } - }, - "source": [ - "***\n", - "***\n", - "# 计算传播与机器学习\n", - "\n", - 
"***\n", - "***\n", - "\n", - "王成军\n", - "\n", - "wangchengjun@nju.edu.cn\n", - "\n", - "计算传播网 http://computational-communication.com" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "slide" - } - }, - "source": [ - "![](./img/machine.jpg)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "subslide" - } - }, - "source": [ - "## 1、 监督式学习\n", - "\n", - "工作机制:\n", - "- 这个算法由一个目标变量或结果变量(或因变量)组成。\n", - "- 这些变量由已知的一系列预示变量(自变量)预测而来。\n", - "- 利用这一系列变量,我们生成一个将输入值映射到期望输出值的函数。\n", - "- 这个训练过程会一直持续,直到模型在训练数据上获得期望的精确度。\n", - "- 监督式学习的例子有:回归、决策树、随机森林、K – 近邻算法、逻辑回归等。" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "subslide" - } - }, - "source": [ - "## 2、非监督式学习\n", - "\n", - "工作机制:\n", - "- 在这个算法中,没有任何目标变量或结果变量要预测或估计。\n", - "- 这个算法用在不同的组内聚类分析。\n", - "- 这种分析方式被广泛地用来细分客户,根据干预的方式分为不同的用户组。\n", - "- 非监督式学习的例子有:关联算法和 K–均值算法。" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "subslide" - } - }, - "source": [ - "## 3、强化学习\n", - "\n", - "工作机制:\n", - "- 这个算法训练机器进行决策。\n", - "- 它是这样工作的:机器被放在一个能让它通过反复试错来训练自己的环境中。\n", - "- 机器从过去的经验中进行学习,并且尝试利用了解最透彻的知识作出精确的商业判断。 \n", - "- 强化学习的例子有马尔可夫决策过程。alphago\n", - "\n", - "> Chess. 
Here, the agent decides upon a series of moves depending on the state of the board (the environment), and the\n", - "reward can be defined as win or lose at the end of the game:" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "slide" - } - }, - "source": [ - "" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "slide" - } - }, - "source": [ - "- 线性回归\n", - "- 逻辑回归\n", - "- 决策树\n", - "- SVM\n", - "- 朴素贝叶斯\n", - "---\n", - "- K最近邻算法\n", - "- K均值算法\n", - "- 随机森林算法\n", - "- 降维算法\n", - "- Gradient Boost 和 Adaboost 算法\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "slide" - } - }, - "source": [ - "> # 使用sklearn做线性回归\n", - "***\n", - "\n", - "王成军\n", - "\n", - "wangchengjun@nju.edu.cn\n", - "\n", - "计算传播网 http://computational-communication.com" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "subslide" - } - }, - "source": [ - "# 线性回归\n", - "- 通常用于估计连续性变量的实际数值(房价、呼叫次数、总销售额等)。\n", - "- 通过拟合最佳直线来建立自变量X和因变量Y的关系。\n", - "- 这条最佳直线叫做回归线,并且用 $Y= \\beta *X + C$ 这条线性等式来表示。\n", - "- 系数 $\\beta$ 和 C 可以通过最小二乘法获得" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "ExecuteTime": { - "end_time": "2018-04-29T07:10:39.010055Z", - "start_time": "2018-04-29T07:10:39.002664Z" - }, - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [], - "source": [ - "%matplotlib inline\n", - "import sklearn\n", - "from sklearn import datasets\n", - "from sklearn import linear_model\n", - "import matplotlib.pyplot as plt\n", - "from sklearn.metrics import classification_report\n", - "from sklearn.preprocessing import scale" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "ExecuteTime": { - "end_time": "2018-04-29T07:11:24.244682Z", - "start_time": "2018-04-29T07:11:24.234905Z" - }, - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [], - "source": [ - "# 
boston data\n", - "boston = datasets.load_boston()\n", - "y = boston.target\n", - "X = boston.data" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "ExecuteTime": { - "end_time": "2018-04-29T07:11:45.142201Z", - "start_time": "2018-04-29T07:11:45.137656Z" - }, - "slideshow": { - "slide_type": "fragment" - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD',\n", - " 'TAX', 'PTRATIO', 'B', 'LSTAT'], \n", - " dtype='|t| [95.0% Conf. Int.]\n", - "-----------------------------------------------------------------------------------\n", - "Intercept 36.4911 5.104 7.149 0.000 26.462 46.520\n", - "boston.data[0] -0.1072 0.033 -3.276 0.001 -0.171 -0.043\n", - "boston.data[1] 0.0464 0.014 3.380 0.001 0.019 0.073\n", - "boston.data[2] 0.0209 0.061 0.339 0.735 -0.100 0.142\n", - "boston.data[3] 2.6886 0.862 3.120 0.002 0.996 4.381\n", - "boston.data[4] -17.7958 3.821 -4.658 0.000 -25.302 -10.289\n", - "boston.data[5] 3.8048 0.418 9.102 0.000 2.983 4.626\n", - "boston.data[6] 0.0008 0.013 0.057 0.955 -0.025 0.027\n", - "boston.data[7] -1.4758 0.199 -7.398 0.000 -1.868 -1.084\n", - "boston.data[8] 0.3057 0.066 4.608 0.000 0.175 0.436\n", - "boston.data[9] -0.0123 0.004 -3.278 0.001 -0.020 -0.005\n", - "boston.data[10] -0.9535 0.131 -7.287 0.000 -1.211 -0.696\n", - "boston.data[11] 0.0094 0.003 3.500 0.001 0.004 0.015\n", - "boston.data[12] -0.5255 0.051 -10.366 0.000 -0.625 -0.426\n", - "==============================================================================\n", - "Omnibus: 178.029 Durbin-Watson: 1.078\n", - "Prob(Omnibus): 0.000 Jarque-Bera (JB): 782.015\n", - "Skew: 1.521 Prob(JB): 1.54e-170\n", - "Kurtosis: 8.276 Cond. No. 
1.51e+04\n", - "==============================================================================\n", - "\n", - "Warnings:\n", - "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n", - "[2] The condition number is large, 1.51e+04. This might indicate that there are\n", - "strong multicollinearity or other numerical problems.\n" - ] - } - ], - "source": [ - "import numpy as np\n", - "import statsmodels.api as sm\n", - "import statsmodels.formula.api as smf\n", - "\n", - "# Fit regression model (using the natural log of one of the regressors)\n", - "results = smf.ols('boston.target ~ boston.data', data=boston).fit()\n", - "\n", - "print(results.summary())" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "ExecuteTime": { - "end_time": "2018-04-29T07:13:21.823618Z", - "start_time": "2018-04-29T07:13:21.812795Z" - }, - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [], - "source": [ - "regr = linear_model.LinearRegression()\n", - "lm = regr.fit(boston.data, y)" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "ExecuteTime": { - "end_time": "2018-04-29T07:13:29.286705Z", - "start_time": "2018-04-29T07:13:29.280511Z" - }, - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "(36.491103280363603,\n", - " array([ -1.07170557e-01, 4.63952195e-02, 2.08602395e-02,\n", - " 2.68856140e+00, -1.77957587e+01, 3.80475246e+00,\n", - " 7.51061703e-04, -1.47575880e+00, 3.05655038e-01,\n", - " -1.23293463e-02, -9.53463555e-01, 9.39251272e-03,\n", - " -5.25466633e-01]),\n", - " 0.74060774286494269)" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "lm.intercept_, lm.coef_, lm.score(boston.data, y)" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": { - "ExecuteTime": { - "end_time": "2018-04-29T07:14:24.251725Z", - "start_time": 
"2018-04-29T07:14:24.248401Z" - }, - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [], - "source": [ - "predicted = regr.predict(boston.data)" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": { - "ExecuteTime": { - "end_time": "2018-04-29T07:14:33.380349Z", - "start_time": "2018-04-29T07:14:32.952670Z" - }, - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [ - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "fig, ax = plt.subplots()\n", - "ax.scatter(y, predicted)\n", - "ax.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=4)\n", - "ax.set_xlabel('$Measured$', fontsize = 20)\n", - "ax.set_ylabel('$Predicted$', fontsize = 20)\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "slide" - } - }, - "source": [ - "## 训练集和测试集" - ] - }, - { - "cell_type": "code", - "execution_count": 190, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([[ 6.32000000e-03, 1.80000000e+01, 2.31000000e+00, ...,\n", - " 1.53000000e+01, 3.96900000e+02, 4.98000000e+00],\n", - " [ 2.73100000e-02, 0.00000000e+00, 7.07000000e+00, ...,\n", - " 1.78000000e+01, 3.96900000e+02, 9.14000000e+00],\n", - " [ 2.72900000e-02, 0.00000000e+00, 7.07000000e+00, ...,\n", - " 1.78000000e+01, 3.92830000e+02, 4.03000000e+00],\n", - " ..., \n", - " [ 6.07600000e-02, 0.00000000e+00, 1.19300000e+01, ...,\n", - " 2.10000000e+01, 3.96900000e+02, 5.64000000e+00],\n", - " [ 1.09590000e-01, 0.00000000e+00, 1.19300000e+01, ...,\n", - " 2.10000000e+01, 3.93450000e+02, 6.48000000e+00],\n", - " [ 4.74100000e-02, 0.00000000e+00, 1.19300000e+01, ...,\n", - " 2.10000000e+01, 3.96900000e+02, 7.88000000e+00]])" - ] - }, - "execution_count": 190, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "boston.data" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": { - "ExecuteTime": { - "end_time": "2018-04-29T07:16:27.403480Z", - "start_time": "2018-04-29T07:16:27.398197Z" - }, - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [], - "source": [ - "from sklearn.cross_validation import train_test_split\n", - "Xs_train, Xs_test, y_train, y_test = train_test_split(boston.data,\n", - " boston.target, \n", - " test_size=0.2, \n", - " random_state=42)" - ] - }, - { - "cell_type": "code", - "execution_count": 
13, - "metadata": { - "ExecuteTime": { - "end_time": "2018-04-29T07:16:43.427978Z", - "start_time": "2018-04-29T07:16:43.423656Z" - }, - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [], - "source": [ - "regr = linear_model.LinearRegression()\n", - "lm = regr.fit(Xs_train, y_train)" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": { - "ExecuteTime": { - "end_time": "2018-04-29T07:16:47.859814Z", - "start_time": "2018-04-29T07:16:47.854257Z" - }, - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "(30.288948339369036,\n", - " array([ -1.12463481e-01, 3.00810168e-02, 4.07309919e-02,\n", - " 2.78676719e+00, -1.72406347e+01, 4.43248784e+00,\n", - " -6.23998173e-03, -1.44848504e+00, 2.62113793e-01,\n", - " -1.06390978e-02, -9.16398679e-01, 1.24516469e-02,\n", - " -5.09349120e-01]),\n", - " 0.75088377867329148)" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "lm.intercept_, lm.coef_, lm.score(Xs_train, y_train)" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": { - "ExecuteTime": { - "end_time": "2018-04-29T07:17:35.601265Z", - "start_time": "2018-04-29T07:17:35.598315Z" - }, - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [], - "source": [ - "predicted = regr.predict(Xs_test)" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": { - "ExecuteTime": { - "end_time": "2018-04-29T07:17:43.752187Z", - "start_time": "2018-04-29T07:17:43.605493Z" - }, - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [ - { - "data": { - "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAYwAAAEXCAYAAAC+mHPKAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAAIABJREFUeJzt3Xt4VOW1+PHvSggQEIkoCAQEIwKiiAEUvEG9oqgIaEWPIoIWsVattoggd8Kl6Gm91NOfqBwrWAWRIhaBHsULKIiBoBhsFESQCHIzXAOEZP3+2DNhkuxJZpK5JbM+z5Mnyd7v7HlnK3vlva1XVBVjjDGmIgnRroAxxpjqwQKGMcaYgFjAMMYYExALGMYYYwJiAcMYY0xALGAYY4wJiAUMY4wxAbGAYYwxJiAWMIwxxgSkVrQrEEqnnXaatm7dOtrVMMaYamXNmjW7VbVxReVqVMBo3bo1mZmZ0a6GMcZUKyKyJZBy1iVljDEmIBYwjDHGBMQChjHGmIBYwDDGGBMQCxjGGGMCEjMBQ0R+EJGNnq/lnmOPiMhWEckRkeujXUdjjIlnMRMwAFS1jefrchE5C3gQOBfoB7wiIknRraExxsSOvLw83nvvvYi9X0wFjFL6AXNV9YCqbgB+ALpEt0rGGBN9hw4dYtq0aaSlpdGvXz+2bt0akfeNpYCRLyKbRGSViPQCWgK+i0m2Ac1Kv0hEhopIpohk7tq1K1J1NcaYqBk0aBAjR47kl19+4dixY0yaNCki7xszAUNVz1HVs4DhwOtAbaDIp0gRUOjyuhmq2lVVuzZuXOHKdmOMqfYeffTREr//7//+L999913Y3zdmAoaXqi7H6X7aDqT6nGoB/BiNOhljTCy59NJL6d27NwANGzZkwoQJNGtWpgMm5GIil5SI1AdOVtXtIpKO0/X0AfCSiDwNtAIaAeuiWE1jjIkIVWXRokVkZ2czYsQI1zIZGRl06tSJ4cOHc8opp0SkXjERMIB6wMcikgjsA+5S1U9FZDaQDRwB7lNVjWYljTEm3JYtW8aTTz7JqlWrSExM5JZbbqFNmzZlyqWnp5Oenh7RusVEl5Sq7lLVtqp6lqp2VtUPPcenqOqZnvGNT6NdT2OMCZfPP/+cq6++mquuuopVq1YBUFhYyPjx46NbMR8xETCMMSZeffXVV/Tp04fu3bvzwQcflDn/5ptvsm3btijUrCwLGMYYEwXffvstd9xxB506deLdd991LdOnTx/Wrl1LixYtIlw7d7EyhmGMMXFh69atTJw4kVdffZXCwjIrBQC46qqryMjIoHv37hGuXfksYBhjTATs2LGDKVOm8OKLL3Ls2DHXMhdffDGTJ0/miiuuiHDtAmMBwxhjIuDNN9/k+eefdz3XqVMnMjIyuOGGGxCRCNcscDaGYYwxETBs2DBSU1NLHGvbti1z5sxh7dq13HjjjTEdLMAChjHGRETdunUZM2YMAK1atWLmzJlkZ2dz2223kZBQPR7F1iVljDEhUFBQwMyZM3nttddYtmwZderUKVNmyJAh1KpVi7vuusv1fKyrHmHNGGNiVGFhIbNnz6Z9+/YMGzaMzz77jBkzZriWTUpK4t57762WwQIsYBhjTKWoKv/85z/p1KkTAwcO5Pvvvy8+l5GRwaFDh6JYu/CwgGGMMUFQVZYuXcpFF11E//79yc7OLlNm3759fP7551GoXXhZwDDGmACtWLGCX/3qV1x33XVkZmaWOV+rVi2GDRvGpk2buPLKK6NQw/CyQW9jjKnA2rVrGT16NIsXL3Y9LyLcddddjB8/nrS0tAjXLnIsYBhjjB+bN2/m8ccfZ968eX7L3HLLLUycOJEOHTpEsGbRYQHDGGP8OHbsGPPnz3c9d91115GRkUGXLl0iXKvosTEMY4zxo127dgwaNKjEscsuu4xPPvmExYsXx1WwAAsYxhjDgQMH/J4bO3YsSUlJdOnShcWLF/PJJ59w+eWXR7B
2scMChjEmbu3fv5/x48eTmprqdxps69at+eKLL/jiiy+47rrrYirf04KsXC6dtowzn1jEpdOWsSArN6zvZwHDGBN3Dh8+zPTp0znzzDOZMGECBw4cYPTo0X7Ld+rUKaYCBTjBYuT89eTm5aNAbl4+I+evD2vQsIBhjIkbx44d44UXXuCss85ixIgR7N27t/jc+++/z4cffhjF2gXnqaU55BeU3IApv6CQp5bmhO09bZaUMabGO378OLNnz2b8+PFs2bLFtcypp57Krl27IlyzyvspLz+o46FgLQxjTI1VVFTE3LlzOe+88xg8eLBrsDj55JOZOHEimzdv5rbbbotCLSuneUpyUMdDwVoYxpgaR1V57733GD16NOvWrXMtk5yczMMPP8zw4cM59dRTI1zD8i3IyuWppTn8lJdP85RkhvdqR9/0kpsvDe/VjpHz15folkpOSmR4r3Zhq5cFDGNMjfLzzz/Tr18/Vq5c6Xo+KSmJ+++/n1GjRtGsWbMI165i3sFsbyDwDmYDJYKG9+eKAksoWcAwxtQojRs3dl1XkZCQwKBBgxg7diytW7eOfMUCVN5gdulg0Dc9NawBojQbwzDG1CgJCQlMmjSpxLHbbruN7OxsZs6cGdPBAoIbzI7bdRgiUltENojIy57fHxGRrSKSIyLXR7t+xpjYsnHjRvLy8lzP3XzzzVx00UXccMMNZGVlMWfOHNq3bx/hGlZOoIPZ8b4OYxTwA4CInAU8CJwL9ANeEZGk6FXNGBMrfvzxR4YOHUr79u15+umnXcuICMuWLeNf//oXF1xwQYRrWDXDe7UjOSmxxDG3wexorMOIiYAhIucAFwJzPYf6AXNV9YCqbsAJJPGV5csYU8LOnTt59NFHOfvss3nppZcoLCzkmWeeYefOna7l69evX6n3iXQ3T2l901OZ2r8jqSnJCJCakszU/h3LjFVEYx1G1Ae9xVlv/xzwAHCZ53BL4GufYtsA1+kMIjIUGApwxhlnhK+ixpio+OWXX3j66ad59tlny+yTfejQIaZOncpf/vKXkLxXoDOUwi2QwezmKcnkugSHcK7DiIUWxjDgI1Xd6HOsNlDk83sRULLt5aGqM1S1q6p2bdy4cRiraYyJpIMHDzJlyhTS0tKYMmVKmWAB0KZNGy655JKQvWc0unkqK9Cuq1CKegsDGAg0EJFfA42A+jgtDt/w2gL4MQp1M8ZE2JEjR3jxxReZMmWK3+6mli1bMnbsWAYNGkRSUuiGN6PRzVNZcbkOQ1WL/zwQkXtwuqX+BcwSkaeBVjiBxH25pjGmRigoKODvf/87EyZMYNu2ba5lmjRpwqhRo7j//vupW7duyOsQjW6eqoj0OoyoBww3qrpGRGYD2cAR4D5V1ShXyxgTJkVFRVx00UV+03ikpKTw+OOP89BDD3HSSSeFrR7RSLdRncRUwFDVV4FXPT9PAaZEsz7GmMhISEjguuuuKxMw6tevz+9//3v++Mc/kpKSEvZ6RKObpzqRmvSHe9euXTUzMzPa1TDGVMLevXtJS0tj37591KlTh9/+9rc88cQTNGnSJNpVq/FEZI2qdq2oXEy1MIwxNdtnn31GUVERl112WZlzjRo1YuTIkXz//feMGTOGFi1aRKGGpjwWMIwxYbdu3TpGjx7NokWLOP/888nKyiIhoeys/hEjRkShdiZQsbAOwxhTQ+Xk5DBgwADS09NZtGgRAF999RVz586t4JUmFlnAMMaE3A8//MDgwYPp0KGDa3AYO3YshYWua3FNDLMuKWNquEB2bwuV7du3M3nyZGbMmEFBQYFrmUsvvZTJkyeTmJjoet7ELgsYxtRgkcqNtGfPHqZPn87zzz9Pfr77quj09HQyMjK4/vrrcVLImerGuqSMqcHCnRvpwIEDTJw4kbS0NKZPn+4aLNq3b89bb71FZmYmvXv3tmBRjVkLw5gaLNy5ke655x7mz5/veq5169aMHz+eO++8k1q17FFTE1gLw5gaLNDd2yr
rj3/8Y5ljTZs25YUXXiAnJ4dBgwZZsKhBLGAYU4OFOwX2xRdfzI033gg4C++mT5/Opk2b+O1vf0vt2rVD8h4mdljoN6YGq2puJFVl/vz5fPPNN4wePbr4uO/Mq4ZpfRgw9GxmPDWek08+OSyfw8QGyyVljClDVVmyZAmjR49m7dq1JCYm8s0333D22WeXmXkFTqvFbRtRUz0Emkuq3C4pEZknIiNFpJeI2HZ2xsSBTz75hB49etC7d2/Wrl0LQGFhIePGjQOq1650JrQqGsPoD0wG3gN2iMhWEVkgImNF5AYRcd1n2xhT/WRmZtKrVy969uzJihUrypx/6623yM3NrVa70pnQqmgMoxXQ2fPVxfO9D3CTt4CI/Ays9XytAdaqqm2nakw1kZ2dzZgxY/jnP//pt8ytt97KxIkTSU1NpXlKTrXalc6ETrkBw/Pg/xF4x3tMRGYA9wHfAHuANKA3cL1Pmd2qeno4KmyMCY1NmzYxfvx4Xn/9dfyNZV5//fVkZGTQuXPn4mO2K138CmqWlIgMB+4EeqjqCp/jlwATgKtwAkzNGUk3JkwimePJ17Zt25g0aRIzZ87k+PHjrmV69OjB5MmTXfetsF3p4lew02ofBN7wDRYAqvoZcI2IjADGApeGqH7G1EiRyvHkZt68ecyYMcP1XNeuXZk8eTLXXHNNuSk8+qanWoCIQ8Eu3GuC0w3lSlX/BHwNjKpKpYypyRZk5fKHuV9GbabRsGHDyuxmd+655zJ//nxWr17Ntddea/mejKtgA8Z3wNUVlPkIuLxStTGmhvO2LAr9jBnk5uVz6bRlLMjKrfJ7+RuXqFu3LmPHjgUgLS2NWbNm8eWXX9KvXz8LFKZcwXZJzQT+IiKjVHWKnzJNgUZVq5YxNZPbGobSKts95R0Tyd29n4Sc90ncvIL1mauoW7dumbL33HMPderU4Y477iApKSm4D2HiVrAtjL8C7wOTRGSZiJQYERORG4DbcQa+jTGlBLpWIdjuqQVZuTwxbx3/+fgdtr00lM3/eoGN2V/y4Jg/uZZPSkri7rvvDihYLMjK5dJpyzjziUUha/2Y6imogKGqhcANOC2NXwEfi8hOEckUka3AQiAJeDbUFTWmJghmrUKgwaWoqIgR0//Gpr8NZe+S5yjcv6v43Kz/9wwHDx4Mup5e3i603Lx8lBOtHwsa8SnobLWqWqCqvwG6A/8AinAW9DUDcoBBqvp8SGtpTA3hlj3W36hBRcFFVXn33XdJT0/n2zcnc/yXn8qUOX40ny+++KKy1bU0IKaESmerVdXVwEAAEakDFKqq+6RuYwzgvobhivaNeXtNblAL4ZYtW8YDj/yRb7/Oci+QWIsGF1xPu153c8UVV1S6vv5aOd7BeVuHEV8qFTBEpD3QDeePoy2q+mFIa2VMDea2hqFrq0ZMeDebXw4XAFCnlnvjf9WqVTz55JMsW7bM/eKSwEkdr6bhpbfT4NRmPNm/Y5Xq2jwl2TUNiEDx8UiuITHRFexK7wTgFeBu7yGcLqlanvOilciX7rnuUpzcVQo8rKpLReQR4A9APvB7VV0c7LWNqUi0VlyXdqSgqPjnvPyCEg/hjRs38thjj/Huu+/6fX29c3rS6PI7qXVK85B9Drc0IELZVA7ebioLGDVbsC2MEcAg4DPg78C1OBltvS4RkdeBwUG2OhS4W1W3i8h1wGQR2YizsvxcoCXwvoi0UtWCIOtsjF/RXHHtq7yxgr7pqRQVFfHee++5vja5TTdSLr+L2k3ORIDN024IWb3cutDcWhxg2WrjQbABYzDOwHZPVS0Ukeb4BAxV/VREjgO3AQEHDE+rZLvn11bAl0A/YK6qHgA2iMgPOBlzVwVZZ2P8quhBHSkVpQxv27YtgwYNYubMmcXn6ra6gJQeA6nT/MRYRzgyxpbuQrt02jLLVhungp0l1QpY7Jle688a4JJgKyIij4vIHuBRYCJOq2KLT5FtODOxSr9uqGdab+auXbtKnzbGlXd
tQXl/LUdy/YH3YVt05KDrcYCxY8dSu3ZtLr74Yia+OIfWA6eWCBaRyhgb7n3CTewKNmDsB+pUUCYXlwd7RVR1uqqeipOHailQG2d8xKsIKBOoVHWGqnZV1a6NG9umgKZivmsL/GmYnBTR9QfDujfhwPLX2PY/gzia+x+g7EO4VatWZGVl8emnnzJm6G1M7d+R1JRkBEhNSY7YFql901Oj9t4muoLtkvoCuFpEElS1yE+ZIqBhZSukqvNF5DmcLirf/wNbYCvITQhUlJ4jOSkRESLSVXXgwAGeffZZnnrqKfbv3w9A3vLX6DzsL66D1h06dCj+OZoZYy1bbXwKtoXxCnA2TpeRP+dTTkZbNyKSJiJNPT9fDBwBFgG3i0g9ETkHJz/VuiDra0wZ5Q3Oev9azjvsPrciVAO7+fn5/PnPfyYtLY0xY8YUBwuAI1u+YtJFNkXVxJ6gWhiq+raIzAFGeh7iR3zPi0g/nGy284KsRwqwREQSgZ3AAFVdIyKzgWzP+9xXmSm7xpTmb6ZPakoynz5xJeC0QsIxsFtQUMDMmTOZNGkSubnu3Vunn346eXl5VXofY8Ih6NQgODvu/Q3oi5NoEBH5UETW4wSKAsA945kfqrpWVduq6lmqerGqrvEcn6KqZ6rqOar6aSXqakwZgQzahnpgt7CwkNmzZ9O+fXuGDRvmGixOOeUUpk2bxqZNm+jfv7/LVYyJrqBXenvGLn4nIrOAh4BrgJ6e018Bw70PfGOiyd+CPG9Xz/iF2eTlO11PdZNK/u3ku/4gNy+fRJESOZQC7S5SVRYsWMCYMWPIzs52LXPSSSfx2GOP8dhjj9GwYaWH/4wJu2BXev8X8LGq5qrq58DnnuN1AFHVI+VewJgICWRB3tHjJ+Zt/HK4oMx57/fKLuzbsWMHN910E5mZma7n69aty4MPPsiIESOwGX6mOgi2S2o2cG/pg6p61IKFiSUVZVkNNAtrVbK1NmnShKNHj5Y5XqtWLYYNG8bGjRt5+umnLViYaqMyYxjlEpFbRWRSqK9rTDAqWjld0flAr1OehIQEJk068U9BRBg4cCA5OTn87W9/IzXVZkGZ6qXCLikRuR3IBDYFeM1zcRbfjalCvYypEn8zobyznCo6H+h1AL755hvW7Czibyt/LjNe0qdPH7p160aLFi2YOHFiiXUUgYiVxIjGQGBjGP/ASQ540PP9GhH5CVgLrHdJBlgfKNsON9VCTXlAuWVZ9Z3l5C8Lq3efhyvaN+bD/+wiNy+/THZW73U2b97M+PHjmT17Nqdc/GtOumwgUHac48MPPyQ5OfjpuLGSGNEYL6loaYOIPAak4yT+a+857H1RAbABJ3hk4aQOmQIcVNVzwlHh8nTt2lX9DTCaipV+QIHzcKyuaR9KBz9vECj9u1tQ8Cc1JZl7Ozckc8ErvPzyyxQUOH8vSVIdUu9/mcT6p5Qo613XURn+cl1V9brGlCYia1S1a0XlKmxhqOqffS5aBLwKrMTZlrUz0BG4oNTLfhdMZU1siJXMraHiO4XW7a/1t9fkMrV/R7+L9EorOryPxj+9y4MT/s6RIyXneGjBUfatfItGVw8tPlbVVeFVGT8xJhyCXYcxHchU1eKV3J7V2R1wgsapwBpVXR66KppIqakPqAVZufxh7pcUlmpN5xcU8ujcdVSUP6Do6CH2r17A/swF/HjM/V4kndqSuq3OL3GsqqvCAx1nMSZSgk0N8oTLsUJgvefLVGPV8QFV0ZiLt2VROlh4lRcsigqOcGDtv9i/6m2KjhxwLZOWlsZN9zzMkiNtOOLTOAtFuu+KxmGMifSYY7AL9/rg7HXxgqpa5tgapro9oEYvWM/rq7YWjz24DQpXlJnWjR4v4MCXS9i/ci6Fh35xLdO8eXPGjh3LkCFDSEpKCss/XLfd7qrrJAQTetGYFFHhoHeJwiJLcMYtWqvqYZfzldrTO1Rs0LvqqsssqQVZuTw6Z53rQHWqp96Bjk340qJCtv/vQxT
s3up6/uSURowb8yQPPPBApWY+GRMqoZwUEbJB71LOB/7tFiw82ojIR8AfVPXNIK9tYkB12efgqaU5fmc1ef/SCrZlASAJiTQ57zJyP/pHieOJdepz25AHePFPY2nQoEElamxMaEVjzDHYgNGIktumlqCq34nINuAewAKGCSnf1k95zVhvosDKkk59SFz1DoVHDpGcnMwjjzzC8OHDadSoUaWvaUyoRWPMMdiAsYuKt19dB9xYueoY43BbQ/H2mtwKA4GA3wFucIJJoSpHtn4FkkDdlueVLVP3JBp0H4Ac2s1/Tx7P4GvSq/pxjAm5aIw5BhswPgNuEJFkVfXX7skDTqtatUy8WZCVy4R3s/nFZae73Lz8EoPb/ghwZ/czihfjlZaakszNqYeZMG4shzdnkdS4Nc0GP0e92knUqZVQnOocoGE3Zz+Kl9f8wuBrqvLJjAmPaEyKCHbQ+1fAMuAt4E5VPe5S5l9Ad1WNeNCwQe/qaUFWLsPnfUlBYeXmSwiU+MfitmJdftlKk2/fYfVHS0u89uwBTzJ9xAN+B9Ddrm9MTRPooHdQ2WpV9SPgOeDXwApPAPF9018D1+OsBDcmIE8tzal0sABIqZfEoaPHeXTOOi6dtgyAqf07kpqSzPG9uRxa8me2vPRgmWABoGvmcNP5Tcvt91VODKQvyHLfVtWYeBB0enNV/T0wDegKfCAiu0VkjYj8iDPQXeQ5b0xAqjqr45fDBeTlF5R4sO/e8RP1Vr9M7isPsPvLZbi1pOu17sR9o/+bxMRE1y1ZSwt0Hwxjaqqgt2gFUNVRIjIX+ANwLU5yQoCvgSds/20TDH+zPSqj8NAv5K58i6FTF6OFZcdDAGo3a0dKj4Ekt76AhduTGUHZ/mB/7Z3qnibFmKooN2CIyACc3FAbS59T1XXAQE8526LVVNrwXu2qNIYBUHjkIPtXz+dA5jtogXt2/aTGrZ1AcdZFiAhQMgD4rkHxtygqltOkGBNuFbUw3gCKRKSDqn4rItNw0pivVdXvvIVU1fa/MEEpPW12wIUtWfTVdtdZUr6SkxJJEDh0rOT02j3vPUP+d6tcX1PrlOakXHYn9c65HJGSvbD+AkB1S5NiTCRUFDD+iLMPhndl9+N4tg0QkYM4ay68e2GsBTaoalF4qmpqivJSjfdNT/X7132iCFP7d2T8wmygZMBo2O3WMgEjsUFjGl56Byd1vApJKDs+UV4AsDxOxpRVbsDw3QvD40pO7IPRGbgUuJwTe88cEZH1ON1YD4a4rqaGqGjfDX/jBEWqZG7ZW2K9hFed1PYkt7mI/I2radjoNG699yGWJ17AUXUfyE5JTmJ8n3PLDQDVJU2KMZESbHrzj4CPvL+LSD2gEyWDSDpwIWABI8oikUiwMu9RUQ4ct0Fw1SKOfbucv67YRspld7q+vm3v+xjQtC8PP/ww9evXL66b2456R49bQ9iYYAW1cC+gC4rUBs5T1bUhvXAAbOHeCZHYbtXtPQBOqZfEuJv8//WePvHfrmMVCeLsT1ErAQo8z3NVJX/j5+R9MouC3VtAEmh+399IalTy2uV9toqyelaXDL3GhEtIFu6JyDwRGSkivUSkcSBvrKrHgg0WIlJXRGaISI6IbBGRRz3HHxGRrZ7j1wdzzXhXXrdPON8DnHUR5S1y8/c3SpE6rYCCIk+g2JzFjll/YNf8DCdYAGgReSteL/Pa8gJheS0ab9DL9UyltQV6xvhXUZdUf8+Xd6A7F2dwey2wBme21PYQ1KM+sBS4H2eb12wRWYvTrXUu0BJ4X0RaqWr502gMEJnUx+Vdq7y9wPe5jEH4OrLtG/KWv8bRre6bOB7O+YzjB/ZQq8GpgDMYXl6LoLysnjVtH3NjwqmigNGKE2MTXTzf+wA3eQuIyM+UDSJB7canqnuAtz2/7vasGu8BzFXVA8AGEfnBUwf3uZOmhEikPq5owZ2/gOLvdcd+3kTeJ7PI/95ft6JQv0NPGl72X8XBAuCObi3LrWd
5U2QfnbMuqLobE88qmiX1I/Aj8I73mIjMAO4DvgH2AGlAb5wcUt4yu1X19MpUSETOA+riZLz92ufUNlxSq4vIUGAowBlnnFGZt6yRwrmOoLzBZF++wanEa6RkuYI9P5K3/HUO56zw+5712l7MlIxJ7Epqwhuf/0ihKoki3NGtJRl9O5Zb3/KmyPrblc8W6BlTVrB7eg8H7gR6qOoKn+OXABOAq3ACTKVG0kXkNGAWMBgYgpOXyquI0pPvAVWdAcwAZ9C7Mu8byyo7IBuOdQQLsnIZvzC7xLRWfzfcNziVHhz3jmEc3/czeSve4FD2MvCzfKdu63RO6TGQIf2u5hFPYKgoQLjxN0XWFugZE7hgc0k9CLzhGywAVPUz4BoRGQGMxVmfERQROQV4Fxilql94Brl9/4W3wAlGcaOqm7yHch2BvxlRXinJSdSvU8s1OPkbHD/83ecc+vp91+vVSe1ASo+BNG3XucL1ElVhC/SMCVywAaMJTjeUK1X9k4j0B0YBtwV6URE5GVgITFbVxZ7Di4BZIvI0zlhKI5yV5XEjlgZk/T30vfblF7Bu3LWu5/yNBzS44Hr2r/4nhQd2FR+r37wN9S+5k7M6X87j17WPyOe0BXrGBCbYgPEdcHUFZT4C7g7yug/jDKg/IyLPeI5dC8wGsoEjwH0a6kUjMS6cM52C7eqq6D3L6/Nv1rAuP+0rm5dSaiXR8NLb2bvkeWo1akHadUP4ZtY4EhKCzrpvjImAYAPGTOAvIjJKVaf4KdMUpzUQMFXNADJcTk3xfMWlcM10Kr3DXW5ePsPnfQn47+oqb0aUvz7/OSs3MiLjaX7K/D+aDXwaqVW7TJmTOl5NQlJd6rW/jGMJieUGC1tgZ0x0Bfun3F+B94FJIrJMRC7zPSkiNwC3E2djDeHitqlPKAZkJ7ybXSaVeEGhMuHd7KDqAs6qbt9Fcwuycrk4Yymn9nqQO6/txpb3XqRg5/ccyFpc5rUAkpBI/Q49kYTEcgOhLbAzJvqCzSVV6AkK/wPcC3wsInuArTjjG6k4WyA/G+qKxqNwDcj6SyFeXmrxQOryduZWHpr0HLs+ns3xfT+XeP2+VXM5qdO1JNWpR6Fqmem4FQXCWBq2aiolAAAY3klEQVTPMSZeVTqXlIhcBDwEXIMTLApxxjimquqskNUwCJZLKjCtn1jk99wP024IuuunqKiI+fPnc/eDfyB/51bXMlI7mSa3jiO55Xls9nmP3Lx8EkWK11UUqpKakswV7Rvz4X92FdfBX3eYAJun3RDU5zfGlBRoLqlKbdEKoKqrKbnjXqGqHq/s9UzkpCQnuaYIT0lOCmoqr6qyZMkSRo8ezdq17unDpFZtGnS+kZO73UJivYbF3U7ea/m+V6GeGFOZvepE4ClvgaAtsDMmcio1HUVE2ovIIBG5R0SuUNWjFiyqj/F9ziUpoeRy66QEYXyfcwNOWvjJJ5/Qo0cPevfu7R4sEmpxUvoNNB/6EqdcMYTEeg3LdDtVNFXXl+K0JnzZAjtjIivYld4JwCucmDYrOCuwa3nOS7xNfa2OyhuPqCi3Uk5ODg8//DD//ve/3S8uCdQ/9woaXnoHSSlNiw+nunRtBTs9WD3XsVlSxkRHsF1SI4BBwGfA33HWSvT3OX+JiLwODFbVD0NTRRNKpccn/jLgghIP3YZ+uqsaJicBkJiYyAcffOB67XrtLiPlsjtJOu1EMsBT6iWRNdZ9QV9FyQvd5ObluwYfY0z4BdslNRjIAXqq6kuUTA6Iqn4KHCeIVd4mcgKZmlo6MWDp423atGHw4MElzvXu3Ztm9zxL475PlAgWAHnlzLzyN1W3Ijal1pjoCDZgtAIWq2p5Hc9rgEsqXyUTLoGMT3gf8IX5+0uU833wjx07ltq1a9OzZ09WrFjBokWLOLPdea7vWd6gdN/0VKb270iqp0yiJyqlBjCQHerNoIwxFQu2S2o/UKeCMrnAFZWrjgmXBVm5frt/fMcSGtc6yn+Wvsb
BdYtpMmAydVucA5R88Lds2ZKvv/6aNm3aIJ6H/PBe7UqsHgdISpQKB6X95XHyt62qv3obY8Iv2BbGF8DVnsFvf4qAhpWvkgk1b1eUP81TksnLy+O23/yeNdPv5EDmO+jxY+Qtfw1VdZ2NdPbZZxcHi2KlpztUYfpDIN1VNqXWmMgKNmC8ApwNTCynzPmUk9HWRF5501f12BFqZ79Dy1ateevlZykqOJEk8OjW9STt+Lrc/bJ936OgqFS6kSKtdLdR6e4qm1JrTPQFmxrkbRGZA4wUkXNwssgWE5F+ONls54WuiqayfFdTl6bHCziwbjH7Vs1l66E819cnNjiNJD0e0GykUGXW9bfK3BIPGhN9lVnpfSdOC+IB7wER+RBnS9UOQAHwp5DUroYL50PQ34ZHWlTIwfUfsO+zNyjcv8v1tQn1GtKw+200SL+eIy4ZZt2EIrNuRavMLUAYE11BdUl5WhWqqr/DmQn1BrAb6AmcC6wHblTVNaGuaE0T7uyrpbuhVIs4tOFjfnr5AfYuec41WEid+qRcPpDU+1/m5AtvRmrVDviBH4rMuoGuMjfGREewLYxsnE2N7lbVz4HPoTiXlKhq2V1yjKtwZ1/17Qo6fmAPO98aR8GuH1zLSlIdGnfrS8Nut3CsVr3i48E88Pump5K5ZS9vfP5jcSLBW7oE1yoI54ZRxpiqCzZg/ILLXheqejQ01Ykf4X44+nYRJZ50inuhxFo0uKA3p19+O0/d3QOofCr1BVm5vL0mtziBYKEqb6/JpWurRiWuUV43XLg2jDLGhEawAWM50D4cFYk34X44Du/Vrng8QCSBlB4D2fX2JAASEhM5rXMvane9lTPOaFXioV3Z1k0gLaaKxih86+xV3WdD2WC9qUmCDRiTgeUicqGqfhGOCsWLcDwcv/zyS1q2bEmjRo3KJBhs06Unp25cTHqHs5kwYQJnn312lT+Dr0BaTBUFlXBtGBUtwaSKN6Y6CDZg3AosA94XkYdV9e9hqFNcCOXDMScnh3HjxjFnzhyeeOIJpk6dWvwevtc7+tgK6tSpaKF+5QTSYgokqNSk2VC2S6CpaYINGMM5sTXBTBGZBizCGfzOBNbbvhiBq+rDccuWLUycOJFXX32VoqIiAJ577jkeeeQRmjZtWqZ8uIIFBNZiircxChvENzVNsCu9r8QJGv/AyVrbGBgC/D+cgHFARFaLyP+EtJamhB07dvDQQw/Rtm1bZs6cWRwsAA4fPky7G4fS+olFnDXyPUYv8J8SJJR8V2YLTgLB0ivEQzH1tjrxFwhraoA0NV+wK70/Aj7y/i4i9YBOQGefr05AF+C3oaqkcezdu5fp06fz3HPPkZ/v/ldqUpMzqXtmZ8CZqeTd6jSjb8ew16+iFlNNG6OoSE0cxDfxTUK9QZ6I1AbOU1X3TZ7DqGvXrpqZmRnptw0b7wybbT/vQb9+jz2r3ubwwQOuZdu2bUveOf2p2+4SSueGTBRh09TekaiyKcVmSZnqQETWqGrXisoF1MIQkfHA/cCpwBac3fb+pKpldsdR1WNAxINFTbMgK5cRczLZtfpd9q16i6JS+1N4tWrVinHjxjFw4EDajF7qWqbQds2Nmpo0iG9MhQFDRIYAY30OnQVMAC4Ebg5lZUQkGWipqt+G8rrV0fTFG/j+pd9xfO821/Onn346o0eP5je/+U3xYHaiiGtwSPS3jZ4xxgQhkEHvYcAxnKSDLXCy0a4FbhSRX4eiEiJysogsAH4GHvc5/oiIbBWRHBG5PhTvVV1s33+Meu3KblyYUPckpk2bxqZNm/jd735XYubTHd1alilf3nFjjAlGIF1SZwHzVPUNz+8/icg1wEbgbuCtENSjCHge+BfQHUBEzgIexElq2BJn7Ucrt26wmqh5SjLHL+rPwbWLKDp6CKmdzMld+9LumtsZMaKP62u8A9u++Zzu6NYyIgPexpiaL5CAcQpOcCimqnkisgintVFlqnoQ+EBE7vE53A+Yq6oHgA0
i8gPO7KtVoXjPWKCqjHvhdeZ/tZNDjdqVGBR1Ztgco+Elt1N4cC8nd7+Vkxo2YuTN5T/8M/p2tABhjAmLQKfVFrkc24ozCB4uLYGvfX7fBjQrXUhEhgJDAc4444wwVie0li9fzrBHhrMh63OSTmtFs8HPuaaOeKp+bZthY4yJCZXZQMnrOJAUqoq4qE3JQFUElNlnVFVnADPAmVYbxvqUUNnpkmvWrGH06NEsWbKk+FjB7i0c/s9y6nf4VZncShYgjDGxItCV3mNEZL2IvCwiQ0XkAqoWbAKxHfB9WrbAJbV6NFRm86MNGzZwyy230LVr1xLBwitvxT9QdeKjpY4wxsSiQALGB8A+nMHnIcDfgDXAKAAReVpE7hKRc6X0irGqWQTcLiL1PDv9NQLWhfD6lRbMznDff/89d999N+eddx7z5893vV7dM7twWp/HixfcWeoIY0wsqrCVoKrXAIhIGtDV5ysdaAg8hpOQEOCoiHwNZKnq/YFWQkQaAFlAA6CuiPwK+A3O7n7ZwBHgPg31svRKCiSpXG5uLhkZGbz88sscP+6ej7FDejcOdrwVaXZO8TFLHWGMiVUBdyup6vfA98Bc7zERaUvJIHKB53sXnJXhgV77ANDG5dSHwJRArxMp5WVd3b17N9OmTeOFF17gyBH3HWu7dOnC5MmTufbaa3ln3U+WOsIYUy1UaRzCsyL7W5zstYiIAOfgBIwaq7ykcvfeey8LFy50fV2HDh2YNGkS/fr1Qzyrr21g2xhTXYRyzAF1bFDVWaG8bqwpL5X3qFGjypRPS0vjtdde46uvvqJ///7FwcIYY6qTkGerjaZYyVZ78803s3DhQlJTUxkzZgxDhgwhKSmcM5CNMabyQpqt1pxw/PhxZs2axaZNm8jIyHAtk5GRQc+ePXnggQdITrYZT8aYmiHuA0agC/CKioqYN28eY8eOJScnh4SEBAYOHEi7dmVnNHXs2JGOHS09hzGmZgnpGEZ1E8gCPFVl0aJFdO7cmQEDBpCT46y1KCoqYty4cVGquTHGRF5ctzDKW4DXNz2Vjz76iFGjRrFy5UrX1y9YsIAdO3bQtGnTkNTH29rJzcsv3tsi1abaGmNiRFy3MPwtwNu8YR3XXHMNV1xxhWuwSEhIYMiQIeTk5IQ0WHhbO3Bil7xA0o4YY0wkxHULo/QCvGO7fiBv+Wzyv1vFdj+vGTBgABMmTHAdu6gKt9aOl2+rxxhjoiWuA4Z3Ad7+nT+St+J1Dm/4hBNZTkq66aabmDRpEp06dQpLXSpKOGgJCY0x0RbXXVLeBXh1fvqSwxs+xi1YXHnllaxcuZKFCxeGLVhAxQkHLSGhMSba4jpggBM0vpn337RsWXLf627duvH+++/zwQcf0L1797DXY3ivdiQnJbqes4SExphYEPcBA6BOnTrFU2Q7duzIwoULWblyJVdddVXE6uCbbgQg0ZM+xDftiDHGRFNcj2H4GjRoEI0aNeLmm28mISE6cdQSERpjYpkFDI9atWrRr1+/aFfDGGNilnVJGWOMCYgFDGOMMQGxgGGMMSYgFjCMMcYExAKGMcaYgFjAMMYYExALGMYYYwJiAcMYY0xALGAYY4wJiAUMY4wxAbGAYYwxJiAxHzBE5DYR2SwiG0VkSLTrY4wx8Sqmkw+KSAPgv4HuQCGwTkTeVdVd0a2ZCcSCrFyeWprDT3n5NE9JZnivdpaN15hqLNZbGL2Aj1U1V1V3AMuAyG1SYSptQVYuI+evJzcvHwVy8/IZOX89C7Jyo101Y0wlxXrAaAls8fl9G9DMt4CIDBWRTBHJ3LXLGh6x4qmlOeQXFJY4ll9QyFNLc6JUI2NMVcV6wKgNFPn8XoTTNVVMVWeoaldV7dq4ceOIVs7491NeflDHjTGxL9YDxnbAt9O7BfBjlOpigtDcs9VsoMeNMbEv1gPGUqCXiDQRkabAJcC/o1wnE4DhvdqRnJRY4lhyUiLDe7WLUo2MMVUV07OkVPVnEXkSWOk
59AdVPRTNOpnAeGdD2SwpY2oOUdVo1yFkunbtqpmZmdGuhjHGVCsiskZVu1ZULta7pIwxxsQICxjGGGMCYgHDGGNMQCxgGGOMCYgFDGOMMQGxgGGMMSYgFjCMMcYExAKGMcaYgFjAMMYYExALGMYYYwJiAcMYY0xALGAYY4wJiAUMY4wxAYnp9OaxbEFWrqXuNsbEFQsYlbAgK5eR89cX71mdm5fPyPnrASxoGGNqLOuSqoSnluYUBwuv/IJCnlqaE6UaGWNM+FnAqISf8vKDOm6MMTWBBYxKaJ6SHNRxY4ypCSxgVMLwXu1ITkoscSw5KZHhvdpFqUbGGBN+NuhdCd6BbZslZYyJJxYwKqlveqoFCGNMXLEuKWOMMQGxgGGMMSYgFjCMMcYExAKGMcaYgFjAMMYYExBR1WjXIWREZBewJdr1qKLTgN3RrkQMsftxgt2Lkux+nFDVe9FKVRtXVKhGBYyaQEQyVbVrtOsRK+x+nGD3oiS7HydE6l5Yl5QxxpiAWMAwxhgTEAsYsWdGtCsQY+x+nGD3oiS7HydE5F7YGIYxxpiAWAvDGGNMQCxgxAARSRaRttGuhzHGlMcCRhSJyMkisgD4GXjc5/gjIrJVRHJE5Pro1TByRKSuiMzwfOYtIvKo53jc3QsAEUkQkf8TkW89n72X53hc3g8AEaktIhtE5GXP7/F8L34QkY2er+WeY2G/H5bePLqKgOeBfwHdAUTkLOBB4FygJfC+iLRS1YKo1TIy6gNLgfuBU4FsEVlLfN4LAAXuVtXtInIdMFlENhK/9wNgFPADxPW/k2Kq2sb7c6Tuh7UwokhVD6rqB8Bxn8P9gLmqekBVN+D8A+kSjfpFkqruUdW31bEb+BHoQRzeCwDPfdju+bUV8CVx+v8GgIicA1wIzPUcitt74UdE7ocFjNjTkpLpTbYBzaJUl6gQkfOAujjpDuL2XojI4yKyB3gUmEic/r8hIgI8Bzziczgu74WPfBHZJCKrPN2VEbkfFjBiT22criqvIqAwSnWJOBE5DZgFDCbO74WqTlfVU3G6YpYSv/djGPCRqm70ORav9wIAVT1HVc8ChgOvE6H7YQEj9mwHfPd+bYHTPVPjicgpwLvAKFX9gji+F75UdT5wEvF7PwYCt4vIOpyWVj9gB/F5L0pQ1eU43U8R+X/DAkbsWYTzj6Oep9+2EbAuynUKOxE5GVgITFbVxZ7DcXkvAEQkTUSaen6+GDhCnN4PVb1EVTuq6gXAWOCfOBNF4u5eAIhIfRFp5vk5Hafr6QMicD9sllQUiUgDIAtoANQVkV8BvwFmA9k4D4n7ND6W4z8MdAaeEZFnPMeuJT7vBUAKsEREEoGdwABVXSMi8Xo/Sojze1EP+Njz/8Y+4C5V/TQS98NSgxhjjAmIdUkZY4wJiAUMY4wxAbGAYYwxJiAWMIwxxgTEAoYxxpiAWMAwxhgTEAsYxhhjAmIBwxjjl4icISIqIvOjXRcTfRYwTLXi2VRIPV8Pl1PuFZ9yr0SyjjVMZ8/3tVGthYkJFjBMddOZE/uHnO9WQES64WS79WbrzIxAvWoqb8BYE9VamJhgAcNUG55dxRoBq3G2tS0TMEQkAXgB2MWJQGEBo/K8m/BYC8NYwDDVSlfP9zU4SRvP9QQIX/fjPOQeB84CCoCvSl9IRG4RkcUisltEjonIdyIyypPQrXTZ/xKR1z37ax8QkV9E5AsRGexWSRG5XETmeza4OSIiO0VktYhMKVVulKfLrJ/LNVqVHjsQkSs8x54SkYtE5B0R2es5dm4VPl8tz37QX4lIvjh7qj/u2bioM5Crqj+7fVYTXyxgmOrEN2CsxcnaeZb3pGfzpcnASuBjnB371qvqUZ8yiSLyBjAPaAO8BfwPzoYzk4GZvm/oySj8dyANWA78FZgPnAnMFJERpcqPAj7BCVofAH/G2eOjDnBdqc9TXneP97NmuZQ/z1OXIuBF4B/Afyr5+WoD7wHP4HTh/dVT7wnADKA
p1rowXqpqX/ZVLb6ADwHF2ej+Vs/Pt/icfxnnoZcO3OI5/2Kpa/zVc3wqUMvneBLwqedcB5/jJwFNXerSDDgA/Mfn2Ok44yvLgdourzmt1O+bgV1+PutUT11u9Dn2uufYAaC7n9cF+/le8hwbgyd7ted4D89xBcZH+7+9fcXGl7UwTLXg6R5JBw7j/DXt/av3fM/5bsAQnACRxYm+90yfa3QDfgu8o6ojVdU7eI6qFuC0JAC6+Rw/qKo7StdHVbcDP+GMqXi1BxKBb1X1mMtrdvvUpRHQGv+Dyd4Whu9f994WxiOquqr0C4L9fCJyEXAf8ImqTlJV9Sn/CfCN51cb8DaAbaBkqo+2QEPgM1UtBL4XkTzgfJ+B7j3AaE957wPXd8D7IUCAwyIy3uU9zvN8F+8Bz7axvwNuANoBJ1OyK9e3yygbZ0ObISLSGKdF8G9V/cXlvbwPf38D8p2Bnar6k6ce9XHuwU7gVT+vCfbzPeT5PtbP9fZ4vluXlAEsYJjqwy0ArAM6AkNxWhS/UdW9nnOdgaPA1z7lr/V8v6OC99oKICLnA//G6WpaDbwJ7MUZSD8TuBv40vsiVd0tIpcB44DewE1AoYj8H/Ckqvo+eL0toDJ/vYtIGk7LZYnP4U44gWqRqhb5qXdQn89Tfg/OmIubNOBnVc2t4HomTljAMNWF74C3VxbQE5gCfAG8AiAiZwKnAqs9XTGISF2gMU73S88A33MWzlapV6jqR74nRGSi58cSLQRV/Rr4tWcwuQdOMPs1cKGIpOqJAfh0z3e3v957+3w+L2+L5HO3igb7+TzlmwBZvl1RPucvAZoDi0ufM/HLxjBMdeEWMNbidK+kAL/zefCVGb/gRDfMaYG8mYi0xBkf+cglWKTgLAwsXZ9iqnpMVd9X1duAFTgB7HSfIu2BAlXdUuradXCmBns/n1dFXVhBfT6cyQGFOEHDzQSXOpg4ZwHDxDzPGMUFwCFODMSCMx20H3Clqq72OV6m+0pV83HWY3QQkf5+3ucyn3UKRzzf00QkyafMqcAcoAXOjKh1nuPpnoWFpa/ZBmfsYCuwzefUMSBJRNr6lK2PM+3VO9ZQuoVxDFjvVvdgP5+n5fUdkCoiN5UqNwK42vOrDXibYtYlZaqDc3Cmt37q23/vGa9Y4FLerYUBMBxYBLwtIu/jPGATgFTPa5JU9QzPtXeJyDLgSuBzT/lmwPU403uLgA2q6g0sDwODRGQ1zuD3Tpxxjj6e80NKjT0sBS4EPhaRf3o+31WeOm3HWWPyPRS3OjoAX7nNvqrM5/OYijNz6m0ReRPYAfwKZ1zoR6Al1sIwvqI9r9e+7KuiL5zBZQWeDbD8Hpzpt4ku5y7EWdS2A2fwejfOX+0vAleVKtsYeA0nzcgB4DNPXdI99XnFp2xfnDGPHGA/TmvgB5y1IWe71KMu8CzO1NzDOMHtfpzutSKcrjBv2S64rCnx89kD/nye8o/gBKYCnHQrb+O0ZrYAu6P9396+YutLVMuMdxljjDFl2BiGMcaYgFjAMMYYExALGMYYYwJiAcMYY0xALGAYY4wJiAUMY4wxAbGAYYwxJiAWMIwxxgTEAoYxxpiAWMAwxhgTEAsYxhhjAvL/ARaWPc4EJdAvAAAAAElFTkSuQmCC\n", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "fig, ax = plt.subplots()\n", - "ax.scatter(y_test, predicted)\n", - "ax.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=4)\n", - "ax.set_xlabel('$Measured$', fontsize = 20)\n", - "ax.set_ylabel('$Predicted$', fontsize = 20)\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "slide" - } - }, - "source": [ - "# 交叉验证" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "subslide" - } - }, - "source": [ - "# cross-validation \n", - " \n", - "k-fold CV, the training set is split into k smaller sets (other approaches are described below, but generally follow the same principles). The following procedure is followed for each of the k “folds”:\n", - "- A model is trained using k-1 of the folds as training data;\n", - "- the resulting model is validated on the remaining part of the data (i.e., it is used as a test set to compute a performance measure such as accuracy)." 
- ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": { - "ExecuteTime": { - "end_time": "2018-04-29T07:21:10.344979Z", - "start_time": "2018-04-29T07:21:10.333153Z" - }, - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "-1.5787701857180245" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from sklearn.cross_validation import cross_val_score\n", - "\n", - "regr = linear_model.LinearRegression()\n", - "scores = cross_val_score(regr, boston.data , boston.target, cv = 3)\n", - "scores.mean() " - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": { - "ExecuteTime": { - "end_time": "2018-04-29T07:25:40.617010Z", - "start_time": "2018-04-29T07:25:39.304291Z" - }, - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [ - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "scores = [cross_val_score(regr, data_X_scale,\\\n", - " boston.target,\\\n", - " cv = int(i)).mean() \\\n", - " for i in range(3, 50)]\n", - "plt.plot(range(3, 50), scores,'r-o')\n", - "plt.show()" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": { - "ExecuteTime": { - "end_time": "2018-04-29T07:25:34.856887Z", - "start_time": "2018-04-29T07:25:34.840623Z" - }, - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "0.45384871359695633" - ] - }, - "execution_count": 20, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "data_X_scale = scale(boston.data)\n", - "scores = cross_val_score(regr,data_X_scale, boston.target,\\\n", - " cv = 7)\n", - "scores.mean() " - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "slide" - } - }, - "source": [ - "# 使用天涯bbs数据" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "ExecuteTime": { - "end_time": "2018-05-29T07:23:08.949140Z", - "start_time": "2018-05-29T07:23:08.554345Z" - }, - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
titlelinkauthorauthor_pageclickreplytime
0【民间语文第161期】宁波px启示:船进港湾人应上岸/post-free-2849477-1.shtml贾也http://www.tianya.cn/5049945019467527032012-10-29 07:59
1宁波镇海PX项目引发群体上访 当地政府发布说明(转载)/post-free-2839539-1.shtml无上卫士ABChttp://www.tianya.cn/743418358824410412012-10-24 12:41
\n", - "
" - ], - "text/plain": [ - " title link author \\\n", - "0 【民间语文第161期】宁波px启示:船进港湾人应上岸 /post-free-2849477-1.shtml 贾也 \n", - "1 宁波镇海PX项目引发群体上访 当地政府发布说明(转载) /post-free-2839539-1.shtml 无上卫士ABC \n", - "\n", - " author_page click reply time \n", - "0 http://www.tianya.cn/50499450 194675 2703 2012-10-29 07:59 \n", - "1 http://www.tianya.cn/74341835 88244 1041 2012-10-24 12:41 " - ] - }, - "execution_count": 1, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import pandas as pd\n", - "\n", - "df = pd.read_csv('../data/tianya_bbs_threads_list.txt', sep = \"\\t\", header=None)\n", - "df=df.rename(columns = {0:'title', 1:'link', 2:'author',3:'author_page', 4:'click', 5:'reply', 6:'time'})\n", - "df[:2]" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "ExecuteTime": { - "end_time": "2018-05-29T07:23:27.984100Z", - "start_time": "2018-05-29T07:23:27.969145Z" - }, - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [], - "source": [ - "# 定义这个函数的目的是让读者感受到:\n", - "# 抽取不同的样本,得到的结果完全不同。\n", - "def randomSplit(dataX, dataY, num):\n", - " dataX_train = []\n", - " dataX_test = []\n", - " dataY_train = []\n", - " dataY_test = []\n", - " import random\n", - " test_index = random.sample(range(len(df)), num)\n", - " for k in range(len(dataX)):\n", - " if k in test_index:\n", - " dataX_test.append([dataX[k]])\n", - " dataY_test.append(dataY[k])\n", - " else:\n", - " dataX_train.append([dataX[k]])\n", - " dataY_train.append(dataY[k])\n", - " return dataX_train, dataX_test, dataY_train, dataY_test, " - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "ExecuteTime": { - "end_time": "2018-05-29T07:23:28.537926Z", - "start_time": "2018-05-29T07:23:28.509765Z" - }, - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [ - { - "ename": "NameError", - "evalue": "name 'linear_model' is not defined", - "output_type": "error", - "traceback": [ - 
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 7\u001b[0m np.log(df.reply+1), 20)\n\u001b[1;32m 8\u001b[0m \u001b[0;31m# Create linear regression object\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 9\u001b[0;31m \u001b[0mregr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlinear_model\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mLinearRegression\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 10\u001b[0m \u001b[0;31m# Train the model using the training sets\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 11\u001b[0m \u001b[0mregr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata_X_train\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdata_y_train\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mNameError\u001b[0m: name 'linear_model' is not defined" - ] - } - ], - "source": [ - "import numpy as np\n", - "\n", - "# Use only one feature\n", - "data_X = df.reply\n", - "# Split the data into training/testing sets\n", - "data_X_train, data_X_test, data_y_train, data_y_test = randomSplit(np.log(df.click+1), \n", - " np.log(df.reply+1), 20)\n", - "# Create linear regression object\n", - "regr = linear_model.LinearRegression()\n", - "# Train the model using the training sets\n", - "regr.fit(data_X_train, data_y_train)\n", - "# Explained variance score: 1 is perfect prediction\n", - "print('Variance score: %.2f' % regr.score(data_X_test, data_y_test))" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "ExecuteTime": { - "end_time": "2018-05-29T07:23:16.208659Z", - "start_time": "2018-05-29T07:23:16.054583Z" - } - }, - "outputs": [ - { - "ename": "NameError", - "evalue": "name 
'data_X_train' is not defined", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mdata_X_train\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;36m3\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[0;31mNameError\u001b[0m: name 'data_X_train' is not defined" - ] - } - ], - "source": [ - "data_X_train[:3]\n" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": { - "ExecuteTime": { - "end_time": "2018-04-29T07:26:38.754002Z", - "start_time": "2018-04-29T07:26:38.751117Z" - }, - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [], - "source": [ - "y_true, y_pred = data_y_test, regr.predict(data_X_test)" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "metadata": { - "ExecuteTime": { - "end_time": "2018-04-29T07:26:41.635527Z", - "start_time": "2018-04-29T07:26:41.541620Z" - }, - "slideshow": { - "slide_type": "fragment" - } - }, - "outputs": [ - { - "data": { - "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAW0AAAD+CAYAAADxhFR7AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAADiFJREFUeJzt3cGLJId1x/Hf690d1iWtkDMzjkLiroowIgHn4u0Q2/ggCET5A5I40CRohV1kdbBxEuSwTW6ui3KISG5NkATeyiFgxdgHWzGOk1MO7o1jcAwyQlGPY9by7p7iNEZy9uUwM8vM7PR0dU9VV7/u7wcGNNVF1Zse9quiqqfK3F0AgBg6bQ8AAKiOaANAIEQbAAIh2gAQCNEGgECINgAEQrQBIBCiDQCBEG0ACORi3Rvc2dnxLMvq3iwArLVbt27ddffdWevVHu0syzQajereLACsNTMbV1mP0yMAEAjRBoBAiDYABEK0ASAQog0AgRBtAAiEaAPYeGVZKssydTodZVmmsizbHmmq2j+nDQCRlGWpPM81mUwkSePxWHmeS5L6/X6bo52KI20AG20wGDwI9qHJZKLBYNDSRGcj2gA22t7e3lzL20a0AWy0brc71/K2EW0AG60oCiVJcmxZkiQqiqKlic5GtAFstH6/r+FwqDRNZWZK01TD4XAlL0JKkrl7rRvs9XrOXf4AYD5mdsvde7PW40gbAAIh2gAQCNEGgECINgAEQrQBIBCiDQCBEG0ACIRoA0AgRBsAAiHaABAI0QaAQIg2AARSKdpmtmVm3zezv2t6IADAdFWPtG9IervBOQAAFcyMtpn9uqTflPQPzY8DADjLmdE2M5P0N5I+O2O93MxGZja6c+dOnfMBAI6YdaT9J5L+xd3fPGsldx+6e8/de7u7u/VNBwA45uKM1/9I0hUz+31JvyDpETN7w93/qvnRAAAnnRltd//44X+b2bOSPkGwAaA9fE4bAAKZdXrkAXd/VdKrjU0CAJiJI20ACIRoA0AgRBsAAiHaAHAOZVkqyzJ1Oh1lWaayLBvdX+ULkQCA48qyVJ7nmkwmkqTxeKw8zyVJ/X6/kX1ypA0ACxoMBg+CfWgymWgwGDS2T6INAAva29uba3kdiDYALKjb7c61vA5EGwAWVBSFkiQ5tixJEhVF0dg+iTYALKjf72s4HCpNU5mZ0jTVcDhs7CKkJJm717rBXq/no9Go1m0CwLozs1vu3pu1HkfaABAI0QaAQIg2AARCtAEgEKINAIEQbQAIhGgDQCBEGwACIdoAEAjRBoBAiDYABEK0ASAQog0AgRBtAAiEaANAIEQbAAIh2gAQCNEGgECINgAEQrQBIBCiDQCBEG0ACIRoA0AgRBsAApkZbTPrmNk3zOwHZvaGmT2zjMEAAA+rcqTtkv7Y3Z+S9FlJRbMjAQCmuThrBXd3SbcPvk0lfbfRiQAAU82MtiSZ2QuSPi/pjqSHTo+YWS4pl6Rut1vnfACAIypdiHT3F919W9INSa+bmZ14fejuPXfv7e7uNjEnAEBzfnrE3V+T9Kik7WbGAQCcpcqnR540sycO/vtjkn7m7ncbnwwA8JAq57Qfl/R1M7sg6SeSPtnsSACAaap8euTfJT21hFkAADPwF5EAEAjRBoBAiDYABEK0ASAQog0AgRBtAAiEaANAIEQbAAIh2gAQCNEGgECINgAEQrQBIBCiDQCBEG0ACIRoA0AgRBsAAiHaABAI0QaAQIg2AARCtAEgEKINAIEQbQAIhGgDQCBEGwACIdoAEAjRBoBAiDYABEK0ASAQog0AgRBtAAiEaANAIEQbAAIh2gAQCNEGgECINgAEMjPaZnbZzIZm9oaZjc3sc8sYDADwsCpH2o9Iel3Sr0m6KukvzOyDjU4FADjVzGi7+z13/5Lvuyvph5Ieb340AMBJc53TNrMPS7os6XsnludmNjKz0Z07d+qcD3hIWZbKskydTkdZlqksy7Z
HApamcrTNbEfSFyVdc3c/+pq7D9295+693d3dumcEHijLUnmeazwey901Ho+V5znhxsaoFG0ze7+kr0q64e7fbnYkYLrBYKDJZHJs2WQy0WAwaGkiYLmqfHrkMUlfkVS4+9eaHwmYbm9vb67lwLqpcqT9GUkfkfSSmb158PVkw3MBp+p2u3MtB9ZNlU+PfMHdH3H3Dx35emsZwwEnFUWhJEmOLUuSREVRtDQRsFz8RSRC6ff7Gg6HStNUZqY0TTUcDtXv99seDVgKO/FBkHPr9Xo+Go1q3SYArDszu+XuvVnrcaQNAIEQbQAIhGgDQCBEGwACIdoAEAjRBoBAiDYABEK0ASAQog0AgRBtAAiEaANAIEQbAAIh2gAQCNEGgECINgAEQrQBIBCi3bCyLJVlmTqdjrIsU1mWbY8EILCLbQ+wzsqyVJ7nmkwmkqTxeKw8zyWJx2MBWAhH2g0aDAYPgn1oMploMBi0NBGA6Ih2g/b29uZaDgCzEO0GdbvduZYDwCxEu0FFUShJkmPLkiRRURQtTQQgOqLdoH6/r+FwqDRNZWZK01TD4ZCLkAAWZu5e6wZ7vZ6PRqNatwkA687Mbrl7b9Z6HGkDQCBEGwACIdoAEAjRBoBAiDYABEK0G1SWpXZ2dmRmMjPt7OxwwygA58INoxpSlqWee+45vfvuuw+W3bt3T9euXZPEDaMALIYj7YYMBoNjwT703nvvccMoAAurHG0ze5+ZPdXkMOvkrJtCccMoAIuaGW0ze8zMvizpHUkvND/SejjrplDcMArAoqocad+X9LeS/rThWdZKURTa2tp6aPmlS5e4YRSAhc2Mtrv/1N2/KennS5hnbfT7fb388sva3t5+sGx7e1uvvPIKFyEBLKzyDaPM7FlJn3D3T53yWi4pl6Rut3t1PB7XOSMArL2l3jDK3Yfu3nP33u7ubh2bBACcgo/8AUAgRBsAApn5F5FmdkXSdyRdkXTZzJ6W9Gl3/1bDswEATpgZbXf/H0kfWsIsAIAZOD0CAIEQbQAIhGgDQCBEGwACIdoAEAjRxlRlWSrLMnU6HWVZttSn7rS5b2CV8eQanKosS+V5rslkIkkaj8fK81xS80/daXPfwKqrfMOoqnq9no9Go1q3ieXLskyn3fgrTVO9/fbba7tvoC1LvWEU1s+0p+ss46k7be4bWHVEG6ea9nSdZTx1p819A6uOaONURVEoSZJjy5IkWcpTd9rcN7DqiDZO1e/3NRwOlaapzExpmmo4HC7lQmCb+wZWHRciAWAFcCESANYQ0QaAQIg2AARCtAEgEKINAIEQbQAIhGgDQCBEGwACIdoAEAjRBoBAiDYABEK0ASAQog0AgRBtAAiEaANAIEQbAAIh2gAQCNEGgECINgAEQrQBIBCiDQCBEG0ACKRStM3sD8zsv8zsTTN7rolByrJUlmXqdDrKskxlWU5dx8x08eJFmdmDdc967Tz7XGT2559/vpbtzrvfo/uZ9lpdP3PToswJLJ27n/kl6YqkH0r6ZUlPSPqxpN1p61+9etXndfPmTU+SxCU9+EqSxG/evHnmOodfW1tbfunSpVNfO7mdefa56OxVZziPs+af9tr169dr+ZmbVtfvBohE0shn9NjdK0X79yTdPPL930v6w2nrLxLtNE1PjV2apjPXqfJ1dDvz7PM8s593u4vuN03Tqa9duHBhKbOdV12/GyCSqtG2/XWnM7PPSdpx98HB9y9Kuu3uf31knVxSLkndbvfqeDw+c5sndTodnTaHmen+/ftnrlPF0e3Ms88qqs4173YX3a+ZSdJc71Xds51XXb8bIBIzu+XuvVnrVTmnvSXp6L+U+5L+7+gK7j50956793Z3d+ebVFK32525fNo6i26/yj4X3XYd2110e91ud+prFy5cmGtbbanrdwOsoyrRvq3989mHfkX757hrUxSFkiQ5tixJEhVFceY6h7a2tnTp0qVTXzu5nXn2uejsVWc4j7Pmn/Zanue1/MxNq+t
3A6ylWedPJP2ipB9J+oD2L0S+JemRaesvck7bff/iU5qmbmaepunUi4eH5zsPz88ernvWa+fZ5yKzX79+vZbtzrvfkxduT3utrp+5aVHmBOqius5pS5KZPSvpLw++/XN3/8dp6/Z6PR+NRov/XwQANlDVc9oXq2zM3V+V9Oo5ZwIAnBN/EQkAgRBtAAiEaANAIEQbAAIh2gAQSKWP/M21QbM7kub7O/b27Ei62/YQAfA+VcP7NBvv0XSpu8/8k/Laox2JmY2qfC5y0/E+VcP7NBvv0flxegQAAiHaABDIpkd72PYAQfA+VcP7NBvv0Tlt9DltAIhm04+0ASAUoo2ZzOx9ZvZU23MA2NBoL+Pp8uvAzB4zsy9LekfSC23Ps4rM7LKZDc3sDTMbHzyeDyeYWcfMvmFmPzh4r55pe6aoNu6ctpldkfR9SR/V/mPT/kPSb7j7nVYHW0Fm9qik35L0q5I+6u6fanmklWNm25KelvSapG1J/ymp5+61Pt0pOtt/eOkT7n7bzH5X0hf4vPZiNvFI+xlJ/+ruP3L3H0v6Z0m/3fJMK8ndf+ru35T087ZnWVXufs/dv3Tw8JG72n8U3+Ntz7VqDt6f2wffppK+2+Y8kVV6CMKa+aCO/5n9f0v6pZZmwRoxsw9Luizpe23PsorM7AVJn5d0R/sHT1jAJh5pz3y6PDAvM9uR9EVJ13zTzjlW5O4vuvu2pBuSXj84ZYI5bWK0G3+6PDaLmb1f0lcl3XD3b7c9z6pz99ckPar9awCY0yZG+3VJz5jZB8zsCUkfl/RPLc+EoMzsMUlfkVS4+9fanmdVmdmTB//eZGYfk/Szg2sAmNPGndN293fMbCDp3w4W/Zm7/2+bM62qg0/afEfSFUmXzexpSZ9292+1Othq+Yykj0h6ycxeOlj2O+7+VoszraLHJX3dzC5I+omkT7Y8T1gb95E/AIhsE0+PAEBYRBsAAiHaABAI0QaAQIg2AARCtAEgEKINAIEQbQAIhGgDQCD/D/+oO+KxGV+rAAAAAElFTkSuQmCC\n", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "plt.scatter(y_pred, y_true, color='black')\n", - "plt.show()" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "metadata": { - "ExecuteTime": { - "end_time": "2018-04-29T07:27:00.422795Z", - "start_time": "2018-04-29T07:27:00.326748Z" - }, - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [ - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "# Plot outputs\n", - "plt.scatter(data_X_test, data_y_test, color='black')\n", - "plt.plot(data_X_test, regr.predict(data_X_test), color='blue', linewidth=3)\n", - "plt.show()" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "metadata": { - "ExecuteTime": { - "end_time": "2018-04-29T07:27:36.147084Z", - "start_time": "2018-04-29T07:27:36.142088Z" - }, - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "('Coefficients: \\n', array([ 0.68334304]))" - ] - }, - "execution_count": 29, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# The coefficients\n", - "'Coefficients: \\n', regr.coef_" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "metadata": { - "ExecuteTime": { - "end_time": "2018-04-29T07:27:48.770254Z", - "start_time": "2018-04-29T07:27:48.765411Z" - }, - "slideshow": { - "slide_type": "fragment" - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "'Residual sum of squares: 0.40'" - ] - }, - "execution_count": 30, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# The mean square error\n", - "\"Residual sum of squares: %.2f\" % np.mean((regr.predict(data_X_test) - data_y_test) ** 2)" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "metadata": { - "ExecuteTime": { - "end_time": "2018-04-29T07:27:56.521151Z", - "start_time": "2018-04-29T07:27:56.496715Z" - }, - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [], - "source": [ - "df.click_log = [[np.log(df.click[i]+1)] for i in range(len(df))]\n", - "df.reply_log = [[np.log(df.reply[i]+1)] for i in range(len(df))]" - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "metadata": { - "ExecuteTime": { - "end_time": "2018-04-29T07:28:02.712616Z", - "start_time": "2018-04-29T07:28:02.701169Z" - }, - "slideshow": { - "slide_type": "subslide" - } 
- }, - "outputs": [ - { - "data": { - "text/plain": [ - "'Variance score: 0.62'" - ] - }, - "execution_count": 32, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from sklearn.cross_validation import train_test_split\n", - "Xs_train, Xs_test, y_train, y_test = train_test_split(df.click_log, df.reply_log,test_size=0.2, random_state=0)\n", - "\n", - "# Create linear regression object\n", - "regr = linear_model.LinearRegression()\n", - "# Train the model using the training sets\n", - "regr.fit(Xs_train, y_train)\n", - "# Explained variance score: 1 is perfect prediction\n", - "'Variance score: %.2f' % regr.score(Xs_test, y_test)" - ] - }, - { - "cell_type": "code", - "execution_count": 33, - "metadata": { - "ExecuteTime": { - "end_time": "2018-04-29T07:28:16.645996Z", - "start_time": "2018-04-29T07:28:16.549017Z" - }, - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [ - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "# Plot outputs\n", - "plt.scatter(Xs_test, y_test, color='black')\n", - "plt.plot(Xs_test, regr.predict(Xs_test), color='blue', linewidth=3)\n", - "plt.show()" - ] - }, - { - "cell_type": "code", - "execution_count": 34, - "metadata": { - "ExecuteTime": { - "end_time": "2018-04-29T07:28:41.441426Z", - "start_time": "2018-04-29T07:28:41.428476Z" - }, - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "-0.68370073919430563" - ] - }, - "execution_count": 34, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from sklearn.cross_validation import cross_val_score\n", - "\n", - "regr = linear_model.LinearRegression()\n", - "scores = cross_val_score(regr, df.click_log, \\\n", - " df.reply_log, cv = 3)\n", - "scores.mean() " - ] - }, - { - "cell_type": "code", - "execution_count": 35, - "metadata": { - "ExecuteTime": { - "end_time": "2018-04-29T07:29:00.237224Z", - "start_time": "2018-04-29T07:29:00.220565Z" - }, - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "-0.71881497228209845" - ] - }, - "execution_count": 35, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "regr = linear_model.LinearRegression()\n", - "scores = cross_val_score(regr, df.click_log, \n", - " df.reply_log, cv =5)\n", - "scores.mean() " - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "slide" - } - }, - "source": [ - "> # 使用sklearn做logistic回归\n", - "***\n", - "\n", - "王成军\n", - "\n", - "wangchengjun@nju.edu.cn\n", - "\n", - "计算传播网 http://computational-communication.com" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "subslide" - } - }, - "source": [ - "- logistic回归是一个分类算法而不是一个回归算法。\n", - "- 可根据已知的一系列因变量估计离散数值(比方说二进制数值 0 或 1 ,是或否,真或假)。\n", - "- 简单来说,它通过将数据拟合进一个逻辑函数(logistic 
function)来预估一个事件出现的概率。\n", - "- 因此,它也被叫做逻辑回归。因为它预估的是概率,所以它的输出值大小在 0 和 1 之间(正如所预计的一样)。" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "subslide" - } - }, - "source": [ - "$$odds= \\frac{p}{1-p} = \\frac{probability\\: of\\: event\\: occurrence} {probability \\:of \\:not\\: event\\: occurrence}$$" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "fragment" - } - }, - "source": [ - "$$ln(odds)= ln(\\frac{p}{1-p})$$" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "fragment" - } - }, - "source": [ - "$$logit(x) = ln(\\frac{p}{1-p}) = b_0+b_1X_1+b_2X_2+b_3X_3....+b_kX_k$$" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "subslide" - } - }, - "source": [ - "![](./img/logistic.jpg)" - ] - }, - { - "cell_type": "code", - "execution_count": 50, - "metadata": { - "ExecuteTime": { - "end_time": "2018-04-29T07:46:50.277195Z", - "start_time": "2018-04-29T07:46:50.272229Z" - }, - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [], - "source": [ - "repost = []\n", - "for i in df.title:\n", - " if u'转载' in i:\n", - " repost.append(1)\n", - " else:\n", - " repost.append(0)" - ] - }, - { - "cell_type": "code", - "execution_count": 51, - "metadata": { - "ExecuteTime": { - "end_time": "2018-04-29T07:47:06.292994Z", - "start_time": "2018-04-29T07:47:06.270715Z" - }, - "slideshow": { - "slide_type": "fragment" - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "[[194675, 2703], [88244, 1041], [82779, 625]]" - ] - }, - "execution_count": 51, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "data_X = [[df.click[i], df.reply[i]] for i in range(len(df))]\n", - "data_X[:3]" - ] - }, - { - "cell_type": "code", - "execution_count": 52, - "metadata": { - "ExecuteTime": { - "end_time": "2018-04-29T07:47:45.269303Z", - "start_time": "2018-04-29T07:47:45.259792Z" - }, - "slideshow": 
{ - "slide_type": "subslide" - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "0.61241970021413272" - ] - }, - "execution_count": 52, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from sklearn.linear_model import LogisticRegression\n", - "df['repost'] = repost\n", - "model = LogisticRegression()\n", - "model.fit(data_X,df.repost)\n", - "model.score(data_X,df.repost)" - ] - }, - { - "cell_type": "code", - "execution_count": 53, - "metadata": { - "ExecuteTime": { - "end_time": "2018-04-29T07:47:59.648431Z", - "start_time": "2018-04-29T07:47:59.633936Z" - }, - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [], - "source": [ - "def randomSplitLogistic(dataX, dataY, num):\n", - " dataX_train = []\n", - " dataX_test = []\n", - " dataY_train = []\n", - " dataY_test = []\n", - " import random\n", - " test_index = random.sample(range(len(df)), num)\n", - " for k in range(len(dataX)):\n", - " if k in test_index:\n", - " dataX_test.append(dataX[k])\n", - " dataY_test.append(dataY[k])\n", - " else:\n", - " dataX_train.append(dataX[k])\n", - " dataY_train.append(dataY[k])\n", - " return dataX_train, dataX_test, dataY_train, dataY_test, " - ] - }, - { - "cell_type": "code", - "execution_count": 54, - "metadata": { - "ExecuteTime": { - "end_time": "2018-04-29T07:48:27.726443Z", - "start_time": "2018-04-29T07:48:27.710922Z" - }, - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "'Variance score: 0.45'" - ] - }, - "execution_count": 54, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Split the data into training/testing sets\n", - "data_X_train, data_X_test, data_y_train, data_y_test = randomSplitLogistic(data_X, df.repost, 20)\n", - "# Create logistic regression object\n", - "log_regr = LogisticRegression()\n", - "# Train the model using the training sets\n", - "log_regr.fit(data_X_train, data_y_train)\n", - "# Explained variance score: 
1 is perfect prediction\n", - "'Variance score: %.2f' % log_regr.score(data_X_test, data_y_test)" - ] - }, - { - "cell_type": "code", - "execution_count": 55, - "metadata": { - "ExecuteTime": { - "end_time": "2018-04-29T07:48:56.873331Z", - "start_time": "2018-04-29T07:48:56.870219Z" - }, - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [], - "source": [ - "y_true, y_pred = data_y_test, log_regr.predict(data_X_test)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 43, - "metadata": { - "ExecuteTime": { - "end_time": "2018-04-29T07:39:12.344043Z", - "start_time": "2018-04-29T07:39:12.338223Z" - }, - "slideshow": { - "slide_type": "fragment" - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "([1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],\n", - " array([0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]))" - ] - }, - "execution_count": 43, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "y_true, y_pred" - ] - }, - { - "cell_type": "code", - "execution_count": 44, - "metadata": { - "ExecuteTime": { - "end_time": "2018-04-29T07:39:13.175680Z", - "start_time": "2018-04-29T07:39:13.171386Z" - }, - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " precision recall f1-score support\n", - "\n", - " 0 0.50 0.17 0.25 6\n", - " 1 0.72 0.93 0.81 14\n", - "\n", - "avg / total 0.66 0.70 0.64 20\n", - "\n" - ] - } - ], - "source": [ - "print(classification_report(y_true, y_pred))" - ] - }, - { - "cell_type": "code", - "execution_count": 56, - "metadata": { - "ExecuteTime": { - "end_time": "2018-04-29T07:51:43.039620Z", - "start_time": "2018-04-29T07:51:43.034812Z" - }, - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [], - "source": [ - "from sklearn.cross_validation import train_test_split\n", - "Xs_train, Xs_test, y_train, y_test = train_test_split(data_X, df.repost, test_size=0.2, 
random_state=42)" - ] - }, - { - "cell_type": "code", - "execution_count": 57, - "metadata": { - "ExecuteTime": { - "end_time": "2018-04-29T07:51:47.690742Z", - "start_time": "2018-04-29T07:51:47.683127Z" - }, - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "'Variance score: 0.60'" - ] - }, - "execution_count": 57, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Create logistic regression object\n", - "log_regr = LogisticRegression()\n", - "# Train the model using the training sets\n", - "log_regr.fit(Xs_train, y_train)\n", - "# Explained variance score: 1 is perfect prediction\n", - "'Variance score: %.2f' % log_regr.score(Xs_test, y_test)" - ] - }, - { - "cell_type": "code", - "execution_count": 58, - "metadata": { - "ExecuteTime": { - "end_time": "2018-04-29T07:51:55.780061Z", - "start_time": "2018-04-29T07:51:55.771924Z" - }, - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Logistic score for test set: 0.595745\n", - "Logistic score for training set: 0.613941\n", - " precision recall f1-score support\n", - "\n", - " 0 1.00 0.03 0.05 39\n", - " 1 0.59 1.00 0.74 55\n", - "\n", - "avg / total 0.76 0.60 0.46 94\n", - "\n" - ] - } - ], - "source": [ - "print('Logistic score for test set: %f' % log_regr.score(Xs_test, y_test))\n", - "print('Logistic score for training set: %f' % log_regr.score(Xs_train, y_train))\n", - "y_true, y_pred = y_test, log_regr.predict(Xs_test)\n", - "print(classification_report(y_true, y_pred))" - ] - }, - { - "cell_type": "code", - "execution_count": 59, - "metadata": { - "ExecuteTime": { - "end_time": "2018-04-29T07:52:53.880925Z", - "start_time": "2018-04-29T07:52:53.866672Z" - }, - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "0.53333333333333333" - ] - }, - "execution_count": 59, - "metadata": {}, - 
"output_type": "execute_result" - } - ], - "source": [ - "logre = LogisticRegression()\n", - "scores = cross_val_score(logre, data_X, df.repost, cv = 3)\n", - "scores.mean() " - ] - }, - { - "cell_type": "code", - "execution_count": 60, - "metadata": { - "ExecuteTime": { - "end_time": "2018-04-29T07:53:26.825100Z", - "start_time": "2018-04-29T07:53:26.810871Z" - }, - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "0.62948717948717947" - ] - }, - "execution_count": 60, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "logre = LogisticRegression()\n", - "data_X_scale = scale(data_X)\n", - "# The importance of preprocessing in data science and the machine learning pipeline I: \n", - "scores = cross_val_score(logre, data_X_scale, df.repost, cv = 3)\n", - "scores.mean() " - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "slide" - } - }, - "source": [ - "> # 使用sklearn实现贝叶斯预测\n", - "***\n", - "\n", - "王成军\n", - "\n", - "wangchengjun@nju.edu.cn\n", - "\n", - "计算传播网 http://computational-communication.com" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "subslide" - } - }, - "source": [ - "# Naive Bayes algorithm\n", - "\n", - "It is a classification technique based on Bayes’ Theorem with an assumption of independence among predictors. \n", - "\n", - "In simple terms, a Naive Bayes classifier assumes that the presence of a particular feature in a class is unrelated to the presence of any other feature. \n", - "\n", - "why it is known as ‘Naive’? For example, a fruit may be considered to be an apple if it is red, round, and about 3 inches in diameter. Even if these features depend on each other or upon the existence of the other features, all of these properties independently contribute to the probability that this fruit is an apple." 
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "subslide" - } - }, - "source": [ - "贝叶斯定理为使用$p(c)$, $p(x)$, $p(x|c)$ 计算后验概率$P(c|x)$提供了方法:" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "fragment" - } - }, - "source": [ - "$$\n", - "p(c|x) = \\frac{p(x|c) p(c)}{p(x)}\n", - "$$" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "fragment" - } - }, - "source": [ - "- P(c|x) is the posterior probability of class (c, target) given predictor (x, attributes).\n", - "- P(c) is the prior probability of class.\n", - "- P(x|c) is the likelihood which is the probability of predictor given class.\n", - "- P(x) is the prior probability of predictor." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "subslide" - } - }, - "source": [ - "![](./img/Bayes_41.png)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "subslide" - } - }, - "source": [ - "Step 1: Convert the data set into a frequency table\n", - "\n", - "Step 2: Create Likelihood table by finding the probabilities like:\n", - "- p(Overcast) = 0.29, p(rainy) = 0.36, p(sunny) = 0.36\n", - "- p(playing) = 0.64, p(rest) = 0.36\n", - "\n", - "Step 3: Now, use Naive Bayesian equation to calculate the posterior probability for each class. The class with the highest posterior probability is the outcome of prediction." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "subslide" - } - }, - "source": [ - "## Problem: Players will play if weather is sunny. 
Is this statement is correct?\n", - "\n", - "We can solve it using above discussed method of posterior probability.\n", - "\n", - "$P(Yes | Sunny) = \\frac{P( Sunny | Yes) * P(Yes) } {P (Sunny)}$\n", - "\n", - "Here we have P (Sunny |Yes) = 3/9 = 0.33, P(Sunny) = 5/14 = 0.36, P( Yes)= 9/14 = 0.64\n", - "\n", - "Now, $P (Yes | Sunny) = \\frac{0.33 * 0.64}{0.36} = 0.60$, which has higher probability." - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": { - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "'ABCMeta BaseDiscreteNB BaseEstimator BaseNB BernoulliNB ClassifierMixin GaussianNB LabelBinarizer MultinomialNB __all__ __builtins__ __doc__ __file__ __name__ __package__ _check_partial_fit_first_call abstractmethod binarize check_X_y check_array check_is_fitted in1d issparse label_binarize logsumexp np safe_sparse_dot six'" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from sklearn import naive_bayes\n", - "' '.join(dir(naive_bayes)) " - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "fragment" - } - }, - "source": [ - "- naive_bayes.GaussianNB\tGaussian Naive Bayes (GaussianNB)\n", - "- naive_bayes.MultinomialNB([alpha, ...])\tNaive Bayes classifier for multinomial models\n", - "- naive_bayes.BernoulliNB([alpha, binarize, ...])\tNaive Bayes classifier for multivariate Bernoulli models." 
- ] - }, - { - "cell_type": "code", - "execution_count": 61, - "metadata": { - "ExecuteTime": { - "end_time": "2018-04-29T08:02:37.644606Z", - "start_time": "2018-04-29T08:02:37.635952Z" - }, - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [], - "source": [ - "#Import Library of Gaussian Naive Bayes model\n", - "from sklearn.naive_bayes import GaussianNB\n", - "import numpy as np\n", - "\n", - "#assigning predictor and target variables\n", - "x= np.array([[-3,7],[1,5], [1,2], [-2,0], [2,3], [-4,0], [-1,1], [1,1], [-2,2], [2,7], [-4,1], [-2,7]])\n", - "Y = np.array([3, 3, 3, 3, 4, 3, 3, 4, 3, 4, 4, 4])" - ] - }, - { - "cell_type": "code", - "execution_count": 62, - "metadata": { - "ExecuteTime": { - "end_time": "2018-04-29T08:02:52.828101Z", - "start_time": "2018-04-29T08:02:52.818463Z" - }, - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "array([4, 3])" - ] - }, - "execution_count": 62, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "#Create a Gaussian Classifier\n", - "model = GaussianNB()\n", - "\n", - "# Train the model using the training sets \n", - "model.fit(x[:8], Y[:8])\n", - "\n", - "#Predict Output \n", - "predicted= model.predict([[1,2],[3,4]])\n", - "predicted" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "subslide" - } - }, - "source": [ - "# cross-validation \n", - " \n", - "k-fold CV, the training set is split into k smaller sets (other approaches are described below, but generally follow the same principles). The following procedure is followed for each of the k “folds”:\n", - "- A model is trained using k-1 of the folds as training data;\n", - "- the resulting model is validated on the remaining part of the data (i.e., it is used as a test set to compute a performance measure such as accuracy)." 
- ] - }, - { - "cell_type": "code", - "execution_count": 63, - "metadata": { - "ExecuteTime": { - "end_time": "2018-04-29T08:04:04.297675Z", - "start_time": "2018-04-29T08:04:04.273413Z" - }, - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "array([41, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", - " 0, 0, 0])" - ] - }, - "execution_count": 63, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "data_X_train, data_X_test, data_y_train, data_y_test = randomSplit(df.click, df.reply, 20)\n", - "# Train the model using the training sets \n", - "model.fit(data_X_train, data_y_train)\n", - "\n", - "#Predict Output \n", - "predicted= model.predict(data_X_test)\n", - "predicted" - ] - }, - { - "cell_type": "code", - "execution_count": 64, - "metadata": { - "ExecuteTime": { - "end_time": "2018-04-29T08:04:34.184513Z", - "start_time": "2018-04-29T08:04:34.178511Z" - }, - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "0.65000000000000002" - ] - }, - "execution_count": 64, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "model.score(data_X_test, data_y_test)" - ] - }, - { - "cell_type": "code", - "execution_count": 66, - "metadata": { - "ExecuteTime": { - "end_time": "2018-04-29T08:05:04.297453Z", - "start_time": "2018-04-29T08:05:04.249311Z" - }, - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/datalab/Applications/anaconda/lib/python3.5/site-packages/sklearn/cross_validation.py:516: Warning: The least populated class in y has only 1 members, which is too few. 
The minimum number of labels for any class cannot be less than n_folds=7.\n", - " % (min_labels, self.n_folds)), Warning)\n" - ] - }, - { - "data": { - "text/plain": [ - "0.53413410073295453" - ] - }, - "execution_count": 66, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from sklearn.cross_validation import cross_val_score\n", - "\n", - "model = GaussianNB()\n", - "scores = cross_val_score(model, [[c] for c in df.click],\\\n", - " df.reply, cv = 7)\n", - "scores.mean() " - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "slide" - } - }, - "source": [ - "> # 使用sklearn实现决策树\n", - "***\n", - "\n", - "王成军\n", - "\n", - "wangchengjun@nju.edu.cn\n", - "\n", - "计算传播网 http://computational-communication.com" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "subslide" - } - }, - "source": [ - "# 决策树\n", - "- 这个监督式学习算法通常被用于分类问题。\n", - "- 它同时适用于分类变量和连续因变量。\n", - "- 在这个算法中,我们将总体分成两个或更多的同类群。\n", - "- 这是根据最重要的属性或者自变量来分成尽可能不同的组别。\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "subslide" - } - }, - "source": [ - "![](./img/tree.png)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "subslide" - } - }, - "source": [ - "![](./img/playtree.jpg)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "subslide" - } - }, - "source": [ - "## 在上图中你可以看到,根据多种属性,人群被分成了不同的四个小组,来判断 “他们会不会去玩”。\n", - "### 为了把总体分成不同组别,需要用到许多技术,比如说 Gini、Information Gain、Chi-square、entropy。" - ] - }, - { - "cell_type": "code", - "execution_count": 67, - "metadata": { - "ExecuteTime": { - "end_time": "2018-04-29T08:10:20.871345Z", - "start_time": "2018-04-29T08:10:20.855125Z" - }, - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [], - "source": [ - "from sklearn import tree\n", - "model = tree.DecisionTreeClassifier(criterion='gini')" - ] - }, - { - "cell_type": "code", - 
"execution_count": 68, - "metadata": { - "ExecuteTime": { - "end_time": "2018-04-29T08:10:49.988277Z", - "start_time": "2018-04-29T08:10:49.973060Z" - }, - "slideshow": { - "slide_type": "fragment" - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "0.91275167785234901" - ] - }, - "execution_count": 68, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "data_X_train, data_X_test, data_y_train, data_y_test = randomSplitLogistic(data_X, df.repost, 20)\n", - "model.fit(data_X_train,data_y_train)\n", - "model.score(data_X_train,data_y_train)" - ] - }, - { - "cell_type": "code", - "execution_count": 69, - "metadata": { - "ExecuteTime": { - "end_time": "2018-04-29T08:11:12.730866Z", - "start_time": "2018-04-29T08:11:12.725782Z" - }, - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "array([0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0])" - ] - }, - "execution_count": 69, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Predict\n", - "model.predict(data_X_test)" - ] - }, - { - "cell_type": "code", - "execution_count": 70, - "metadata": { - "ExecuteTime": { - "end_time": "2018-04-29T08:11:28.411441Z", - "start_time": "2018-04-29T08:11:28.397481Z" - }, - "slideshow": { - "slide_type": "fragment" - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "0.33461538461538459" - ] - }, - "execution_count": 70, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# crossvalidation\n", - "scores = cross_val_score(model, data_X, df.repost, cv = 3)\n", - "scores.mean() " - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "slide" - } - }, - "source": [ - "> # 使用sklearn实现SVM支持向量机\n", - "***\n", - "\n", - "王成军\n", - "\n", - "wangchengjun@nju.edu.cn\n", - "\n", - "计算传播网 http://computational-communication.com" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - 
"slide_type": "subslide" - } - }, - "source": [ - "![](./img/svm.jpg)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "subslide" - } - }, - "source": [ - "- 将每个数据在N维空间中用点标出(N是你所有的特征总数),每个特征的值是一个坐标的值。\n", - " - 举个例子,如果我们只有身高和头发长度两个特征,我们会在二维空间中标出这两个变量,每个点有两个坐标(这些坐标叫做支持向量)。" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "subslide" - } - }, - "source": [ - "![](./img/xyplot.png)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "subslide" - } - }, - "source": [ - "- 现在,我们会找到将两组不同数据分开的一条直线。\n", - " - 两个分组中距离最近的两个点到这条线的距离同时最优化。" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "subslide" - } - }, - "source": [ - "![](./img/sumintro.png)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "subslide" - } - }, - "source": [ - "## 上面示例中的黑线将数据分类优化成两个小组\n", - "- 两组中距离最近的点(图中A、B点)到达黑线的距离满足最优条件。\n", - " - 这条直线就是我们的分割线。接下来,测试数据落到直线的哪一边,我们就将它分到哪一类去。" - ] - }, - { - "cell_type": "code", - "execution_count": 71, - "metadata": { - "ExecuteTime": { - "end_time": "2018-04-29T08:17:29.788250Z", - "start_time": "2018-04-29T08:17:29.785022Z" - } - }, - "outputs": [], - "source": [ - "from sklearn import svm\n", - "# Create SVM classification object \n", - "model=svm.SVC() " - ] - }, - { - "cell_type": "code", - "execution_count": 72, - "metadata": { - "ExecuteTime": { - "end_time": "2018-04-29T08:17:31.035310Z", - "start_time": "2018-04-29T08:17:31.030713Z" - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "'LinearSVC LinearSVR NuSVC NuSVR OneClassSVM SVC SVR __all__ __builtins__ __cached__ __doc__ __file__ __loader__ __name__ __package__ __path__ __spec__ base bounds classes l1_min_c liblinear libsvm libsvm_sparse'" - ] - }, - "execution_count": 72, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "' '.join(dir(svm))" - ] - }, - { - "cell_type": 
"code", - "execution_count": 73, - "metadata": { - "ExecuteTime": { - "end_time": "2018-04-29T08:17:41.872379Z", - "start_time": "2018-04-29T08:17:41.849759Z" - }, - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "0.90380313199105144" - ] - }, - "execution_count": 73, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "data_X_train, data_X_test, data_y_train, data_y_test = randomSplitLogistic(data_X, df.repost, 20)\n", - "model.fit(data_X_train,data_y_train)\n", - "model.score(data_X_train,data_y_train)" - ] - }, - { - "cell_type": "code", - "execution_count": 74, - "metadata": { - "ExecuteTime": { - "end_time": "2018-04-29T08:17:47.661313Z", - "start_time": "2018-04-29T08:17:47.655841Z" - }, - "slideshow": { - "slide_type": "fragment" - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1])" - ] - }, - "execution_count": 74, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Predict\n", - "model.predict(data_X_test)" - ] - }, - { - "cell_type": "code", - "execution_count": 75, - "metadata": { - "ExecuteTime": { - "end_time": "2018-04-29T08:18:00.419986Z", - "start_time": "2018-04-29T08:17:58.671257Z" - }, - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [], - "source": [ - "# crossvalidation\n", - "scores = []\n", - "cvs = [3, 5, 10, 25, 50, 75, 100]\n", - "for i in cvs:\n", - " score = cross_val_score(model, data_X, df.repost,\n", - " cv = i)\n", - " scores.append(score.mean() ) # Try to tune cv\n", - " " - ] - }, - { - "cell_type": "code", - "execution_count": 76, - "metadata": { - "ExecuteTime": { - "end_time": "2018-04-29T08:18:05.493658Z", - "start_time": "2018-04-29T08:18:05.359658Z" - }, - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [ - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "plt.plot(cvs, scores, 'b-o')\n", - "plt.xlabel('$cv$', fontsize = 20)\n", - "plt.ylabel('$Score$', fontsize = 20)\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "slide" - } - }, - "source": [ - "\n", - "\n", - "> # 泰坦尼克号数据分析\n", - "\n", - "王成军\n", - "\n", - "wangchengjun@nju.edu.cn\n", - "\n", - "计算传播网 http://computational-communication.com" - ] - }, - { - "cell_type": "code", - "execution_count": 40, - "metadata": { - "ExecuteTime": { - "end_time": "2018-05-29T07:31:28.492497Z", - "start_time": "2018-05-29T07:31:28.488728Z" - }, - "slideshow": { - "slide_type": "slide" - } - }, - "outputs": [], - "source": [ - "import numpy as np\n", - "from sklearn import tree\n", - "import warnings \n", - "warnings.filterwarnings(\"ignore\") \n" - ] - }, - { - "cell_type": "code", - "execution_count": 43, - "metadata": { - "ExecuteTime": { - "end_time": "2018-06-06T07:02:49.855926Z", - "start_time": "2018-06-06T07:02:49.705773Z" - }, - "slideshow": { - "slide_type": "slide" - } - }, - "outputs": [], - "source": [ - "import pandas as pd\n", - "train = pd.read_csv('../data/tatanic_train.csv', \n", - " sep = \",\")" - ] - }, - { - "cell_type": "code", - "execution_count": 44, - "metadata": { - "ExecuteTime": { - "end_time": "2018-06-06T07:02:52.803564Z", - "start_time": "2018-06-06T07:02:52.759733Z" - }, - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Unnamed: 0PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarked
00103Braund, Mr. Owen Harrismale22.010A/5 211717.2500NaNS
11211Cumings, Mrs. John Bradley (Florence Briggs Th...female38.010PC 1759971.2833C85C
22313Heikkinen, Miss. Lainafemale26.000STON/O2. 31012827.9250NaNS
33411Futrelle, Mrs. Jacques Heath (Lily May Peel)female35.01011380353.1000C123S
44503Allen, Mr. William Henrymale35.0003734508.0500NaNS
\n", - "
" - ], - "text/plain": [ - " Unnamed: 0 PassengerId Survived Pclass \\\n", - "0 0 1 0 3 \n", - "1 1 2 1 1 \n", - "2 2 3 1 3 \n", - "3 3 4 1 1 \n", - "4 4 5 0 3 \n", - "\n", - " Name Sex Age SibSp \\\n", - "0 Braund, Mr. Owen Harris male 22.0 1 \n", - "1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 \n", - "2 Heikkinen, Miss. Laina female 26.0 0 \n", - "3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 \n", - "4 Allen, Mr. William Henry male 35.0 0 \n", - "\n", - " Parch Ticket Fare Cabin Embarked \n", - "0 0 A/5 21171 7.2500 NaN S \n", - "1 0 PC 17599 71.2833 C85 C \n", - "2 0 STON/O2. 3101282 7.9250 NaN S \n", - "3 0 113803 53.1000 C123 S \n", - "4 0 373450 8.0500 NaN S " - ] - }, - "execution_count": 44, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "train.head() " - ] - }, - { - "cell_type": "code", - "execution_count": 39, - "metadata": { - "ExecuteTime": { - "end_time": "2018-05-29T07:28:58.070575Z", - "start_time": "2018-05-29T07:28:57.897862Z" - }, - "slideshow": { - "slide_type": "slide" - } - }, - "outputs": [], - "source": [ - "train[\"Age\"] = train[\"Age\"].fillna(train[\"Age\"].median())\n", - "train[\"Fare\"] = train[\"Fare\"].fillna(train[\"Fare\"].median())\n", - "#Convert the male and female groups to integer form\n", - "train[\"Sex\"][train[\"Sex\"] == \"male\"] = 0\n", - "train[\"Sex\"][train[\"Sex\"] == \"female\"] = 1\n", - "#Impute the Embarked variable\n", - "train[\"Embarked\"] = train[\"Embarked\"].fillna('S')\n", - "#Convert the Embarked classes to integer form\n", - "train[\"Embarked\"][train[\"Embarked\"] == \"S\"] = 0\n", - "train[\"Embarked\"][train[\"Embarked\"] == \"C\"] = 1\n", - "train[\"Embarked\"][train[\"Embarked\"] == \"Q\"] = 2" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "metadata": { - "ExecuteTime": { - "end_time": "2018-05-29T07:28:08.358884Z", - "start_time": "2018-05-29T07:28:08.346226Z" - }, - "slideshow": { - "slide_type": "slide" - } - }, - 
"outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[ 0.12294397 0.31274009 0.23680307 0.32751287]\n", - "0.977553310887\n" - ] - } - ], - "source": [ - "#Create the target and features numpy arrays: target, features_one\n", - "target = train['Survived'].values\n", - "features_one = train[[\"Pclass\", \"Sex\", \"Age\", \"Fare\"]].values\n", - "\n", - "#Fit your first decision tree: my_tree_one\n", - "my_tree_one = tree.DecisionTreeClassifier()\n", - "my_tree_one = my_tree_one.fit(features_one, target)\n", - "#Look at the importance of the included features and print the score\n", - "print(my_tree_one.feature_importances_)\n", - "print(my_tree_one.score(features_one, target))" - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "metadata": { - "ExecuteTime": { - "end_time": "2018-05-29T07:28:15.915998Z", - "start_time": "2018-05-29T07:28:15.705994Z" - }, - "slideshow": { - "slide_type": "slide" - } - }, - "outputs": [], - "source": [ - "test = pd.read_csv('../data/tatanic_test.csv', sep = \",\")\n", - "# Impute the missing value with the median\n", - "test.Fare[152] = test.Fare.median()\n", - "test[\"Age\"] = test[\"Age\"].fillna(test[\"Age\"].median())\n", - "#Convert the male and female groups to integer form\n", - "test[\"Sex\"][test[\"Sex\"] == \"male\"] = 0\n", - "test[\"Sex\"][test[\"Sex\"] == \"female\"] = 1\n", - "\n", - "#Impute the Embarked variable\n", - "test[\"Embarked\"] = test[\"Embarked\"].fillna('S')\n", - "#Convert the Embarked classes to integer form\n", - "test[\"Embarked\"][test[\"Embarked\"] == \"S\"] = 0\n", - "test[\"Embarked\"][test[\"Embarked\"] == \"C\"] = 1\n", - "test[\"Embarked\"][test[\"Embarked\"] == \"Q\"] = 2\n", - "\n", - "# Extract the features from the test set: Pclass, Sex, Age, and Fare.\n", - "test_features = test[[\"Pclass\",\"Sex\", \"Age\", \"Fare\"]].values\n", - "\n", - "# Make your prediction using the test set\n", - "my_prediction = my_tree_one.predict(test_features)\n", - "\n", - 
"# Create a data frame with two columns: PassengerId & Survived. Survived contains your predictions\n", - "PassengerId =np.array(test['PassengerId']).astype(int)\n", - "my_solution = pd.DataFrame(my_prediction, PassengerId, columns = [\"Survived\"])\n" - ] - }, - { - "cell_type": "code", - "execution_count": 33, - "metadata": { - "ExecuteTime": { - "end_time": "2018-05-29T07:28:18.081288Z", - "start_time": "2018-05-29T07:28:18.074414Z" - }, - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Survived
8920
8930
8941
\n", - "
" - ], - "text/plain": [ - " Survived\n", - "892 0\n", - "893 0\n", - "894 1" - ] - }, - "execution_count": 33, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "my_solution[:3]" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": { - "ExecuteTime": { - "end_time": "2018-05-29T07:25:44.488717Z", - "start_time": "2018-05-29T07:25:44.484381Z" - }, - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "(418, 1)" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Check that your data frame has 418 entries\n", - "my_solution.shape" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "metadata": { - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [], - "source": [ - "# Write your solution to a csv file with the name my_solution.csv \n", - "my_solution.to_csv(\"../data/tatanic_solution_one.csv\", \n", - " index_label = [\"PassengerId\"])" - ] - }, - { - "cell_type": "code", - "execution_count": 34, - "metadata": { - "ExecuteTime": { - "end_time": "2018-05-29T07:28:26.996353Z", - "start_time": "2018-05-29T07:28:26.982601Z" - }, - "slideshow": { - "slide_type": "slide" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "0.905723905724\n" - ] - } - ], - "source": [ - "# Create a new array with the added features: features_two\n", - "features_two = train[[\"Pclass\",\"Age\",\"Sex\",\"Fare\",\\\n", - " \"SibSp\", \"Parch\", \"Embarked\"]].values\n", - "\n", - "#Control overfitting by setting \"max_depth\" to 10 and \"min_samples_split\" to 5 : my_tree_two\n", - "max_depth = 10\n", - "min_samples_split = 5\n", - "my_tree_two = tree.DecisionTreeClassifier(max_depth = max_depth, \n", - " min_samples_split = min_samples_split, \n", - " random_state = 1)\n", - "my_tree_two = my_tree_two.fit(features_two, target)\n", - "\n", - "#Print the score of the new 
decison tree\n", - "print(my_tree_two.score(features_two, target))" - ] - }, - { - "cell_type": "code", - "execution_count": 35, - "metadata": { - "ExecuteTime": { - "end_time": "2018-05-29T07:28:28.033226Z", - "start_time": "2018-05-29T07:28:28.018293Z" - }, - "slideshow": { - "slide_type": "slide" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "0.979797979798\n" - ] - } - ], - "source": [ - "# create a new train set with the new variable\n", - "train_two = train\n", - "train_two['family_size'] = train.SibSp + train.Parch + 1\n", - "\n", - "# Create a new decision tree my_tree_three\n", - "features_three = train[[\"Pclass\", \"Sex\", \"Age\", \\\n", - " \"Fare\", \"SibSp\", \"Parch\", \"family_size\"]].values\n", - "\n", - "my_tree_three = tree.DecisionTreeClassifier()\n", - "my_tree_three = my_tree_three.fit(features_three, target)\n", - "\n", - "# Print the score of this decision tree\n", - "print(my_tree_three.score(features_three, target))\n" - ] - }, - { - "cell_type": "code", - "execution_count": 36, - "metadata": { - "ExecuteTime": { - "end_time": "2018-05-29T07:28:32.678968Z", - "start_time": "2018-05-29T07:28:32.465958Z" - }, - "slideshow": { - "slide_type": "slide" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "0.939393939394\n", - "418\n", - "[0 0 0]\n" - ] - } - ], - "source": [ - "#Import the `RandomForestClassifier`\n", - "from sklearn.ensemble import RandomForestClassifier\n", - "\n", - "#We want the Pclass, Age, Sex, Fare,SibSp, Parch, and Embarked variables\n", - "features_forest = train[[\"Pclass\", \"Age\", \"Sex\", \"Fare\", \"SibSp\", \"Parch\", \"Embarked\"]].values\n", - "\n", - "#Building the Forest: my_forest\n", - "n_estimators = 100\n", - "forest = RandomForestClassifier(max_depth = 10, min_samples_split=2, \n", - " n_estimators = n_estimators, random_state = 1)\n", - "my_forest = forest.fit(features_forest, target)\n", - "\n", - "#Print the score of the 
random forest\n", - "print(my_forest.score(features_forest, target))\n", - "\n", - "#Compute predictions and print the length of the prediction vector:test_features, pred_forest\n", - "test_features = test[[\"Pclass\", \"Age\", \"Sex\", \"Fare\", \"SibSp\", \"Parch\", \"Embarked\"]].values\n", - "pred_forest = my_forest.predict(test_features)\n", - "print(len(test_features))\n", - "print(pred_forest[:3])" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": { - "ExecuteTime": { - "end_time": "2018-05-29T07:26:25.602062Z", - "start_time": "2018-05-29T07:26:25.572689Z" - }, - "slideshow": { - "slide_type": "slide" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[ 0.14130255 0.17906027 0.41616727 0.17938711 0.05039699 0.01923751\n", - " 0.0144483 ]\n", - "[ 0.10384741 0.20139027 0.31989322 0.24602858 0.05272693 0.04159232\n", - " 0.03452128]\n", - "0.905723905724\n", - "0.939393939394\n" - ] - } - ], - "source": [ - "#Request and print the `.feature_importances_` attribute\n", - "print(my_tree_two.feature_importances_)\n", - "print(my_forest.feature_importances_)\n", - "\n", - "#Compute and print the mean accuracy score for both models\n", - "print(my_tree_two.score(features_two, target))\n", - "print(my_forest.score(features_two, target))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "collapsed": true, - "slideshow": { - "slide_type": "slide" - } - }, - "source": [ - "# 阅读材料\n", - "机器学习算法的要点(附 Python 和 R 代码)http://blog.csdn.net/a6225301/article/details/50479672\n", - "\n", - "The \"Python Machine Learning\" book code repository and info resource https://github.com/rasbt/python-machine-learning-book\n", - "\n", - "An Introduction to Statistical Learning (James, Witten, Hastie, Tibshirani, 2013) : Python code https://github.com/JWarmenhoven/ISLR-python\n", - "\n", - "BuildingMachineLearningSystemsWithPython https://github.com/luispedro/BuildingMachineLearningSystemsWithPython" - ] - }, - { - 
"cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "slide" - } - }, - "source": [ - "# 作业\n", - "https://www.datacamp.com/community/tutorials/the-importance-of-preprocessing-in-data-science-and-the-machine-learning-pipeline-i-centering-scaling-and-k-nearest-neighbours" - ] - } - ], - "metadata": { - "celltoolbar": "Slideshow", - "kernelspec": { - "display_name": "Python [conda env:anaconda]", - "language": "python", - "name": "conda-env-anaconda-py" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.5.4" - }, - "latex_envs": { - "bibliofile": "biblio.bib", - "cite_by": "apalike", - "current_citInitial": 1, - "eqLabelWithNumbers": true, - "eqNumInitial": 0 - }, - "toc": { - "base_numbering": 1, - "nav_menu": {}, - "number_sections": false, - "sideBar": false, - "skip_h1_title": false, - "title_cell": "Table of Contents", - "title_sidebar": "Contents", - "toc_cell": false, - "toc_position": { - "height": "780px", - "left": "1279px", - "top": "168.667px", - "width": "341px" - }, - "toc_section_display": false, - "toc_window_display": true - } - }, - "nbformat": 4, - "nbformat_minor": 1 -}