diff --git a/Lessons/Lesson22_Basic_Stats_II_Percents.ipynb b/Lessons/Lesson22_Basic_Stats_II_Percents.ipynb index 2f323b5..ab6c1fc 100644 --- a/Lessons/Lesson22_Basic_Stats_II_Percents.ipynb +++ b/Lessons/Lesson22_Basic_Stats_II_Percents.ipynb @@ -128,7 +128,8 @@ "source": [ "# Load the dataset of house prices in Ames, and convert to\n", "# a data frame format so it's easier to view and process\n", - "ames_df = pd.DataFrame(housing['data'])\n", + "ames_df = pd.DataFrame(housing['data'], columns = housing['feature_names'])\n", + "ames_df['SalePrice'] = housing.target\n", "ames_df" ] }, @@ -175,7 +176,7 @@ }, "outputs": [], "source": [ - "# Determine number of tracts that bound the Charles River two ways:\n", + "# Determine number of homes sold normally two ways:\n", "# (1) with the query function\n" ] }, @@ -218,19 +219,6 @@ "# Now calculate the percentage of houses sold normally.\n" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "AJZKng3Bs7Vd" - }, - "outputs": [], - "source": [ - "import numpy as np" - ] - }, { "cell_type": "markdown", "metadata": { @@ -264,7 +252,7 @@ "id": "RLZ-k3L7s7Vq" }, "source": [ - "What percentage of tracts have a median price **between** $200,000 and $500,000?" + "What percentage of houses have a sale price **between** \\$200,000 and \\$500,000?" 
] }, { diff --git a/Lessons/_Keys/KEY_Lesson22_Basic_Stats_II_Percents.ipynb b/Lessons/_Keys/KEY_Lesson22_Basic_Stats_II_Percents.ipynb index 7380dd1..c28eabf 100644 --- a/Lessons/_Keys/KEY_Lesson22_Basic_Stats_II_Percents.ipynb +++ b/Lessons/_Keys/KEY_Lesson22_Basic_Stats_II_Percents.ipynb @@ -1,31 +1,10 @@ { - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.1" - } - }, "cells": [ { "cell_type": "markdown", "metadata": { - "id": "pcMCEdrks7Ut", - "colab_type": "text" + "colab_type": "text", + "id": "pcMCEdrks7Ut" }, "source": [ "# Basic Statistics I: Percents" @@ -34,8 +13,8 @@ { "cell_type": "markdown", "metadata": { - "id": "wAk5jXgBs7U0", - "colab_type": "text" + "colab_type": "text", + "id": "wAk5jXgBs7U0" }, "source": [ "A **percentage** is a number or ratio expressed as a fraction of 100. We'll do some examples together to learn how to calculate percentages." @@ -44,8 +23,8 @@ { "cell_type": "markdown", "metadata": { - "id": "vvDdKp98s7U3", - "colab_type": "text" + "colab_type": "text", + "id": "vvDdKp98s7U3" }, "source": [ "**Example 1:** For a basket of 18 fruits, there are 5 apples, 3 bananas, 6 peaches, and 4 oranges." @@ -54,8 +33,8 @@ { "cell_type": "markdown", "metadata": { - "id": "DFsUN3HAs7U4", - "colab_type": "text" + "colab_type": "text", + "id": "DFsUN3HAs7U4" }, "source": [ "What percentage of fruits are apples? 
" @@ -63,23 +42,34 @@ }, { "cell_type": "code", + "execution_count": 1, "metadata": { - "id": "agUStgpUs7U5", + "colab": {}, "colab_type": "code", - "colab": {} + "id": "agUStgpUs7U5" }, + "outputs": [ + { + "data": { + "text/plain": [ + "27.77777777777778" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Calculate percentage for apples\n", "5/18*100" - ], - "execution_count": 0, - "outputs": [] + ] }, { "cell_type": "markdown", "metadata": { - "id": "zGoaA_fhs7U9", - "colab_type": "text" + "colab_type": "text", + "id": "zGoaA_fhs7U9" }, "source": [ "What percentage of fruits are oranges **and** peaches? " @@ -87,131 +77,539 @@ }, { "cell_type": "code", + "execution_count": 2, "metadata": { - "id": "DIr9ZO4us7U-", + "colab": {}, "colab_type": "code", - "colab": {} + "id": "DIr9ZO4us7U-" }, + "outputs": [ + { + "data": { + "text/plain": [ + "55.55555555555556" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Calculate percentage for oranges and peaches\n", "(4+6)/18*100" - ], - "execution_count": 0, - "outputs": [] + ] }, { "cell_type": "markdown", "metadata": { - "id": "5ADm2TV-s7VG", - "colab_type": "text" + "colab_type": "text", + "id": "5ADm2TV-s7VG" }, "source": [ - "**Example 2:** Let's learn to calculate percentages by using real world data. We will work with a dataset of Boston housing prices." + "**Example 2:** Let's learn to calculate percentages by using real world data. We will work with a dataset of Ames, Iowa housing prices." 
] }, { "cell_type": "code", + "execution_count": 6, "metadata": { - "id": "CSoS_MUus7VH", + "colab": {}, "colab_type": "code", - "colab": {} + "id": "CSoS_MUus7VH" }, + "outputs": [], "source": [ - "# Import the load_boston method \n", - "from sklearn.datasets import load_boston" - ], - "execution_count": 0, - "outputs": [] + "# Import the fetch_openml method \n", + "from sklearn.datasets import fetch_openml\n", + "housing = fetch_openml(name=\"house_prices\", as_frame=True, parser=\"auto\")" + ] }, { "cell_type": "code", + "execution_count": 7, "metadata": { - "id": "9Q6sI8C0s7VL", + "colab": {}, "colab_type": "code", - "colab": {} + "id": "9Q6sI8C0s7VL" }, + "outputs": [], "source": [ - "# Import pandas, so that we can work with the data frame version of the Boston housing data\n", + "# Import pandas, so that we can work with the data frame version of the Ames housing data\n", "import pandas as pd" - ], - "execution_count": 0, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": 10, "metadata": { - "scrolled": true, - "id": "hepVTCgss7VR", + "colab": {}, "colab_type": "code", - "colab": {} + "id": "hepVTCgss7VR", + "scrolled": true }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IdMSSubClassMSZoningLotFrontageLotAreaStreetAlleyLotShapeLandContourUtilities...PoolAreaPoolQCFenceMiscFeatureMiscValMoSoldYrSoldSaleTypeSaleConditionSalePrice
0160RL65.08450PaveNaNRegLvlAllPub...0NaNNaNNaN022008WDNormal208500
1220RL80.09600PaveNaNRegLvlAllPub...0NaNNaNNaN052007WDNormal181500
2360RL68.011250PaveNaNIR1LvlAllPub...0NaNNaNNaN092008WDNormal223500
3470RL60.09550PaveNaNIR1LvlAllPub...0NaNNaNNaN022006WDAbnorml140000
4560RL84.014260PaveNaNIR1LvlAllPub...0NaNNaNNaN0122008WDNormal250000
..................................................................
1455145660RL62.07917PaveNaNRegLvlAllPub...0NaNNaNNaN082007WDNormal175000
1456145720RL85.013175PaveNaNRegLvlAllPub...0NaNMnPrvNaN022010WDNormal210000
1457145870RL66.09042PaveNaNRegLvlAllPub...0NaNGdPrvShed250052010WDNormal266500
1458145920RL68.09717PaveNaNRegLvlAllPub...0NaNNaNNaN042010WDNormal142125
1459146020RL75.09937PaveNaNRegLvlAllPub...0NaNNaNNaN062008WDNormal147500
\n", + "

1460 rows × 81 columns

\n", + "
" + ], + "text/plain": [ + " Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape \\\n", + "0 1 60 RL 65.0 8450 Pave NaN Reg \n", + "1 2 20 RL 80.0 9600 Pave NaN Reg \n", + "2 3 60 RL 68.0 11250 Pave NaN IR1 \n", + "3 4 70 RL 60.0 9550 Pave NaN IR1 \n", + "4 5 60 RL 84.0 14260 Pave NaN IR1 \n", + "... ... ... ... ... ... ... ... ... \n", + "1455 1456 60 RL 62.0 7917 Pave NaN Reg \n", + "1456 1457 20 RL 85.0 13175 Pave NaN Reg \n", + "1457 1458 70 RL 66.0 9042 Pave NaN Reg \n", + "1458 1459 20 RL 68.0 9717 Pave NaN Reg \n", + "1459 1460 20 RL 75.0 9937 Pave NaN Reg \n", + "\n", + " LandContour Utilities ... PoolArea PoolQC Fence MiscFeature MiscVal \\\n", + "0 Lvl AllPub ... 0 NaN NaN NaN 0 \n", + "1 Lvl AllPub ... 0 NaN NaN NaN 0 \n", + "2 Lvl AllPub ... 0 NaN NaN NaN 0 \n", + "3 Lvl AllPub ... 0 NaN NaN NaN 0 \n", + "4 Lvl AllPub ... 0 NaN NaN NaN 0 \n", + "... ... ... ... ... ... ... ... ... \n", + "1455 Lvl AllPub ... 0 NaN NaN NaN 0 \n", + "1456 Lvl AllPub ... 0 NaN MnPrv NaN 0 \n", + "1457 Lvl AllPub ... 0 NaN GdPrv Shed 2500 \n", + "1458 Lvl AllPub ... 0 NaN NaN NaN 0 \n", + "1459 Lvl AllPub ... 0 NaN NaN NaN 0 \n", + "\n", + " MoSold YrSold SaleType SaleCondition SalePrice \n", + "0 2 2008 WD Normal 208500 \n", + "1 5 2007 WD Normal 181500 \n", + "2 9 2008 WD Normal 223500 \n", + "3 2 2006 WD Abnorml 140000 \n", + "4 12 2008 WD Normal 250000 \n", + "... ... ... ... ... ... 
\n", + "1455 8 2007 WD Normal 175000 \n", + "1456 2 2010 WD Normal 210000 \n", + "1457 5 2010 WD Normal 266500 \n", + "1458 4 2010 WD Normal 142125 \n", + "1459 6 2008 WD Normal 147500 \n", + "\n", + "[1460 rows x 81 columns]" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Load the dataset of housing prices in Boston, and convert to\n", + "# Load the dataset of house prices in Ames, and convert to\n", "# a data frame format so it's easier to view and process\n", - "boston = load_boston()\n", - "boston_df = pd.DataFrame(boston['data'], columns = boston['feature_names'])\n", - "boston_df['PRICE'] = boston.target\n", - "boston_df" - ], - "execution_count": 0, - "outputs": [] + "ames_df = pd.DataFrame(housing['data'], columns = housing['feature_names'])\n", + "ames_df['SalePrice'] = housing.target\n", + "ames_df" + ] }, { "cell_type": "markdown", "metadata": { - "id": "eyMUHGews7VZ", - "colab_type": "text" + "colab_type": "text", + "id": "eyMUHGews7VZ" }, "source": [ - "CHAS is the indicator variable we used last week, where 1 indicates that the property (tract) is on the Charles River and 0 means otherwise." + "The `SaleCondition` column lists the condition of the house sale:\n", + "\n", + "\n", + "* `Normal`: Normal Sale \n", + "\n", + "* `Abnorml`: Abnormal Sale - trade, foreclosure, short sale\n", + "\n", + "* `AdjLand`: Adjoining Land Purchase\n", + "\n", + "* `Alloca`: Allocation - two linked properties with separate deeds, typically condo with a garage unit\n", + "\n", + "* `Family`: Sale between family members \n", + "\n", + "* `Partial`: Home was not completed when last assessed (associated with New Homes)\n" ] }, { "cell_type": "markdown", "metadata": { - "id": "IMpeHBEzs7VZ", - "colab_type": "text" + "colab_type": "text", + "id": "IMpeHBEzs7VZ" }, "source": [ - "What percentage of the tracts bound the Charles River? We'll see how to do this using the query method AND using boolean indexing." 
+ "What percentage of the houses were sold normally? We'll see how to do this using the query method AND using boolean indexing." ] }, { "cell_type": "code", + "execution_count": 12, "metadata": { - "id": "sX1Nw-nRSEhW", + "colab": {}, "colab_type": "code", - "colab": {} + "id": "sX1Nw-nRSEhW" }, + "outputs": [ + { + "data": { + "text/plain": [ + "1198" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Determine number of tracts that bound the Charles River two ways:\n", + "# Determine number of homes sold normally two ways:\n", "# (1) with the query function\n", - "num_bound_river = len(boston_df.query(\"CHAS == 1\"))\n", - "num_bound_river" - ], - "execution_count": 0, - "outputs": [] + "num_normal = len(ames_df.query(\"SaleCondition == 'Normal'\"))\n", + "num_normal" + ] }, { "cell_type": "code", + "execution_count": 13, "metadata": { - "id": "qU1vhvM0s7Va", + "colab": {}, "colab_type": "code", - "colab": {} + "id": "qU1vhvM0s7Va" }, + "outputs": [ + { + "data": { + "text/plain": [ + "1198" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# (2) using boolean indexing\n", - "num_bound_river = sum(boston_df[\"CHAS\"] == 1)\n", - "num_bound_river" - ], - "execution_count": 0, - "outputs": [] + "num_normal = sum(ames_df[\"SaleCondition\"] == \"Normal\")\n", + "num_normal" + ] }, { "cell_type": "markdown", "metadata": { - "id": "NnEK_TTTSWvi", - "colab_type": "text" + "colab_type": "text", + "id": "NnEK_TTTSWvi" }, "source": [ "How do these two methods give the same answer?" 
@@ -219,100 +617,141 @@ }, { "cell_type": "code", + "execution_count": 14, "metadata": { - "id": "mJO-elGkSMuC", + "colab": {}, "colab_type": "code", - "colab": {} + "id": "mJO-elGkSMuC" }, - "source": [ - "# Determine the total number of tracts in the dataset\n", - "total_num = len(boston_df)\n", - "\n", - "# Now calculate the percentage of tracts that bounds the Charles River.\n", - "num_bound_river/total_num*100" + "outputs": [ + { + "data": { + "text/plain": [ + "82.05479452054794" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "AJZKng3Bs7Vd", - "colab_type": "code", - "colab": {} - }, "source": [ - "import numpy as np" - ], - "execution_count": 0, - "outputs": [] + "# Determine the total number of houses in the dataset\n", + "total_num = len(ames_df)\n", + "\n", + "# Now calculate the percentage of houses sold normally.\n", + "num_normal/total_num*100" + ] }, { "cell_type": "markdown", "metadata": { - "id": "kFGToww_s7Vg", - "colab_type": "text" + "colab_type": "text", + "id": "kFGToww_s7Vg" }, "source": [ - "What percentage of tracts have a median price less than $10,000?" + "What percentage of houses have a price less than $200,000?" 
] }, { "cell_type": "code", + "execution_count": 15, "metadata": { - "id": "xiZbDvpOs7Vh", + "colab": {}, "colab_type": "code", - "colab": {} + "id": "xiZbDvpOs7Vh" }, + "outputs": [ + { + "data": { + "text/plain": [ + "70.2054794520548" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Determine number of tracts that cost less than $10,000\n", - "num_cost_less_10k = sum(boston_df[\"PRICE\"] < 10)\n", + "# Determine number of houses that cost less than $200,000\n", + "num_cost_less_200k = sum(ames_df[\"SalePrice\"] < 200000)\n", "\n", - "# Calculate the percentage of tracts that cost less than $10k.\n", - "num_cost_less_10k/total_num*100" - ], - "execution_count": 0, - "outputs": [] + "# Calculate the percentage of houses that cost less than $200k.\n", + "num_cost_less_200k/total_num*100" + ] }, { "cell_type": "markdown", "metadata": { - "id": "RLZ-k3L7s7Vq", - "colab_type": "text" + "colab_type": "text", + "id": "RLZ-k3L7s7Vq" }, "source": [ - "What percentage of tracts have a median price **between** \\$10,000 and \\$30,000?" + "What percentage of houses have a sale price **between** \\$200,000 and \\$500,000?" 
] }, { "cell_type": "code", + "execution_count": 17, "metadata": { - "id": "tWeQmqPos7Vr", + "colab": {}, "colab_type": "code", - "colab": {} + "id": "tWeQmqPos7Vr" }, + "outputs": [ + { + "data": { + "text/plain": [ + "28.63013698630137" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Make an array of booleans with cost greater than $10,000 AND less than $30,000\n", - "between_10k_and_30k = (boston_df[\"PRICE\"] > 10) and (boston_df[\"PRICE\"] < 30)\n", + "# Make an array of booleans with cost greater than $200,000 AND less than $500,000\n", + "between_200k_and_500k = (ames_df[\"SalePrice\"] > 200000) & (ames_df[\"SalePrice\"] < 500000)\n", "\n", - "# Determine number of tracts that cost between $10,000 and $30,000\n", - "num_between_10k_and_30k = sum(between_10k_and_30k)\n", + "# Determine number of houses that cost between $200,000 and $500,000\n", + "num_between_200k_and_500k = sum(between_200k_and_500k)\n", "\n", - "# Calculate the percentage of tracts between $10,000 and $30,000\n", - "num_between_10k_and_30k/total_num*100" - ], - "execution_count": 0, - "outputs": [] + "# Calculate the percentage of houses between $200,000 and $500,000\n", + "num_between_200k_and_500k/total_num*100" + ] }, { "cell_type": "markdown", "metadata": { - "id": "eje8y19Gs7Vv", - "colab_type": "text" + "colab_type": "text", + "id": "eje8y19Gs7Vv" }, "source": [ "Good work! You just learned about how to calculate percentages in Python!" ] } - ] + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.15" + } + }, + "nbformat": 4, + "nbformat_minor": 0 }