From 7ee477a9b172a70bcce69d044ba1db17a5ee8694 Mon Sep 17 00:00:00 2001 From: cristhian Date: Sun, 12 Jan 2025 21:33:04 +0100 Subject: [PATCH] week 5 lab 4 --- lab-hypothesis-testing.ipynb | 303 ++++++++++++++++++++++++++++------- 1 file changed, 249 insertions(+), 54 deletions(-) diff --git a/lab-hypothesis-testing.ipynb b/lab-hypothesis-testing.ipynb index 18ad6d5..d13f229 100644 --- a/lab-hypothesis-testing.ipynb +++ b/lab-hypothesis-testing.ipynb @@ -45,13 +45,12 @@ "#libraries\n", "import pandas as pd\n", "import scipy.stats as st\n", - "import numpy as np\n", - "\n" + "import numpy as np" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 21, "metadata": {}, "outputs": [ { @@ -101,7 +100,7 @@ " 65\n", " 45\n", " 1\n", - " False\n", + " Non-Legendary\n", " \n", " \n", " 1\n", @@ -115,7 +114,7 @@ " 80\n", " 60\n", " 1\n", - " False\n", + " Non-Legendary\n", " \n", " \n", " 2\n", @@ -129,7 +128,7 @@ " 100\n", " 80\n", " 1\n", - " False\n", + " Non-Legendary\n", " \n", " \n", " 3\n", @@ -143,7 +142,7 @@ " 120\n", " 80\n", " 1\n", - " False\n", + " Non-Legendary\n", " \n", " \n", " 4\n", @@ -157,7 +156,7 @@ " 50\n", " 65\n", " 1\n", - " False\n", + " Non-Legendary\n", " \n", " \n", " ...\n", @@ -185,7 +184,7 @@ " 150\n", " 50\n", " 6\n", - " True\n", + " Legendary\n", " \n", " \n", " 796\n", @@ -199,7 +198,7 @@ " 110\n", " 110\n", " 6\n", - " True\n", + " Legendary\n", " \n", " \n", " 797\n", @@ -213,7 +212,7 @@ " 130\n", " 70\n", " 6\n", - " True\n", + " Legendary\n", " \n", " \n", " 798\n", @@ -227,7 +226,7 @@ " 130\n", " 80\n", " 6\n", - " True\n", + " Legendary\n", " \n", " \n", " 799\n", @@ -241,7 +240,7 @@ " 90\n", " 70\n", " 6\n", - " True\n", + " Legendary\n", " \n", " \n", "\n", @@ -262,29 +261,30 @@ "798 Hoopa Unbound Psychic Dark 80 160 60 170 130 \n", "799 Volcanion Fire Water 80 110 120 130 90 \n", "\n", - " Speed Generation Legendary \n", - "0 45 1 False \n", - "1 60 1 False \n", - "2 80 1 False \n", - "3 80 1 False \n", - "4 65 
from scipy import stats

POKEMON_URL = "https://raw.githubusercontent.com/data-bootcamp-v4/data/main/pokemon.csv"
ALPHA = 0.05


def compare_type_hp(pokemon, higher_type="Dragon", lower_type="Grass"):
    """Run a one-sided Welch t-test on HP between two primary types.

    H0: mean HP of ``higher_type`` <= mean HP of ``lower_type``.
    H1: mean HP of ``higher_type`` >  mean HP of ``lower_type``.

    Args:
        pokemon: DataFrame with at least the ``Type 1`` and ``HP`` columns.
        higher_type: ``Type 1`` value hypothesised to have the larger mean HP.
        lower_type: ``Type 1`` value it is compared against.

    Returns:
        A ``(t_statistic, p_value)`` tuple of floats.

    Raises:
        ValueError: if either group has fewer than two rows, because the
            test would otherwise yield NaN and the ``p < alpha`` check would
            silently report "fail to reject".
    """
    higher_hp = pokemon.loc[pokemon["Type 1"] == higher_type, "HP"]
    lower_hp = pokemon.loc[pokemon["Type 1"] == lower_type, "HP"]

    if len(higher_hp) < 2 or len(lower_hp) < 2:
        raise ValueError(
            f"Need at least two rows per group, got "
            f"{len(higher_hp)} for {higher_type!r} and "
            f"{len(lower_hp)} for {lower_type!r}"
        )

    # equal_var=False selects Welch's test: the two types are not assumed to
    # share a variance. alternative="greater" matches the directional H1.
    result = stats.ttest_ind(
        higher_hp, lower_hp, equal_var=False, alternative="greater"
    )
    return float(result.statistic), float(result.pvalue)


if __name__ == "__main__":
    pokemon_data = pd.read_csv(POKEMON_URL)
    # Show the frame that was just loaded.
    print(pokemon_data)

    t_statistic, p_value = compare_type_hp(pokemon_data)

    print(f"T-Statistic: {t_statistic:.4f}")
    print(f"P-Value: {p_value:.4f}")

    if p_value < ALPHA:
        print("Reject the null hypothesis: Dragon-type Pokémon have significantly higher HP than Grass-type Pokémon.")
    else:
        print("Fail to reject the null hypothesis: No significant difference in HP between Dragon-type and Grass-type Pokémon.")
STAT_COLUMNS = ["HP", "Attack", "Defense", "Sp_Atk", "Sp_Def", "Speed"]

# Accepted spellings of the Legendary column, compared case-insensitively.
LEGENDARY_LABELS = {
    "legendary": True,
    "true": True,
    "1": True,
    "non-legendary": False,
    "false": False,
    "0": False,
}


def legendary_flags(labels):
    """Convert a Legendary column to real booleans.

    ``Series.astype(bool)`` must not be used on text labels: every non-empty
    string is truthy, so "Non-Legendary" would become ``True`` and the
    non-legendary group would be empty.

    Args:
        labels: Series holding booleans or one of the labels listed in
            ``LEGENDARY_LABELS``.

    Returns:
        A boolean Series aligned with ``labels``.

    Raises:
        ValueError: if a value is not a recognised label.
    """
    if pd.api.types.is_bool_dtype(labels):
        return labels.astype(bool)

    normalized = labels.astype(str).str.strip().str.casefold()
    flags = normalized.map(LEGENDARY_LABELS)

    unknown = flags.isna()
    if unknown.any():
        bad_values = sorted(set(normalized[unknown]))
        raise ValueError(f"Unrecognised Legendary labels: {bad_values}")
    return flags.astype(bool)


def prepare_pokemon_stats(pokemon):
    """Return a copy with formula-safe column names and boolean Legendary.

    "Sp. Atk" and "Sp. Def" are renamed because the dot and the space cannot
    appear in the bare column names used by the MANOVA formula.
    """
    renamed = pokemon.rename(columns={"Sp. Atk": "Sp_Atk", "Sp. Def": "Sp_Def"})
    return renamed.assign(Legendary=legendary_flags(renamed["Legendary"]))


def fit_legendary_manova(pokemon):
    """Fit a MANOVA of the six base stats on Legendary and return the tests.

    Expects a frame produced by ``prepare_pokemon_stats``.
    """
    # Imported here so the helpers above stay usable without statsmodels.
    from statsmodels.multivariate.manova import MANOVA

    formula = 'HP + Attack + Defense + Sp_Atk + Sp_Def + Speed ~ Legendary'
    return MANOVA.from_formula(formula, data=pokemon).mv_test()


if __name__ == "__main__":
    pokemon_data = prepare_pokemon_stats(pokemon_data)

    print(pokemon_data.columns)
    print(pokemon_data[STAT_COLUMNS].isnull().sum())

    legendary_count = int(pokemon_data["Legendary"].sum())
    non_legendary_count = len(pokemon_data) - legendary_count

    print("Legendary count:", legendary_count)
    print("Non-Legendary count:", non_legendary_count)

    if legendary_count > 0 and non_legendary_count > 0:
        fit = fit_legendary_manova(pokemon_data)
        print(fit)
    else:
        print("Insufficient data for one or both groups.")
52.0\n", + " 2217.0\n", + " 394.0\n", + " 907.0\n", + " 369.0\n", + " 2.3571\n", + " 111400.0\n", + " \n", + " \n", + " 16996\n", + " -124.27\n", + " 40.69\n", + " 36.0\n", + " 2349.0\n", + " 528.0\n", + " 1194.0\n", + " 465.0\n", + " 2.5179\n", + " 79000.0\n", + " \n", + " \n", + " 16997\n", + " -124.30\n", + " 41.84\n", + " 17.0\n", + " 2677.0\n", + " 531.0\n", + " 1244.0\n", + " 456.0\n", + " 3.0313\n", + " 103600.0\n", + " \n", + " \n", + " 16998\n", + " -124.30\n", + " 41.80\n", + " 19.0\n", + " 2672.0\n", + " 552.0\n", + " 1298.0\n", + " 478.0\n", + " 1.9797\n", + " 85800.0\n", + " \n", + " \n", + " 16999\n", + " -124.35\n", + " 40.54\n", + " 52.0\n", + " 1820.0\n", + " 300.0\n", + " 806.0\n", + " 270.0\n", + " 3.0147\n", + " 94600.0\n", + " \n", " \n", "\n", + "

17000 rows × 9 columns

\n", "" ], "text/plain": [ - " longitude latitude housing_median_age total_rooms total_bedrooms \\\n", - "0 -114.31 34.19 15.0 5612.0 1283.0 \n", - "1 -114.47 34.40 19.0 7650.0 1901.0 \n", - "2 -114.56 33.69 17.0 720.0 174.0 \n", - "3 -114.57 33.64 14.0 1501.0 337.0 \n", - "4 -114.57 33.57 20.0 1454.0 326.0 \n", + " longitude latitude housing_median_age total_rooms total_bedrooms \\\n", + "0 -114.31 34.19 15.0 5612.0 1283.0 \n", + "1 -114.47 34.40 19.0 7650.0 1901.0 \n", + "2 -114.56 33.69 17.0 720.0 174.0 \n", + "3 -114.57 33.64 14.0 1501.0 337.0 \n", + "4 -114.57 33.57 20.0 1454.0 326.0 \n", + "... ... ... ... ... ... \n", + "16995 -124.26 40.58 52.0 2217.0 394.0 \n", + "16996 -124.27 40.69 36.0 2349.0 528.0 \n", + "16997 -124.30 41.84 17.0 2677.0 531.0 \n", + "16998 -124.30 41.80 19.0 2672.0 552.0 \n", + "16999 -124.35 40.54 52.0 1820.0 300.0 \n", + "\n", + " population households median_income median_house_value \n", + "0 1015.0 472.0 1.4936 66900.0 \n", + "1 1129.0 463.0 1.8200 80100.0 \n", + "2 333.0 117.0 1.6509 85700.0 \n", + "3 515.0 226.0 3.1917 73400.0 \n", + "4 624.0 262.0 1.9250 65500.0 \n", + "... ... ... ... ... 
HOUSING_URL = "https://raw.githubusercontent.com/data-bootcamp-v4/data/main/california_housing.csv"

# (longitude, latitude) of the two amenities used by the exercise.
school_coords = (-118, 37)
hospital_coords = (-122, 34)

# Euclidean distance, in coordinate degrees, below which a house is "close".
CLOSE_RADIUS = 0.50


def calculate_distance(coords1, coords2):
    """Return the Euclidean distance between two (longitude, latitude) pairs.

    Either pair may hold scalars or whole columns; with columns the result
    is computed element-wise in a single pass.
    """
    return np.sqrt((coords1[0] - coords2[0])**2 + (coords1[1] - coords2[1])**2)


def flag_close_to_amenity(housing, radius=CLOSE_RADIUS):
    """Add distance and proximity columns to ``housing`` in place.

    Adds ``distance_to_school``, ``distance_to_hospital`` and the boolean
    ``close_to_amenity`` (True when either distance is below ``radius``).

    Args:
        housing: DataFrame with ``longitude`` and ``latitude`` columns.
        radius: strict upper bound on the distance that counts as close.

    Returns:
        The same DataFrame, for convenient chaining.
    """
    # Whole columns are passed instead of applying a lambda per row: the
    # arithmetic is identical but avoids a Python-level loop over every row.
    positions = (housing["longitude"], housing["latitude"])
    housing["distance_to_school"] = calculate_distance(positions, school_coords)
    housing["distance_to_hospital"] = calculate_distance(positions, hospital_coords)

    housing["close_to_amenity"] = (
        (housing["distance_to_school"] < radius)
        | (housing["distance_to_hospital"] < radius)
    )
    return housing


if __name__ == "__main__":
    data = pd.read_csv(HOUSING_URL)
    print(data)

    flag_close_to_amenity(data)

    close_houses = data[data["close_to_amenity"]]
    far_houses = data[~data["close_to_amenity"]]
def compare_house_values(close_values, far_values):
    """Test whether houses close to an amenity are more expensive.

    Runs a one-sided Welch t-test.

    H0: mean value of close houses <= mean value of far houses.
    H1: mean value of close houses >  mean value of far houses.

    A two-sided test is not enough here: it also rejects when close houses
    are significantly *cheaper* (negative t-statistic), which would then be
    misreported as "more expensive".

    Args:
        close_values: house values for homes near a school or hospital.
        far_values: house values for all other homes.

    Returns:
        A ``(t_statistic, p_value)`` tuple of floats.
    """
    result = st.ttest_ind(
        close_values, far_values, equal_var=False, alternative="greater"
    )
    return float(result.statistic), float(result.pvalue)


def describe_outcome(p_value, alpha=0.05):
    """Return the conclusion sentence for the one-sided proximity test."""
    if p_value < alpha:
        return "Reject the null hypothesis: Houses close to a school or hospital are significantly more expensive."
    return "Fail to reject the null hypothesis: No evidence that houses close to a school or hospital are more expensive."


if __name__ == "__main__":
    t_statistic, p_value = compare_house_values(
        close_houses["median_house_value"], far_houses["median_house_value"]
    )

    print(f"T-Statistic: {t_statistic:.4f}")
    print(f"P-Value: {p_value:.4f}")
    print(describe_outcome(p_value))