diff --git a/lab-hypothesis-testing.ipynb b/lab-hypothesis-testing.ipynb index 18ad6d5..c2130eb 100644 --- a/lab-hypothesis-testing.ipynb +++ b/lab-hypothesis-testing.ipynb @@ -38,20 +38,19 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#libraries\n", "import pandas as pd\n", "import scipy.stats as st\n", - "import numpy as np\n", - "\n" + "import numpy as np" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "metadata": {}, "outputs": [ { @@ -278,7 +277,7 @@ "[800 rows x 11 columns]" ] }, - "execution_count": 3, + "execution_count": 2, "metadata": {}, "output_type": "execute_result" } @@ -297,27 +296,175 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Filtro los HP de los Pokémon tipo Dragon y tipo Grass\n", + "dragon_hp = df[df['Type 1'] == 'Dragon']['HP']\n", + "grass_hp = df[df['Type 1'] == 'Grass']['HP']" + ] + }, + { + "cell_type": "code", + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "#code here" + "# Prueba t para dos muestras independientes\n", + "# Uso\"equal_var=False\" porque entiendo que las varianzas pueden ser diferentes\n", + "t_stat, p_value = st.ttest_ind(dragon_hp, grass_hp, equal_var=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "T-Statistic: 3.3349632905124063, P-Value: 0.0015987219490841197\n" + ] + } + ], + "source": [ + "print(f\"T-Statistic: {t_stat}, P-Value: {p_value}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "- We posit that Legendary Pokemons have different stats (HP, Attack, Defense, Sp.Atk, Sp.Def, Speed) when comparing with Non-Legendary. 
Choose the propper test and, with 5% significance, comment your findings.\n" + "- We posit that Legendary Pokemons have different stats (HP, Attack, Defense, Sp.Atk, Sp.Def, Speed) when comparing with Non-Legendary. Choose the proper test and, with 5% significance, comment your findings.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "# Creamos dos grupos: Legendarios y No Legendarios\n", + "legendary_stats = df[df['Legendary'] == True][['HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed']]\n", + "non_legendary_stats = df[df['Legendary'] == False][['HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed']]" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "# Inicializamos un diccionario para almacenar los resultados\n", + "results = {}" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "# Realizamos la prueba t para cada estadística\n", + "for stat in ['HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed']:\n", + " t_stat, p_value = st.ttest_ind(legendary_stats[stat], non_legendary_stats[stat], equal_var=False)\n", + " results[stat] = {'T-Statistic': t_stat, 'P-Value': p_value}" ] }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ - "#code here" + "# Convertimos los resultados a un DataFrame para analizarlos mejor\n", + "results_df = pd.DataFrame(results).T" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
T-StatisticP-Value
HP8.9813701.002691e-13
Attack10.4381342.520372e-16
Defense7.6370784.826998e-11
Sp. Atk13.4174501.551461e-21
Sp. Def10.0156972.294933e-15
Speed11.4750441.049016e-18
\n", + "
" + ], + "text/plain": [ + " T-Statistic P-Value\n", + "HP 8.981370 1.002691e-13\n", + "Attack 10.438134 2.520372e-16\n", + "Defense 7.637078 4.826998e-11\n", + "Sp. Atk 13.417450 1.551461e-21\n", + "Sp. Def 10.015697 2.294933e-15\n", + "Speed 11.475044 1.049016e-18" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "results_df" ] }, { @@ -337,7 +484,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -453,7 +600,7 @@ "4 624.0 262.0 1.9250 65500.0 " ] }, - "execution_count": 5, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -483,17 +630,95 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "# Coordenadas de la escuela y el hospital\n", + "school_coords = (-118, 37)\n", + "hospital_coords = (-122, 34)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "# Función para calcular distancia euclidiana\n", + "def calc_dist(lat1, lon1, lat2, lon2):\n", + " return np.sqrt((lat1 - lat2)**2 + (lon1 - lon2)**2)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "# Calcular las distancias\n", + "df['distance_to_school'] = calc_dist(df['longitude'], df['latitude'], school_coords[0], school_coords[1])\n", + "df['distance_to_hospital'] = calc_dist(df['longitude'], df['latitude'], hospital_coords[0], hospital_coords[1])" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "# Clasifico las casas como cercanas o lejanas\n", + "df['close_to_school_or_hospital'] = (df['distance_to_school'] < 0.50) | (df['distance_to_hospital'] < 0.50)" + ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "df" + ] + }, 
+ { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "# Divido el dataset en dos grupos\n", + "close_group = df[df['close_to_school_or_hospital'] == True]['median_house_value']\n", + "far_group = df[df['close_to_school_or_hospital'] == False]['median_house_value']" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "# Realizo prueba t para dos muestras independientes\n", + "t_stat, p_value = st.ttest_ind(close_group, far_group, equal_var=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Estadístico t: -17.174167998688404\n", + "Valor p: 5.220018561223529e-05\n" + ] + } + ], + "source": [ + "print(f\"Estadístico t: {t_stat}\")\n", + "print(f\"Valor p: {p_value}\")" + ] } ], "metadata": { @@ -512,7 +737,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.9" + "version": "3.8.2" } }, "nbformat": 4,