From 292d641693a787e2eea8ee2664b88729164417fc Mon Sep 17 00:00:00 2001 From: alinaskukina <75088548+alinaskukina@users.noreply.github.com> Date: Sat, 16 Apr 2022 17:22:16 +0500 Subject: [PATCH 1/2] =?UTF-8?q?=D0=A1=D0=BA=D1=83=D0=BA=D0=B8=D0=BD=D0=B0?= =?UTF-8?q?=20pandas2?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../Pandas_and_EDA_Task.ipynb | 781 ++++++++++++++++++ 1 file changed, 781 insertions(+) create mode 100644 Pandas and EDA (12.03)/Pandas_and_EDA_Task.ipynb diff --git a/Pandas and EDA (12.03)/Pandas_and_EDA_Task.ipynb b/Pandas and EDA (12.03)/Pandas_and_EDA_Task.ipynb new file mode 100644 index 0000000..efc7486 --- /dev/null +++ b/Pandas and EDA (12.03)/Pandas_and_EDA_Task.ipynb @@ -0,0 +1,781 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.8" + }, + "colab": { + "name": "Pandas and EDA. 
Task.ipynb", + "provenance": [], + "collapsed_sections": [] + } + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "EmV0s8YY05p7" + }, + "source": [ + "- __ID__ - Unique number for each athlete\n", + "- __Name__ - Athlete's name\n", + "- __Sex__ - M or F\n", + "- __Age__ - Integer\n", + "- __Height__ - In centimeters\n", + "- __Weight__ - In kilograms\n", + "- __Team__ - Team name\n", + "- __NOC__ - National Olympic Committee 3-letter code\n", + "- __Games__ - Year and season\n", + "- __Year__ - Integer\n", + "- __Season__ - Summer or Winter\n", + "- __City__ - Host city\n", + "- __Sport__ - Sport\n", + "- __Event__ - Event\n", + "- __Medal__ - Gold, Silver, Bronze, or NA" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "rVCrMDMh05p_" + }, + "source": [ + "import pandas as pd\n", + "import numpy as np" + ], + "execution_count": 1, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "D5Q4Z-JW05qC" + }, + "source": [ + "PATH = 'https://github.com/aksenov7/Kaggle_competition_group/blob/master/athlete_events.csv.zip?raw=true'" + ], + "execution_count": 2, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "mI0LtqkY4Kp-" + }, + "source": [ + "__0. 
Откройте файл используя необходимые параметры и не меняя переменную PATH__" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "h5SQwBLr05qG", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 206 + }, + "outputId": "4a67cced-8c5b-4783-98e5-0730370962d2" + }, + "source": [ + "data = pd.read_csv(PATH, compression='zip', sep=',')\n", + "data.head()" + ], + "execution_count": 6, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " ID Name Sex Age Height Weight Team \\\n", + "0 1 A Dijiang M 24.0 180.0 80.0 China \n", + "1 2 A Lamusi M 23.0 170.0 60.0 China \n", + "2 3 Gunnar Nielsen Aaby M 24.0 NaN NaN Denmark \n", + "3 4 Edgar Lindenau Aabye M 34.0 NaN NaN Denmark/Sweden \n", + "4 5 Christine Jacoba Aaftink F 21.0 185.0 82.0 Netherlands \n", + "\n", + " NOC Games Year Season City Sport \\\n", + "0 CHN 1992 Summer 1992 Summer Barcelona Basketball \n", + "1 CHN 2012 Summer 2012 Summer London Judo \n", + "2 DEN 1920 Summer 1920 Summer Antwerpen Football \n", + "3 DEN 1900 Summer 1900 Summer Paris Tug-Of-War \n", + "4 NED 1988 Winter 1988 Winter Calgary Speed Skating \n", + "\n", + " Event Medal \n", + "0 Basketball Men's Basketball NaN \n", + "1 Judo Men's Extra-Lightweight NaN \n", + "2 Football Men's Football NaN \n", + "3 Tug-Of-War Men's Tug-Of-War Gold \n", + "4 Speed Skating Women's 500 metres NaN " + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IDNameSexAgeHeightWeightTeamNOCGamesYearSeasonCitySportEventMedal
01A DijiangM24.0180.080.0ChinaCHN1992 Summer1992SummerBarcelonaBasketballBasketball Men's BasketballNaN
12A LamusiM23.0170.060.0ChinaCHN2012 Summer2012SummerLondonJudoJudo Men's Extra-LightweightNaN
23Gunnar Nielsen AabyM24.0NaNNaNDenmarkDEN1920 Summer1920SummerAntwerpenFootballFootball Men's FootballNaN
34Edgar Lindenau AabyeM34.0NaNNaNDenmark/SwedenDEN1900 Summer1900SummerParisTug-Of-WarTug-Of-War Men's Tug-Of-WarGold
45Christine Jacoba AaftinkF21.0185.082.0NetherlandsNED1988 Winter1988WinterCalgarySpeed SkatingSpeed Skating Women's 500 metresNaN
\n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n", + " " + ] + }, + "metadata": {}, + "execution_count": 6 + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "stYR4EbV05qP" + }, + "source": [ + "__1. Сколько лет было самым молодым мужчинам и женщинам-участникам Олимпийских игр 1992 года ?__\n", + "- 16 и 15\n", + "- 14 и 13 \n", + "- 13 и 11\n", + "- 11 и 12" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "HgiqBXtb05qR", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "d7992f8a-9376-4041-ae7d-a36848fc03ff" + }, + "source": [ + "data[data['Year']==1992].groupby(['Sex'])['Age'].min()" + ], + "execution_count": 7, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "Sex\n", + "F 12.0\n", + "M 11.0\n", + "Name: Age, dtype: float64" + ] + }, + "metadata": {}, + "execution_count": 7 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "Самым молодым мужчинам и женщинам-участникам Олимпийских игр 1992 года было 11 и 12 лет соответственно" + ], + "metadata": { + "id": "6l-jg7yc9nJM" + } + }, + { + "cell_type": "markdown", + "metadata": { + "id": "GQ290dsi05qc" + }, + "source": [ + "__2. Каков был процент баскетболистов-мужчин среди всех мужчин-участников Олимпийских игр 2012 года? Округлите ответ до первого десятичного знака.__\n", + "\n", + "Здесь и далее при необходимости отбрасывайте дублированных спортсменов, чтобы считать только уникальных . 
\n", + "- 0.2\n", + "- 1.5 \n", + "- 2.5\n", + "- 7.7" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "-fI5MqWP05qi", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "96c87bfe-4067-41f5-cd35-5339e7592e47" + }, + "source": [ + "man = data[(data['Sex']=='M')&(data['Year']==2012)].drop_duplicates(subset=['ID'],inplace=False)\n", + "basketball_man = man['Sport'].value_counts(normalize=True)['Basketball']\n", + "np.round(basketball_man*100,1)" + ], + "execution_count": 8, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "2.5" + ] + }, + "metadata": {}, + "execution_count": 8 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "Баскетболисты-мужчины составили 2.5% от всех мужчин-участников Олимпийских игр 2012 года" + ], + "metadata": { + "id": "BKw79NcoNRJ5" + } + }, + { + "cell_type": "markdown", + "metadata": { + "id": "u5WrTgIC05qv" + }, + "source": [ + "__3. Каковы среднее и стандартное отклонение роста теннисисток, участвовавших в Олимпийских играх 2000 года? 
Округлите ответ до первого десятичного знака.__\n", + "\n", + "- 171.8 и 6.5\n", + "- 179.4 и 10\n", + "- 180.7 и 6.7\n", + "- 182.4 и 9.1 " + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "vsKTqn6405qw", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "d264e669-d2af-4e46-a290-4235f201f50b" + }, + "source": [ + "deviation = data[(data['Year'] == 2000)&(data['Sex'] == 'F')&(data['Sport'] == 'Tennis')]\n", + "res_2 = round(deviation['Height'].std(), 1)\n", + "res_1 = round(deviation['Height'].mean(), 1)\n", + "print(\"Cреднее и стандартное отклонение роста теннисисток, участвовавших в Олимпийских играх 2000 года:\", res_1, \"и\", res_2)" + ], + "execution_count": 37, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Cреднее и стандартное отклонение роста теннисисток, участвовавших в Олимпийских играх 2000 года: 171.8 и 6.5\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "xOOEzhNQ05qy" + }, + "source": [ + "__4. Найдите спортсмена, который участвовал в Олимпийских играх 2006 года, с наибольшим весом среди других участников той же Олимпиады. Каким спортом он или она занимался?__\n", + "\n", + "- Judo\n", + "- Bobsleigh \n", + "- Skeleton\n", + "- Boxing" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "EkWD1Tnb05qz", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "5f219ce7-f1d8-49c0-f64a-a9e33e1a67a0" + }, + "source": [ + "highest_weight = data[(data['Year']==2006)]['Weight'].max()\n", + "print(data[(data['Year']==2006)&(data['Weight']==highest_weight)]['Sport'])" + ], + "execution_count": 12, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "8102 Skeleton\n", + "Name: Sport, dtype: object\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "Спортсмен с наибольшим весом, участвовавший в Олимпийских играх 2006 года, занимался Skeleton." 
+ ], + "metadata": { + "id": "6DE5va9-NXk9" + } + }, + { + "cell_type": "markdown", + "metadata": { + "id": "UQzxZ3HT05q0" + }, + "source": [ + "__5. Сколько раз John Aalberg участвовал в Олимпийских играх в разные годы?__\n", + "\n", + "Один год - это один раз. Неважно сколько участий внутри одного года\n", + "- 0\n", + "- 1 \n", + "- 2\n", + "- 3 " + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "ZSfkdjPO05q0", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "60cd0455-8924-4a76-f9db-056e65e4c32d" + }, + "source": [ + "count_name = len(data[data['Name'] == 'John Aalberg'].drop_duplicates(['Year']))\n", + "print(\"John Aalberg участвовал в Олимпийских играх\" ,count_name, \"раза\")" + ], + "execution_count": 17, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "John Aalberg участвовал в Олимпийских играх 2 раза\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8EnLcNrk05q3" + }, + "source": [ + "__6. Сколько золотых медалей по теннису выиграли спортсмены сборной Switzerland на Олимпиаде-2008? 
Считайте каждую медаль от каждого спортсмена.__\n", + "\n", + "- 0\n", + "- 1 \n", + "- 2\n", + "- 3 " + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "Y754OGI-05q3", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "a10310b6-05e8-4f00-f9dc-75d112dd7a34" + }, + "source": [ + "number_peoples = data[(data['Year'] == 2008)&(data['Sport'] == 'Tennis')&(data['Medal'] == 'Gold')&(data['Team'] == 'Switzerland')]\n", + "count_medal = len(number_peoples)\n", + "print(\"Спортсмены по теннису сборной Switzerland на Олимпиаде-2008 выиграли\", count_medal, \"медали\")" + ], + "execution_count": 19, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Спортсмены по теннису сборной Switzerland на Олимпиаде-2008 выиграли 2 медали\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "v3h5sQF805q5" + }, + "source": [ + "__7. Правда ли, что на Олимпийских играх 2016 Spain выиграла меньше медалей, чем Италия?__ \n", + "\n", + "- Да\n", + "- Нет" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "gqJqDi2605q7", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "55be8cf3-34d5-4936-d939-7551724cf0f8" + }, + "source": [ + "people_year = data[data['Year'] == 2016]\n", + "people_italy = people_year[people_year['Team'] == 'Italy']['Medal'].dropna()\n", + "people_spain = people_year[people_year['Team'] == 'Spain']['Medal'].dropna()\n", + "print('Правда') if len(people_spain) < len(people_italy) else print('Не правда')" + ], + "execution_count": 20, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Правда\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "kkSYL5mK05q-" + }, + "source": [ + "__8. 
К какой возрастной категории принадлежало наименьшее и наибольшее количество участников Олимпиады-2008?__\n", + "\n", + "- [45-55] и [25-35) соответственно\n", + "- [45-55] и [15-25) соответственно\n", + "- [35-45) и [25-35) соответственно\n", + "- [45-55] и [35-45) соответственно" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "pMAQtW7i05q_", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "fae0e8ba-cc03-41a8-929c-b02564ab6a64" + }, + "source": [ + "age = data[(data['Year'] == 2008)][['Age']]\n", + "age_15_25 = len(age[(age['Age'] >= 15) & (age['Age'] < 25)])\n", + "age_25_35 = len(age[(age['Age'] >= 25) & (age['Age'] < 35)])\n", + "age_35_45 = len(age[(age['Age'] >= 35) & (age['Age'] < 45)])\n", + "age_45_55 = len(age[(age['Age'] >= 45) & (age['Age'] <= 55)])\n", + "print(f'[45-55] и [25-35): {age_45_55} и {age_25_35} соответственно\\n')\n", + "print(f'[45-55] и [15-25): {age_45_55} и {age_15_25} соответственно\\n')\n", + "print(f'[35-45) и [25-35): {age_35_45} и {age_25_35} соответственно\\n')\n", + "print(f'[45-55] и [35-45): {age_45_55} и {age_35_45} соответственно\\n')\n", + "print(f'Ответ: [45-55] и [25-35) соответственно')" + ], + "execution_count": 30, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "[45-55] и [25-35): 119 и 6367 соответственно\n", + "\n", + "[45-55] и [15-25): 119 и 6294 соответственно\n", + "\n", + "[35-45) и [25-35): 790 и 6367 соответственно\n", + "\n", + "[45-55] и [35-45): 119 и 790 соответственно\n", + "\n", + "Ответ: [45-55] и [25-35) соответственно\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JQmJPiXv05rB" + }, + "source": [ + "__9. Правда ли, что в Atlanta проводились летние Олимпийские игры? Правда ли, что в Squaw Valley проводились зимние Олимпийские игры? 
?__\n", + "\n", + "- Да, Да\n", + "- Да, Нет\n", + "- Нет, Да \n", + "- Нет, Нет " + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "UU66wRHC05rB", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "9ad04b54-2daf-4c26-9044-73c24d9c1bad" + }, + "source": [ + "atlanta_sum = data[(data['City'] == 'Atlanta')&(data['Season'] == 'Summer')]\n", + "squaw_valley_win = data[(data['City'] == 'Squaw Valley') & (data['Season'] == 'Winter')]\n", + "print('Да, ', end='') if len(atlanta_sum) > 0 else print('Нет, ', end='')\n", + "print('Да') if len(squaw_valley_win) > 0 else print('Нет')" + ], + "execution_count": 31, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Да, Да\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4hxR5D-t05rF" + }, + "source": [ + "__10. Какова абсолютная разница между количеством уникальных видов спорта на Олимпиаде 1986 года и Олимпиаде 2002 года?__\n", + "\n", + "- 3 \n", + "- 10\n", + "- 15\n", + "- 27 " + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "WKIr-TR105rF", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "25430833-b5b0-48c2-c05e-d552b4af29d4" + }, + "source": [ + "abs_difference = abs(data[data['Year']==1986]['Sport'].nunique()-data[data['Year']==2002]['Sport'].nunique())\n", + "print(\"Абсолютная разница между количеством уникальных видов спорта на Олимпиаде 1986 года и Олимпиаде 2002 года - \", abs_difference )" + ], + "execution_count": 33, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Абсолютная разница между количеством уникальных видов спорта на Олимпиаде 1986 года и Олимпиаде 2002 года - 15\n" + ] + } + ] + } + ] +} \ No newline at end of file From ee08e4fceb7ad3e2f5aaa5fc62addac17bad49f8 Mon Sep 17 00:00:00 2001 From: alinaskukina <75088548+alinaskukina@users.noreply.github.com> Date: Sun, 17 Apr 2022 15:47:45 +0500 Subject: [PATCH 2/2] 
=?UTF-8?q?=D0=A1=D0=BA=D1=83=D0=BA=D0=B8=D0=BD=D0=B0?= =?UTF-8?q?=20pandas1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Pandas_Task_Part_1.ipynb | 1149 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 1149 insertions(+) create mode 100644 Pandas_Task_Part_1.ipynb diff --git a/Pandas_Task_Part_1.ipynb b/Pandas_Task_Part_1.ipynb new file mode 100644 index 0000000..b870411 --- /dev/null +++ b/Pandas_Task_Part_1.ipynb @@ -0,0 +1,1149 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "anaconda-cloud": {}, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.6" + }, + "colab": { + "name": "Pandas. Task. Part 1.ipynb", + "provenance": [], + "collapsed_sections": [] + } + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "UTKVH3sMutTM" + }, + "source": [ + "**В задании предлагается с помощью Pandas ответить на несколько вопросов по данным репозитория UCI [Adult](https://archive.ics.uci.edu/ml/datasets/Adult)**" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "3lUT-CqYutTO" + }, + "source": [ + "Уникальные значения признаков (больше информации по ссылке выше):\n", + "- age: continuous.\n", + "- workclass: Private, Self-emp-not-inc, Self-emp-inc, Federal-gov, Local-gov, State-gov, Without-pay, Never-worked.\n", + "- fnlwgt: continuous.\n", + "- education: Bachelors, Some-college, 11th, HS-grad, Prof-school, Assoc-acdm, Assoc-voc, 9th, 7th-8th, 12th, Masters, 1st-4th, 10th, Doctorate, 5th-6th, Preschool.\n", + "- education-num: continuous.\n", + "- marital-status: Married-civ-spouse, Divorced, Never-married, Separated, Widowed, Married-spouse-absent, Married-AF-spouse.\n", + "- 
occupation: Tech-support, Craft-repair, Other-service, Sales, Exec-managerial, Prof-specialty, Handlers-cleaners, Machine-op-inspct, Adm-clerical, Farming-fishing, Transport-moving, Priv-house-serv, Protective-serv, Armed-Forces.\n", + "- relationship: Wife, Own-child, Husband, Not-in-family, Other-relative, Unmarried.\n", + "- race: White, Asian-Pac-Islander, Amer-Indian-Eskimo, Other, Black.\n", + "- sex: Female, Male.\n", + "- capital-gain: continuous.\n", + "- capital-loss: continuous.\n", + "- hours-per-week: continuous.\n", + "- native-country: United-States, Cambodia, England, Puerto-Rico, Canada, Germany, Outlying-US(Guam-USVI-etc), India, Japan, Greece, South, China, Cuba, Iran, Honduras, Philippines, Italy, Poland, Jamaica, Vietnam, Mexico, Portugal, Ireland, France, Dominican-Republic, Laos, Ecuador, Taiwan, Haiti, Columbia, Hungary, Guatemala, Nicaragua, Scotland, Thailand, Yugoslavia, El-Salvador, Trinadad&Tobago, Peru, Hong, Holand-Netherlands. \n", + "- salary: >50K,<=50K" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "6GzulHvOutTR" + }, + "source": [ + "import pandas as pd" + ], + "execution_count": 2, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "SJ3LbaoiutTT", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 444 + }, + "outputId": "d8361b12-744c-4fdb-ccd2-a737c457a8ba" + }, + "source": [ + "data = pd.read_csv(\"https://raw.githubusercontent.com/aksenov7/Kaggle_competition_group/master/adult.data.csv\")\n", + "data.head()" + ], + "execution_count": 3, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " age workclass fnlwgt education education-num \\\n", + "0 39 State-gov 77516 Bachelors 13 \n", + "1 50 Self-emp-not-inc 83311 Bachelors 13 \n", + "2 38 Private 215646 HS-grad 9 \n", + "3 53 Private 234721 11th 7 \n", + "4 28 Private 338409 Bachelors 13 \n", + "\n", + " marital-status occupation relationship race sex \\\n", + "0 Never-married Adm-clerical 
Not-in-family White Male \n", + "1 Married-civ-spouse Exec-managerial Husband White Male \n", + "2 Divorced Handlers-cleaners Not-in-family White Male \n", + "3 Married-civ-spouse Handlers-cleaners Husband Black Male \n", + "4 Married-civ-spouse Prof-specialty Wife Black Female \n", + "\n", + " capital-gain capital-loss hours-per-week native-country salary \n", + "0 2174 0 40 United-States <=50K \n", + "1 0 0 13 United-States <=50K \n", + "2 0 0 40 United-States <=50K \n", + "3 0 0 40 United-States <=50K \n", + "4 0 0 40 Cuba <=50K " + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ageworkclassfnlwgteducationeducation-nummarital-statusoccupationrelationshipracesexcapital-gaincapital-losshours-per-weeknative-countrysalary
039State-gov77516Bachelors13Never-marriedAdm-clericalNot-in-familyWhiteMale2174040United-States<=50K
150Self-emp-not-inc83311Bachelors13Married-civ-spouseExec-managerialHusbandWhiteMale0013United-States<=50K
238Private215646HS-grad9DivorcedHandlers-cleanersNot-in-familyWhiteMale0040United-States<=50K
353Private23472111th7Married-civ-spouseHandlers-cleanersHusbandBlackMale0040United-States<=50K
428Private338409Bachelors13Married-civ-spouseProf-specialtyWifeBlackFemale0040Cuba<=50K
\n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n", + " " + ] + }, + "metadata": {}, + "execution_count": 3 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "EpQFv8t1ds05" + }, + "source": [ + "# def married(row):\n", + "# return \"Married\" in row\n", + "data[\"married\"] = data[\"marital-status\"].apply(lambda row: \"Married\" in row)" + ], + "execution_count": 4, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 800 + }, + "id": "3Bb2mRTEeoJK", + "outputId": "5b83d50d-e4c2-47de-bce7-3259503af14d" + }, + "source": [ + "data" + ], + "execution_count": 5, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " age workclass fnlwgt education education-num \\\n", + "0 39 State-gov 77516 Bachelors 13 \n", + "1 50 Self-emp-not-inc 83311 Bachelors 13 \n", + "2 38 Private 215646 HS-grad 9 \n", + "3 53 Private 234721 11th 7 \n", + "4 28 Private 338409 Bachelors 13 \n", + "... ... ... ... ... ... \n", + "32556 27 Private 257302 Assoc-acdm 12 \n", + "32557 40 Private 154374 HS-grad 9 \n", + "32558 58 Private 151910 HS-grad 9 \n", + "32559 22 Private 201490 HS-grad 9 \n", + "32560 52 Self-emp-inc 287927 HS-grad 9 \n", + "\n", + " marital-status occupation relationship race sex \\\n", + "0 Never-married Adm-clerical Not-in-family White Male \n", + "1 Married-civ-spouse Exec-managerial Husband White Male \n", + "2 Divorced Handlers-cleaners Not-in-family White Male \n", + "3 Married-civ-spouse Handlers-cleaners Husband Black Male \n", + "4 Married-civ-spouse Prof-specialty Wife Black Female \n", + "... ... ... ... ... ... 
\n", + "32556 Married-civ-spouse Tech-support Wife White Female \n", + "32557 Married-civ-spouse Machine-op-inspct Husband White Male \n", + "32558 Widowed Adm-clerical Unmarried White Female \n", + "32559 Never-married Adm-clerical Own-child White Male \n", + "32560 Married-civ-spouse Exec-managerial Wife White Female \n", + "\n", + " capital-gain capital-loss hours-per-week native-country salary \\\n", + "0 2174 0 40 United-States <=50K \n", + "1 0 0 13 United-States <=50K \n", + "2 0 0 40 United-States <=50K \n", + "3 0 0 40 United-States <=50K \n", + "4 0 0 40 Cuba <=50K \n", + "... ... ... ... ... ... \n", + "32556 0 0 38 United-States <=50K \n", + "32557 0 0 40 United-States >50K \n", + "32558 0 0 40 United-States <=50K \n", + "32559 0 0 20 United-States <=50K \n", + "32560 15024 0 40 United-States >50K \n", + "\n", + " married \n", + "0 False \n", + "1 True \n", + "2 False \n", + "3 True \n", + "4 True \n", + "... ... \n", + "32556 True \n", + "32557 True \n", + "32558 False \n", + "32559 False \n", + "32560 True \n", + "\n", + "[32561 rows x 16 columns]" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ageworkclassfnlwgteducationeducation-nummarital-statusoccupationrelationshipracesexcapital-gaincapital-losshours-per-weeknative-countrysalarymarried
039State-gov77516Bachelors13Never-marriedAdm-clericalNot-in-familyWhiteMale2174040United-States<=50KFalse
150Self-emp-not-inc83311Bachelors13Married-civ-spouseExec-managerialHusbandWhiteMale0013United-States<=50KTrue
238Private215646HS-grad9DivorcedHandlers-cleanersNot-in-familyWhiteMale0040United-States<=50KFalse
353Private23472111th7Married-civ-spouseHandlers-cleanersHusbandBlackMale0040United-States<=50KTrue
428Private338409Bachelors13Married-civ-spouseProf-specialtyWifeBlackFemale0040Cuba<=50KTrue
...................................................
3255627Private257302Assoc-acdm12Married-civ-spouseTech-supportWifeWhiteFemale0038United-States<=50KTrue
3255740Private154374HS-grad9Married-civ-spouseMachine-op-inspctHusbandWhiteMale0040United-States>50KTrue
3255858Private151910HS-grad9WidowedAdm-clericalUnmarriedWhiteFemale0040United-States<=50KFalse
3255922Private201490HS-grad9Never-marriedAdm-clericalOwn-childWhiteMale0020United-States<=50KFalse
3256052Self-emp-inc287927HS-grad9Married-civ-spouseExec-managerialWifeWhiteFemale15024040United-States>50KTrue
\n", + "

32561 rows × 16 columns

\n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n", + " " + ] + }, + "metadata": {}, + "execution_count": 5 + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "MoK8B5fIutTW" + }, + "source": [ + "**1. Сколько мужчин и женщин (признак *sex*) представлено в этом наборе данных?**" + ] + }, + { + "cell_type": "code", + "metadata": { + "collapsed": true, + "id": "hdzky90TutTY", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "74677bb2-d548-4cf1-9756-23c4f510eeb0" + }, + "source": [ + "data['sex'].value_counts()" + ], + "execution_count": 8, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "Male 21790\n", + "Female 10771\n", + "Name: sex, dtype: int64" + ] + }, + "metadata": {}, + "execution_count": 8 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "В этом наборе данных было представлено 21790 мужчин и 10771 женщин." + ], + "metadata": { + "id": "Oke5GuC2W_D_" + } + }, + { + "cell_type": "markdown", + "metadata": { + "id": "adF8lgVbutTZ" + }, + "source": [ + "**2. Каков средний возраст (признак *age*) женщин?**" + ] + }, + { + "cell_type": "code", + "metadata": { + "collapsed": true, + "id": "K6C2qZ_zutTb", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "8bcc8f69-d458-4da3-ff4a-7f49f1c757b7" + }, + "source": [ + "sr_age_female = data[data[\"sex\"] == \"Female\"]['age'].mean()\n", + "print(\"Средний возраст женщин в данном наборе:\", sr_age_female)" + ], + "execution_count": 10, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Средний возраст женщин в данном наборе: 36.85823043357163\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "-Cz1S7-HutTd" + }, + "source": [ + "**3. 
Какова доля граждан Германии (признак *native-country*)?**" + ] + }, + { + "cell_type": "code", + "metadata": { + "collapsed": true, + "id": "Y4mmqN6outTf", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "83b7b594-a19d-45a8-a0f5-911fc05ff28f" + }, + "source": [ + "fraction_people = data.groupby(\"native-country\").size() / len(data)\n", + "print(\"Доля граждан Германии\", fraction_people[\"Germany\"])" + ], + "execution_count": 13, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Доля граждан Германии 0.004207487485028101\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Do-rEgaautTg" + }, + "source": [ + "**4-5. Каковы средние значения и среднеквадратичные отклонения возраста тех, кто получает более 50K в год (признак *salary*) и тех, кто получает менее 50K в год?**" + ] + }, + { + "cell_type": "code", + "metadata": { + "collapsed": true, + "id": "eSuk0CAnutTh", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "7bde0415-eb3a-4492-cc70-0b68b3b03772" + }, + "source": [ + "print(data.groupby('salary')['age'].mean())\n", + "print(data.groupby('salary')['age'].std())" + ], + "execution_count": 14, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "salary\n", + "<=50K 36.783738\n", + ">50K 44.249841\n", + "Name: age, dtype: float64\n", + "salary\n", + "<=50K 14.020088\n", + ">50K 10.519028\n", + "Name: age, dtype: float64\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "rK9SwvI_utTj" + }, + "source": [ + "**6. Правда ли, что люди, которые получают больше 50k, имеют как минимум высшее образование? 
# %% Question 6: does everyone earning >50K hold at least a higher-education degree?
# Education levels that the task statement counts as higher education.
higher_education = ['Bachelors', 'Prof-school', 'Assoc-acdm', 'Assoc-voc', 'Masters', 'Doctorate']

# Start from the people the question is about (salary > 50K) and require that
# every one of them has a listed degree. Filtering by education first and
# counting '<=50K' rows would answer the converse question ("do all graduates
# earn a lot?"), and indexing value_counts() with '<=50K' raises KeyError when
# that group happens to be empty.
high_earner_education = data.loc[data["salary"] == ">50K", "education"]
true_or_false = bool(high_earner_education.isin(higher_education).all())
print("Правда ли, что люди, которые получают больше 50k, имеют как минимум высшее образование -", true_or_false)
# %% Question 7: descriptive statistics of `age` for every (race, sex) group.
# Select `age` before describe() so only the requested column is summarised
# instead of every numeric column in the frame.
table = data.groupby(["race", "sex"])["age"].describe()
print(table)

# %% Question 7 (cont.): maximum age of men in the Amer-Indian-Eskimo group.
# Reuse the summary computed above; a single tuple lookup on the
# (race, sex) index replaces recomputing describe() and chaining .loc calls.
max_age_eskimo_men = table.loc[("Amer-Indian-Eskimo", "Male"), "max"]
print("Максимальный возраст мужчин расы Amer-Indian-Eskimo:", max_age_eskimo_men, "года.")

# %% Question 8: is the share of >50K earners larger among married or single men?
men = data[data["sex"] == "Male"]
# Per the task statement, "married" is any status starting with "Married";
# na=False treats a missing status as not married.
is_married = men["marital-status"].str.startswith("Married", na=False)
earns_much = men["salary"] == ">50K"

# Compare shares inside each group. Comparing the raw number of high earners
# (married vs. single) would depend on group sizes rather than on the share
# the question asks about.
rich_share_married = earns_much[is_married].mean()
rich_share_single = earns_much[~is_married].mean()
print("Доля зарабатывающих больше 50к больше среди женатых, чем холостых: ", bool(rich_share_married > rich_share_single))
# %% Question 9: maximum weekly hours, how many people work that much,
# and what percentage of them earn >50K.
# .max() returns the value directly and keeps the column's own dtype;
# describe()['max'] builds a whole summary and always yields a float.
max_hours_work_in_week = data['hours-per-week'].max()
people_work = data[data["hours-per-week"] == max_hours_work_in_week]
# Mean of a boolean mask is the fraction of True values.
rich_percent = (people_work['salary'] == '>50K').mean() * 100
print("Максимальное число часов, которое человек работает в неделю:", max_hours_work_in_week)
print("Количество людей, работающих такое же количество часов:", people_work.shape[0])
print("Процент людей зарабатывающих много:", round(rich_percent, 1), "%")

# %% Question 10: mean `hours-per-week` per country for each salary group.
print("Среднее время работы зарабатывающих мало и много для каждой страны")
# One row per country and one column per salary group; a country that lacks
# one of the groups shows NaN in that column.
mean_hours_by_country = (
    data.groupby(['native-country', 'salary'])['hours-per-week']
    .mean()
    .unstack('salary')
)
# Lift the row limit for this print only, so every country is shown instead of
# the default truncated view with "...".
with pd.option_context('display.max_rows', None):
    print(mean_hours_by_country)
<=50K 40.164760\n", + " >50K 45.547945\n", + "Cambodia <=50K 41.416667\n", + " >50K 40.000000\n", + "Canada <=50K 37.914634\n", + " ... \n", + "United-States >50K 45.505369\n", + "Vietnam <=50K 37.193548\n", + " >50K 39.200000\n", + "Yugoslavia <=50K 41.600000\n", + " >50K 49.500000\n", + "Name: hours-per-week, Length: 82, dtype: float64\n" + ] + } + ] + } + ] +} \ No newline at end of file