From 292d641693a787e2eea8ee2664b88729164417fc Mon Sep 17 00:00:00 2001
From: alinaskukina <75088548+alinaskukina@users.noreply.github.com>
Date: Sat, 16 Apr 2022 17:22:16 +0500
Subject: [PATCH 1/2] =?UTF-8?q?=D0=A1=D0=BA=D1=83=D0=BA=D0=B8=D0=BD=D0=B0?=
=?UTF-8?q?=20pandas2?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
.../Pandas_and_EDA_Task.ipynb | 781 ++++++++++++++++++
1 file changed, 781 insertions(+)
create mode 100644 Pandas and EDA (12.03)/Pandas_and_EDA_Task.ipynb
diff --git a/Pandas and EDA (12.03)/Pandas_and_EDA_Task.ipynb b/Pandas and EDA (12.03)/Pandas_and_EDA_Task.ipynb
new file mode 100644
index 0000000..efc7486
--- /dev/null
+++ b/Pandas and EDA (12.03)/Pandas_and_EDA_Task.ipynb
@@ -0,0 +1,781 @@
+{
+ "nbformat": 4,
+ "nbformat_minor": 0,
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.6.8"
+ },
+ "colab": {
+ "name": "Pandas and EDA. Task.ipynb",
+ "provenance": [],
+ "collapsed_sections": []
+ }
+ },
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "EmV0s8YY05p7"
+ },
+ "source": [
+ "- __ID__ - Unique number for each athlete\n",
+ "- __Name__ - Athlete's name\n",
+ "- __Sex__ - M or F\n",
+ "- __Age__ - Integer\n",
+ "- __Height__ - In centimeters\n",
+ "- __Weight__ - In kilograms\n",
+ "- __Team__ - Team name\n",
+ "- __NOC__ - National Olympic Committee 3-letter code\n",
+ "- __Games__ - Year and season\n",
+ "- __Year__ - Integer\n",
+ "- __Season__ - Summer or Winter\n",
+ "- __City__ - Host city\n",
+ "- __Sport__ - Sport\n",
+ "- __Event__ - Event\n",
+ "- __Medal__ - Gold, Silver, Bronze, or NA"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "rVCrMDMh05p_"
+ },
+ "source": [
+ "import pandas as pd\n",
+ "import numpy as np"
+ ],
+ "execution_count": 1,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "D5Q4Z-JW05qC"
+ },
+ "source": [
+ "PATH = 'https://github.com/aksenov7/Kaggle_competition_group/blob/master/athlete_events.csv.zip?raw=true'"
+ ],
+ "execution_count": 2,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "mI0LtqkY4Kp-"
+ },
+ "source": [
+ "__0. Откройте файл используя необходимые параметры и не меняя переменную PATH__"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "h5SQwBLr05qG",
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 206
+ },
+ "outputId": "4a67cced-8c5b-4783-98e5-0730370962d2"
+ },
+ "source": [
+ "data = pd.read_csv(PATH, compression='zip', sep=',')\n",
+ "data.head()"
+ ],
+ "execution_count": 6,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " ID Name Sex Age Height Weight Team \\\n",
+ "0 1 A Dijiang M 24.0 180.0 80.0 China \n",
+ "1 2 A Lamusi M 23.0 170.0 60.0 China \n",
+ "2 3 Gunnar Nielsen Aaby M 24.0 NaN NaN Denmark \n",
+ "3 4 Edgar Lindenau Aabye M 34.0 NaN NaN Denmark/Sweden \n",
+ "4 5 Christine Jacoba Aaftink F 21.0 185.0 82.0 Netherlands \n",
+ "\n",
+ " NOC Games Year Season City Sport \\\n",
+ "0 CHN 1992 Summer 1992 Summer Barcelona Basketball \n",
+ "1 CHN 2012 Summer 2012 Summer London Judo \n",
+ "2 DEN 1920 Summer 1920 Summer Antwerpen Football \n",
+ "3 DEN 1900 Summer 1900 Summer Paris Tug-Of-War \n",
+ "4 NED 1988 Winter 1988 Winter Calgary Speed Skating \n",
+ "\n",
+ " Event Medal \n",
+ "0 Basketball Men's Basketball NaN \n",
+ "1 Judo Men's Extra-Lightweight NaN \n",
+ "2 Football Men's Football NaN \n",
+ "3 Tug-Of-War Men's Tug-Of-War Gold \n",
+ "4 Speed Skating Women's 500 metres NaN "
+ ],
+ "text/html": [
+ "\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " ID | \n",
+ " Name | \n",
+ " Sex | \n",
+ " Age | \n",
+ " Height | \n",
+ " Weight | \n",
+ " Team | \n",
+ " NOC | \n",
+ " Games | \n",
+ " Year | \n",
+ " Season | \n",
+ " City | \n",
+ " Sport | \n",
+ " Event | \n",
+ " Medal | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 1 | \n",
+ " A Dijiang | \n",
+ " M | \n",
+ " 24.0 | \n",
+ " 180.0 | \n",
+ " 80.0 | \n",
+ " China | \n",
+ " CHN | \n",
+ " 1992 Summer | \n",
+ " 1992 | \n",
+ " Summer | \n",
+ " Barcelona | \n",
+ " Basketball | \n",
+ " Basketball Men's Basketball | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 2 | \n",
+ " A Lamusi | \n",
+ " M | \n",
+ " 23.0 | \n",
+ " 170.0 | \n",
+ " 60.0 | \n",
+ " China | \n",
+ " CHN | \n",
+ " 2012 Summer | \n",
+ " 2012 | \n",
+ " Summer | \n",
+ " London | \n",
+ " Judo | \n",
+ " Judo Men's Extra-Lightweight | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 3 | \n",
+ " Gunnar Nielsen Aaby | \n",
+ " M | \n",
+ " 24.0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " Denmark | \n",
+ " DEN | \n",
+ " 1920 Summer | \n",
+ " 1920 | \n",
+ " Summer | \n",
+ " Antwerpen | \n",
+ " Football | \n",
+ " Football Men's Football | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 4 | \n",
+ " Edgar Lindenau Aabye | \n",
+ " M | \n",
+ " 34.0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " Denmark/Sweden | \n",
+ " DEN | \n",
+ " 1900 Summer | \n",
+ " 1900 | \n",
+ " Summer | \n",
+ " Paris | \n",
+ " Tug-Of-War | \n",
+ " Tug-Of-War Men's Tug-Of-War | \n",
+ " Gold | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 5 | \n",
+ " Christine Jacoba Aaftink | \n",
+ " F | \n",
+ " 21.0 | \n",
+ " 185.0 | \n",
+ " 82.0 | \n",
+ " Netherlands | \n",
+ " NED | \n",
+ " 1988 Winter | \n",
+ " 1988 | \n",
+ " Winter | \n",
+ " Calgary | \n",
+ " Speed Skating | \n",
+ " Speed Skating Women's 500 metres | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ " \n",
+ "\n",
+ " \n",
+ "
\n",
+ "
\n",
+ " "
+ ]
+ },
+ "metadata": {},
+ "execution_count": 6
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "stYR4EbV05qP"
+ },
+ "source": [
+ "__1. Сколько лет было самым молодым мужчинам и женщинам-участникам Олимпийских игр 1992 года?__\n",
+ "- 16 и 15\n",
+ "- 14 и 13 \n",
+ "- 13 и 11\n",
+ "- 11 и 12"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "HgiqBXtb05qR",
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "outputId": "d7992f8a-9376-4041-ae7d-a36848fc03ff"
+ },
+ "source": [
+ "# Minimum age per sex among all entries of the 1992 Games.\n",
+ "data.loc[data['Year'] == 1992].groupby('Sex')['Age'].min()"
+ ],
+ "execution_count": 7,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "Sex\n",
+ "F 12.0\n",
+ "M 11.0\n",
+ "Name: Age, dtype: float64"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 7
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "Самым молодым мужчинам и женщинам-участникам Олимпийских игр 1992 года было 11 и 12 лет соответственно"
+ ],
+ "metadata": {
+ "id": "6l-jg7yc9nJM"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "GQ290dsi05qc"
+ },
+ "source": [
+ "__2. Каков был процент баскетболистов-мужчин среди всех мужчин-участников Олимпийских игр 2012 года? Округлите ответ до первого десятичного знака.__\n",
+ "\n",
+ "Здесь и далее при необходимости отбрасывайте дублированных спортсменов, чтобы считать только уникальных.\n",
+ "- 0.2\n",
+ "- 1.5 \n",
+ "- 2.5\n",
+ "- 7.7"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "-fI5MqWP05qi",
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "outputId": "96c87bfe-4067-41f5-cd35-5339e7592e47"
+ },
+ "source": [
+ "# One row per athlete ID, so a man entered in several 2012 events is counted once.\n",
+ "men_2012 = data.loc[(data['Year'] == 2012) & (data['Sex'] == 'M')].drop_duplicates(subset='ID')\n",
+ "sport_shares = men_2012['Sport'].value_counts(normalize=True)\n",
+ "np.round(sport_shares['Basketball'] * 100, 1)"
+ ],
+ "execution_count": 8,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "2.5"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 8
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "Баскетболисты составляли 2.5% от всех мужчин-участников Олимпийских игр 2012 года"
+ ],
+ "metadata": {
+ "id": "BKw79NcoNRJ5"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "u5WrTgIC05qv"
+ },
+ "source": [
+ "__3. Каковы среднее и стандартное отклонение роста теннисисток, участвовавших в Олимпийских играх 2000 года? Округлите ответ до первого десятичного знака.__\n",
+ "\n",
+ "- 171.8 и 6.5\n",
+ "- 179.4 и 10\n",
+ "- 180.7 и 6.7\n",
+ "- 182.4 и 9.1 "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "vsKTqn6405qw",
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "outputId": "d264e669-d2af-4e46-a290-4235f201f50b"
+ },
+ "source": [
+ "# Heights of the female tennis entries at the 2000 Games (rows are not de-duplicated).\n",
+ "tennis_heights = data.loc[(data['Sport'] == 'Tennis') & (data['Sex'] == 'F') & (data['Year'] == 2000), 'Height']\n",
+ "res_1 = round(tennis_heights.mean(), 1)\n",
+ "res_2 = round(tennis_heights.std(), 1)\n",
+ "print(\"Cреднее и стандартное отклонение роста теннисисток, участвовавших в Олимпийских играх 2000 года:\", res_1, \"и\", res_2)"
+ ],
+ "execution_count": 37,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Cреднее и стандартное отклонение роста теннисисток, участвовавших в Олимпийских играх 2000 года: 171.8 и 6.5\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "xOOEzhNQ05qy"
+ },
+ "source": [
+ "__4. Найдите спортсмена, который участвовал в Олимпийских играх 2006 года, с наибольшим весом среди других участников той же Олимпиады. Каким спортом он или она занимался?__\n",
+ "\n",
+ "- Judo\n",
+ "- Bobsleigh \n",
+ "- Skeleton\n",
+ "- Boxing"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "EkWD1Tnb05qz",
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "outputId": "5f219ce7-f1d8-49c0-f64a-a9e33e1a67a0"
+ },
+ "source": [
+ "# Filter the 2006 Games once, then keep every row tied for the maximum weight.\n",
+ "games_2006 = data[data['Year'] == 2006]\n",
+ "heaviest = games_2006[games_2006['Weight'] == games_2006['Weight'].max()]\n",
+ "print(heaviest['Sport'])"
+ ],
+ "execution_count": 12,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "8102 Skeleton\n",
+ "Name: Sport, dtype: object\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "Спортсмен с наибольшим весом, участвовавший в Олимпийских играх 2006 года, занимался Skeleton."
+ ],
+ "metadata": {
+ "id": "6DE5va9-NXk9"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "UQzxZ3HT05q0"
+ },
+ "source": [
+ "__5. Сколько раз John Aalberg участвовал в Олимпийских играх в разные годы?__\n",
+ "\n",
+ "Один год - это один раз. Неважно сколько участий внутри одного года\n",
+ "- 0\n",
+ "- 1 \n",
+ "- 2\n",
+ "- 3 "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "ZSfkdjPO05q0",
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "outputId": "60cd0455-8924-4a76-f9db-056e65e4c32d"
+ },
+ "source": [
+ "# Each distinct year counts once, however many events he entered that year.\n",
+ "count_name = data.loc[data['Name'] == 'John Aalberg', 'Year'].nunique()\n",
+ "print(\"John Aalberg участвовал в Олимпийских играх\", count_name, \"раза\")"
+ ],
+ "execution_count": 17,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "John Aalberg участвовал в Олимпийских играх 2 раза\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "8EnLcNrk05q3"
+ },
+ "source": [
+ "__6. Сколько золотых медалей по теннису выиграли спортсмены сборной Switzerland на Олимпиаде-2008? Считайте каждую медаль от каждого спортсмена.__\n",
+ "\n",
+ "- 0\n",
+ "- 1 \n",
+ "- 2\n",
+ "- 3 "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "Y754OGI-05q3",
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "outputId": "a10310b6-05e8-4f00-f9dc-75d112dd7a34"
+ },
+ "source": [
+ "# Every matching row is one athlete's gold medal, so the number of True values is the medal count.\n",
+ "swiss_tennis_gold = (\n",
+ "    (data['Team'] == 'Switzerland') & (data['Sport'] == 'Tennis')\n",
+ "    & (data['Year'] == 2008) & (data['Medal'] == 'Gold')\n",
+ ")\n",
+ "count_medal = swiss_tennis_gold.sum()\n",
+ "print(\"Спортсмены по теннису сборной Switzerland на Олимпиаде-2008 выиграли\", count_medal, \"медали\")"
+ ],
+ "execution_count": 19,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Спортсмены по теннису сборной Switzerland на Олимпиаде-2008 выиграли 2 медали\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "v3h5sQF805q5"
+ },
+ "source": [
+ "__7. Правда ли, что на Олимпийских играх 2016 Spain выиграла меньше медалей, чем Италия?__ \n",
+ "\n",
+ "- Да\n",
+ "- Нет"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "gqJqDi2605q7",
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "outputId": "55be8cf3-34d5-4936-d939-7551724cf0f8"
+ },
+ "source": [
+ "# Count the non-null Medal rows of each team at the 2016 Games.\n",
+ "games_2016 = data[data['Year'] == 2016]\n",
+ "italy_medals = games_2016.loc[games_2016['Team'] == 'Italy', 'Medal'].count()\n",
+ "spain_medals = games_2016.loc[games_2016['Team'] == 'Spain', 'Medal'].count()\n",
+ "print('Правда' if spain_medals < italy_medals else 'Не правда')"
+ ],
+ "execution_count": 20,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Правда\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "kkSYL5mK05q-"
+ },
+ "source": [
+ "__8. К какой возрастной категории принадлежало наименьшее и наибольшее количество участников Олимпиады-2008?__\n",
+ "\n",
+ "- [45-55] и [25-35) соответственно\n",
+ "- [45-55] и [15-25) соответственно\n",
+ "- [35-45) и [25-35) соответственно\n",
+ "- [45-55] и [35-45) соответственно"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "pMAQtW7i05q_",
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "outputId": "fae0e8ba-cc03-41a8-929c-b02564ab6a64"
+ },
+ "source": [
+ "# Ages of all 2008 entries; a missing age fails every comparison below and is not counted.\n",
+ "ages_2008 = data.loc[data['Year'] == 2008, 'Age']\n",
+ "age_15_25 = ((ages_2008 >= 15) & (ages_2008 < 25)).sum()\n",
+ "age_25_35 = ((ages_2008 >= 25) & (ages_2008 < 35)).sum()\n",
+ "age_35_45 = ((ages_2008 >= 35) & (ages_2008 < 45)).sum()\n",
+ "age_45_55 = ((ages_2008 >= 45) & (ages_2008 <= 55)).sum()\n",
+ "# Label -> size, so the smallest and largest groups can be looked up instead of hard-coded.\n",
+ "group_sizes = pd.Series({'[15-25)': age_15_25, '[25-35)': age_25_35, '[35-45)': age_35_45, '[45-55]': age_45_55})\n",
+ "print(f'[45-55] и [25-35): {age_45_55} и {age_25_35} соответственно\\n')\n",
+ "print(f'[45-55] и [15-25): {age_45_55} и {age_15_25} соответственно\\n')\n",
+ "print(f'[35-45) и [25-35): {age_35_45} и {age_25_35} соответственно\\n')\n",
+ "print(f'[45-55] и [35-45): {age_45_55} и {age_35_45} соответственно\\n')\n",
+ "print(f'Ответ: {group_sizes.idxmin()} и {group_sizes.idxmax()} соответственно')"
+ ],
+ "execution_count": 30,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "[45-55] и [25-35): 119 и 6367 соответственно\n",
+ "\n",
+ "[45-55] и [15-25): 119 и 6294 соответственно\n",
+ "\n",
+ "[35-45) и [25-35): 790 и 6367 соответственно\n",
+ "\n",
+ "[45-55] и [35-45): 119 и 790 соответственно\n",
+ "\n",
+ "Ответ: [45-55] и [25-35) соответственно\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "JQmJPiXv05rB"
+ },
+ "source": [
+ "__9. Правда ли, что в Atlanta проводились летние Олимпийские игры? Правда ли, что в Squaw Valley проводились зимние Олимпийские игры?__\n",
+ "\n",
+ "- Да, Да\n",
+ "- Да, Нет\n",
+ "- Нет, Да \n",
+ "- Нет, Нет "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "UU66wRHC05rB",
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "outputId": "9ad04b54-2daf-4c26-9044-73c24d9c1bad"
+ },
+ "source": [
+ "# The answer is yes when at least one entry has the given city/season pair.\n",
+ "atlanta_summer = ((data['City'] == 'Atlanta') & (data['Season'] == 'Summer')).any()\n",
+ "squaw_valley_winter = ((data['City'] == 'Squaw Valley') & (data['Season'] == 'Winter')).any()\n",
+ "print('Да' if atlanta_summer else 'Нет', end=', ')\n",
+ "print('Да' if squaw_valley_winter else 'Нет')"
+ ],
+ "execution_count": 31,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Да, Да\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "4hxR5D-t05rF"
+ },
+ "source": [
+ "__10. Какова абсолютная разница между количеством уникальных видов спорта на Олимпиаде 1986 года и Олимпиаде 2002 года?__\n",
+ "\n",
+ "- 3 \n",
+ "- 10\n",
+ "- 15\n",
+ "- 27 "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "WKIr-TR105rF",
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "outputId": "25430833-b5b0-48c2-c05e-d552b4af29d4"
+ },
+ "source": [
+ "# A year with no rows in the data yields 0 unique sports.\n",
+ "sports_1986, sports_2002 = (data.loc[data['Year'] == year, 'Sport'].nunique() for year in (1986, 2002))\n",
+ "abs_difference = abs(sports_1986 - sports_2002)\n",
+ "print(\"Абсолютная разница между количеством уникальных видов спорта на Олимпиаде 1986 года и Олимпиаде 2002 года - \", abs_difference)"
+ ],
+ "execution_count": 33,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Абсолютная разница между количеством уникальных видов спорта на Олимпиаде 1986 года и Олимпиаде 2002 года - 15\n"
+ ]
+ }
+ ]
+ }
+ ]
+}
\ No newline at end of file
From ee08e4fceb7ad3e2f5aaa5fc62addac17bad49f8 Mon Sep 17 00:00:00 2001
From: alinaskukina <75088548+alinaskukina@users.noreply.github.com>
Date: Sun, 17 Apr 2022 15:47:45 +0500
Subject: [PATCH 2/2] =?UTF-8?q?=D0=A1=D0=BA=D1=83=D0=BA=D0=B8=D0=BD=D0=B0?=
=?UTF-8?q?=20pandas1?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
Pandas_Task_Part_1.ipynb | 1149 ++++++++++++++++++++++++++++++++++++++
1 file changed, 1149 insertions(+)
create mode 100644 Pandas_Task_Part_1.ipynb
diff --git a/Pandas_Task_Part_1.ipynb b/Pandas_Task_Part_1.ipynb
new file mode 100644
index 0000000..b870411
--- /dev/null
+++ b/Pandas_Task_Part_1.ipynb
@@ -0,0 +1,1149 @@
+{
+ "nbformat": 4,
+ "nbformat_minor": 0,
+ "metadata": {
+ "anaconda-cloud": {},
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.7.6"
+ },
+ "colab": {
+ "name": "Pandas. Task. Part 1.ipynb",
+ "provenance": [],
+ "collapsed_sections": []
+ }
+ },
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "UTKVH3sMutTM"
+ },
+ "source": [
+ "**В задании предлагается с помощью Pandas ответить на несколько вопросов по данным репозитория UCI [Adult](https://archive.ics.uci.edu/ml/datasets/Adult)**"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "3lUT-CqYutTO"
+ },
+ "source": [
+ "Уникальные значения признаков (больше информации по ссылке выше):\n",
+ "- age: continuous.\n",
+ "- workclass: Private, Self-emp-not-inc, Self-emp-inc, Federal-gov, Local-gov, State-gov, Without-pay, Never-worked.\n",
+ "- fnlwgt: continuous.\n",
+ "- education: Bachelors, Some-college, 11th, HS-grad, Prof-school, Assoc-acdm, Assoc-voc, 9th, 7th-8th, 12th, Masters, 1st-4th, 10th, Doctorate, 5th-6th, Preschool.\n",
+ "- education-num: continuous.\n",
+ "- marital-status: Married-civ-spouse, Divorced, Never-married, Separated, Widowed, Married-spouse-absent, Married-AF-spouse.\n",
+ "- occupation: Tech-support, Craft-repair, Other-service, Sales, Exec-managerial, Prof-specialty, Handlers-cleaners, Machine-op-inspct, Adm-clerical, Farming-fishing, Transport-moving, Priv-house-serv, Protective-serv, Armed-Forces.\n",
+ "- relationship: Wife, Own-child, Husband, Not-in-family, Other-relative, Unmarried.\n",
+ "- race: White, Asian-Pac-Islander, Amer-Indian-Eskimo, Other, Black.\n",
+ "- sex: Female, Male.\n",
+ "- capital-gain: continuous.\n",
+ "- capital-loss: continuous.\n",
+ "- hours-per-week: continuous.\n",
+ "- native-country: United-States, Cambodia, England, Puerto-Rico, Canada, Germany, Outlying-US(Guam-USVI-etc), India, Japan, Greece, South, China, Cuba, Iran, Honduras, Philippines, Italy, Poland, Jamaica, Vietnam, Mexico, Portugal, Ireland, France, Dominican-Republic, Laos, Ecuador, Taiwan, Haiti, Columbia, Hungary, Guatemala, Nicaragua, Scotland, Thailand, Yugoslavia, El-Salvador, Trinadad&Tobago, Peru, Hong, Holand-Netherlands. \n",
+ "- salary: >50K,<=50K"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "6GzulHvOutTR"
+ },
+ "source": [
+ "import pandas as pd"
+ ],
+ "execution_count": 2,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "SJ3LbaoiutTT",
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 444
+ },
+ "outputId": "d8361b12-744c-4fdb-ccd2-a737c457a8ba"
+ },
+ "source": [
+ "data = pd.read_csv(\"https://raw.githubusercontent.com/aksenov7/Kaggle_competition_group/master/adult.data.csv\")\n",
+ "data.head()"
+ ],
+ "execution_count": 3,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " age workclass fnlwgt education education-num \\\n",
+ "0 39 State-gov 77516 Bachelors 13 \n",
+ "1 50 Self-emp-not-inc 83311 Bachelors 13 \n",
+ "2 38 Private 215646 HS-grad 9 \n",
+ "3 53 Private 234721 11th 7 \n",
+ "4 28 Private 338409 Bachelors 13 \n",
+ "\n",
+ " marital-status occupation relationship race sex \\\n",
+ "0 Never-married Adm-clerical Not-in-family White Male \n",
+ "1 Married-civ-spouse Exec-managerial Husband White Male \n",
+ "2 Divorced Handlers-cleaners Not-in-family White Male \n",
+ "3 Married-civ-spouse Handlers-cleaners Husband Black Male \n",
+ "4 Married-civ-spouse Prof-specialty Wife Black Female \n",
+ "\n",
+ " capital-gain capital-loss hours-per-week native-country salary \n",
+ "0 2174 0 40 United-States <=50K \n",
+ "1 0 0 13 United-States <=50K \n",
+ "2 0 0 40 United-States <=50K \n",
+ "3 0 0 40 United-States <=50K \n",
+ "4 0 0 40 Cuba <=50K "
+ ],
+ "text/html": [
+ "\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " age | \n",
+ " workclass | \n",
+ " fnlwgt | \n",
+ " education | \n",
+ " education-num | \n",
+ " marital-status | \n",
+ " occupation | \n",
+ " relationship | \n",
+ " race | \n",
+ " sex | \n",
+ " capital-gain | \n",
+ " capital-loss | \n",
+ " hours-per-week | \n",
+ " native-country | \n",
+ " salary | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 39 | \n",
+ " State-gov | \n",
+ " 77516 | \n",
+ " Bachelors | \n",
+ " 13 | \n",
+ " Never-married | \n",
+ " Adm-clerical | \n",
+ " Not-in-family | \n",
+ " White | \n",
+ " Male | \n",
+ " 2174 | \n",
+ " 0 | \n",
+ " 40 | \n",
+ " United-States | \n",
+ " <=50K | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 50 | \n",
+ " Self-emp-not-inc | \n",
+ " 83311 | \n",
+ " Bachelors | \n",
+ " 13 | \n",
+ " Married-civ-spouse | \n",
+ " Exec-managerial | \n",
+ " Husband | \n",
+ " White | \n",
+ " Male | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 13 | \n",
+ " United-States | \n",
+ " <=50K | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 38 | \n",
+ " Private | \n",
+ " 215646 | \n",
+ " HS-grad | \n",
+ " 9 | \n",
+ " Divorced | \n",
+ " Handlers-cleaners | \n",
+ " Not-in-family | \n",
+ " White | \n",
+ " Male | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 40 | \n",
+ " United-States | \n",
+ " <=50K | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 53 | \n",
+ " Private | \n",
+ " 234721 | \n",
+ " 11th | \n",
+ " 7 | \n",
+ " Married-civ-spouse | \n",
+ " Handlers-cleaners | \n",
+ " Husband | \n",
+ " Black | \n",
+ " Male | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 40 | \n",
+ " United-States | \n",
+ " <=50K | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 28 | \n",
+ " Private | \n",
+ " 338409 | \n",
+ " Bachelors | \n",
+ " 13 | \n",
+ " Married-civ-spouse | \n",
+ " Prof-specialty | \n",
+ " Wife | \n",
+ " Black | \n",
+ " Female | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 40 | \n",
+ " Cuba | \n",
+ " <=50K | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ " \n",
+ "\n",
+ " \n",
+ "
\n",
+ "
\n",
+ " "
+ ]
+ },
+ "metadata": {},
+ "execution_count": 3
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "EpQFv8t1ds05"
+ },
+ "source": [
+ "# Flag rows whose marital status contains the literal \"Married\" (case-sensitive, so \"Never-married\" stays False).\n",
+ "data[\"married\"] = data[\"marital-status\"].str.contains(\"Married\", regex=False)"
+ ],
+ "execution_count": 4,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 800
+ },
+ "id": "3Bb2mRTEeoJK",
+ "outputId": "5b83d50d-e4c2-47de-bce7-3259503af14d"
+ },
+ "source": [
+ "data"
+ ],
+ "execution_count": 5,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " age workclass fnlwgt education education-num \\\n",
+ "0 39 State-gov 77516 Bachelors 13 \n",
+ "1 50 Self-emp-not-inc 83311 Bachelors 13 \n",
+ "2 38 Private 215646 HS-grad 9 \n",
+ "3 53 Private 234721 11th 7 \n",
+ "4 28 Private 338409 Bachelors 13 \n",
+ "... ... ... ... ... ... \n",
+ "32556 27 Private 257302 Assoc-acdm 12 \n",
+ "32557 40 Private 154374 HS-grad 9 \n",
+ "32558 58 Private 151910 HS-grad 9 \n",
+ "32559 22 Private 201490 HS-grad 9 \n",
+ "32560 52 Self-emp-inc 287927 HS-grad 9 \n",
+ "\n",
+ " marital-status occupation relationship race sex \\\n",
+ "0 Never-married Adm-clerical Not-in-family White Male \n",
+ "1 Married-civ-spouse Exec-managerial Husband White Male \n",
+ "2 Divorced Handlers-cleaners Not-in-family White Male \n",
+ "3 Married-civ-spouse Handlers-cleaners Husband Black Male \n",
+ "4 Married-civ-spouse Prof-specialty Wife Black Female \n",
+ "... ... ... ... ... ... \n",
+ "32556 Married-civ-spouse Tech-support Wife White Female \n",
+ "32557 Married-civ-spouse Machine-op-inspct Husband White Male \n",
+ "32558 Widowed Adm-clerical Unmarried White Female \n",
+ "32559 Never-married Adm-clerical Own-child White Male \n",
+ "32560 Married-civ-spouse Exec-managerial Wife White Female \n",
+ "\n",
+ " capital-gain capital-loss hours-per-week native-country salary \\\n",
+ "0 2174 0 40 United-States <=50K \n",
+ "1 0 0 13 United-States <=50K \n",
+ "2 0 0 40 United-States <=50K \n",
+ "3 0 0 40 United-States <=50K \n",
+ "4 0 0 40 Cuba <=50K \n",
+ "... ... ... ... ... ... \n",
+ "32556 0 0 38 United-States <=50K \n",
+ "32557 0 0 40 United-States >50K \n",
+ "32558 0 0 40 United-States <=50K \n",
+ "32559 0 0 20 United-States <=50K \n",
+ "32560 15024 0 40 United-States >50K \n",
+ "\n",
+ " married \n",
+ "0 False \n",
+ "1 True \n",
+ "2 False \n",
+ "3 True \n",
+ "4 True \n",
+ "... ... \n",
+ "32556 True \n",
+ "32557 True \n",
+ "32558 False \n",
+ "32559 False \n",
+ "32560 True \n",
+ "\n",
+ "[32561 rows x 16 columns]"
+ ],
+ "text/html": [
+ "\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " age | \n",
+ " workclass | \n",
+ " fnlwgt | \n",
+ " education | \n",
+ " education-num | \n",
+ " marital-status | \n",
+ " occupation | \n",
+ " relationship | \n",
+ " race | \n",
+ " sex | \n",
+ " capital-gain | \n",
+ " capital-loss | \n",
+ " hours-per-week | \n",
+ " native-country | \n",
+ " salary | \n",
+ " married | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 39 | \n",
+ " State-gov | \n",
+ " 77516 | \n",
+ " Bachelors | \n",
+ " 13 | \n",
+ " Never-married | \n",
+ " Adm-clerical | \n",
+ " Not-in-family | \n",
+ " White | \n",
+ " Male | \n",
+ " 2174 | \n",
+ " 0 | \n",
+ " 40 | \n",
+ " United-States | \n",
+ " <=50K | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 50 | \n",
+ " Self-emp-not-inc | \n",
+ " 83311 | \n",
+ " Bachelors | \n",
+ " 13 | \n",
+ " Married-civ-spouse | \n",
+ " Exec-managerial | \n",
+ " Husband | \n",
+ " White | \n",
+ " Male | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 13 | \n",
+ " United-States | \n",
+ " <=50K | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 38 | \n",
+ " Private | \n",
+ " 215646 | \n",
+ " HS-grad | \n",
+ " 9 | \n",
+ " Divorced | \n",
+ " Handlers-cleaners | \n",
+ " Not-in-family | \n",
+ " White | \n",
+ " Male | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 40 | \n",
+ " United-States | \n",
+ " <=50K | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 53 | \n",
+ " Private | \n",
+ " 234721 | \n",
+ " 11th | \n",
+ " 7 | \n",
+ " Married-civ-spouse | \n",
+ " Handlers-cleaners | \n",
+ " Husband | \n",
+ " Black | \n",
+ " Male | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 40 | \n",
+ " United-States | \n",
+ " <=50K | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 28 | \n",
+ " Private | \n",
+ " 338409 | \n",
+ " Bachelors | \n",
+ " 13 | \n",
+ " Married-civ-spouse | \n",
+ " Prof-specialty | \n",
+ " Wife | \n",
+ " Black | \n",
+ " Female | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 40 | \n",
+ " Cuba | \n",
+ " <=50K | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 32556 | \n",
+ " 27 | \n",
+ " Private | \n",
+ " 257302 | \n",
+ " Assoc-acdm | \n",
+ " 12 | \n",
+ " Married-civ-spouse | \n",
+ " Tech-support | \n",
+ " Wife | \n",
+ " White | \n",
+ " Female | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 38 | \n",
+ " United-States | \n",
+ " <=50K | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " 32557 | \n",
+ " 40 | \n",
+ " Private | \n",
+ " 154374 | \n",
+ " HS-grad | \n",
+ " 9 | \n",
+ " Married-civ-spouse | \n",
+ " Machine-op-inspct | \n",
+ " Husband | \n",
+ " White | \n",
+ " Male | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 40 | \n",
+ " United-States | \n",
+ " >50K | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " 32558 | \n",
+ " 58 | \n",
+ " Private | \n",
+ " 151910 | \n",
+ " HS-grad | \n",
+ " 9 | \n",
+ " Widowed | \n",
+ " Adm-clerical | \n",
+ " Unmarried | \n",
+ " White | \n",
+ " Female | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 40 | \n",
+ " United-States | \n",
+ " <=50K | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 32559 | \n",
+ " 22 | \n",
+ " Private | \n",
+ " 201490 | \n",
+ " HS-grad | \n",
+ " 9 | \n",
+ " Never-married | \n",
+ " Adm-clerical | \n",
+ " Own-child | \n",
+ " White | \n",
+ " Male | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 20 | \n",
+ " United-States | \n",
+ " <=50K | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 32560 | \n",
+ " 52 | \n",
+ " Self-emp-inc | \n",
+ " 287927 | \n",
+ " HS-grad | \n",
+ " 9 | \n",
+ " Married-civ-spouse | \n",
+ " Exec-managerial | \n",
+ " Wife | \n",
+ " White | \n",
+ " Female | \n",
+ " 15024 | \n",
+ " 0 | \n",
+ " 40 | \n",
+ " United-States | \n",
+ " >50K | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
32561 rows × 16 columns
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ " \n",
+ "\n",
+ " \n",
+ "
\n",
+ "
\n",
+ " "
+ ]
+ },
+ "metadata": {},
+ "execution_count": 5
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "MoK8B5fIutTW"
+ },
+ "source": [
+ "**1. Сколько мужчин и женщин (признак *sex*) представлено в этом наборе данных?**"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "collapsed": true,
+ "id": "hdzky90TutTY",
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "outputId": "74677bb2-d548-4cf1-9756-23c4f510eeb0"
+ },
+ "source": [
+ "data['sex'].value_counts()"
+ ],
+ "execution_count": 8,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "Male 21790\n",
+ "Female 10771\n",
+ "Name: sex, dtype: int64"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 8
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "В этом наборе данных представлены 21790 мужчин и 10771 женщина."
+ ],
+ "metadata": {
+ "id": "Oke5GuC2W_D_"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "adF8lgVbutTZ"
+ },
+ "source": [
+ "**2. Каков средний возраст (признак *age*) женщин?**"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "collapsed": true,
+ "id": "K6C2qZ_zutTb",
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "outputId": "8bcc8f69-d458-4da3-ff4a-7f49f1c757b7"
+ },
+ "source": [
+ "# Mean of the age column restricted to rows whose sex is Female.\n",
+ "sr_age_female = data.loc[data[\"sex\"] == \"Female\", \"age\"].mean()\n",
+ "print(\"Средний возраст женщин в данном наборе:\", sr_age_female)"
+ ],
+ "execution_count": 10,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Средний возраст женщин в данном наборе: 36.85823043357163\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "-Cz1S7-HutTd"
+ },
+ "source": [
+ "**3. Какова доля граждан Германии (признак *native-country*)?**"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "collapsed": true,
+ "id": "Y4mmqN6outTf",
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "outputId": "83b7b594-a19d-45a8-a0f5-911fc05ff28f"
+ },
+ "source": [
+ "share_by_country = data['native-country'].value_counts().div(len(data))  # rows per country over all rows\n",
+ "print(\"Доля граждан Германии\", share_by_country['Germany'])"
+ ],
+ "execution_count": 13,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Доля граждан Германии 0.004207487485028101\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "Do-rEgaautTg"
+ },
+ "source": [
+ "**4-5. Каковы средние значения и среднеквадратичные отклонения возраста тех, кто получает более 50K в год (признак *salary*) и тех, кто получает менее 50K в год?**"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "collapsed": true,
+ "id": "eSuk0CAnutTh",
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "outputId": "7bde0415-eb3a-4492-cc70-0b68b3b03772"
+ },
+ "source": [
+ "age_by_salary = data.groupby('salary')['age']  # group once, reuse for both statistics\n",
+ "print(age_by_salary.mean(), age_by_salary.std(), sep='\\n')"
+ ],
+ "execution_count": 14,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "salary\n",
+ "<=50K 36.783738\n",
+ ">50K 44.249841\n",
+ "Name: age, dtype: float64\n",
+ "salary\n",
+ "<=50K 14.020088\n",
+ ">50K 10.519028\n",
+ "Name: age, dtype: float64\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "rK9SwvI_utTj"
+ },
+ "source": [
+ "**6. Правда ли, что люди, которые получают больше 50k, имеют как минимум высшее образование? (признак *education – Bachelors, Prof-school, Assoc-acdm, Assoc-voc, Masters* или *Doctorate*)**"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "collapsed": true,
+ "id": "eygYabkdutTj",
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "outputId": "db5836c7-42cd-4b37-c907-f011f97e6b05"
+ },
+ "source": [
+ "higher_education = ['Bachelors', 'Prof-school', 'Assoc-acdm', 'Assoc-voc', 'Masters', 'Doctorate']  # education levels the task counts as higher education\n",
+ "true_or_false = data.loc[data['salary'] == '>50K', 'education'].isin(higher_education).all()  # True only if EVERY >50K earner holds one of the listed degrees\n",
+ "print(\"Правда ли, что люди, которые получают больше 50k, имеют как минимум высшее образование -\", true_or_false)"
+ ],
+ "execution_count": 18,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Правда ли, что люди, которые получают больше 50k, имеют как минимум высшее образование - False\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "4DqPASEsutTk"
+ },
+ "source": [
+ "**7. Выведите статистику возраста для каждой расы (признак *race*) и каждого пола. Используйте *groupby* и *describe*. Найдите таким образом максимальный возраст мужчин расы *Amer-Indian-Eskimo*.**"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "table = data.groupby(['race', 'sex'])['age'].describe()  # age statistics per (race, sex) pair\n",
+ "print(table)"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "UBIITpxqd7T0",
+ "outputId": "b22cd440-fd36-4c63-9e9b-456ec425ec50"
+ },
+ "execution_count": 23,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ " count mean std min 25% 50% \\\n",
+ "race sex \n",
+ "Amer-Indian-Eskimo Female 119.0 37.117647 13.114991 17.0 27.0 36.0 \n",
+ " Male 192.0 37.208333 12.049563 17.0 28.0 35.0 \n",
+ "Asian-Pac-Islander Female 346.0 35.089595 12.300845 17.0 25.0 33.0 \n",
+ " Male 693.0 39.073593 12.883944 18.0 29.0 37.0 \n",
+ "Black Female 1555.0 37.854019 12.637197 17.0 28.0 37.0 \n",
+ " Male 1569.0 37.682600 12.882612 17.0 27.0 36.0 \n",
+ "Other Female 109.0 31.678899 11.631599 17.0 23.0 29.0 \n",
+ " Male 162.0 34.654321 11.355531 17.0 26.0 32.0 \n",
+ "White Female 8642.0 36.811618 14.329093 17.0 25.0 35.0 \n",
+ " Male 19174.0 39.652498 13.436029 17.0 29.0 38.0 \n",
+ "\n",
+ " 75% max \n",
+ "race sex \n",
+ "Amer-Indian-Eskimo Female 46.00 80.0 \n",
+ " Male 45.00 82.0 \n",
+ "Asian-Pac-Islander Female 43.75 75.0 \n",
+ " Male 46.00 90.0 \n",
+ "Black Female 46.00 90.0 \n",
+ " Male 46.00 90.0 \n",
+ "Other Female 39.00 74.0 \n",
+ " Male 42.00 77.0 \n",
+ "White Female 46.00 90.0 \n",
+ " Male 49.00 90.0 \n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "collapsed": true,
+ "id": "fYkBDZMdutTl",
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "outputId": "9935d02d-e026-4157-a57d-9c5367e7d054"
+ },
+ "source": [
+ "age_stats = data.groupby([\"race\", \"sex\"])[\"age\"].describe()  # rows: (race, sex); columns: count, mean, ..., max\n",
+ "print(\"Максимальный возраст мужчин расы Amer-Indian-Eskimo:\", age_stats.loc[(\"Amer-Indian-Eskimo\", \"Male\"), \"max\"], \"года.\")"
+ ],
+ "execution_count": 33,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Максимальный возраст мужчин расы Amer-Indian-Eskimo: 82.0 года.\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "cn-jYXhzutTl"
+ },
+ "source": [
+ "**8. Среди кого больше доля зарабатывающих много (>50K): среди женатых или холостых мужчин (признак *marital-status*)? Женатыми считаем тех, у кого *marital-status* начинается с *Married* (Married-civ-spouse, Married-spouse-absent или Married-AF-spouse), остальных считаем холостыми.**"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "collapsed": true,
+ "id": "4hIQXgGAutTm",
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "outputId": "28f56f3c-568e-4745-8c02-9b55b905f44b"
+ },
+ "source": [
+ "men = data[data['sex'] == 'Male']  # the question concerns men only\n",
+ "is_married, earns_much = men['marital-status'].str.startswith('Married'), men['salary'] == '>50K'  # boolean flags per man\n",
+ "print(\"Доля зарабатывающих больше 50к больше среди женатых, чем холостых: \", earns_much[is_married].mean() > earns_much[~is_married].mean())  # compare shares within each group, not raw head-counts"
+ ],
+ "execution_count": 35,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Доля зарабатывающих больше 50к больше среди женатых, чем холостых: True\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "Rsh8YvoXutTm"
+ },
+ "source": [
+ "**9. Какое максимальное число часов человек работает в неделю (признак *hours-per-week*)? Сколько людей работают такое количество часов и каков среди них процент зарабатывающих много?**"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "collapsed": true,
+ "id": "RK1JQSIZutTn",
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "outputId": "c963ad09-e2dc-4b99-e84e-d7aed9058aba"
+ },
+ "source": [
+ "longest_week = data['hours-per-week'].describe().loc['max']  # largest weekly workload in the data set\n",
+ "hardest_workers = data.loc[data['hours-per-week'] == longest_week]  # everyone who works exactly that many hours\n",
+ "print(\"Максимальное число часов, которое человек работает в неделю:\", longest_week)\n",
+ "print(\"Количество людей, работающих такое же количество часов:\", len(hardest_workers))\n",
+ "print(\"Процент людей зарабатывающих много:\", round((hardest_workers['salary'] == '>50K').sum() / len(hardest_workers) * 100, 1), \"%\")"
+ ],
+ "execution_count": 37,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Максимальное число часов, которое человек работает в неделю: 99.0\n",
+ "Количество людей, работающих такое же количество часов: 85\n",
+ "Процент людей зарабатывающих много: 29.4 %\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "kUXV84AjutTn"
+ },
+ "source": [
+ "**10. Посчитайте среднее время работы (*hours-per-week*) зарабатывающих мало и много (*salary*) для каждой страны (*native-country*).**"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "collapsed": true,
+ "id": "3gzYG3CDutTn",
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "outputId": "15ecdc6c-46cc-4a47-f909-992a9079c51e"
+ },
+ "source": [
+ "mean_hours = data.groupby(['native-country', 'salary'])['hours-per-week'].agg('mean')  # one value per (country, salary bracket)\n",
+ "print(\"Среднее время работы зарабатывающих мало и много для каждой страны\", mean_hours, sep='\\n')"
+ ],
+ "execution_count": 34,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Среднее время работы зарабатывающих мало и много для каждой страны\n",
+ "native-country salary\n",
+ "? <=50K 40.164760\n",
+ " >50K 45.547945\n",
+ "Cambodia <=50K 41.416667\n",
+ " >50K 40.000000\n",
+ "Canada <=50K 37.914634\n",
+ " ... \n",
+ "United-States >50K 45.505369\n",
+ "Vietnam <=50K 37.193548\n",
+ " >50K 39.200000\n",
+ "Yugoslavia <=50K 41.600000\n",
+ " >50K 49.500000\n",
+ "Name: hours-per-week, Length: 82, dtype: float64\n"
+ ]
+ }
+ ]
+ }
+ ]
+}
\ No newline at end of file