diff --git a/06_numpy_intro.html b/06_numpy_intro.html index e4a21c3..8f3f5c4 100644 --- a/06_numpy_intro.html +++ b/06_numpy_intro.html @@ -210,6 +210,21 @@
  • NumPy (Part II)
  • Introduction to Pandas
  • Pandas: Data Exploration
  • +
  • Pandas: Data Manipulation
  • +
  • Pandas: Advanced Data Manipulation and Aggregation
  • +
  • Pandas: Introduction to Feature Engineering
  • + +

    Module 5: R

    + +

    Wrapping up

    + @@ -650,7 +665,9 @@

    Images are Numerical Data
    Requirement already satisfied: python-dateutil>=2.7 in /home/javi/anaconda3/lib/python3.11/site-packages (from matplotlib) (2.9.0.post0)
     
    Requirement already satisfied: six>=1.5 in /home/javi/anaconda3/lib/python3.11/site-packages (from python-dateutil>=2.7->matplotlib) (1.16.0)
    diff --git a/_images/050342ad7de84c40a4f5b0c6b4dab40d296739d7f217feb8266f34cedda3e3ca.png b/_images/050342ad7de84c40a4f5b0c6b4dab40d296739d7f217feb8266f34cedda3e3ca.png
    deleted file mode 100644
    index 565171a..0000000
    Binary files a/_images/050342ad7de84c40a4f5b0c6b4dab40d296739d7f217feb8266f34cedda3e3ca.png and /dev/null differ
    diff --git a/_images/070678a7e031c9e174b949341b6769baad220cfe3b8ff26899a82d433503ae43.png b/_images/070678a7e031c9e174b949341b6769baad220cfe3b8ff26899a82d433503ae43.png
    deleted file mode 100644
    index 31946d9..0000000
    Binary files a/_images/070678a7e031c9e174b949341b6769baad220cfe3b8ff26899a82d433503ae43.png and /dev/null differ
    diff --git a/_images/09648ccee61f35c1d83771b6e7b95082eb478f1c18a3bf86c68555d25aa48eff.png b/_images/09648ccee61f35c1d83771b6e7b95082eb478f1c18a3bf86c68555d25aa48eff.png
    deleted file mode 100644
    index ef8465d..0000000
    Binary files a/_images/09648ccee61f35c1d83771b6e7b95082eb478f1c18a3bf86c68555d25aa48eff.png and /dev/null differ
    diff --git a/_images/19b4ef6aa7945c5b3eee41c25052b158df7cccdca6e1be627c555a7f2610a7cd.png b/_images/19b4ef6aa7945c5b3eee41c25052b158df7cccdca6e1be627c555a7f2610a7cd.png
    deleted file mode 100644
    index 38c3b46..0000000
    Binary files a/_images/19b4ef6aa7945c5b3eee41c25052b158df7cccdca6e1be627c555a7f2610a7cd.png and /dev/null differ
    diff --git a/_images/267a46e09f7430ad3fccb6b619c25d6382d080de21e885f22ad6dcdbd24172b9.png b/_images/267a46e09f7430ad3fccb6b619c25d6382d080de21e885f22ad6dcdbd24172b9.png
    new file mode 100644
    index 0000000..81bacc7
    Binary files /dev/null and b/_images/267a46e09f7430ad3fccb6b619c25d6382d080de21e885f22ad6dcdbd24172b9.png differ
    diff --git a/_images/31805950d8499b57c39bdfbc3a67021af2192f28a93c9217fee70eb158fe9027.png b/_images/31805950d8499b57c39bdfbc3a67021af2192f28a93c9217fee70eb158fe9027.png
    deleted file mode 100644
    index 0006053..0000000
    Binary files a/_images/31805950d8499b57c39bdfbc3a67021af2192f28a93c9217fee70eb158fe9027.png and /dev/null differ
    diff --git a/_images/404f063734e2f5dcb1228730ff1f99d7066229bac488649789a0c8d01fc3ca2b.png b/_images/404f063734e2f5dcb1228730ff1f99d7066229bac488649789a0c8d01fc3ca2b.png
    deleted file mode 100644
    index e8565c0..0000000
    Binary files a/_images/404f063734e2f5dcb1228730ff1f99d7066229bac488649789a0c8d01fc3ca2b.png and /dev/null differ
    diff --git a/_images/49235144743ad3e42d3413bbe691dbfc7596c45f6f5441d4b7ecdc76d50e9b2e.png b/_images/49235144743ad3e42d3413bbe691dbfc7596c45f6f5441d4b7ecdc76d50e9b2e.png
    deleted file mode 100644
    index 51e2136..0000000
    Binary files a/_images/49235144743ad3e42d3413bbe691dbfc7596c45f6f5441d4b7ecdc76d50e9b2e.png and /dev/null differ
    diff --git a/_images/49ed94f15827ce8ac14e383ab6e9d4653c651f6dfb2edba2708c8d0fc1abc493.png b/_images/49ed94f15827ce8ac14e383ab6e9d4653c651f6dfb2edba2708c8d0fc1abc493.png
    new file mode 100644
    index 0000000..dfa4f02
    Binary files /dev/null and b/_images/49ed94f15827ce8ac14e383ab6e9d4653c651f6dfb2edba2708c8d0fc1abc493.png differ
    diff --git a/_images/613f5dbe46144df9ed2c32f18ed370a394b35506cc1b89a60b2d71b52d5219c2.png b/_images/613f5dbe46144df9ed2c32f18ed370a394b35506cc1b89a60b2d71b52d5219c2.png
    deleted file mode 100644
    index 2142813..0000000
    Binary files a/_images/613f5dbe46144df9ed2c32f18ed370a394b35506cc1b89a60b2d71b52d5219c2.png and /dev/null differ
    diff --git a/_images/7de5d3b5336ce431ec4f22429a4787349037f1791de0ef710daae738cd2d7efe.png b/_images/7de5d3b5336ce431ec4f22429a4787349037f1791de0ef710daae738cd2d7efe.png
    deleted file mode 100644
    index 07787f5..0000000
    Binary files a/_images/7de5d3b5336ce431ec4f22429a4787349037f1791de0ef710daae738cd2d7efe.png and /dev/null differ
    diff --git a/_images/8393299fa27a42e0b7e0f92dbf4b389608087a22bb19b8d8552621c98c9857b5.png b/_images/8393299fa27a42e0b7e0f92dbf4b389608087a22bb19b8d8552621c98c9857b5.png
    new file mode 100644
    index 0000000..4c131c9
    Binary files /dev/null and b/_images/8393299fa27a42e0b7e0f92dbf4b389608087a22bb19b8d8552621c98c9857b5.png differ
    diff --git a/_images/83fb628c4f337b0bb1bd0b2d7c32d4f856a8dc627bacb76741eeba545ccd3a57.png b/_images/83fb628c4f337b0bb1bd0b2d7c32d4f856a8dc627bacb76741eeba545ccd3a57.png
    deleted file mode 100644
    index e8a76ab..0000000
    Binary files a/_images/83fb628c4f337b0bb1bd0b2d7c32d4f856a8dc627bacb76741eeba545ccd3a57.png and /dev/null differ
    diff --git a/_images/9c819e3921bf42578207b6dea8364dac88c5df9e193120f2117cdd9dcd90e51a.png b/_images/9c819e3921bf42578207b6dea8364dac88c5df9e193120f2117cdd9dcd90e51a.png
    deleted file mode 100644
    index 9f4aa29..0000000
    Binary files a/_images/9c819e3921bf42578207b6dea8364dac88c5df9e193120f2117cdd9dcd90e51a.png and /dev/null differ
    diff --git a/_images/c320e303b93814dfbe08651d9ac04d8277a3a49756eb213a92b93e18099005a3.png b/_images/c320e303b93814dfbe08651d9ac04d8277a3a49756eb213a92b93e18099005a3.png
    deleted file mode 100644
    index 0907007..0000000
    Binary files a/_images/c320e303b93814dfbe08651d9ac04d8277a3a49756eb213a92b93e18099005a3.png and /dev/null differ
    diff --git a/_images/e56d5aba8799217f6afe138ed710ed222a315c174f283c71639551057f692abb.png b/_images/e56d5aba8799217f6afe138ed710ed222a315c174f283c71639551057f692abb.png
    new file mode 100644
    index 0000000..360cc46
    Binary files /dev/null and b/_images/e56d5aba8799217f6afe138ed710ed222a315c174f283c71639551057f692abb.png differ
    diff --git a/_images/e7e3ff2b906c8e926c0e4d6ac6a81e8a3a0ba98ae4fc28c085a0fab1911aff6e.png b/_images/e7e3ff2b906c8e926c0e4d6ac6a81e8a3a0ba98ae4fc28c085a0fab1911aff6e.png
    deleted file mode 100644
    index fb12d92..0000000
    Binary files a/_images/e7e3ff2b906c8e926c0e4d6ac6a81e8a3a0ba98ae4fc28c085a0fab1911aff6e.png and /dev/null differ
    diff --git a/_images/f51cf89b081544448e064cb8ce537e13c1baa630e0be650380df0813287266cf.png b/_images/f51cf89b081544448e064cb8ce537e13c1baa630e0be650380df0813287266cf.png
    deleted file mode 100644
    index c80c2ca..0000000
    Binary files a/_images/f51cf89b081544448e064cb8ce537e13c1baa630e0be650380df0813287266cf.png and /dev/null differ
    diff --git a/_sources/chapters/module-4/046-PandasIII-Merging_Concatenating_Aggregating.ipynb b/_sources/chapters/module-4/046-PandasIII-Merging_Concatenating_Aggregating.ipynb
    deleted file mode 100644
    index fa058c4..0000000
    --- a/_sources/chapters/module-4/046-PandasIII-Merging_Concatenating_Aggregating.ipynb
    +++ /dev/null
    @@ -1,875 +0,0 @@
    -{
    - "cells": [
    -  {
    -   "cell_type": "code",
    -   "execution_count": 1,
    -   "id": "ca3e52c1-205a-4b79-a122-ca6de7694f08",
    -   "metadata": {},
    -   "outputs": [],
    -   "source": [
    -    "import pandas as pd\n",
    -    "import numpy as np"
    -   ]
    -  },
    -  {
    -   "cell_type": "code",
    -   "execution_count": 2,
    -   "id": "8d5c18d4-14ae-4298-bfe5-f36d6ebbfa7d",
    -   "metadata": {},
    -   "outputs": [
    -    {
    -     "data": {
    -      "text/html": [
    -       "
    \n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
    sepal_lengthsepal_widthpetal_lengthpetal_widthspecies
    05.13.51.40.2setosa
    14.93.01.40.2setosa
    24.73.21.30.2setosa
    34.63.11.50.2setosa
    45.03.61.40.2setosa
    ..................
    1456.73.05.22.3virginica
    1466.32.55.01.9virginica
    1476.53.05.22.0virginica
    1486.23.45.42.3virginica
    1495.93.05.11.8virginica
    \n", - "

    150 rows × 5 columns

    \n", - "
    " - ], - "text/plain": [ - " sepal_length sepal_width petal_length petal_width species\n", - "0 5.1 3.5 1.4 0.2 setosa\n", - "1 4.9 3.0 1.4 0.2 setosa\n", - "2 4.7 3.2 1.3 0.2 setosa\n", - "3 4.6 3.1 1.5 0.2 setosa\n", - "4 5.0 3.6 1.4 0.2 setosa\n", - ".. ... ... ... ... ...\n", - "145 6.7 3.0 5.2 2.3 virginica\n", - "146 6.3 2.5 5.0 1.9 virginica\n", - "147 6.5 3.0 5.2 2.0 virginica\n", - "148 6.2 3.4 5.4 2.3 virginica\n", - "149 5.9 3.0 5.1 1.8 virginica\n", - "\n", - "[150 rows x 5 columns]" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "iris_df = pd.read_csv(\"https://raw.githubusercontent.com/mwaskom/seaborn-data/refs/heads/master/iris.csv\")\n", - "iris_df" - ] - }, - { - "cell_type": "markdown", - "id": "383c6fe5-50d7-4b20-b761-cbe3db8c47fe", - "metadata": {}, - "source": [ - "## Concatenating and Merging\n", - "\n", - "### Concate: `pd.concat()` \n", - "\n", - "Concatenate pandas objects along an axis\n", - "\n", - "[Details](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.concat.html)" - ] - }, - { - "cell_type": "markdown", - "id": "23e8b17e-adab-4594-a8ad-2b72ad72eae0", - "metadata": {}, - "source": [ - "Create two dfs and vertically stack them" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "869e9f26-9576-4128-a6ab-f4bdb13cd8ed", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " 0 1 2 3\n", - "0 0.442947 -0.617434 0.401841 -0.990547\n", - "1 0.404872 -0.729196 0.834374 -1.633626\n", - "2 -0.727989 -0.455244 -0.107535 0.788234\n", - "---------------------------------------------\n", - " 0 1 2 3\n", - "0 1.887038 0.631577 -0.373963 -0.239185\n", - "1 0.810859 0.454026 -0.796657 0.866273\n", - "2 2.243792 -0.983704 -0.527390 0.155886\n", - "---------------------------------------------\n", - " 0 1 2 3\n", - "0 0.442947 -0.617434 0.401841 -0.990547\n", - "1 0.404872 -0.729196 0.834374 -1.633626\n", - 
"2 -0.727989 -0.455244 -0.107535 0.788234\n", - "0 1.887038 0.631577 -0.373963 -0.239185\n", - "1 0.810859 0.454026 -0.796657 0.866273\n", - "2 2.243792 -0.983704 -0.527390 0.155886\n" - ] - } - ], - "source": [ - "df1 = pd.DataFrame(np.random.randn(3, 4))\n", - "df2 = pd.DataFrame(np.random.randn(3, 4))\n", - "\n", - "print(df1)\n", - "print('-'*45)\n", - "print(df2)\n", - "\n", - "df3 = pd.concat([df1, df2], axis=0)\n", - "\n", - "print('-'*45)\n", - "print(df3)" - ] - }, - { - "cell_type": "markdown", - "id": "dff68262-90ba-4e21-9107-1695388d51f9", - "metadata": {}, - "source": [ - "**Concat columns** \n", - "This assumes that the indexes represent IDs of specific things or events" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "c6a7e550-9972-47ea-b271-32a490dcb5ff", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
    \n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
    foobar
    01230123
    00.442947-0.6174340.401841-0.9905471.8870380.631577-0.373963-0.239185
    10.404872-0.7291960.834374-1.6336260.8108590.454026-0.7966570.866273
    2-0.727989-0.455244-0.1075350.7882342.243792-0.983704-0.5273900.155886
    \n", - "
    " - ], - "text/plain": [ - " foo bar \\\n", - " 0 1 2 3 0 1 2 \n", - "0 0.442947 -0.617434 0.401841 -0.990547 1.887038 0.631577 -0.373963 \n", - "1 0.404872 -0.729196 0.834374 -1.633626 0.810859 0.454026 -0.796657 \n", - "2 -0.727989 -0.455244 -0.107535 0.788234 2.243792 -0.983704 -0.527390 \n", - "\n", - " \n", - " 3 \n", - "0 -0.239185 \n", - "1 0.866273 \n", - "2 0.155886 " - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df4 = pd.concat([df1,df2], axis = 1, keys = ['foo', 'bar'])\n", - "\n", - "df4" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "a5f24ba4-0f83-437d-94a6-53167ddff3be", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
    \n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
    0123
    00.442947-0.6174340.401841-0.990547
    10.404872-0.7291960.834374-1.633626
    2-0.727989-0.455244-0.1075350.788234
    \n", - "
    " - ], - "text/plain": [ - " 0 1 2 3\n", - "0 0.442947 -0.617434 0.401841 -0.990547\n", - "1 0.404872 -0.729196 0.834374 -1.633626\n", - "2 -0.727989 -0.455244 -0.107535 0.788234" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df4.foo" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "d954fa94-80b4-41f1-835e-cec68a473599", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
    \n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
    0123
    01.8870380.631577-0.373963-0.239185
    10.8108590.454026-0.7966570.866273
    22.243792-0.983704-0.5273900.155886
    \n", - "
    " - ], - "text/plain": [ - " 0 1 2 3\n", - "0 1.887038 0.631577 -0.373963 -0.239185\n", - "1 0.810859 0.454026 -0.796657 0.866273\n", - "2 2.243792 -0.983704 -0.527390 0.155886" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df4.bar" - ] - }, - { - "cell_type": "markdown", - "id": "f942b10c-0cde-4adb-a1e2-195144c6e169", - "metadata": {}, - "source": [ - "### merge: `merge()`\n", - "\n", - "SQL-style joining of tables (DataFrames)\n", - "\n", - "Important parameters include:\n", - "\n", - "- `how` : type of merge {'left', 'right', 'outer', 'inner', 'cross'}, default ‘inner’\n", - "- `on` : names to join on\n", - " \n", - "[Details](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.merge.html)" - ] - }, - { - "cell_type": "markdown", - "id": "df302e38-6caf-40dd-a2b9-ec2efa00917a", - "metadata": {}, - "source": [ - "**Very useful!**" - ] - }, - { - "cell_type": "markdown", - "id": "dfbdb7ee-aafd-4ff8-bc09-6da066178f15", - "metadata": {}, - "source": [ - "Create two tables, `left` and `right`. Then right join them on `key`. \n", - "Right join means include all records from table on right. \n", - "The `key` is used for matching up the records." 
- ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "77899cbc-dc68-411e-8ff2-69d2db87c9ba", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "---left\n", - " key lval\n", - "0 jamie 15\n", - "1 bill 22\n", - "\n", - "---right\n", - " key rval\n", - "0 jamie 4\n", - "1 bill 5\n", - "2 asher 8\n", - "\n", - "---joined\n", - " key lval rval\n", - "0 jamie 15.0 4\n", - "1 bill 22.0 5\n", - "2 asher NaN 8\n" - ] - } - ], - "source": [ - "left = pd.DataFrame({\"key\": [\"jamie\", \"bill\"], \"lval\": [15, 22]})\n", - "right = pd.DataFrame({\"key\": [\"jamie\", \"bill\", \"asher\"], \"rval\": [4, 5, 8]})\n", - "\n", - "joined = pd.merge(left, right, on=\"key\", how=\"right\")\n", - "\n", - "print('---left')\n", - "print(left)\n", - "print('\\n---right')\n", - "print(right)\n", - "print('\\n---joined')\n", - "print(joined)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "956199c0-ce5a-44e2-a0a2-89b33899d33d", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "id": "0eb6b71a-25f1-44f1-a4af-ce6377732756", - "metadata": {}, - "source": [ - "## Summary\n", - "\n", - "* Use **join** if you have shared indexes\n", - "* Use **merge** if you do not have shared indexes\n", - "* Use **concat** to combine based on shared indexes or columns" - ] - }, - { - "cell_type": "markdown", - "id": "08dd64e7-5ef2-43cf-9198-ff63dc38400c", - "metadata": {}, - "source": [ - "## Data Aggregation\n", - "\n", - "Involves one or more of:\n", - "\n", - "- splitting the data into groups\n", - "- applying a function to each group\n", - "- combining results" - ] - }, - { - "cell_type": "markdown", - "id": "cf1a6bc2-705f-44f9-8497-8a5fc53b948e", - "metadata": {}, - "source": [ - "### Aggregation by `.groupby()`\n", - "\n", - "Compute mean of each column, grouped (separately) by species" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": 
"9fff6ac6-bd68-46af-90d2-cb994becb5f8", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
    \n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
    sepal_lengthsepal_widthpetal_lengthpetal_width
    species
    setosa5.0063.4281.4620.246
    versicolor5.9362.7704.2601.326
    virginica6.5882.9745.5522.026
    \n", - "
    " - ], - "text/plain": [ - " sepal_length sepal_width petal_length petal_width\n", - "species \n", - "setosa 5.006 3.428 1.462 0.246\n", - "versicolor 5.936 2.770 4.260 1.326\n", - "virginica 6.588 2.974 5.552 2.026" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "iris_df.groupby(\"species\").mean()" - ] - }, - { - "cell_type": "markdown", - "id": "c1f0f6a4-955a-45c0-bd8d-96948b8f04d4", - "metadata": {}, - "source": [ - "### `pd.pivot_table()`\n", - "\n", - "Apply a function `aggfunc` to selected values grouped by columns\n", - "\n", - "[Details](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.pivot_table.html)" - ] - }, - { - "cell_type": "markdown", - "id": "48857f21-842c-4655-887f-2cb6bf441b19", - "metadata": {}, - "source": [ - "Compute mean sepal length for each species:" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "779c5fbd-fce1-4a41-8f34-1c0642feb70a", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
    \n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
    speciessetosaversicolorvirginica
    sepal_length5.0065.9366.588
    \n", - "
    " - ], - "text/plain": [ - "species setosa versicolor virginica\n", - "sepal_length 5.006 5.936 6.588" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "pd.pivot_table(iris_df, values=\"sepal_length\", columns=[\"species\"], aggfunc = np.mean)" - ] - }, - { - "cell_type": "markdown", - "id": "0b711382-80e5-43ac-8d51-a886cc6e2bd0", - "metadata": {}, - "source": [ - "## Reshaping Data" - ] - }, - { - "cell_type": "markdown", - "id": "af439e07-55b8-47cc-b462-85b899074128", - "metadata": {}, - "source": [ - "## `.reshape()`\n", - "\n", - "Changes the object's shape\n", - "\n", - "We illustrate creating pandas Series, extracting array of length 6, and reshaping to 3x2 array." - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "be6f85d4-571e-4ea6-9962-436bb80a92c0", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "orig data: [1 1 2 3 5 8]\n", - "orig type: \n", - "orig shape: (6,)\n", - "\n", - " reshaped vals:\n", - "[[1 1]\n", - " [2 3]\n", - " [5 8]]\n", - "\n", - " new type: \n", - "new shape: (3, 2)\n" - ] - } - ], - "source": [ - "# create a series\n", - "ser = pd.Series([1, 1, 2, 3, 5, 8])\n", - "\n", - "# extract values\n", - "vals = ser.values\n", - "\n", - "print('orig data:', vals)\n", - "print('orig type:', type(vals))\n", - "print('orig shape:', vals.shape)\n", - "\n", - "# reshaping series\n", - "reshaped_vals = vals.reshape((3, 2))\n", - "\n", - "print('\\n reshaped vals:')\n", - "print(reshaped_vals)\n", - "print('\\n new type:', type(reshaped_vals))\n", - "print('new shape:', reshaped_vals.shape)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9fdf231e-2c24-4277-bd7d-8882104f016a", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - 
"codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.9" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/_sources/chapters/wrap-up.ipynb b/_sources/chapters/wrap-up.ipynb new file mode 100644 index 0000000..e7dd136 --- /dev/null +++ b/_sources/chapters/wrap-up.ipynb @@ -0,0 +1,593 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "d5ee9563-eb6c-46f1-af44-7dac2a01bc88", + "metadata": {}, + "source": [ + "# Wrapping up\n", + "\n", + "Throughout this course, we have covered the most important foundational programming skills a future Data Scientist needs, with a particular emphasis on Python.\n", + "\n", + "For both languages, we explored their syntax, different data types, and how to work with data structures. We also delved into implementing loops, functions, and even classes (which is uncommon in beginner programming courses). Additionally, we discussed basic data science operations in both languages, particularly focusing on how to inspect and interact with raw data.\n", + "\n", + "Now, coming to the question of **Python vs. R**: which one should you choose? It’s entirely up to you—both are excellent tools, as we have emphasized throughout the course. Keep in mind that you can essentially achieve the same results in one language as in the other. For example, when it comes to data manipulation, see this comparison: [Python vs R](https://pandas.pydata.org/docs/getting_started/comparison/comparison_with_r.html).\n", + "\n", + "Here is my personal perspective though:\n", + "\n", + "- **Python**: Ideal for programmatic scenarios such as developing complex libraries, thanks to its versatility, simple syntax, and readability. 
Moreover, for machine learning and deep learning applications, Python remains the top choice.\n", + " \n", + "- **R**: Best suited for advanced statistical analysis, such as mixed linear modeling, factor analysis, mediation analysis, and Bayesian statistics. In addition, while I do not use it as often as I should, `ggplot2` can produce exceptionally high-quality graphs—so be sure to consider this in the future!" + ] + }, + { + "cell_type": "markdown", + "id": "5c0a9698-4972-4c13-9094-594ece61ff19", + "metadata": {}, + "source": [ + "## Looking Ahead\n", + "\n", + "There are certain things we have not covered in this course that a Data Scientist should likely master in the future. Here are some examples:\n", + "\n", + "### Visualization\n", + "\n", + "Clear and effective visualization is crucial for communicating with data. \n", + "\n", + "Here are a few examples:\n", + "\n", + "- **Python**: `matplotlib`, `seaborn`\n", + "- **R**: `ggplot2`\n", + "- **Cross-platform**: `plotly`, `shiny`" + ] + }, + { + "cell_type": "markdown", + "id": "798f9f74-440d-48d1-b3ea-665f8b7350b3", + "metadata": {}, + "source": [ + "**Matplotlib**" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "id": "3ce65aba-b814-4fb9-9535-d0eebe4f5e83", + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "plt.style.use('classic')\n", + "%matplotlib inline\n", + "import numpy as np\n", + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "id": "b90a9671-5e14-4b47-ace2-5a9538d61886", + "metadata": {}, + "outputs": [], + "source": [ + "# Create some data\n", + "rng = np.random.RandomState(0) # creates a random range seeded from 0\n", + "x = np.linspace(0, 10, 500) # creates evenly spaced numbers of a specified interval\n", + "y = np.cumsum(rng.randn(500, 6), 0) # creates the sum of random numbers within a range." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 41, + "id": "6217776d-f106-4533-944a-d85d16a63f2b", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
    " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Plot the data with Matplotlib defaults\n", + "plt.plot(x, y)\n", + "plt.legend('ABCDEF', ncol=3, loc='upper left');" + ] + }, + { + "cell_type": "markdown", + "id": "01f502ae-fe48-437d-8d4b-1be8d5b7291a", + "metadata": {}, + "source": [ + "**Seaborn**" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "id": "d91be125-f7bb-43d3-9f4a-1b647a4f6343", + "metadata": {}, + "outputs": [], + "source": [ + "import seaborn as sns" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "id": "f7d84ffa-8c7c-4b02-a70e-a02f05813192", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
    \n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
    xgroupy
    00.00000A1.764052
    10.02004A2.714141
    20.04008A3.475178
    30.06012A3.788246
    40.08016A6.058001
    ............
    29959.91984F-10.465950
    29969.93988F-9.613061
    29979.95992F-9.165378
    29989.97996F-9.004272
    299910.00000F-9.084870
    \n", + "

    3000 rows × 3 columns

    \n", + "
    " + ], + "text/plain": [ + " x group y\n", + "0 0.00000 A 1.764052\n", + "1 0.02004 A 2.714141\n", + "2 0.04008 A 3.475178\n", + "3 0.06012 A 3.788246\n", + "4 0.08016 A 6.058001\n", + "... ... ... ...\n", + "2995 9.91984 F -10.465950\n", + "2996 9.93988 F -9.613061\n", + "2997 9.95992 F -9.165378\n", + "2998 9.97996 F -9.004272\n", + "2999 10.00000 F -9.084870\n", + "\n", + "[3000 rows x 3 columns]" + ] + }, + "execution_count": 43, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "xy_df = pd.concat([pd.DataFrame({\"x\": x}), \n", + " pd.DataFrame(y, columns=[\"A\", \"B\", \"C\", \"D\", \"E\", \"F\"])], axis=1)\n", + "xy_df = pd.melt(xy_df, id_vars=[\"x\"], var_name=\"group\", value_name=\"y\")\n", + "xy_df\n" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "id": "0087a9f6-979d-48aa-ad73-2506510faeb4", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 51, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
    " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "sns.set(style=\"whitegrid\")\n", + "sns.lineplot(x=\"x\", y=\"y\", hue=\"group\", data=xy_df)\n", + "plt.legend(ncol=3, loc='upper left')" + ] + }, + { + "cell_type": "markdown", + "id": "55bd026a-dc9a-4a4d-b5ba-634ae7dd2016", + "metadata": {}, + "source": [ + "### Analysis\n", + "\n", + "Here are a few examples of basic libraries for data analysis in Python and R, with a slight bias towards the former:\n", + "\n", + "- **Statistics**: `scipy` (Python), `statsmodels` (Python), Base R, `lme4` (R), `blme` (R).\n", + "- **Machine learning**: `scikit-learn` (Python), `caret` (R), `xgboost` (cross-platform).\n", + "- **Deep learning**: `keras` (Python), `pytorch` (Python), `tensorflow` (Python)." + ] + }, + { + "cell_type": "markdown", + "id": "6dd9490e-58df-4082-9432-5de004bd1c68", + "metadata": {}, + "source": [ + "**scikit-learn**" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "ca4b6254-5841-4b60-857e-664d243dabf8", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "the average accuracy in classifying the types of Iris using a decision tree and cross-validation is: 0.9666666666666668\n" + ] + } + ], + "source": [ + "from sklearn.tree import DecisionTreeClassifier\n", + "from sklearn.model_selection import cross_val_score\n", + "from sklearn.datasets import load_iris\n", + "\n", + "X, y = load_iris()[\"data\"], load_iris()[\"target\"]\n", + "clf = DecisionTreeClassifier()\n", + "\n", + "res = cross_val_score(clf, X, y, cv=5)\n", + "\n", + "print(\"the average accuracy in classifying the types of Iris using a decision tree and cross-validation is:\", \n", + " res.mean())" + ] + }, + { + "cell_type": "markdown", + "id": "85c40ab6-f17a-40af-8bd5-3034edba44cf", + "metadata": {}, + "source": [ + "**scipy**" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "id": 
"31c80388-db68-487c-83ff-8893cadb9fd8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "TtestResult(statistic=-2.0456709273958644, pvalue=0.04105049135941344, df=998.0)" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# A two-sample t-test, adapted from https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.ttest_ind.html\n", + "import numpy as np\n", + "from scipy import stats\n", + "rng = np.random.RandomState(1234)\n", + "\n", + "rvs1 = stats.norm.rvs(loc=5, scale=5, size=500, random_state=rng)\n", + "rvs2 = stats.norm.rvs(loc=5.57, scale=5, size=500, random_state=rng)\n", + "stats.ttest_ind(rvs1, rvs2)" + ] + }, + { + "cell_type": "markdown", + "id": "65dac05f-17d7-4972-8d53-5fb8e8c2c6b9", + "metadata": {}, + "source": [ + "**statsmodels**" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "id": "8e8706f0-00f5-4979-8951-18572cb9417e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " OLS Regression Results \n", + "==============================================================================\n", + "Dep. Variable: y R-squared: 0.004\n", + "Model: OLS Adj. R-squared: 0.003\n", + "Method: Least Squares F-statistic: 4.185\n", + "Date: Tue, 03 Dec 2024 Prob (F-statistic): 0.0411\n", + "Time: 10:19:40 Log-Likelihood: -3001.1\n", + "No. 
Observations: 1000 AIC: 6006.\n", + "Df Residuals: 998 BIC: 6016.\n", + "Df Model: 1 \n", + "Covariance Type: nonrobust \n", + "==============================================================================\n", + " coef std err t P>|t| [0.025 0.975]\n", + "------------------------------------------------------------------------------\n", + "const 5.0487 0.218 23.180 0.000 4.621 5.476\n", + "x1 0.6301 0.308 2.046 0.041 0.026 1.235\n", + "==============================================================================\n", + "Omnibus: 0.183 Durbin-Watson: 2.112\n", + "Prob(Omnibus): 0.913 Jarque-Bera (JB): 0.129\n", + "Skew: -0.024 Prob(JB): 0.938\n", + "Kurtosis: 3.027 Cond. No. 2.62\n", + "==============================================================================\n", + "\n", + "Notes:\n", + "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n" + ] + } + ], + "source": [ + "# The same as above, but using a linear regression model \n", + "# (tip for life: almost any basic statistical test is just a particular instance of a linear regression model).\n", + "\n", + "import statsmodels.api as sm\n", + "\n", + "y=np.concatenate((rvs1, rvs2))\n", + "X=np.column_stack(([1]*len(y), \n", + " [0]*len(rvs1) + [1]*len(rvs2)))\n", + "model = sm.OLS(endog=y, exog=X)\n", + "res = model.fit()\n", + "print(res.summary())" + ] + }, + { + "cell_type": "markdown", + "id": "01bfbd31-287f-4da8-aed4-a438594b0451", + "metadata": {}, + "source": [ + "### Command-Line Terminal Programming\n", + "\n", + "- Programming that takes place in a **terminal**, which is a text-based interface for interacting directly with the computer.\n", + "- Commands in a terminal are interpreted by a **shell**. Common shells include Bash (popular on Linux and macOS), Zsh (modern and customizable), and PowerShell (Windows-specific).\n", + "- **Essential for managing files, running scripts, and interacting with compute clusters (e.g. 
SLURM).**\n", + "\n", + "In Jupyter notebooks, you can execute shell commands by prefixing them with `!`. \n", + "\n", + "For example, we can navigate directories:" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "id": "3e077d21-8fac-402d-98b3-19e98344bece", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/home/javi/Documentos/docencia/DS-1002/DS1002-book/chapters\n", + "01-getting_started.md\tmodule-1 module-4 wrap-up.ipynb\n", + "02-python-basics.ipynb\tmodule-2 module-5\n", + "04-python-basics.ipynb\tmodule-3 my_folder\n" + ] + } + ], + "source": [ + "# Print the current directory\n", + "!pwd\n", + "\n", + "# List files in the directory\n", + "!ls" + ] + }, + { + "cell_type": "markdown", + "id": "f961542d-8435-4701-9336-9eb85aaa41e4", + "metadata": {}, + "source": [ + "We can also manage files and directories:" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "id": "898cc000-738e-4596-83c2-f394e531f12b", + "metadata": {}, + "outputs": [], + "source": [ + "# Create a new folder and file\n", + "!mkdir -p my_folder # make new dir; -p option to not raise an error if it already existed\n", + "!rm -f my_folder/* # Remove preexisting content; -f option to not raise an error if the folder was already empty\n", + "!touch my_folder/hello_world.py # Create a new file named \"hello_world.py\"\n", + "!echo \"print('Hello, World!\\nCode run from:', __file__)\" > my_folder/hello_world.py # Add a line of code to this file" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "id": "6c70e822-b505-48f0-b31f-a2ef36a615bb", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "hello_world.py\n" + ] + } + ], + "source": [ + "# List contents of the folder\n", + "!ls my_folder" + ] + }, + { + "cell_type": "markdown", + "id": "f2fb53a8-51e5-47d2-90ff-4f96469dc87e", + "metadata": {}, + "source": [ + "And run scripts:" + ] + }, + { + "cell_type": 
"code", + "execution_count": 63, + "id": "34b98b2c-d216-4110-ac89-38fe9e4a2f90", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Hello, World!\n", + "Code run from: /home/javi/Documentos/docencia/DS-1002/DS1002-book/chapters/my_folder/hello_world.py\n" + ] + } + ], + "source": [ + "!python my_folder/hello_world.py" + ] + }, + { + "cell_type": "markdown", + "id": "1e9ae7ae-ab8d-46cb-89eb-4fb1132bd12d", + "metadata": {}, + "source": [ + "### GitHub\n", + "\n", + "- Web-based platform for version control and collaboration built on top of Git, a version control system.\n", + "- It also has a powerful terminal programming where to easily interact and change your repositories.\n", + "- Allows you to track changes, collaborate with others, and share your work.\n", + "\n", + "Common Use Cases:\n", + "\n", + "- **Code Management**: Store and version codebases for projects/libraries. \n", + "- **Team Collaboration**: Coordinated team efforts on software development or data science projects.\n", + "- **Portfolio Hosting**: Showcase projects and skills for personal branding.\n", + "- **Open Source Contribution**: Contribute to or learn from public repositories.\n", + "- **Documentation**: Use GitHub Pages to create project websites or host documentation." + ] + }, + { + "cell_type": "markdown", + "id": "343eee57-2b06-4799-9b69-a9e820d8547c", + "metadata": {}, + "source": [ + "A few personal examples:\n", + "- Personal porfolio: https://github.com/jrasero\n", + "- This very course's book: https://github.com/UVADS/DS1002-book\n", + "- Niphlem: NeuroImaging-oriented Physiological Log Extraction for Modeling, toolbox: https://github.com/CoAxLab/niphlem, and [its documentation (rendered through Github)](https://coaxlab.github.io/niphlem)." 
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/_sources/epilogue.ipynb b/_sources/epilogue.ipynb new file mode 100644 index 0000000..e7dd136 --- /dev/null +++ b/_sources/epilogue.ipynb @@ -0,0 +1,593 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "d5ee9563-eb6c-46f1-af44-7dac2a01bc88", + "metadata": {}, + "source": [ + "# Wrapping up\n", + "\n", + "Throughout this course, we have covered the most important foundational programming skills a future Data Scientist needs, with a particular emphasis on Python.\n", + "\n", + "For both languages, we explored their syntax, different data types, and how to work with data structures. We also delved into implementing loops, functions, and even classes (which is uncommon in beginner programming courses). Additionally, we discussed basic data science operations in both languages, particularly focusing on how to inspect and interact with raw data.\n", + "\n", + "Now, coming to the question of **Python vs. R**: which one should you choose? It’s entirely up to you—both are excellent tools, as we have emphasized throughout the course. Keep in mind that you can essentially achieve the same results in one language as in the other. For example, when it comes to data manipulation, see this comparison: [Python vs R](https://pandas.pydata.org/docs/getting_started/comparison/comparison_with_r.html).\n", + "\n", + "Here is my personal perspective though:\n", + "\n", + "- **Python**: Ideal for programmatic scenarios such as developing complex libraries, thanks to its versatility, simple syntax, and readability. 
Moreover, for machine learning and deep learning applications, Python remains the top choice.\n", + " \n", + "- **R**: Best suited for advanced statistical analysis, such as mixed linear modeling, factor analysis, mediation analysis, and Bayesian statistics. In addition, while I do not use it as often as I should, `ggplot2` can produce exceptionally high-quality graphs—so be sure to consider this in the future!" + ] + }, + { + "cell_type": "markdown", + "id": "5c0a9698-4972-4c13-9094-594ece61ff19", + "metadata": {}, + "source": [ + "## Looking Ahead\n", + "\n", + "There are certain things we have not covered in this course that a Data Scientist should likely master in the future. Here are some examples:\n", + "\n", + "### Visualization\n", + "\n", + "Clear and effective visualization is crucial for communicating with data. \n", + "\n", + "Here are a few examples:\n", + "\n", + "- **Python**: `matplotlib`, `seaborn`\n", + "- **R**: `ggplot2`\n", + "- **Cross-platform**: `plotly`, `shiny`" + ] + }, + { + "cell_type": "markdown", + "id": "798f9f74-440d-48d1-b3ea-665f8b7350b3", + "metadata": {}, + "source": [ + "**Matplotlib**" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "id": "3ce65aba-b814-4fb9-9535-d0eebe4f5e83", + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "plt.style.use('classic')\n", + "%matplotlib inline\n", + "import numpy as np\n", + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "id": "b90a9671-5e14-4b47-ace2-5a9538d61886", + "metadata": {}, + "outputs": [], + "source": [ + "# Create some data\n", + "rng = np.random.RandomState(0) # creates a random number generator seeded with 0\n", + "x = np.linspace(0, 10, 500) # creates 500 evenly spaced numbers over the interval [0, 10]\n", + "y = np.cumsum(rng.randn(500, 6), 0) # cumulative sum, down the rows, of a 500x6 array of standard-normal draws" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 41, + "id": "6217776d-f106-4533-944a-d85d16a63f2b", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
    " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Plot the data with Matplotlib defaults\n", + "plt.plot(x, y)\n", + "plt.legend('ABCDEF', ncol=3, loc='upper left');" + ] + }, + { + "cell_type": "markdown", + "id": "01f502ae-fe48-437d-8d4b-1be8d5b7291a", + "metadata": {}, + "source": [ + "**Seaborn**" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "id": "d91be125-f7bb-43d3-9f4a-1b647a4f6343", + "metadata": {}, + "outputs": [], + "source": [ + "import seaborn as sns" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "id": "f7d84ffa-8c7c-4b02-a70e-a02f05813192", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
    \n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
    xgroupy
    00.00000A1.764052
    10.02004A2.714141
    20.04008A3.475178
    30.06012A3.788246
    40.08016A6.058001
    ............
    29959.91984F-10.465950
    29969.93988F-9.613061
    29979.95992F-9.165378
    29989.97996F-9.004272
    299910.00000F-9.084870
    \n", + "

    3000 rows × 3 columns

    \n", + "
    " + ], + "text/plain": [ + " x group y\n", + "0 0.00000 A 1.764052\n", + "1 0.02004 A 2.714141\n", + "2 0.04008 A 3.475178\n", + "3 0.06012 A 3.788246\n", + "4 0.08016 A 6.058001\n", + "... ... ... ...\n", + "2995 9.91984 F -10.465950\n", + "2996 9.93988 F -9.613061\n", + "2997 9.95992 F -9.165378\n", + "2998 9.97996 F -9.004272\n", + "2999 10.00000 F -9.084870\n", + "\n", + "[3000 rows x 3 columns]" + ] + }, + "execution_count": 43, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "xy_df = pd.concat([pd.DataFrame({\"x\": x}), \n", + " pd.DataFrame(y, columns=[\"A\", \"B\", \"C\", \"D\", \"E\", \"F\"])], axis=1)\n", + "xy_df = pd.melt(xy_df, id_vars=[\"x\"], var_name=\"group\", value_name=\"y\")\n", + "xy_df\n" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "id": "0087a9f6-979d-48aa-ad73-2506510faeb4", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 51, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
    " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "sns.set(style=\"whitegrid\")\n", + "sns.lineplot(x=\"x\", y=\"y\", hue=\"group\", data=xy_df)\n", + "plt.legend(ncol=3, loc='upper left')" + ] + }, + { + "cell_type": "markdown", + "id": "55bd026a-dc9a-4a4d-b5ba-634ae7dd2016", + "metadata": {}, + "source": [ + "### Analysis\n", + "\n", + "Hera are a few examples of basic libraries for data anaylis in Python and R, with a bit of predominance bias towards the former:\n", + "\n", + "- **Statistics**: `scipy` (Python), `statsmodels` (Python), Base R, `lme4` (R), `blme` (R).\n", + "- **Machine lerning**: `scikit-learn` (Python), `caret` (R), `xgboost` (cross-platform).\n", + "- **Deep lerning**: `keras` (Python), `pytorch` (Python), `tensorflow`(Python)." + ] + }, + { + "cell_type": "markdown", + "id": "6dd9490e-58df-4082-9432-5de004bd1c68", + "metadata": {}, + "source": [ + "**scikit-learn**" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "ca4b6254-5841-4b60-857e-664d243dabf8", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "the average accuracy in classifying the types of Iris using a decision tree and cross-validation is: 0.9666666666666668\n" + ] + } + ], + "source": [ + "from sklearn.tree import DecisionTreeClassifier\n", + "from sklearn.model_selection import cross_val_score\n", + "from sklearn.datasets import load_iris\n", + "\n", + "X, y = load_iris()[\"data\"], load_iris()[\"target\"]\n", + "clf = DecisionTreeClassifier()\n", + "\n", + "res = cross_val_score(clf, X, y, cv=5)\n", + "\n", + "print(\"the average accuracy in classifying the types of Iris using a decision tree and cross-validation is:\", \n", + " res.mean())" + ] + }, + { + "cell_type": "markdown", + "id": "85c40ab6-f17a-40af-8bd5-3034edba44cf", + "metadata": {}, + "source": [ + "**scipy**" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "id": 
"31c80388-db68-487c-83ff-8893cadb9fd8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "TtestResult(statistic=-2.0456709273958644, pvalue=0.04105049135941344, df=998.0)" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# A two-sample t-test, adapted from https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.ttest_ind.html\n", + "import numpy as np\n", + "from scipy import stats\n", + "rng = np.random.RandomState(1234)\n", + "\n", + "rvs1 = stats.norm.rvs(loc=5, scale=5, size=500, random_state=rng)\n", + "rvs2 = stats.norm.rvs(loc=5.57, scale=5, size=500, random_state=rng)\n", + "stats.ttest_ind(rvs1, rvs2)" + ] + }, + { + "cell_type": "markdown", + "id": "65dac05f-17d7-4972-8d53-5fb8e8c2c6b9", + "metadata": {}, + "source": [ + "**statsmodels**" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "id": "8e8706f0-00f5-4979-8951-18572cb9417e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " OLS Regression Results \n", + "==============================================================================\n", + "Dep. Variable: y R-squared: 0.004\n", + "Model: OLS Adj. R-squared: 0.003\n", + "Method: Least Squares F-statistic: 4.185\n", + "Date: Tue, 03 Dec 2024 Prob (F-statistic): 0.0411\n", + "Time: 10:19:40 Log-Likelihood: -3001.1\n", + "No. 
Observations: 1000 AIC: 6006.\n", + "Df Residuals: 998 BIC: 6016.\n", + "Df Model: 1 \n", + "Covariance Type: nonrobust \n", + "==============================================================================\n", + " coef std err t P>|t| [0.025 0.975]\n", + "------------------------------------------------------------------------------\n", + "const 5.0487 0.218 23.180 0.000 4.621 5.476\n", + "x1 0.6301 0.308 2.046 0.041 0.026 1.235\n", + "==============================================================================\n", + "Omnibus: 0.183 Durbin-Watson: 2.112\n", + "Prob(Omnibus): 0.913 Jarque-Bera (JB): 0.129\n", + "Skew: -0.024 Prob(JB): 0.938\n", + "Kurtosis: 3.027 Cond. No. 2.62\n", + "==============================================================================\n", + "\n", + "Notes:\n", + "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n" + ] + } + ], + "source": [ + "# The same as above, but using a linear regression model \n", + "# (tip for life: almost any basic statistical test is just a particular instance of a linear regression model).\n", + "\n", + "import statsmodels.api as sm\n", + "\n", + "y=np.concatenate((rvs1, rvs2))\n", + "X=np.column_stack(([1]*len(y), \n", + " [0]*len(rvs1) + [1]*len(rvs2)))\n", + "model = sm.OLS(endog=y, exog=X)\n", + "res = model.fit()\n", + "print(res.summary())" + ] + }, + { + "cell_type": "markdown", + "id": "01bfbd31-287f-4da8-aed4-a438594b0451", + "metadata": {}, + "source": [ + "### Command-Line Terminal Programming\n", + "\n", + "- Programming that takes place in a **terminal**, which is a text-based interface for interacting directly with the computer.\n", + "- Commands in a terminal are interpreted by a **shell**. Common shells include Bash (popular on Linux and macOS), Zsh (modern and customizable), and PowerShell (Windows-specific).\n", + "- **Essential for managing files, running scripts, and interacting with compute clusters (e.g. 
SLURM).**\n", + "\n", + "In Jupyter notebooks, you can execute shell commands by prefixing them with `!`. \n", + "\n", + "For example, we can navigate directories:" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "id": "3e077d21-8fac-402d-98b3-19e98344bece", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/home/javi/Documentos/docencia/DS-1002/DS1002-book/chapters\n", + "01-getting_started.md\tmodule-1 module-4 wrap-up.ipynb\n", + "02-python-basics.ipynb\tmodule-2 module-5\n", + "04-python-basics.ipynb\tmodule-3 my_folder\n" + ] + } + ], + "source": [ + "# Print the current directory\n", + "!pwd\n", + "\n", + "# List files in the directory\n", + "!ls" + ] + }, + { + "cell_type": "markdown", + "id": "f961542d-8435-4701-9336-9eb85aaa41e4", + "metadata": {}, + "source": [ + "We can also manage files and directories:" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "id": "898cc000-738e-4596-83c2-f394e531f12b", + "metadata": {}, + "outputs": [], + "source": [ + "# Create a new folder and file\n", + "!mkdir -p my_folder # make new dir; -p option to not raise an error if it already existed\n", + "!rm -f my_folder/* # Remove preexisting content; -f option to not raise an error if the folder was already empty\n", + "!touch my_folder/hello_world.py # Create a new file named \"hello_world.py\"\n", + "!echo \"print('Hello, World!\\nCode run from:', __file__)\" > my_folder/hello_world.py # Add a line of code to this file" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "id": "6c70e822-b505-48f0-b31f-a2ef36a615bb", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "hello_world.py\n" + ] + } + ], + "source": [ + "# List contents of the folder\n", + "!ls my_folder" + ] + }, + { + "cell_type": "markdown", + "id": "f2fb53a8-51e5-47d2-90ff-4f96469dc87e", + "metadata": {}, + "source": [ + "And run scripts:" + ] + }, + { + "cell_type": 
"code", + "execution_count": 63, + "id": "34b98b2c-d216-4110-ac89-38fe9e4a2f90", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Hello, World!\n", + "Code run from: /home/javi/Documentos/docencia/DS-1002/DS1002-book/chapters/my_folder/hello_world.py\n" + ] + } + ], + "source": [ + "!python my_folder/hello_world.py" + ] + }, + { + "cell_type": "markdown", + "id": "1e9ae7ae-ab8d-46cb-89eb-4fb1132bd12d", + "metadata": {}, + "source": [ + "### GitHub\n", + "\n", + "- Web-based platform for version control and collaboration built on top of Git, a version control system.\n", + "- It also has a powerful terminal programming where to easily interact and change your repositories.\n", + "- Allows you to track changes, collaborate with others, and share your work.\n", + "\n", + "Common Use Cases:\n", + "\n", + "- **Code Management**: Store and version codebases for projects/libraries. \n", + "- **Team Collaboration**: Coordinated team efforts on software development or data science projects.\n", + "- **Portfolio Hosting**: Showcase projects and skills for personal branding.\n", + "- **Open Source Contribution**: Contribute to or learn from public repositories.\n", + "- **Documentation**: Use GitHub Pages to create project websites or host documentation." + ] + }, + { + "cell_type": "markdown", + "id": "343eee57-2b06-4799-9b69-a9e820d8547c", + "metadata": {}, + "source": [ + "A few personal examples:\n", + "- Personal porfolio: https://github.com/jrasero\n", + "- This very course's book: https://github.com/UVADS/DS1002-book\n", + "- Niphlem: NeuroImaging-oriented Physiological Log Extraction for Modeling, toolbox: https://github.com/CoAxLab/niphlem, and [its documentation (rendered through Github)](https://coaxlab.github.io/niphlem)." 
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/chapters/01-getting_started.html b/chapters/01-getting_started.html index 1112f35..32443b7 100644 --- a/chapters/01-getting_started.html +++ b/chapters/01-getting_started.html @@ -210,6 +210,21 @@
  • NumPy (Part II)
  • Introduction to Pandas
  • Pandas: Data Exploration
  • +
  • Pandas: Data Manipulation
  • +
  • Pandas: Advanced Data Manipulation and Aggregation
  • +
  • Pandas: Introduction to Feature Engineering
  • + +

    Module 5: R

    + +

    Wrapping up

    +
    diff --git a/chapters/02-python-basics.html b/chapters/02-python-basics.html index 16d0b12..4105d9b 100644 --- a/chapters/02-python-basics.html +++ b/chapters/02-python-basics.html @@ -210,6 +210,21 @@
  • NumPy (Part II)
  • Introduction to Pandas
  • Pandas: Data Exploration
  • +
  • Pandas: Data Manipulation
  • +
  • Pandas: Advanced Data Manipulation and Aggregation
  • +
  • Pandas: Introduction to Feature Engineering
  • + +

    Module 5: R

    + +

    Wrapping up

    +
    diff --git a/chapters/04-python-basics.html b/chapters/04-python-basics.html index 73e6e86..17e1f43 100644 --- a/chapters/04-python-basics.html +++ b/chapters/04-python-basics.html @@ -210,6 +210,21 @@
  • NumPy (Part II)
  • Introduction to Pandas
  • Pandas: Data Exploration
  • +
  • Pandas: Data Manipulation
  • +
  • Pandas: Advanced Data Manipulation and Aggregation
  • +
  • Pandas: Introduction to Feature Engineering
  • + +

    Module 5: R

    + +

    Wrapping up

    + diff --git a/chapters/module-1/012-intro_python (copia).html b/chapters/module-1/012-intro_python (copia).html index 58a1f31..8697b8e 100644 --- a/chapters/module-1/012-intro_python (copia).html +++ b/chapters/module-1/012-intro_python (copia).html @@ -210,6 +210,21 @@
  • NumPy (Part II)
  • Introduction to Pandas
  • Pandas: Data Exploration
  • +
  • Pandas: Data Manipulation
  • +
  • Pandas: Advanced Data Manipulation and Aggregation
  • +
  • Pandas: Introduction to Feature Engineering
  • + +

    Module 5: R

    + +

    Wrapping up

    + diff --git a/chapters/module-1/012-intro_python.html b/chapters/module-1/012-intro_python.html index a090768..4034072 100644 --- a/chapters/module-1/012-intro_python.html +++ b/chapters/module-1/012-intro_python.html @@ -210,6 +210,21 @@
  • NumPy (Part II)
  • Introduction to Pandas
  • Pandas: Data Exploration
  • +
  • Pandas: Data Manipulation
  • +
  • Pandas: Advanced Data Manipulation and Aggregation
  • +
  • Pandas: Introduction to Feature Engineering
  • + +

    Module 5: R

    + +

    Wrapping up

    + diff --git a/chapters/module-1/013-intro_R.html b/chapters/module-1/013-intro_R.html index 6cac5c8..35aecc1 100644 --- a/chapters/module-1/013-intro_R.html +++ b/chapters/module-1/013-intro_R.html @@ -210,6 +210,21 @@
  • NumPy (Part II)
  • Introduction to Pandas
  • Pandas: Data Exploration
  • +
  • Pandas: Data Manipulation
  • +
  • Pandas: Advanced Data Manipulation and Aggregation
  • +
  • Pandas: Introduction to Feature Engineering
  • + +

    Module 5: R

    + +

    Wrapping up

    + diff --git a/chapters/module-1/Practice.html b/chapters/module-1/Practice.html index 775c938..0476a97 100644 --- a/chapters/module-1/Practice.html +++ b/chapters/module-1/Practice.html @@ -210,6 +210,21 @@
  • NumPy (Part II)
  • Introduction to Pandas
  • Pandas: Data Exploration
  • +
  • Pandas: Data Manipulation
  • +
  • Pandas: Advanced Data Manipulation and Aggregation
  • +
  • Pandas: Introduction to Feature Engineering
  • + +

    Module 5: R

    + +

    Wrapping up

    + diff --git a/chapters/module-1/about_course.html b/chapters/module-1/about_course.html index cd6f559..18b9bb7 100644 --- a/chapters/module-1/about_course.html +++ b/chapters/module-1/about_course.html @@ -210,6 +210,21 @@
  • NumPy (Part II)
  • Introduction to Pandas
  • Pandas: Data Exploration
  • +
  • Pandas: Data Manipulation
  • +
  • Pandas: Advanced Data Manipulation and Aggregation
  • +
  • Pandas: Introduction to Feature Engineering
  • + +

    Module 5: R

    + +

    Wrapping up

    + diff --git a/chapters/module-1/jupyter_notebooks.html b/chapters/module-1/jupyter_notebooks.html index d431345..4f7e1dc 100644 --- a/chapters/module-1/jupyter_notebooks.html +++ b/chapters/module-1/jupyter_notebooks.html @@ -210,6 +210,21 @@
  • NumPy (Part II)
  • Introduction to Pandas
  • Pandas: Data Exploration
  • +
  • Pandas: Data Manipulation
  • +
  • Pandas: Advanced Data Manipulation and Aggregation
  • +
  • Pandas: Introduction to Feature Engineering
  • + +

    Module 5: R

    + +

    Wrapping up

    + diff --git a/chapters/module-1/programming.html b/chapters/module-1/programming.html index 3b639d4..c1e352a 100644 --- a/chapters/module-1/programming.html +++ b/chapters/module-1/programming.html @@ -210,6 +210,21 @@
  • NumPy (Part II)
  • Introduction to Pandas
  • Pandas: Data Exploration
  • +
  • Pandas: Data Manipulation
  • +
  • Pandas: Advanced Data Manipulation and Aggregation
  • +
  • Pandas: Introduction to Feature Engineering
  • + +

    Module 5: R

    + +

    Wrapping up

    + diff --git a/chapters/module-1/tech_stack.html b/chapters/module-1/tech_stack.html index c5c9ac4..17a2773 100644 --- a/chapters/module-1/tech_stack.html +++ b/chapters/module-1/tech_stack.html @@ -210,6 +210,21 @@
  • NumPy (Part II)
  • Introduction to Pandas
  • Pandas: Data Exploration
  • +
  • Pandas: Data Manipulation
  • +
  • Pandas: Advanced Data Manipulation and Aggregation
  • +
  • Pandas: Introduction to Feature Engineering
  • + +

    Module 5: R

    + +

    Wrapping up

    + diff --git a/chapters/module-1/your_first_program.html b/chapters/module-1/your_first_program.html index f6545a5..c3cff9d 100644 --- a/chapters/module-1/your_first_program.html +++ b/chapters/module-1/your_first_program.html @@ -210,6 +210,21 @@
  • NumPy (Part II)
  • Introduction to Pandas
  • Pandas: Data Exploration
  • +
  • Pandas: Data Manipulation
  • +
  • Pandas: Advanced Data Manipulation and Aggregation
  • +
  • Pandas: Introduction to Feature Engineering
  • + +

    Module 5: R

    + +

    Wrapping up

    + diff --git a/chapters/module-2/02-cover.html b/chapters/module-2/02-cover.html index 203b94c..e0d62c5 100644 --- a/chapters/module-2/02-cover.html +++ b/chapters/module-2/02-cover.html @@ -210,6 +210,21 @@
  • NumPy (Part II)
  • Introduction to Pandas
  • Pandas: Data Exploration
  • +
  • Pandas: Data Manipulation
  • +
  • Pandas: Advanced Data Manipulation and Aggregation
  • +
  • Pandas: Introduction to Feature Engineering
  • + +

    Module 5: R

    + +

    Wrapping up

    + diff --git a/chapters/module-2/021-variables.html b/chapters/module-2/021-variables.html index 26e9f51..b85867e 100644 --- a/chapters/module-2/021-variables.html +++ b/chapters/module-2/021-variables.html @@ -210,6 +210,21 @@
  • NumPy (Part II)
  • Introduction to Pandas
  • Pandas: Data Exploration
  • +
  • Pandas: Data Manipulation
  • +
  • Pandas: Advanced Data Manipulation and Aggregation
  • +
  • Pandas: Introduction to Feature Engineering
  • + +

    Module 5: R

    + +

    Wrapping up

    + diff --git a/chapters/module-2/022-operators.html b/chapters/module-2/022-operators.html index d5171e2..cfa7406 100644 --- a/chapters/module-2/022-operators.html +++ b/chapters/module-2/022-operators.html @@ -210,6 +210,21 @@
  • NumPy (Part II)
  • Introduction to Pandas
  • Pandas: Data Exploration
  • +
  • Pandas: Data Manipulation
  • +
  • Pandas: Advanced Data Manipulation and Aggregation
  • +
  • Pandas: Introduction to Feature Engineering
  • + +

    Module 5: R

    + +

    Wrapping up

    + diff --git a/chapters/module-2/023-strings.html b/chapters/module-2/023-strings.html index f9f49b8..ca1afc3 100644 --- a/chapters/module-2/023-strings.html +++ b/chapters/module-2/023-strings.html @@ -210,6 +210,21 @@
  • NumPy (Part II)
  • Introduction to Pandas
  • Pandas: Data Exploration
  • +
  • Pandas: Data Manipulation
  • +
  • Pandas: Advanced Data Manipulation and Aggregation
  • +
  • Pandas: Introduction to Feature Engineering
  • + +

    Module 5: R

    + +

    Wrapping up

    + diff --git a/chapters/module-2/024-structures.html b/chapters/module-2/024-structures.html index b504c8d..cba1a4e 100644 --- a/chapters/module-2/024-structures.html +++ b/chapters/module-2/024-structures.html @@ -210,6 +210,21 @@
  • NumPy (Part II)
  • Introduction to Pandas
  • Pandas: Data Exploration
  • +
  • Pandas: Data Manipulation
  • +
  • Pandas: Advanced Data Manipulation and Aggregation
  • +
  • Pandas: Introduction to Feature Engineering
  • + +

    Module 5: R

    + +

    Wrapping up

    + diff --git a/chapters/module-2/0241-structures_exercises.html b/chapters/module-2/0241-structures_exercises.html index 25bf667..79ba597 100644 --- a/chapters/module-2/0241-structures_exercises.html +++ b/chapters/module-2/0241-structures_exercises.html @@ -210,6 +210,21 @@
  • NumPy (Part II)
  • Introduction to Pandas
  • Pandas: Data Exploration
  • +
  • Pandas: Data Manipulation
  • +
  • Pandas: Advanced Data Manipulation and Aggregation
  • +
  • Pandas: Introduction to Feature Engineering
  • + +

    Module 5: R

    + +

    Wrapping up

    + diff --git a/chapters/module-2/025-conditional.html b/chapters/module-2/025-conditional.html index e775456..24819bf 100644 --- a/chapters/module-2/025-conditional.html +++ b/chapters/module-2/025-conditional.html @@ -210,6 +210,21 @@
  • NumPy (Part II)
  • Introduction to Pandas
  • Pandas: Data Exploration
  • +
  • Pandas: Data Manipulation
  • +
  • Pandas: Advanced Data Manipulation and Aggregation
  • +
  • Pandas: Introduction to Feature Engineering
  • + +

    Module 5: R

    + +

    Wrapping up

    + diff --git a/chapters/module-2/0251-conditional_exercises.html b/chapters/module-2/0251-conditional_exercises.html index cecd267..c0488c4 100644 --- a/chapters/module-2/0251-conditional_exercises.html +++ b/chapters/module-2/0251-conditional_exercises.html @@ -210,6 +210,21 @@
  • NumPy (Part II)
  • Introduction to Pandas
  • Pandas: Data Exploration
  • +
  • Pandas: Data Manipulation
  • +
  • Pandas: Advanced Data Manipulation and Aggregation
  • +
  • Pandas: Introduction to Feature Engineering
  • + +

    Module 5: R

    + +

    Wrapping up

    + diff --git a/chapters/module-2/026-iterables_and_iterators.html b/chapters/module-2/026-iterables_and_iterators.html index d2ff0f2..e4a38cf 100644 --- a/chapters/module-2/026-iterables_and_iterators.html +++ b/chapters/module-2/026-iterables_and_iterators.html @@ -210,6 +210,21 @@
  • NumPy (Part II)
  • Introduction to Pandas
  • Pandas: Data Exploration
  • +
  • Pandas: Data Manipulation
  • +
  • Pandas: Advanced Data Manipulation and Aggregation
  • +
  • Pandas: Introduction to Feature Engineering
  • + +

    Module 5: R

    + +

    Wrapping up

    + diff --git a/chapters/module-2/0261-functions_exercises.html b/chapters/module-2/0261-functions_exercises.html index 7e8de7a..e3c5393 100644 --- a/chapters/module-2/0261-functions_exercises.html +++ b/chapters/module-2/0261-functions_exercises.html @@ -210,6 +210,21 @@
  • NumPy (Part II)
  • Introduction to Pandas
  • Pandas: Data Exploration
  • +
  • Pandas: Data Manipulation
  • +
  • Pandas: Advanced Data Manipulation and Aggregation
  • +
  • Pandas: Introduction to Feature Engineering
  • + +

    Module 5: R

    + +

    Wrapping up

    + diff --git a/chapters/module-2/027-functions.html b/chapters/module-2/027-functions.html index e4a69c7..f8588a9 100644 --- a/chapters/module-2/027-functions.html +++ b/chapters/module-2/027-functions.html @@ -210,6 +210,21 @@
  • NumPy (Part II)
  • Introduction to Pandas
  • Pandas: Data Exploration
  • +
  • Pandas: Data Manipulation
  • +
  • Pandas: Advanced Data Manipulation and Aggregation
  • +
  • Pandas: Introduction to Feature Engineering
  • + +

    Module 5: R

    + +

    Wrapping up

    + diff --git a/chapters/module-3/029-packages.html b/chapters/module-3/029-packages.html index 0b337c6..a11959f 100644 --- a/chapters/module-3/029-packages.html +++ b/chapters/module-3/029-packages.html @@ -210,6 +210,21 @@
  • NumPy (Part II)
  • Introduction to Pandas
  • Pandas: Data Exploration
  • +
  • Pandas: Data Manipulation
  • +
  • Pandas: Advanced Data Manipulation and Aggregation
  • +
  • Pandas: Introduction to Feature Engineering
  • + +

    Module 5: R

    + +

    Wrapping up

    + @@ -646,7 +661,9 @@

    Images are Numerical Data
    Requirement already satisfied: fonttools>=4.22.0 in /home/javi/anaconda3/lib/python3.11/site-packages (from matplotlib) (4.51.0)
     Requirement already satisfied: kiwisolver>=1.0.1 in /home/javi/anaconda3/lib/python3.11/site-packages (from matplotlib) (1.4.4)
     Requirement already satisfied: pyparsing>=2.3.1 in /home/javi/anaconda3/lib/python3.11/site-packages (from matplotlib) (3.0.9)
     Requirement already satisfied: python-dateutil>=2.7 in /home/javi/anaconda3/lib/python3.11/site-packages (from matplotlib) (2.9.0.post0)
    diff --git a/chapters/module-3/03-cover.html b/chapters/module-3/03-cover.html
    index df9132d..49ceed2 100644
    --- a/chapters/module-3/03-cover.html
    +++ b/chapters/module-3/03-cover.html
    @@ -210,6 +210,21 @@
     
  • NumPy (Part II)
  • Introduction to Pandas
  • Pandas: Data Exploration
  • +
  • Pandas: Data Manipulation
  • +
  • Pandas: Advanced Data Manipulation and Aggregation
  • +
  • Pandas: Introduction to Feature Engineering
  • + +

    Module 5: R

    + +

    Wrapping up

    +
    diff --git a/chapters/module-3/031-errors_and_exceptions.html b/chapters/module-3/031-errors_and_exceptions.html index 19f36da..46a15f7 100644 --- a/chapters/module-3/031-errors_and_exceptions.html +++ b/chapters/module-3/031-errors_and_exceptions.html @@ -210,6 +210,21 @@
  • NumPy (Part II)
  • Introduction to Pandas
  • Pandas: Data Exploration
  • +
  • Pandas: Data Manipulation
  • +
  • Pandas: Advanced Data Manipulation and Aggregation
  • +
  • Pandas: Introduction to Feature Engineering
  • + +

    Module 5: R

    + +

    Wrapping up

    + diff --git a/chapters/module-3/031-errors_and_exceptions_w_sols.html b/chapters/module-3/031-errors_and_exceptions_w_sols.html index 7cdf663..fbd78f3 100644 --- a/chapters/module-3/031-errors_and_exceptions_w_sols.html +++ b/chapters/module-3/031-errors_and_exceptions_w_sols.html @@ -210,6 +210,21 @@
  • NumPy (Part II)
  • Introduction to Pandas
  • Pandas: Data Exploration
  • +
  • Pandas: Data Manipulation
  • +
  • Pandas: Advanced Data Manipulation and Aggregation
  • +
  • Pandas: Introduction to Feature Engineering
  • + +

    Module 5: R

    + +

    Wrapping up

    + diff --git a/chapters/module-3/032-classes.html b/chapters/module-3/032-classes.html index a3f8838..ce8cf40 100644 --- a/chapters/module-3/032-classes.html +++ b/chapters/module-3/032-classes.html @@ -210,6 +210,21 @@
  • NumPy (Part II)
  • Introduction to Pandas
  • Pandas: Data Exploration
  • +
  • Pandas: Data Manipulation
  • +
  • Pandas: Advanced Data Manipulation and Aggregation
  • +
  • Pandas: Introduction to Feature Engineering
  • + +

    Module 5: R

    + +

    Wrapping up

    + diff --git a/chapters/module-3/033-reading_writing_files.html b/chapters/module-3/033-reading_writing_files.html index 256fd39..1abca80 100644 --- a/chapters/module-3/033-reading_writing_files.html +++ b/chapters/module-3/033-reading_writing_files.html @@ -210,6 +210,21 @@
  • NumPy (Part II)
  • Introduction to Pandas
  • Pandas: Data Exploration
  • +
  • Pandas: Data Manipulation
  • +
  • Pandas: Advanced Data Manipulation and Aggregation
  • +
  • Pandas: Introduction to Feature Engineering
  • + +

    Module 5: R

    + +

    Wrapping up

    + diff --git a/chapters/module-3/lab-recursion.html b/chapters/module-3/lab-recursion.html index 27e2e68..a978a54 100644 --- a/chapters/module-3/lab-recursion.html +++ b/chapters/module-3/lab-recursion.html @@ -210,6 +210,21 @@
  • NumPy (Part II)
  • Introduction to Pandas
  • Pandas: Data Exploration
  • +
  • Pandas: Data Manipulation
  • +
  • Pandas: Advanced Data Manipulation and Aggregation
  • +
  • Pandas: Introduction to Feature Engineering
  • + +

    Module 5: R

    + +

    Wrapping up

    + diff --git a/chapters/module-4/041-numpyI.html b/chapters/module-4/041-numpyI.html index f7a8955..1cdf8c9 100644 --- a/chapters/module-4/041-numpyI.html +++ b/chapters/module-4/041-numpyI.html @@ -210,6 +210,21 @@
  • NumPy (Part II)
  • Introduction to Pandas
  • Pandas: Data Exploration
  • +
  • Pandas: Data Manipulation
  • +
  • Pandas: Advanced Data Manipulation and Aggregation
  • +
  • Pandas: Introduction to Feature Engineering
  • + +

    Module 5: R

    + +

    Wrapping up

    + @@ -483,7 +498,7 @@

    The ndarray object -
    0.9983151051472091 <class 'float'>
    +
    1.5106196181630014 <class 'float'>
     
    @@ -497,8 +512,8 @@

    The ndarray object -
    [[-0.16765433  0.29513178  0.46948154]
    - [ 0.37200198 -1.24104694  0.01450445]] <class 'numpy.ndarray'>
    +
    [[ 0.50357981  1.60435582  0.06693117]
    + [-0.76137145  1.07623844 -0.00815216]] <class 'numpy.ndarray'>
     
    @@ -511,8 +526,8 @@

    The ndarray object -
    array([[ -1.67654327,   2.95131775,   4.69481541],
    -       [  3.7200198 , -12.41046939,   0.14504455]])
    +
    array([[ 5.03579813, 16.04355815,  0.66931166],
    +       [-7.61371454, 10.76238442, -0.08152162]])
     
    @@ -527,10 +542,10 @@

    The ndarray object -
    [[-0.33530865  0.59026355  0.93896308]
    - [ 0.74400396 -2.48209388  0.02900891]]
    -[[-0.33530865  0.59026355  0.93896308]
    - [ 0.74400396 -2.48209388  0.02900891]]
    +
    [[ 1.00715963  3.20871163  0.13386233]
    + [-1.52274291  2.15247688 -0.01630432]]
    +[[ 1.00715963  3.20871163  0.13386233]
    + [-1.52274291  2.15247688 -0.01630432]]
     
    @@ -558,9 +573,9 @@

    The ndarray object -
    [[-0.16765433  0.29513178]
    - [ 0.46948154  0.37200198]
    - [-1.24104694  0.01450445]]
    +
    [[ 0.50357981  1.60435582]
    + [ 0.06693117 -0.76137145]
    + [ 1.07623844 -0.00815216]]
     (3, 2)
     
    @@ -847,11 +862,11 @@

    Creating ndarrays - @@ -819,13 +834,13 @@

    Boolean slicing
    ['Bob' 'Joe' 'Will' 'Bob' 'Will' 'Joe' 'Joe']
    -[[-0.36748832  0.41549724 -0.07892885  1.03054864]
    - [ 2.1515256   0.0252963  -0.50860192 -0.81510486]
    - [-1.50438063  1.34625813  0.19001997 -2.75474839]
    - [-2.02413974 -0.40117379 -1.11674174 -0.04430263]
    - [-1.68668132  0.4919675   0.46076492  1.68898003]
    - [ 2.05100384  2.15532653  0.63420939 -0.05512468]
    - [ 0.16083873 -2.08108768  0.47970436 -1.7483949 ]]
    +[[-0.7237047  -0.50762254 -0.48204371 -1.6133627 ]
    + [ 1.69705017  0.80812228 -0.66722951  1.33247531]
    + [-2.81200144  0.89421787 -0.02931852 -1.05156439]
    + [-1.06524316 -1.54260722  0.02495081  0.20867737]
    + [-0.14302032  0.81352009 -0.07620839  0.59158928]
    + [ 2.03311889 -0.3202367  -0.01237379 -1.41072045]
    + [-0.56663455  1.16679853 -1.34970156 -0.41449339]]
     

    @@ -864,8 +879,8 @@

    Boolean slicing -
    array([[-0.36748832,  0.41549724, -0.07892885,  1.03054864],
    -       [-2.02413974, -0.40117379, -1.11674174, -0.04430263]])
    +
    array([[-0.7237047 , -0.50762254, -0.48204371, -1.6133627 ],
    +       [-1.06524316, -1.54260722,  0.02495081,  0.20867737]])
     
    @@ -878,8 +893,8 @@

    Boolean slicing -
    array([[-0.07892885,  1.03054864],
    -       [-1.11674174, -0.04430263]])
    +
    array([[-0.48204371, -1.6133627 ],
    +       [ 0.02495081,  0.20867737]])
     
    @@ -894,11 +909,11 @@

    Boolean slicing -
    array([[ 2.1515256 ,  0.0252963 , -0.50860192, -0.81510486],
    -       [-1.50438063,  1.34625813,  0.19001997, -2.75474839],
    -       [-1.68668132,  0.4919675 ,  0.46076492,  1.68898003],
    -       [ 2.05100384,  2.15532653,  0.63420939, -0.05512468],
    -       [ 0.16083873, -2.08108768,  0.47970436, -1.7483949 ]])
    +
    array([[ 1.69705017,  0.80812228, -0.66722951,  1.33247531],
    +       [-2.81200144,  0.89421787, -0.02931852, -1.05156439],
    +       [-0.14302032,  0.81352009, -0.07620839,  0.59158928],
    +       [ 2.03311889, -0.3202367 , -0.01237379, -1.41072045],
    +       [-0.56663455,  1.16679853, -1.34970156, -0.41449339]])
     
    @@ -911,11 +926,11 @@

    Boolean slicing - @@ -949,12 +964,12 @@

    Boolean slicing
    array([[ 7.        ,  7.        ,  7.        ,  7.        ],
    -       [ 2.1515256 ,  0.0252963 , -0.50860192, -0.81510486],
    +       [ 1.69705017,  0.80812228, -0.66722951,  1.33247531],
            [ 7.        ,  7.        ,  7.        ,  7.        ],
            [ 7.        ,  7.        ,  7.        ,  7.        ],
            [ 7.        ,  7.        ,  7.        ,  7.        ],
    -       [ 2.05100384,  2.15532653,  0.63420939, -0.05512468],
    -       [ 0.16083873, -2.08108768,  0.47970436, -1.7483949 ]])
    +       [ 2.03311889, -0.3202367 , -0.01237379, -1.41072045],
    +       [-0.56663455,  1.16679853, -1.34970156, -0.41449339]])
     

    @@ -1306,8 +1321,8 @@

    More useful calculations -
    0x7f6150364bd0
    -0x7f6150365110
    +
    0x7f73825a3450
    +0x7f735b82c030
     
    @@ -1586,7 +1601,7 @@

    More useful calculations -
    @@ -1463,7 +1478,7 @@

    An introduction to some attributes and methods - @@ -1340,7 +1355,7 @@

    Summarizing data -
    @@ -1337,7 +1353,7 @@

    Summarizing data - diff --git a/chapters/module-4/045-PandasIII-manipulation.html b/chapters/module-4/045-PandasIII-manipulation.html index c3512f9..6e8cc1c 100644 --- a/chapters/module-4/045-PandasIII-manipulation.html +++ b/chapters/module-4/045-PandasIII-manipulation.html @@ -32,9 +32,9 @@ - + - + @@ -61,6 +61,7 @@ + @@ -210,6 +211,20 @@
  • Introduction to Pandas
  • Pandas: Data Exploration
  • Pandas: Data Manipulation
  • +
  • Pandas: Advanced Data Manipulation and Aggregation
  • +
  • Pandas: Introduction to Feature Engineering
  • + +

    Module 5: R

    + +

    Wrapping up

    +

    @@ -3907,6 +3922,15 @@

    Practice exercisesPandas: Data Exploration

    + +
    +

    next

    +

    Pandas: Advanced Data Manipulation and Aggregation

    +
    + +

    diff --git a/chapters/module-4/045-PandasIII-manipulation_sols.html b/chapters/module-4/045-PandasIII-manipulation_sols.html index 339c22a..07c5271 100644 --- a/chapters/module-4/045-PandasIII-manipulation_sols.html +++ b/chapters/module-4/045-PandasIII-manipulation_sols.html @@ -32,9 +32,9 @@ - + - + @@ -217,12 +217,14 @@

    Module 5: R

    +

    Wrapping up

    +

    diff --git a/chapters/module-4/046-PandasIII-Merging_Concatenating_Aggregating.html b/chapters/module-4/046-PandasIII-Merging_Concatenating_Aggregating.html deleted file mode 100644 index 511e924..0000000 --- a/chapters/module-4/046-PandasIII-Merging_Concatenating_Aggregating.html +++ /dev/null @@ -1,1109 +0,0 @@ - - - - - - - - - - - Concatenating and Merging — DS-1002 Programming for Data Science - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    - - - - - - - - - - -
    -
    -
    -
    -
    - - - - -
    -
    - - - - - -
    - - - - - - - - - - - - - -
    - -
    - - - -
    - -
    -
    - -
    -
    - -
    - -
    - -
    - - -
    - -
    - -
    - - - - - - - - - - - - - - - - - - - -
    - -
    - -
    -
    - - - - - - - - -
    - -
    -
    -
    import pandas as pd
    -import numpy as np
    -
    -
    -
    -
    -
    -
    -
    iris_df = pd.read_csv("https://raw.githubusercontent.com/mwaskom/seaborn-data/refs/heads/master/iris.csv")
    -iris_df
    -
    -
    -
    -
    -
    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    sepal_lengthsepal_widthpetal_lengthpetal_widthspecies
    05.13.51.40.2setosa
    14.93.01.40.2setosa
    24.73.21.30.2setosa
    34.63.11.50.2setosa
    45.03.61.40.2setosa
    ..................
    1456.73.05.22.3virginica
    1466.32.55.01.9virginica
    1476.53.05.22.0virginica
    1486.23.45.42.3virginica
    1495.93.05.11.8virginica
    -

    150 rows × 5 columns

    -
    -
    -
    -

    Concatenating and Merging#

    -
    -

    Concate: pd.concat()#

    -

    Concatenate pandas objects along an axis

    -

    Details

    -

    Create two dfs and vertically stack them

    -
    -
    -
    df1 = pd.DataFrame(np.random.randn(3, 4))
    -df2 = pd.DataFrame(np.random.randn(3, 4))
    -
    -print(df1)
    -print('-'*45)
    -print(df2)
    -
    -df3 = pd.concat([df1, df2], axis=0)
    -
    -print('-'*45)
    -print(df3)
    -
    -
    -
    -
    -
              0         1         2         3
    -0 -0.974224 -0.361428 -1.260157 -0.425635
    -1  0.917031 -0.046927  1.048192  0.075938
    -2  0.624287 -0.835596 -2.225162 -2.354256
    ----------------------------------------------
    -          0         1         2         3
    -0 -0.367434 -2.306643 -1.030095 -0.373502
    -1 -0.179604 -1.704118  0.127096  0.098003
    -2  2.444379  0.584522 -0.991921 -0.355007
    ----------------------------------------------
    -          0         1         2         3
    -0 -0.974224 -0.361428 -1.260157 -0.425635
    -1  0.917031 -0.046927  1.048192  0.075938
    -2  0.624287 -0.835596 -2.225162 -2.354256
    -0 -0.367434 -2.306643 -1.030095 -0.373502
    -1 -0.179604 -1.704118  0.127096  0.098003
    -2  2.444379  0.584522 -0.991921 -0.355007
    -
    -
    -
    -
    -

    Concat columns
    -This assumes that the indexes represent IDs of specific things or events

    -
    -
    -
    df4 = pd.concat([df1,df2], axis = 1, keys = ['foo', 'bar'])
    -
    -df4
    -
    -
    -
    -
    -
    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    foobar
    01230123
    0-0.974224-0.361428-1.260157-0.425635-0.367434-2.306643-1.030095-0.373502
    10.917031-0.0469271.0481920.075938-0.179604-1.7041180.1270960.098003
    20.624287-0.835596-2.225162-2.3542562.4443790.584522-0.991921-0.355007
    -
    -
    -
    -
    -
    df4.foo
    -
    -
    -
    -
    -
    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    0123
    0-0.974224-0.361428-1.260157-0.425635
    10.917031-0.0469271.0481920.075938
    20.624287-0.835596-2.225162-2.354256
    -
    -
    -
    -
    -
    df4.bar
    -
    -
    -
    -
    -
    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    0123
    0-0.367434-2.306643-1.030095-0.373502
    1-0.179604-1.7041180.1270960.098003
    22.4443790.584522-0.991921-0.355007
    -
    -
    -
    -
    -

    merge: merge()#

    -

    SQL-style joining of tables (DataFrames)

    -

    Important parameters include:

    -
      -
    • how : type of merge {‘left’, ‘right’, ‘outer’, ‘inner’, ‘cross’}, default ‘inner’

    • -
    • on : names to join on

    • -
    -

    Details

    -

    Very useful!

    -

    Create two tables, left and right. Then right join them on key.
    -Right join means include all records from table on right.
    -The key is used for matching up the records.

    -
    -
    -
    left = pd.DataFrame({"key": ["jamie", "bill"], "lval": [15, 22]})
    -right = pd.DataFrame({"key": ["jamie", "bill", "asher"], "rval": [4, 5, 8]})
    -
    -joined = pd.merge(left, right, on="key", how="right")
    -
    -print('---left')
    -print(left)
    -print('\n---right')
    -print(right)
    -print('\n---joined')
    -print(joined)
    -
    -
    -
    -
    -
    ---left
    -     key  lval
    -0  jamie    15
    -1   bill    22
    -
    ----right
    -     key  rval
    -0  jamie     4
    -1   bill     5
    -2  asher     8
    -
    ----joined
    -     key  lval  rval
    -0  jamie  15.0     4
    -1   bill  22.0     5
    -2  asher   NaN     8
    -
    -
    -
    -
    -
    -
    -
    -

    Summary#

    -
      -
    • Use join if you have shared indexes

    • -
    • Use merge if you do not have shared indexes

    • -
    • Use concat to combine based on shared indexes or columns

    • -
    -
    -
    -

    Data Aggregation#

    -

    Involves one or more of:

    -
      -
    • splitting the data into groups

    • -
    • applying a function to each group

    • -
    • combining results

    • -
    -
    -

    Aggregation by .groupby()#

    -

    Compute mean of each column, grouped (separately) by species

    -
    -
    -
    iris_df.groupby("species").mean()
    -
    -
    -
    -
    -
    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    sepal_lengthsepal_widthpetal_lengthpetal_width
    species
    setosa5.0063.4281.4620.246
    versicolor5.9362.7704.2601.326
    virginica6.5882.9745.5522.026
    -
    -
    -
    -
    -

    pd.pivot_table()#

    -

    Apply a function aggfunc to selected values grouped by columns

    -

    Details

    -

    Compute mean sepal length for each species:

    -
    -
    -
    pd.pivot_table(iris_df, values="sepal_length", columns=["species"], aggfunc = np.mean)
    -
    -
    -
    -
    -
    - - - - - - - - - - - - - - - - - - -
    speciessetosaversicolorvirginica
    sepal_length5.0065.9366.588
    -
    -
    -
    -
    -
    -

    Reshaping Data#

    -
    -
    -

    .reshape()#

    -

    Changes the object’s shape

    -

    We illustrate creating pandas Series, extracting array of length 6, and reshaping to 3x2 array.

    -
    -
    -
    # create a series
    -ser = pd.Series([1, 1, 2, 3, 5, 8])
    -
    -# extract values
    -vals = ser.values
    -
    -print('orig data:', vals)
    -print('orig type:', type(vals))
    -print('orig shape:', vals.shape)
    -
    -# reshaping series
    -reshaped_vals = vals.reshape((3, 2))
    -
    -print('\n reshaped vals:')
    -print(reshaped_vals)
    -print('\n new type:', type(reshaped_vals))
    -print('new shape:', reshaped_vals.shape)
    -
    -
    -
    -
    -
    orig data: [1 1 2 3 5 8]
    -orig type: <class 'numpy.ndarray'>
    -orig shape: (6,)
    -
    - reshaped vals:
    -[[1 1]
    - [2 3]
    - [5 8]]
    -
    - new type: <class 'numpy.ndarray'>
    -new shape: (3, 2)
    -
    -
    -
    -
    -
    - - - - -
    - - - - - - -
    - -
    -
    -
    - -
    - - - - - - -
    -
    - - -
    - - -
    -
    -
    - - - - - -
    -
    - - \ No newline at end of file diff --git a/chapters/module-4/046-PandasIV-advanced_manipulation_aggregation.html b/chapters/module-4/046-PandasIV-advanced_manipulation_aggregation.html index e5edf2f..7fb79a5 100644 --- a/chapters/module-4/046-PandasIV-advanced_manipulation_aggregation.html +++ b/chapters/module-4/046-PandasIV-advanced_manipulation_aggregation.html @@ -32,9 +32,9 @@ - + - + @@ -213,6 +213,18 @@
  • Pandas: Data Manipulation
  • Pandas: Advanced Data Manipulation and Aggregation
  • Pandas: Introduction to Feature Engineering
  • + +

    Module 5: R

    + +

    Wrapping up

    +

    @@ -564,22 +576,22 @@

    Concate: pd.con

              a         b         c         d
    -0  2.185435  1.002872 -0.029634  2.028856
    -1 -0.875013 -1.323426 -0.517671 -0.519281
    -2 -0.655456  0.708395 -0.419140  1.257233
    +0  0.174450 -1.018138  0.968461 -0.412483
    +1  0.703028 -0.594527  1.997723 -1.159000
    +2 -0.415667  0.290691  1.538168  0.236634
     ---------------------------------------------
               a         b         c         d
    -0 -0.640966 -2.015608 -0.241571 -0.303553
    -1 -1.782508 -0.317744 -0.998355 -0.939285
    -2  0.455360 -0.545225  0.459453  0.282877
    +0  1.141372  0.539766  0.924608 -1.013957
    +1  1.333616 -0.966171  1.258023  0.185296
    +2 -0.060995  0.330361 -0.710879 -0.408728
     ---------------------------------------------
               a         b         c         d
    -0  2.185435  1.002872 -0.029634  2.028856
    -1 -0.875013 -1.323426 -0.517671 -0.519281
    -2 -0.655456  0.708395 -0.419140  1.257233
    -0 -0.640966 -2.015608 -0.241571 -0.303553
    -1 -1.782508 -0.317744 -0.998355 -0.939285
    -2  0.455360 -0.545225  0.459453  0.282877
    +0  0.174450 -1.018138  0.968461 -0.412483
    +1  0.703028 -0.594527  1.997723 -1.159000
    +2 -0.415667  0.290691  1.538168  0.236634
    +0  1.141372  0.539766  0.924608 -1.013957
    +1  1.333616 -0.966171  1.258023  0.185296
    +2 -0.060995  0.330361 -0.710879 -0.408728
     
    @@ -632,33 +644,33 @@

    Concate: pd.con 0 - 0.189000 - -1.051835 - 0.675315 - 0.983315 - -0.667326 - -0.367171 - -0.650382 + 0.358929 + -0.851815 + 0.313507 + 0.727029 + 0.732533 + -1.176725 + 0.359086 1 - -0.819721 - 0.084818 - 0.254183 - 0.300219 - 1.210534 - 0.201252 - -0.409164 + 1.212873 + -1.549474 + 0.040366 + 0.525790 + 1.353637 + 1.343218 + -0.161630 2 - -0.911595 - -1.838310 - -0.582055 - -0.404169 - 1.912954 - 0.288270 - 1.105270 + -0.344516 + -0.645047 + -0.412381 + 0.107239 + -0.375055 + -0.723952 + 0.493891 diff --git a/chapters/module-4/047-PandasV-Intro_Feature_Engineering.html b/chapters/module-4/047-PandasV-Intro_Feature_Engineering.html index 79e5910..ae6ae5c 100644 --- a/chapters/module-4/047-PandasV-Intro_Feature_Engineering.html +++ b/chapters/module-4/047-PandasV-Intro_Feature_Engineering.html @@ -32,9 +32,9 @@ - + - + @@ -61,6 +61,7 @@ + @@ -212,6 +213,18 @@
  • Pandas: Data Manipulation
  • Pandas: Advanced Data Manipulation and Aggregation
  • Pandas: Introduction to Feature Engineering
  • + +

    Module 5: R

    + +

    Wrapping up

    +

    @@ -1411,6 +1424,15 @@

    Practice exercisesPandas: Advanced Data Manipulation and Aggregation

    + +
    +

    next

    +

    Introduction to R

    +
    + +

    diff --git a/chapters/module-4/07-numpy-continued.html b/chapters/module-4/07-numpy-continued.html index c5be5d4..5787685 100644 --- a/chapters/module-4/07-numpy-continued.html +++ b/chapters/module-4/07-numpy-continued.html @@ -210,6 +210,21 @@
  • NumPy (Part II)
  • Introduction to Pandas
  • Pandas: Data Exploration
  • +
  • Pandas: Data Manipulation
  • +
  • Pandas: Advanced Data Manipulation and Aggregation
  • +
  • Pandas: Introduction to Feature Engineering
  • + +

    Module 5: R

    + +

    Wrapping up

    +

    diff --git a/chapters/module-4/Untitled.html b/chapters/module-4/Untitled.html index c6e16b2..25e9e49 100644 --- a/chapters/module-4/Untitled.html +++ b/chapters/module-4/Untitled.html @@ -210,6 +210,21 @@
  • NumPy (Part II)
  • Introduction to Pandas
  • Pandas: Data Exploration
  • +
  • Pandas: Data Manipulation
  • +
  • Pandas: Advanced Data Manipulation and Aggregation
  • +
  • Pandas: Introduction to Feature Engineering
  • + +

    Module 5: R

    + +

    Wrapping up

    + diff --git a/chapters/module-4/Untitled1.html b/chapters/module-4/Untitled1.html index 1ada748..9e46511 100644 --- a/chapters/module-4/Untitled1.html +++ b/chapters/module-4/Untitled1.html @@ -32,9 +32,9 @@ - + - + @@ -213,6 +213,18 @@
  • Pandas: Data Manipulation
  • Pandas: Advanced Data Manipulation and Aggregation
  • Pandas: Introduction to Feature Engineering
  • + +

    Module 5: R

    + +

    Wrapping up

    + diff --git a/chapters/module-5/051-intro_to_R.html b/chapters/module-5/051-intro_to_R.html index a4d03f1..bbcc470 100644 --- a/chapters/module-5/051-intro_to_R.html +++ b/chapters/module-5/051-intro_to_R.html @@ -32,9 +32,9 @@ - + - + @@ -223,6 +223,10 @@
  • Control Structures
  • Functions
  • R for data science: Tidyverse
  • + +

    Wrapping up

    + @@ -424,7 +428,7 @@

    Introduction to R -../../_images/404f063734e2f5dcb1228730ff1f99d7066229bac488649789a0c8d01fc3ca2b.png +../../_images/267a46e09f7430ad3fccb6b619c25d6382d080de21e885f22ad6dcdbd24172b9.png

    The standard normal distribution is a special case of a Gaussian distribution with a mean (average location) of 0 and a standard deviation (average dispersion) of 1. The Gaussian distribution has a characteristic bell shape, as shown in the histogram below, produced with R’s hist function.

    @@ -435,7 +439,7 @@

    Introduction to R -../../_images/f51cf89b081544448e064cb8ce537e13c1baa630e0be650380df0813287266cf.png +../../_images/e56d5aba8799217f6afe138ed710ed222a315c174f283c71639551057f692abb.png
    diff --git a/chapters/module-5/052-data-structures.html b/chapters/module-5/052-data-structures.html index bea2ed5..85b0fef 100644 --- a/chapters/module-5/052-data-structures.html +++ b/chapters/module-5/052-data-structures.html @@ -32,9 +32,9 @@ - + - + @@ -222,7 +222,11 @@
  • Data Structures
  • Control Structures
  • Functions
  • -
  • R for data science
  • +
  • R for data science: Tidyverse
  • + +

    Wrapping up

    + diff --git a/chapters/module-5/053-Control-structures-sols.html b/chapters/module-5/053-Control-structures-sols.html index 76b3153..b56b4d2 100644 --- a/chapters/module-5/053-Control-structures-sols.html +++ b/chapters/module-5/053-Control-structures-sols.html @@ -32,9 +32,9 @@ - + - + @@ -219,12 +219,14 @@

    Module 5: R

    +

    Wrapping up

    + diff --git a/chapters/module-5/053-Control-structures.html b/chapters/module-5/053-Control-structures.html index d095339..41cd08c 100644 --- a/chapters/module-5/053-Control-structures.html +++ b/chapters/module-5/053-Control-structures.html @@ -32,9 +32,9 @@ - + - + @@ -223,6 +223,10 @@
  • Control Structures
  • Functions
  • R for data science: Tidyverse
  • + +

    Wrapping up

    + diff --git a/chapters/module-5/054-functions-sols.html b/chapters/module-5/054-functions-sols.html index a49ac95..2dd457c 100644 --- a/chapters/module-5/054-functions-sols.html +++ b/chapters/module-5/054-functions-sols.html @@ -32,9 +32,9 @@ - + - + @@ -219,12 +219,14 @@

    Module 5: R

    +

    Wrapping up

    + @@ -612,8 +614,8 @@

    Built-in functions -
    1. 3.44318044976882
    2. 10.9381169236683
    3. 18.8495873534953
    4. 2.41687744073499
    5. 9.08140659032006
    6. 13.8851224456932
    7. 6.25564070974601
    8. 4.19221253135156
    9. 14.8982453260319
    10. 15.849869591281
    -
    9.98102593620911
    10.0097617569942
    +
    1. 17.1064024290231
    2. 4.50870321766152
    3. 13.9520701862171
    4. 5.69629448075557
    5. 4.41933655917822
    6. 3.41442028590302
    7. 16.6392789808824
    8. 9.2228576498917
    9. -3.18074141775111
    10. 16.2434599266992
    +
    8.80220822984607
    7.45957606532364

    Function help#

    diff --git a/chapters/module-5/054-functions.html b/chapters/module-5/054-functions.html index dc4eb7d..126d3a2 100644 --- a/chapters/module-5/054-functions.html +++ b/chapters/module-5/054-functions.html @@ -32,9 +32,9 @@ - + - + @@ -223,6 +223,10 @@
  • Control Structures
  • Functions
  • R for data science: Tidyverse
  • + +

    Wrapping up

    + @@ -605,8 +609,8 @@

    Built-in functions -
    1. 13.1963451166444
    2. 8.72797219132438
    3. 13.1711611847993
    4. 8.81686471098759
    5. 5.77526507710081
    6. 12.9391842738982
    7. 16.3681944750152
    8. 10.6781215679532
    9. 10.4963780372093
    10. 4.00038680441802
    -
    10.416987343935
    10.5872498025813
    +
    1. 17.3154600272022
    2. 2.33314450866007
    3. 2.65690099102852
    4. 17.8482261024962
    5. -0.877821425385072
    6. 16.0673664447932
    7. 11.2335652420367
    8. 13.3946342021372
    9. 1.33792095135807
    10. 10.1419139756298
    +
    9.1451311019957
    10.6877396088333

    Function help#

    diff --git a/chapters/module-5/055-tidyverse.html b/chapters/module-5/055-tidyverse.html index 6ece696..d7d203d 100644 --- a/chapters/module-5/055-tidyverse.html +++ b/chapters/module-5/055-tidyverse.html @@ -32,9 +32,9 @@ - + - + @@ -63,6 +63,7 @@ + @@ -222,6 +223,10 @@
  • Control Structures
  • Functions
  • R for data science: Tidyverse
  • + +

    Wrapping up

    + @@ -424,7 +429,7 @@

    Loading the tidyverse
    -
    ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
    +
    -
    ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
    +
                     
                   
    diff --git a/chapters/module-5/old/013-Control Structures.html b/chapters/module-5/old/013-Control Structures.html
    index 67e96d6..3dfb95c 100644
    --- a/chapters/module-5/old/013-Control Structures.html	
    +++ b/chapters/module-5/old/013-Control Structures.html	
    @@ -32,9 +32,9 @@
         
         
         
    -    
    +    
         
    -    
    +    
         
       
       
    @@ -217,12 +217,14 @@
     

    Module 5: R

    +

    Wrapping up

    +
    diff --git a/chapters/module-5/old/051-dataframes-in-r-student.html b/chapters/module-5/old/051-dataframes-in-r-student.html index 42dbc20..a597761 100644 --- a/chapters/module-5/old/051-dataframes-in-r-student.html +++ b/chapters/module-5/old/051-dataframes-in-r-student.html @@ -32,9 +32,9 @@ - + - + @@ -217,12 +217,14 @@

    Module 5: R

    +

    Wrapping up

    +
    diff --git a/chapters/module-5/old/051-dataframes-in-r.html b/chapters/module-5/old/051-dataframes-in-r.html index 18bc464..379a17b 100644 --- a/chapters/module-5/old/051-dataframes-in-r.html +++ b/chapters/module-5/old/051-dataframes-in-r.html @@ -32,9 +32,9 @@ - + - + @@ -219,12 +219,14 @@

    Module 5: R

    +

    Wrapping up

    +
    diff --git a/chapters/wrap-up.html b/chapters/wrap-up.html new file mode 100644 index 0000000..ef47b3b --- /dev/null +++ b/chapters/wrap-up.html @@ -0,0 +1,880 @@ + + + + + + + + + + + Wrapping up — DS-1002 Programming for Data Science + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    + + + + + + + + + + +
    +
    +
    +
    +
    + + + + +
    +
    + + + + + +
    + + + + + + + + + + + + + +
    + +
    + + + +
    + +
    +
    + +
    +
    + +
    + +
    + +
    + + +
    + +
    + +
    + + + + + + + + + + + + + + + + + + + +
    + +
    + +
    +
    + + + +
    +

    Wrapping up

    + +
    + +
    +
    + + + + +
    + +
    +

    Wrapping up#

    +

    Throughout this course, we have covered the most important foundational programming skills a future Data Scientist needs, with a particular emphasis on Python.

    +

    For both languages, we explored their syntax, different data types, and how to work with data structures. We also delved into implementing loops, functions, and even classes (which is uncommon in beginner programming courses). Additionally, we discussed basic data science operations in both languages, particularly focusing on how to inspect and interact with raw data.

    +

    Now, coming to the question of Python vs. R: which one should you choose? It’s entirely up to you—both are excellent tools, as we have emphasized throughout the course. Keep in mind that you can essentially achieve the same results in one language as in the other. For example, when it comes to data manipulation, see this comparison: Python vs R.

    +

    Here is my personal perspective though:

    +
      +
    • Python: Ideal for programmatic scenarios such as developing complex libraries, thanks to its versatility, simple syntax, and readability. Moreover, for machine learning and deep learning applications, Python remains the top choice.

    • +
    • R: Best suited for advanced statistical analysis, such as mixed linear modeling, factor analysis, mediation analysis, and Bayesian statistics. In addition, while I do not use it as often as I should, ggplot2 can produce exceptionally high-quality graphs—so be sure to consider this in the future!

    • +
    +
    +

    Looking Ahead#

    +

    There are certain things we have not covered in this course that a Data Scientist should likely master in the future. Here are some examples:

    +
    +

    Visualization#

    +

    Clear and effective visualization is crucial for communicating with data.

    +

    Here are a few examples:

    +
      +
    • Python: matplotlib, seaborn

    • +
    • R: ggplot2

    • +
    • Cross-platform: plotly, shiny

    • +
    +

    Matplotlib

    +
    +
    +
    import matplotlib.pyplot as plt
    +plt.style.use('classic')
    +%matplotlib inline
    +import numpy as np
    +import pandas as pd
    +
    +
    +
    +
    +
    +
    +
    # Create some data
    +rng = np.random.RandomState(0)              # creates a random number generator seeded with 0
    +x = np.linspace(0, 10, 500)                 # creates 500 evenly spaced numbers over the interval [0, 10]
    +y = np.cumsum(rng.randn(500, 6), 0)         # cumulative sum (down the rows) of standard-normal random numbers
    +
    +
    +
    +
    +
    +
    +
    # Plot the data with Matplotlib defaults
    +plt.plot(x, y)
    +plt.legend('ABCDEF', ncol=3, loc='upper left');
    +
    +
    +
    +
    +../_images/49ed94f15827ce8ac14e383ab6e9d4653c651f6dfb2edba2708c8d0fc1abc493.png +
    +
    +

    Seaborn

    +
    +
    +
    import seaborn as sns
    +
    +
    +
    +
    +
    +
    +
    xy_df = pd.concat([pd.DataFrame({"x": x}), 
    +                pd.DataFrame(y, columns=["A", "B", "C", "D", "E", "F"])], axis=1)
    +xy_df = pd.melt(xy_df, id_vars=["x"], var_name="group", value_name="y")
    +xy_df
    +
    +
    +
    +
    +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    xgroupy
    00.00000A1.764052
    10.02004A2.714141
    20.04008A3.475178
    30.06012A3.788246
    40.08016A6.058001
    ............
    29959.91984F-10.465950
    29969.93988F-9.613061
    29979.95992F-9.165378
    29989.97996F-9.004272
    299910.00000F-9.084870
    +

    3000 rows × 3 columns

    +
    +
    +
    +
    +
    sns.set(style="whitegrid")
    +sns.lineplot(x="x", y="y", hue="group", data=xy_df)
    +plt.legend(ncol=3, loc='upper left')
    +
    +
    +
    +
    +
    <matplotlib.legend.Legend at 0x7fb2609bdb90>
    +
    +
    +../_images/8393299fa27a42e0b7e0f92dbf4b389608087a22bb19b8d8552621c98c9857b5.png +
    +
    +
    +
    +

    Analysis#

    +

    Here are a few examples of basic libraries for data analysis in Python and R, with a bit of bias towards the former:

    +
      +
    • Statistics: scipy (Python), statsmodels (Python), Base R, lme4 (R), blme (R).

    • +
    • Machine learning: scikit-learn (Python), caret (R), xgboost (cross-platform).

    • +
    • Deep learning: keras (Python), pytorch (Python), tensorflow (Python).

    • +
    +

    scikit-learn

    +
    +
    +
    from sklearn.tree import DecisionTreeClassifier
    +from sklearn.model_selection import cross_val_score
    +from sklearn.datasets import load_iris
    +
    +X, y = load_iris()["data"], load_iris()["target"]
    +clf = DecisionTreeClassifier()
    +
    +res = cross_val_score(clf, X, y, cv=5)
    +
    +print("the average accuracy in classifying the types of Iris using a decision tree and cross-validation is:", 
    +      res.mean())
    +
    +
    +
    +
    +
    the average accuracy in classifying the types of Iris using a decision tree and cross-validation is: 0.9600000000000002
    +
    +
    +
    +
    +

    scipy

    +
    +
    +
    # A two-sample t-test, adapted from https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.ttest_ind.html
    +import numpy as np
    +from scipy import stats
    +rng = np.random.RandomState(1234)
    +
    +rvs1 = stats.norm.rvs(loc=5, scale=5, size=500, random_state=rng)
    +rvs2 = stats.norm.rvs(loc=5.57, scale=5, size=500, random_state=rng)
    +stats.ttest_ind(rvs1, rvs2)
    +
    +
    +
    +
    +
    TtestResult(statistic=-2.0456709273958644, pvalue=0.04105049135941344, df=998.0)
    +
    +
    +
    +
    +

    statsmodels

    +
    +
    +
    # The same as above, but using a linear regression model 
    +# (tip for life: almost any basic statistical test is just a particular instance of a linear regression model).
    +
    +import statsmodels.api as sm
    +
    +y=np.concatenate((rvs1, rvs2))
    +X=np.column_stack(([1]*len(y), 
    +                   [0]*len(rvs1) + [1]*len(rvs2)))
    +model = sm.OLS(endog=y, exog=X)
    +res = model.fit()
    +print(res.summary())
    +
    +
    +
    +
    +
                                OLS Regression Results                            
    +==============================================================================
    +Dep. Variable:                      y   R-squared:                       0.004
    +Model:                            OLS   Adj. R-squared:                  0.003
    +Method:                 Least Squares   F-statistic:                     4.185
    +Date:                Wed, 04 Dec 2024   Prob (F-statistic):             0.0411
    +Time:                        21:16:01   Log-Likelihood:                -3001.1
    +No. Observations:                1000   AIC:                             6006.
    +Df Residuals:                     998   BIC:                             6016.
    +Df Model:                           1                                         
    +Covariance Type:            nonrobust                                         
    +==============================================================================
    +                 coef    std err          t      P>|t|      [0.025      0.975]
    +------------------------------------------------------------------------------
    +const          5.0487      0.218     23.180      0.000       4.621       5.476
    +x1             0.6301      0.308      2.046      0.041       0.026       1.235
    +==============================================================================
    +Omnibus:                        0.183   Durbin-Watson:                   2.112
    +Prob(Omnibus):                  0.913   Jarque-Bera (JB):                0.129
    +Skew:                          -0.024   Prob(JB):                        0.938
    +Kurtosis:                       3.027   Cond. No.                         2.62
    +==============================================================================
    +
    +Notes:
    +[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
    +
    +
    +
    +
    +
    +
    +

    Command-Line Terminal Programming#

    +
      +
    • Programming that takes place in a terminal, which is a text-based interface for interacting directly with the computer.

    • +
    • Commands in a terminal are interpreted by a shell. Common shells include Bash (popular on Linux and macOS), Zsh (modern and customizable), and PowerShell (Windows-specific).

    • +
    • Essential for managing files, running scripts, and interacting with compute clusters (e.g. SLURM).

    • +
    +

    In Jupyter notebooks, you can execute shell commands by prefixing them with !.

    +

    For example, we can navigate directories:

    +
    +
    +
    # Print the current directory
    +!pwd
    +
    +# List files in the directory
    +!ls
    +
    +
    +
    +
    +
    /home/javi/Documentos/docencia/DS-1002/DS1002-book/chapters
    +
    +
    +
    01-getting_started.md	module-1  module-4   wrap-up.ipynb
    +02-python-basics.ipynb	module-2  module-5
    +04-python-basics.ipynb	module-3  my_folder
    +
    +
    +
    +
    +

    We can also manage files and directories:

    +
    +
    +
    # Create a new folder and file
    +!mkdir -p my_folder # make new dir; -p option to not raise an error if it already existed
    +!rm -f my_folder/* # Remove preexisting content; -f option to not raise an error if the folder was already empty
    +!touch my_folder/hello_world.py # Create a new file named "hello_world.py"
    +!echo "print('Hello, World!\nCode run from:', __file__)" > my_folder/hello_world.py # Add a line of code to this file
    +
    +
    +
    +
    +
    +
    +
    # List contents of the folder
    +!ls my_folder
    +
    +
    +
    +
    +
    hello_world.py
    +
    +
    +
    +
    +

    And run scripts:

    +
    +
    +
    !python my_folder/hello_world.py
    +
    +
    +
    +
    +
    Hello, World!
    +Code run from: /home/javi/Documentos/docencia/DS-1002/DS1002-book/chapters/my_folder/hello_world.py
    +
    +
    +
    +
    +
    +
    +

    GitHub#

    +
      +
    • Web-based platform for version control and collaboration built on top of Git, a version control system.

    • +
    • It also offers powerful terminal tools for easily interacting with and changing your repositories.

    • +
    • Allows you to track changes, collaborate with others, and share your work.

    • +
    +

    Common Use Cases:

    +
      +
    • Code Management: Store and version codebases for projects/libraries.

    • +
    • Team Collaboration: Coordinate team efforts on software development or data science projects.

    • +
    • Portfolio Hosting: Showcase projects and skills for personal branding.

    • +
    • Open Source Contribution: Contribute to or learn from public repositories.

    • +
    • Documentation: Use GitHub Pages to create project websites or host documentation.

    • +
    +

    A few personal examples:

    + +
    +
    +
    + + + + +
    + + + + + + +
    + +
    +
    +
    + +
    + + + + + + +
    +
    + + +
    + + +
    +
    +
    + + + + + +
    +
    + + \ No newline at end of file diff --git a/epilogue.html b/epilogue.html new file mode 100644 index 0000000..b868941 --- /dev/null +++ b/epilogue.html @@ -0,0 +1,887 @@ + + + + + + + + + + + Wrapping up — DS-1002 Programming for Data Science + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    + + + + + + + + + + +
    +
    +
    +
    +
    + + + + +
    +
    + + + +
    + + + + + + + + + + + + + +
    + +
    + + + +
    + +
    +
    + +
    +
    + +
    + +
    + +
    + + +
    + +
    + +
    + + + + + + + + + + + + + + + + + + + +
    + +
    + +
    +
    + + + +
    +

    Wrapping up

    + +
    + +
    +
    + + + + +
    + +
    +

    Wrapping up#

    +

    Throughout this course, we have covered the most important foundational programming skills a future Data Scientist needs, with a particular emphasis on Python.

    +

    For both languages, we explored their syntax, different data types, and how to work with data structures. We also delved into implementing loops, functions, and even classes (which is uncommon in beginner programming courses). Additionally, we discussed basic data science operations in both languages, particularly focusing on how to inspect and interact with raw data.

    +

    Now, coming to the question of Python vs. R: which one should you choose? It’s entirely up to you—both are excellent tools, as we have emphasized throughout the course. Keep in mind that you can essentially achieve the same results in one language as in the other. For example, when it comes to data manipulation, see this comparison: Python vs R.

    +

    Here is my personal perspective though:

    +
      +
    • Python: Ideal for programmatic scenarios such as developing complex libraries, thanks to its versatility, simple syntax, and readability. Moreover, for machine learning and deep learning applications, Python remains the top choice.

    • +
    • R: Best suited for advanced statistical analysis, such as mixed linear modeling, factor analysis, mediation analysis, and Bayesian statistics. In addition, while I do not use it as often as I should, ggplot2 can produce exceptionally high-quality graphs—so be sure to consider this in the future!

    • +
    +
    +

    Looking Ahead#

    +

    There are certain things we have not covered in this course that a Data Scientist should likely master in the future. Here are some examples:

    +
    +

    Visualization#

    +

    Clear and effective visualization is crucial for communicating with data.

    +

    Here are a few examples:

    +
      +
    • Python: matplotlib, seaborn

    • +
    • R: ggplot2

    • +
    • Cross-platform: plotly, shiny

    • +
    +

    Matplotlib

    +
    +
    +
    import matplotlib.pyplot as plt
    +plt.style.use('classic')
    +%matplotlib inline
    +import numpy as np
    +import pandas as pd
    +
    +
    +
    +
    +
    +
    +
    # Create some data
    +rng = np.random.RandomState(0)              # creates a random number generator seeded with 0
    +x = np.linspace(0, 10, 500)                 # creates 500 evenly spaced numbers over the interval [0, 10]
    +y = np.cumsum(rng.randn(500, 6), 0)         # cumulative sum (down the rows) of standard-normal random numbers
    +
    +
    +
    +
    +
    +
    +
    # Plot the data with Matplotlib defaults
    +plt.plot(x, y)
    +plt.legend('ABCDEF', ncol=3, loc='upper left');
    +
    +
    +
    +
    +_images/49ed94f15827ce8ac14e383ab6e9d4653c651f6dfb2edba2708c8d0fc1abc493.png +
    +
    +

    Seaborn

    +
    +
    +
    import seaborn as sns
    +
    +
    +
    +
    +
    +
    +
    xy_df = pd.concat([pd.DataFrame({"x": x}), 
    +                pd.DataFrame(y, columns=["A", "B", "C", "D", "E", "F"])], axis=1)
    +xy_df = pd.melt(xy_df, id_vars=["x"], var_name="group", value_name="y")
    +xy_df
    +
    +
    +
    +
    +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    xgroupy
    00.00000A1.764052
    10.02004A2.714141
    20.04008A3.475178
    30.06012A3.788246
    40.08016A6.058001
    ............
    29959.91984F-10.465950
    29969.93988F-9.613061
    29979.95992F-9.165378
    29989.97996F-9.004272
    299910.00000F-9.084870
    +

    3000 rows × 3 columns

    +
    +
    +
    +
    +
    sns.set(style="whitegrid")
    +sns.lineplot(x="x", y="y", hue="group", data=xy_df)
    +plt.legend(ncol=3, loc='upper left')
    +
    +
    +
    +
    +
    <matplotlib.legend.Legend at 0x7fcffd3316d0>
    +
    +
    +_images/8393299fa27a42e0b7e0f92dbf4b389608087a22bb19b8d8552621c98c9857b5.png +
    +
    +
    +
    +

    Analysis#

    +

    Here are a few examples of basic libraries for data analysis in Python and R, with a bit of bias towards the former:

    +
      +
    • Statistics: scipy (Python), statsmodels (Python), Base R, lme4 (R), blme (R).

    • +
    • Machine learning: scikit-learn (Python), caret (R), xgboost (cross-platform).

    • +
    • Deep learning: keras (Python), pytorch (Python), tensorflow (Python).

    • +
    +

    scikit-learn

    +
    +
    +
    from sklearn.tree import DecisionTreeClassifier
    +from sklearn.model_selection import cross_val_score
    +from sklearn.datasets import load_iris
    +
    +X, y = load_iris()["data"], load_iris()["target"]
    +clf = DecisionTreeClassifier()
    +
    +res = cross_val_score(clf, X, y, cv=5)
    +
    +print("the average accuracy in classifying the types of Iris using a decision tree and cross-validation is:", 
    +      res.mean())
    +
    +
    +
    +
    +
    the average accuracy in classifying the types of Iris using a decision tree and cross-validation is: 0.9666666666666668
    +
    +
    +
    +
    +

    scipy

    +
    +
    +
    # A two-sample t-test, adapted from https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.ttest_ind.html
    +import numpy as np
    +from scipy import stats
    +rng = np.random.RandomState(1234)
    +
    +rvs1 = stats.norm.rvs(loc=5, scale=5, size=500, random_state=rng)
    +rvs2 = stats.norm.rvs(loc=5.57, scale=5, size=500, random_state=rng)
    +stats.ttest_ind(rvs1, rvs2)
    +
    +
    +
    +
    +
    TtestResult(statistic=-2.0456709273958644, pvalue=0.04105049135941344, df=998.0)
    +
    +
    +
    +
    +

    statsmodels

    +
    +
    +
    # The same as above, but using a linear regression model 
    +# (tip for life: almost any basic statistical test is just a particular instance of a linear regression model).
    +
    +import statsmodels.api as sm
    +
    +y=np.concatenate((rvs1, rvs2))
    +X=np.column_stack(([1]*len(y), 
    +                   [0]*len(rvs1) + [1]*len(rvs2)))
    +model = sm.OLS(endog=y, exog=X)
    +res = model.fit()
    +print(res.summary())
    +
    +
    +
    +
    +
                                OLS Regression Results                            
    +==============================================================================
    +Dep. Variable:                      y   R-squared:                       0.004
    +Model:                            OLS   Adj. R-squared:                  0.003
    +Method:                 Least Squares   F-statistic:                     4.185
    +Date:                Wed, 04 Dec 2024   Prob (F-statistic):             0.0411
    +Time:                        21:16:04   Log-Likelihood:                -3001.1
    +No. Observations:                1000   AIC:                             6006.
    +Df Residuals:                     998   BIC:                             6016.
    +Df Model:                           1                                         
    +Covariance Type:            nonrobust                                         
    +==============================================================================
    +                 coef    std err          t      P>|t|      [0.025      0.975]
    +------------------------------------------------------------------------------
    +const          5.0487      0.218     23.180      0.000       4.621       5.476
    +x1             0.6301      0.308      2.046      0.041       0.026       1.235
    +==============================================================================
    +Omnibus:                        0.183   Durbin-Watson:                   2.112
    +Prob(Omnibus):                  0.913   Jarque-Bera (JB):                0.129
    +Skew:                          -0.024   Prob(JB):                        0.938
    +Kurtosis:                       3.027   Cond. No.                         2.62
    +==============================================================================
    +
    +Notes:
    +[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
    +
    +
    +
    +
    +
    +
    +

    Command-Line Terminal Programming#

    +
      +
    • Programming that takes place in a terminal, which is a text-based interface for interacting directly with the computer.

    • +
    • Commands in a terminal are interpreted by a shell. Common shells include Bash (popular on Linux and macOS), Zsh (modern and customizable), and PowerShell (Windows-specific).

    • +
    • Essential for managing files, running scripts, and interacting with compute clusters (e.g. SLURM).

    • +
    +

    In Jupyter notebooks, you can execute shell commands by prefixing them with !.

    +

    For example, we can navigate directories:

    +
    +
    +
    # Print the current directory
    +!pwd
    +
    +# List files in the directory
    +!ls
    +
    +
    +
    +
    +
    /home/javi/Documentos/docencia/DS-1002/DS1002-book
    +
    +
    +
    06_numpy_intro.ipynb  _build	_config.yml  epilogue.ipynb  index.md
    +admonition	      chapters	data	     imgs	     _toc.yml
    +
    +
    +
    +
    +

    We can also manage files and directories:

    +
    +
    +
    # Create a new folder and file
    +!mkdir -p my_folder # make new dir; -p option to not raise an error if it already existed
    +!rm -f my_folder/* # Remove preexisting content; -f option to not raise an error if the folder was already empty
    +!touch my_folder/hello_world.py # Create a new file named "hello_world.py"
    +!echo "print('Hello, World!\nCode run from:', __file__)" > my_folder/hello_world.py # Add a line of code to this file
    +
    +
    +
    +
    +
    +
    +
    # List contents of the folder
    +!ls my_folder
    +
    +
    +
    +
    +
    hello_world.py
    +
    +
    +
    +
    +

    And run scripts:

    +
    +
    +
    !python my_folder/hello_world.py
    +
    +
    +
    +
    +
    Hello, World!
    +Code run from: /home/javi/Documentos/docencia/DS-1002/DS1002-book/my_folder/hello_world.py
    +
    +
    +
    +
    +
    +
    +

    GitHub#

    +
      +
    • Web-based platform for version control and collaboration built on top of Git, a version control system.

    • +
    • It also offers powerful terminal tools for easily interacting with and changing your repositories.

    • +
    • Allows you to track changes, collaborate with others, and share your work.

    • +
    +

    Common Use Cases:

    +
      +
    • Code Management: Store and version codebases for projects/libraries.

    • +
    • Team Collaboration: Coordinate team efforts on software development or data science projects.

    • +
    • Portfolio Hosting: Showcase projects and skills for personal branding.

    • +
    • Open Source Contribution: Contribute to or learn from public repositories.

    • +
    • Documentation: Use GitHub Pages to create project websites or host documentation.

    • +
    +

    A few personal examples:

    + +
    +
    +
    + + + + +
    + + + + + + + + +
    + + + + + + +
    +
    + + +
    + + +
    +
    +
    + + + + + +
    +
    + + \ No newline at end of file diff --git a/genindex.html b/genindex.html index 5491a8b..9a2bdfb 100644 --- a/genindex.html +++ b/genindex.html @@ -220,6 +220,10 @@
  • Control Structures
  • Functions
  • R for data science: Tidyverse
  • + +

    Epilogue

    +
    diff --git a/index.html b/index.html index 9787bf9..6812427 100644 --- a/index.html +++ b/index.html @@ -224,6 +224,10 @@
  • Control Structures
  • Functions
  • R for data science: Tidyverse
  • + +

    Epilogue

    +
    @@ -396,6 +400,8 @@

    Welcome to DS-1002 +
    +