From c0d6818367ed114b8ba18855f2004c3179b0006b Mon Sep 17 00:00:00 2001 From: Maksym Zhytnikov <63515947+Maxxx-zh@users.noreply.github.com> Date: Mon, 18 Mar 2024 12:50:48 +0200 Subject: [PATCH] [FSTORE-1207] AirQuality LLM project (#250) * Function Calling & AirQuality FunctionCalling Chatbot --- .../1_air_quality_feature_backfill.ipynb | 976 +++++++++++++----- .../2_air_quality_feature_pipeline.ipynb | 942 +++++++++++++++-- .../3_air_quality_training_pipeline.ipynb | 632 +++++++++--- .../4_air_quality_batch_inference.ipynb | 402 ++++++-- .../air_quality/5_function_calling.ipynb | 776 ++++++++++++++ advanced_tutorials/air_quality/app_gradio.py | 104 ++ .../air_quality/app_streamlit.py | 99 ++ .../air_quality/feature_pipeline.py | 158 --- .../air_quality/features/__init__.py | 0 .../air_quality/features/air_quality.py | 6 +- advanced_tutorials/air_quality/functions.py | 392 ------- .../functions/air_quality_data_retrieval.py | 164 +++ .../air_quality/functions/common_functions.py | 25 + .../functions/context_engineering.py | 191 ++++ .../air_quality/functions/llm_chain.py | 202 ++++ .../functions/parse_air_quality.py | 79 ++ .../air_quality/functions/parse_weather.py | 81 ++ .../air_quality/requirements.txt | 19 +- 18 files changed, 4125 insertions(+), 1123 deletions(-) create mode 100644 advanced_tutorials/air_quality/5_function_calling.ipynb create mode 100644 advanced_tutorials/air_quality/app_gradio.py create mode 100644 advanced_tutorials/air_quality/app_streamlit.py delete mode 100644 advanced_tutorials/air_quality/feature_pipeline.py delete mode 100644 advanced_tutorials/air_quality/features/__init__.py delete mode 100644 advanced_tutorials/air_quality/functions.py create mode 100644 advanced_tutorials/air_quality/functions/air_quality_data_retrieval.py create mode 100644 advanced_tutorials/air_quality/functions/common_functions.py create mode 100644 advanced_tutorials/air_quality/functions/context_engineering.py create mode 100644 
advanced_tutorials/air_quality/functions/llm_chain.py create mode 100644 advanced_tutorials/air_quality/functions/parse_air_quality.py create mode 100644 advanced_tutorials/air_quality/functions/parse_weather.py diff --git a/advanced_tutorials/air_quality/1_air_quality_feature_backfill.ipynb b/advanced_tutorials/air_quality/1_air_quality_feature_backfill.ipynb index f203e073..1ce0301e 100644 --- a/advanced_tutorials/air_quality/1_air_quality_feature_backfill.ipynb +++ b/advanced_tutorials/air_quality/1_air_quality_feature_backfill.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "73ee3ec9", + "id": "32cd155d", "metadata": {}, "source": [ "# **Hopsworks Feature Store** \n", @@ -12,49 +12,61 @@ "**Note**: This tutorial does not support Google Colab.\n", "\n", "## ๐Ÿ—’๏ธ This notebook is divided into the following sections:\n", - "1. Fetch historical data\n", - "2. Connect to the Hopsworks feature store\n", - "3. Create feature groups and insert them to the feature store\n", + "\n", + "1. Fetch historical data.\n", + "2. Connect to the Hopsworks feature store.\n", + "3. Create feature groups and insert them to the feature store.\n", "\n", "![tutorial-flow](../../images/01_featuregroups.png)" ] }, { "cell_type": "markdown", - "id": "f04d5c5e", + "id": "ce71c0b2", "metadata": {}, "source": [ - "### ๐Ÿ“ Imports" + "## ๐Ÿ“ Imports" ] }, { "cell_type": "code", - "execution_count": null, - "id": "a03d0127", + "execution_count": 1, + "id": "f92001bd", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n", + "\u001b[0m\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. 
This behaviour is the source of the following dependency conflicts.\n", + "tensorflow 2.11.0 requires protobuf<3.20,>=3.9.2, but you have protobuf 4.25.3 which is incompatible.\n", + "tensorboard 2.11.2 requires protobuf<4,>=3.9.2, but you have protobuf 4.25.3 which is incompatible.\n", + "ray 2.0.0 requires protobuf<4.0.0,>=3.15.3, but you have protobuf 4.25.3 which is incompatible.\u001b[0m\u001b[31m\n", + "\u001b[0m\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n", + "\u001b[0m" + ] + } + ], "source": [ - "!pip install -U hopsworks --quiet\n", - "!pip install geopy folium streamlit-folium --q" + "!pip install -r requirements.txt --quiet\n", + "!pip install -U hopsworks --quiet" ] }, { "cell_type": "code", - "execution_count": null, - "id": "cd165941", + "execution_count": 2, + "id": "e974d9d5", "metadata": {}, "outputs": [], "source": [ - "import datetime\n", - "import time\n", - "import requests\n", "import json\n", "\n", "import pandas as pd\n", "import folium\n", "\n", "from features import air_quality\n", - "from functions import *\n", + "from functions.common_functions import convert_date_to_unix\n", "\n", "import warnings\n", "warnings.filterwarnings(\"ignore\")" @@ -62,15 +74,7 @@ }, { "cell_type": "markdown", - "id": "ba9903fc", - "metadata": {}, - "source": [ - "---" - ] - }, - { - "cell_type": "markdown", - "id": "b7a1965a-0da7-4263-a68a-8b2e8cb753f1", + "id": "88d519dd", "metadata": {}, "source": [ "## ๐ŸŒ Representing the Target cities " @@ -78,8 +82,8 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "bd578db1-69e7-4230-b3f2-807b8056283a", + "execution_count": 3, + "id": "e0f7a26b", "metadata": { "tags": [] }, @@ -95,8 +99,8 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "ea972c52-bfad-465d-b1e1-50eeff99b482", + 
"execution_count": 5, + "id": "f8063796", "metadata": {}, "outputs": [], "source": [ @@ -109,13 +113,13 @@ " location=coords,\n", " popup=city_name,\n", " ).add_to(my_map)\n", - "my_map" + "#my_map" ] }, { "cell_type": "code", "execution_count": null, - "id": "fb5ecf81-647b-490a-92b1-f7e963413710", + "id": "fcde29f7", "metadata": {}, "outputs": [], "source": [ @@ -125,7 +129,7 @@ }, { "cell_type": "markdown", - "id": "2246ca9d", + "id": "2a2d3674", "metadata": {}, "source": [ "## ๐ŸŒซ Processing Air Quality data" @@ -133,7 +137,7 @@ }, { "cell_type": "markdown", - "id": "b4a1c5d1", + "id": "b081d3f2", "metadata": {}, "source": [ "### [๐Ÿ‡ช๐Ÿ‡บ EEA](https://discomap.eea.europa.eu/map/fme/AirQualityExport.htm)\n", @@ -142,12 +146,39 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "96b8be01-6286-4886-8043-56e0e49b314e", + "execution_count": 6, + "id": "986686f5", "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "{'Amsterdam': [52.37, 4.89],\n", + " 'Athina': [37.98, 23.73],\n", + " 'Berlin': [52.52, 13.39],\n", + " 'Gdansk': [54.37, 18.61],\n", + " 'Krakรณw': [50.06, 19.94],\n", + " 'London': [51.51, -0.13],\n", + " 'Madrid': [40.42, -3.7],\n", + " 'Marseille': [43.3, 5.37],\n", + " 'Milano': [45.46, 9.19],\n", + " 'Mรผnchen': [48.14, 11.58],\n", + " 'Napoli': [40.84, 14.25],\n", + " 'Paris': [48.85, 2.35],\n", + " 'Sevilla': [37.39, -6.0],\n", + " 'Stockholm': [59.33, 18.07],\n", + " 'Tallinn': [59.44, 24.75],\n", + " 'Varna': [43.21, 27.92],\n", + " 'Wien': [48.21, 16.37]}" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# EU Cities \n", "target_cities[\"EU\"]" @@ -155,47 +186,96 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "5bb2a868-5f3a-4065-b651-318c24826b97", + "execution_count": 12, + "id": "be358330", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "โ›ณ๏ธ 
Size of this dataframe: (63548, 3)\n", + "โ›ณ๏ธ Missing Values: 0\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
city_namedatepm2_5
11887Gdansk2014-09-2123.0
17498Krakรณw2019-10-2356.0
42593Paris2016-08-047.0
\n", + "
" + ], + "text/plain": [ + " city_name date pm2_5\n", + "11887 Gdansk 2014-09-21 23.0\n", + "17498 Krakรณw 2019-10-23 56.0\n", + "42593 Paris 2016-08-04 7.0" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Read the CSV file from the specified URL into a pandas DataFrame\n", - "df_eu = pd.read_csv(\"https://repo.hops.works/dev/davit/air_quality/backfill_pm2_5_eu.csv\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5620df22-f744-4550-a81a-7e5d71aae542", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# Check for missing values in the 'df_eu' DataFrame\n", - "df_eu.isna().sum().sum()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b0e23728-a01d-45bc-bf25-4a9c77f21d66", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ + "df_eu = pd.read_csv(\"https://repo.hops.works/dev/davit/air_quality/backfill_pm2_5_eu.csv\")\n", + "\n", "# Print the size of the 'df_eu' DataFrame (number of rows and columns)\n", "print(\"โ›ณ๏ธ Size of this dataframe:\", df_eu.shape)\n", "\n", + "# Check for missing values in the 'df_eu' DataFrame\n", + "print(f'โ›ณ๏ธ Missing Values: {df_eu.isna().sum().sum()}')\n", + "\n", "# Display a random sample of three rows from the 'df_eu' DataFrame\n", "df_eu.sample(3)" ] }, { "cell_type": "markdown", - "id": "c2e45567-dd6b-4e5e-a153-82a2f4f32fbc", + "id": "f02141bd", "metadata": {}, "source": [ "### [๐Ÿ‡บ๐Ÿ‡ธ USEPA](https://aqs.epa.gov/aqsweb/documents/data_api.html#daily)\n", @@ -206,12 +286,35 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "c4952759-0fb9-4229-8b78-2e37cffb144d", + "execution_count": 13, + "id": "87c439b7", "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "{'Albuquerque': [35.08, -106.65],\n", + " 'Atlanta': [33.75, -84.39],\n", + " 'Chicago': [41.88, -87.62],\n", + " 'Columbus': [39.96, -83.0],\n", + " 'Dallas': [32.78, 
-96.8],\n", + " 'Denver': [39.74, -104.98],\n", + " 'Houston': [29.76, -95.37],\n", + " 'Los Angeles': [34.05, -118.24],\n", + " 'New York': [40.71, -74.01],\n", + " 'Phoenix-Mesa': [33.66, -112.04],\n", + " 'Salt Lake City': [40.76, -111.89],\n", + " 'San Francisco': [37.78, -122.42],\n", + " 'Tampa': [27.95, -82.46]}" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# US Cities \n", "target_cities[\"US\"]" @@ -219,49 +322,98 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "c6aceaee-9431-48fd-818a-41fbdd07575c", + "execution_count": 16, + "id": "3429aebd", "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "โ›ณ๏ธ Size of this dataframe: (46037, 3)\n", + "โ›ณ๏ธ Missing Values: 0\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
datecity_namepm2_5
214762015-01-14Houston11.3
263212018-11-28Los Angeles7.8
430022014-09-01Tampa11.8
\n", + "
" + ], + "text/plain": [ + " date city_name pm2_5\n", + "21476 2015-01-14 Houston 11.3\n", + "26321 2018-11-28 Los Angeles 7.8\n", + "43002 2014-09-01 Tampa 11.8" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Read the CSV file from the specified URL into a pandas DataFrame\n", - "df_us = pd.read_csv(\"https://repo.hops.works/dev/davit/air_quality/backfill_pm2_5_us.csv\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4e7ff20e-8a1a-4fa3-b801-71beead7b5f2", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# Check for missing values in the 'df_us' DataFrame\n", - "df_us.isna().sum().sum()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3818e3e1-8674-4634-9023-92be8410fba5", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ + "df_us = pd.read_csv(\"https://repo.hops.works/dev/davit/air_quality/backfill_pm2_5_us.csv\")\n", + "\n", "# Print the size of the 'df_us' DataFrame (number of rows and columns)\n", "print(\"โ›ณ๏ธ Size of this dataframe:\", df_us.shape)\n", "\n", + "# Check for missing values in the 'df_us' DataFrame\n", + "print(f'โ›ณ๏ธ Missing Values: {df_us.isna().sum().sum()}')\n", + "\n", "# Display a random sample of three rows from the 'df_us' DataFrame\n", "df_us.sample(3)" ] }, { "cell_type": "markdown", - "id": "25557752-31c8-4da9-a52c-4415c4d20ae3", + "id": "5ee7b660", "metadata": {}, "source": [ "### ๐Ÿข Processing special city - `Seattle`\n", @@ -271,72 +423,135 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "2f54d2cb-991c-47cb-a686-76c9f7a87170", + "execution_count": 15, + "id": "f401130e", "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "{'Bellevue-SE 12th St': [47.60086, -122.1484],\n", + " 'DARRINGTON - FIR ST (Darrington High School)': [48.2469, -121.6031],\n", + " 'KENT - JAMES & CENTRAL': [47.38611, -122.23028],\n", + " 'LAKE 
FOREST PARK TOWNE CENTER': [47.755, -122.2806],\n", + " 'MARYSVILLE - 7TH AVE (Marysville Junior High)': [48.05432, -122.17153],\n", + " 'NORTH BEND - NORTH BEND WAY': [47.49022, -121.77278],\n", + " 'SEATTLE - BEACON HILL': [47.56824, -122.30863],\n", + " 'SEATTLE - DUWAMISH': [47.55975, -122.33827],\n", + " 'SEATTLE - SOUTH PARK #2': [47.53091, -122.3208],\n", + " 'Seattle-10th & Weller': [47.59722, -122.31972],\n", + " 'TACOMA - ALEXANDER AVE': [47.2656, -122.3858],\n", + " 'TACOMA - L STREET': [47.1864, -122.4517],\n", + " 'Tacoma-S 36th St': [47.22634, -122.46256],\n", + " 'Tukwila Allentown': [47.49854, -122.27839],\n", + " 'Tulalip-Totem Beach Rd': [48.06534, -122.28519]}" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "target_cities[\"Seattle\"]" ] }, { "cell_type": "code", - "execution_count": null, - "id": "31c8505d-68bc-40b6-be0f-42d8532dbd48", + "execution_count": 17, + "id": "5ac26217", "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "โ›ณ๏ธ Size of this dataframe: (46479, 3)\n", + "โ›ณ๏ธ Missing Values: 0\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
city_namedatepm2_5
8709SEATTLE - BEACON HILL2015-11-059.5
6634DARRINGTON - FIR ST (Darrington High School)2014-06-241.7
45134NORTH BEND - NORTH BEND WAY2023-01-120.3
\n", + "
" + ], + "text/plain": [ + " city_name date pm2_5\n", + "8709 SEATTLE - BEACON HILL 2015-11-05 9.5\n", + "6634 DARRINGTON - FIR ST (Darrington High School) 2014-06-24 1.7\n", + "45134 NORTH BEND - NORTH BEND WAY 2023-01-12 0.3" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Read the CSV file from the specified URL into a pandas DataFrame\n", - "df_seattle = pd.read_csv(\"https://repo.hops.works/dev/davit/air_quality/backfill_pm2_5_seattle.csv\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2f6583c9-3b2a-41c6-a020-aeede88c4867", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# Check for missing values in the 'df_seattle' DataFrame\n", - "df_seattle.isna().sum().sum()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "065a5b03-28f7-475c-9c6a-4340388157d8", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ + "df_seattle = pd.read_csv(\"https://repo.hops.works/dev/davit/air_quality/backfill_pm2_5_seattle.csv\")\n", + "\n", "# Print the size of the 'df_seattle' DataFrame (number of rows and columns)\n", "print(\"โ›ณ๏ธ Size of this dataframe:\", df_seattle.shape)\n", + "\n", + "# Check for missing values in the 'df_seattle' DataFrame\n", + "print(f'โ›ณ๏ธ Missing Values: {df_seattle.isna().sum().sum()}')\n", + "\n", + "# Display a random sample of three rows\n", "df_seattle.sample(3)" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "e3b17ca4-0e9d-4207-ad62-90ea9c157def", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# Value Counts\n", - "df_seattle['city_name'].value_counts()" - ] - }, { "cell_type": "markdown", - "id": "c278a55d-f083-4f95-b292-92e545b9c408", + "id": "e23a6e68", "metadata": {}, "source": [ "### ๐ŸŒŸ All together" @@ -344,12 +559,94 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "0d55ae92-4bf9-43ae-8841-6767f5f68bec", + "execution_count": 19, + "id": 
"d913087f", "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "โ›ณ๏ธ DF shape: (156064, 3)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
city_namedatepm2_5
106487Tampa2014-06-309.3
12453Gdansk2016-04-099.0
101342Salt Lake City2020-04-295.8
46538Sevilla2017-02-128.0
117821Seattle-10th & Weller2015-12-125.7
\n", + "
" + ], + "text/plain": [ + " city_name date pm2_5\n", + "106487 Tampa 2014-06-30 9.3\n", + "12453 Gdansk 2016-04-09 9.0\n", + "101342 Salt Lake City 2020-04-29 5.8\n", + "46538 Sevilla 2017-02-12 8.0\n", + "117821 Seattle-10th & Weller 2015-12-12 5.7" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Concatenate the DataFrames df_eu, df_us, and df_seattle along the rows and reset the index\n", "df_air_quality = pd.concat(\n", @@ -365,18 +662,18 @@ }, { "cell_type": "markdown", - "id": "22896049-441d-4baf-b717-415123cb39d7", + "id": "268791c4", "metadata": { "tags": [] }, "source": [ - "### ๐Ÿ›  Feature Engineering" + "## ๐Ÿ›  Feature Engineering" ] }, { "cell_type": "code", - "execution_count": null, - "id": "140b468a-e0c2-44a1-8e44-4cf393407eca", + "execution_count": 20, + "id": "aff7a97b", "metadata": { "tags": [] }, @@ -388,12 +685,23 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "87dc89c0-72a7-4be6-b4e4-03d5d32be546", + "execution_count": 21, + "id": "1d45e480", "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "0" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Apply feature engineering to the df_air_quality DataFrame using the air_quality.feature_engineer_aq() function\n", "df_air_quality = air_quality.feature_engineer_aq(df_air_quality)\n", @@ -407,12 +715,23 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "94f67c89-6b39-4748-b4be-6ed3c9d57f96", + "execution_count": 22, + "id": "02c8e1e5", "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "(154533, 31)" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Print the shape (number of rows and columns) of the df_air_quality DataFrame\n", "df_air_quality.shape" @@ -420,12 +739,32 @@ }, { "cell_type": "code", - 
"execution_count": null, - "id": "ed9bc7f1-d62e-4b1f-97af-6ecd30fe4b67", + "execution_count": 23, + "id": "4c627429", "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['city_name', 'date', 'pm2_5', 'pm_2_5_previous_1_day',\n", + " 'pm_2_5_previous_2_day', 'pm_2_5_previous_3_day',\n", + " 'pm_2_5_previous_4_day', 'pm_2_5_previous_5_day',\n", + " 'pm_2_5_previous_6_day', 'pm_2_5_previous_7_day', 'mean_7_days',\n", + " 'mean_14_days', 'mean_28_days', 'std_7_days', 'exp_mean_7_days',\n", + " 'exp_std_7_days', 'std_14_days', 'exp_mean_14_days', 'exp_std_14_days',\n", + " 'std_28_days', 'exp_mean_28_days', 'exp_std_28_days', 'year',\n", + " 'day_of_month', 'month', 'day_of_week', 'is_weekend', 'sin_day_of_year',\n", + " 'cos_day_of_year', 'sin_day_of_week', 'cos_day_of_week'],\n", + " dtype='object')" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Retrieve and display the column names of the df_air_quality DataFrame\n", "df_air_quality.columns" @@ -433,15 +772,7 @@ }, { "cell_type": "markdown", - "id": "88a9e0ef-e9d2-4e3c-91af-c4e619b8c906", - "metadata": {}, - "source": [ - "---" - ] - }, - { - "cell_type": "markdown", - "id": "4687e802", + "id": "4296b629", "metadata": { "tags": [] }, @@ -451,42 +782,124 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "c46283b4", + "execution_count": 27, + "id": "52e4eb11", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
city_namedatetemperature_maxtemperature_minprecipitation_sumrain_sumsnowfall_sumprecipitation_hourswind_speed_maxwind_gusts_maxwind_direction_dominant
0Amsterdam2013-01-019.25.510.210.20.014.032.062.6255
1Amsterdam2013-01-027.85.60.50.50.02.022.939.6251
2Amsterdam2013-01-0310.38.22.02.00.06.022.239.2255
\n", + "
" + ], + "text/plain": [ + " city_name date temperature_max temperature_min precipitation_sum \\\n", + "0 Amsterdam 2013-01-01 9.2 5.5 10.2 \n", + "1 Amsterdam 2013-01-02 7.8 5.6 0.5 \n", + "2 Amsterdam 2013-01-03 10.3 8.2 2.0 \n", + "\n", + " rain_sum snowfall_sum precipitation_hours wind_speed_max \\\n", + "0 10.2 0.0 14.0 32.0 \n", + "1 0.5 0.0 2.0 22.9 \n", + "2 2.0 0.0 6.0 22.2 \n", + "\n", + " wind_gusts_max wind_direction_dominant \n", + "0 62.6 255 \n", + "1 39.6 251 \n", + "2 39.2 255 " + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Read the CSV file from the specified URL into a pandas DataFrame for weather data\n", - "df_weather = pd.read_csv(\"https://repo.hops.works/dev/davit/air_quality/backfill_weather.csv\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1921b61c-d002-417e-88a6-9fe1cad0a7d4", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# Count the occurrences of each unique value in the 'city_name' column of the df_weather DataFrame\n", - "df_weather.city_name.value_counts()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8d5dcd0a", - "metadata": {}, - "outputs": [], - "source": [ + "df_weather = pd.read_csv(\"https://repo.hops.works/dev/davit/air_quality/backfill_weather.csv\")\n", + "\n", "# Display the first three rows of the df_weather DataFrame\n", "df_weather.head(3)" ] }, { "cell_type": "markdown", - "id": "cc9b7ad6", + "id": "cd0d4d7b", "metadata": {}, "source": [ "---" @@ -494,8 +907,8 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "a8f886c3-a5ac-4370-a6a2-22838ab7409e", + "execution_count": 28, + "id": "ec1e91c1", "metadata": { "tags": [] }, @@ -516,26 +929,29 @@ }, { "cell_type": "markdown", - "id": "f2ebd846-0420-4e4c-8a5b-0827fa91c693", + "id": "472f7eb5", "metadata": {}, "source": [ - "---" - ] - }, - { - "cell_type": "markdown", - "id": "cb6f83ba", - "metadata": {}, - "source": [ - 
"### ๐Ÿ”ฎ Connecting to Hopsworks Feature Store " + "## ๐Ÿ”ฎ Connecting to Hopsworks Feature Store " ] }, { "cell_type": "code", - "execution_count": null, - "id": "dd068240", + "execution_count": 29, + "id": "410f0b7b", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Connected. Call `.close()` to terminate connection gracefully.\n", + "\n", + "Logged in to project, explore it here https://snurran.hops.works/p/5242\n", + "Connected. Call `.close()` to terminate connection gracefully.\n" + ] + } + ], "source": [ "import hopsworks\n", "\n", @@ -546,7 +962,7 @@ }, { "cell_type": "markdown", - "id": "63d8c3b9", + "id": "6176991c", "metadata": {}, "source": [ "## ๐Ÿช„ Creating Feature Groups" @@ -554,7 +970,7 @@ }, { "cell_type": "markdown", - "id": "4a2515c4", + "id": "370bbf0b", "metadata": {}, "source": [ "### ๐ŸŒซ Air Quality Data" @@ -562,13 +978,62 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "9d7088a8", + "execution_count": 30, + "id": "b5a58bfc", "metadata": { "scrolled": true, "tags": [] }, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "DeprecationWarning: Providing event_time as a single-element list is deprecated and will be dropped in future versions. Provide the feature_name string instead.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Feature Group created successfully, explore it at \n", + "https://snurran.hops.works/p/5242/fs/5190/fg/5194\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "6677d288bc0746a48faf802be7032e40", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Uploading Dataframe: 0.00% | | Rows 0/154533 | Elapsed Time: 00:00 | Remaining Time: ?" 
+ ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Launching job: air_quality_1_offline_fg_materialization\n", + "Job started successfully, you can follow the progress at \n", + "https://snurran.hops.works/p/5242/jobs/named/air_quality_1_offline_fg_materialization/executions\n" + ] + }, + { + "data": { + "text/plain": [ + "(, None)" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Get or create feature group\n", "air_quality_fg = fs.get_or_create_feature_group(\n", @@ -577,23 +1042,14 @@ " version=1,\n", " primary_key=['unix_time','city_name'],\n", " event_time=[\"unix_time\"],\n", - ") " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7e04a975-bb58-42e2-9abd-90e68ae37864", - "metadata": {}, - "outputs": [], - "source": [ + ") \n", "# Insert data\n", "air_quality_fg.insert(df_air_quality)" ] }, { "cell_type": "markdown", - "id": "a73a9029", + "id": "09cbe8aa", "metadata": {}, "source": [ "### ๐ŸŒฆ Weather Data" @@ -601,10 +1057,52 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "acc2b799", + "execution_count": 31, + "id": "089f45b7", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Feature Group created successfully, explore it at \n", + "https://snurran.hops.works/p/5242/fs/5190/fg/5195\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "7328db1bd3b84e769e7b4ac88c1416a6", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Uploading Dataframe: 0.00% | | Rows 0/168975 | Elapsed Time: 00:00 | Remaining Time: ?" 
+ ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Launching job: weather_1_offline_fg_materialization\n", + "Job started successfully, you can follow the progress at \n", + "https://snurran.hops.works/p/5242/jobs/named/weather_1_offline_fg_materialization/executions\n" + ] + }, + { + "data": { + "text/plain": [ + "(, None)" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Get or create feature group\n", "weather_fg = fs.get_or_create_feature_group(\n", @@ -613,27 +1111,17 @@ " version=1,\n", " primary_key=['unix_time','city_name'],\n", " event_time=[\"unix_time\"],\n", - ") " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9583b4d1-e2e3-4f56-9e5d-23caa0c49457", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ + ") \n", "# Insert data\n", "weather_fg.insert(df_weather)" ] }, { "cell_type": "markdown", - "id": "87c668dd", + "id": "34f5ffec", "metadata": {}, "source": [ + "---\n", "## โญ๏ธ **Next:** Part 02: Feature Pipeline \n", " \n", "\n", @@ -643,7 +1131,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -657,7 +1145,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.18" + "version": "3.10.13" } }, "nbformat": 4, diff --git a/advanced_tutorials/air_quality/2_air_quality_feature_pipeline.ipynb b/advanced_tutorials/air_quality/2_air_quality_feature_pipeline.ipynb index 580e8fc7..8f159c75 100644 --- a/advanced_tutorials/air_quality/2_air_quality_feature_pipeline.ipynb +++ b/advanced_tutorials/air_quality/2_air_quality_feature_pipeline.ipynb @@ -2,19 +2,21 @@ "cells": [ { "cell_type": "markdown", - "id": "dd094af7", + "id": "f16a717d", "metadata": {}, "source": [ "# **Hopsworks Feature Store** - Part 02: Feature Pipeline\n", "\n", "## 
๐Ÿ—’๏ธ This notebook is divided into the following sections:\n", - "1. Parse Data\n", - "2. Feature Group Insertion" + "\n", + "1. Fetch Feature Groups. \n", + "2. Parse Data.\n", + "3. Feature Group Insertion." ] }, { "cell_type": "markdown", - "id": "a7dcc328", + "id": "37facd6e", "metadata": {}, "source": [ "### ๐Ÿ“ Imports" @@ -22,19 +24,21 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "364e961e", + "execution_count": 1, + "id": "77d2fbe5", "metadata": {}, "outputs": [], "source": [ "import datetime\n", "import time\n", - "import requests\n", "import pandas as pd\n", "import json\n", "\n", "from features import air_quality\n", - "from functions import *\n", + "from functions.parse_air_quality import get_aqi_data_from_open_meteo\n", + "from functions.parse_weather import get_weather_data_from_open_meteo\n", + "from functions.common_functions import *\n", + "\n", "\n", "import warnings\n", "warnings.filterwarnings(\"ignore\")" @@ -42,8 +46,8 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "50d04cc5-6788-4a4c-9f87-c2e00b5fce49", + "execution_count": 2, + "id": "c14c97e6", "metadata": { "tags": [] }, @@ -57,12 +61,23 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "b0d2261f-8907-44f4-9f1a-bd9ec5e1556f", + "execution_count": 3, + "id": "5b67e039", "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "(datetime.date(2024, 3, 17), '2024-03-17')" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Getting the current date\n", "today = datetime.date.today()\n", @@ -73,7 +88,7 @@ }, { "cell_type": "markdown", - "id": "d406b01d", + "id": "0c5ebe2a", "metadata": {}, "source": [ "### ๐Ÿ”ฎ Connecting to Hopsworks Feature Store " @@ -81,16 +96,35 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "8ba3cb02", + "execution_count": 4, + "id": "730eb857", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": 
"stdout", + "output_type": "stream", + "text": [ + "Connected. Call `.close()` to terminate connection gracefully.\n", + "\n", + "Logged in to project, explore it here https://snurran.hops.works/p/5242\n", + "Connected. Call `.close()` to terminate connection gracefully.\n" + ] + } + ], "source": [ "import hopsworks\n", "\n", "project = hopsworks.login()\n", - "fs = project.get_feature_store() \n", - "\n", + "fs = project.get_feature_store() " + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "ddd400ad", + "metadata": {}, + "outputs": [], + "source": [ "# Retrieve feature groups\n", "air_quality_fg = fs.get_feature_group(\n", " name='air_quality',\n", @@ -104,15 +138,7 @@ }, { "cell_type": "markdown", - "id": "c7f61053-a8c0-48a7-afa4-0e8733d2a54a", - "metadata": {}, - "source": [ - "---" - ] - }, - { - "cell_type": "markdown", - "id": "459ee37e-7e74-4051-97f6-2e03f9cac9d8", + "id": "f992009d", "metadata": {}, "source": [ "## ๐ŸŒซ Filling gaps in Air Quality data (PM2.5)" @@ -120,12 +146,21 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "76ae9dd9-ab28-41d1-8478-5af27b7f767e", + "execution_count": 6, + "id": "37058d7f", "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Finished: Reading data from Hopsworks, using ArrowFlight (2.30s) \n", + "Finished: Reading data from Hopsworks, using ArrowFlight (1.48s) \n" + ] + } + ], "source": [ "# Read data from feature groups\n", "df_air_quality = air_quality_fg.read()\n", @@ -134,8 +169,8 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "03063bc6-b58f-47f4-bfc6-8020ec196478", + "execution_count": 7, + "id": "cee48adb", "metadata": { "tags": [] }, @@ -154,12 +189,21 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "5e868bdf-e91a-410a-b654-a315c605f3dc", + "execution_count": 8, + "id": "49b9e259", "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + 
"output_type": "stream", + "text": [ + "โ›ณ๏ธ Last update for Paris: 2024-03-15\n", + "โ›ณ๏ธ Last update for Columbus: 2024-03-15\n" + ] + } + ], "source": [ "# Accessing the last updated date for the city of Paris\n", "paris_last_date = last_dates_aq.get(\"Paris\", \"Not available\")\n", @@ -172,9 +216,22 @@ "print(\"โ›ณ๏ธ Last update for Columbus:\", columbus_last_date)" ] }, + { + "cell_type": "code", + "execution_count": 9, + "id": "f658d581", + "metadata": {}, + "outputs": [], + "source": [ + "for city, date in last_dates_aq.items():\n", + " city_last_date = datetime.datetime.strptime(date, \"%Y-%m-%d\").date()\n", + " if (today - city_last_date) <= datetime.timedelta(days=28):\n", + " last_dates_aq[city] = (city_last_date - datetime.timedelta(days=28)).strftime(\"%Y-%m-%d\")" + ] + }, { "cell_type": "markdown", - "id": "77c4ee8d-7f7e-4bd0-a97b-c3ac0d7db50f", + "id": "f6df0f7f", "metadata": {}, "source": [ "### ๐Ÿง™๐Ÿผโ€โ™‚๏ธ Parsing PM2.5 data" @@ -182,13 +239,159 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "112a7974-37cb-4195-bc71-328af428c491", + "execution_count": 10, + "id": "cc68ab56", "metadata": { "scrolled": true, "tags": [] }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Processed PM2_5 for Amsterdam since 2024-02-16 till 2024-03-17.\n", + "Took 0.11 sec.\n", + "\n", + "Processed PM2_5 for Athina since 2024-02-16 till 2024-03-17.\n", + "Took 0.11 sec.\n", + "\n", + "Processed PM2_5 for Berlin since 2024-02-16 till 2024-03-17.\n", + "Took 0.11 sec.\n", + "\n", + "Processed PM2_5 for Gdansk since 2024-02-16 till 2024-03-17.\n", + "Took 0.11 sec.\n", + "\n", + "Processed PM2_5 for Krakรณw since 2024-02-16 till 2024-03-17.\n", + "Took 0.11 sec.\n", + "\n", + "Processed PM2_5 for London since 2024-02-16 till 2024-03-17.\n", + "Took 0.11 sec.\n", + "\n", + "Processed PM2_5 for Madrid since 2024-02-16 till 2024-03-17.\n", + "Took 0.1 sec.\n", + "\n", + "Processed PM2_5 for 
Marseille since 2024-02-16 till 2024-03-17.\n", + "Took 0.1 sec.\n", + "\n", + "Processed PM2_5 for Milano since 2024-02-16 till 2024-03-17.\n", + "Took 0.11 sec.\n", + "\n", + "Processed PM2_5 for Mรผnchen since 2024-02-16 till 2024-03-17.\n", + "Took 0.11 sec.\n", + "\n", + "Processed PM2_5 for Napoli since 2024-02-16 till 2024-03-17.\n", + "Took 0.11 sec.\n", + "\n", + "Processed PM2_5 for Paris since 2024-02-16 till 2024-03-17.\n", + "Took 0.11 sec.\n", + "\n", + "Processed PM2_5 for Sevilla since 2024-02-16 till 2024-03-17.\n", + "Took 0.11 sec.\n", + "\n", + "Processed PM2_5 for Stockholm since 2024-02-16 till 2024-03-17.\n", + "Took 0.11 sec.\n", + "\n", + "Processed PM2_5 for Tallinn since 2024-02-16 till 2024-03-17.\n", + "Took 0.1 sec.\n", + "\n", + "Processed PM2_5 for Varna since 2024-02-16 till 2024-03-17.\n", + "Took 0.1 sec.\n", + "\n", + "Processed PM2_5 for Wien since 2024-02-16 till 2024-03-17.\n", + "Took 0.1 sec.\n", + "\n", + "Processed PM2_5 for Albuquerque since 2024-02-16 till 2024-03-17.\n", + "Took 0.1 sec.\n", + "\n", + "Processed PM2_5 for Atlanta since 2024-02-16 till 2024-03-17.\n", + "Took 0.1 sec.\n", + "\n", + "Processed PM2_5 for Chicago since 2024-02-16 till 2024-03-17.\n", + "Took 0.11 sec.\n", + "\n", + "Processed PM2_5 for Columbus since 2024-02-16 till 2024-03-17.\n", + "Took 0.11 sec.\n", + "\n", + "Processed PM2_5 for Dallas since 2024-02-16 till 2024-03-17.\n", + "Took 0.1 sec.\n", + "\n", + "Processed PM2_5 for Denver since 2024-02-16 till 2024-03-17.\n", + "Took 0.1 sec.\n", + "\n", + "Processed PM2_5 for Houston since 2024-02-16 till 2024-03-17.\n", + "Took 0.1 sec.\n", + "\n", + "Processed PM2_5 for Los Angeles since 2024-02-16 till 2024-03-17.\n", + "Took 0.1 sec.\n", + "\n", + "Processed PM2_5 for New York since 2024-02-16 till 2024-03-17.\n", + "Took 0.1 sec.\n", + "\n", + "Processed PM2_5 for Phoenix-Mesa since 2024-02-16 till 2024-03-17.\n", + "Took 0.1 sec.\n", + "\n", + "Processed PM2_5 for Salt Lake City since 
2024-02-16 till 2024-03-17.\n", + "Took 0.11 sec.\n", + "\n", + "Processed PM2_5 for San Francisco since 2024-02-16 till 2024-03-17.\n", + "Took 0.1 sec.\n", + "\n", + "Processed PM2_5 for Tampa since 2024-02-16 till 2024-03-17.\n", + "Took 0.1 sec.\n", + "\n", + "Processed PM2_5 for Bellevue-SE 12th St since 2024-02-16 till 2024-03-17.\n", + "Took 0.1 sec.\n", + "\n", + "Processed PM2_5 for DARRINGTON - FIR ST (Darrington High School) since 2024-02-16 till 2024-03-17.\n", + "Took 0.1 sec.\n", + "\n", + "Processed PM2_5 for KENT - JAMES & CENTRAL since 2024-02-16 till 2024-03-17.\n", + "Took 0.1 sec.\n", + "\n", + "Processed PM2_5 for LAKE FOREST PARK TOWNE CENTER since 2024-02-16 till 2024-03-17.\n", + "Took 0.1 sec.\n", + "\n", + "Processed PM2_5 for MARYSVILLE - 7TH AVE (Marysville Junior High) since 2024-02-16 till 2024-03-17.\n", + "Took 0.1 sec.\n", + "\n", + "Processed PM2_5 for NORTH BEND - NORTH BEND WAY since 2024-02-16 till 2024-03-17.\n", + "Took 0.1 sec.\n", + "\n", + "Processed PM2_5 for SEATTLE - BEACON HILL since 2024-02-16 till 2024-03-17.\n", + "Took 0.11 sec.\n", + "\n", + "Processed PM2_5 for SEATTLE - DUWAMISH since 2024-02-16 till 2024-03-17.\n", + "Took 0.11 sec.\n", + "\n", + "Processed PM2_5 for SEATTLE - SOUTH PARK #2 since 2024-02-16 till 2024-03-17.\n", + "Took 0.11 sec.\n", + "\n", + "Processed PM2_5 for Seattle-10th & Weller since 2024-02-16 till 2024-03-17.\n", + "Took 0.13 sec.\n", + "\n", + "Processed PM2_5 for TACOMA - ALEXANDER AVE since 2024-02-16 till 2024-03-17.\n", + "Took 0.1 sec.\n", + "\n", + "Processed PM2_5 for TACOMA - L STREET since 2024-02-16 till 2024-03-17.\n", + "Took 0.66 sec.\n", + "\n", + "Processed PM2_5 for Tacoma-S 36th St since 2024-02-16 till 2024-03-17.\n", + "Took 0.11 sec.\n", + "\n", + "Processed PM2_5 for Tukwila Allentown since 2024-02-16 till 2024-03-17.\n", + "Took 0.11 sec.\n", + "\n", + "Processed PM2_5 for Tulalip-Totem Beach Rd since 2024-02-16 till 2024-03-17.\n", + "Took 0.11 sec.\n", + "\n", + 
"----------------------------------------------------------------\n", + "Parsed new PM2.5 data for ALL locations up to 2024-03-17.\n", + "Took 5.44 sec.\n", + "\n" + ] + } + ], "source": [ "# Storing the current time as the start time of the cell execution\n", "start_of_cell = time.time()\n", @@ -223,17 +426,78 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "1afdc6a5", + "execution_count": 11, + "id": "09db1460", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
city_namedatepm2_5
1392Tulalip-Totem Beach Rd2024-03-1514.5
1393Tulalip-Totem Beach Rd2024-03-1613.7
1394Tulalip-Totem Beach Rd2024-03-1715.1
\n", + "
" + ], + "text/plain": [ + " city_name date pm2_5\n", + "1392 Tulalip-Totem Beach Rd 2024-03-15 14.5\n", + "1393 Tulalip-Totem Beach Rd 2024-03-16 13.7\n", + "1394 Tulalip-Totem Beach Rd 2024-03-17 15.1" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "df_aq_raw.tail(3)" ] }, { "cell_type": "markdown", - "id": "250d9daf-83fa-49f1-bcd8-4efaeb90b99c", + "id": "7cbabf34", "metadata": { "tags": [] }, @@ -243,8 +507,8 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "140b468a-e0c2-44a1-8e44-4cf393407eca", + "execution_count": 12, + "id": "100f0f2d", "metadata": { "tags": [] }, @@ -256,27 +520,200 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "acc181a9-6183-45ec-aed2-8ee684e13b39", + "execution_count": 13, + "id": "91ded83e", "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
city_namedatepm2_5pm_2_5_previous_1_daypm_2_5_previous_2_daypm_2_5_previous_3_daypm_2_5_previous_4_daypm_2_5_previous_5_daypm_2_5_previous_6_daypm_2_5_previous_7_day...exp_std_28_daysyearday_of_monthmonthday_of_weekis_weekendsin_day_of_yearcos_day_of_yearsin_day_of_weekcos_day_of_week
1392Athina2024-03-1725.625.820.214.88.410.39.617.2...5.3762532024173610.9700640.24285-0.7818310.62349
1393Los Angeles2024-03-1728.121.515.112.716.411.027.036.3...7.6965752024173610.9700640.24285-0.7818310.62349
1394Milano2024-03-1743.423.616.846.632.417.021.210.4...22.0443942024173610.9700640.24285-0.7818310.62349
\n", + "

3 rows × 31 columns

\n", + "
" + ], + "text/plain": [ + " city_name date pm2_5 pm_2_5_previous_1_day \\\n", + "1392 Athina 2024-03-17 25.6 25.8 \n", + "1393 Los Angeles 2024-03-17 28.1 21.5 \n", + "1394 Milano 2024-03-17 43.4 23.6 \n", + "\n", + " pm_2_5_previous_2_day pm_2_5_previous_3_day pm_2_5_previous_4_day \\\n", + "1392 20.2 14.8 8.4 \n", + "1393 15.1 12.7 16.4 \n", + "1394 16.8 46.6 32.4 \n", + "\n", + " pm_2_5_previous_5_day pm_2_5_previous_6_day pm_2_5_previous_7_day \\\n", + "1392 10.3 9.6 17.2 \n", + "1393 11.0 27.0 36.3 \n", + "1394 17.0 21.2 10.4 \n", + "\n", + " ... exp_std_28_days year day_of_month month day_of_week \\\n", + "1392 ... 5.376253 2024 17 3 6 \n", + "1393 ... 7.696575 2024 17 3 6 \n", + "1394 ... 22.044394 2024 17 3 6 \n", + "\n", + " is_weekend sin_day_of_year cos_day_of_year sin_day_of_week \\\n", + "1392 1 0.970064 0.24285 -0.781831 \n", + "1393 1 0.970064 0.24285 -0.781831 \n", + "1394 1 0.970064 0.24285 -0.781831 \n", + "\n", + " cos_day_of_week \n", + "1392 0.62349 \n", + "1393 0.62349 \n", + "1394 0.62349 \n", + "\n", + "[3 rows x 31 columns]" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Applying a feature engineering function 'feature_engineer_aq' to the 'df_aq_update' DataFrame\n", "df_aq_update = air_quality.feature_engineer_aq(df_aq_raw)\n", "\n", "# Dropping rows with missing values in the 'df_aq_update' DataFrame\n", "df_aq_update = df_aq_update.dropna()\n", + "\n", "df_aq_update.tail(3)" ] }, { "cell_type": "code", - "execution_count": null, - "id": "0364873c", + "execution_count": 14, + "id": "a387ac8a", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "0" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Checking the total number of missing values in the 'df_aq_update' DataFrame\n", "df_aq_update.isna().sum().sum()" @@ -284,12 +721,23 @@ }, { "cell_type": "code", - "execution_count": null, - 
"id": "94f67c89-6b39-4748-b4be-6ed3c9d57f96", + "execution_count": 15, + "id": "bb4b8914", "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "(135, 31)" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Retrieving the dimensions (number of rows and columns) of the 'df_aq_update' DataFrame\n", "df_aq_update.shape" @@ -297,15 +745,7 @@ }, { "cell_type": "markdown", - "id": "d74f5622-6f57-47b9-ac0b-dfb6617847b2", - "metadata": {}, - "source": [ - "---" - ] - }, - { - "cell_type": "markdown", - "id": "95a34c64-5b94-4c4f-b03d-14e12a106f25", + "id": "fe63cf55", "metadata": {}, "source": [ "## ๐ŸŒฆ Filling gaps in Weather data" @@ -313,8 +753,8 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "46009853-160c-467e-abb0-3145d27c57dc", + "execution_count": 16, + "id": "36a7388e", "metadata": { "tags": [] }, @@ -333,7 +773,7 @@ }, { "cell_type": "markdown", - "id": "1fd15812-a3a9-488c-879e-181c7b815357", + "id": "6c868dae", "metadata": { "tags": [] }, @@ -343,13 +783,159 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "ef027d28-3443-4c7c-9e85-783625301a14", + "execution_count": 17, + "id": "0ad03c2d", "metadata": { "scrolled": true, "tags": [] }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Parsed weather for Amsterdam since 2024-03-15 till 2024-03-17.\n", + "Took 2.11 sec.\n", + "\n", + "Parsed weather for Athina since 2024-03-15 till 2024-03-17.\n", + "Took 2.1 sec.\n", + "\n", + "Parsed weather for Berlin since 2024-03-15 till 2024-03-17.\n", + "Took 2.1 sec.\n", + "\n", + "Parsed weather for Gdansk since 2024-03-15 till 2024-03-17.\n", + "Took 2.1 sec.\n", + "\n", + "Parsed weather for Krakรณw since 2024-03-15 till 2024-03-17.\n", + "Took 2.1 sec.\n", + "\n", + "Parsed weather for London since 2024-03-15 till 2024-03-17.\n", + "Took 2.1 sec.\n", + "\n", + "Parsed weather for Madrid 
since 2024-03-15 till 2024-03-17.\n", + "Took 2.11 sec.\n", + "\n", + "Parsed weather for Marseille since 2024-03-15 till 2024-03-17.\n", + "Took 2.11 sec.\n", + "\n", + "Parsed weather for Milano since 2024-03-15 till 2024-03-17.\n", + "Took 2.1 sec.\n", + "\n", + "Parsed weather for Mรผnchen since 2024-03-15 till 2024-03-17.\n", + "Took 2.1 sec.\n", + "\n", + "Parsed weather for Napoli since 2024-03-15 till 2024-03-17.\n", + "Took 2.11 sec.\n", + "\n", + "Parsed weather for Paris since 2024-03-15 till 2024-03-17.\n", + "Took 2.11 sec.\n", + "\n", + "Parsed weather for Sevilla since 2024-03-15 till 2024-03-17.\n", + "Took 2.1 sec.\n", + "\n", + "Parsed weather for Stockholm since 2024-03-15 till 2024-03-17.\n", + "Took 2.1 sec.\n", + "\n", + "Parsed weather for Tallinn since 2024-03-15 till 2024-03-17.\n", + "Took 2.1 sec.\n", + "\n", + "Parsed weather for Varna since 2024-03-15 till 2024-03-17.\n", + "Took 2.11 sec.\n", + "\n", + "Parsed weather for Wien since 2024-03-15 till 2024-03-17.\n", + "Took 2.1 sec.\n", + "\n", + "Parsed weather for Albuquerque since 2024-03-15 till 2024-03-17.\n", + "Took 2.1 sec.\n", + "\n", + "Parsed weather for Atlanta since 2024-03-15 till 2024-03-17.\n", + "Took 2.1 sec.\n", + "\n", + "Parsed weather for Chicago since 2024-03-15 till 2024-03-17.\n", + "Took 2.1 sec.\n", + "\n", + "Parsed weather for Columbus since 2024-03-15 till 2024-03-17.\n", + "Took 2.1 sec.\n", + "\n", + "Parsed weather for Dallas since 2024-03-15 till 2024-03-17.\n", + "Took 2.1 sec.\n", + "\n", + "Parsed weather for Denver since 2024-03-15 till 2024-03-17.\n", + "Took 2.1 sec.\n", + "\n", + "Parsed weather for Houston since 2024-03-15 till 2024-03-17.\n", + "Took 2.11 sec.\n", + "\n", + "Parsed weather for Los Angeles since 2024-03-15 till 2024-03-17.\n", + "Took 2.1 sec.\n", + "\n", + "Parsed weather for New York since 2024-03-15 till 2024-03-17.\n", + "Took 2.1 sec.\n", + "\n", + "Parsed weather for Phoenix-Mesa since 2024-03-15 till 2024-03-17.\n", + 
"Took 2.1 sec.\n", + "\n", + "Parsed weather for Salt Lake City since 2024-03-15 till 2024-03-17.\n", + "Took 2.1 sec.\n", + "\n", + "Parsed weather for San Francisco since 2024-03-15 till 2024-03-17.\n", + "Took 2.11 sec.\n", + "\n", + "Parsed weather for Tampa since 2024-03-15 till 2024-03-17.\n", + "Took 2.1 sec.\n", + "\n", + "Parsed weather for Bellevue-SE 12th St since 2024-03-15 till 2024-03-17.\n", + "Took 2.1 sec.\n", + "\n", + "Parsed weather for DARRINGTON - FIR ST (Darrington High School) since 2024-03-15 till 2024-03-17.\n", + "Took 2.11 sec.\n", + "\n", + "Parsed weather for KENT - JAMES & CENTRAL since 2024-03-15 till 2024-03-17.\n", + "Took 2.11 sec.\n", + "\n", + "Parsed weather for LAKE FOREST PARK TOWNE CENTER since 2024-03-15 till 2024-03-17.\n", + "Took 2.12 sec.\n", + "\n", + "Parsed weather for MARYSVILLE - 7TH AVE (Marysville Junior High) since 2024-03-15 till 2024-03-17.\n", + "Took 2.12 sec.\n", + "\n", + "Parsed weather for NORTH BEND - NORTH BEND WAY since 2024-03-15 till 2024-03-17.\n", + "Took 2.11 sec.\n", + "\n", + "Parsed weather for SEATTLE - BEACON HILL since 2024-03-15 till 2024-03-17.\n", + "Took 2.11 sec.\n", + "\n", + "Parsed weather for SEATTLE - DUWAMISH since 2024-03-15 till 2024-03-17.\n", + "Took 2.11 sec.\n", + "\n", + "Parsed weather for SEATTLE - SOUTH PARK #2 since 2024-03-15 till 2024-03-17.\n", + "Took 2.11 sec.\n", + "\n", + "Parsed weather for Seattle-10th & Weller since 2024-03-15 till 2024-03-17.\n", + "Took 2.11 sec.\n", + "\n", + "Parsed weather for TACOMA - ALEXANDER AVE since 2024-03-15 till 2024-03-17.\n", + "Took 2.1 sec.\n", + "\n", + "Parsed weather for TACOMA - L STREET since 2024-03-15 till 2024-03-17.\n", + "Took 2.1 sec.\n", + "\n", + "Parsed weather for Tacoma-S 36th St since 2024-03-15 till 2024-03-17.\n", + "Took 2.1 sec.\n", + "\n", + "Parsed weather for Tukwila Allentown since 2024-03-15 till 2024-03-17.\n", + "Took 2.11 sec.\n", + "\n", + "Parsed weather for Tulalip-Totem Beach Rd since 
2024-03-15 till 2024-03-17.\n", + "Took 2.1 sec.\n", + "\n", + "----------------------------------------------------------------\n", + "Parsed new weather data for ALL cities up to 2024-03-17.\n", + "Took 94.79 sec.\n", + "\n" + ] + } + ], "source": [ "# Storing the current time as the start time of the cell execution\n", "start_of_cell = time.time()\n", @@ -388,8 +974,8 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "a7bff400-a2fb-48a3-a07b-5bd2a0469cd7", + "execution_count": 18, + "id": "de4d4870", "metadata": { "tags": [] }, @@ -410,12 +996,119 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "11752b30-2f40-4668-9813-2a90199c62b8", + "execution_count": 19, + "id": "82d7bc88", "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
city_namedatetemperature_maxtemperature_minprecipitation_sumrain_sumsnowfall_sumprecipitation_hourswind_speed_maxwind_gusts_maxwind_direction_dominantunix_time
132Tulalip-Totem Beach Rd2024-03-1515.34.20.00.00.00.010.822.33431710460800000
133Tulalip-Totem Beach Rd2024-03-1621.34.80.00.00.00.09.825.93361710547200000
134Tulalip-Totem Beach Rd2024-03-1722.09.20.00.00.00.011.414.0931710633600000
\n", + "
" + ], + "text/plain": [ + " city_name date temperature_max temperature_min \\\n", + "132 Tulalip-Totem Beach Rd 2024-03-15 15.3 4.2 \n", + "133 Tulalip-Totem Beach Rd 2024-03-16 21.3 4.8 \n", + "134 Tulalip-Totem Beach Rd 2024-03-17 22.0 9.2 \n", + "\n", + " precipitation_sum rain_sum snowfall_sum precipitation_hours \\\n", + "132 0.0 0.0 0.0 0.0 \n", + "133 0.0 0.0 0.0 0.0 \n", + "134 0.0 0.0 0.0 0.0 \n", + "\n", + " wind_speed_max wind_gusts_max wind_direction_dominant unix_time \n", + "132 10.8 22.3 343 1710460800000 \n", + "133 9.8 25.9 336 1710547200000 \n", + "134 11.4 14.0 93 1710633600000 " + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Converting the 'date' column in the 'df_aq_update' DataFrame to string format\n", "df_aq_update.date = df_aq_update.date.astype(str)\n", @@ -430,15 +1123,7 @@ }, { "cell_type": "markdown", - "id": "792dd383", - "metadata": {}, - "source": [ - "---" - ] - }, - { - "cell_type": "markdown", - "id": "5aef353d", + "id": "7b5640f9", "metadata": { "tags": [] }, @@ -448,10 +1133,44 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "f81bb922", + "execution_count": 20, + "id": "fd72be07", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "ad45db9921ea44f392976d59e79f3999", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Uploading Dataframe: 0.00% | | Rows 0/135 | Elapsed Time: 00:00 | Remaining Time: ?" 
+ ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Launching job: air_quality_1_offline_fg_materialization\n", + "Job started successfully, you can follow the progress at \n", + "https://snurran.hops.works/p/5242/jobs/named/air_quality_1_offline_fg_materialization/executions\n" + ] + }, + { + "data": { + "text/plain": [ + "(, None)" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Insert new data\n", "air_quality_fg.insert(df_aq_update)" @@ -459,10 +1178,44 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "be0c498e", + "execution_count": 21, + "id": "cdffe4a9", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "afe7919348224c17b0f680c8c8067507", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Uploading Dataframe: 0.00% | | Rows 0/135 | Elapsed Time: 00:00 | Remaining Time: ?" 
+ ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Launching job: weather_1_offline_fg_materialization\n", + "Job started successfully, you can follow the progress at \n", + "https://snurran.hops.works/p/5242/jobs/named/weather_1_offline_fg_materialization/executions\n" + ] + }, + { + "data": { + "text/plain": [ + "(, None)" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Insert new data\n", "weather_fg.insert(df_weather_update)" @@ -470,19 +1223,20 @@ }, { "cell_type": "markdown", - "id": "b50c64a1", + "id": "d03605ea", "metadata": {}, "source": [ + "---\n", "## โญ๏ธ **Next:** Part 03: Training Pipeline\n", " \n", "\n", - "In the following notebook you will read from a feature group and create training dataset within the feature store\n" + "In the following notebook you will create a feature view, create a training dataset, train a model and save it in the Hopsworks Model Registry." 
] } ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -496,7 +1250,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.18" + "version": "3.10.11" }, "vscode": { "interpreter": { diff --git a/advanced_tutorials/air_quality/3_air_quality_training_pipeline.ipynb b/advanced_tutorials/air_quality/3_air_quality_training_pipeline.ipynb index 2104e1f5..2dc9af7b 100644 --- a/advanced_tutorials/air_quality/3_air_quality_training_pipeline.ipynb +++ b/advanced_tutorials/air_quality/3_air_quality_training_pipeline.ipynb @@ -2,28 +2,29 @@ "cells": [ { "cell_type": "markdown", - "id": "7eb83ff8", + "id": "3d8d5c4a", "metadata": { "tags": [] }, "source": [ "# **Hopsworks Feature Store** - Part 03: Training Pipeline\n", "\n", - "This notebook explains how to read from a feature group and create training dataset within the feature store\n", + "This notebook explains how to create a feature view, create a training dataset, train a model and save it in the Hopsworks Model Registry.\n", "\n", "## ๐Ÿ—’๏ธ This notebook is divided into the following sections:\n", "\n", - "1. Fetch Feature Groups\n", - "2. Define Transformation functions\n", - "4. Create Feature Views\n", - "5. Create Training Dataset with training, validation and test splits\n", + "1. Fetch Feature Groups.\n", + "2. Create a Feature View.\n", + "3. Create a Training Dataset.\n", + "4. Train a model.\n", + "5. 
Save trained model in the Model Registry.\n", "\n", "![part2](../../images/02_training-dataset.png) " ] }, { "cell_type": "markdown", - "id": "f3b5f602-a575-49a8-bce9-a997cca936e0", + "id": "e89e0ed8", "metadata": {}, "source": [ "### ๐Ÿ“ Imports" @@ -31,32 +32,25 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "ad609eec-0b46-445f-a0f5-5657e5f69866", - "metadata": {}, - "outputs": [], - "source": [ - "!pip install xgboost --q" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b3f2ac81-423a-4380-8fd6-b70aa55eb864", + "execution_count": 1, + "id": "7e858f8a", "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2024-03-12 15:53:54,685 INFO: generated new fontManager\n" + ] + } + ], "source": [ "import os\n", - "import datetime\n", - "import time\n", - "import json\n", - "import pickle\n", "import joblib\n", "\n", "import pandas as pd\n", - "import numpy as np\n", "\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", @@ -73,7 +67,7 @@ }, { "cell_type": "markdown", - "id": "a0b3bcd1", + "id": "e4d834bc", "metadata": {}, "source": [ "## ๐Ÿ“ก Connecting to Hopsworks Feature Store " @@ -81,10 +75,21 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "89ad779f", + "execution_count": 2, + "id": "817cdef7", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Connected. Call `.close()` to terminate connection gracefully.\n", + "\n", + "Logged in to project, explore it here https://snurran.hops.works/p/5242\n", + "Connected. 
Call `.close()` to terminate connection gracefully.\n" + ] + } + ], "source": [ "import hopsworks\n", "\n", @@ -95,8 +100,8 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "735a083e", + "execution_count": 3, + "id": "dff51a06", "metadata": {}, "outputs": [], "source": [ @@ -113,18 +118,16 @@ }, { "cell_type": "markdown", - "id": "be427dca", + "id": "45881fbc", "metadata": {}, "source": [ - "--- \n", - "\n", - "## ๐Ÿ– Feature View Creation and Retrieving " + "## ๐Ÿ– Feature View Creation and Retrieval " ] }, { "cell_type": "code", - "execution_count": null, - "id": "cc3192d3", + "execution_count": 4, + "id": "0d4e6eba", "metadata": {}, "outputs": [], "source": [ @@ -137,8 +140,8 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "b3b8ba7b-b0ab-4ea5-b050-f8e1faf43c27", + "execution_count": 5, + "id": "1d5cf648", "metadata": { "scrolled": true, "tags": [] @@ -151,7 +154,7 @@ }, { "cell_type": "markdown", - "id": "d83a1681", + "id": "82c5b7be", "metadata": {}, "source": [ "`Feature Views` stands between **Feature Groups** and **Training Dataset**. ะกombining **Feature Groups** we can create **Feature Views** which store a metadata of our data. Having **Feature Views** we can create **Training Dataset**.\n", @@ -175,8 +178,8 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "403df0b4", + "execution_count": 6, + "id": "c0d7fec3", "metadata": {}, "outputs": [], "source": [ @@ -190,19 +193,17 @@ }, { "cell_type": "markdown", - "id": "0c723c54", + "id": "8f12f3ac", "metadata": {}, "source": [ - "For now `Feature View` is saved in Hopsworks and you can retrieve it using `FeatureStore.get_feature_view()`." + "For now, your `Feature View` is saved in Hopsworks and you can retrieve it using `FeatureStore.get_feature_view()`." 
] }, { "cell_type": "markdown", - "id": "6e1187a2", + "id": "72aeb854", "metadata": {}, "source": [ - "---\n", - "\n", "## ๐Ÿ‹๏ธ Training Dataset Creation\n", "\n", "In Hopsworks training data is a query where the projection (set of features) is determined by the parent FeatureView with an optional snapshot on disk of the data returned by the query.\n", @@ -228,10 +229,25 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "2f5bcf22-6ff1-4995-a8c3-11a1dab396a7", + "execution_count": 7, + "id": "317668a8", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Finished: Reading data from Hopsworks, using ArrowFlight (12.56s) \n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "VersionWarning: Incremented version to `2`.\n" + ] + } + ], "source": [ "X, _ = feature_view.training_data(\n", " description = 'Air Quality dataset',\n", @@ -240,15 +256,7 @@ }, { "cell_type": "markdown", - "id": "c995b340-5ba6-4116-b8b6-86ca34f0a0ab", - "metadata": {}, - "source": [ - "---" - ] - }, - { - "cell_type": "markdown", - "id": "95783124-8303-47c5-bd15-2804efa15611", + "id": "a18b3733", "metadata": {}, "source": [ "## ๐Ÿงฌ Modeling" @@ -256,25 +264,25 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "5b937dec", + "execution_count": 8, + "id": "16c721ea", "metadata": {}, "outputs": [], "source": [ - "# Creating a LabelEncoder object\n", + "# Create a LabelEncoder object\n", "label_encoder = LabelEncoder()\n", "\n", - "# Fitting the encoder to the data in the 'city_name' column\n", + "# Fit the encoder to the data in the 'city_name' column\n", "label_encoder.fit(X[['city_name']])\n", "\n", - "# Transforming the 'city_name' column data using the fitted encoder\n", + "# Transform the 'city_name' column data using the fitted encoder\n", "encoded = label_encoder.transform(X[['city_name']])" ] }, { "cell_type": "code", - "execution_count": null, - "id": 
"97cdb6bb-6c9c-44b7-9171-1b420bae9181", + "execution_count": 9, + "id": "ee1a5c8c", "metadata": { "tags": [] }, @@ -292,90 +300,342 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "4df41c7d-00bd-4203-90a1-8cc298508d68", + "execution_count": 10, + "id": "612ef824", "metadata": { "tags": [] }, "outputs": [], "source": [ - "# Extracting the target variable 'pm2_5' from the DataFrame 'X' and assigning it to the variable 'y'\n", + "# Extract the target variable 'pm2_5' from the DataFrame 'X' and assigning it to the variable 'y'\n", "y = X.pop('pm2_5')" ] }, { "cell_type": "code", - "execution_count": null, - "id": "d0299506-195f-4ebc-b43e-347fe59db31c", + "execution_count": 11, + "id": "02950e70", "metadata": { "tags": [] }, - "outputs": [], - "source": [ - "# Splitting the data into training and testing sets using the train_test_split function\n", + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
pm_2_5_previous_1_daypm_2_5_previous_2_daypm_2_5_previous_3_daypm_2_5_previous_4_daypm_2_5_previous_5_daypm_2_5_previous_6_daypm_2_5_previous_7_daymean_7_daysmean_14_daysmean_28_days...temperature_maxtemperature_minprecipitation_sumrain_sumsnowfall_sumprecipitation_hourswind_speed_maxwind_gusts_maxwind_direction_dominantcity_name_encoded
1058180.00.02.03.06.06.07.03.4285716.8571435.142857...5.13.71.61.60.06.027.247.9639
669794.73.73.98.015.314.210.68.6285718.8500009.428571...0.0-6.80.00.00.00.019.345.06036
12222310.012.04.010.03.04.04.06.71428610.00000010.964286...1.5-1.90.00.00.00.013.524.127911
\n", + "

3 rows ร— 38 columns

\n", + "
" + ], + "text/plain": [ + " pm_2_5_previous_1_day pm_2_5_previous_2_day pm_2_5_previous_3_day \\\n", + "105818 0.0 0.0 2.0 \n", + "66979 4.7 3.7 3.9 \n", + "122223 10.0 12.0 4.0 \n", + "\n", + " pm_2_5_previous_4_day pm_2_5_previous_5_day pm_2_5_previous_6_day \\\n", + "105818 3.0 6.0 6.0 \n", + "66979 8.0 15.3 14.2 \n", + "122223 10.0 3.0 4.0 \n", + "\n", + " pm_2_5_previous_7_day mean_7_days mean_14_days mean_28_days ... \\\n", + "105818 7.0 3.428571 6.857143 5.142857 ... \n", + "66979 10.6 8.628571 8.850000 9.428571 ... \n", + "122223 4.0 6.714286 10.000000 10.964286 ... \n", + "\n", + " temperature_max temperature_min precipitation_sum rain_sum \\\n", + "105818 5.1 3.7 1.6 1.6 \n", + "66979 0.0 -6.8 0.0 0.0 \n", + "122223 1.5 -1.9 0.0 0.0 \n", + "\n", + " snowfall_sum precipitation_hours wind_speed_max wind_gusts_max \\\n", + "105818 0.0 6.0 27.2 47.9 \n", + "66979 0.0 0.0 19.3 45.0 \n", + "122223 0.0 0.0 13.5 24.1 \n", + "\n", + " wind_direction_dominant city_name_encoded \n", + "105818 6 39 \n", + "66979 60 36 \n", + "122223 279 11 \n", + "\n", + "[3 rows x 38 columns]" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Split the data into training and testing sets using the train_test_split function\n", "X_train, X_test, y_train, y_test = train_test_split(\n", - " X, y, test_size=0.2, random_state=42)" + " X, \n", + " y, \n", + " test_size=0.2, \n", + " random_state=42,\n", + ")\n", + "\n", + "X_train.head(3)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "b4ddfaeb", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "105818 2.0\n", + "66979 9.8\n", + "122223 11.0\n", + "Name: pm2_5, dtype: float64" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "y_train.head(3)" + ] + }, + { + "cell_type": "markdown", + "id": "59e85ea3", + "metadata": {}, + "source": [ + "## ๐Ÿƒ๐Ÿปโ€โ™‚๏ธ Model Training" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "44a6893f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
XGBRegressor(base_score=None, booster=None, callbacks=None,\n",
+       "             colsample_bylevel=None, colsample_bynode=None,\n",
+       "             colsample_bytree=None, device=None, early_stopping_rounds=None,\n",
+       "             enable_categorical=False, eval_metric=None, feature_types=None,\n",
+       "             gamma=None, grow_policy=None, importance_type=None,\n",
+       "             interaction_constraints=None, learning_rate=None, max_bin=None,\n",
+       "             max_cat_threshold=None, max_cat_to_onehot=None,\n",
+       "             max_delta_step=None, max_depth=None, max_leaves=None,\n",
+       "             min_child_weight=None, missing=nan, monotone_constraints=None,\n",
+       "             multi_strategy=None, n_estimators=None, n_jobs=None,\n",
+       "             num_parallel_tree=None, random_state=None, ...)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], + "text/plain": [ + "XGBRegressor(base_score=None, booster=None, callbacks=None,\n", + " colsample_bylevel=None, colsample_bynode=None,\n", + " colsample_bytree=None, device=None, early_stopping_rounds=None,\n", + " enable_categorical=False, eval_metric=None, feature_types=None,\n", + " gamma=None, grow_policy=None, importance_type=None,\n", + " interaction_constraints=None, learning_rate=None, max_bin=None,\n", + " max_cat_threshold=None, max_cat_to_onehot=None,\n", + " max_delta_step=None, max_depth=None, max_leaves=None,\n", + " min_child_weight=None, missing=nan, monotone_constraints=None,\n", + " multi_strategy=None, n_estimators=None, n_jobs=None,\n", + " num_parallel_tree=None, random_state=None, ...)" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Create an instance of the XGBoost Regressor\n", + "xgb_regressor = XGBRegressor()\n", + "\n", + "# Fit the XGBoost Regressor to the training data\n", + "xgb_regressor.fit(X_train, y_train)" ] }, { "cell_type": "markdown", - "id": "8fd4e24e-7f02-4944-a309-6475b65e7846", + "id": "6335331f", "metadata": {}, "source": [ - "### โš–๏ธ Model Validation" + "## โš–๏ธ Model Validation" ] }, { "cell_type": "code", - "execution_count": null, - "id": "4e5be9f8-0f88-4a7e-8fc1-65ec8b02920d", + "execution_count": 14, + "id": "405bb3d6", "metadata": { "tags": [] }, - "outputs": [], - "source": [ - "# Storing the current time as the start time of the cell execution\n", - "start_of_cell = time.time()\n", - "\n", - "# Creating an instance of the XGBoost Regressor\n", - "xgb_regressor = XGBRegressor()\n", - "\n", - "# Fitting the XGBoost Regressor to the training data\n", - "xgb_regressor.fit(X_train, y_train)\n", - "\n", - "# Predicting target values on the test set\n", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "โ›ณ๏ธ MSE: 29.739315036119873\n", + "โ›ณ๏ธ RMSE: 5.453376480321148\n", + "โ›ณ๏ธ R^2: 0.7422035343350755\n" + 
] + } + ], + "source": [ + "# Predict target values on the test set\n", "y_pred = xgb_regressor.predict(X_test)\n", "\n", - "# Calculating Mean Squared Error (MSE) using sklearn\n", + "# Calculate Mean Squared Error (MSE) using sklearn\n", "mse = mean_squared_error(y_test, y_pred)\n", - "print(\"MSE:\", mse)\n", + "print(\"โ›ณ๏ธ MSE:\", mse)\n", "\n", - "# Calculating Root Mean Squared Error (RMSE) using sklearn\n", + "# Calculate Root Mean Squared Error (RMSE) using sklearn\n", "rmse = mean_squared_error(y_test, y_pred, squared=False)\n", - "print(\"RMSE:\", rmse)\n", + "print(\"โ›ณ๏ธ RMSE:\", rmse)\n", "\n", - "# Calculating R squared using sklearn\n", + "# Calculate R squared using sklearn\n", "r2 = r2_score(y_test, y_pred)\n", - "print(\"R squared:\", r2)\n", - "\n", - "# Storing the current time as the end time of the cell execution\n", - "end_of_cell = time.time()\n", - "\n", - "# Printing information about the execution, including the time taken\n", - "print(f\"Took {round(end_of_cell - start_of_cell, 2)} sec.\\n\")" + "print(\"โ›ณ๏ธ R^2:\", r2)" ] }, { "cell_type": "code", - "execution_count": null, - "id": "ac31f9fb-7904-416a-9938-e85320340412", + "execution_count": 15, + "id": "b19bd4aa", "metadata": { "tags": [] }, "outputs": [], "source": [ - "# Creating a DataFrame 'df_' to store true and predicted values for evaluation\n", - "df_ = pd.DataFrame({\n", + "# Create a DataFrame 'df_' to store true and predicted values for evaluation\n", + "df_pred = pd.DataFrame({\n", " \"y_true\": y_test,\n", " \"y_pred\": y_pred,\n", "})" @@ -383,55 +643,71 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "f2fc8448-2150-4cfd-803d-afbd4845b59e", + "execution_count": 16, + "id": "9a72e8f5", "metadata": { "tags": [] }, - "outputs": [], - "source": [ - "# Creating a residual plot using Seaborn\n", - "residplot = sns.residplot(data=df_, x=\"y_true\", y=\"y_pred\", color='orange')\n", - "\n", - "# Adding title, xlabel, and ylabel to the residual plot\n", + 
"outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Create a residual plot using Seaborn\n", + "residplot = sns.residplot(data=df_pred, x=\"y_true\", y=\"y_pred\", color='orange')\n", + "\n", + "# Add title, xlabel, and ylabel to the residual plot\n", "plt.title('Model Residuals')\n", "plt.xlabel('Observation #')\n", "plt.ylabel('Error')\n", "\n", - "# Displaying the residual plot\n", + "# Display the residual plot\n", "plt.show()\n", "\n", - "# Getting the figure from the residual plot and displaying it separately\n", + "# Get the figure from the residual plot and displaying it separately\n", "fig = residplot.get_figure()\n", "fig.show()" ] }, { "cell_type": "code", - "execution_count": null, - "id": "5ae4e226-7a93-4e4d-8131-c1de62a7b6f9", + "execution_count": 17, + "id": "d5157113", "metadata": { "tags": [] }, - "outputs": [], - "source": [ - "# Plotting feature importances using the plot_importance function from XGBoost\n", - "# 'xgb_regressor' is the trained XGBoost Regressor\n", - "# Setting 'max_num_features' to 25 to display the top 25 most important features\n", - "plot_importance(xgb_regressor, max_num_features=25)" - ] - }, - { - "cell_type": "markdown", - "id": "3dcea831-6c21-4396-a0ce-0631d21d1875", - "metadata": {}, - "source": [ - "---" + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Plot feature importances using the plot_importance function from XGBoost\n", + "plot_importance(\n", + " xgb_regressor, \n", + " max_num_features=25, # Display the top 25 most important features\n", + ")\n", + "plt.show()" ] }, { "cell_type": "markdown", - "id": "c066fe79-315e-4b85-b2ab-32d503679dc7", + "id": "0977f9fe", "metadata": { "tags": [] }, @@ -443,12 +719,20 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "a787ec40-6bd7-4950-aa5d-bf004e1e5ade", + "execution_count": 18, + "id": "e2fd7e49", "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Connected. Call `.close()` to terminate connection gracefully.\n" + ] + } + ], "source": [ "# Retrieve the model registry\n", "mr = project.get_model_registry()" @@ -456,7 +740,7 @@ }, { "cell_type": "markdown", - "id": "7d240dc7-8a02-47b2-9667-7483508b2d24", + "id": "04886431", "metadata": {}, "source": [ "### โš™๏ธ Model Schema" @@ -464,7 +748,7 @@ }, { "cell_type": "markdown", - "id": "5c658df3-56a4-450b-90ee-127d0afe5b74", + "id": "86eb61ed", "metadata": {}, "source": [ "The model needs to be set up with a [Model Schema](https://docs.hopsworks.ai/machine-learning-api/latest/generated/model_schema/), which describes the inputs and outputs for a model.\n", @@ -474,8 +758,8 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "cd3f3751", + "execution_count": 19, + "id": "8a98e889", "metadata": { "scrolled": true }, @@ -484,47 +768,79 @@ "from hsml.schema import Schema\n", "from hsml.model_schema import ModelSchema\n", "\n", - "# Creating input and output schemas using the 'Schema' class for features (X) and target variable (y)\n", + "# Create input and output schemas using the 'Schema' class for features (X) and target variable (y)\n", "input_schema = Schema(X)\n", "output_schema = Schema(y)\n", "\n", - "# Creating a model schema 
using 'ModelSchema' with the input and output schemas\n", + "# Create a model schema using 'ModelSchema' with the input and output schemas\n", "model_schema = ModelSchema(input_schema=input_schema, output_schema=output_schema)\n", "\n", - "# Converting the model schema to a dictionary representation\n", + "# Convert the model schema to a dictionary representation\n", "schema_dict = model_schema.to_dict()" ] }, { "cell_type": "code", - "execution_count": null, - "id": "d2777f5e", + "execution_count": 20, + "id": "a3d26b4d", "metadata": { "scrolled": true }, "outputs": [], "source": [ - "# Creating a directory for the model artifacts if it doesn't exist\n", + "# Create a directory for the model artifacts if it doesn't exist\n", "model_dir = \"air_quality_model\"\n", "if os.path.isdir(model_dir) == False:\n", " os.mkdir(model_dir)\n", "\n", - "# Saving the label encoder and XGBoost regressor as joblib files in the model directory\n", + "# Save the label encoder and XGBoost regressor as joblib files in the model directory\n", "joblib.dump(label_encoder, model_dir + '/label_encoder.pkl')\n", "joblib.dump(xgb_regressor, model_dir + '/xgboost_regressor.pkl')\n", "\n", - "# Saving the residual plot figure as an image in the model directory\n", + "# Save the residual plot figure as an image in the model directory\n", "fig.savefig(model_dir + \"/residplot.png\")" ] }, { "cell_type": "code", - "execution_count": null, - "id": "41f6811e", + "execution_count": 21, + "id": "ac2d8166", "metadata": {}, - "outputs": [], - "source": [ - "# Creating a Python model in the model registry named 'air_quality_xgboost_model'\n", + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "db8f5c6580f0428fa7ac741fe6bd7f89", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/6 [00:00 **Hopsworks Feature Store** - Part 04: Batch Inference\n", @@ -10,12 +10,14 @@ "## ๐Ÿ—’๏ธ This notebook is divided into the following sections:\n", 
"\n", "1. Load batch data.\n", - "2. Predict using model from Model Registry." + "2. Retrieve your trained model from the Model Registry.\n", + "3. Load batch data.\n", + "4. Predict batch data." ] }, { "cell_type": "markdown", - "id": "8855ee1a", + "id": "0cbefa72", "metadata": {}, "source": [ "## ๐Ÿ“ Imports" @@ -23,31 +25,41 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "019c9226", + "execution_count": 1, + "id": "2635641d", "metadata": {}, "outputs": [], "source": [ "import joblib\n", "import datetime\n", - "import time\n", "import pandas as pd" ] }, { "cell_type": "markdown", - "id": "ce2fe8a8", + "id": "97e466ff", "metadata": {}, "source": [ - "## ๐Ÿ“ก Connecting to Hopsworks Feature Store " + "## ๐Ÿ“ก Connect to Hopsworks Feature Store " ] }, { "cell_type": "code", - "execution_count": null, - "id": "39f83bc9", + "execution_count": 2, + "id": "dd83456c", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Connected. Call `.close()` to terminate connection gracefully.\n", + "\n", + "Logged in to project, explore it here https://snurran.hops.works/p/5242\n", + "Connected. 
Call `.close()` to terminate connection gracefully.\n" + ] + } + ], "source": [ "import hopsworks\n", "\n", @@ -58,16 +70,16 @@ }, { "cell_type": "markdown", - "id": "87485ee0", + "id": "88a9587d", "metadata": {}, "source": [ - "## โš™๏ธ Feature View Retrieval\n" + "## โš™๏ธ Feature View Retrieval" ] }, { "cell_type": "code", - "execution_count": null, - "id": "e622d6b4", + "execution_count": 3, + "id": "fa7f8ed2", "metadata": {}, "outputs": [], "source": [ @@ -80,18 +92,26 @@ }, { "cell_type": "markdown", - "id": "e1dac8b6", + "id": "20bc6944", "metadata": {}, "source": [ - "## ๐Ÿ—„ Model Registry\n" + "## ๐Ÿ—„ Model Registry" ] }, { "cell_type": "code", - "execution_count": null, - "id": "ca35a9f4", + "execution_count": 4, + "id": "33c4f742", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Connected. Call `.close()` to terminate connection gracefully.\n" + ] + } + ], "source": [ "# Retrieve the model registry\n", "mr = project.get_model_registry()" @@ -99,47 +119,99 @@ }, { "cell_type": "markdown", - "id": "6f3589dc", + "id": "f887c4ba", "metadata": {}, "source": [ - "## ๐Ÿช Retrieving model from Model Registry" + "## ๐Ÿช Retrieve model from Model Registry" ] }, { "cell_type": "code", - "execution_count": null, - "id": "6ac8014f", + "execution_count": 5, + "id": "9f52593a", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Downloading model artifact (0 dirs, 6 files)... 
DONE\r" + ] + } + ], "source": [ - "# Retrieving the 'air_quality_xgboost_model' from the model registry\n", + "# Retrieve the 'air_quality_xgboost_model' from the model registry\n", "retrieved_model = mr.get_model(\n", " name=\"air_quality_xgboost_model\",\n", " version=1,\n", ")\n", "\n", - "# Downloading the saved model artifacts to a local directory\n", + "# Download the saved model artifacts to a local directory\n", "saved_model_dir = retrieved_model.download()" ] }, { "cell_type": "code", - "execution_count": null, - "id": "3812f78d", + "execution_count": 6, + "id": "020d13b0", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
XGBRegressor(base_score=None, booster=None, callbacks=None,\n",
+       "             colsample_bylevel=None, colsample_bynode=None,\n",
+       "             colsample_bytree=None, device=None, early_stopping_rounds=None,\n",
+       "             enable_categorical=False, eval_metric=None, feature_types=None,\n",
+       "             gamma=None, grow_policy=None, importance_type=None,\n",
+       "             interaction_constraints=None, learning_rate=None, max_bin=None,\n",
+       "             max_cat_threshold=None, max_cat_to_onehot=None,\n",
+       "             max_delta_step=None, max_depth=None, max_leaves=None,\n",
+       "             min_child_weight=None, missing=nan, monotone_constraints=None,\n",
+       "             multi_strategy=None, n_estimators=None, n_jobs=None,\n",
+       "             num_parallel_tree=None, random_state=None, ...)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], + "text/plain": [ + "XGBRegressor(base_score=None, booster=None, callbacks=None,\n", + " colsample_bylevel=None, colsample_bynode=None,\n", + " colsample_bytree=None, device=None, early_stopping_rounds=None,\n", + " enable_categorical=False, eval_metric=None, feature_types=None,\n", + " gamma=None, grow_policy=None, importance_type=None,\n", + " interaction_constraints=None, learning_rate=None, max_bin=None,\n", + " max_cat_threshold=None, max_cat_to_onehot=None,\n", + " max_delta_step=None, max_depth=None, max_leaves=None,\n", + " min_child_weight=None, missing=nan, monotone_constraints=None,\n", + " multi_strategy=None, n_estimators=None, n_jobs=None,\n", + " num_parallel_tree=None, random_state=None, ...)" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Loading the XGBoost regressor model and label encoder from the saved model directory\n", + "# Load the XGBoost regressor model and label encoder from the saved model directory\n", "retrieved_xgboost_model = joblib.load(saved_model_dir + \"/xgboost_regressor.pkl\")\n", "retrieved_encoder = joblib.load(saved_model_dir + \"/label_encoder.pkl\")\n", "\n", - "# Displaying the retrieved XGBoost regressor model\n", + "# Display the retrieved XGBoost regressor model\n", "retrieved_xgboost_model" ] }, { "cell_type": "markdown", - "id": "9a762442", + "id": "b8e37bb1", "metadata": {}, "source": [ "## โœจ Load Batch Data of last days\n", @@ -149,38 +221,59 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "4bd49291", + "execution_count": 7, + "id": "733a1355", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "'2024-02-11'" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Getting the current date\n", + "# Get the current date\n", "today = datetime.date.today()\n", "\n", - "# Calculating a date threshold 30 days ago from the current date\n", + "# 
Calculate a date threshold 30 days ago from the current date\n", "date_threshold = today - datetime.timedelta(days=30)\n", "\n", - "# Converting the date threshold to a string format\n", + "# Convert the date threshold to a string format\n", "str(date_threshold)" ] }, { "cell_type": "code", - "execution_count": null, - "id": "3990e55f", + "execution_count": 8, + "id": "5a5c283b", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Finished: Reading data from Hopsworks, using ArrowFlight (7.68s) \n" + ] + } + ], "source": [ - "# Initializing batch scoring\n", + "# Initialize batch scoring\n", "feature_view.init_batch_scoring(1)\n", "\n", - "# Retrieving batch data from the feature view with a start time set to the date threshold\n", - "batch_data = feature_view.get_batch_data(start_time=date_threshold)" + "# Retrieve batch data from the feature view with a start time set to the date threshold\n", + "batch_data = feature_view.get_batch_data(\n", + " start_time=date_threshold,\n", + ")" ] }, { "cell_type": "markdown", - "id": "36f82c4a", + "id": "00a46a76", "metadata": {}, "source": [ "### ๐Ÿค– Making the predictions" @@ -188,44 +281,219 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "a10ff736", + "execution_count": 9, + "id": "f1e09066", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
pm_2_5_previous_1_daypm_2_5_previous_2_daypm_2_5_previous_3_daypm_2_5_previous_4_daypm_2_5_previous_5_daypm_2_5_previous_6_daypm_2_5_previous_7_daymean_7_daysmean_14_daysmean_28_days...temperature_maxtemperature_minprecipitation_sumrain_sumsnowfall_sumprecipitation_hourswind_speed_maxwind_gusts_maxwind_direction_dominantcity_name_encoded
05.316.216.315.425.017.210.215.08571415.24285712.939286...14.28.40.00.000.00.034.759.430220
112.217.512.114.512.316.814.314.24285719.85714318.432143...13.110.520.920.900.022.040.164.821224
211.68.018.212.79.96.812.511.3857149.9214298.271429...10.85.29.113.650.05.014.843.218030
\n", + "

3 rows ร— 38 columns

\n", + "
" + ], + "text/plain": [ + " pm_2_5_previous_1_day pm_2_5_previous_2_day pm_2_5_previous_3_day \\\n", + "0 5.3 16.2 16.3 \n", + "1 12.2 17.5 12.1 \n", + "2 11.6 8.0 18.2 \n", + "\n", + " pm_2_5_previous_4_day pm_2_5_previous_5_day pm_2_5_previous_6_day \\\n", + "0 15.4 25.0 17.2 \n", + "1 14.5 12.3 16.8 \n", + "2 12.7 9.9 6.8 \n", + "\n", + " pm_2_5_previous_7_day mean_7_days mean_14_days mean_28_days ... \\\n", + "0 10.2 15.085714 15.242857 12.939286 ... \n", + "1 14.3 14.242857 19.857143 18.432143 ... \n", + "2 12.5 11.385714 9.921429 8.271429 ... \n", + "\n", + " temperature_max temperature_min precipitation_sum rain_sum \\\n", + "0 14.2 8.4 0.0 0.00 \n", + "1 13.1 10.5 20.9 20.90 \n", + "2 10.8 5.2 9.1 13.65 \n", + "\n", + " snowfall_sum precipitation_hours wind_speed_max wind_gusts_max \\\n", + "0 0.0 0.0 34.7 59.4 \n", + "1 0.0 22.0 40.1 64.8 \n", + "2 0.0 5.0 14.8 43.2 \n", + "\n", + " wind_direction_dominant city_name_encoded \n", + "0 302 20 \n", + "1 212 24 \n", + "2 180 30 \n", + "\n", + "[3 rows x 38 columns]" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Transforming the 'city_name' column in the batch data using the retrieved label encoder\n", + "# Transform the 'city_name' column in the batch data using the retrieved label encoder\n", "encoded = retrieved_encoder.transform(batch_data['city_name'])\n", "\n", - "# Concatenating the label-encoded 'city_name' with the original batch data\n", + "# Concatenate the label-encoded 'city_name' with the original batch data\n", "X_batch = pd.concat([batch_data, pd.DataFrame(encoded)], axis=1)\n", "\n", - "# Dropping unnecessary columns ('date', 'city_name', 'unix_time') from the batch data\n", + "# Drop unnecessary columns ('date', 'city_name', 'unix_time') from the batch data\n", "X_batch = X_batch.drop(columns=['date', 'city_name', 'unix_time'])\n", "\n", - "# Renaming the newly added column with label-encoded city names to 
'city_name_encoded'\n", + "# Rename the newly added column with label-encoded city names to 'city_name_encoded'\n", "X_batch = X_batch.rename(columns={0: 'city_name_encoded'})\n", "\n", - "# Extracting the target variable 'pm2_5' from the batch data\n", - "y_batch = X_batch.pop('pm2_5')" + "# Extract the target variable 'pm2_5' from the batch data\n", + "y_batch = X_batch.pop('pm2_5')\n", + "\n", + "X_batch.head(3)" ] }, { "cell_type": "code", - "execution_count": null, - "id": "b597ea2b", + "execution_count": 10, + "id": "5149127e", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "array([ 5.9190893, 6.7028375, 7.9574 , 15.73646 , 8.050383 ],\n", + " dtype=float32)" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Making predictions on the batch data using the retrieved XGBoost regressor model\n", + "# Make predictions on the batch data using the retrieved XGBoost regressor model\n", "predictions = retrieved_xgboost_model.predict(X_batch)\n", "\n", - "# Displaying the first 5 predictions\n", + "# Display the first 5 predictions\n", "predictions[:5]" ] }, { "cell_type": "markdown", - "id": "80e2b142", + "id": "ccbc0bb6", "metadata": {}, "source": [ "---\n", @@ -234,17 +502,17 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "c208069a", + "execution_count": 11, + "id": "d784e05e", "metadata": {}, "outputs": [], "source": [ - "!python3 -m streamlit run streamlit_app.py" + "# !python3 -m streamlit run streamlit_app.py" ] }, { "cell_type": "markdown", - "id": "c97c7f97", + "id": "569fc146", "metadata": {}, "source": [ "---\n", @@ -260,7 +528,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -274,7 +542,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.18" + "version": "3.10.11" } }, "nbformat": 4, diff 
--git a/advanced_tutorials/air_quality/5_function_calling.ipynb b/advanced_tutorials/air_quality/5_function_calling.ipynb new file mode 100644 index 00000000..1f18f187 --- /dev/null +++ b/advanced_tutorials/air_quality/5_function_calling.ipynb @@ -0,0 +1,776 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "574f4ea0", + "metadata": {}, + "source": [ + "## ๐Ÿ“ Imports" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "1d2db28d", + "metadata": {}, + "outputs": [], + "source": [ + "import joblib\n", + "\n", + "from functions.llm_chain import load_model, get_llm_chain, generate_response" + ] + }, + { + "cell_type": "markdown", + "id": "6b079783", + "metadata": {}, + "source": [ + "## ๐Ÿ”ฎ Connect to Hopsworks Feature Store " + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "d1aff226", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Connected. Call `.close()` to terminate connection gracefully.\n", + "\n", + "Logged in to project, explore it here https://snurran.hops.works/p/5242\n", + "Connected. 
Call `.close()` to terminate connection gracefully.\n" + ] + } + ], + "source": [ + "import hopsworks\n", + "\n", + "project = hopsworks.login()\n", + "\n", + "fs = project.get_feature_store() " + ] + }, + { + "cell_type": "markdown", + "id": "3ce1a7b7", + "metadata": {}, + "source": [ + "## โš™๏ธ Feature View Retrieval" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "ae227ec2", + "metadata": {}, + "outputs": [], + "source": [ + "# Retrieve the 'air_quality_fv' feature view\n", + "feature_view = fs.get_feature_view(\n", + " name='air_quality_fv',\n", + " version=1,\n", + ")\n", + "\n", + "# Initialize batch scoring\n", + "feature_view.init_batch_scoring(1)" + ] + }, + { + "cell_type": "markdown", + "id": "5199d607", + "metadata": {}, + "source": [ + "## ๐Ÿช Retrieve AirQuality Model from Model Registry" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "f7661fcf", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Connected. Call `.close()` to terminate connection gracefully.\n", + "Downloading model artifact (0 dirs, 6 files)... DONE\r" + ] + } + ], + "source": [ + "# Retrieve the model registry\n", + "mr = project.get_model_registry()\n", + "\n", + "# Retrieve the 'air_quality_xgboost_model' from the model registry\n", + "retrieved_model = mr.get_model(\n", + " name=\"air_quality_xgboost_model\",\n", + " version=1,\n", + ")\n", + "\n", + "# Download the saved model artifacts to a local directory\n", + "saved_model_dir = retrieved_model.download()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "b5b41d51", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
XGBRegressor(base_score=None, booster=None, callbacks=None,\n",
+       "             colsample_bylevel=None, colsample_bynode=None,\n",
+       "             colsample_bytree=None, device=None, early_stopping_rounds=None,\n",
+       "             enable_categorical=False, eval_metric=None, feature_types=None,\n",
+       "             gamma=None, grow_policy=None, importance_type=None,\n",
+       "             interaction_constraints=None, learning_rate=None, max_bin=None,\n",
+       "             max_cat_threshold=None, max_cat_to_onehot=None,\n",
+       "             max_delta_step=None, max_depth=None, max_leaves=None,\n",
+       "             min_child_weight=None, missing=nan, monotone_constraints=None,\n",
+       "             multi_strategy=None, n_estimators=None, n_jobs=None,\n",
+       "             num_parallel_tree=None, random_state=None, ...)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], + "text/plain": [ + "XGBRegressor(base_score=None, booster=None, callbacks=None,\n", + " colsample_bylevel=None, colsample_bynode=None,\n", + " colsample_bytree=None, device=None, early_stopping_rounds=None,\n", + " enable_categorical=False, eval_metric=None, feature_types=None,\n", + " gamma=None, grow_policy=None, importance_type=None,\n", + " interaction_constraints=None, learning_rate=None, max_bin=None,\n", + " max_cat_threshold=None, max_cat_to_onehot=None,\n", + " max_delta_step=None, max_depth=None, max_leaves=None,\n", + " min_child_weight=None, missing=nan, monotone_constraints=None,\n", + " multi_strategy=None, n_estimators=None, n_jobs=None,\n", + " num_parallel_tree=None, random_state=None, ...)" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Load the XGBoost regressor model and label encoder from the saved model directory\n", + "model_air_quality = joblib.load(saved_model_dir + \"/xgboost_regressor.pkl\")\n", + "encoder = joblib.load(saved_model_dir + \"/label_encoder.pkl\")\n", + "\n", + "# Display the retrieved XGBoost regressor model\n", + "model_air_quality" + ] + }, + { + "cell_type": "markdown", + "id": "c07382c8", + "metadata": {}, + "source": [ + "## โฌ‡๏ธ LLM Loading" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "6d790560", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n", + "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2024-03-17 13:56:41,741 INFO: We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. 
You can set `max_memory` in to a higher value to use more memory (at your own risk).\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "1113d3cc0aa64e7fba248fdcf2b9055a", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Loading checkpoint shards: 0%| | 0/2 [00:00โ›“๏ธ LangChain" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "44d106da", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "DeprecationWarning: `np.bool8` is a deprecated alias for `np.bool_`. (Deprecated NumPy 1.24)\n" + ] + } + ], + "source": [ + "# Create and configure a language model chain.\n", + "llm_chain = get_llm_chain(\n", + " model_llm, \n", + " tokenizer,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "9b5b7257", + "metadata": {}, + "source": [ + "## ๐Ÿงฌ Model Inference\n" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "0fb93740", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "๐Ÿ—“๏ธ Today's date: Sunday, 2024-03-17\n", + "๐Ÿ“– \n", + "\n", + "Hello! How can I help you with air quality today?\n" + ] + } + ], + "source": [ + "QUESTION7 = \"Hi!\"\n", + "\n", + "response7 = generate_response(\n", + " QUESTION7,\n", + " feature_view,\n", + " model_llm, \n", + " tokenizer,\n", + " model_air_quality,\n", + " encoder,\n", + " llm_chain,\n", + " verbose=True,\n", + ")\n", + "\n", + "print(response7)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "f7e1bbc4", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "๐Ÿ—“๏ธ Today's date: Sunday, 2024-03-17\n", + "๐Ÿ“– \n", + "\n", + "I am an AI Air Quality assistant, here to help you with any air quality-related questions or concerns you may have. 
I can provide information on current and historical air quality data for your location, offer advice on whether it's safe to go outside, and suggest ways to improve air quality. How can I help you today?\n" + ] + } + ], + "source": [ + "QUESTION = \"Who are you?\"\n", + "\n", + "response = generate_response(\n", + " QUESTION,\n", + " feature_view,\n", + " model_llm,\n", + " tokenizer,\n", + " model_air_quality,\n", + " encoder,\n", + " llm_chain,\n", + " verbose=True,\n", + ")\n", + "\n", + "print(response)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "307f2d8f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Finished: Reading data from Hopsworks, using ArrowFlight (7.49s) \n", + "๐Ÿ—“๏ธ Today's date: Sunday, 2024-03-17\n", + "๐Ÿ“– Air Quality Measurements for New York:\n", + "Date: 2024-01-10; Air Quality: 7.2\n", + "Date: 2024-01-11; Air Quality: 5.9\n", + "Date: 2024-01-12; Air Quality: 10.8\n", + "Date: 2024-01-13; Air Quality: 5.9\n", + "Date: 2024-01-14; Air Quality: 5.1\n", + "\n", + "The average air quality in New York from January 10th to January 14th was 6.8. 
This indicates that the air quality was generally safe for most people to breathe, but individuals with respiratory issues may still need to take precautions.\n" + ] + } + ], + "source": [ + "QUESTION1 = \"What was the average air quality from 2024-01-10 till 2024-01-14 in New York?\"\n", + "\n", + "response1 = generate_response(\n", + " QUESTION1, \n", + " feature_view, \n", + " model_llm, \n", + " tokenizer, \n", + " model_air_quality, \n", + " encoder,\n", + " llm_chain,\n", + " verbose=True,\n", + ")\n", + "\n", + "print(response1)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "4d39a38b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Finished: Reading data from Hopsworks, using ArrowFlight (7.40s) \n", + "๐Ÿ—“๏ธ Today's date: Sunday, 2024-03-17\n", + "๐Ÿ“– Air Quality Measurements for New York:\n", + "Date: 2024-01-10; Air Quality: 7.2\n", + "Date: 2024-01-11; Air Quality: 5.9\n", + "Date: 2024-01-12; Air Quality: 10.8\n", + "Date: 2024-01-13; Air Quality: 5.9\n", + "Date: 2024-01-14; Air Quality: 5.1\n", + "\n", + "The maximum air quality in New York from January 10th to January 14th was on January 12th with an air quality of 10.8. 
This indicates that the air quality on that day was not safe for most people, especially those with respiratory issues, and it would be advisable to limit outdoor activities.\n" + ] + } + ], + "source": [ + "QUESTION11 = \"When and what was the maximum air quality from 2024-01-10 till 2024-01-14 in New York?\"\n", + "\n", + "response11 = generate_response(\n", + " QUESTION11, \n", + " feature_view, \n", + " model_llm,\n", + " tokenizer,\n", + " model_air_quality,\n", + " encoder,\n", + " llm_chain,\n", + " verbose=True,\n", + ")\n", + "\n", + "print(response11)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "36ac09a2", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Finished: Reading data from Hopsworks, using ArrowFlight (7.47s) \n", + "๐Ÿ—“๏ธ Today's date: Sunday, 2024-03-17\n", + "๐Ÿ“– Air Quality Measurements for New York:\n", + "Date: 2024-01-10; Air Quality: 7.2\n", + "Date: 2024-01-11; Air Quality: 5.9\n", + "Date: 2024-01-12; Air Quality: 10.8\n", + "Date: 2024-01-13; Air Quality: 5.9\n", + "Date: 2024-01-14; Air Quality: 5.1\n", + "\n", + "The minimum air quality in New York from January 10th to January 14th was on January 11th with an air quality of 5.9. 
This indicates that the air quality on that day was generally safe for most people to breathe, but individuals with respiratory issues may still need to take precautions.\n" + ] + } + ], + "source": [ + "QUESTION12 = \"When and what was the minimum air quality from 2024-01-10 till 2024-01-14 in New York?\"\n", + "\n", + "response12 = generate_response(\n", + " QUESTION12, \n", + " feature_view, \n", + " model_llm, \n", + " tokenizer, \n", + " model_air_quality, \n", + " encoder,\n", + " llm_chain,\n", + " verbose=True,\n", + ")\n", + "\n", + "print(response12)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "b80fa4f4", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Finished: Reading data from Hopsworks, using ArrowFlight (7.39s) \n", + "๐Ÿ—“๏ธ Today's date: Sunday, 2024-03-17\n", + "๐Ÿ“– Air Quality Measurements for London:\n", + "Date: 2024-03-16; Air Quality: 9.5\n", + "\n", + "The air quality yesterday in London was 9.5, which indicates that the air quality was generally safe for most people to breathe, but individuals with respiratory issues may still need to take precautions.\n" + ] + } + ], + "source": [ + "QUESTION2 = \"What was the air quality yesterday in London?\"\n", + "\n", + "response2 = generate_response(\n", + " QUESTION2, \n", + " feature_view, \n", + " model_llm, \n", + " tokenizer, \n", + " model_air_quality, \n", + " encoder,\n", + " llm_chain,\n", + " verbose=True,\n", + ")\n", + "\n", + "print(response2)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "397d5168", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Finished: Reading data from Hopsworks, using ArrowFlight (7.47s) \n", + "๐Ÿ—“๏ธ Today's date: Sunday, 2024-03-17\n", + "๐Ÿ“– Air Quality Measurements for London:\n", + "Date: 2024-03-17; Air Quality: 7.6\n", + "Date: 2024-03-18; Air Quality: 9.88\n", + "Date: 2024-03-19; Air Quality: 9.18\n", 
+ "Date: 2024-03-20; Air Quality: 9.34\n", + "Date: 2024-03-21; Air Quality: 9.37\n", + "Date: 2024-03-22; Air Quality: 9.37\n", + "Date: 2024-03-23; Air Quality: 9.37\n", + "\n", + "The air quality in London on 2024-03-23 is expected to be 9.37, which indicates that the air quality is generally safe for most people to breathe, but individuals with respiratory issues may still need to take precautions.\n" + ] + } + ], + "source": [ + "QUESTION3 = \"What will the air quality be like in London in 2024-03-23?\"\n", + "\n", + "response3 = generate_response(\n", + " QUESTION3, \n", + " feature_view, \n", + " model_llm, \n", + " tokenizer,\n", + " model_air_quality,\n", + " encoder,\n", + " llm_chain,\n", + " verbose=True,\n", + ")\n", + "\n", + "print(response3)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "5d3ffba1", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Finished: Reading data from Hopsworks, using ArrowFlight (7.39s) \n", + "๐Ÿ—“๏ธ Today's date: Sunday, 2024-03-17\n", + "๐Ÿ“– Air Quality Measurements for Chicago:\n", + "Date: 2024-03-17; Air Quality: 6.1\n", + "Date: 2024-03-18; Air Quality: 8.37\n", + "Date: 2024-03-19; Air Quality: 7.39\n", + "\n", + "The air quality in Chicago the day after tomorrow, on 2024-03-19, is expected to be 7.39, which indicates that the air quality is generally safe for most people to breathe, but individuals with respiratory issues may still need to take precautions.\n" + ] + } + ], + "source": [ + "QUESTION4 = \"What will the air quality be like in Chicago the day after tomorrow?\"\n", + "\n", + "response4 = generate_response(\n", + " QUESTION4, \n", + " feature_view, \n", + " model_llm, \n", + " tokenizer, \n", + " model_air_quality, \n", + " encoder,\n", + " llm_chain,\n", + " verbose=True,\n", + ")\n", + "\n", + "print(response4)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "0dce8283", + "metadata": {}, + "outputs": [ + { + 
"name": "stdout", + "output_type": "stream", + "text": [ + "Finished: Reading data from Hopsworks, using ArrowFlight (7.38s) \n", + "๐Ÿ—“๏ธ Today's date: Sunday, 2024-03-17\n", + "๐Ÿ“– Air Quality Measurements for London:\n", + "Date: 2024-03-17; Air Quality: 7.6\n", + "\n", + "The air quality in London on Sunday, 2024-03-17, is expected to be 7.6, which indicates that the air quality is generally safe for most people to breathe, but individuals with respiratory issues may still need to take precautions.\n" + ] + } + ], + "source": [ + "QUESTION5 = \"What will the air quality be like in London on Sunday?\"\n", + "\n", + "response5 = generate_response(\n", + " QUESTION5, \n", + " feature_view, \n", + " model_llm, \n", + " tokenizer, \n", + " model_air_quality, \n", + " encoder,\n", + " llm_chain,\n", + " verbose=True,\n", + ")\n", + "\n", + "print(response5)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "5e5fc2e4", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Finished: Reading data from Hopsworks, using ArrowFlight (7.58s) \n", + "๐Ÿ—“๏ธ Today's date: Sunday, 2024-03-17\n", + "๐Ÿ“– Air Quality Measurements for London:\n", + "Date: 2024-03-17; Air Quality: 7.6\n", + "Date: 2024-03-18; Air Quality: 9.88\n", + "Date: 2024-03-19; Air Quality: 9.18\n", + "Date: 2024-03-20; Air Quality: 9.34\n", + "Date: 2024-03-21; Air Quality: 9.37\n", + "\n", + "The air quality in London on March 21 is expected to be 9.37, which indicates that the air quality is generally safe for most people to breathe, but individuals with respiratory issues may still need to take precautions.\n" + ] + } + ], + "source": [ + "QUESTION7 = \"What will the air quality be like on March 21 in London?\"\n", + "\n", + "response7 = generate_response(\n", + " QUESTION7, \n", + " feature_view,\n", + " model_llm,\n", + " tokenizer, \n", + " model_air_quality, \n", + " encoder,\n", + " llm_chain,\n", + " verbose=True,\n", + ")\n", + 
"\n", + "print(response7)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "fde239f6", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "๐Ÿ—“๏ธ Today's date: Sunday, 2024-03-17\n", + "๐Ÿ“– \n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "UserWarning: You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "The air quality level is not dangerous, but individuals with respiratory issues may still need to take precautions.\n" + ] + } + ], + "source": [ + "QUESTION = \"Is this air quality level dangerous?\"\n", + "\n", + "response = generate_response(\n", + " QUESTION, \n", + " feature_view, \n", + " model_llm, \n", + " tokenizer,\n", + " model_air_quality, \n", + " encoder,\n", + " llm_chain,\n", + " verbose=True,\n", + ")\n", + "\n", + "print(response)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "1c0873b6", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "๐Ÿ—“๏ธ Today's date: Sunday, 2024-03-17\n", + "๐Ÿ“– \n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "UserWarning: You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Of course! Air quality levels are typically measured on a scale, and the specific scale can vary depending on the location and the organization providing the measurements. Generally, air quality levels are categorized into different ranges, with each range corresponding to a specific level of air quality. Here is a general overview of air quality levels:\n", + "\n", + "1. Good (0-50): The air quality is considered good, and it is safe for most people to breathe.\n", + "2. 
Moderate (51-100): The air quality is acceptable, but it may cause a slight irritation to some people with respiratory issues.\n", + "3. Poor (101-150): The air quality is considered unhealthy for sensitive groups, such as children, the elderly, and those with respiratory issues. It is advisable for these groups to limit their outdoor activities.\n", + "4. Very Poor (151-200): The air quality is considered unhealthy, and it may cause respiratory issues for most people. Outdoor activities should be limited.\n", + "5. Hazardous (over 200): The air quality is considered hazardous and can cause severe respiratory issues for everyone. Outdoor activities should be strictly avoided.\n", + "\n", + "These categories may vary slightly depending on the location and the organization providing the measurements, but they generally provide a good understanding of the air quality levels.\n" + ] + } + ], + "source": [ + "QUESTION = \"Can you please explain different air quality levels?\"\n", + "\n", + "response = generate_response(\n", + " QUESTION, \n", + " feature_view, \n", + " model_llm, \n", + " tokenizer,\n", + " model_air_quality, \n", + " encoder,\n", + " llm_chain,\n", + " verbose=True,\n", + ")\n", + "\n", + "print(response)" + ] + }, + { + "cell_type": "markdown", + "id": "1fd12ab8", + "metadata": {}, + "source": [ + "---" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.11" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/advanced_tutorials/air_quality/app_gradio.py b/advanced_tutorials/air_quality/app_gradio.py new file mode 100644 index 00000000..40bcf53c --- /dev/null +++ b/advanced_tutorials/air_quality/app_gradio.py @@ -0,0 +1,104 
@@ +import gradio as gr +from transformers import pipeline +import numpy as np +import hopsworks +import joblib +from functions.llm_chain import load_model, get_llm_chain, generate_response + +# Initialize the ASR pipeline +transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-base.en") + +def connect_to_hopsworks(): + # Initialize Hopsworks feature store connection + project = hopsworks.login() + fs = project.get_feature_store() + + # Retrieve the model registry + mr = project.get_model_registry() + + # Retrieve the 'air_quality_fv' feature view + feature_view = fs.get_feature_view( + name="air_quality_fv", + version=1, + ) + + # Initialize batch scoring + feature_view.init_batch_scoring(1) + + # Retrieve the 'air_quality_xgboost_model' from the model registry + retrieved_model = mr.get_model( + name="air_quality_xgboost_model", + version=1, + ) + + # Download the saved model artifacts to a local directory + saved_model_dir = retrieved_model.download() + + # Load the XGBoost regressor model and label encoder from the saved model directory + model_air_quality = joblib.load(saved_model_dir + "/xgboost_regressor.pkl") + encoder = joblib.load(saved_model_dir + "/label_encoder.pkl") + + return feature_view, model_air_quality, encoder + + +def retrieve_llm_chain(): + + # Load the LLM and its corresponding tokenizer. + model_llm, tokenizer = load_model() + + # Create and configure a language model chain. 
+ llm_chain = get_llm_chain( + model_llm, + tokenizer, + ) + + return model_llm, tokenizer, llm_chain + + +# Retrieve the feature view, air quality model and encoder for the city_name column +feature_view, model_air_quality, encoder = connect_to_hopsworks() + +# Load the LLM and its corresponding tokenizer and configure a language model chain +model_llm, tokenizer, llm_chain = retrieve_llm_chain() + +def transcribe(audio): + sr, y = audio + y = y.astype(np.float32) + if y.ndim > 1 and y.shape[1] > 1: + y = np.mean(y, axis=1) + y /= np.max(np.abs(y)) + return transcriber({"sampling_rate": sr, "raw": y})["text"] + +def generate_query_response(user_query): + response = generate_response( + user_query, + feature_view, + model_llm, + tokenizer, + model_air_quality, + encoder, + llm_chain, + verbose=False, + ) + return response + +def handle_input(text_input=None, audio_input=None): + if audio_input is not None: + user_query = transcribe(audio_input) + else: + user_query = text_input + + if user_query: + return generate_query_response(user_query) + else: + return "Please provide input either via text or voice." + +iface = gr.Interface( + fn=handle_input, + inputs=[gr.Textbox(placeholder="Type here or use voice input..."), gr.Audio()], + outputs="text", + title="๐ŸŒค๏ธ AirQuality AI Assistant ๐Ÿ’ฌ", + description="Ask your questions about air quality or use your voice to interact." 
+) + +iface.launch(share=True) diff --git a/advanced_tutorials/air_quality/app_streamlit.py b/advanced_tutorials/air_quality/app_streamlit.py new file mode 100644 index 00000000..4dfa6d0c --- /dev/null +++ b/advanced_tutorials/air_quality/app_streamlit.py @@ -0,0 +1,99 @@ +import streamlit as st +import hopsworks +import joblib +from functions.llm_chain import load_model, get_llm_chain, generate_response +import warnings +warnings.filterwarnings('ignore') + +st.title("๐ŸŒค๏ธ AirQuality AI assistant ๐Ÿ’ฌ") + +@st.cache_resource() +def connect_to_hopsworks(): + # Initialize Hopsworks feature store connection + project = hopsworks.login() + fs = project.get_feature_store() + + # Retrieve the model registry + mr = project.get_model_registry() + + # Retrieve the 'air_quality_fv' feature view + feature_view = fs.get_feature_view( + name="air_quality_fv", + version=1, + ) + + # Initialize batch scoring + feature_view.init_batch_scoring(1) + + # Retrieve the 'air_quality_xgboost_model' from the model registry + retrieved_model = mr.get_model( + name="air_quality_xgboost_model", + version=1, + ) + + # Download the saved model artifacts to a local directory + saved_model_dir = retrieved_model.download() + + # Load the XGBoost regressor model and label encoder from the saved model directory + model_air_quality = joblib.load(saved_model_dir + "/xgboost_regressor.pkl") + encoder = joblib.load(saved_model_dir + "/label_encoder.pkl") + + return feature_view, model_air_quality, encoder + + +@st.cache_resource() +def retrieve_llm_chain(): + + # Load the LLM and its corresponding tokenizer. + model_llm, tokenizer = load_model() + + # Create and configure a language model chain. 
+ llm_chain = get_llm_chain( + model_llm, + tokenizer, + ) + + return model_llm, tokenizer, llm_chain + + +# Retrieve the feature view, air quality model and encoder for the city_name column +feature_view, model_air_quality, encoder = connect_to_hopsworks() + +# Load the LLM and its corresponding tokenizer and configure a language model chain +model_llm, tokenizer, llm_chain = retrieve_llm_chain() + +# Initialize chat history +if "messages" not in st.session_state: + st.session_state.messages = [] + +# Display chat messages from history on app rerun +for message in st.session_state.messages: + with st.chat_message(message["role"]): + st.markdown(message["content"]) + +# React to user input +if user_query := st.chat_input("How can I help you?"): + # Display user message in chat message container + st.chat_message("user").markdown(user_query) + # Add user message to chat history + st.session_state.messages.append({"role": "user", "content": user_query}) + + st.write('โš™๏ธ Generating Response...') + + # Generate a response to the user query + response = generate_response( + user_query, + feature_view, + model_llm, + tokenizer, + model_air_quality, + encoder, + llm_chain, + verbose=False, + ) + + # Display assistant response in chat message container + with st.chat_message("assistant"): + st.markdown(response) + # Add assistant response to chat history + st.session_state.messages.append({"role": "assistant", "content": response}) diff --git a/advanced_tutorials/air_quality/feature_pipeline.py b/advanced_tutorials/air_quality/feature_pipeline.py deleted file mode 100644 index 0cee9c43..00000000 --- a/advanced_tutorials/air_quality/feature_pipeline.py +++ /dev/null @@ -1,158 +0,0 @@ -import datetime -import time -import requests -import pandas as pd -import json -import hopsworks - -from functions import * - -import warnings -warnings.filterwarnings("ignore") - -from dotenv import load_dotenv -load_dotenv() - -import os - -# Get the value of the PARAMETER environment 
variable -continent = os.environ.get('CONTINENT') - - -file_path = os.path.join(os.getcwd(), 'advanced_tutorials', 'air_quality', 'target_cities.json') -with open(file_path) as json_file: - target_cities = json.load(json_file) - - -def get_batch_data_from_fs(td_version, date_threshold): - print(f"Retrieving the Batch data since {date_threshold}") - feature_view.init_batch_scoring(training_dataset_version=td_version) - - batch_data = feature_view.get_batch_data(start_time=date_threshold) - return batch_data - - -def parse_aq_data(last_dates_dict, today): - start_of_cell = time.time() - df_aq_raw = pd.DataFrame() - - print("Parsing started...") - # for continent in target_cities: - for city_name, coords in target_cities[continent].items(): - df_ = get_aqi_data_from_open_meteo(city_name=city_name, - coordinates=coords, - start_date=last_dates_dict[city_name], - end_date=str(today)) - df_aq_raw = pd.concat([df_aq_raw, df_]).reset_index(drop=True) - end_of_cell = time.time() - print("-" * 64) - print(f"Parsed new PM2.5 data for ALL locations up to {str(today)}.") - print(f"Took {round(end_of_cell - start_of_cell, 2)} sec.\n") - return df_aq_raw - - -def parse_weather(last_dates_dict, today): - df_weather_update = pd.DataFrame() - start_of_cell = time.time() - - print("Parsing started...") - # for continent in target_cities: - for city_name, coords in target_cities[continent].items(): - df_ = get_weather_data_from_open_meteo(city_name=city_name, - coordinates=coords, - start_date=last_dates_dict[city_name], - end_date=str(today), - forecast=True) - df_weather_update = pd.concat([df_weather_update, df_]).reset_index(drop=True) - - end_of_cell = time.time() - print(f"Parsed new weather data for ALL cities up to {str(today)}.") - print(f"Took {round(end_of_cell - start_of_cell, 2)} sec.\n") - return df_weather_update - - - -if __name__=="__main__": - project = hopsworks.login() - fs = project.get_feature_store() - print("โœ… Logged in successfully!") - - feature_view = 
fs.get_feature_view( - name='air_quality_fv', - version=1 - ) - - # I am going to load data for of last 60 days (for feature engineering) - today = datetime.date.today() - date_threshold = today - datetime.timedelta(days=60) - - print("Getting the batch data...") - batch_data = get_batch_data_from_fs(td_version=1, - date_threshold=date_threshold) - - print("Retreived batch data.") - - - last_dates_dict = batch_data[["date", "city_name"]].groupby("city_name").max() - last_dates_dict.date = last_dates_dict.date.astype(str) - # here is a dictionary with city names as keys and last updated date as values - last_dates_dict = last_dates_dict.to_dict()["date"] - - df_aq_raw = parse_aq_data(last_dates_dict, today) - - # we need the previous data to calculate aggregation functions - df_aq_update = pd.concat([ - batch_data[df_aq_raw.columns], - df_aq_raw - ]).reset_index(drop=True) - df_aq_update = df_aq_update.drop_duplicates(subset=['city_name', 'date']) - - print(df_aq_update.tail(7)) - - print('\n๐Ÿ›  Feature Engineering the PM2.5') - - ### - df_aq_update['date'] = pd.to_datetime(df_aq_update['date']) - df_aq_update = feature_engineer_aq(df_aq_update) - df_aq_update = df_aq_update.dropna() - - print(df_aq_update.groupby("city_name").max().tail(7)) - print("โœ… Success!") - ### - - print(3 * "-") - print('\n๐ŸŒค๐Ÿ“† Parsing Weather data') - - df_weather_update = parse_weather(last_dates_dict, today) - print(df_weather_update.groupby("city_name").max().tail(7)) - print("โœ… Successfully parsed!") - - df_aq_update.date = df_aq_update.date.astype(str) - df_weather_update.date = df_weather_update.date.astype(str) - - print("Connecting to feature groups...") - air_quality_fg = fs.get_or_create_feature_group( - name = 'air_quality', - version = 1 - ) - weather_fg = fs.get_or_create_feature_group( - name = 'weather', - version = 1 - ) - - df_aq_update.date = pd.to_datetime(df_aq_update.date) - df_weather_update.date = pd.to_datetime(df_weather_update.date) - - 
df_aq_update["unix_time"] = df_aq_update["date"].apply(convert_date_to_unix) - df_weather_update["unix_time"] = df_weather_update["date"].apply(convert_date_to_unix) - - df_aq_update.date = df_aq_update.date.astype(str) - df_weather_update.date = df_weather_update.date.astype(str) - - air_quality_fg.insert(df_aq_update) - print("Created job to insert parsed PM2.5 data into FS...") - print("Inserting into air_quality fg.") - - weather_fg.insert(df_weather_update) - print("Created job to insert parsed weather data into FS...") - print("Inserting into weather fg.") diff --git a/advanced_tutorials/air_quality/features/__init__.py b/advanced_tutorials/air_quality/features/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/advanced_tutorials/air_quality/features/air_quality.py b/advanced_tutorials/air_quality/features/air_quality.py index 40fc8d1e..4cd4fed0 100644 --- a/advanced_tutorials/air_quality/features/air_quality.py +++ b/advanced_tutorials/air_quality/features/air_quality.py @@ -18,7 +18,6 @@ def shift_pm_2_5(df: pd.DataFrame, days: int = 5) -> pd.DataFrame: """ for shift_value in range(1, days + 1): df[f'pm_2_5_previous_{shift_value}_day'] = df.groupby('city_name')['pm2_5'].shift(shift_value) - df = df.dropna() return df @@ -227,8 +226,9 @@ def feature_engineer_aq(df: pd.DataFrame) -> pd.DataFrame: for i in [7, 14, 28]: for func in [moving_std, exponential_moving_average, exponential_moving_std]: df_res = func(df_res, i) - - df_res = df_res.sort_values(by=["date", "pm2_5"]).dropna() + + + df_res = df_res.sort_values(by=["date", "pm2_5"]) df_res = df_res.reset_index(drop=True) df_res['year'] = year(df_res['date']) diff --git a/advanced_tutorials/air_quality/functions.py b/advanced_tutorials/air_quality/functions.py deleted file mode 100644 index e0d46205..00000000 --- a/advanced_tutorials/air_quality/functions.py +++ /dev/null @@ -1,392 +0,0 @@ -import os -import datetime -import time -import requests -import pandas as pd -import json - 
-from geopy.geocoders import Nominatim - - -def convert_date_to_unix(x): - """ - Convert datetime to unix time in milliseconds. - """ - dt_obj = datetime.datetime.strptime(str(x), '%Y-%m-%d %H:%M:%S') - dt_obj = int(dt_obj.timestamp() * 1000) - return dt_obj - - -def get_city_coordinates(city_name: str): - """ - Takes city name and returns its latitude and longitude (rounded to 2 digits after dot). - """ - # Initialize Nominatim API (for getting lat and long of the city) - geolocator = Nominatim(user_agent="MyApp") - city = geolocator.geocode(city_name) - - latitude = round(city.latitude, 2) - longitude = round(city.longitude, 2) - - return latitude, longitude - - -##################################### EEA -def convert_to_daily(df, pollutant: str): - """ - Returns DataFrame where pollutant column is resampled to days and rounded. - """ - res_df = df.copy() - # convert dates in 'time' column - res_df["date"] = pd.to_datetime(res_df["date"]) - - # I want data daily, not hourly (mean per each day = 1 datarow per 1 day) - res_df = res_df.set_index('date') - res_df = res_df[pollutant].resample('1d').mean().reset_index() - res_df[pollutant] = res_df[pollutant].fillna(res_df[pollutant].median()) - res_df[pollutant] = res_df[pollutant].apply(lambda x: round(x, 0)) - - return res_df - - -def find_fullest_csv(csv_links: list, year: str): - candidates = [link for link in csv_links if str(year) in link] - biggest_df = pd.read_csv(candidates[0]) - for link in candidates[1:]: - _df = pd.read_csv(link) - if len(biggest_df) < len(_df): - biggest_df = _df - return biggest_df - - -def get_air_quality_from_eea( - city_name: str, - pollutant: str, - start_year: str, - end_year: str, - ): - """ - Takes city name, daterange and returns pandas DataFrame with daily air quality data. - It parses data by 1-year batches, so please specify years, not dates. (example: "2014", "2022"...) - - EEA means European Environmental Agency. So it has data for Europe Union countries ONLY. 
- """ - start_of_cell = time.time() - - params = { - 'CountryCode': '', - 'CityName': city_name, - 'Pollutant': pollutant.upper(), - 'Year_from': start_year, - 'Year_to': end_year, - 'Station': '', - 'Source': 'All', - 'Samplingpoint': '', - 'Output': 'TEXT', - 'UpdateDate': '', - 'TimeCoverage': 'Year' - } - - # observations endpoint - base_url = "https://fme.discomap.eea.europa.eu/fmedatastreaming/AirQualityDownload/AQData_Extract.fmw?" - try: - response = requests.get(base_url, params=params) - except ConnectionError: - response = requests.get(base_url, params=params) - - response.encoding = response.apparent_encoding - csv_links = response.text.split("\r\n") - - res_df = pd.DataFrame() - target_year = int(start_year) - - for year in range(int(start_year), int(end_year) + 1): - try: - # find the fullest, the biggest csv file with observations for this particular year - _df = find_fullest_csv(csv_links, year) - # append it to res_df - res_df = pd.concat([res_df, _df]) - except IndexError: - print(f"!! 
Missing data for {year} for {city} city.") - pass - - pollutant = pollutant.lower() - if pollutant == "pm2.5": - pollutant = "pm2_5" - - res_df = res_df.rename(columns={ - 'DatetimeBegin': 'date', - 'Concentration': pollutant - }) - - # cut timezones info - res_df['date'] = res_df['date'].apply(lambda x: x[:-6]) - # convert dates in 'time' column - res_df['date'] = pd.to_datetime(res_df['date']) - - res_df = convert_to_daily(res_df, pollutant) - - res_df['city_name'] = city_name - res_df = res_df[['city_name', 'date', pollutant.lower()]] - - end_of_cell = time.time() - - print(f"Processed {pollutant.upper()} for {city_name} since {start_year} till {end_year}.") - print(f"Took {round(end_of_cell - start_of_cell, 2)} sec.\n") - - return res_df - - - -##################################### USEPA -city_code_dict = {} -pollutant_dict = { - 'CO': '42101', - 'SO2': '42401', - 'NO2': '42602', - 'O3': '44201', - 'PM10': '81102', - 'PM2.5': '88101' -} - -def get_city_code(city_name: str): - "Encodes city name to be used later for data parsing using USEPA." - if city_code_dict: - city_full = [i for i in city_code_dict.keys() if city_name in i][0] - return city_code_dict[city_full] - else: - params = { - "email": "test@aqs.api", - "key": "test" - } - response = requests.get("https://aqs.epa.gov/data/api/list/cbsas?", params) - response_json = response.json() - data = response_json["Data"] - for item in data: - city_code_dict[item['value_represented']] = item['code'] - - return get_city_code(city_name) - - -def get_air_quality_from_usepa( - city_name: str, - pollutant: str, - start_date: str, - end_date: str - ): - """ - Takes city name, daterange and returns pandas DataFrame with daily air quality data. - - USEPA means United States Environmental Protection Agency. So it has data for US ONLY. 
- """ - start_of_cell = time.time() - res_df = pd.DataFrame() - - for start_date_, end_date_ in make_date_intervals(start_date, end_date): - params = { - "email": "test@aqs.api", - "key": "test", - "param": pollutant_dict[pollutant.upper().replace("_", ".")], # encoded pollutant - "bdate": start_date_, - "edate": end_date_, - "cbsa": get_city_code(city_name) # Core-based statistical area - } - - # observations endpoint - base_url = "https://aqs.epa.gov/data/api/dailyData/byCBSA?" - - response = requests.get(base_url, params=params) - response_json = response.json() - - df_ = pd.DataFrame(response_json["Data"]) - - pollutant = pollutant.lower() - if pollutant == "pm2.5": - pollutant = "pm2_5" - df_ = df_.rename(columns={ - 'date_local': 'date', - 'arithmetic_mean': pollutant - }) - - # convert dates in 'date' column - df_['date'] = pd.to_datetime(df_['date']) - df_['city_name'] = city_name - df_ = df_[['city_name', 'date', pollutant]] - res_df = pd.concat([res_df, df_]) - - # there are duplicated rows (several records for the same day and station). get rid of it. 
- res_df = res_df.groupby(['date', 'city_name'], as_index=False)[pollutant].mean() - res_df[pollutant] = round(res_df[pollutant], 1) - - end_of_cell = time.time() - print(f"Processed {pollutant.upper()} for {city_name} since {start_date} till {end_date}.") - print(f"Took {round(end_of_cell - start_of_cell, 2)} sec.\n") - - return res_df - - -def make_date_intervals(start_date, end_date): - start_dt = datetime.datetime.strptime(start_date, '%Y-%m-%d') - end_dt = datetime.datetime.strptime(end_date, '%Y-%m-%d') - date_intervals = [] - for year in range(start_dt.year, end_dt.year + 1): - year_start = datetime.datetime(year, 1, 1) - year_end = datetime.datetime(year, 12, 31) - interval_start = max(start_dt, year_start) - interval_end = min(end_dt, year_end) - if interval_start < interval_end: - date_intervals.append((interval_start.strftime('%Y%m%d'), interval_end.strftime('%Y%m%d'))) - return date_intervals - -##################################### Weather Open Meteo -def get_weather_data_from_open_meteo( - city_name: str, - start_date: str, - end_date: str, - coordinates: list = None, - forecast: bool = False, - ): - """ - Takes [city name OR coordinates] and returns pandas DataFrame with weather data. 
- - Examples of arguments: - coordinates=(47.755, -122.2806), start_date="2023-01-01" - """ - start_of_cell = time.time() - - if coordinates: - latitude, longitude = coordinates - else: - latitude, longitude = get_city_coordinates(city_name=city_name) - - params = { - 'latitude': latitude, - 'longitude': longitude, - 'daily': ["temperature_2m_max", "temperature_2m_min", - "precipitation_sum", "rain_sum", "snowfall_sum", - "precipitation_hours", "windspeed_10m_max", - "windgusts_10m_max", "winddirection_10m_dominant"], - 'timezone': "Europe/London", - 'start_date': start_date, - 'end_date': end_date, - } - - if forecast: - # historical forecast endpoint - base_url = 'https://api.open-meteo.com/v1/forecast' - else: - # historical observations endpoint - base_url = 'https://archive-api.open-meteo.com/v1/archive' - - try: - response = requests.get(base_url, params=params) - time.sleep(2) - except ConnectionError: - response = requests.get(base_url, params=params) - - response_json = response.json() - - res_df = pd.DataFrame(response_json["daily"]) - res_df["city_name"] = city_name - - # rename columns - res_df = res_df.rename(columns={ - "time": "date", - "temperature_2m_max": "temperature_max", - "temperature_2m_min": "temperature_min", - "windspeed_10m_max": "wind_speed_max", - "winddirection_10m_dominant": "wind_direction_dominant", - "windgusts_10m_max": "wind_gusts_max" - }) - - # change columns order - res_df = res_df[ - ['city_name', 'date', 'temperature_max', 'temperature_min', - 'precipitation_sum', 'rain_sum', 'snowfall_sum', - 'precipitation_hours', 'wind_speed_max', - 'wind_gusts_max', 'wind_direction_dominant'] - ] - - # convert dates in 'date' column - res_df["date"] = pd.to_datetime(res_df["date"]) - end_of_cell = time.time() - print(f"Parsed weather for {city_name} since {start_date} till {end_date}.") - print(f"Took {round(end_of_cell - start_of_cell, 2)} sec.\n") - - return res_df - - -##################################### Air Quality data from Open 
Meteo -def get_aqi_data_from_open_meteo( - city_name: str, - start_date: str, - end_date: str, - coordinates: list = None, - pollutant: str = "pm2_5" - ): - """ - Takes [city name OR coordinates] and returns pandas DataFrame with AQI data. - - Examples of arguments: - ... - coordinates=(47.755, -122.2806), - start_date="2023-01-01", - pollutant="no2" - ... - """ - start_of_cell = time.time() - - if coordinates: - latitude, longitude = coordinates - else: - latitude, longitude = get_city_coordinates(city_name=city_name) - - pollutant = pollutant.lower() - if pollutant == "pm2.5": - pollutant = "pm2_5" - - # make it work with both "no2" and "nitrogen_dioxide" passed. - if pollutant == "no2": - pollutant = "nitrogen_dioxide" - - params = { - 'latitude': latitude, - 'longitude': longitude, - 'hourly': [pollutant], - 'start_date': start_date, - 'end_date': end_date, - 'timezone': "Europe/London" - } - - # base endpoint - base_url = "https://air-quality-api.open-meteo.com/v1/air-quality" - try: - response = requests.get(base_url, params=params) - except ConnectionError: - response = requests.get(base_url, params=params) - response_json = response.json() - res_df = pd.DataFrame(response_json["hourly"]) - - # convert dates - res_df["time"] = pd.to_datetime(res_df["time"]) - - # resample to days - res_df = res_df.groupby(res_df['time'].dt.date).mean(numeric_only=True).reset_index() - res_df[pollutant] = round(res_df[pollutant], 1) - - # rename columns - res_df = res_df.rename(columns={ - "time": "date" - }) - - res_df["city_name"] = city_name - - # change columns order - res_df = res_df[ - ['city_name', 'date', pollutant] - ] - end_of_cell = time.time() - print(f"Processed {pollutant.upper()} for {city_name} since {start_date} till {end_date}.") - print(f"Took {round(end_of_cell - start_of_cell, 2)} sec.\n") - - return res_df \ No newline at end of file diff --git a/advanced_tutorials/air_quality/functions/air_quality_data_retrieval.py 
b/advanced_tutorials/air_quality/functions/air_quality_data_retrieval.py new file mode 100644 index 00000000..8f9afd6c --- /dev/null +++ b/advanced_tutorials/air_quality/functions/air_quality_data_retrieval.py @@ -0,0 +1,164 @@ +import pandas as pd +from typing import Any, Dict, List +import datetime +import pandas as pd + + +def transform_data(data, encoder): + """ + Transform the input data by encoding the 'city_name' column and dropping unnecessary columns. + + Args: + - data (DataFrame): Input data to be transformed. + - encoder (LabelEncoder): Label encoder object to encode 'city_name'. + + Returns: + - data_transformed (DataFrame): Transformed data with 'city_name_encoded' and dropped columns. + """ + + # Create a copy of the input data to avoid modifying the original data + data_transformed = data.copy() + + # Transform the 'city_name' column in the batch data using the retrieved label encoder + data_transformed['city_name_encoded'] = encoder.transform(data_transformed['city_name']) + + # Drop unnecessary columns from the batch data + data_transformed = data_transformed.drop(columns=['unix_time', 'pm2_5', 'city_name', 'date']) + + return data_transformed + + +def get_data_for_date(date: str, city_name: str, feature_view, model, encoder) -> pd.DataFrame: + """ + Retrieve data for a specific date and city from a feature view. + + Args: + date (str): The date in the format "%Y-%m-%d". + city_name (str): The name of the city to retrieve data for. + feature_view: The feature view object. + model: The machine learning model used for prediction. + encoder (LabelEncoder): Label encoder object to encode 'city_name'. + + Returns: + pd.DataFrame: A DataFrame containing data for the specified date and city. 
+ """ + # Convert date string to datetime object + date_datetime = datetime.datetime.strptime(date, "%Y-%m-%d").date() + + # Retrieve batch data for the specified date range + batch_data = feature_view.get_batch_data( + start_time=date_datetime, + end_time=date_datetime + datetime.timedelta(days=1), + ) + + # Filter batch data for the specified city + batch_data_filtered = batch_data[batch_data['city_name'] == city_name] + + return batch_data_filtered[['date', 'pm2_5']].sort_values('date').reset_index(drop=True) + + +def get_data_in_date_range(date_start: str, date_end: str, city_name: str, feature_view, model, encoder) -> pd.DataFrame: + """ + Retrieve data for a specific date range and city from a feature view. + + Args: + date_start (str): The start date in the format "%Y-%m-%d". + date_end (str): The end date in the format "%Y-%m-%d". + city_name (str): The name of the city to retrieve data for. + feature_view: The feature view object. + model: The machine learning model used for prediction. + encoder (LabelEncoder): Label encoder object to encode 'city_name'. + + Returns: + pd.DataFrame: A DataFrame containing data for the specified date range and city. + """ + # Convert date strings to datetime objects + date_start_dt = datetime.datetime.strptime(date_start, "%Y-%m-%d").date() + date_end_dt = datetime.datetime.strptime(date_end, "%Y-%m-%d").date() + + # Retrieve batch data for the specified date range + batch_data = feature_view.get_batch_data( + start_time=date_start_dt, + end_time=date_end_dt + datetime.timedelta(days=1), + ) + + # Filter batch data for the specified city + batch_data_filtered = batch_data[batch_data['city_name'] == city_name] + + return batch_data_filtered[['date', 'pm2_5']].sort_values('date').reset_index(drop=True) + + +def get_future_data(date: str, city_name: str, feature_view, model, encoder) -> pd.DataFrame: + """ + Predicts future PM2.5 data for a specified date and city using a given feature view and model. 
+ + Args: + date (str): The target future date in the format 'YYYY-MM-DD'. + city_name (str): The name of the city for which the prediction is made. + feature_view: The feature view used to retrieve batch data. + model: The machine learning model used for prediction. + encoder (LabelEncoder): Label encoder object to encode 'city_name'. + + Returns: + pd.DataFrame: A DataFrame containing predicted PM2.5 values for each day starting from the target date. + + """ + # Get today's date + today = datetime.date.today() + + # Convert the target date string to a datetime object + date_in_future = datetime.datetime.strptime(date, "%Y-%m-%d").date() + + # Calculate the difference in days between today and the target date + difference_in_days = (date_in_future - today).days + + # Retrieve batch data for the specified date range + batch_data = feature_view.get_batch_data( + start_time=today, + end_time=today + datetime.timedelta(days=1), + ) + + # Filter batch data for the specified city + batch_data_filtered = batch_data[batch_data['city_name'] == city_name] + + # Transform batch data + batch_data_transformed = transform_data(batch_data_filtered, encoder) + + # Initialize a DataFrame to store predicted PM2.5 values + try: + pm2_5_value = batch_data_filtered['pm2_5'].values[0] + except (IndexError, TypeError): + # If accessing pm2_5 values fails, return a message indicating the feature pipeline needs updating + return "Data is not available. Ask user to run the feature pipeline to update data." 
+ else: + # Initialize a DataFrame to store predicted PM2.5 values + predicted_pm2_5_df = pd.DataFrame({ + 'date': [today.strftime("%Y-%m-%d")], + 'pm2_5': pm2_5_value, + }) + + # Iterate through each day starting from tomorrow up to the target date + for day_number in range(1, difference_in_days + 1): + + # Calculate the date for the current future day + date_future_day = (today + datetime.timedelta(days=day_number)).strftime("%Y-%m-%d") + + # Predict PM2.5 for the current day + predicted_pm2_5 = model.predict(batch_data_transformed) + + # Update previous day PM2.5 values in the batch data for the next prediction + batch_data_transformed['pm_2_5_previous_7_day'] = batch_data_transformed['pm_2_5_previous_6_day'] + batch_data_transformed['pm_2_5_previous_6_day'] = batch_data_transformed['pm_2_5_previous_5_day'] + batch_data_transformed['pm_2_5_previous_5_day'] = batch_data_transformed['pm_2_5_previous_4_day'] + batch_data_transformed['pm_2_5_previous_4_day'] = batch_data_transformed['pm_2_5_previous_3_day'] + batch_data_transformed['pm_2_5_previous_3_day'] = batch_data_transformed['pm_2_5_previous_2_day'] + batch_data_transformed['pm_2_5_previous_2_day'] = batch_data_transformed['pm_2_5_previous_1_day'] + batch_data_transformed['pm_2_5_previous_1_day'] = predicted_pm2_5 + + # Append the predicted PM2.5 value for the current day to the DataFrame + predicted_pm2_5_df = predicted_pm2_5_df._append({ + 'date': date_future_day, + 'pm2_5': predicted_pm2_5[0], + }, ignore_index=True) + + return predicted_pm2_5_df diff --git a/advanced_tutorials/air_quality/functions/common_functions.py b/advanced_tutorials/air_quality/functions/common_functions.py new file mode 100644 index 00000000..98767a09 --- /dev/null +++ b/advanced_tutorials/air_quality/functions/common_functions.py @@ -0,0 +1,25 @@ +import datetime +from geopy.geocoders import Nominatim + + +def convert_date_to_unix(x): + """ + Convert datetime to unix time in milliseconds. 
+ """ + dt_obj = datetime.datetime.strptime(str(x), '%Y-%m-%d %H:%M:%S') + dt_obj = int(dt_obj.timestamp() * 1000) + return dt_obj + + +def get_city_coordinates(city_name: str): + """ + Takes city name and returns its latitude and longitude (rounded to 2 digits after dot). + """ + # Initialize Nominatim API (for getting lat and long of the city) + geolocator = Nominatim(user_agent="MyApp") + city = geolocator.geocode(city_name) + + latitude = round(city.latitude, 2) + longitude = round(city.longitude, 2) + + return latitude, longitude \ No newline at end of file diff --git a/advanced_tutorials/air_quality/functions/context_engineering.py b/advanced_tutorials/air_quality/functions/context_engineering.py new file mode 100644 index 00000000..4b3bd4dc --- /dev/null +++ b/advanced_tutorials/air_quality/functions/context_engineering.py @@ -0,0 +1,191 @@ +import xml.etree.ElementTree as ET +import re +import inspect +from typing import get_type_hints +import json +import datetime +import torch +import sys +import pandas as pd +from functions.air_quality_data_retrieval import get_data_for_date, get_data_in_date_range, get_future_data +from typing import Any, Dict, List + + +def get_type_name(t: Any) -> str: + """Get the name of the type.""" + name = str(t) + if "list" in name or "dict" in name: + return name + else: + return t.__name__ + + +def serialize_function_to_json(func: Any) -> str: + """Serialize a function to JSON.""" + signature = inspect.signature(func) + type_hints = get_type_hints(func) + + function_info = { + "name": func.__name__, + "description": func.__doc__, + "parameters": { + "type": "object", + "properties": {} + }, + "returns": type_hints.get('return', 'void').__name__ + } + + for name, _ in signature.parameters.items(): + param_type = get_type_name(type_hints.get(name, type(None))) + function_info["parameters"]["properties"][name] = {"type": param_type} + + return json.dumps(function_info, indent=2) + + +def generate_hermes(prompt: str, model_llm, 
tokenizer) -> str: + """Retrieves a function name and extracts function parameters based on the user query.""" + fn = """{"name": "function_name", "arguments": {"arg_1": "value_1", "arg_2": value_2, ...}}""" + example = """{"name": "get_data_in_date_range", "arguments": {"date_start": "2024-01-10", "date_end": "2024-01-14", "city_name": "New York"}}""" + + prompt = f"""<|im_start|>system +You are a helpful assistant with access to the following functions: + +{serialize_function_to_json(get_data_for_date)} + +{serialize_function_to_json(get_data_in_date_range)} + +{serialize_function_to_json(get_future_data)} + +###INSTRUCTIONS: +- You need to choose one function to use and retrieve paramenters for this function from the user input. +- If the user query contains 'will', it is very likely that you will need to use the get_future_data function. +- Do not include feature_view, model and encoder parameters. +- Dates should be provided in the format YYYY-MM-DD. +- Generate an 'No Function needed' string if the user query does not require function calling. + +IMPORTANT: Today is {datetime.date.today().strftime("%A")}, {datetime.date.today()}. + +To use one of there functions respond STRICTLY with: + + {fn} + + +###EXAMPLES + +EXAMPLE 1: +- User: Hi! +- AI Assiatant: No Function needed. + +EXAMPLE 2: +- User: Is it good or bad? +- AI Assiatant: No Function needed. + +EXAMPLE 3: +- User: When and what was the minimum air quality from 2024-01-10 till 2024-01-14 in New York? 
+- AI Assistant: + + {example} + + +<|im_end|> +<|im_start|>user +{prompt}<|im_end|> +<|im_start|>assistant""" + + tokens = tokenizer(prompt, return_tensors="pt").to(model_llm.device) + input_size = tokens.input_ids.numel() + with torch.inference_mode(): + generated_tokens = model_llm.generate( + **tokens, + use_cache=True, + do_sample=True, + temperature=0.2, + top_p=1.0, + top_k=0, + max_new_tokens=512, + eos_token_id=tokenizer.eos_token_id, + pad_token_id=tokenizer.eos_token_id, + ) + + return tokenizer.decode( + generated_tokens.squeeze()[input_size:], + skip_special_tokens=True, + ) + + +def extract_function_calls(completion: str) -> List[Dict[str, Any]]: + """Extract function calls from completion.""" + completion = completion.strip() + pattern = r"((.*?))" + match = re.search(pattern, completion, re.DOTALL) + if not match: + return None + + multiplefn = match.group(1) + root = ET.fromstring(multiplefn) + functions = root.findall("functioncall") + + return [json.loads(fn.text) for fn in functions] + + +def invoke_function(function, feature_view, model, encoder) -> pd.DataFrame: + """Invoke a function with given arguments.""" + # Extract function name and arguments from input_data + function_name = function['name'] + arguments = function['arguments'] + + # Using Python's getattr function to dynamically call the function by its name and passing the arguments + function_output = getattr(sys.modules[__name__], function_name)( + **arguments, + feature_view=feature_view, + model=model, + encoder=encoder, + ) + + if type(function_output) == str: + return function_output + + # Round the 'pm2_5' value to 2 decimal places + function_output['pm2_5'] = function_output['pm2_5'].apply(round, ndigits=2) + return function_output + + +def get_context_data(user_query: str, feature_view, model_llm, tokenizer, model_air_quality, encoder) -> str: + """ + Retrieve context data based on user query. + + Args: + user_query (str): The user query. 
+ feature_view: Feature View for data retrieval. + model_llm: The language model. + tokenizer: The tokenizer. + model_air_quality: The air quality model. + encoder: The encoder. + + Returns: + str: The context data. + """ + # Generate a response using LLM + completion = generate_hermes( + user_query, + model_llm, + tokenizer, + ) + + # Extract function calls from the completion + functions = extract_function_calls(completion) + + # If function calls were found + if functions: + # Invoke the function with provided arguments + data = invoke_function(functions[0], feature_view, model_air_quality, encoder) + # Return formatted data as string + if isinstance(data, pd.DataFrame): + return f'Air Quality Measurements for {functions[0]["arguments"]["city_name"]}:\n' + '\n'.join( + [f'Date: {row["date"]}; Air Quality: {row["pm2_5"]}' for _, row in data.iterrows()] + ) + # Return message if data is not updated + return data + + # If no function calls were found, return an empty string + return '' diff --git a/advanced_tutorials/air_quality/functions/llm_chain.py b/advanced_tutorials/air_quality/functions/llm_chain.py new file mode 100644 index 00000000..6d0833ac --- /dev/null +++ b/advanced_tutorials/air_quality/functions/llm_chain.py @@ -0,0 +1,202 @@ +import transformers +from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig +from langchain.llms import HuggingFacePipeline +from langchain.prompts import PromptTemplate +from langchain.chains.llm import LLMChain +from langchain.memory import ConversationBufferWindowMemory +import torch +import datetime +from typing import Any, Dict, Union +from functions.context_engineering import get_context_data + + +def load_model(model_id: str = "teknium/OpenHermes-2.5-Mistral-7B") -> tuple: + """ + Load the LLM and its corresponding tokenizer. + + Args: + model_id (str, optional): Identifier for the pre-trained model. Defaults to "teknium/OpenHermes-2.5-Mistral-7B". 
+ + Returns: + tuple: A tuple containing the loaded model and tokenizer. + """ + + # Load the tokenizer for Mistral-7B-Instruct model + tokenizer = AutoTokenizer.from_pretrained( + model_id, + ) + + # Set the pad token to the unknown token to handle padding + tokenizer.pad_token = tokenizer.unk_token + + # Set the padding side to "right" to prevent warnings during tokenization + tokenizer.padding_side = "right" + + # BitsAndBytesConfig int-4 config + bnb_config = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_use_double_quant=True, + bnb_4bit_quant_type="nf4", + bnb_4bit_compute_dtype=torch.bfloat16, + ) + + # Load the Mistral-7B-Instruct model with quantization configuration + model_llm = AutoModelForCausalLM.from_pretrained( + model_id, + device_map="auto", + quantization_config=bnb_config, + ) + + # Configure the pad token ID in the model to match the tokenizer's pad token ID + model_llm.config.pad_token_id = tokenizer.pad_token_id + + return model_llm, tokenizer + + +def get_prompt_template(): + """ + Retrieve a template for generating prompts in a conversational AI system. + + Returns: + str: A string representing the template for generating prompts. + This template includes placeholders for system information, + instructions, previous conversation, context, date and user query. + """ + prompt_template = """<|im_start|>system +You are a helpful Air Quality assistant. +Provide your answers based on the provided context table which consists of the dates and air quality indicators for the city provided by user. + +INSTRUCTIONS: +- If you don't know the answer, you will respond politely that you cannot help. +- Use the provided table with air quality indicators for city provided by user to generate your answer. +- You answer should be at least one sentence. +- Do not show any calculations to the user. +- If the user asks for the air quality level in specific range, you can calculate an average air quality level. 
+- Make sure that you use correct air quality indicators for the required date. +- Add a description of the air quality level, such as whether it is safe, whether to go for a walk, etc. +- If user asks more general question, use your last responses in the chat history as a context. +<|im_end|> + +Previous conversation: +{chat_history} + +### CONTEXT: +{context} + +IMPORTANT: Today is {date_today}. + +<|im_start|>user +{question}<|im_end|> +<|im_start|>assistant""" + return prompt_template + + +def get_llm_chain(model_llm, tokenizer): + """ + Create and configure a language model chain. + + Args: + model_llm: The pre-trained language model for text generation. + tokenizer: The tokenizer corresponding to the language model. + + Returns: + LLMChain: The configured language model chain. + """ + # Create a text generation pipeline using the loaded model and tokenizer + text_generation_pipeline = transformers.pipeline( + model=model_llm, # The pre-trained language model for text generation + tokenizer=tokenizer, # The tokenizer corresponding to the language model + task="text-generation", # Specify the task as text generation + use_cache=True, + do_sample=True, + temperature=0.4, + top_p=1.0, + top_k=0, + max_new_tokens=512, + eos_token_id=tokenizer.eos_token_id, + pad_token_id=tokenizer.eos_token_id, + ) + + # Create a Hugging Face pipeline for Mistral LLM using the text generation pipeline + mistral_llm = HuggingFacePipeline( + pipeline=text_generation_pipeline, + ) + + # Create prompt from prompt template + prompt = PromptTemplate( + input_variables=["context", "question", "date_today", "chat_history"], + template=get_prompt_template(), + ) + + # Create a ConversationBufferWindowMemory with specified configuration + memory = ConversationBufferWindowMemory( + k=3, # Number of turns to remember in the conversation buffer + memory_key="chat_history", # Key to store the conversation history in memory + input_key="question", # Key to access the input question in the 
conversation + ) + + # Create LLM chain + llm_chain = LLMChain( + llm=mistral_llm, + prompt=prompt, + verbose=False, + memory=memory, + ) + + return llm_chain + + +def generate_response( + user_query: str, + feature_view, + model_llm, + tokenizer, + model_air_quality, + encoder, + llm_chain, + verbose: bool = False, +) -> str: + """ + Generate response to user query using LLM chain and context data. + + Args: + user_query (str): The user's query. + feature_view: Feature view for data retrieval. + model_llm: Language model for text generation. + tokenizer: Tokenizer for processing text. + model_air_quality: Model for predicting air quality. + encoder: Label Encoder for the city_name column. + llm_chain: LLM Chain. + verbose (bool): Whether to print verbose information. Defaults to False. + + Returns: + str: Generated response to the user query. + """ + + # Get context data based on user query + context = get_context_data( + user_query, + feature_view, + model_llm, + tokenizer, + model_air_quality, + encoder, + ) + + # Get today's date in a readable format + date_today = f'{datetime.date.today().strftime("%A")}, {datetime.date.today()}' + + # Print today's date and context information if verbose mode is enabled + if verbose: + print(f"๐Ÿ—“๏ธ Today's date: {date_today}") + print(f'๐Ÿ“– {context}') + + # Invoke the language model chain with relevant context + model_output = llm_chain.invoke({ + "context": context, + "date_today": date_today, + "question": user_query, + }) + + # Return the generated text from the model output + return model_output['text'] diff --git a/advanced_tutorials/air_quality/functions/parse_air_quality.py b/advanced_tutorials/air_quality/functions/parse_air_quality.py new file mode 100644 index 00000000..06dd41fe --- /dev/null +++ b/advanced_tutorials/air_quality/functions/parse_air_quality.py @@ -0,0 +1,79 @@ +import time +from functions.common_functions import * +import requests +import pandas as pd + + +def get_aqi_data_from_open_meteo( + 
city_name: str, + start_date: str, + end_date: str, + coordinates: list = None, + pollutant: str = "pm2_5" + ): + """ + Takes [city name OR coordinates] and returns pandas DataFrame with AQI data. + + Examples of arguments: + ... + coordinates=(47.755, -122.2806), + start_date="2023-01-01", + pollutant="no2" + ... + """ + start_of_cell = time.time() + + if coordinates: + latitude, longitude = coordinates + else: + latitude, longitude = get_city_coordinates(city_name=city_name) + + pollutant = pollutant.lower() + if pollutant == "pm2.5": + pollutant = "pm2_5" + + # make it work with both "no2" and "nitrogen_dioxide" passed. + if pollutant == "no2": + pollutant = "nitrogen_dioxide" + + params = { + 'latitude': latitude, + 'longitude': longitude, + 'hourly': [pollutant], + 'start_date': start_date, + 'end_date': end_date, + 'timezone': "Europe/London" + } + + # base endpoint + base_url = "https://air-quality-api.open-meteo.com/v1/air-quality" + try: + response = requests.get(base_url, params=params) + except ConnectionError: + response = requests.get(base_url, params=params) + response_json = response.json() + res_df = pd.DataFrame(response_json["hourly"]) + + # convert dates + res_df["time"] = pd.to_datetime(res_df["time"]) + + # resample to days + res_df = res_df.groupby(res_df['time'].dt.date).mean(numeric_only=True).reset_index() + res_df[pollutant] = round(res_df[pollutant], 1) + + # rename columns + res_df = res_df.rename(columns={ + "time": "date" + }) + + res_df["city_name"] = city_name + + # change columns order + res_df = res_df[ + ['city_name', 'date', pollutant] + ] + end_of_cell = time.time() + print(f"Processed {pollutant.upper()} for {city_name} since {start_date} till {end_date}.") + print(f"Took {round(end_of_cell - start_of_cell, 2)} sec.\n") + + return res_df \ No newline at end of file diff --git a/advanced_tutorials/air_quality/functions/parse_weather.py b/advanced_tutorials/air_quality/functions/parse_weather.py new file mode 100644 index 
00000000..bbebc34c --- /dev/null +++ b/advanced_tutorials/air_quality/functions/parse_weather.py @@ -0,0 +1,81 @@ +import time +from functions.common_functions import * +import requests +import pandas as pd + + +def get_weather_data_from_open_meteo( + city_name: str, + start_date: str, + end_date: str, + coordinates: list = None, + forecast: bool = False, + ): + """ + Takes [city name OR coordinates] and returns pandas DataFrame with weather data. + + Examples of arguments: + coordinates=(47.755, -122.2806), start_date="2023-01-01" + """ + start_of_cell = time.time() + + if coordinates: + latitude, longitude = coordinates + else: + latitude, longitude = get_city_coordinates(city_name=city_name) + + params = { + 'latitude': latitude, + 'longitude': longitude, + 'daily': ["temperature_2m_max", "temperature_2m_min", + "precipitation_sum", "rain_sum", "snowfall_sum", + "precipitation_hours", "windspeed_10m_max", + "windgusts_10m_max", "winddirection_10m_dominant"], + 'timezone': "Europe/London", + 'start_date': start_date, + 'end_date': end_date, + } + + if forecast: + # historical forecast endpoint + base_url = 'https://api.open-meteo.com/v1/forecast' + else: + # historical observations endpoint + base_url = 'https://archive-api.open-meteo.com/v1/archive' + + try: + response = requests.get(base_url, params=params) + time.sleep(2) + except ConnectionError: + response = requests.get(base_url, params=params) + + response_json = response.json() + + res_df = pd.DataFrame(response_json["daily"]) + res_df["city_name"] = city_name + + # rename columns + res_df = res_df.rename(columns={ + "time": "date", + "temperature_2m_max": "temperature_max", + "temperature_2m_min": "temperature_min", + "windspeed_10m_max": "wind_speed_max", + "winddirection_10m_dominant": "wind_direction_dominant", + "windgusts_10m_max": "wind_gusts_max" + }) + + # change columns order + res_df = res_df[ + ['city_name', 'date', 'temperature_max', 'temperature_min', + 'precipitation_sum', 'rain_sum', 
'snowfall_sum', + 'precipitation_hours', 'wind_speed_max', + 'wind_gusts_max', 'wind_direction_dominant'] + ] + + # convert dates in 'date' column + res_df["date"] = pd.to_datetime(res_df["date"]) + end_of_cell = time.time() + print(f"Parsed weather for {city_name} since {start_date} till {end_date}.") + print(f"Took {round(end_of_cell - start_of_cell, 2)} sec.\n") + + return res_df \ No newline at end of file diff --git a/advanced_tutorials/air_quality/requirements.txt b/advanced_tutorials/air_quality/requirements.txt index 9d933db1..edafbefb 100644 --- a/advanced_tutorials/air_quality/requirements.txt +++ b/advanced_tutorials/air_quality/requirements.txt @@ -1,7 +1,12 @@ -hopsworks -geopy -pandas -numpy -streamlit -streamlit-folium -joblib \ No newline at end of file +geopy==2.4.1 +joblib==1.2.0 +xgboost==2.0.3 +transformers==4.38.2 +protobuf==3.20.0 +langchain==0.1.10 +flask-sqlalchemy==3.1.1 +bitsandbytes==0.42.0 +accelerate==0.27.2 +streamlit==1.31.1 +sentencepiece==0.2.0 +gradio==4.21.0