[FSTORE-1207] AirQuality LLM project (#250)

* Function Calling & AirQuality FunctionCalling Chatbot
logicalclocks · Mar 18, 2024 · c0d6818 · c0d6818
1 parent 3b4eb4e
commit c0d6818
Show file tree

Hide file tree

Showing 18 changed files with 4,125 additions and 1,123 deletions.
diff --git a/advanced_tutorials/air_quality/1_air_quality_feature_backfill.ipynb b/advanced_tutorials/air_quality/1_air_quality_feature_backfill.ipynb
diff --git a/advanced_tutorials/air_quality/2_air_quality_feature_pipeline.ipynb b/advanced_tutorials/air_quality/2_air_quality_feature_pipeline.ipynb
diff --git a/advanced_tutorials/air_quality/3_air_quality_training_pipeline.ipynb b/advanced_tutorials/air_quality/3_air_quality_training_pipeline.ipynb
diff --git a/advanced_tutorials/air_quality/4_air_quality_batch_inference.ipynb b/advanced_tutorials/air_quality/4_air_quality_batch_inference.ipynb
diff --git a/advanced_tutorials/air_quality/5_function_calling.ipynb b/advanced_tutorials/air_quality/5_function_calling.ipynb
diff --git a/advanced_tutorials/air_quality/app_gradio.py b/advanced_tutorials/air_quality/app_gradio.py
@@ -0,0 +1,104 @@
+import gradio as gr
+from transformers import pipeline
+import numpy as np
+import hopsworks
+import joblib
+from functions.llm_chain import load_model, get_llm_chain, generate_response
+
+# Initialize the ASR pipeline
+transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-base.en")
+
+def connect_to_hopsworks():
+    # Initialize Hopsworks feature store connection
+    project = hopsworks.login()
+    fs = project.get_feature_store()
+
+    # Retrieve the model registry
+    mr = project.get_model_registry()
+
+    # Retrieve the 'air_quality_fv' feature view
+    feature_view = fs.get_feature_view(
+        name="air_quality_fv", 
+        version=1,
+    )
+
+    # Initialize batch scoring
+    feature_view.init_batch_scoring(1)
+
+    # Retrieve the 'air_quality_xgboost_model' from the model registry
+    retrieved_model = mr.get_model(
+        name="air_quality_xgboost_model",
+        version=1,
+    )
+
+    # Download the saved model artifacts to a local directory
+    saved_model_dir = retrieved_model.download()
+
+    # Load the XGBoost regressor model and label encoder from the saved model directory
+    model_air_quality = joblib.load(saved_model_dir + "/xgboost_regressor.pkl")
+    encoder = joblib.load(saved_model_dir + "/label_encoder.pkl")
+
+    return feature_view, model_air_quality, encoder
+
+
+def retrieve_llm_chain():
+
+    # Load the LLM and its corresponding tokenizer.
+    model_llm, tokenizer = load_model()
+
+    # Create and configure a language model chain.
+    llm_chain = get_llm_chain(
+        model_llm, 
+        tokenizer,
+    )
+
+    return model_llm, tokenizer, llm_chain
+
+
+# Retrieve the feature view, air quality model and encoder for the city_name column
+feature_view, model_air_quality, encoder = connect_to_hopsworks()
+
+# Load the LLM and its corresponding tokenizer and configure a language model chain
+model_llm, tokenizer, llm_chain = retrieve_llm_chain()
+
+def transcribe(audio):
+    sr, y = audio
+    y = y.astype(np.float32)
+    if y.ndim > 1 and y.shape[1] > 1:
+        y = np.mean(y, axis=1)
+    y /= np.max(np.abs(y))
+    return transcriber({"sampling_rate": sr, "raw": y})["text"]
+
+def generate_query_response(user_query):
+    response = generate_response(
+        user_query,
+        feature_view,
+        model_llm,
+        tokenizer,
+        model_air_quality,
+        encoder,
+        llm_chain,
+        verbose=False,
+    )
+    return response
+
+def handle_input(text_input=None, audio_input=None):
+    if audio_input is not None:
+        user_query = transcribe(audio_input)
+    else:
+        user_query = text_input
+
+    if user_query:
+        return generate_query_response(user_query)
+    else:
+        return "Please provide input either via text or voice."
+
+iface = gr.Interface(
+    fn=handle_input,
+    inputs=[gr.Textbox(placeholder="Type here or use voice input..."), gr.Audio()],
+    outputs="text",
+    title="🌤️ AirQuality AI Assistant 💬",
+    description="Ask your questions about air quality or use your voice to interact."
+)
+
+iface.launch(share=True)
diff --git a/advanced_tutorials/air_quality/app_streamlit.py b/advanced_tutorials/air_quality/app_streamlit.py
@@ -0,0 +1,99 @@
+import streamlit as st
+import hopsworks
+import joblib
+from functions.llm_chain import load_model, get_llm_chain, generate_response
+import warnings
+warnings.filterwarnings('ignore')
+
+st.title("🌤️ AirQuality AI assistant 💬")
+
+@st.cache_resource()
+def connect_to_hopsworks():
+    # Initialize Hopsworks feature store connection
+    project = hopsworks.login()
+    fs = project.get_feature_store()
+
+    # Retrieve the model registry
+    mr = project.get_model_registry()
+
+    # Retrieve the 'air_quality_fv' feature view
+    feature_view = fs.get_feature_view(
+        name="air_quality_fv", 
+        version=1,
+    )
+
+    # Initialize batch scoring
+    feature_view.init_batch_scoring(1)
+
+    # Retrieve the 'air_quality_xgboost_model' from the model registry
+    retrieved_model = mr.get_model(
+        name="air_quality_xgboost_model",
+        version=1,
+    )
+
+    # Download the saved model artifacts to a local directory
+    saved_model_dir = retrieved_model.download()
+
+    # Load the XGBoost regressor model and label encoder from the saved model directory
+    model_air_quality = joblib.load(saved_model_dir + "/xgboost_regressor.pkl")
+    encoder = joblib.load(saved_model_dir + "/label_encoder.pkl")
+
+    return feature_view, model_air_quality, encoder
+
+
+@st.cache_resource()
+def retrieve_llm_chain():
+
+    # Load the LLM and its corresponding tokenizer.
+    model_llm, tokenizer = load_model()
+
+    # Create and configure a language model chain.
+    llm_chain = get_llm_chain(
+        model_llm, 
+        tokenizer,
+    )
+
+    return model_llm, tokenizer, llm_chain
+
+
+# Retrieve the feature view, air quality model and encoder for the city_name column
+feature_view, model_air_quality, encoder = connect_to_hopsworks()
+
+# Load the LLM and its corresponding tokenizer and configure a language model chain
+model_llm, tokenizer, llm_chain = retrieve_llm_chain()
+
+# Initialize chat history
+if "messages" not in st.session_state:
+    st.session_state.messages = []
+
+# Display chat messages from history on app rerun
+for message in st.session_state.messages:
+    with st.chat_message(message["role"]):
+        st.markdown(message["content"])
+
+# React to user input
+if user_query := st.chat_input("How can I help you?"):
+    # Display user message in chat message container
+    st.chat_message("user").markdown(user_query)
+    # Add user message to chat history
+    st.session_state.messages.append({"role": "user", "content": user_query})
+
+    st.write('⚙️ Generating Response...')
+
+    # Generate a response to the user query
+    response = generate_response(
+        user_query,
+        feature_view,
+        model_llm,
+        tokenizer,
+        model_air_quality,
+        encoder,
+        llm_chain,
+        verbose=False,
+    )
+
+    # Display assistant response in chat message container
+    with st.chat_message("assistant"):
+        st.markdown(response)
+    # Add assistant response to chat history
+    st.session_state.messages.append({"role": "assistant", "content": response})
diff --git a/advanced_tutorials/air_quality/feature_pipeline.py b/advanced_tutorials/air_quality/feature_pipeline.py
diff --git a/advanced_tutorials/air_quality/features/__init__.py b/advanced_tutorials/air_quality/features/__init__.py
diff --git a/advanced_tutorials/air_quality/features/air_quality.py b/advanced_tutorials/air_quality/features/air_quality.py
@@ -18,7 +18,6 @@ def shift_pm_2_5(df: pd.DataFrame, days: int = 5) -> pd.DataFrame:
     """
     for shift_value in range(1, days + 1):
         df[f'pm_2_5_previous_{shift_value}_day'] = df.groupby('city_name')['pm2_5'].shift(shift_value)
-    df = df.dropna()
     return df
 
 
@@ -227,8 +226,9 @@ def feature_engineer_aq(df: pd.DataFrame) -> pd.DataFrame:
     for i in [7, 14, 28]:
         for func in [moving_std, exponential_moving_average, exponential_moving_std]:
             df_res = func(df_res, i)
-
-    df_res = df_res.sort_values(by=["date", "pm2_5"]).dropna()
+
+
+    df_res = df_res.sort_values(by=["date", "pm2_5"])
     df_res = df_res.reset_index(drop=True)
 
     df_res['year'] = year(df_res['date'])