From 079cd0df3a85ff83e52ad4f77241c829348525e2 Mon Sep 17 00:00:00 2001
From: Arvin Singh <29691465+arvinsingh@users.noreply.github.com>
Date: Sun, 10 Nov 2024 00:07:28 +0000
Subject: [PATCH] added docstring and typing

---
 src/config/db.py                  | 16 ++++++
 src/config/logger.py              | 20 ++++++-
 src/config/model.py               | 16 ++++++
 src/db/db_model.py                | 33 +++++++++++
 src/model/model_builder.py        | 31 ++++++++++-
 src/model/model_inference.py      | 38 ++++++++++++-
 src/model/pipeline/collection.py  | 17 +++++-
 src/model/pipeline/model.py       | 95 +++++++++++++++++++++++++++++--
 src/model/pipeline/preparation.py | 53 +++++++++++++----
 src/runner_builder.py             | 13 +++++
 src/runner_inference.py           | 18 +++++-
 11 files changed, 324 insertions(+), 26 deletions(-)

diff --git a/src/config/db.py b/src/config/db.py
index b26c3e6..b5c9afc 100644
--- a/src/config/db.py
+++ b/src/config/db.py
@@ -1,8 +1,24 @@
+"""
+This module sets up the database configuration.
+
+This module uses Pydantic's BaseSettings to manage configuration,
+allowing settings to be read from environment variables and a .env file.
+"""
+
 from pydantic_settings import BaseSettings, SettingsConfigDict
 from sqlalchemy import create_engine
 
 
 class DbSettings(BaseSettings):
+    """
+    Database configuration settings for the application.
+
+    Attributes:
+        model_config (SettingsConfigDict): Model config, loaded from .env file.
+        db_conn_str (str): Database connection string.
+        rent_apart_table_name (str): Name of the rental apartments table in DB.
+    """
+
     model_config = SettingsConfigDict(
         env_file='config/.env',
         env_encoding='utf-8',
diff --git a/src/config/logger.py b/src/config/logger.py
index acb971e..db82f91 100644
--- a/src/config/logger.py
+++ b/src/config/logger.py
@@ -1,8 +1,23 @@
+"""
+This module is responsible for configuring the logger.
+
+Pydantic is used to load the log level from the environment variables.
+The logger is configured using the loguru library.
+"""
+
 from loguru import logger
 from pydantic_settings import BaseSettings, SettingsConfigDict
 
 
 class LoggerSettings(BaseSettings):
+    """
+    Logger configuration settings for the application.
+
+    Attributes:
+        model_config (SettingsConfigDict): Model config, loaded from .env file.
+        log_level (str): Log level for the logger.
+    """
+
     model_config = SettingsConfigDict(
         env_file='config/.env',
         env_encoding='utf-8',
@@ -11,7 +26,10 @@ class LoggerSettings(BaseSettings):
     log_level: str
 
 
-def configure_logging(level: str):
+def configure_logging(level: str) -> None:
+    """
+    Configure the logger with the specified log level.
+    """
     logger.remove()
     logger.add(
         'logs/app.log',
diff --git a/src/config/model.py b/src/config/model.py
index 9e2f40b..ee530a4 100644
--- a/src/config/model.py
+++ b/src/config/model.py
@@ -1,8 +1,24 @@
+"""
+This module contains the model configuration settings for the application.
+
+This module uses Pydantic's BaseSettings to manage configuration,
+allowing settings to be read from environment variables and a .env file.
+"""
+
 from pydantic import DirectoryPath
 from pydantic_settings import BaseSettings, SettingsConfigDict
 
 
 class ModelSettings(BaseSettings):
+    """
+    Model configuration settings for the application.
+
+    Attributes:
+        model_config (SettingsConfigDict): Model config, loaded from .env file.
+        model_path (DirectoryPath): Path to the model directory.
+        model_name (str): Name of the model file.
+    """
+
     model_config = SettingsConfigDict(
         env_file='config/.env',
         env_encoding='utf-8',
diff --git a/src/db/db_model.py b/src/db/db_model.py
index 6faa6ab..1f40de7 100644
--- a/src/db/db_model.py
+++ b/src/db/db_model.py
@@ -1,3 +1,11 @@
+"""
+This module contains the SQLAlchemy model for the rent_apartments table.
+
+The model is defined using SQLAlchemy's DeclarativeBase and Mapped classes.
+Base is a subclass of DeclarativeBase, and RentApartments is a subclass of
+Base. Future models can be defined in a similar way.
+"""
+
 from sqlalchemy import INTEGER, REAL, VARCHAR
 from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column
 
@@ -5,11 +13,36 @@
 
 
 class Base(DeclarativeBase):
+    """
+    Base class for the SQLAlchemy model.
+    """
     pass
 
 
 class RentApartments(Base):
+    """
+    rent_apartments table model for the SQLAlchemy ORM.
+
+    Attributes:
+        address (Mapped[str]): Address of the apartment.
+        area (Mapped[float]): Area of the apartment.
+        constraction_year (Mapped[int]): Year of construction of the apartment.
+        rooms (Mapped[int]): Number of rooms in the apartment.
+        bedrooms (Mapped[int]): Number of bedrooms in the apartment.
+        bathrooms (Mapped[int]): Number of bathrooms in the apartment.
+        balcony (Mapped[str]): Whether the apartment has a balcony.
+        storage (Mapped[str]): Whether the apartment has storage.
+        parking (Mapped[str]): Whether the apartment has parking.
+        furnished (Mapped[str]): Whether the apartment is furnished.
+        garage (Mapped[str]): Whether the apartment has a garage.
+        garden (Mapped[str]): Whether the apartment has a garden.
+        energy (Mapped[str]): Energy efficiency rating of the apartment.
+        facilities (Mapped[str]): Additional facilities in the apartment.
+        zip (Mapped[str]): ZIP code of the apartment.
+        neighborhood (Mapped[str]): Neighborhood of the apartment.
+        rent (Mapped[int]): Rent of the apartment.
+    """
 
     __tablename__ = db_settings.rent_apart_table_name
diff --git a/src/model/model_builder.py b/src/model/model_builder.py
index bf525e3..808ddc3 100644
--- a/src/model/model_builder.py
+++ b/src/model/model_builder.py
@@ -1,3 +1,10 @@
+"""
+The module is responsible for building the model.
+
+The model is built using the `build_model` function
+from the `model` module.
+"""
+
 from loguru import logger
 
 from config import model_settings
@@ -5,12 +12,32 @@
 
 
 class ModelBuilderService:
+    """
+    The service class for building the model.
+
+    The class provides functionality to train
+    the model and save it to a specified path.
+
+    Attributes:
+        model_path (str): Path to the model directory.
+        model_name (str): Name of the model file.
+
+    Methods:
+        __init__: Initializes the ModelBuilderService.
+        train_model: Trains the model and saves it to a
+            specified directory.
+    """
 
-    def __init__(self):
+    def __init__(self) -> None:
+        """Initialize the ModelBuilderService."""
         self.model_path = model_settings.model_path
         self.model_name = model_settings.model_name
 
-    def load_model(self):
+    def train_model(self) -> None:
+        """
+        Train the model and save it to the
+        configured model directory.
+        """
         logger.info(
             f'Building the model file at '
             f'{self.model_path}/{self.model_name}',
diff --git a/src/model/model_inference.py b/src/model/model_inference.py
index 219195f..01f9d50 100644
--- a/src/model/model_inference.py
+++ b/src/model/model_inference.py
@@ -1,3 +1,11 @@
+"""
+This module provides functionality for making
+predictions using an ML model.
+
+It contains the ModelInferenceService class that offers
+methods to load a model and make predictions.
+""" + from pathlib import Path import pickle as pkl @@ -7,13 +15,36 @@ class ModelInferenceService: + """ + The service class for making predictions using a ML model. + + The class provides functionalities to load a model and make + predictions using the model. - def __init__(self): + Attributes: + model (object): The ML model object. + model_path (str): Path to the model directory. + model_name (str): Name of the model file. + + Methods: + __init__: Initializes the ModelInferenceService. + load_model: Loads the model from a specified path. + predict: Makes predictions using the loaded model. + """ + + def __init__(self) -> None: + """Initialize the ModelInferenceService.""" self.model = None self.model_path = model_settings.model_path self.model_name = model_settings.model_name - def load_model(self): + def load_model(self) -> None: + """ + Load the model from a specified path + + Raises: + FileNotFoundError: If the model file does not exist. + """ logger.info( f'Checking for existing model file at ' f'{self.model_path}/{self.model_name}', @@ -31,6 +62,7 @@ def load_model(self): with open(model_path, 'rb') as model_file: self.model = pkl.load(model_file) - def predict(self, input_parameters): + def predict(self, input_parameters: list) -> list: + """Make predictions using the loaded model.""" logger.info('Making predictions') return self.model.predict([input_parameters]) diff --git a/src/model/pipeline/collection.py b/src/model/pipeline/collection.py index 087adbd..3e7571f 100644 --- a/src/model/pipeline/collection.py +++ b/src/model/pipeline/collection.py @@ -1,3 +1,12 @@ +""" +This module is responsible for extracting data from the database. + +It uses a function to extract data from the RentApartments table in +the database and load it into a pandas DataFrame. +It uses SqlAlchemy to retrieve data from the database for further +analysis or processing. +""" + from loguru import logger import pandas as pd from sqlalchemy import select @@ -6,7 +15,13 @@ from db.db_model import RentApartments -def load_data_from_db(): +def load_data_from_db() -> pd.DataFrame: + """ + Load data from the RentApartments table in the database. + + Returns: + pd.DataFrame: DataFrame containing the RentApartments data + """ logger.info("Extracting data from database") query = select(RentApartments) return pd.read_sql(query, engine) diff --git a/src/model/pipeline/model.py b/src/model/pipeline/model.py index 9ddb375..9eef147 100644 --- a/src/model/pipeline/model.py +++ b/src/model/pipeline/model.py @@ -1,5 +1,14 @@ +""" +This module creates the pipeline for building, training and saving ML model. + +It includes the process of data preparation, model training using +RandomForestRegressor, hyperparameter tuning with GridSearchCV, +model evaluation, and serialization of the trained model. +""" + import pickle as pkl +import pandas as pd from loguru import logger from sklearn.ensemble import RandomForestRegressor from sklearn.model_selection import train_test_split, GridSearchCV @@ -8,7 +17,15 @@ from model.pipeline.preparation import prepare_data -def build_model(): +def build_model() -> None: + """ + This function creates the pipeline for building, + training and saving ML model. + + It includes the process of data preparation, data splitting, + model training using RandomForestRegressor, evaluation of the model, and + serialization of the trained model. 
+    """
     logger.info('Model building pipeline started')
     df = prepare_data()
     feature_names = [
@@ -35,12 +52,40 @@
     _save_model(model)
 
 
-def _get_x_y(data, col_x, col_y):
+def _get_x_y(data: pd.DataFrame, col_x: list, col_y: str) -> tuple:
+    """
+    Extract the feature and target columns from the data.
+
+    Args:
+        data (pd.DataFrame): The data to extract features and target from.
+        col_x (list): The list of column names to extract as features.
+        col_y (str): The column name to extract as target.
+
+    Returns:
+        tuple: The features (pd.DataFrame) and the target
+            (pd.Series) extracted from the data.
+    """
    logger.info(f'Defining X: {col_x} and Y: {col_y}')
     return data[col_x], data[col_y]
 
 
-def _split_train_test(features, target):
+def _split_train_test(
+    features: pd.DataFrame,
+    target: pd.Series,
+) -> tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series]:
+    """
+    Split the data into train and test sets.
+
+    Args:
+        features (pd.DataFrame): The features to split.
+        target (pd.Series): The target to split.
+
+    Returns:
+        pd.DataFrame: The features for training.
+        pd.DataFrame: The features for testing.
+        pd.Series: The target for training.
+        pd.Series: The target for testing.
+    """
     logger.info('Splitting data into train and test sets')
     return train_test_split(
         features,
@@ -50,7 +95,21 @@
     )
 
 
-def _train_model(x_train, y_train):
+def _train_model(
+    x_train: pd.DataFrame,
+    y_train: pd.Series,
+) -> RandomForestRegressor:
+    """
+    Train the model using RandomForestRegressor.
+    GridSearchCV is used for hyperparameter tuning.
+
+    Args:
+        x_train (pd.DataFrame): The features for training.
+        y_train (pd.Series): The target for training.
+
+    Returns:
+        RandomForestRegressor: The trained model.
+    """
     logger.info('Training model')
     grid_space = {
         'n_estimators': [100, 200, 300],
@@ -66,13 +125,37 @@
     return model.best_estimator_
 
 
-def _evaluate_model(model, x_test, y_test):
+def _evaluate_model(
+    model: RandomForestRegressor,
+    x_test: pd.DataFrame,
+    y_test: pd.Series,
+) -> float:
+    """
+    Evaluate the model using the test data.
+
+    Args:
+        model (RandomForestRegressor): The trained model.
+        x_test (pd.DataFrame): The features for testing.
+        y_test (pd.Series): The target for testing.
+
+    Returns:
+        float: The score of the model.
+    """
     model_score = model.score(x_test, y_test)
     logger.info(f'Evaluating model: {model_score}')
     return model_score
 
 
-def _save_model(model):
+def _save_model(model: RandomForestRegressor) -> None:
+    """
+    Save the trained model to the specified path.
+
+    Args:
+        model (RandomForestRegressor): The trained model to save.
+
+    Returns:
+        None
+    """
     model_path = f'{model_settings.model_path}/{model_settings.model_name}'
     logger.info(f'Saving model to {model_path}')
     with open(model_path, 'wb') as model_file:
diff --git a/src/model/pipeline/preparation.py b/src/model/pipeline/preparation.py
index 1f1aebb..faec32f 100644
--- a/src/model/pipeline/preparation.py
+++ b/src/model/pipeline/preparation.py
@@ -1,3 +1,11 @@
+"""
+This module preprocesses the data before it is used
+for training the model.
+
+It consists of functions to load data from a database, encode categorical
+columns, and parse specific columns for further processing.
+"""
+
 import re
 
 import pandas as pd
@@ -6,7 +14,14 @@
 from model.pipeline.collection import load_data_from_db
 
 
-def prepare_data():
+def prepare_data() -> pd.DataFrame:
+    """
+    Prepare the dataset for analysis and modelling. This involves loading
+    the data, encoding categorical columns, and parsing the 'garden' column.
+
+    Returns:
+        pd.DataFrame: The processed dataset.
+    """
     logger.info('Preporcessing data pipeline started')
     data = load_data_from_db()
     data_encoded = encode_cat_cols(data)
@@ -14,7 +29,16 @@
     return df
 
 
-def encode_cat_cols(data):
+def encode_cat_cols(data: pd.DataFrame) -> pd.DataFrame:
+    """
+    Encode specific categorical columns into dummy variables.
+
+    Args:
+        data (pd.DataFrame): The original dataset.
+
+    Returns:
+        pd.DataFrame: Dataset with categorical columns encoded.
+    """
     cols = ['balcony', 'storage', 'parking', 'furnished', 'garage']
     logger.info('Encoding categorical columns: {cols}')
     return pd.get_dummies(data,
@@ -23,13 +47,20 @@
                           columns=cols,
                           drop_first=True,
                           dtype=int)
 
 
-def parse_garden_col(data):
+def parse_garden_col(dataframe: pd.DataFrame) -> pd.DataFrame:
+    """
+    Parse the 'garden' column in the dataset. If the garden data is not
+    present, it is replaced with 0; otherwise, the number is extracted
+    from the string.
+
+    Args:
+        dataframe (pd.DataFrame): The dataset with a 'garden' column.
+
+    Returns:
+        pd.DataFrame: The dataset with the 'garden' column parsed.
+    """
     logger.info('Parsing garden column')
-    for i in range(len(data)):
-        if data.loc[i, 'garden'] == 'Not present':
-            data.loc[i, 'garden'] = 0
-        else:
-            data.loc[i, 'garden'] = int(
-                re.findall(r'\d+', data.loc[i, 'garden'])[0]
-            )
-    return data
+    dataframe['garden'] = dataframe['garden'].apply(
+        lambda x: 0 if x == 'Not present' else int(re.findall(r'\d+', x)[0]),
+    )
+    return dataframe
diff --git a/src/runner_builder.py b/src/runner_builder.py
index 7cf5870..070d24e 100644
--- a/src/runner_builder.py
+++ b/src/runner_builder.py
@@ -1,3 +1,10 @@
+"""
+This module provides functionality for training an ML model.
+
+This script initializes the ModelBuilderService, trains and saves the model,
+and logs the output: a typical workflow of an ML model builder service.
+"""
+
 from loguru import logger
 
 from model.model_builder import ModelBuilderService
@@ -5,6 +12,12 @@
 
 @logger.catch
 def main():
+    """
+    Run the application.
+
+    Initialize the ModelBuilderService, train the ML model,
+    and log the output.
+    """
     logger.info('Running builder application')
     ml_svc = ModelBuilderService()
     ml_svc.train_model()
diff --git a/src/runner_inference.py b/src/runner_inference.py
index ced6ba1..1551381 100644
--- a/src/runner_inference.py
+++ b/src/runner_inference.py
@@ -1,11 +1,25 @@
+"""
+This module provides functionality for running
+inference on a trained ML model.
+
+This script initializes the ModelInferenceService, loads the model,
+and predicts the rent for a given set of features.
+"""
+
 from loguru import logger
 
-from model.model_service import ModelService
+from model.model_inference import ModelInferenceService
 
 
 @logger.catch
 def main():
-    ml_svc = ModelService()
+    """
+    Run the application.
+
+    Initialize the ModelInferenceService, load the ML model,
+    and log the predicted rent for a given set of features.
+    """
+    ml_svc = ModelInferenceService()
     ml_svc.load_model()
     feature_list = {
         'area': 100,
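
For reference, a minimal sketch of how the inference runner drives the service once this patch is applied. The feature values below are illustrative placeholders only (the patch truncates the real feature_list), and in practice they must match the order of the feature_names list used in build_model:

    from loguru import logger

    from model.model_inference import ModelInferenceService


    @logger.catch
    def main() -> None:
        """Load the trained model and log a rent prediction."""
        ml_svc = ModelInferenceService()
        ml_svc.load_model()
        # Hypothetical feature vector; the real values and their order
        # must match the feature_names defined in build_model().
        input_values = [100, 2015, 3, 2, 1, 1, 0, 1, 1]
        prediction = ml_svc.predict(input_values)
        logger.info(f'Predicted rent: {prediction}')


    if __name__ == '__main__':
        main()

Because predict() wraps its argument in a list before calling self.model.predict([input_parameters]), the sketch passes a flat list of values rather than a nested one.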