From 079cd0df3a85ff83e52ad4f77241c829348525e2 Mon Sep 17 00:00:00 2001
From: Arvin Singh <29691465+arvinsingh@users.noreply.github.com>
Date: Sun, 10 Nov 2024 00:07:28 +0000
Subject: [PATCH] added docstring and typing

---
 src/config/db.py                  | 16 ++++++
 src/config/logger.py              | 20 ++++++-
 src/config/model.py               | 16 ++++++
 src/db/db_model.py                | 33 +++++++++++
 src/model/model_builder.py        | 31 ++++++++++-
 src/model/model_inference.py      | 38 ++++++++++++-
 src/model/pipeline/collection.py  | 17 +++++-
 src/model/pipeline/model.py       | 95 +++++++++++++++++++++++++++++--
 src/model/pipeline/preparation.py | 53 +++++++++++++----
 src/runner_builder.py             | 13 +++++
 src/runner_inference.py           | 18 +++++-
 11 files changed, 324 insertions(+), 26 deletions(-)

diff --git a/src/config/db.py b/src/config/db.py
index b26c3e6..b5c9afc 100644
--- a/src/config/db.py
+++ b/src/config/db.py
@@ -1,8 +1,24 @@
+"""
+This module sets up the database configuration.
+
+This module uses Pydantic's BaseSettings to manage configuration,
+allowing settings to be read from environment variables and a .env file.
+"""
+
 from pydantic_settings import BaseSettings, SettingsConfigDict
 from sqlalchemy import create_engine
 
 
 class DbSettings(BaseSettings):
+    """
+    Database configuration settings for the application.
+
+    Attributes:
+        model_config (SettingsConfigDict): Model config, loaded from .env file.
+        db_conn_str (str): Database connection string.
+        rent_apart_table_name (str): Name of the rental apartments table in DB.
+    """
+
     model_config = SettingsConfigDict(
         env_file='config/.env',
         env_encoding='utf-8',
diff --git a/src/config/logger.py b/src/config/logger.py
index acb971e..db82f91 100644
--- a/src/config/logger.py
+++ b/src/config/logger.py
@@ -1,8 +1,23 @@
+"""
+This module is responsible for configuring the logger.
+
+Pydantic is used to load the log level from the environment variables.
+The logger is configured using the loguru library.
+"""
+
 from loguru import logger
 from pydantic_settings import BaseSettings, SettingsConfigDict
 
 
 class LoggerSettings(BaseSettings):
+    """
+    Logger configuration settings for the application.
+
+    Attributes:
+        model_config (SettingsConfigDict): Model config, loaded from .env file.
+        log_level (str): Log level for the logger.
+    """
+
     model_config = SettingsConfigDict(
         env_file='config/.env',
         env_encoding='utf-8',
@@ -11,7 +26,10 @@ class LoggerSettings(BaseSettings):
     log_level: str
 
 
-def configure_logging(level: str):
+def configure_logging(level: str) -> None:
+    """
+    Configure the logger with the specified log level.
+    """
     logger.remove()
     logger.add(
         'logs/app.log',
diff --git a/src/config/model.py b/src/config/model.py
index 9e2f40b..ee530a4 100644
--- a/src/config/model.py
+++ b/src/config/model.py
@@ -1,8 +1,24 @@
+"""
+This module contains the model configuration settings for the application.
+
+This module uses Pydantic's BaseSettings to manage configuration,
+allowing settings to be read from environment variables and a .env file.
+"""
+
 from pydantic import DirectoryPath
 from pydantic_settings import BaseSettings, SettingsConfigDict
 
 
 class ModelSettings(BaseSettings):
+    """
+    Model configuration settings for the application.
+
+    Attributes:
+        model_config (SettingsConfigDict): Model config, loaded from .env file.
+        model_path (DirectoryPath): Path to the model directory.
+        model_name (str): Name of the model file.
+    """
+
     model_config = SettingsConfigDict(
         env_file='config/.env',
         env_encoding='utf-8',
diff --git a/src/db/db_model.py b/src/db/db_model.py
index 6faa6ab..1f40de7 100644
--- a/src/db/db_model.py
+++ b/src/db/db_model.py
@@ -1,3 +1,11 @@
+"""
+This module contains the SQLAlchemy model for the rent_apartments table.
+
+The model is defined using SQLAlchemy's DeclarativeBase and Mapped classes.
+Base is a subclass of DeclarativeBase, and RentApartments is a subclass of
+Base. Future models can be defined in a similar way.
+"""
+
 from sqlalchemy import INTEGER, REAL, VARCHAR
 from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column
 
@@ -5,11 +13,36 @@
 
 
 class Base(DeclarativeBase):
+    """
+    Base class for the SQLAlchemy model.
+    """
     pass
 
 
 class RentApartments(Base):
+    """
+    rent_apartments table model for the SQLAlchemy ORM.
+
+    Attributes:
+        address (Mapped[str]): Address of the apartment.
+        area (Mapped[float]): Area of the apartment.
+        constraction_year (Mapped[int]): Year of construction of the apartment.
+        rooms (Mapped[int]): Number of rooms in the apartment.
+        bedrooms (Mapped[int]): Number of bedrooms in the apartment.
+        bathrooms (Mapped[int]): Number of bathrooms in the apartment.
+        balcony (Mapped[str]): Whether the apartment has a balcony.
+        storage (Mapped[str]): Whether the apartment has storage.
+        parking (Mapped[str]): Whether the apartment has parking.
+        furnished (Mapped[str]): Whether the apartment is furnished.
+        garage (Mapped[str]): Whether the apartment has a garage.
+        garden (Mapped[str]): Whether the apartment has a garden.
+        energy (Mapped[str]): Energy efficiency rating of the apartment.
+        facilities (Mapped[str]): Additional facilities in the apartment.
+        zip (Mapped[str]): ZIP code of the apartment.
+        neighborhood (Mapped[str]): Neighborhood of the apartment.
+        rent (Mapped[int]): Rent of the apartment.
+    """
 
     __tablename__ = db_settings.rent_apart_table_name
diff --git a/src/model/model_builder.py b/src/model/model_builder.py
index bf525e3..808ddc3 100644
--- a/src/model/model_builder.py
+++ b/src/model/model_builder.py
@@ -1,3 +1,10 @@
+"""
+The module is responsible for building the model.
+
+The model is built using the `build_model` function
+from the `model` module.
+"""
+
 from loguru import logger
 
 from config import model_settings
@@ -5,12 +12,32 @@
 
 
 class ModelBuilderService:
+    """
+    The service class for building the model.
+
+    The class provides functionality to train
+    the model and save it to a specified path.
+
+    Attributes:
+        model_path (str): Path to the model directory.
+        model_name (str): Name of the model file.
+
+    Methods:
+        __init__: Initializes the ModelBuilderService.
+        train_model: Trains the model and saves it to a
+            specified directory.
+    """
 
-    def __init__(self):
+    def __init__(self) -> None:
+        """Initialize the ModelBuilderService."""
         self.model_path = model_settings.model_path
         self.model_name = model_settings.model_name
 
-    def load_model(self):
+    def train_model(self) -> None:
+        """
+        Train the model and save it to the
+        configured model directory.
+        """
         logger.info(
             f'Building the model file at '
             f'{self.model_path}/{self.model_name}',
diff --git a/src/model/model_inference.py b/src/model/model_inference.py
index 219195f..01f9d50 100644
--- a/src/model/model_inference.py
+++ b/src/model/model_inference.py
@@ -1,3 +1,11 @@
+"""
+This module provides functionality for making
+predictions using an ML model.
+
+It contains the ModelInferenceService class that offers
+methods to load a model and make predictions.
+""" + from pathlib import Path import pickle as pkl @@ -7,13 +15,36 @@ class ModelInferenceService: + """ + The service class for making predictions using a ML model. + + The class provides functionalities to load a model and make + predictions using the model. - def __init__(self): + Attributes: + model (object): The ML model object. + model_path (str): Path to the model directory. + model_name (str): Name of the model file. + + Methods: + __init__: Initializes the ModelInferenceService. + load_model: Loads the model from a specified path. + predict: Makes predictions using the loaded model. + """ + + def __init__(self) -> None: + """Initialize the ModelInferenceService.""" self.model = None self.model_path = model_settings.model_path self.model_name = model_settings.model_name - def load_model(self): + def load_model(self) -> None: + """ + Load the model from a specified path + + Raises: + FileNotFoundError: If the model file does not exist. + """ logger.info( f'Checking for existing model file at ' f'{self.model_path}/{self.model_name}', @@ -31,6 +62,7 @@ def load_model(self): with open(model_path, 'rb') as model_file: self.model = pkl.load(model_file) - def predict(self, input_parameters): + def predict(self, input_parameters: list) -> list: + """Make predictions using the loaded model.""" logger.info('Making predictions') return self.model.predict([input_parameters]) diff --git a/src/model/pipeline/collection.py b/src/model/pipeline/collection.py index 087adbd..3e7571f 100644 --- a/src/model/pipeline/collection.py +++ b/src/model/pipeline/collection.py @@ -1,3 +1,12 @@ +""" +This module is responsible for extracting data from the database. + +It uses a function to extract data from the RentApartments table in +the database and load it into a pandas DataFrame. +It uses SqlAlchemy to retrieve data from the database for further +analysis or processing. +""" + from loguru import logger import pandas as pd from sqlalchemy import select @@ -6,7 +15,13 @@ from db.db_model import RentApartments -def load_data_from_db(): +def load_data_from_db() -> pd.DataFrame: + """ + Load data from the RentApartments table in the database. + + Returns: + pd.DataFrame: DataFrame containing the RentApartments data + """ logger.info("Extracting data from database") query = select(RentApartments) return pd.read_sql(query, engine) diff --git a/src/model/pipeline/model.py b/src/model/pipeline/model.py index 9ddb375..9eef147 100644 --- a/src/model/pipeline/model.py +++ b/src/model/pipeline/model.py @@ -1,5 +1,14 @@ +""" +This module creates the pipeline for building, training and saving ML model. + +It includes the process of data preparation, model training using +RandomForestRegressor, hyperparameter tuning with GridSearchCV, +model evaluation, and serialization of the trained model. +""" + import pickle as pkl +import pandas as pd from loguru import logger from sklearn.ensemble import RandomForestRegressor from sklearn.model_selection import train_test_split, GridSearchCV @@ -8,7 +17,15 @@ from model.pipeline.preparation import prepare_data -def build_model(): +def build_model() -> None: + """ + This function creates the pipeline for building, + training and saving ML model. + + It includes the process of data preparation, data splitting, + model training using RandomForestRegressor, evaluation of the model, and + serialization of the trained model. 
+    """
     logger.info('Model building pipeline started')
     df = prepare_data()
     feature_names = [
@@ -35,12 +52,40 @@
     _save_model(model)
 
 
-def _get_x_y(data, col_x, col_y):
+def _get_x_y(data: pd.DataFrame, col_x: list, col_y: str) -> tuple:
+    """
+    Extract the feature and target columns from the data.
+
+    Args:
+        data (pd.DataFrame): The data to extract features and target from.
+        col_x (list): The list of column names to extract as features.
+        col_y (str): The column name to extract as target.
+
+    Returns:
+        tuple: The features (pd.DataFrame) and the target
+            (pd.Series) extracted from the data.
+    """
    logger.info(f'Defining X: {col_x} and Y: {col_y}')
     return data[col_x], data[col_y]
 
 
-def _split_train_test(features, target):
+def _split_train_test(
+    features: pd.DataFrame,
+    target: pd.Series,
+) -> tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series]:
+    """
+    Split the data into train and test sets.
+
+    Args:
+        features (pd.DataFrame): The features to split.
+        target (pd.Series): The target to split.
+
+    Returns:
+        pd.DataFrame: The features for training.
+        pd.DataFrame: The features for testing.
+        pd.Series: The target for training.
+        pd.Series: The target for testing.
+    """
     logger.info('Splitting data into train and test sets')
     return train_test_split(
         features,
@@ -50,7 +95,21 @@
     )
 
 
-def _train_model(x_train, y_train):
+def _train_model(
+    x_train: pd.DataFrame,
+    y_train: pd.Series,
+) -> RandomForestRegressor:
+    """
+    Train the model using RandomForestRegressor.
+    GridSearchCV is used for hyperparameter tuning.
+
+    Args:
+        x_train (pd.DataFrame): The features for training.
+        y_train (pd.Series): The target for training.
+
+    Returns:
+        RandomForestRegressor: The trained model.
+    """
     logger.info('Training model')
     grid_space = {
         'n_estimators': [100, 200, 300],
@@ -66,13 +125,37 @@
     return model.best_estimator_
 
 
-def _evaluate_model(model, x_test, y_test):
+def _evaluate_model(
+    model: RandomForestRegressor,
+    x_test: pd.DataFrame,
+    y_test: pd.Series,
+) -> float:
+    """
+    Evaluate the model using the test data.
+
+    Args:
+        model (RandomForestRegressor): The trained model.
+        x_test (pd.DataFrame): The features for testing.
+        y_test (pd.Series): The target for testing.
+
+    Returns:
+        float: The score of the model.
+    """
     model_score = model.score(x_test, y_test)
     logger.info(f'Evaluating model: {model_score}')
     return model_score
 
 
-def _save_model(model):
+def _save_model(model: RandomForestRegressor) -> None:
+    """
+    Save the trained model to the specified path.
+
+    Args:
+        model (RandomForestRegressor): The trained model to save.
+
+    Returns:
+        None
+    """
     model_path = f'{model_settings.model_path}/{model_settings.model_name}'
     logger.info(f'Saving model to {model_path}')
     with open(model_path, 'wb') as model_file:
diff --git a/src/model/pipeline/preparation.py b/src/model/pipeline/preparation.py
index 1f1aebb..faec32f 100644
--- a/src/model/pipeline/preparation.py
+++ b/src/model/pipeline/preparation.py
@@ -1,3 +1,11 @@
+"""
+This module preprocesses the data before it is used
+for training the model.
+
+It consists of functions to load data from a database, encode categorical
+columns, and parse specific columns for further processing.
+"""
+
 import re
 
 import pandas as pd
@@ -6,7 +14,14 @@
 from model.pipeline.collection import load_data_from_db
 
 
-def prepare_data():
+def prepare_data() -> pd.DataFrame:
+    """
+    Prepare the dataset for analysis and modelling. This involves loading
+    the data, encoding categorical columns, and parsing the 'garden' column.
+
+    Returns:
+        pd.DataFrame: The processed dataset.
+    """
     logger.info('Preporcessing data pipeline started')
     data = load_data_from_db()
     data_encoded = encode_cat_cols(data)
@@ -14,7 +29,16 @@
     return df
 
 
-def encode_cat_cols(data):
+def encode_cat_cols(data: pd.DataFrame) -> pd.DataFrame:
+    """
+    Encode specific categorical columns into dummy variables.
+
+    Args:
+        data (pd.DataFrame): The original dataset.
+
+    Returns:
+        pd.DataFrame: Dataset with categorical columns encoded.
+    """
     cols = ['balcony', 'storage', 'parking', 'furnished', 'garage']
     logger.info('Encoding categorical columns: {cols}')
     return pd.get_dummies(data,
@@ -23,13 +47,20 @@
                           columns=cols,
                           drop_first=True,
                           dtype=int)
 
 
-def parse_garden_col(data):
+def parse_garden_col(dataframe: pd.DataFrame) -> pd.DataFrame:
+    """
+    Parse the 'garden' column in the dataset. If the garden data is not
+    present, it is replaced with 0; otherwise, the number is extracted
+    from the string.
+
+    Args:
+        dataframe (pd.DataFrame): The dataset with a 'garden' column.
+
+    Returns:
+        pd.DataFrame: The dataset with the 'garden' column parsed.
+    """
     logger.info('Parsing garden column')
-    for i in range(len(data)):
-        if data.loc[i, 'garden'] == 'Not present':
-            data.loc[i, 'garden'] = 0
-        else:
-            data.loc[i, 'garden'] = int(
-                re.findall(r'\d+', data.loc[i, 'garden'])[0]
-            )
-    return data
+    dataframe['garden'] = dataframe['garden'].apply(
+        lambda x: 0 if x == 'Not present' else int(re.findall(r'\d+', x)[0]),
+    )
+    return dataframe
diff --git a/src/runner_builder.py b/src/runner_builder.py
index 7cf5870..070d24e 100644
--- a/src/runner_builder.py
+++ b/src/runner_builder.py
@@ -1,3 +1,10 @@
+"""
+This module provides functionality for training an ML model.
+
+This script initializes the ModelBuilderService, trains and saves the model,
+and logs the output: a typical workflow of an ML model builder service.
+"""
+
 from loguru import logger
 
 from model.model_builder import ModelBuilderService
@@ -5,6 +12,12 @@
 
 @logger.catch
 def main():
+    """
+    Run the application.
+
+    Initialize the ModelBuilderService, train the ML model,
+    and log the output.
+    """
     logger.info('Running builder application')
     ml_svc = ModelBuilderService()
     ml_svc.train_model()
diff --git a/src/runner_inference.py b/src/runner_inference.py
index ced6ba1..1551381 100644
--- a/src/runner_inference.py
+++ b/src/runner_inference.py
@@ -1,11 +1,25 @@
+"""
+This module provides functionality for running
+inference on a trained ML model.
+
+This script initializes the ModelInferenceService, loads the model,
+and predicts the rent for a given set of features.
+"""
+
 from loguru import logger
 
-from model.model_service import ModelService
+from model.model_inference import ModelInferenceService
 
 
 @logger.catch
 def main():
-    ml_svc = ModelService()
+    """
+    Run the application.
+
+    Initialize the ModelInferenceService, load the ML model,
+    and log the predicted rent for a given set of features.
+    """
+    ml_svc = ModelInferenceService()
     ml_svc.load_model()
     feature_list = {
         'area': 100,
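
For reference, a minimal sketch of how the inference runner drives the service once this patch is applied. The feature values below are illustrative placeholders only (the patch truncates the real feature_list), and in practice they must match the order of the feature_names list used in build_model:

    from loguru import logger

    from model.model_inference import ModelInferenceService


    @logger.catch
    def main() -> None:
        """Load the trained model and log a rent prediction."""
        ml_svc = ModelInferenceService()
        ml_svc.load_model()
        # Hypothetical feature vector; the real values and their order
        # must match the feature_names defined in build_model().
        input_values = [100, 2015, 3, 2, 1, 1, 0, 1, 1]
        prediction = ml_svc.predict(input_values)
        logger.info(f'Predicted rent: {prediction}')


    if __name__ == '__main__':
        main()

Because predict() wraps its argument in a list before calling self.model.predict([input_parameters]), the sketch passes a flat list of values rather than a nested one.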