From 359a6ec187e2e3046fbaded4560ac0e2514b7f79 Mon Sep 17 00:00:00 2001 From: Fritjof Bengtsson Date: Thu, 7 Mar 2024 13:32:54 +0000 Subject: [PATCH 1/4] moved config to yaml, remove unused tests --- baler/baler.py | 112 ++++++------- baler/modules/config/config_service.py | 8 + baler/modules/helper.py | 218 +++++++++---------------- baler/modules/plotting.py | 34 ++-- baler/modules/training.py | 59 +++---- config.yaml | 29 ++++ poetry.lock | 28 ++-- pyproject.toml | 1 + tests/test_data_processing.py | 11 -- tests/unit/test_config_creation.py | 6 + 10 files changed, 237 insertions(+), 269 deletions(-) create mode 100644 baler/modules/config/config_service.py create mode 100644 config.yaml create mode 100644 tests/unit/test_config_creation.py diff --git a/baler/baler.py b/baler/baler.py index f7bcfb2e..7cea2835 100644 --- a/baler/baler.py +++ b/baler/baler.py @@ -20,7 +20,6 @@ from .modules import helper import gzip -from .modules.profiling import pytorch_profile __all__ = ( @@ -85,7 +84,7 @@ def perform_training(output_path, config, verbose: bool): """Main function calling the training functions, ran when --mode=train is selected. The three functions called are: `helper.process`, `helper.mode_init` and `helper.training`. - Depending on `config.data_dimensions`, the calculated latent space size will differ. + Depending on `config["data_dimension"]`, the calculated latent space size will differ. Args: output_path (path): Selects base path for determining output path @@ -101,12 +100,12 @@ def perform_training(output_path, config, verbose: bool): normalization_features, original_shape, ) = helper.process( - config.input_path, - config.custom_norm, - config.test_size, - config.apply_normalization, - config.convert_to_blocks if hasattr(config, "convert_to_blocks") else None, - verbose, + input_path=config.get("input_path"), + custom_norm=config.get("custom_norm"), + test_size=config.get("test_size"), + apply_normalization=config.get("apply_normalization"), + convert_to_blocks=config.get("convert_to_blocks"), + verbose=verbose, ) if verbose: @@ -114,52 +113,55 @@ def perform_training(output_path, config, verbose: bool): try: n_features = 0 - if config.data_dimension == 1: - number_of_columns = train_set_norm.shape[1] - config.latent_space_size = ceil( - number_of_columns / config.compression_ratio - ) - config.number_of_columns = number_of_columns - n_features = number_of_columns - elif config.data_dimension == 2: - if config.model_type == "dense": - number_of_rows = train_set_norm.shape[1] - number_of_columns = train_set_norm.shape[2] - n_features = number_of_columns * number_of_rows - else: - number_of_rows = original_shape[1] - number_of_columns = original_shape[2] + data_dimension = config["data_dimension"] + match data_dimension: + case 1: + number_of_columns = train_set_norm.shape[1] + config["latent_space_size"] = ceil( + number_of_columns / config["compression_ratio"] + ) + config["number_of_columns"] = number_of_columns n_features = number_of_columns - config.latent_space_size = ceil( - (number_of_rows * number_of_columns) / config.compression_ratio - ) - config.number_of_columns = number_of_columns - else: - raise NameError( - "Data dimension can only be 1 or 2. 
Got config.data_dimension value = " - + str(config.data_dimension) - ) + case 2: + if config["model_type"] == "dense": + number_of_rows = train_set_norm.shape[1] + number_of_columns = train_set_norm.shape[2] + n_features = number_of_columns * number_of_rows + else: + number_of_rows = original_shape[1] + number_of_columns = original_shape[2] + n_features = number_of_columns + config["latent_space_size"] = ceil( + (number_of_rows * number_of_columns) / config["compression_ratio"] + ) + config["number_of_columns"] = number_of_columns + case _: + raise NameError( + "Data dimension can only be 1 or 2. Got data_dimension value = " + + str(config["data_dimension"]) + ) except AttributeError: if verbose: print( - f"{config.number_of_columns} -> {config.latent_space_size} dimensions" + f"{config['number_of_columns']} -> {config['latent_space_size']} dimensions" ) - assert number_of_columns == config.number_of_columns + + assert number_of_columns == config["number_of_columns"] if verbose: print( - f"Intitalizing Model with Latent Size - {config.latent_space_size} and Features - {n_features}" + f"Intitalizing Model with Latent Size - {config['latent_space_size']} and Features - {n_features}" ) device = helper.get_device() if verbose: print(f"Device used for training: {device}") - model_object = helper.model_init(config.model_name) - model = model_object(n_features=n_features, z_dim=config.latent_space_size) + model_object = helper.model_init(config["model_name"]) + model = model_object(n_features=n_features, z_dim=config["latent_space_size"]) model.to(device) - if config.model_name == "Conv_AE_3D" and hasattr( + if config["model_name"] == "Conv_AE_3D" and hasattr( config, "compress_to_latent_space" ): model.set_compress_to_latent_space(config.compress_to_latent_space) @@ -178,7 +180,7 @@ def perform_training(output_path, config, verbose: bool): if verbose: print("Training complete") - if config.apply_normalization: + if config["apply_normalization"]: np.save( os.path.join(training_path, "normalization_features.npy"), normalization_features, @@ -188,7 +190,7 @@ def perform_training(output_path, config, verbose: bool): f"Normalization features saved to {os.path.join(training_path, 'normalization_features.npy')}" ) - if config.separate_model_saving: + if hasattr(config, "separate_model_saving") and config["separate_model_saving"]: helper.encoder_decoder_saver( trained_model, os.path.join(output_path, "compressed_output", "encoder.pt"), @@ -240,7 +242,7 @@ def perform_compression(output_path, config, verbose: bool): """Main function calling the compression functions, ran when --mode=compress is selected. The main function being called here is: `helper.compress` - If `config.extra_compression` is selected, the compressed file is further compressed via zip + If `config["extra_compression"]` is selected, the compressed file is further compressed via zip Else, the function returns a compressed file of `.npz`, only compressed by Baler. 
Args: @@ -252,17 +254,17 @@ def perform_compression(output_path, config, verbose: bool): An `.npz` file which includes: - The compressed data - The data headers - - Normalization features if `config.apply_normalization=True` + - Normalization features if `config["apply_normalization"]=True` """ print("Compressing...") start = time.time() normalization_features = [] - if config.apply_normalization: + if config["apply_normalization"]: normalization_features = np.load( os.path.join(output_path, "training", "normalization_features.npy") ) - if config.separate_model_saving: + if config.get("separate_model_saving"): ( compressed, error_bound_batch, @@ -287,9 +289,9 @@ def perform_compression(output_path, config, verbose: bool): print("Compression took:", f"{(end - start) / 60:.3} minutes") - names = np.load(config.input_path)["names"] + names = np.load(config["input_path"])["names"] - if config.extra_compression: + if config["extra_compression"]: if verbose: print("Extra compression selected") print( @@ -313,7 +315,7 @@ def perform_compression(output_path, config, verbose: bool): names=names, normalization_features=normalization_features, ) - if config.save_error_bounded_deltas: + if config["save_error_bounded_deltas"]: error_bound_batch_index = np.array( [error_bound_batch, error_bound_index], dtype=object ) @@ -342,7 +344,7 @@ def perform_decompression(output_path, config, verbose: bool): """Main function calling the decompression functions, ran when --mode=decompress is selected. The main function being called here is: `helper.decompress` - If `config.apply_normalization=True` the output is un-normalized with the same normalization features saved from `perform_training()`. + If `config["apply_normalization"]=True` the output is un-normalized with the same normalization features saved from `perform_training()`. 
Args: output_path (path): Selects base path for determining output path @@ -352,9 +354,9 @@ def perform_decompression(output_path, config, verbose: bool): print("Decompressing...") start = time.time() - model_name = config.model_name - data_before = np.load(config.input_path)["data"] - if config.separate_model_saving: + model_name = config["model_name"] + data_before = np.load(config["input_path"])["data"] + if config.get("separate_model_saving"): decompressed, names, normalization_features = helper.decompress( model_path=os.path.join(output_path, "compressed_output", "decoder.pt"), input_path=os.path.join(output_path, "compressed_output", "compressed.npz"), @@ -398,7 +400,7 @@ def perform_decompression(output_path, config, verbose: bool): "Target Shape - ", data_before.shape, ) - if config.model_type == "dense": + if config["model_type"] == "dense": decompressed = decompressed.reshape( data_before.shape[0], data_before.shape[1], data_before.shape[2] ) @@ -407,7 +409,7 @@ def perform_decompression(output_path, config, verbose: bool): data_before.shape[0], 1, data_before.shape[1], data_before.shape[2] ) - if config.apply_normalization: + if config["apply_normalization"]: print("Un-normalizing...") normalization_features = np.load( os.path.join(output_path, "training", "normalization_features.npy"), @@ -437,7 +439,7 @@ def perform_decompression(output_path, config, verbose: bool): end = time.time() print("Decompression took:", f"{(end - start) / 60:.3} minutes") - if config.extra_compression: + if config["extra_compression"]: if verbose: print("Extra compression selected") print( @@ -467,7 +469,7 @@ def print_info(output_path, config): "================================== \n Information about your compression \n================================== " ) - original = config.input_path + original = config["input_path"] compressed_path = os.path.join(output_path, "compressed_output") decompressed_path = os.path.join(output_path, "decompressed_output") training_path = os.path.join(output_path, "training") diff --git a/baler/modules/config/config_service.py b/baler/modules/config/config_service.py new file mode 100644 index 00000000..0eb28427 --- /dev/null +++ b/baler/modules/config/config_service.py @@ -0,0 +1,8 @@ +from typing import Any +import yaml + + +def load_config(file_path) -> dict[str, Any]: + with open(file_path, "r") as file: + config = yaml.safe_load(file).get("config") + return config diff --git a/baler/modules/helper.py b/baler/modules/helper.py index 705559d3..b8af4a0b 100644 --- a/baler/modules/helper.py +++ b/baler/modules/helper.py @@ -15,13 +15,15 @@ import argparse import importlib import os +import shutil import sys -from dataclasses import dataclass from math import ceil import gzip +from pyparsing import Any from tqdm import tqdm + sys.path.append(os.getcwd()) import numpy as np import torch @@ -29,6 +31,7 @@ from sklearn.model_selection import train_test_split from ..modules import training, plotting, data_processing, diagnostics +from ..modules.config import config_service def get_arguments(): @@ -82,15 +85,18 @@ def get_arguments(): workspace_name = args.project[0] project_name = args.project[1] - config_path = ( - f"workspaces.{workspace_name}.{project_name}.config.{project_name}_config" - ) - - if args.mode == "newProject": - config = None - else: - config = Config - importlib.import_module(config_path).set_config(config) + config: dict[str, Any] = {} + + if args.mode != "newProject": + path_to_config = os.path.join( + "workspaces", + workspace_name, + project_name, + 
"config", + f"{project_name}_config.yaml", + ) + print(f"Trying to load config from {path_to_config}") + config = config_service.load_config(path_to_config) return ( config, @@ -140,94 +146,10 @@ def create_new_project( print(f"Creating directory {directory}...") os.makedirs(directory, exist_ok=True) - # Populate default config - with open( - os.path.join(project_path, "config", f"{project_name}_config.py"), "w" - ) as f: - f.write(create_default_config(workspace_name, project_name)) - - -@dataclass -class Config: - """Defines a configuration dataclass""" - - input_path: str - compression_ratio: float - epochs: int - early_stopping: bool - early_stoppin_patience: int - lr_scheduler: bool - lr_scheduler_patience: int - min_delta: int - model_name: str - model_type = str - custom_norm: bool - l1: bool - reg_param: float - RHO: float - lr: float - batch_size: int - test_size: float - data_dimension: int - intermittent_model_saving: bool - intermittent_saving_patience: int - mse_avg: bool - mse_sum: bool - emd: bool - l1: bool - deterministic_algorithm: bool - - -def create_default_config(workspace_name: str, project_name: str) -> str: - """Creates a default config file for a project. - Args: - workspace_name (str): Name of the workspace. - project_name (str): Name of the project. - Returns: - str: Default config file. - """ - - return f""" -# === Configuration options === - -def set_config(c): - c.input_path = "workspaces/{workspace_name}/data/{project_name}_data.npz" - c.data_dimension = 1 - c.compression_ratio = 2.0 - c.apply_normalization = True - c.model_name = "AE" - c.model_type = "dense" - c.epochs = 5 - c.lr = 0.001 - c.batch_size = 512 - c.early_stopping = True - c.lr_scheduler = True - - - - -# === Additional configuration options === - - c.early_stopping_patience = 100 - c.min_delta = 0 - c.lr_scheduler_patience = 50 - c.custom_norm = False - c.reg_param = 0.001 - c.RHO = 0.05 - c.test_size = 0 - # c.number_of_columns = 24 - # c.latent_space_size = 12 - c.extra_compression = False - c.intermittent_model_saving = False - c.intermittent_saving_patience = 100 - c.mse_avg = False - c.mse_sum = True - c.emd = False - c.l1 = True - c.activation_extraction = False - c.deterministic_algorithm = True - -""" + shutil.copy2( + "config.yaml", + os.path.join(project_path, "config", f"{project_name}_config.yaml"), + ) def model_init(model_name: str): @@ -291,6 +213,10 @@ def process( Returns: ndarray, ndarray, ndarray: Array with the train set, array with the test set and array with the normalization features. 
""" + + if input_path is None: + raise ValueError("Input path is None") + loaded = np.load(input_path) data = loaded["data"] @@ -485,8 +411,8 @@ def compress(model_path, config): torch.Tensor: Compressed data as PyTorch tensor """ - # Loads the data and applies normalization if config.apply_normalization = True - loaded = np.load(config.input_path) + # Loads the data and applies normalization if config["apply_normalization"] = True + loaded = np.load(config["input_path"]) data_before = loaded["data"] original_shape = data_before.shape @@ -495,71 +421,75 @@ def compress(model_path, config): config.convert_to_blocks, data_before ) - if config.apply_normalization: + if config["apply_normalization"]: print("Normalizing...") - data = normalize(data_before, config.custom_norm) + data = normalize(data_before, config["custom_norm"]) else: data = data_before number_of_columns = 0 try: n_features = 0 - if config.data_dimension == 1: - column_names = np.load(config.input_path)["names"] + if config["data_dimension"] == 1: + column_names = np.load(config["input_path"])["names"] number_of_columns = len(column_names) - config.latent_space_size = ceil( - number_of_columns / config.compression_ratio + config["latent_space_size"] = ceil( + number_of_columns / config["compression_ratio"] ) - config.number_of_columns = number_of_columns + config["number_of_columns"] = number_of_columns n_features = number_of_columns - elif config.data_dimension == 2: - if config.model_type == "dense": + elif config["data_dimension"] == 2: + if config["model_type"] == "dense": number_of_rows = data.shape[1] - config.number_of_columns = data.shape[2] - n_features = number_of_rows * config.number_of_columns + config["number_of_columns"] = data.shape[2] + n_features = number_of_rows * config["number_of_columns"] else: number_of_rows = original_shape[1] - config.number_of_columns = original_shape[2] - n_features = config.number_of_columns - config.latent_space_size = ceil( - (number_of_rows * config.number_of_columns) / config.compression_ratio + config["number_of_columns"] = original_shape[2] + n_features = config["number_of_columns"] + config["latent_space_size"] = ceil( + (number_of_rows * config["number_of_columns"]) + / config["compression_ratio"] ) else: raise NameError( - "Data dimension can only be 1 or 2. Got config.data_dimension = " - + str(config.data_dimension) + "Data dimension can only be 1 or 2. Got data_dimension = " + + str(config["data_dimension"]) ) - except AttributeError: - number_of_columns = config.number_of_columns - latent_space_size = config.latent_space_size + except KeyError: + number_of_columns = config["number_of_columns"] + latent_space_size = config["latent_space_size"] print(f"{number_of_columns} -> {latent_space_size} dimensions") # Initialise and load the model correctly. 
- latent_space_size = config.latent_space_size - bs = config.batch_size + latent_space_size = config["latent_space_size"] + bs = config["batch_size"] device = get_device() - model_object = data_processing.initialise_model(config.model_name) + model_object = data_processing.initialise_model(config["model_name"]) model = data_processing.load_model( model_object, model_path=model_path, n_features=n_features, - z_dim=config.latent_space_size, + z_dim=config["latent_space_size"], ) model.eval() - if config.data_dimension == 2: - if config.model_type == "convolutional" and config.model_name == "Conv_AE_3D": + if config["data_dimension"] == 2: + if ( + config["model_type"] == "convolutional" + and config["model_name"] == "Conv_AE_3D" + ): data_tensor = torch.tensor(data, dtype=torch.float32).view( data.shape[0] // bs, 1, bs, data.shape[1], data.shape[2] ) - elif config.model_type == "convolutional": + elif config["model_type"] == "convolutional": data_tensor = torch.tensor(data, dtype=torch.float32).view( data.shape[0], 1, data.shape[1], data.shape[2] ) - elif config.model_type == "dense": + elif config["model_type"] == "dense": data_tensor = torch.tensor(data, dtype=torch.float32).view( data.shape[0], data.shape[1] * data.shape[2] ) - elif config.data_dimension == 1: + elif config["data_dimension"] == 1: data_tensor = torch.tensor(data, dtype=torch.float64) # Batching data to avoid memory leaks @@ -584,7 +514,7 @@ def compress(model_path, config): compressed_output = model.encode(data_batch) - if config.save_error_bounded_deltas: + if config["save_error_bounded_deltas"]: decoded_output = model.decode(compressed_output) decoded_output = detacher(decoded_output) deltas_compressed = 0 @@ -592,7 +522,7 @@ def compress(model_path, config): compressed_output = detacher(compressed_output) data_batch = detacher(data_batch) - if config.save_error_bounded_deltas: + if config["save_error_bounded_deltas"]: ( deltas, rms_pred_error_index, @@ -608,7 +538,7 @@ def compress(model_path, config): else: compressed = np.concatenate((compressed, compressed_output)) - if config.save_error_bounded_deltas: + if config["save_error_bounded_deltas"]: print("Total Deltas Found - ", deltas_compressed) return (compressed, error_bound_batch, error_bound_deltas, error_bound_index) @@ -645,12 +575,12 @@ def decompress( names = loaded["names"] normalization_features = loaded["normalization_features"] - if config.model_type == "convolutional": + if config.get("model_type") == "convolutional": final_layer_details = np.load( os.path.join(output_path, "training", "final_layer.npy"), allow_pickle=True ) - if config.save_error_bounded_deltas: + if config["save_error_bounded_deltas"]: loaded_deltas = np.load( gzip.GzipFile(input_path_deltas, "r"), allow_pickle=True ) @@ -662,11 +592,11 @@ def decompress( error_bound_index = loaded_batch_indexes[1] deltas_added = 0 - model_name = config.model_name + model_name = config["model_name"] latent_space_size = len(data[0]) - bs = config.batch_size + bs = config["batch_size"] model_dict = torch.load(str(model_path), map_location=get_device()) - if config.data_dimension == 2 and config.model_type == "dense": + if config["data_dimension"] == 2 and config["model_type"] == "dense": number_of_columns = int((len(model_dict[list(model_dict.keys())[-1]]))) else: number_of_columns = len(model_dict[list(model_dict.keys())[-1]]) @@ -682,7 +612,7 @@ def decompress( ) model.eval() - if config.model_type == "convolutional": + if config.get("model_type") == "convolutional": 
model.set_final_layer_dims(final_layer_details) # Load the data, convert to tensor and batch it to avoid memory leaks @@ -703,7 +633,7 @@ def decompress( out = model.decode(data_batch).to(device) # Converting back to numpyarray out = detacher(out) - if config.save_error_bounded_deltas: + if config["save_error_bounded_deltas"]: if idx in error_bound_batch: # Error Bounded Deltas added to Decompressed output delta_idx = np.where(error_bound_batch == idx) @@ -720,10 +650,10 @@ def decompress( else: decompressed = np.concatenate((decompressed, out)) - if config.save_error_bounded_deltas: + if config["save_error_bounded_deltas"]: print("Total Deltas Added - ", deltas_added) - if config.data_dimension == 2 and config.model_type == "dense": + if config["data_dimension"] == 2 and config["model_type"] == "dense": decompressed = decompressed.reshape( (len(decompressed), original_shape[1], original_shape[2]) ) @@ -808,12 +738,12 @@ def perform_hls4ml_conversion(output_path, config): model_path = os.path.join(output_path, "compressed_output", "model.pt") - model_object = data_processing.initialise_model(config.model_name) + model_object = data_processing.initialise_model(config["model_name"]) model = data_processing.load_model( model_object, model_path=model_path, - n_features=config.number_of_columns, - z_dim=config.latent_space_size, + n_features=config["number_of_columns"], + z_dim=config["latent_space_size"], ) model.to("cpu") diff --git a/baler/modules/plotting.py b/baler/modules/plotting.py index aa59656f..f7d9b9be 100644 --- a/baler/modules/plotting.py +++ b/baler/modules/plotting.py @@ -36,16 +36,16 @@ def loss_plot(path_to_loss_data, output_path, config): val_loss = loss_data[1] conf_list = [ len(train_loss), - config.model_name, - config.reg_param, - config.lr, - config.batch_size, + config["model_name"], + config["reg_param"], + config["lr"], + config["batch_size"], ] plt.figure(figsize=(10, 7)) plt.title("Loss plot") plt.plot(train_loss, color="orange", label="Train Loss") - if config.test_size: + if config["test_size"]: plt.plot(val_loss, color="red", label="Validation Loss") for i in range(len(conf_list)): plt.plot([], [], " ", label=str_list[i] + " " + str(conf_list[i])) @@ -108,12 +108,12 @@ def plot_1D(output_path: str, config): config (dataclass): The config class containing attributes set in the config file """ - before_path = config.input_path + before_path = config["input_path"] after_path = os.path.join(output_path, "decompressed_output", "decompressed.npz") before = np.transpose(np.load(before_path)["data"]) after = np.transpose(np.load(after_path)["data"]) - names = np.load(config.input_path)["names"] + names = np.load(config["input_path"])["names"] index_to_cut = get_index_to_cut(3, 1e-6, before) before = np.delete(before, index_to_cut, axis=1) @@ -256,7 +256,7 @@ def plot_2D_old(project_path, config): config (dataclass): The config class containing attributes set in the config file """ - data = np.load(config.input_path)["data"] + data = np.load(config["input_path"])["data"] data_decompressed = np.load(project_path + "/decompressed_output/decompressed.npz")[ "data" ] @@ -266,7 +266,7 @@ def plot_2D_old(project_path, config): else: num_tiles = 1 - if config.model_type == "convolutional" and config.model_name == "Conv_AE_3D": + if config["model_type"] == "convolutional" and config["model_name"] == "Conv_AE_3D": data_decompressed = data_decompressed.reshape( data_decompressed.shape[0] * data_decompressed.shape[2], 1, @@ -276,9 +276,9 @@ def plot_2D_old(project_path, config): 
print("=== Plotting ===") for ind in trange(num_tiles): - if config.model_type == "convolutional": + if config["model_type"] == "convolutional": tile_data_decompressed = data_decompressed[ind][0] * 0.04 * 1000 - elif config.model_type == "dense": + elif config["model_type"] == "dense": tile_data_decompressed = data_decompressed[ind] * 0.04 * 1000 tile_data = data[ind] * 0.04 * 1000 @@ -380,7 +380,7 @@ def plot_2D(project_path, config): config (dataclass): The config class containing attributes set in the config file """ - data = np.load(config.input_path)["data"] + data = np.load(config["input_path"])["data"] data_decompressed = np.load(project_path + "/decompressed_output/decompressed.npz")[ "data" ] @@ -395,7 +395,7 @@ def plot_2D(project_path, config): else: num_tiles = 1 - # if config.model_type == "convolutional" and config.model_name == "Conv_AE_3D": + # if config["model_type"] == "convolutional" and config["model_name"] == "Conv_AE_3D": # data_decompressed = data_decompressed.reshape( # data_decompressed.shape[0] * data_decompressed.shape[2], # 1, @@ -405,9 +405,9 @@ def plot_2D(project_path, config): print("=== Plotting ===") for ind in trange(num_tiles): - # if config.model_type == "convolutional": + # if config["model_type"] == "convolutional": # tile_data_decompressed = data_decompressed[ind][0] - # elif config.model_type == "dense": + # elif config["model_type"] == "dense": # tile_data_decompressed = data_decompressed[ind][0] tile_data = data[ind] tile_data_decompressed = data_decompressed[ind] @@ -444,7 +444,7 @@ def plot(output_path, config): output_path (path): The path to the project directory config (dataclass): The config class containing attributes set in the config file """ - if config.data_dimension == 1: + if config["data_dimension"] == 1: plot_1D(output_path, config) - elif config.data_dimension == 2: + elif config["data_dimension"] == 2: plot_2D(output_path, config) diff --git a/baler/modules/training.py b/baler/modules/training.py index ca189def..3d4fff63 100644 --- a/baler/modules/training.py +++ b/baler/modules/training.py @@ -149,7 +149,7 @@ def seed_worker(worker_id): def train(model, variables, train_data, test_data, project_path, config): """Does the entire training loop by calling the `fit()` and `validate()`. Appart from this, this is the main function where the data is converted - to the correct type for it to be trained, via `torch.Tensor()`. Furthermore, the batching is also done here, based on `config.batch_size`, + to the correct type for it to be trained, via `torch.Tensor()`. Furthermore, the batching is also done here, based on `config["batch_size"]`, and it is the `torch.utils.data.DataLoader` doing the splitting. Applying either `EarlyStopping` or `LR Scheduler` is also done here, all based on their respective `config` arguments. For reproducibility, the seeds can also be fixed in this function. 
@@ -165,7 +165,7 @@ def train(model, variables, train_data, test_data, project_path, config): """ # Fix the random seed - TODO: add flag to make this optional - if config.deterministic_algorithm: + if config["deterministic_algorithm"]: random.seed(0) torch.manual_seed(0) np.random.seed(0) @@ -173,16 +173,16 @@ def train(model, variables, train_data, test_data, project_path, config): g = torch.Generator() g.manual_seed(0) - test_size = config.test_size - learning_rate = config.lr - bs = config.batch_size - reg_param = config.reg_param - rho = config.RHO - l1 = config.l1 - epochs = config.epochs - latent_space_size = config.latent_space_size - intermittent_model_saving = config.intermittent_model_saving - intermittent_saving_patience = config.intermittent_saving_patience + test_size = config["test_size"] + learning_rate = config["lr"] + bs = config["batch_size"] + reg_param = config["reg_param"] + rho = config["RHO"] + l1 = config["l1"] + epochs = config["epochs"] + latent_space_size = config["latent_space_size"] + intermittent_model_saving = config["intermittent_model_saving"] + intermittent_saving_patience = config["intermittent_saving_patience"] model_children = list(model.children()) @@ -191,8 +191,8 @@ def train(model, variables, train_data, test_data, project_path, config): model = model.to(device) # Converting data to tensors - if config.data_dimension == 2: - if config.model_type == "dense": + if config["data_dimension"] == 2: + if config["model_type"] == "dense": # print(train_data.shape) # print(test_data.shape) # sys.exit() @@ -202,7 +202,10 @@ def train(model, variables, train_data, test_data, project_path, config): valid_ds = torch.tensor(test_data, dtype=torch.float32, device=device).view( test_data.shape[0], test_data.shape[1] * test_data.shape[2] ) - elif config.model_type == "convolutional" and config.model_name == "Conv_AE_3D": + elif ( + config["model_type"] == "convolutional" + and config["model_name"] == "Conv_AE_3D" + ): train_ds = torch.tensor( train_data, dtype=torch.float32, device=device ).view( @@ -219,21 +222,21 @@ def train(model, variables, train_data, test_data, project_path, config): train_data.shape[1], train_data.shape[2], ) - elif config.model_type == "convolutional": + elif config["model_type"] == "convolutional": train_ds = torch.tensor( train_data, dtype=torch.float32, device=device ).view(train_data.shape[0], 1, train_data.shape[1], train_data.shape[2]) valid_ds = torch.tensor(test_data, dtype=torch.float32, device=device).view( train_data.shape[0], 1, train_data.shape[1], train_data.shape[2] ) - elif config.data_dimension == 1: + elif config["data_dimension"] == 1: train_ds = torch.tensor(train_data, dtype=torch.float64, device=device) valid_ds = torch.tensor(test_data, dtype=torch.float64, device=device) # Pushing input data into the torch-DataLoader object and combines into one DataLoader object (a basic wrapper # around several DataLoader objects). 
- if config.deterministic_algorithm: + if config["deterministic_algorithm"]: train_dl = DataLoader( train_ds, batch_size=bs, @@ -266,15 +269,15 @@ def train(model, variables, train_data, test_data, project_path, config): optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) # Activate early stopping - if config.early_stopping: + if config["early_stopping"]: early_stopping = utils.EarlyStopping( - patience=config.early_stopping_patience, min_delta=config.min_delta + patience=config["early_stopping_patience"], min_delta=config["min_delta"] ) # Changes to patience & min_delta can be made in configs # Activate LR Scheduler - if config.lr_scheduler: + if config["lr_scheduler"]: lr_scheduler = utils.LRScheduler( - optimizer=optimizer, patience=config.lr_scheduler_patience + optimizer=optimizer, patience=config["lr_scheduler_patience"] ) # Training and Validation of the model @@ -283,7 +286,7 @@ def train(model, variables, train_data, test_data, project_path, config): start = time.time() # Registering hooks for activation extraction - if config.activation_extraction: + if config["activation_extraction"]: hooks = model.store_hooks() for epoch in range(epochs): @@ -299,7 +302,7 @@ def train(model, variables, train_data, test_data, project_path, config): latent_dim=latent_space_size, RHO=rho, l1=l1, - n_dimensions=config.data_dimension, + n_dimensions=config["data_dimension"], ) train_loss.append(train_epoch_loss) @@ -315,9 +318,9 @@ def train(model, variables, train_data, test_data, project_path, config): val_epoch_loss = train_epoch_loss val_loss.append(val_epoch_loss) - if config.lr_scheduler: + if config["lr_scheduler"]: lr_scheduler(val_epoch_loss) - if config.early_stopping: + if config["early_stopping"]: early_stopping(val_epoch_loss) if early_stopping.early_stop: break @@ -331,7 +334,7 @@ def train(model, variables, train_data, test_data, project_path, config): end = time.time() # Saving activations values - if config.activation_extraction: + if config["activation_extraction"]: activations = diagnostics.dict_to_square_matrix(model.get_activations()) model.detach_hooks(hooks) np.save(os.path.join(project_path, "activations.npy"), activations) @@ -341,7 +344,7 @@ def train(model, variables, train_data, test_data, project_path, config): os.path.join(project_path, "loss_data.npy"), np.array([train_loss, val_loss]) ) - if config.model_type == "convolutional": + if config.get("model_type") == "convolutional": final_layer = model.get_final_layer_dims() np.save(os.path.join(project_path, "final_layer.npy"), np.array(final_layer)) diff --git a/config.yaml b/config.yaml new file mode 100644 index 00000000..db18199e --- /dev/null +++ b/config.yaml @@ -0,0 +1,29 @@ +config: + data_dimension: 1 + compression_ratio: 2.0 + apply_normalization: True + model_name: "AE" + model_type: "dense" + epochs: 5 + lr: 0.001 + batch_size: 512 + early_stopping: True + lr_scheduler: True + early_stopping_patience: 100 + min_delta: 0 + lr_scheduler_patience: 50 + custom_norm: False + reg_param: 0.001 + RHO: 0.05 + test_size: 0 + #number_of_columns: 24 + #latent_space_size: 12 + extra_compression: False + intermittent_model_saving: False + intermittent_saving_patience: 100 + mse_avg: False + mse_sum: True + emd: False + l1: True + activation_extraction: False + deterministic_algorithm: True diff --git a/poetry.lock b/poetry.lock index 7b949e57..ad2628cc 100644 --- a/poetry.lock +++ b/poetry.lock @@ -407,13 +407,13 @@ typing = ["typing-extensions (>=4.8)"] [[package]] name = "flatbuffers" -version = "23.5.26" 
+version = "24.3.6" description = "The FlatBuffers serialization format for Python" optional = true python-versions = "*" files = [ - {file = "flatbuffers-23.5.26-py2.py3-none-any.whl", hash = "sha256:c0ff356da363087b915fde4b8b45bdda73432fc17cddb3c8157472eab1422ad1"}, - {file = "flatbuffers-23.5.26.tar.gz", hash = "sha256:9ea1144cac05ce5d86e2859f431c6cd5e66cd9c78c558317c7955fb8d4c78d89"}, + {file = "flatbuffers-24.3.6-py2.py3-none-any.whl", hash = "sha256:0bc1a9d968c0ba996f97b8c255214ec005cf21b962ef60157f7aa2fc647481f1"}, + {file = "flatbuffers-24.3.6.tar.gz", hash = "sha256:8d90a756ad5754be1fcdaa77065065125c9832ed045b4078875b4d3bc1953352"}, ] [[package]] @@ -724,32 +724,32 @@ files = [ [[package]] name = "importlib-metadata" -version = "7.0.1" +version = "7.0.2" description = "Read metadata from Python packages" optional = true python-versions = ">=3.8" files = [ - {file = "importlib_metadata-7.0.1-py3-none-any.whl", hash = "sha256:4805911c3a4ec7c3966410053e9ec6a1fecd629117df5adee56dfc9432a1081e"}, - {file = "importlib_metadata-7.0.1.tar.gz", hash = "sha256:f238736bb06590ae52ac1fab06a3a9ef1d8dce2b7a35b5ab329371d6c8f5d2cc"}, + {file = "importlib_metadata-7.0.2-py3-none-any.whl", hash = "sha256:f4bc4c0c070c490abf4ce96d715f68e95923320370efb66143df00199bb6c100"}, + {file = "importlib_metadata-7.0.2.tar.gz", hash = "sha256:198f568f3230878cb1b44fbd7975f87906c22336dba2e4a7f05278c281fbd792"}, ] [package.dependencies] zipp = ">=0.5" [package.extras] -docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (<7.2.5)", "sphinx (>=3.5)", "sphinx-lint"] +docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] perf = ["ipython"] -testing = ["flufl.flake8", "importlib-resources (>=1.3)", "packaging", "pyfakefs", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-mypy (>=0.9.1)", "pytest-perf (>=0.9.2)", "pytest-ruff"] +testing = ["flufl.flake8", "importlib-resources (>=1.3)", "packaging", "pyfakefs", "pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-mypy", "pytest-perf (>=0.9.2)", "pytest-ruff (>=0.2.1)"] [[package]] name = "importlib-resources" -version = "6.1.2" +version = "6.1.3" description = "Read resources from Python packages" optional = false python-versions = ">=3.8" files = [ - {file = "importlib_resources-6.1.2-py3-none-any.whl", hash = "sha256:9a0a862501dc38b68adebc82970140c9e4209fc99601782925178f8386339938"}, - {file = "importlib_resources-6.1.2.tar.gz", hash = "sha256:308abf8474e2dba5f867d279237cd4076482c3de7104a40b41426370e891549b"}, + {file = "importlib_resources-6.1.3-py3-none-any.whl", hash = "sha256:4c0269e3580fe2634d364b39b38b961540a7738c02cb984e98add8b4221d793d"}, + {file = "importlib_resources-6.1.3.tar.gz", hash = "sha256:56fb4525197b78544a3354ea27793952ab93f935bb4bf746b846bb1015020f2b"}, ] [package.dependencies] @@ -757,7 +757,7 @@ zipp = {version = ">=3.1.0", markers = "python_version < \"3.10\""} [package.extras] docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (<7.2.5)", "sphinx (>=3.5)", "sphinx-lint"] -testing = ["pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-mypy", "pytest-ruff (>=0.2.1)", "zipp (>=3.17)"] +testing = ["jaraco.collections", "pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-mypy", "pytest-ruff 
(>=0.2.1)", "zipp (>=3.17)"] [[package]] name = "iniconfig" @@ -1777,7 +1777,7 @@ six = ">=1.5" name = "pyyaml" version = "6.0.1" description = "YAML parser and emitter for Python" -optional = true +optional = false python-versions = ">=3.6" files = [ {file = "PyYAML-6.0.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d858aa552c999bc8a8d57426ed01e40bef403cd8ccdd0fc5f6f04a00414cac2a"}, @@ -2488,4 +2488,4 @@ hls4ml = ["hls4ml", "tensorflow"] [metadata] lock-version = "2.0" python-versions = ">=3.8, <3.11.10" -content-hash = "0b0cf4871b8e171e15c6001e9f68266ef3585ecf750d1aee374b68f94ee3cf72" +content-hash = "08a5afdbc389785e91457840486a9ba5a87b8aed19ac7bd82e6c76e2cc6ccaba" diff --git a/pyproject.toml b/pyproject.toml index 193d2759..04496ff3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,6 +17,7 @@ scikit-learn = "^1.2.0" hls4ml = { version = "^0.7.1", optional = true } tensorflow = { version = "^2.12.0", optional = true } numpy = "1.23.5" +pyyaml = "^6.0.1" [tool.poetry.group.dev.dependencies] pytest = "^7.2.1" diff --git a/tests/test_data_processing.py b/tests/test_data_processing.py index 1cfc479e..390d1b4e 100644 --- a/tests/test_data_processing.py +++ b/tests/test_data_processing.py @@ -23,17 +23,6 @@ from baler.modules import helper -def test_import_config_success(): - # Call the import_config function with the sample config file path - config = helper.Config - config.Foo = "Bar" - config.Baz = 10 - - # Assert that the result is equal to the expected config - # This checks that the import_config function correctly loads the JSON file and returns the expected dictionary - assert config.Foo == "Bar" - - def test_save_model(): # Test data model = torch.nn.Linear(3, 2) diff --git a/tests/unit/test_config_creation.py b/tests/unit/test_config_creation.py new file mode 100644 index 00000000..10384cb2 --- /dev/null +++ b/tests/unit/test_config_creation.py @@ -0,0 +1,6 @@ +from baler.modules.config.config_service import load_config + + +def test_read_config(): + config = load_config("config.yaml") + assert config is not None From d017c52b3b49530b3f78666fa4d337074f8189e8 Mon Sep 17 00:00:00 2001 From: axelgallen Date: Thu, 7 Mar 2024 16:54:18 +0000 Subject: [PATCH 2/4] Fixed place where old config notation was left --- baler/modules/plotting.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/baler/modules/plotting.py b/baler/modules/plotting.py index f7d9b9be..849d2f2f 100644 --- a/baler/modules/plotting.py +++ b/baler/modules/plotting.py @@ -385,7 +385,7 @@ def plot_2D(project_path, config): "data" ] - if config.convert_to_blocks: + if config["convert_to_blocks"]: data_decompressed = data_decompressed.reshape( data.shape[0], data.shape[1], data.shape[2] ) From bda6b871921ddda8efc26ab697526df136ef672f Mon Sep 17 00:00:00 2001 From: Fritjof Bengtsson Date: Fri, 8 Mar 2024 13:44:57 +0000 Subject: [PATCH 3/4] add empty input_path to template config --- config.yaml | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/config.yaml b/config.yaml index db18199e..07158de6 100644 --- a/config.yaml +++ b/config.yaml @@ -1,4 +1,5 @@ config: + input_path: "" data_dimension: 1 compression_ratio: 2.0 apply_normalization: True @@ -27,3 +28,30 @@ config: l1: True activation_extraction: False deterministic_algorithm: True + type_list: [ + "float64", + "float64", + "float64", + "float64", + "float64", + "float64", + "float64", + "float64", + "float64", + "float64", + "float64", + "float64", + "int", + "int", + "int", + "int", + "int", + "int", + "int", + 
"float64", + "float64", + "float64", + "int", + "int", + ] + convert_to_blocks: False \ No newline at end of file From f956fd92db63e3c70c26e5ec77c3f5eedfc0b206 Mon Sep 17 00:00:00 2001 From: Fritjof Bengtsson Date: Fri, 8 Mar 2024 13:48:19 +0000 Subject: [PATCH 4/4] improve error message when input_path is unset --- baler/modules/helper.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/baler/modules/helper.py b/baler/modules/helper.py index b8af4a0b..a9bee818 100644 --- a/baler/modules/helper.py +++ b/baler/modules/helper.py @@ -214,8 +214,8 @@ def process( normalization features. """ - if input_path is None: - raise ValueError("Input path is None") + if input_path is None or input_path == "": + raise ValueError("Input path is None, did you forget to set it in the configuration file?") loaded = np.load(input_path) data = loaded["data"]