From 359a6ec187e2e3046fbaded4560ac0e2514b7f79 Mon Sep 17 00:00:00 2001 From: Fritjof Bengtsson Date: Thu, 7 Mar 2024 13:32:54 +0000 Subject: [PATCH 1/4] moved config to yaml, remove unused tests --- baler/baler.py | 112 ++++++------- baler/modules/config/config_service.py | 8 + baler/modules/helper.py | 218 +++++++++---------------- baler/modules/plotting.py | 34 ++-- baler/modules/training.py | 59 +++---- config.yaml | 29 ++++ poetry.lock | 28 ++-- pyproject.toml | 1 + tests/test_data_processing.py | 11 -- tests/unit/test_config_creation.py | 6 + 10 files changed, 237 insertions(+), 269 deletions(-) create mode 100644 baler/modules/config/config_service.py create mode 100644 config.yaml create mode 100644 tests/unit/test_config_creation.py diff --git a/baler/baler.py b/baler/baler.py index f7bcfb2e..7cea2835 100644 --- a/baler/baler.py +++ b/baler/baler.py @@ -20,7 +20,6 @@ from .modules import helper import gzip -from .modules.profiling import pytorch_profile __all__ = ( @@ -85,7 +84,7 @@ def perform_training(output_path, config, verbose: bool): """Main function calling the training functions, ran when --mode=train is selected. The three functions called are: `helper.process`, `helper.mode_init` and `helper.training`. - Depending on `config.data_dimensions`, the calculated latent space size will differ. + Depending on `config["data_dimension"]`, the calculated latent space size will differ. Args: output_path (path): Selects base path for determining output path @@ -101,12 +100,12 @@ def perform_training(output_path, config, verbose: bool): normalization_features, original_shape, ) = helper.process( - config.input_path, - config.custom_norm, - config.test_size, - config.apply_normalization, - config.convert_to_blocks if hasattr(config, "convert_to_blocks") else None, - verbose, + input_path=config.get("input_path"), + custom_norm=config.get("custom_norm"), + test_size=config.get("test_size"), + apply_normalization=config.get("apply_normalization"), + convert_to_blocks=config.get("convert_to_blocks"), + verbose=verbose, ) if verbose: @@ -114,52 +113,55 @@ def perform_training(output_path, config, verbose: bool): try: n_features = 0 - if config.data_dimension == 1: - number_of_columns = train_set_norm.shape[1] - config.latent_space_size = ceil( - number_of_columns / config.compression_ratio - ) - config.number_of_columns = number_of_columns - n_features = number_of_columns - elif config.data_dimension == 2: - if config.model_type == "dense": - number_of_rows = train_set_norm.shape[1] - number_of_columns = train_set_norm.shape[2] - n_features = number_of_columns * number_of_rows - else: - number_of_rows = original_shape[1] - number_of_columns = original_shape[2] + data_dimension = config["data_dimension"] + match data_dimension: + case 1: + number_of_columns = train_set_norm.shape[1] + config["latent_space_size"] = ceil( + number_of_columns / config["compression_ratio"] + ) + config["number_of_columns"] = number_of_columns n_features = number_of_columns - config.latent_space_size = ceil( - (number_of_rows * number_of_columns) / config.compression_ratio - ) - config.number_of_columns = number_of_columns - else: - raise NameError( - "Data dimension can only be 1 or 2. 
Got config.data_dimension value = " - + str(config.data_dimension) - ) + case 2: + if config["model_type"] == "dense": + number_of_rows = train_set_norm.shape[1] + number_of_columns = train_set_norm.shape[2] + n_features = number_of_columns * number_of_rows + else: + number_of_rows = original_shape[1] + number_of_columns = original_shape[2] + n_features = number_of_columns + config["latent_space_size"] = ceil( + (number_of_rows * number_of_columns) / config["compression_ratio"] + ) + config["number_of_columns"] = number_of_columns + case _: + raise NameError( + "Data dimension can only be 1 or 2. Got data_dimension value = " + + str(config["data_dimension"]) + ) except AttributeError: if verbose: print( - f"{config.number_of_columns} -> {config.latent_space_size} dimensions" + f"{config['number_of_columns']} -> {config['latent_space_size']} dimensions" ) - assert number_of_columns == config.number_of_columns + + assert number_of_columns == config["number_of_columns"] if verbose: print( - f"Intitalizing Model with Latent Size - {config.latent_space_size} and Features - {n_features}" + f"Intitalizing Model with Latent Size - {config['latent_space_size']} and Features - {n_features}" ) device = helper.get_device() if verbose: print(f"Device used for training: {device}") - model_object = helper.model_init(config.model_name) - model = model_object(n_features=n_features, z_dim=config.latent_space_size) + model_object = helper.model_init(config["model_name"]) + model = model_object(n_features=n_features, z_dim=config["latent_space_size"]) model.to(device) - if config.model_name == "Conv_AE_3D" and hasattr( + if config["model_name"] == "Conv_AE_3D" and hasattr( config, "compress_to_latent_space" ): model.set_compress_to_latent_space(config.compress_to_latent_space) @@ -178,7 +180,7 @@ def perform_training(output_path, config, verbose: bool): if verbose: print("Training complete") - if config.apply_normalization: + if config["apply_normalization"]: np.save( os.path.join(training_path, "normalization_features.npy"), normalization_features, @@ -188,7 +190,7 @@ def perform_training(output_path, config, verbose: bool): f"Normalization features saved to {os.path.join(training_path, 'normalization_features.npy')}" ) - if config.separate_model_saving: + if hasattr(config, "separate_model_saving") and config["separate_model_saving"]: helper.encoder_decoder_saver( trained_model, os.path.join(output_path, "compressed_output", "encoder.pt"), @@ -240,7 +242,7 @@ def perform_compression(output_path, config, verbose: bool): """Main function calling the compression functions, ran when --mode=compress is selected. The main function being called here is: `helper.compress` - If `config.extra_compression` is selected, the compressed file is further compressed via zip + If `config["extra_compression"]` is selected, the compressed file is further compressed via zip Else, the function returns a compressed file of `.npz`, only compressed by Baler. 
Args: @@ -252,17 +254,17 @@ def perform_compression(output_path, config, verbose: bool): An `.npz` file which includes: - The compressed data - The data headers - - Normalization features if `config.apply_normalization=True` + - Normalization features if `config["apply_normalization"]=True` """ print("Compressing...") start = time.time() normalization_features = [] - if config.apply_normalization: + if config["apply_normalization"]: normalization_features = np.load( os.path.join(output_path, "training", "normalization_features.npy") ) - if config.separate_model_saving: + if config.get("separate_model_saving"): ( compressed, error_bound_batch, @@ -287,9 +289,9 @@ def perform_compression(output_path, config, verbose: bool): print("Compression took:", f"{(end - start) / 60:.3} minutes") - names = np.load(config.input_path)["names"] + names = np.load(config["input_path"])["names"] - if config.extra_compression: + if config["extra_compression"]: if verbose: print("Extra compression selected") print( @@ -313,7 +315,7 @@ def perform_compression(output_path, config, verbose: bool): names=names, normalization_features=normalization_features, ) - if config.save_error_bounded_deltas: + if config["save_error_bounded_deltas"]: error_bound_batch_index = np.array( [error_bound_batch, error_bound_index], dtype=object ) @@ -342,7 +344,7 @@ def perform_decompression(output_path, config, verbose: bool): """Main function calling the decompression functions, ran when --mode=decompress is selected. The main function being called here is: `helper.decompress` - If `config.apply_normalization=True` the output is un-normalized with the same normalization features saved from `perform_training()`. + If `config["apply_normalization"]=True` the output is un-normalized with the same normalization features saved from `perform_training()`. 
Args: output_path (path): Selects base path for determining output path @@ -352,9 +354,9 @@ def perform_decompression(output_path, config, verbose: bool): print("Decompressing...") start = time.time() - model_name = config.model_name - data_before = np.load(config.input_path)["data"] - if config.separate_model_saving: + model_name = config["model_name"] + data_before = np.load(config["input_path"])["data"] + if config.get("separate_model_saving"): decompressed, names, normalization_features = helper.decompress( model_path=os.path.join(output_path, "compressed_output", "decoder.pt"), input_path=os.path.join(output_path, "compressed_output", "compressed.npz"), @@ -398,7 +400,7 @@ def perform_decompression(output_path, config, verbose: bool): "Target Shape - ", data_before.shape, ) - if config.model_type == "dense": + if config["model_type"] == "dense": decompressed = decompressed.reshape( data_before.shape[0], data_before.shape[1], data_before.shape[2] ) @@ -407,7 +409,7 @@ def perform_decompression(output_path, config, verbose: bool): data_before.shape[0], 1, data_before.shape[1], data_before.shape[2] ) - if config.apply_normalization: + if config["apply_normalization"]: print("Un-normalizing...") normalization_features = np.load( os.path.join(output_path, "training", "normalization_features.npy"), @@ -437,7 +439,7 @@ def perform_decompression(output_path, config, verbose: bool): end = time.time() print("Decompression took:", f"{(end - start) / 60:.3} minutes") - if config.extra_compression: + if config["extra_compression"]: if verbose: print("Extra compression selected") print( @@ -467,7 +469,7 @@ def print_info(output_path, config): "================================== \n Information about your compression \n================================== " ) - original = config.input_path + original = config["input_path"] compressed_path = os.path.join(output_path, "compressed_output") decompressed_path = os.path.join(output_path, "decompressed_output") training_path = os.path.join(output_path, "training") diff --git a/baler/modules/config/config_service.py b/baler/modules/config/config_service.py new file mode 100644 index 00000000..0eb28427 --- /dev/null +++ b/baler/modules/config/config_service.py @@ -0,0 +1,8 @@ +from typing import Any +import yaml + + +def load_config(file_path) -> dict[str, Any]: + with open(file_path, "r") as file: + config = yaml.safe_load(file).get("config") + return config diff --git a/baler/modules/helper.py b/baler/modules/helper.py index 705559d3..b8af4a0b 100644 --- a/baler/modules/helper.py +++ b/baler/modules/helper.py @@ -15,13 +15,15 @@ import argparse import importlib import os +import shutil import sys -from dataclasses import dataclass from math import ceil import gzip +from pyparsing import Any from tqdm import tqdm + sys.path.append(os.getcwd()) import numpy as np import torch @@ -29,6 +31,7 @@ from sklearn.model_selection import train_test_split from ..modules import training, plotting, data_processing, diagnostics +from ..modules.config import config_service def get_arguments(): @@ -82,15 +85,18 @@ def get_arguments(): workspace_name = args.project[0] project_name = args.project[1] - config_path = ( - f"workspaces.{workspace_name}.{project_name}.config.{project_name}_config" - ) - - if args.mode == "newProject": - config = None - else: - config = Config - importlib.import_module(config_path).set_config(config) + config: dict[str, Any] = {} + + if args.mode != "newProject": + path_to_config = os.path.join( + "workspaces", + workspace_name, + project_name, + 
"config", + f"{project_name}_config.yaml", + ) + print(f"Trying to load config from {path_to_config}") + config = config_service.load_config(path_to_config) return ( config, @@ -140,94 +146,10 @@ def create_new_project( print(f"Creating directory {directory}...") os.makedirs(directory, exist_ok=True) - # Populate default config - with open( - os.path.join(project_path, "config", f"{project_name}_config.py"), "w" - ) as f: - f.write(create_default_config(workspace_name, project_name)) - - -@dataclass -class Config: - """Defines a configuration dataclass""" - - input_path: str - compression_ratio: float - epochs: int - early_stopping: bool - early_stoppin_patience: int - lr_scheduler: bool - lr_scheduler_patience: int - min_delta: int - model_name: str - model_type = str - custom_norm: bool - l1: bool - reg_param: float - RHO: float - lr: float - batch_size: int - test_size: float - data_dimension: int - intermittent_model_saving: bool - intermittent_saving_patience: int - mse_avg: bool - mse_sum: bool - emd: bool - l1: bool - deterministic_algorithm: bool - - -def create_default_config(workspace_name: str, project_name: str) -> str: - """Creates a default config file for a project. - Args: - workspace_name (str): Name of the workspace. - project_name (str): Name of the project. - Returns: - str: Default config file. - """ - - return f""" -# === Configuration options === - -def set_config(c): - c.input_path = "workspaces/{workspace_name}/data/{project_name}_data.npz" - c.data_dimension = 1 - c.compression_ratio = 2.0 - c.apply_normalization = True - c.model_name = "AE" - c.model_type = "dense" - c.epochs = 5 - c.lr = 0.001 - c.batch_size = 512 - c.early_stopping = True - c.lr_scheduler = True - - - - -# === Additional configuration options === - - c.early_stopping_patience = 100 - c.min_delta = 0 - c.lr_scheduler_patience = 50 - c.custom_norm = False - c.reg_param = 0.001 - c.RHO = 0.05 - c.test_size = 0 - # c.number_of_columns = 24 - # c.latent_space_size = 12 - c.extra_compression = False - c.intermittent_model_saving = False - c.intermittent_saving_patience = 100 - c.mse_avg = False - c.mse_sum = True - c.emd = False - c.l1 = True - c.activation_extraction = False - c.deterministic_algorithm = True - -""" + shutil.copy2( + "config.yaml", + os.path.join(project_path, "config", f"{project_name}_config.yaml"), + ) def model_init(model_name: str): @@ -291,6 +213,10 @@ def process( Returns: ndarray, ndarray, ndarray: Array with the train set, array with the test set and array with the normalization features. 
""" + + if input_path is None: + raise ValueError("Input path is None") + loaded = np.load(input_path) data = loaded["data"] @@ -485,8 +411,8 @@ def compress(model_path, config): torch.Tensor: Compressed data as PyTorch tensor """ - # Loads the data and applies normalization if config.apply_normalization = True - loaded = np.load(config.input_path) + # Loads the data and applies normalization if config["apply_normalization"] = True + loaded = np.load(config["input_path"]) data_before = loaded["data"] original_shape = data_before.shape @@ -495,71 +421,75 @@ def compress(model_path, config): config.convert_to_blocks, data_before ) - if config.apply_normalization: + if config["apply_normalization"]: print("Normalizing...") - data = normalize(data_before, config.custom_norm) + data = normalize(data_before, config["custom_norm"]) else: data = data_before number_of_columns = 0 try: n_features = 0 - if config.data_dimension == 1: - column_names = np.load(config.input_path)["names"] + if config["data_dimension"] == 1: + column_names = np.load(config["input_path"])["names"] number_of_columns = len(column_names) - config.latent_space_size = ceil( - number_of_columns / config.compression_ratio + config["latent_space_size"] = ceil( + number_of_columns / config["compression_ratio"] ) - config.number_of_columns = number_of_columns + config["number_of_columns"] = number_of_columns n_features = number_of_columns - elif config.data_dimension == 2: - if config.model_type == "dense": + elif config["data_dimension"] == 2: + if config["model_type"] == "dense": number_of_rows = data.shape[1] - config.number_of_columns = data.shape[2] - n_features = number_of_rows * config.number_of_columns + config["number_of_columns"] = data.shape[2] + n_features = number_of_rows * config["number_of_columns"] else: number_of_rows = original_shape[1] - config.number_of_columns = original_shape[2] - n_features = config.number_of_columns - config.latent_space_size = ceil( - (number_of_rows * config.number_of_columns) / config.compression_ratio + config["number_of_columns"] = original_shape[2] + n_features = config["number_of_columns"] + config["latent_space_size"] = ceil( + (number_of_rows * config["number_of_columns"]) + / config["compression_ratio"] ) else: raise NameError( - "Data dimension can only be 1 or 2. Got config.data_dimension = " - + str(config.data_dimension) + "Data dimension can only be 1 or 2. Got data_dimension = " + + str(config["data_dimension"]) ) - except AttributeError: - number_of_columns = config.number_of_columns - latent_space_size = config.latent_space_size + except KeyError: + number_of_columns = config["number_of_columns"] + latent_space_size = config["latent_space_size"] print(f"{number_of_columns} -> {latent_space_size} dimensions") # Initialise and load the model correctly. 
- latent_space_size = config.latent_space_size - bs = config.batch_size + latent_space_size = config["latent_space_size"] + bs = config["batch_size"] device = get_device() - model_object = data_processing.initialise_model(config.model_name) + model_object = data_processing.initialise_model(config["model_name"]) model = data_processing.load_model( model_object, model_path=model_path, n_features=n_features, - z_dim=config.latent_space_size, + z_dim=config["latent_space_size"], ) model.eval() - if config.data_dimension == 2: - if config.model_type == "convolutional" and config.model_name == "Conv_AE_3D": + if config["data_dimension"] == 2: + if ( + config["model_type"] == "convolutional" + and config["model_name"] == "Conv_AE_3D" + ): data_tensor = torch.tensor(data, dtype=torch.float32).view( data.shape[0] // bs, 1, bs, data.shape[1], data.shape[2] ) - elif config.model_type == "convolutional": + elif config["model_type"] == "convolutional": data_tensor = torch.tensor(data, dtype=torch.float32).view( data.shape[0], 1, data.shape[1], data.shape[2] ) - elif config.model_type == "dense": + elif config["model_type"] == "dense": data_tensor = torch.tensor(data, dtype=torch.float32).view( data.shape[0], data.shape[1] * data.shape[2] ) - elif config.data_dimension == 1: + elif config["data_dimension"] == 1: data_tensor = torch.tensor(data, dtype=torch.float64) # Batching data to avoid memory leaks @@ -584,7 +514,7 @@ def compress(model_path, config): compressed_output = model.encode(data_batch) - if config.save_error_bounded_deltas: + if config["save_error_bounded_deltas"]: decoded_output = model.decode(compressed_output) decoded_output = detacher(decoded_output) deltas_compressed = 0 @@ -592,7 +522,7 @@ def compress(model_path, config): compressed_output = detacher(compressed_output) data_batch = detacher(data_batch) - if config.save_error_bounded_deltas: + if config["save_error_bounded_deltas"]: ( deltas, rms_pred_error_index, @@ -608,7 +538,7 @@ def compress(model_path, config): else: compressed = np.concatenate((compressed, compressed_output)) - if config.save_error_bounded_deltas: + if config["save_error_bounded_deltas"]: print("Total Deltas Found - ", deltas_compressed) return (compressed, error_bound_batch, error_bound_deltas, error_bound_index) @@ -645,12 +575,12 @@ def decompress( names = loaded["names"] normalization_features = loaded["normalization_features"] - if config.model_type == "convolutional": + if config.get("model_type") == "convolutional": final_layer_details = np.load( os.path.join(output_path, "training", "final_layer.npy"), allow_pickle=True ) - if config.save_error_bounded_deltas: + if config["save_error_bounded_deltas"]: loaded_deltas = np.load( gzip.GzipFile(input_path_deltas, "r"), allow_pickle=True ) @@ -662,11 +592,11 @@ def decompress( error_bound_index = loaded_batch_indexes[1] deltas_added = 0 - model_name = config.model_name + model_name = config["model_name"] latent_space_size = len(data[0]) - bs = config.batch_size + bs = config["batch_size"] model_dict = torch.load(str(model_path), map_location=get_device()) - if config.data_dimension == 2 and config.model_type == "dense": + if config["data_dimension"] == 2 and config["model_type"] == "dense": number_of_columns = int((len(model_dict[list(model_dict.keys())[-1]]))) else: number_of_columns = len(model_dict[list(model_dict.keys())[-1]]) @@ -682,7 +612,7 @@ def decompress( ) model.eval() - if config.model_type == "convolutional": + if config.get("model_type") == "convolutional": 
model.set_final_layer_dims(final_layer_details) # Load the data, convert to tensor and batch it to avoid memory leaks @@ -703,7 +633,7 @@ def decompress( out = model.decode(data_batch).to(device) # Converting back to numpyarray out = detacher(out) - if config.save_error_bounded_deltas: + if config["save_error_bounded_deltas"]: if idx in error_bound_batch: # Error Bounded Deltas added to Decompressed output delta_idx = np.where(error_bound_batch == idx) @@ -720,10 +650,10 @@ def decompress( else: decompressed = np.concatenate((decompressed, out)) - if config.save_error_bounded_deltas: + if config["save_error_bounded_deltas"]: print("Total Deltas Added - ", deltas_added) - if config.data_dimension == 2 and config.model_type == "dense": + if config["data_dimension"] == 2 and config["model_type"] == "dense": decompressed = decompressed.reshape( (len(decompressed), original_shape[1], original_shape[2]) ) @@ -808,12 +738,12 @@ def perform_hls4ml_conversion(output_path, config): model_path = os.path.join(output_path, "compressed_output", "model.pt") - model_object = data_processing.initialise_model(config.model_name) + model_object = data_processing.initialise_model(config["model_name"]) model = data_processing.load_model( model_object, model_path=model_path, - n_features=config.number_of_columns, - z_dim=config.latent_space_size, + n_features=config["number_of_columns"], + z_dim=config["latent_space_size"], ) model.to("cpu") diff --git a/baler/modules/plotting.py b/baler/modules/plotting.py index aa59656f..f7d9b9be 100644 --- a/baler/modules/plotting.py +++ b/baler/modules/plotting.py @@ -36,16 +36,16 @@ def loss_plot(path_to_loss_data, output_path, config): val_loss = loss_data[1] conf_list = [ len(train_loss), - config.model_name, - config.reg_param, - config.lr, - config.batch_size, + config["model_name"], + config["reg_param"], + config["lr"], + config["batch_size"], ] plt.figure(figsize=(10, 7)) plt.title("Loss plot") plt.plot(train_loss, color="orange", label="Train Loss") - if config.test_size: + if config["test_size"]: plt.plot(val_loss, color="red", label="Validation Loss") for i in range(len(conf_list)): plt.plot([], [], " ", label=str_list[i] + " " + str(conf_list[i])) @@ -108,12 +108,12 @@ def plot_1D(output_path: str, config): config (dataclass): The config class containing attributes set in the config file """ - before_path = config.input_path + before_path = config["input_path"] after_path = os.path.join(output_path, "decompressed_output", "decompressed.npz") before = np.transpose(np.load(before_path)["data"]) after = np.transpose(np.load(after_path)["data"]) - names = np.load(config.input_path)["names"] + names = np.load(config["input_path"])["names"] index_to_cut = get_index_to_cut(3, 1e-6, before) before = np.delete(before, index_to_cut, axis=1) @@ -256,7 +256,7 @@ def plot_2D_old(project_path, config): config (dataclass): The config class containing attributes set in the config file """ - data = np.load(config.input_path)["data"] + data = np.load(config["input_path"])["data"] data_decompressed = np.load(project_path + "/decompressed_output/decompressed.npz")[ "data" ] @@ -266,7 +266,7 @@ def plot_2D_old(project_path, config): else: num_tiles = 1 - if config.model_type == "convolutional" and config.model_name == "Conv_AE_3D": + if config["model_type"] == "convolutional" and config["model_name"] == "Conv_AE_3D": data_decompressed = data_decompressed.reshape( data_decompressed.shape[0] * data_decompressed.shape[2], 1, @@ -276,9 +276,9 @@ def plot_2D_old(project_path, config): 
print("=== Plotting ===") for ind in trange(num_tiles): - if config.model_type == "convolutional": + if config["model_type"] == "convolutional": tile_data_decompressed = data_decompressed[ind][0] * 0.04 * 1000 - elif config.model_type == "dense": + elif config["model_type"] == "dense": tile_data_decompressed = data_decompressed[ind] * 0.04 * 1000 tile_data = data[ind] * 0.04 * 1000 @@ -380,7 +380,7 @@ def plot_2D(project_path, config): config (dataclass): The config class containing attributes set in the config file """ - data = np.load(config.input_path)["data"] + data = np.load(config["input_path"])["data"] data_decompressed = np.load(project_path + "/decompressed_output/decompressed.npz")[ "data" ] @@ -395,7 +395,7 @@ def plot_2D(project_path, config): else: num_tiles = 1 - # if config.model_type == "convolutional" and config.model_name == "Conv_AE_3D": + # if config["model_type"] == "convolutional" and config["model_name"] == "Conv_AE_3D": # data_decompressed = data_decompressed.reshape( # data_decompressed.shape[0] * data_decompressed.shape[2], # 1, @@ -405,9 +405,9 @@ def plot_2D(project_path, config): print("=== Plotting ===") for ind in trange(num_tiles): - # if config.model_type == "convolutional": + # if config["model_type"] == "convolutional": # tile_data_decompressed = data_decompressed[ind][0] - # elif config.model_type == "dense": + # elif config["model_type"] == "dense": # tile_data_decompressed = data_decompressed[ind][0] tile_data = data[ind] tile_data_decompressed = data_decompressed[ind] @@ -444,7 +444,7 @@ def plot(output_path, config): output_path (path): The path to the project directory config (dataclass): The config class containing attributes set in the config file """ - if config.data_dimension == 1: + if config["data_dimension"] == 1: plot_1D(output_path, config) - elif config.data_dimension == 2: + elif config["data_dimension"] == 2: plot_2D(output_path, config) diff --git a/baler/modules/training.py b/baler/modules/training.py index ca189def..3d4fff63 100644 --- a/baler/modules/training.py +++ b/baler/modules/training.py @@ -149,7 +149,7 @@ def seed_worker(worker_id): def train(model, variables, train_data, test_data, project_path, config): """Does the entire training loop by calling the `fit()` and `validate()`. Appart from this, this is the main function where the data is converted - to the correct type for it to be trained, via `torch.Tensor()`. Furthermore, the batching is also done here, based on `config.batch_size`, + to the correct type for it to be trained, via `torch.Tensor()`. Furthermore, the batching is also done here, based on `config["batch_size"]`, and it is the `torch.utils.data.DataLoader` doing the splitting. Applying either `EarlyStopping` or `LR Scheduler` is also done here, all based on their respective `config` arguments. For reproducibility, the seeds can also be fixed in this function. 
@@ -165,7 +165,7 @@ def train(model, variables, train_data, test_data, project_path, config): """ # Fix the random seed - TODO: add flag to make this optional - if config.deterministic_algorithm: + if config["deterministic_algorithm"]: random.seed(0) torch.manual_seed(0) np.random.seed(0) @@ -173,16 +173,16 @@ def train(model, variables, train_data, test_data, project_path, config): g = torch.Generator() g.manual_seed(0) - test_size = config.test_size - learning_rate = config.lr - bs = config.batch_size - reg_param = config.reg_param - rho = config.RHO - l1 = config.l1 - epochs = config.epochs - latent_space_size = config.latent_space_size - intermittent_model_saving = config.intermittent_model_saving - intermittent_saving_patience = config.intermittent_saving_patience + test_size = config["test_size"] + learning_rate = config["lr"] + bs = config["batch_size"] + reg_param = config["reg_param"] + rho = config["RHO"] + l1 = config["l1"] + epochs = config["epochs"] + latent_space_size = config["latent_space_size"] + intermittent_model_saving = config["intermittent_model_saving"] + intermittent_saving_patience = config["intermittent_saving_patience"] model_children = list(model.children()) @@ -191,8 +191,8 @@ def train(model, variables, train_data, test_data, project_path, config): model = model.to(device) # Converting data to tensors - if config.data_dimension == 2: - if config.model_type == "dense": + if config["data_dimension"] == 2: + if config["model_type"] == "dense": # print(train_data.shape) # print(test_data.shape) # sys.exit() @@ -202,7 +202,10 @@ def train(model, variables, train_data, test_data, project_path, config): valid_ds = torch.tensor(test_data, dtype=torch.float32, device=device).view( test_data.shape[0], test_data.shape[1] * test_data.shape[2] ) - elif config.model_type == "convolutional" and config.model_name == "Conv_AE_3D": + elif ( + config["model_type"] == "convolutional" + and config["model_name"] == "Conv_AE_3D" + ): train_ds = torch.tensor( train_data, dtype=torch.float32, device=device ).view( @@ -219,21 +222,21 @@ def train(model, variables, train_data, test_data, project_path, config): train_data.shape[1], train_data.shape[2], ) - elif config.model_type == "convolutional": + elif config["model_type"] == "convolutional": train_ds = torch.tensor( train_data, dtype=torch.float32, device=device ).view(train_data.shape[0], 1, train_data.shape[1], train_data.shape[2]) valid_ds = torch.tensor(test_data, dtype=torch.float32, device=device).view( train_data.shape[0], 1, train_data.shape[1], train_data.shape[2] ) - elif config.data_dimension == 1: + elif config["data_dimension"] == 1: train_ds = torch.tensor(train_data, dtype=torch.float64, device=device) valid_ds = torch.tensor(test_data, dtype=torch.float64, device=device) # Pushing input data into the torch-DataLoader object and combines into one DataLoader object (a basic wrapper # around several DataLoader objects). 
- if config.deterministic_algorithm: + if config["deterministic_algorithm"]: train_dl = DataLoader( train_ds, batch_size=bs, @@ -266,15 +269,15 @@ def train(model, variables, train_data, test_data, project_path, config): optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) # Activate early stopping - if config.early_stopping: + if config["early_stopping"]: early_stopping = utils.EarlyStopping( - patience=config.early_stopping_patience, min_delta=config.min_delta + patience=config["early_stopping_patience"], min_delta=config["min_delta"] ) # Changes to patience & min_delta can be made in configs # Activate LR Scheduler - if config.lr_scheduler: + if config["lr_scheduler"]: lr_scheduler = utils.LRScheduler( - optimizer=optimizer, patience=config.lr_scheduler_patience + optimizer=optimizer, patience=config["lr_scheduler_patience"] ) # Training and Validation of the model @@ -283,7 +286,7 @@ def train(model, variables, train_data, test_data, project_path, config): start = time.time() # Registering hooks for activation extraction - if config.activation_extraction: + if config["activation_extraction"]: hooks = model.store_hooks() for epoch in range(epochs): @@ -299,7 +302,7 @@ def train(model, variables, train_data, test_data, project_path, config): latent_dim=latent_space_size, RHO=rho, l1=l1, - n_dimensions=config.data_dimension, + n_dimensions=config["data_dimension"], ) train_loss.append(train_epoch_loss) @@ -315,9 +318,9 @@ def train(model, variables, train_data, test_data, project_path, config): val_epoch_loss = train_epoch_loss val_loss.append(val_epoch_loss) - if config.lr_scheduler: + if config["lr_scheduler"]: lr_scheduler(val_epoch_loss) - if config.early_stopping: + if config["early_stopping"]: early_stopping(val_epoch_loss) if early_stopping.early_stop: break @@ -331,7 +334,7 @@ def train(model, variables, train_data, test_data, project_path, config): end = time.time() # Saving activations values - if config.activation_extraction: + if config["activation_extraction"]: activations = diagnostics.dict_to_square_matrix(model.get_activations()) model.detach_hooks(hooks) np.save(os.path.join(project_path, "activations.npy"), activations) @@ -341,7 +344,7 @@ def train(model, variables, train_data, test_data, project_path, config): os.path.join(project_path, "loss_data.npy"), np.array([train_loss, val_loss]) ) - if config.model_type == "convolutional": + if config.get("model_type") == "convolutional": final_layer = model.get_final_layer_dims() np.save(os.path.join(project_path, "final_layer.npy"), np.array(final_layer)) diff --git a/config.yaml b/config.yaml new file mode 100644 index 00000000..db18199e --- /dev/null +++ b/config.yaml @@ -0,0 +1,29 @@ +config: + data_dimension: 1 + compression_ratio: 2.0 + apply_normalization: True + model_name: "AE" + model_type: "dense" + epochs: 5 + lr: 0.001 + batch_size: 512 + early_stopping: True + lr_scheduler: True + early_stopping_patience: 100 + min_delta: 0 + lr_scheduler_patience: 50 + custom_norm: False + reg_param: 0.001 + RHO: 0.05 + test_size: 0 + #number_of_columns: 24 + #latent_space_size: 12 + extra_compression: False + intermittent_model_saving: False + intermittent_saving_patience: 100 + mse_avg: False + mse_sum: True + emd: False + l1: True + activation_extraction: False + deterministic_algorithm: True diff --git a/poetry.lock b/poetry.lock index 7b949e57..ad2628cc 100644 --- a/poetry.lock +++ b/poetry.lock @@ -407,13 +407,13 @@ typing = ["typing-extensions (>=4.8)"] [[package]] name = "flatbuffers" -version = "23.5.26" 
+version = "24.3.6" description = "The FlatBuffers serialization format for Python" optional = true python-versions = "*" files = [ - {file = "flatbuffers-23.5.26-py2.py3-none-any.whl", hash = "sha256:c0ff356da363087b915fde4b8b45bdda73432fc17cddb3c8157472eab1422ad1"}, - {file = "flatbuffers-23.5.26.tar.gz", hash = "sha256:9ea1144cac05ce5d86e2859f431c6cd5e66cd9c78c558317c7955fb8d4c78d89"}, + {file = "flatbuffers-24.3.6-py2.py3-none-any.whl", hash = "sha256:0bc1a9d968c0ba996f97b8c255214ec005cf21b962ef60157f7aa2fc647481f1"}, + {file = "flatbuffers-24.3.6.tar.gz", hash = "sha256:8d90a756ad5754be1fcdaa77065065125c9832ed045b4078875b4d3bc1953352"}, ] [[package]] @@ -724,32 +724,32 @@ files = [ [[package]] name = "importlib-metadata" -version = "7.0.1" +version = "7.0.2" description = "Read metadata from Python packages" optional = true python-versions = ">=3.8" files = [ - {file = "importlib_metadata-7.0.1-py3-none-any.whl", hash = "sha256:4805911c3a4ec7c3966410053e9ec6a1fecd629117df5adee56dfc9432a1081e"}, - {file = "importlib_metadata-7.0.1.tar.gz", hash = "sha256:f238736bb06590ae52ac1fab06a3a9ef1d8dce2b7a35b5ab329371d6c8f5d2cc"}, + {file = "importlib_metadata-7.0.2-py3-none-any.whl", hash = "sha256:f4bc4c0c070c490abf4ce96d715f68e95923320370efb66143df00199bb6c100"}, + {file = "importlib_metadata-7.0.2.tar.gz", hash = "sha256:198f568f3230878cb1b44fbd7975f87906c22336dba2e4a7f05278c281fbd792"}, ] [package.dependencies] zipp = ">=0.5" [package.extras] -docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (<7.2.5)", "sphinx (>=3.5)", "sphinx-lint"] +docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] perf = ["ipython"] -testing = ["flufl.flake8", "importlib-resources (>=1.3)", "packaging", "pyfakefs", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-mypy (>=0.9.1)", "pytest-perf (>=0.9.2)", "pytest-ruff"] +testing = ["flufl.flake8", "importlib-resources (>=1.3)", "packaging", "pyfakefs", "pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-mypy", "pytest-perf (>=0.9.2)", "pytest-ruff (>=0.2.1)"] [[package]] name = "importlib-resources" -version = "6.1.2" +version = "6.1.3" description = "Read resources from Python packages" optional = false python-versions = ">=3.8" files = [ - {file = "importlib_resources-6.1.2-py3-none-any.whl", hash = "sha256:9a0a862501dc38b68adebc82970140c9e4209fc99601782925178f8386339938"}, - {file = "importlib_resources-6.1.2.tar.gz", hash = "sha256:308abf8474e2dba5f867d279237cd4076482c3de7104a40b41426370e891549b"}, + {file = "importlib_resources-6.1.3-py3-none-any.whl", hash = "sha256:4c0269e3580fe2634d364b39b38b961540a7738c02cb984e98add8b4221d793d"}, + {file = "importlib_resources-6.1.3.tar.gz", hash = "sha256:56fb4525197b78544a3354ea27793952ab93f935bb4bf746b846bb1015020f2b"}, ] [package.dependencies] @@ -757,7 +757,7 @@ zipp = {version = ">=3.1.0", markers = "python_version < \"3.10\""} [package.extras] docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (<7.2.5)", "sphinx (>=3.5)", "sphinx-lint"] -testing = ["pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-mypy", "pytest-ruff (>=0.2.1)", "zipp (>=3.17)"] +testing = ["jaraco.collections", "pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-mypy", "pytest-ruff 
(>=0.2.1)", "zipp (>=3.17)"] [[package]] name = "iniconfig" @@ -1777,7 +1777,7 @@ six = ">=1.5" name = "pyyaml" version = "6.0.1" description = "YAML parser and emitter for Python" -optional = true +optional = false python-versions = ">=3.6" files = [ {file = "PyYAML-6.0.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d858aa552c999bc8a8d57426ed01e40bef403cd8ccdd0fc5f6f04a00414cac2a"}, @@ -2488,4 +2488,4 @@ hls4ml = ["hls4ml", "tensorflow"] [metadata] lock-version = "2.0" python-versions = ">=3.8, <3.11.10" -content-hash = "0b0cf4871b8e171e15c6001e9f68266ef3585ecf750d1aee374b68f94ee3cf72" +content-hash = "08a5afdbc389785e91457840486a9ba5a87b8aed19ac7bd82e6c76e2cc6ccaba" diff --git a/pyproject.toml b/pyproject.toml index 193d2759..04496ff3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,6 +17,7 @@ scikit-learn = "^1.2.0" hls4ml = { version = "^0.7.1", optional = true } tensorflow = { version = "^2.12.0", optional = true } numpy = "1.23.5" +pyyaml = "^6.0.1" [tool.poetry.group.dev.dependencies] pytest = "^7.2.1" diff --git a/tests/test_data_processing.py b/tests/test_data_processing.py index 1cfc479e..390d1b4e 100644 --- a/tests/test_data_processing.py +++ b/tests/test_data_processing.py @@ -23,17 +23,6 @@ from baler.modules import helper -def test_import_config_success(): - # Call the import_config function with the sample config file path - config = helper.Config - config.Foo = "Bar" - config.Baz = 10 - - # Assert that the result is equal to the expected config - # This checks that the import_config function correctly loads the JSON file and returns the expected dictionary - assert config.Foo == "Bar" - - def test_save_model(): # Test data model = torch.nn.Linear(3, 2) diff --git a/tests/unit/test_config_creation.py b/tests/unit/test_config_creation.py new file mode 100644 index 00000000..10384cb2 --- /dev/null +++ b/tests/unit/test_config_creation.py @@ -0,0 +1,6 @@ +from baler.modules.config.config_service import load_config + + +def test_read_config(): + config = load_config("config.yaml") + assert config is not None From d017c52b3b49530b3f78666fa4d337074f8189e8 Mon Sep 17 00:00:00 2001 From: axelgallen Date: Thu, 7 Mar 2024 16:54:18 +0000 Subject: [PATCH 2/4] Fixed place where old config notation was left --- baler/modules/plotting.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/baler/modules/plotting.py b/baler/modules/plotting.py index f7d9b9be..849d2f2f 100644 --- a/baler/modules/plotting.py +++ b/baler/modules/plotting.py @@ -385,7 +385,7 @@ def plot_2D(project_path, config): "data" ] - if config.convert_to_blocks: + if config["convert_to_blocks"]: data_decompressed = data_decompressed.reshape( data.shape[0], data.shape[1], data.shape[2] ) From bda6b871921ddda8efc26ab697526df136ef672f Mon Sep 17 00:00:00 2001 From: Fritjof Bengtsson Date: Fri, 8 Mar 2024 13:44:57 +0000 Subject: [PATCH 3/4] add empty input_path to template config --- config.yaml | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/config.yaml b/config.yaml index db18199e..07158de6 100644 --- a/config.yaml +++ b/config.yaml @@ -1,4 +1,5 @@ config: + input_path: "" data_dimension: 1 compression_ratio: 2.0 apply_normalization: True @@ -27,3 +28,30 @@ config: l1: True activation_extraction: False deterministic_algorithm: True + type_list: [ + "float64", + "float64", + "float64", + "float64", + "float64", + "float64", + "float64", + "float64", + "float64", + "float64", + "float64", + "float64", + "int", + "int", + "int", + "int", + "int", + "int", + "int", + 
"float64", + "float64", + "float64", + "int", + "int", + ] + convert_to_blocks: False \ No newline at end of file From f956fd92db63e3c70c26e5ec77c3f5eedfc0b206 Mon Sep 17 00:00:00 2001 From: Fritjof Bengtsson Date: Fri, 8 Mar 2024 13:48:19 +0000 Subject: [PATCH 4/4] improve error message when input_path is unset --- baler/modules/helper.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/baler/modules/helper.py b/baler/modules/helper.py index b8af4a0b..a9bee818 100644 --- a/baler/modules/helper.py +++ b/baler/modules/helper.py @@ -214,8 +214,8 @@ def process( normalization features. """ - if input_path is None: - raise ValueError("Input path is None") + if input_path is None or input_path == "": + raise ValueError("Input path is None, did you forget to set it in the configuration file?") loaded = np.load(input_path) data = loaded["data"]