diff --git a/examples/eam/NiNb_EAM_bulk.json b/examples/eam/NiNb_EAM_bulk.json new file mode 100644 index 000000000..8fdfcd7f8 --- /dev/null +++ b/examples/eam/NiNb_EAM_bulk.json @@ -0,0 +1,66 @@ +{ + "Verbosity": { + "level": 2 + }, + "Dataset": { + "name": "NiNb", + "path": {"total": "BULK_DATA"}, + "format": "CFG", + "compositional_stratified_splitting": true, + "rotational_invariance": true, + "node_features": { + "name": ["num_of_protons", "atomic_masses", "atomic_energy"], + "dim": [1,1,1], + "column_index": [0,1,2] + }, + "graph_features":{ + "name": ["bulk_modulus"], + "dim": [1], + "column_index": [2] + } + }, + "NeuralNetwork": { + "Architecture": { + "model_type": "PNA", + "radius": 3, + "max_neighbours": 100000, + "edge_features": ["lengths"], + "periodic_boundary_conditions": true, + "hidden_dim": 50, + "num_conv_layers": 10, + "output_heads": { + "graph":{ + "num_sharedlayers": 2, + "dim_sharedlayers": 5, + "num_headlayers": 2, + "dim_headlayers": [50,25] + }, + "node": { + "num_headlayers": 2, + "dim_headlayers": [50,25], + "type": "mlp" + } + }, + "task_weights": [1.0] + }, + "Variables_of_interest": { + "input_node_features": [0], + "output_names": ["bulk_modulus"], + "output_index": [0], + "type": ["graph"], + "denormalize_output": true + }, + "Training": { + "num_epoch": 2, + "perc_train": 0.7, + "learning_rate": 1e-3, + "batch_size": 64, + "continue": 0 + } + }, + "Visualization": { + "plot_init_solution": true, + "plot_hist_solution": false, + "create_plots": true + } +} diff --git a/examples/eam/NiNb_EAM_bulk_multitask.json b/examples/eam/NiNb_EAM_bulk_multitask.json new file mode 100644 index 000000000..7e47981ea --- /dev/null +++ b/examples/eam/NiNb_EAM_bulk_multitask.json @@ -0,0 +1,67 @@ +{ + "Verbosity": { + "level": 2 + }, + "Dataset": { + "name": "NiNb", + "path": {"total": "FCC_Bulk_small"}, + "format": "CFG", + "compositional_stratified_splitting": true, + "rotational_invariance": true, + "node_features": { + "name": ["num_of_protons", "atomic_mass", "atomic_energy", "atomic_force"], + "dim": [1,1,1,3], + "column_index": [0,1,2,3] + }, + "graph_features":{ + "name": ["bulk_modulus"], + "dim": [1], + "column_index": [2] + } + }, + "NeuralNetwork": { + "Architecture": { + "model_type": "PNA", + "radius": 3, + "max_neighbours": 100000, + "edge_features": ["lengths"], + "periodic_boundary_conditions": true, + "hidden_dim": 50, + "num_conv_layers": 6, + "output_heads": { + "graph":{ + "num_sharedlayers": 2, + "dim_sharedlayers": 5, + "num_headlayers": 2, + "dim_headlayers": [50,25] + }, + "node": { + "num_headlayers": 2, + "dim_headlayers": [50,25], + "type": "mlp" + } + }, + "task_weights": [1.0, 1.0, 1.0] + }, + "Variables_of_interest": { + "input_node_features": [0], + "output_names": ["bulk_modulus","atomic_energy","atomic_forces"], + "output_index": [0, 2, 3], + "type": ["graph","node","node"], + "denormalize_output": true + }, + "Training": { + "num_epoch": 2, + "perc_train": 0.7, + "learning_rate": 1e-3, + "batch_size": 64, + "continue": 0, + "startfrom": "existing_model" + } + }, + "Visualization": { + "plot_init_solution": true, + "plot_hist_solution": false, + "create_plots": true + } +} diff --git a/examples/eam/NiNb_EAM_energy.json b/examples/eam/NiNb_EAM_energy.json new file mode 100644 index 000000000..3e0fd137b --- /dev/null +++ b/examples/eam/NiNb_EAM_energy.json @@ -0,0 +1,66 @@ +{ + "Verbosity": { + "level": 2 + }, + "Dataset": { + "name": "NiNb", + "path": {"total": "FCC_Ni_Nb"}, + "format": "CFG", + "compositional_stratified_splitting": true, + 
"rotational_invariance": true, + "node_features": { + "name": ["num_of_protons", "atomic_masses", "atomic_energy"], + "dim": [1,1,1], + "column_index": [0,1,2] + }, + "graph_features":{ + "name": [], + "dim": [], + "column_index": [] + } + }, + "NeuralNetwork": { + "Architecture": { + "model_type": "PNA", + "radius": 3, + "max_neighbours": 100000, + "edge_features": ["lengths"], + "periodic_boundary_conditions": true, + "hidden_dim": 50, + "num_conv_layers": 10, + "output_heads": { + "graph":{ + "num_sharedlayers": 2, + "dim_sharedlayers": 5, + "num_headlayers": 2, + "dim_headlayers": [50,25] + }, + "node": { + "num_headlayers": 2, + "dim_headlayers": [50,25], + "type": "mlp" + } + }, + "task_weights": [1.0] + }, + "Variables_of_interest": { + "input_node_features": [0], + "output_names": ["atomic_energy"], + "output_index": [2], + "type": ["node"], + "denormalize_output": true + }, + "Training": { + "num_epoch": 200, + "perc_train": 0.7, + "learning_rate": 1e-3, + "batch_size": 64, + "continue": 0 + } + }, + "Visualization": { + "plot_init_solution": true, + "plot_hist_solution": false, + "create_plots": true + } +} diff --git a/examples/eam/NiNb_EAM_multitask.json b/examples/eam/NiNb_EAM_multitask.json new file mode 100644 index 000000000..5f21e4c4d --- /dev/null +++ b/examples/eam/NiNb_EAM_multitask.json @@ -0,0 +1,67 @@ +{ + "Verbosity": { + "level": 2 + }, + "Dataset": { + "name": "NiNb", + "path": {"total": "FCC_Ni_Nb"}, + "format": "CFG", + "compositional_stratified_splitting": true, + "rotational_invariance": true, + "node_features": { + "name": ["num_of_protons", "atomic_mass", "atomic_energy", "atomic_force"], + "dim": [1,1,1,3], + "column_index": [0,1,2,3] + }, + "graph_features":{ + "name": [], + "dim": [], + "column_index": [] + } + }, + "NeuralNetwork": { + "Architecture": { + "model_type": "PNA", + "radius": 3, + "max_neighbours": 100000, + "edge_features": ["lengths"], + "periodic_boundary_conditions": true, + "hidden_dim": 50, + "num_conv_layers": 6, + "output_heads": { + "graph":{ + "num_sharedlayers": 2, + "dim_sharedlayers": 5, + "num_headlayers": 2, + "dim_headlayers": [50,25] + }, + "node": { + "num_headlayers": 2, + "dim_headlayers": [50,25], + "type": "mlp" + } + }, + "task_weights": [1.0, 1.0] + }, + "Variables_of_interest": { + "input_node_features": [0], + "output_names": ["atomic_energy","atomic_forces"], + "output_index": [2, 3], + "type": ["node","node"], + "denormalize_output": true + }, + "Training": { + "num_epoch": 20, + "perc_train": 0.7, + "learning_rate": 1e-3, + "batch_size": 64, + "continue": 0, + "startfrom": "existing_model" + } + }, + "Visualization": { + "plot_init_solution": true, + "plot_hist_solution": false, + "create_plots": true + } +} diff --git a/examples/eam/eam.py b/examples/eam/eam.py new file mode 100644 index 000000000..95df36a15 --- /dev/null +++ b/examples/eam/eam.py @@ -0,0 +1,5 @@ +import os +import hydragnn + +filepath = os.path.join(os.path.dirname(__file__), "NiNb_EAM_bulk_multitask.json") +hydragnn.run_training(filepath) diff --git a/examples/ising_model/ising_model.json b/examples/ising_model/ising_model.json index 7dfe247f4..5f2793bc9 100644 --- a/examples/ising_model/ising_model.json +++ b/examples/ising_model/ising_model.json @@ -54,5 +54,10 @@ "batch_size": 64, "continue": 0 } + }, + "Visualization": { + "plot_init_solution": true, + "plot_hist_solution": false, + "create_plots": true } } diff --git a/examples/lsms/lsms.json b/examples/lsms/lsms.json index 9403fd3fd..5a1207c86 100644 --- a/examples/lsms/lsms.json +++ 
b/examples/lsms/lsms.json @@ -4,9 +4,10 @@ }, "Dataset": { "name": "FePt_32atoms", - "path": {"total": "FePt"}, + "path": {"total": "FePt_enthalpy"}, "format": "LSMS", "compositional_stratified_splitting": true, + "rotational_invariance": true, "node_features": { "name": ["num_of_protons","charge_density", "magnetic_moment"], "dim": [1,1,1], @@ -56,5 +57,10 @@ "continue": 0, "startfrom": "existing_model" } + }, + "Visualization": { + "plot_init_solution": true, + "plot_hist_solution": false, + "create_plots": true } } diff --git a/examples/md17/md17.json b/examples/md17/md17.json index 407aba6bb..cf8680125 100644 --- a/examples/md17/md17.json +++ b/examples/md17/md17.json @@ -39,5 +39,10 @@ "continue": 0, "startfrom": "existing_model" } + }, + "Visualization": { + "plot_init_solution": true, + "plot_hist_solution": false, + "create_plots": true } } diff --git a/examples/md17/md17.py b/examples/md17/md17.py index 88d50432f..e1aeee366 100644 --- a/examples/md17/md17.py +++ b/examples/md17/md17.py @@ -17,9 +17,13 @@ def md17_pre_transform(data): data.x = data.z.float().view(-1, 1) # Only predict energy (index 0 of 2 properties) for this run. data.y = data.energy / len(data.x) + graph_features_dim = [1] + node_feature_dim = [1] hydragnn.preprocess.update_predicted_values( var_config["type"], var_config["output_index"], + graph_features_dim, + node_feature_dim, data, ) data = compute_edges(data) diff --git a/examples/qm9/qm9.json b/examples/qm9/qm9.json index 645529a2c..91c2d7ecf 100644 --- a/examples/qm9/qm9.json +++ b/examples/qm9/qm9.json @@ -39,5 +39,10 @@ "continue": 0, "startfrom": "existing_model" } + }, + "Visualization": { + "plot_init_solution": true, + "plot_hist_solution": false, + "create_plots": true } } diff --git a/examples/qm9/qm9.py b/examples/qm9/qm9.py index 610a1bbe3..243bbb626 100644 --- a/examples/qm9/qm9.py +++ b/examples/qm9/qm9.py @@ -17,9 +17,13 @@ def qm9_pre_transform(data): data.x = data.z.float().view(-1, 1) # Only predict free energy (index 10 of 19 properties) for this run. 
data.y = data.y[:, 10] / len(data.x) + graph_features_dim = [1] + node_feature_dim = [1] hydragnn.preprocess.update_predicted_values( var_config["type"], var_config["output_index"], + graph_features_dim, + node_feature_dim, data, ) device = hydragnn.utils.get_device() diff --git a/hydragnn/postprocess/postprocess.py b/hydragnn/postprocess/postprocess.py index c28dd39d8..e7d33c66f 100644 --- a/hydragnn/postprocess/postprocess.py +++ b/hydragnn/postprocess/postprocess.py @@ -15,12 +15,12 @@ def output_denormalize(y_minmax, true_values, predicted_values): ymin = y_minmax[ihead][0] ymax = y_minmax[ihead][1] for isample in range(len(predicted_values[ihead])): - for iatom in range(len(predicted_values[ihead][isample])): - predicted_values[ihead][isample][iatom] = ( - predicted_values[ihead][isample][iatom] * (ymax - ymin) + ymin + for ifeat in range(len(predicted_values[ihead][isample])): + predicted_values[ihead][isample][ifeat] = ( + predicted_values[ihead][isample][ifeat] * (ymax - ymin) + ymin ) - true_values[ihead][isample][iatom] = ( - true_values[ihead][isample][iatom] * (ymax - ymin) + ymin + true_values[ihead][isample][ifeat] = ( + true_values[ihead][isample][ifeat] * (ymax - ymin) + ymin ) return true_values, predicted_values diff --git a/hydragnn/preprocess/__init__.py b/hydragnn/preprocess/__init__.py index a2a1b1d74..bae9683f1 100644 --- a/hydragnn/preprocess/__init__.py +++ b/hydragnn/preprocess/__init__.py @@ -2,6 +2,8 @@ from .utils import ( check_if_graph_size_variable, + get_radius_graph, + get_radius_graph_pbc, get_radius_graph_config, get_radius_graph_pbc_config, RadiusGraphPBC, diff --git a/hydragnn/preprocess/load_data.py b/hydragnn/preprocess/load_data.py index f9c3af366..2dcc4272d 100644 --- a/hydragnn/preprocess/load_data.py +++ b/hydragnn/preprocess/load_data.py @@ -120,11 +120,9 @@ def load_train_val_test_sets(config): else: files_dir = f"{os.environ['SERIALIZED_DATA_PATH']}/serialized_dataset/{config['Dataset']['name']}_{dataset_name}.pkl" # loading serialized data and recalculating neighbourhoods depending on the radius and max num of neighbours - loader = SerializedDataLoader(config["Verbosity"]["level"]) - dataset = loader.load_serialized_data( - dataset_path=files_dir, - config=config, - ) + loader = SerializedDataLoader(config) + dataset = loader.load_serialized_data(dataset_path=files_dir) + dataset_list.append(dataset) datasetname_list.append(dataset_name) diff --git a/hydragnn/preprocess/raw_dataset_loader.py b/hydragnn/preprocess/raw_dataset_loader.py index bed5fd75c..2792a2f5c 100644 --- a/hydragnn/preprocess/raw_dataset_loader.py +++ b/hydragnn/preprocess/raw_dataset_loader.py @@ -12,12 +12,13 @@ import os import numpy as np import pickle -import pathlib import torch from torch_geometric.data import Data from torch import tensor +from ase.io.cfg import read_cfg + # WARNING: DO NOT use collective communication calls here because only rank 0 uses this routines @@ -56,16 +57,29 @@ def __init__(self, config): """ self.dataset_list = [] self.serial_data_name_list = [] - self.node_feature_name = config["node_features"]["name"] + self.node_feature_name = ( + config["node_features"]["name"] + if config["node_features"]["name"] is not None + else None + ) self.node_feature_dim = config["node_features"]["dim"] self.node_feature_col = config["node_features"]["column_index"] - self.graph_feature_name = config["graph_features"]["name"] + self.graph_feature_name = ( + config["graph_features"]["name"] + if config["graph_features"]["name"] is not None + else None + ) 
self.graph_feature_dim = config["graph_features"]["dim"] self.graph_feature_col = config["graph_features"]["column_index"] self.raw_dataset_name = config["name"] self.data_format = config["format"] self.path_dictionary = config["path"] + assert len(self.node_feature_name) == len(self.node_feature_dim) + assert len(self.node_feature_name) == len(self.node_feature_col) + assert len(self.graph_feature_name) == len(self.graph_feature_dim) + assert len(self.graph_feature_name) == len(self.graph_feature_col) + def load_raw_data(self): """Loads the raw files from specified path, performs the transformation to Data objects and normalization of values. After that the serialized data is stored to the serialized_dataset directory. @@ -86,13 +100,26 @@ def load_raw_data(self): len(os.listdir(raw_data_path)) > 0 ), "No data files provided in {}!".format(raw_data_path) - for filename in os.listdir(raw_data_path): - if filename == ".DS_Store": + for name in os.listdir(raw_data_path): + if name == ".DS_Store": continue - data_object = self.__transform_input_to_data_object_base( - filepath=os.path.join(raw_data_path, filename) - ) - dataset.append(data_object) + # if the directory contains file, iterate over them + if os.path.isfile(os.path.join(raw_data_path, name)): + data_object = self.__transform_input_to_data_object_base( + filepath=os.path.join(raw_data_path, name) + ) + if not isinstance(data_object, type(None)): + dataset.append(data_object) + # if the directory contains subdirectories, explore their content + elif os.path.isdir(os.path.join(raw_data_path, name)): + dir_name = os.path.join(raw_data_path, name) + for subname in os.listdir(dir_name): + if os.path.isfile(os.path.join(dir_name, subname)): + data_object = self.__transform_input_to_data_object_base( + filepath=os.path.join(dir_name, subname) + ) + if not isinstance(data_object, type(None)): + dataset.append(data_object) if self.data_format == "LSMS": for idx, data_object in enumerate(dataset): @@ -121,7 +148,18 @@ def load_raw_data(self): pickle.dump(dataset_normalized, f) def __transform_input_to_data_object_base(self, filepath): - """Transforms lines of strings read from the raw data file to Data object and returns it. + if self.data_format == "LSMS" or self.data_format == "unit_test": + data_object = self.__transform_LSMS_input_to_data_object_base( + filepath=filepath + ) + elif self.data_format == "CFG": + data_object = self.__transform_CFG_input_to_data_object_base( + filepath=filepath + ) + return data_object + + def __transform_CFG_input_to_data_object_base(self, filepath): + """Transforms lines of strings read from the raw data CFG file to Data object and returns it. Parameters ---------- @@ -133,11 +171,76 @@ def __transform_input_to_data_object_base(self, filepath): Data object representing structure of a graph sample. """ - f = open(filepath, "r", encoding="utf-8") - lines = f.readlines() + if filepath.endswith(".cfg"): + + data_object = self.__transform_ASE_object_to_data_object(filepath) + + return data_object + + else: + return None + + def __transform_ASE_object_to_data_object(self, filepath): + + # FIXME: + # this still assumes bulk modulus is specific to the CFG format. + # To deal with multiple files across formats, one should generalize this function + # by moving the reading of the .bulk file in a standalone routine. + # Morevoer, this approach assumes tha there is only one global feature to look at, + # and that this global feature is specicially retrieveable in a file with the string *bulk* inside. 
+ + ase_object = read_cfg(filepath) + + data_object = Data() + + data_object.supercell_size = tensor(ase_object.cell.array).float() + data_object.pos = tensor(ase_object.arrays["positions"]).float() + proton_numbers = np.expand_dims(ase_object.arrays["numbers"], axis=1) + masses = np.expand_dims(ase_object.arrays["masses"], axis=1) + c_peratom = np.expand_dims(ase_object.arrays["c_peratom"], axis=1) + fx = np.expand_dims(ase_object.arrays["fx"], axis=1) + fy = np.expand_dims(ase_object.arrays["fy"], axis=1) + fz = np.expand_dims(ase_object.arrays["fz"], axis=1) + node_feature_matrix = np.concatenate( + (proton_numbers, masses, c_peratom, fx, fy, fz), axis=1 + ) + data_object.x = tensor(node_feature_matrix).float() + + filename_without_extension = os.path.splitext(filepath)[0] + + if os.path.exists(os.path.join(filename_without_extension + ".bulk")): + filename_bulk = os.path.join(filename_without_extension + ".bulk") + f = open(filename_bulk, "r", encoding="utf-8") + lines = f.readlines() + graph_feat = lines[0].split(None, 2) + g_feature = [] + # collect graph features + for item in range(len(self.graph_feature_dim)): + for icomp in range(self.graph_feature_dim[item]): + it_comp = self.graph_feature_col[item] + icomp + g_feature.append(float(graph_feat[it_comp].strip())) + data_object.y = tensor(g_feature) + + return data_object + + def __transform_LSMS_input_to_data_object_base(self, filepath): + """Transforms lines of strings read from the raw data LSMS file to Data object and returns it. + + Parameters + ---------- + lines: + content of data file with all the graph information + Returns + ---------- + Data + Data object representing structure of a graph sample. + """ data_object = Data() + f = open(filepath, "r", encoding="utf-8") + + lines = f.readlines() graph_feat = lines[0].split(None, 2) g_feature = [] # collect graph features @@ -202,20 +305,22 @@ def __scale_features_by_num_nodes(self, dataset): ] for idx, data_object in enumerate(dataset): - dataset[idx].y[scaled_graph_feature_index] = ( - dataset[idx].y[scaled_graph_feature_index] / data_object.num_nodes - ) - dataset[idx].x[:, scaled_node_feature_index] = ( - dataset[idx].x[:, scaled_node_feature_index] / data_object.num_nodes - ) + if dataset[idx].y is not None: + dataset[idx].y[scaled_graph_feature_index] = ( + dataset[idx].y[scaled_graph_feature_index] / data_object.num_nodes + ) + if dataset[idx].x is not None: + dataset[idx].x[:, scaled_node_feature_index] = ( + dataset[idx].x[:, scaled_node_feature_index] / data_object.num_nodes + ) return dataset def __normalize_dataset(self): """Performs the normalization on Data objects and returns the normalized dataset.""" - num_node_features = self.dataset_list[0][0].x.shape[1] - num_graph_features = len(self.dataset_list[0][0].y) + num_node_features = len(self.node_feature_dim) + num_graph_features = len(self.graph_feature_dim) self.minmax_graph_feature = np.full((2, num_graph_features), np.inf) # [0,...]:minimum values; [1,...]: maximum values @@ -225,38 +330,59 @@ def __normalize_dataset(self): for dataset in self.dataset_list: for data in dataset: # find maximum and minimum values for graph level features + g_index_start = 0 for ifeat in range(num_graph_features): + g_index_end = g_index_start + self.graph_feature_dim[ifeat] self.minmax_graph_feature[0, ifeat] = min( - data.y[ifeat], self.minmax_graph_feature[0, ifeat] + torch.min(data.y[g_index_start:g_index_end]), + self.minmax_graph_feature[0, ifeat], ) self.minmax_graph_feature[1, ifeat] = max( - data.y[ifeat], 
self.minmax_graph_feature[1, ifeat] + torch.max(data.y[g_index_start:g_index_end]), + self.minmax_graph_feature[1, ifeat], ) + g_index_start = g_index_end + # find maximum and minimum values for node level features + n_index_start = 0 for ifeat in range(num_node_features): - self.minmax_node_feature[0, ifeat] = np.minimum( - np.amin(data.x[:, ifeat].numpy()), + n_index_end = n_index_start + self.node_feature_dim[ifeat] + self.minmax_node_feature[0, ifeat] = min( + torch.min(data.x[:, n_index_start:n_index_end]), self.minmax_node_feature[0, ifeat], ) - self.minmax_node_feature[1, ifeat] = np.maximum( - np.amax(data.x[:, ifeat].numpy()), + self.minmax_node_feature[1, ifeat] = max( + torch.max(data.x[:, n_index_start:n_index_end]), self.minmax_node_feature[1, ifeat], ) + n_index_start = n_index_end for dataset in self.dataset_list: for data in dataset: + g_index_start = 0 for ifeat in range(num_graph_features): - data.y[ifeat] = tensor_divide( - (data.y[ifeat] - self.minmax_graph_feature[0, ifeat]), + g_index_end = g_index_start + self.graph_feature_dim[ifeat] + data.y[g_index_start:g_index_end] = tensor_divide( + ( + data.y[g_index_start:g_index_end] + - self.minmax_graph_feature[0, ifeat] + ), ( self.minmax_graph_feature[1, ifeat] - self.minmax_graph_feature[0, ifeat] ), ) + g_index_start = g_index_end + n_index_start = 0 for ifeat in range(num_node_features): - data.x[:, ifeat] = tensor_divide( - (data.x[:, ifeat] - self.minmax_node_feature[0, ifeat]), + n_index_end = n_index_start + self.node_feature_dim[ifeat] + data.x[:, n_index_start:n_index_end] = tensor_divide( + ( + data.x[:, n_index_start:n_index_end] + - self.minmax_node_feature[0, ifeat] + ), ( self.minmax_node_feature[1, ifeat] - self.minmax_node_feature[0, ifeat] ), ) + n_index_start = n_index_end diff --git a/hydragnn/preprocess/serialized_dataset_loader.py b/hydragnn/preprocess/serialized_dataset_loader.py index 61097283e..002cfcd0d 100644 --- a/hydragnn/preprocess/serialized_dataset_loader.py +++ b/hydragnn/preprocess/serialized_dataset_loader.py @@ -9,7 +9,6 @@ # SPDX-License-Identifier: BSD-3-Clause # ############################################################################## -import numpy as np import pickle from sklearn.model_selection import StratifiedShuffleSplit @@ -22,24 +21,56 @@ from hydragnn.utils.distributed import get_device from hydragnn.utils.print_utils import print_distributed, iterate_tqdm from hydragnn.preprocess.utils import ( + get_radius_graph, + get_radius_graph_pbc, get_radius_graph_config, get_radius_graph_pbc_config, ) class SerializedDataLoader: + """A class used for loading existing structures from files that are lists of serialized structures. + Most of the class methods are hidden, because from outside a caller needs only to know about + load_serialized_data method. + """ """ Constructor """ - def __init__(self, verbosity: int): - self.verbosity = verbosity - - """A class used for loading existing structures from files that are lists of serialized structures. - Most of the class methods are hidden, because from outside a caller needs only to know about - load_serialized_data method. 
+ def __init__(self, config): + self.verbosity = config["Verbosity"]["level"] + self.node_feature_name = config["Dataset"]["node_features"]["name"] + self.node_feature_dim = config["Dataset"]["node_features"]["dim"] + self.node_feature_col = config["Dataset"]["node_features"]["column_index"] + self.graph_feature_name = config["Dataset"]["graph_features"]["name"] + self.graph_feature_dim = config["Dataset"]["graph_features"]["dim"] + self.graph_feature_col = config["Dataset"]["graph_features"]["column_index"] + self.rotational_invariance = config["Dataset"]["rotational_invariance"] + self.periodic_boundary_conditions = config["NeuralNetwork"]["Architecture"][ + "periodic_boundary_conditions" + ] + self.radius = config["NeuralNetwork"]["Architecture"]["radius"] + self.max_neighbours = config["NeuralNetwork"]["Architecture"]["max_neighbours"] + self.variables = config["NeuralNetwork"]["Variables_of_interest"] + self.variables_type = config["NeuralNetwork"]["Variables_of_interest"]["type"] + self.output_index = config["NeuralNetwork"]["Variables_of_interest"][ + "output_index" + ] + self.input_node_features = config["NeuralNetwork"]["Variables_of_interest"][ + "input_node_features" + ] + self.subsample_percentage = None + + # In situations where someone already provides the .pkl filed with data + # the asserts from raw_dataset_loader are not performed + # Therefore, we need to re-check consistency + assert len(self.node_feature_name) == len(self.node_feature_dim) + assert len(self.node_feature_name) == len(self.node_feature_col) + assert len(self.graph_feature_name) == len(self.graph_feature_dim) + assert len(self.graph_feature_name) == len(self.graph_feature_col) + """ Methods ------- load_serialized_data(dataset_path: str, config: dict) @@ -47,7 +78,7 @@ def __init__(self, verbosity: int): atom and structure features are updated. """ - def load_serialized_data(self, dataset_path: str, config): + def load_serialized_data(self, dataset_path: str): """Loads the serialized structures data from specified path, computes new edges for the structures based on the maximum number of neighbours and radius. Additionally, atom and structure features are updated. @@ -69,24 +100,28 @@ def load_serialized_data(self, dataset_path: str, config): dataset = pickle.load(f) rotational_invariance = NormalizeRotation(max_points=-1, sort=False) - if config["Dataset"]["rotational_invariance"]: + if self.rotational_invariance: dataset[:] = [rotational_invariance(data) for data in dataset] - if config["NeuralNetwork"]["Architecture"]["periodic_boundary_conditions"]: + if self.periodic_boundary_conditions: # edge lengths already added manually if using PBC, so no need to call Distance. - compute_edges = get_radius_graph_pbc_config( - config["NeuralNetwork"]["Architecture"] + compute_edges = get_radius_graph_pbc( + radius=self.radius, + loop=False, + max_neighbours=self.max_neighbours, ) else: - compute_edges = get_radius_graph_config( - config["NeuralNetwork"]["Architecture"] + compute_edges = get_radius_graph( + radius=self.radius, + loop=False, + max_neighbours=self.max_neighbours, ) compute_edge_lengths = Distance(norm=False, cat=True) dataset[:] = [compute_edges(data) for data in dataset] # edge lengths already added manually if using PBC. 
- if not config["NeuralNetwork"]["Architecture"]["periodic_boundary_conditions"]: + if not self.periodic_boundary_conditions: compute_edge_lengths = Distance(norm=False, cat=True) dataset[:] = [compute_edge_lengths(data) for data in dataset] @@ -104,24 +139,19 @@ def load_serialized_data(self, dataset_path: str, config): for data in dataset: data.to(device) update_predicted_values( - config["NeuralNetwork"]["Variables_of_interest"]["type"], - config["NeuralNetwork"]["Variables_of_interest"]["output_index"], - data, - ) - self.__update_atom_features( - config["NeuralNetwork"]["Variables_of_interest"]["input_node_features"], + self.variables_type, + self.output_index, + self.graph_feature_dim, + self.node_feature_dim, data, ) - if ( - "subsample_percentage" - in config["NeuralNetwork"]["Variables_of_interest"].keys() - ): + self.__update_atom_features(self.input_node_features, data) + + if "subsample_percentage" in self.variables.keys(): + self.subsample_percentage = self.variables["subsample_percentage"] return self.__stratified_sampling( - dataset=dataset, - subsample_percentage=config["NeuralNetwork"]["Variables_of_interest"][ - "subsample_percentage" - ], + dataset=dataset, subsample_percentage=self.subsample_percentage ) return dataset @@ -157,7 +187,6 @@ def __stratified_sampling(self, dataset: [Data], subsample_percentage: float): [Data] Subsample of the original dataset constructed using stratified sampling. """ - unique_values = torch.unique(dataset[0].x[:, 0]).tolist() dataset_categories = [] print_distributed( self.verbosity, "Computing the categories for the whole dataset." @@ -188,22 +217,44 @@ def __stratified_sampling(self, dataset: [Data], subsample_percentage: float): return subsample -def update_predicted_values(type: list, index: list, data: Data): +def update_predicted_values( + type: list, index: list, graph_feature_dim: list, node_feature_dim: list, data: Data +): """Updates values of the structure we want to predict. Predicted value is represented by integer value. Parameters ---------- type: "graph" level or "node" level index: index/location in data.y for graph level and in data.x for node level + graph_feature_dim: list of integers to trak the dimension of each graph level feature data: Data A Data object representing a structure that has atoms. 
""" output_feature = [] - data.y_loc = torch.zeros(1, len(type) + 1, dtype=torch.int64, device=data.y.device) + data.y_loc = torch.zeros(1, len(type) + 1, dtype=torch.int64, device=data.x.device) + index_counter_global_y = 0 for item in range(len(type)): if type[item] == "graph": - feat_ = torch.reshape(data.y[index[item]], (1, 1)) + feat_ = torch.reshape( + data.y[ + index_counter_global_y : index_counter_global_y + + graph_feature_dim[item] + ], + (graph_feature_dim[item], 1), + ) + index_counter_global_y = index_counter_global_y + graph_feature_dim[item] + # after the global features are spanned, we need to iterate over the nodal features + # to do so, the counter of the nodal features need to start from the last value of counter for the graph nodel feature elif type[item] == "node": - feat_ = torch.reshape(data.x[:, index[item]], (-1, 1)) + index_counter_nodal_y = index[item] + feat_ = torch.reshape( + data.x[ + :, + index_counter_nodal_y : ( + index_counter_nodal_y + node_feature_dim[index_counter_nodal_y] + ), + ], + (-1, 1), + ) else: raise ValueError("Unknown output type", type[item]) output_feature.append(feat_) diff --git a/hydragnn/preprocess/utils.py b/hydragnn/preprocess/utils.py index a51aa7317..3dd925512 100644 --- a/hydragnn/preprocess/utils.py +++ b/hydragnn/preprocess/utils.py @@ -48,6 +48,22 @@ def check_data_samples_equivalence(data1, data2, tol): return x_bool and pos_bool and y_bool and edge_bool +def get_radius_graph(radius, max_neighbours, loop=False): + return RadiusGraph( + r=radius, + loop=loop, + max_num_neighbors=max_neighbours, + ) + + +def get_radius_graph_pbc(radius, max_neighbours, loop=False): + return RadiusGraphPBC( + r=radius, + loop=loop, + max_num_neighbors=max_neighbours, + ) + + def get_radius_graph_config(config, loop=False): return RadiusGraph( r=config["radius"], @@ -77,11 +93,7 @@ def __call__(self, data): assert hasattr( data, "supercell_size" ), "The data must contain the size of the supercell to apply periodic boundary conditions." - assert hasattr( - data, "atom_types" - ), "The data must contain information about the atoms types. Can be a chemical symbol (str) or an atomic number (int)." ase_atom_object = ase.Atoms( - symbols=data.atom_types, positions=data.pos, cell=data.supercell_size, pbc=True, @@ -103,7 +115,7 @@ def __call__(self, data): 1 ), "Adding periodic boundary conditions would result in duplicate edges. Cutoff radius must be reduced or system size increased." 
- data.edge_attr = torch.tensor(edge_length) + data.edge_attr = torch.tensor(edge_length, dtype=torch.float).unsqueeze(1) return data diff --git a/hydragnn/run_training.py b/hydragnn/run_training.py index acd383945..d51406b5a 100644 --- a/hydragnn/run_training.py +++ b/hydragnn/run_training.py @@ -67,6 +67,9 @@ def _(config: dict): ) config = update_config(config, train_loader, val_loader, test_loader) + plot_init_solution = config["Visualization"]["plot_init_solution"] + plot_hist_solution = config["Visualization"]["plot_hist_solution"] + create_plots = config["Visualization"]["create_plots"] model = create_model_config( config=config["NeuralNetwork"]["Architecture"], verbosity=verbosity @@ -106,6 +109,9 @@ def _(config: dict): config["NeuralNetwork"], log_name, verbosity, + plot_init_solution, + plot_hist_solution, + create_plots, ) save_model(model, log_name) diff --git a/hydragnn/train/train_validate_test.py b/hydragnn/train/train_validate_test.py index b87e82098..c242976a7 100644 --- a/hydragnn/train/train_validate_test.py +++ b/hydragnn/train/train_validate_test.py @@ -43,6 +43,7 @@ def train_validate_test( verbosity=0, plot_init_solution=True, plot_hist_solution=False, + create_plots=False, ): num_epoch = config["Training"]["num_epoch"] # total loss tracking for train/vali/test @@ -64,16 +65,17 @@ def train_validate_test( node_feature.extend(data.x.tolist()) nodes_num_list.append(data.num_nodes) - visualizer = Visualizer( - model_with_config_name, - node_feature=node_feature, - num_heads=model.num_heads, - head_dims=model.head_dims, - num_nodes_list=nodes_num_list, - ) - visualizer.num_nodes_plot() + if create_plots: + visualizer = Visualizer( + model_with_config_name, + node_feature=node_feature, + num_heads=model.num_heads, + head_dims=model.head_dims, + num_nodes_list=nodes_num_list, + ) + visualizer.num_nodes_plot() - if plot_init_solution: # visualizing of initial conditions + if create_plots and plot_init_solution: # visualizing of initial conditions _, _, true_values, predicted_values = test(test_loader, model, verbosity) visualizer.create_scatter_plots( true_values, @@ -147,28 +149,29 @@ def train_validate_test( config["Variables_of_interest"]["y_minmax"], true_values, predicted_values ) - ######result visualization###### - visualizer.create_plot_global( - true_values, - predicted_values, - output_names=config["Variables_of_interest"]["output_names"], - ) - visualizer.create_scatter_plots( - true_values, - predicted_values, - output_names=config["Variables_of_interest"]["output_names"], - ) - ######plot loss history##### - visualizer.plot_history( - total_loss_train, - total_loss_val, - total_loss_test, - task_loss_train, - task_loss_val, - task_loss_test, - model.loss_weights, - config["Variables_of_interest"]["output_names"], - ) + if create_plots: + ######result visualization###### + visualizer.create_plot_global( + true_values, + predicted_values, + output_names=config["Variables_of_interest"]["output_names"], + ) + visualizer.create_scatter_plots( + true_values, + predicted_values, + output_names=config["Variables_of_interest"]["output_names"], + ) + ######plot loss history##### + visualizer.plot_history( + total_loss_train, + total_loss_val, + total_loss_test, + task_loss_train, + task_loss_val, + task_loss_test, + model.loss_weights, + config["Variables_of_interest"]["output_names"], + ) def get_head_indices(model, data): @@ -287,13 +290,11 @@ def test(loader, model, verbosity): for itask in range(len(tasks_rmse)): tasks_error[itask] += tasks_rmse[itask].item() * 
data.num_graphs ytrue = data.y - istart = 0 for ihead in range(model.num_heads): head_pre = pred[ihead] - pred_shape = head_pre.shape - iend = istart + pred_shape[0] * pred_shape[1] head_val = ytrue[head_index[ihead]] - istart = iend + if head_val.shape != head_pre.shape: + head_val = torch.reshape(head_val, head_pre.shape) true_values[ihead].extend(head_val.tolist()) predicted_values[ihead].extend(pred[ihead].tolist()) diff --git a/tests/deterministic_graph_data.py b/tests/deterministic_graph_data.py index 13a0a6305..b35a1368a 100755 --- a/tests/deterministic_graph_data.py +++ b/tests/deterministic_graph_data.py @@ -149,12 +149,15 @@ def create_configuration( if linear_only: total_value = torch.sum(node_output_x) else: + total_value_linear = torch.sum(node_output_x) total_value = ( torch.sum(node_output_x) + torch.sum(node_output_x_square) + torch.sum(node_output_x_cube) ) filetxt = numpy.array2string(total_value.detach().numpy()) + if not linear_only: + filetxt += "\t" + numpy.array2string(total_value_linear.detach().numpy()) for index in range(0, number_nodes): numpy_row = updated_table[index, :] diff --git a/tests/inputs/ci.json b/tests/inputs/ci.json index f8fc4b824..8c9ce7477 100644 --- a/tests/inputs/ci.json +++ b/tests/inputs/ci.json @@ -59,5 +59,10 @@ "learning_rate": 0.02, "batch_size": 32 } + }, + "Visualization": { + "plot_init_solution": true, + "plot_hist_solution": false, + "create_plots": true } } diff --git a/tests/inputs/ci_multihead.json b/tests/inputs/ci_multihead.json index 5816b6885..40b88a1d0 100644 --- a/tests/inputs/ci_multihead.json +++ b/tests/inputs/ci_multihead.json @@ -57,5 +57,9 @@ "learning_rate": 0.01, "batch_size": 16 } - } -} + }, + "Visualization": { + "plot_init_solution": true, + "plot_hist_solution": false, + "create_plots": true + }} diff --git a/tests/inputs/ci_vectoroutput.json b/tests/inputs/ci_vectoroutput.json new file mode 100644 index 000000000..5dc5e462e --- /dev/null +++ b/tests/inputs/ci_vectoroutput.json @@ -0,0 +1,66 @@ +{ + "Verbosity": { + "level": 4 + }, + "Dataset": { + "name": "unit_test_multihead_vector", + "format": "unit_test", + "compositional_stratified_splitting": true, + "rotational_invariance": false, + "path": { + "total": "dataset/unit_test_multihead" + }, + "node_features": { + "name": ["x","x2x3_vec"], + "dim": [1, 2], + "column_index": [0, 6] + }, + "graph_features":{ + "name": [ "sums"], + "dim": [2], + "column_index": [0] + } + }, + "NeuralNetwork": { + "Architecture": { + "model_type": "PNA", + "radius": 2.0, + "max_neighbours": 100, + "periodic_boundary_conditions": false, + "hidden_dim": 8, + "num_conv_layers": 2, + "output_heads": { + "graph":{ + "num_sharedlayers": 2, + "dim_sharedlayers": 10, + "num_headlayers": 2, + "dim_headlayers": [10, 10] + }, + "node": { + "num_headlayers": 2, + "dim_headlayers": [40, 10], + "type": "mlp" + } + }, + "task_weights": [1.0, 1.0, 1.0] + }, + "Variables_of_interest": { + "input_node_features": [0], + "output_names": ["sums","x","x2x3_vec"], + "output_index": [0,0,1], + "type": ["graph","node","node"], + "denormalize_output": false + }, + "Training": { + "num_epoch": 100, + "perc_train": 0.7, + "learning_rate": 0.01, + "batch_size": 16 + } + }, + "Visualization": { + "plot_init_solution": true, + "plot_hist_solution": false, + "create_plots": false + } +} diff --git a/tests/test_graphs.py b/tests/test_graphs.py index 8249a08eb..818a10c94 100755 --- a/tests/test_graphs.py +++ b/tests/test_graphs.py @@ -128,9 +128,11 @@ def unittest_train_model(model_type, ci_input, use_lengths, 
overwrite_data=False "GAT": [0.60, 0.70, 0.99], "CGCNN": [0.50, 0.40, 0.95], } - if use_lengths: + if use_lengths and ("vector" not in ci_input): thresholds["CGCNN"] = [0.15, 0.15, 0.40] thresholds["PNA"] = [0.10, 0.10, 0.40] + if use_lengths and "vector" in ci_input: + thresholds["PNA"] = [0.15, 0.10, 0.75] verbosity = 2 for ihead in range(len(true_values)): @@ -149,6 +151,8 @@ def unittest_train_model(model_type, ci_input, use_lengths, overwrite_data=False head_pred = torch.tensor(predicted_values[ihead]) # Check individual samples mae = torch.nn.L1Loss() + if head_true.shape != head_pred.shape: + head_pred = torch.reshape(head_pred, head_true.shape) sample_mean_abs_error = mae(head_true, head_pred) sample_max_abs_error = torch.max(torch.abs(head_true - head_pred)) error_str = ( @@ -185,3 +189,9 @@ def pytest_train_model(model_type, ci_input, overwrite_data=False): @pytest.mark.parametrize("model_type", ["PNA", "CGCNN"]) def pytest_train_model_lengths(model_type, overwrite_data=False): unittest_train_model(model_type, "ci.json", True, overwrite_data) + + +# Test vector output +@pytest.mark.parametrize("model_type", ["PNA"]) +def pytest_train_model_vectoroutput(model_type, overwrite_data=False): + unittest_train_model(model_type, "ci_vectoroutput.json", True, overwrite_data)
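
A few usage sketches follow for the main pieces this patch touches; paths, file names, and numeric values in them are illustrative assumptions rather than code taken from the repository. First, the new CFG branch of the raw dataset loader reads AtomEye `.cfg` files with ASE and packs the per-atom arrays into `data.x`, optionally picking up a graph-level target from a companion `.bulk` file. A minimal standalone sketch of that conversion, assuming the `.cfg` files carry the auxiliary `c_peratom` and `fx/fy/fz` columns used by the EAM examples:

```python
import os

import numpy as np
import torch
from ase.io.cfg import read_cfg
from torch_geometric.data import Data


def cfg_to_data(filepath):
    """Convert one AtomEye .cfg file into a torch_geometric Data object.

    Assumes the auxiliary per-atom columns used by the EAM examples
    (c_peratom energy plus fx/fy/fz forces); files without them would
    need a smaller feature matrix.
    """
    atoms = read_cfg(filepath)
    columns = [
        np.expand_dims(atoms.arrays["numbers"], axis=1),    # proton numbers
        np.expand_dims(atoms.arrays["masses"], axis=1),     # atomic masses
        np.expand_dims(atoms.arrays["c_peratom"], axis=1),  # per-atom energy
        np.expand_dims(atoms.arrays["fx"], axis=1),         # force components
        np.expand_dims(atoms.arrays["fy"], axis=1),
        np.expand_dims(atoms.arrays["fz"], axis=1),
    ]
    data = Data()
    data.supercell_size = torch.tensor(atoms.cell.array).float()
    data.pos = torch.tensor(atoms.arrays["positions"]).float()
    data.x = torch.tensor(np.concatenate(columns, axis=1)).float()

    # Optional graph-level target: the loader looks for a companion
    # "<name>.bulk" file; which whitespace-separated token on its first line
    # holds the value is set by graph_features["column_index"] in the JSON
    # (2 in the example configs).
    bulk_file = os.path.splitext(filepath)[0] + ".bulk"
    if os.path.exists(bulk_file):
        with open(bulk_file, "r", encoding="utf-8") as f:
            tokens = f.readline().split()
        data.y = torch.tensor([float(tokens[2])])
    return data
```

Something like `cfg_to_data("BULK_DATA/structure_0001.cfg")` (hypothetical path) is a quick way to sanity-check a single structure before serializing a whole dataset.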
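
The rewritten normalization loop in `raw_dataset_loader.py` walks feature *blocks* by their declared dimensions instead of assuming one column per feature, so a 3-component force is scaled with a single min/max pair. The real code accumulates min/max over every sample in the dataset list and divides with `tensor_divide`; the per-sample sketch below only illustrates the block-wise slicing idea:

```python
import torch


def minmax_normalize_blocks(x, feature_dims, eps=1e-12):
    """Scale each feature block of x (shape [num_nodes, sum(feature_dims)]) to [0, 1].

    A multi-component feature (e.g. a 3-component force) shares one min/max
    pair, so its components keep their relative magnitudes.
    """
    start = 0
    for dim in feature_dims:
        end = start + dim
        block = x[:, start:end]
        bmin, bmax = block.min(), block.max()
        x[:, start:end] = (block - bmin) / torch.clamp(bmax - bmin, min=eps)
        start = end
    return x


# Example: 5 atoms with [protons, mass, energy, force(3)] columns,
# matching the layout produced by the CFG loader above.
x = torch.rand(5, 6) * 10.0
x_norm = minmax_normalize_blocks(x.clone(), feature_dims=[1, 1, 1, 3])
```

`output_denormalize` in `postprocess.py` applies the inverse per head, `y = ŷ·(ymax − ymin) + ymin`, which is why the loop variable there was renamed from `iatom` to the more accurate `ifeat`.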
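
`SerializedDataLoader` now takes the full config dictionary at construction and performs the same name/dim/column-index consistency asserts as the raw loader, and `load_serialized_data` drops its `config` argument. A sketch of the new call pattern, with an illustrative `.pkl` path following the `{name}_{split}.pkl` convention in `load_data.py`:

```python
import json

from hydragnn.preprocess.serialized_dataset_loader import SerializedDataLoader

# Any of the example inputs works here; the path is illustrative.
with open("examples/eam/NiNb_EAM_energy.json", "r") as f:
    config = json.load(f)

# The loader now pulls radius, max_neighbours, PBC, rotational invariance and
# the Variables_of_interest section from the config itself.
loader = SerializedDataLoader(config)
dataset = loader.load_serialized_data(
    dataset_path="serialized_dataset/NiNb_total.pkl"  # illustrative .pkl path
)
```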
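
`update_predicted_values` now also receives the graph- and node-feature dimension lists so multi-component outputs (forces) are sliced as blocks rather than single columns; `md17.py` and `qm9.py` simply pass `[1]` for both. A toy sketch of how the EAM multitask selection would be wired up — the shapes noted in the comments are what the head layout implies, not output captured from a run:

```python
import torch
from torch_geometric.data import Data

from hydragnn.preprocess import update_predicted_values

# Toy sample with 4 atoms; x columns follow the CFG loader layout:
# [num_of_protons, atomic_mass, atomic_energy, fx, fy, fz]
data = Data(x=torch.rand(4, 6), y=torch.tensor([172.0]), pos=torch.rand(4, 3))

# Mirrors NiNb_EAM_bulk_multitask.json: one graph head and two node heads.
output_type = ["graph", "node", "node"]
output_index = [0, 2, 3]          # bulk_modulus, atomic_energy, atomic_force
graph_feature_dim = [1]           # bulk_modulus is a scalar
node_feature_dim = [1, 1, 1, 3]   # protons, mass, energy, force (3 components)

update_predicted_values(
    output_type, output_index, graph_feature_dim, node_feature_dim, data
)

# data.y_loc holds the per-head offsets into the flattened target; data.y
# should stack 1 graph value + 4 energies + 12 force components.
print(data.y_loc, data.y.shape)
```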
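
The new `get_radius_graph` / `get_radius_graph_pbc` helpers take explicit `radius` and `max_neighbours` arguments instead of the Architecture config dict, and `RadiusGraphPBC` now stores `edge_attr` as a float column vector. A sketch of the non-PBC path, which is the combination `SerializedDataLoader` builds when periodic boundary conditions are off:

```python
import torch
from torch_geometric.data import Data
from torch_geometric.transforms import Distance

from hydragnn.preprocess import get_radius_graph

data = Data(x=torch.rand(8, 6), pos=torch.rand(8, 3))

compute_edges = get_radius_graph(radius=3.0, max_neighbours=100, loop=False)
compute_edge_lengths = Distance(norm=False, cat=True)

data = compute_edge_lengths(compute_edges(data))

# Edge lengths come out as a [num_edges, 1] float tensor, matching the shape
# the PBC transform now produces via the unsqueezed edge_attr.
print(data.edge_index.shape, data.edge_attr.shape, data.edge_attr.dtype)
```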
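
Finally, every input file gains a `Visualization` block because `run_training` now indexes `config["Visualization"]` directly and forwards the flags to `train_validate_test`; a config without that section would fail with a `KeyError`. With `create_plots` set to `false` the `Visualizer` is never constructed, which is how `ci_vectoroutput.json` keeps the CI run headless. The keys, shown as the dict `run_training` effectively consumes:

```python
# Keys run_training reads from config["Visualization"]:
visualization = {
    "plot_init_solution": True,   # scatter plots from the untrained model before training
    "plot_hist_solution": False,  # pre-existing flag, passed through unchanged
    "create_plots": True,         # master switch; False skips the Visualizer entirely
}
```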