From e772691e4d41e4e3e3728fefe8c2f3d704b6e225 Mon Sep 17 00:00:00 2001
From: Andreas Hellander
Date: Mon, 16 Dec 2024 11:25:32 +0100
Subject: [PATCH] Latest

---
 fedn/network/combiner/modelservice.py |  2 +-
 fedn/network/combiner/roundhandler.py | 84 ++++++++++++++++++---------
 fedn/network/grpc/server.py           |  3 ---
 3 files changed, 57 insertions(+), 32 deletions(-)

diff --git a/fedn/network/combiner/modelservice.py b/fedn/network/combiner/modelservice.py
index ef5f9a75a..b2a11861d 100644
--- a/fedn/network/combiner/modelservice.py
+++ b/fedn/network/combiner/modelservice.py
@@ -9,7 +9,7 @@
 from fedn.common.log_config import logger
 from fedn.network.storage.models.tempmodelstorage import TempModelStorage
 
-CHUNK_SIZE = 2 * 1024 * 1024
+CHUNK_SIZE = 1 * 1024 * 1024
 
 
 def upload_request_generator(mdl, id):
diff --git a/fedn/network/combiner/roundhandler.py b/fedn/network/combiner/roundhandler.py
index fa3d83e8f..604a77244 100644
--- a/fedn/network/combiner/roundhandler.py
+++ b/fedn/network/combiner/roundhandler.py
@@ -131,7 +131,8 @@ def _training_round(self, config: dict, clients: list, provided_functions: dict)
         :return: an aggregated model and associated metadata
         :rtype: model, dict
         """
-        logger.info("ROUNDHANDLER: Initiating training round, participating clients: {}".format(clients))
+        logger.info(
+            "ROUNDHANDLER: Initiating training round, participating clients: {}".format(clients))
 
         meta = {}
         meta["nr_expected_updates"] = len(clients)
@@ -142,11 +143,14 @@ def _training_round(self, config: dict, clients: list, provided_functions: dict)
         model_id = config["model_id"]
 
         if provided_functions.get("client_settings", False):
-            global_model_bytes = self.modelservice.temp_model_storage.get(model_id)
-            client_settings = self.hook_interface.client_settings(global_model_bytes)
+            global_model_bytes = self.modelservice.temp_model_storage.get(
+                model_id)
+            client_settings = self.hook_interface.client_settings(
+                global_model_bytes)
             config["client_settings"] = client_settings
 
         # Request model updates from all active clients.
-        self.server.request_model_update(session_id=session_id, model_id=model_id, config=config, clients=clients)
+        self.server.request_model_update(
+            session_id=session_id, model_id=model_id, config=config, clients=clients)
 
         # If buffer_size is -1 (default), the round terminates when/if all clients have completed.
         if int(config["buffer_size"]) == -1:
@@ -161,7 +165,8 @@ def _training_round(self, config: dict, clients: list, provided_functions: dict)
         data = None
         try:
             helper = get_helper(config["helper_type"])
-            logger.info("Config delete_models_storage: {}".format(config["delete_models_storage"]))
+            logger.info("Config delete_models_storage: {}".format(
+                config["delete_models_storage"]))
             if config["delete_models_storage"] == "True":
                 delete_models = True
             else:
@@ -173,10 +178,13 @@ def _training_round(self, config: dict, clients: list, provided_functions: dict)
             else:
                 parameters = None
             if provided_functions.get("aggregate", False):
-                previous_model_bytes = self.modelservice.temp_model_storage.get(model_id)
-                model, data = self.hook_interface.aggregate(previous_model_bytes, self.update_handler, helper, delete_models=delete_models)
+                previous_model_bytes = self.modelservice.temp_model_storage.get(
+                    model_id)
+                model, data = self.hook_interface.aggregate(
+                    previous_model_bytes, self.update_handler, helper, delete_models=delete_models)
             else:
-                model, data = self.aggregator.combine_models(helper=helper, delete_models=delete_models, parameters=parameters)
+                model, data = self.aggregator.combine_models(
+                    helper=helper, delete_models=delete_models, parameters=parameters)
         except Exception as e:
             logger.warning("AGGREGATION FAILED AT COMBINER! {}".format(e))
             raise
@@ -195,7 +203,8 @@ def _validation_round(self, session_id, model_id, clients):
         :param model_id: The ID of the model to validate
         :type model_id: str
         """
-        self.server.request_model_validation(session_id, model_id, clients=clients)
+        self.server.request_model_validation(
+            session_id, model_id, clients=clients)
 
     def _prediction_round(self, prediction_id: str, model_id: str, clients: list):
         """Send model prediction requests to clients.
@@ -207,7 +216,8 @@ def _prediction_round(self, prediction_id: str, model_id: str, clients: list):
         :param model_id: The ID of the model to use for prediction
         :type model_id: str
         """
-        self.server.request_model_prediction(prediction_id, model_id, clients=clients)
+        self.server.request_model_prediction(
+            prediction_id, model_id, clients=clients)
 
     def stage_model(self, model_id, timeout_retry=3, retry=2):
         """Download a model from persistent storage and set in modelservice.
@@ -221,7 +231,8 @@ def stage_model(self, model_id, timeout_retry=3, retry=2):
         """
         # If the model is already in memory at the server we do not need to do anything.
         if self.modelservice.temp_model_storage.exist(model_id):
-            logger.info("Model already exists in memory, skipping model staging.")
+            logger.info(
+                "Model already exists in memory, skipping model staging.")
             return
         logger.info("Model Staging, fetching model from storage...")
         # If not, download it and stage it in memory at the combiner.
@@ -232,11 +243,13 @@ def stage_model(self, model_id, timeout_retry=3, retry=2):
                 if model:
                     break
             except Exception:
-                logger.warning("Could not fetch model from storage backend, retrying.")
+                logger.warning(
+                    "Could not fetch model from storage backend, retrying.")
                 time.sleep(timeout_retry)
                 tries += 1
                 if tries > retry:
-                    logger.error("Failed to stage model {} from storage backend!".format(model_id))
+                    logger.error(
+                        "Failed to stage model {} from storage backend!".format(model_id))
                     raise
 
         self.modelservice.set_model(model, model_id)
@@ -256,7 +269,8 @@ def _assign_round_clients(self, n, type="trainers"):
         elif type == "trainers":
             clients = self.server.get_active_trainers()
         else:
-            logger.error("(ERROR): {} is not a supported type of client".format(type))
+            logger.error(
+                "(ERROR): {} is not a supported type of client".format(type))
 
         # If the number of requested trainers exceeds the number of available, use all available.
         n = min(n, len(clients))
@@ -278,7 +292,8 @@ def _check_nr_round_clients(self, config):
         """
         active = self.server.nr_active_trainers()
         if active >= int(config["clients_required"]):
-            logger.info("Number of clients required ({0}) to start round met {1}.".format(config["clients_required"], active))
+            logger.info("Number of clients required ({0}) to start round met {1}.".format(
+                config["clients_required"], active))
             return True
         else:
             logger.info("Too few clients to start round.")
@@ -290,9 +305,11 @@ def execute_validation_round(self, session_id, model_id):
         :param round_config: The round config object.
         :type round_config: dict
         """
-        logger.info("COMBINER orchestrating validation of model {}".format(model_id))
+        logger.info(
+            "COMBINER orchestrating validation of model {}".format(model_id))
         self.stage_model(model_id)
-        validators = self._assign_round_clients(self.server.max_clients, type="validators")
+        validators = self._assign_round_clients(
+            self.server.max_clients, type="validators")
         self._validation_round(session_id, model_id, validators)
 
     def execute_prediction_round(self, prediction_id: str, model_id: str) -> None:
@@ -301,10 +318,12 @@ def execute_prediction_round(self, prediction_id: str, model_id: str) -> None:
         :param round_config: The round config object.
         :type round_config: dict
        """
-        logger.info("COMBINER orchestrating prediction using model {}".format(model_id))
+        logger.info(
+            "COMBINER orchestrating prediction using model {}".format(model_id))
         self.stage_model(model_id)
         # TODO: Implement prediction client type
-        clients = self._assign_round_clients(self.server.max_clients, type="validators")
+        clients = self._assign_round_clients(
+            self.server.max_clients, type="validators")
         self._prediction_round(prediction_id, model_id, clients)
 
     def execute_training_round(self, config):
@@ -315,7 +334,8 @@ def execute_training_round(self, config):
         :return: metadata about the training round.
         :rtype: dict
         """
-        logger.info("Processing training round, job_id {}".format(config["_job_id"]))
+        logger.info("Processing training round, job_id {}".format(
+            config["_job_id"]))
 
         data = {}
         data["config"] = config
@@ -324,17 +344,20 @@ def execute_training_round(self, config):
         # Download model to update and set in temp storage.
         self.stage_model(config["model_id"])
 
-        provided_functions = self.hook_interface.provided_functions(self.server_functions)
+        provided_functions = self.hook_interface.provided_functions(
+            self.server_functions)
 
         if provided_functions.get("client_selection", False):
-            clients = self.hook_interface.client_selection(clients=self.server.get_active_trainers())
+            clients = self.hook_interface.client_selection(
+                clients=self.server.get_active_trainers())
         else:
             clients = self._assign_round_clients(self.server.max_clients)
 
         model, meta = self._training_round(config, clients, provided_functions)
         data["data"] = meta
 
         if model is None:
-            logger.warning("\t Failed to update global model in round {0}!".format(config["round_id"]))
+            logger.warning(
+                "\t Failed to update global model in round {0}!".format(config["round_id"]))
 
         if model is not None:
             helper = get_helper(config["helper_type"])
@@ -343,7 +366,8 @@ def execute_training_round(self, config):
             a.close()
             data["model_id"] = model_id
 
-        logger.info("TRAINING ROUND COMPLETED. Aggregated model id: {}, Job id: {}".format(model_id, config["_job_id"]))
+        logger.info("TRAINING ROUND COMPLETED. Aggregated model id: {}, Job id: {}".format(
+            model_id, config["_job_id"]))
 
         # Delete temp model
         self.modelservice.temp_model_storage.delete(config["model_id"])
@@ -369,11 +393,14 @@ def run(self, polling_interval=1.0):
                         session_id = round_config["session_id"]
                         model_id = round_config["model_id"]
                         tic = time.time()
-                        round_meta = self.execute_training_round(round_config)
-                        round_meta["time_exec_training"] = time.time() - tic
+                        round_meta = self.execute_training_round(
+                            round_config)
+                        round_meta["time_exec_training"] = time.time() - \
+                            tic
                         round_meta["status"] = "Success"
                         round_meta["name"] = self.server.id
-                        self.server.statestore.set_round_combiner_data(round_meta)
+                        self.server.statestore.set_round_combiner_data(
+                            round_meta)
                     elif round_config["task"] == "validation":
                         session_id = round_config["session_id"]
                         model_id = round_config["model_id"]
@@ -381,7 +408,8 @@ def run(self, polling_interval=1.0):
                     elif round_config["task"] == "prediction":
                         prediction_id = round_config["prediction_id"]
                         model_id = round_config["model_id"]
-                        self.execute_prediction_round(prediction_id, model_id)
+                        self.execute_prediction_round(
+                            prediction_id, model_id)
                     else:
                         logger.warning("config contains unkown task type.")
                 else:
diff --git a/fedn/network/grpc/server.py b/fedn/network/grpc/server.py
index a581c16bf..7f6109324 100644
--- a/fedn/network/grpc/server.py
+++ b/fedn/network/grpc/server.py
@@ -33,7 +33,6 @@ def __init__(self, servicer, config: ServerConfig):
         KEEPALIVE_TIMEOUT_MS = 20 * 1000
         # max idle time before server terminates the connection (5 minutes)
         MAX_CONNECTION_IDLE_MS = 5 * 60 * 1000
-        MAX_MESSAGE_LENGTH = 100 * 1024 * 1024
 
         self.server = grpc.server(
             futures.ThreadPoolExecutor(max_workers=350),
@@ -42,8 +41,6 @@ def __init__(self, servicer, config: ServerConfig):
                 ("grpc.keepalive_time_ms", KEEPALIVE_TIME_MS),
                 ("grpc.keepalive_timeout_ms", KEEPALIVE_TIMEOUT_MS),
                 ("grpc.max_connection_idle_ms", MAX_CONNECTION_IDLE_MS),
-                ('grpc.max_send_message_length', MAX_MESSAGE_LENGTH),
-                ('grpc.max_receive_message_length', MAX_MESSAGE_LENGTH),
             ],
         )
         self.certificate = None
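
Note on how the two size changes above fit together: with the explicit
grpc.max_send_message_length / grpc.max_receive_message_length overrides
removed, the combiner's gRPC server falls back to the C-core defaults, whose
max receive message size is 4 MiB. Models therefore have to cross the wire as
a stream of chunks that each fit under that cap, and lowering CHUNK_SIZE from
2 MiB to 1 MiB leaves generous headroom for protobuf framing. A minimal
sketch of the chunking pattern follows; iter_model_chunks is a hypothetical
stand-in for the role upload_request_generator plays in modelservice.py, not
the actual FEDn API.

    import io

    # Mirrors the patched constant: 1 MiB chunks stay well below gRPC's
    # default 4 MiB max_receive_message_length.
    CHUNK_SIZE = 1 * 1024 * 1024


    def iter_model_chunks(model: io.BytesIO, chunk_size: int = CHUNK_SIZE):
        """Yield a seekable model buffer as fixed-size byte chunks.

        Each yielded chunk would become one streamed gRPC message, so the
        chunk size bounds the size of every message on the wire.
        """
        model.seek(0)
        while True:
            chunk = model.read(chunk_size)
            if not chunk:
                break
            yield chunk


    if __name__ == "__main__":
        # A 5 MiB model streams as five 1 MiB messages, none of which
        # would trip the default receive limit.
        blob = io.BytesIO(b"\x00" * (5 * 1024 * 1024))
        print([len(c) for c in iter_model_chunks(blob)])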
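The patch does not state its motivation, but the practical effect of dropping
the 100 MiB MAX_MESSAGE_LENGTH options is that the server will now reject any
single inbound message approaching the 4 MiB default. That is only safe as
long as models always travel chunked, as modelservice.py streams them, and
never as one unary payload; outbound traffic is unaffected, since the C-core
default send limit is unlimited.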