From 2485fb4e8e816fed17d06c9520cfb0d7f33b41be Mon Sep 17 00:00:00 2001 From: Salman Toor Date: Thu, 27 Jun 2024 12:18:14 +0200 Subject: [PATCH] Docs/SK-901: Fixes related to README format (#646) --- examples/monai-2D-mednist/README.rst | 61 +++++++++++------ examples/monai-2D-mednist/client/data.py | 56 +--------------- .../monai-2D-mednist/client/python_env.yaml | 8 +-- examples/monai-2D-mednist/client/train.py | 8 +-- examples/monai-2D-mednist/client/validate.py | 4 +- .../monai-2D-mednist/client_settings.yaml | 5 +- .../docker-compose.override.yaml | 14 ++-- examples/monai-2D-mednist/prepare_data.py | 66 +++++++++++++++++++ examples/monai-2D-mednist/requirements.txt | 3 + 9 files changed, 133 insertions(+), 92 deletions(-) create mode 100644 examples/monai-2D-mednist/prepare_data.py create mode 100644 examples/monai-2D-mednist/requirements.txt diff --git a/examples/monai-2D-mednist/README.rst b/examples/monai-2D-mednist/README.rst index c2c536f27..cb46047ed 100644 --- a/examples/monai-2D-mednist/README.rst +++ b/examples/monai-2D-mednist/README.rst @@ -1,15 +1,15 @@ FEDn Project: MonAI 2D Classification with the MedNIST Dataset (PyTorch) ------------------------------------------------------------------------ -This is an example FEDn Project based on the MonAI 2D Classification with the MedNIST Dataset. +This is an example FEDn Project based on the MonAI 2D Classification with the MedNIST Dataset. The example is intented as a minimalistic quickstart and automates the handling of training data -by letting the client download and create its partition of the dataset as it starts up. +by letting the client download and create its partition of the dataset as it starts up. + +Links: -Links: - - MonAI: https://monai.io/ - Base example notebook: https://github.com/Project-MONAI/tutorials/blob/main/2d_classification/mednist_tutorial.ipynb -- MedNIST dataset: https://github.com/Project-MONAI/MONAI-extra-test-data/releases/download/0.8.1/MedNIST.tar.gz +- MedNIST dataset: https://github.com/Project-MONAI/MONAI-extra-test-data/releases/download/0.8.1/MedNIST.tar.gz Prerequisites ------------- @@ -17,17 +17,18 @@ Prerequisites Using FEDn Studio: - `Python 3.8, 3.9, 3.10 or 3.11 `__ -- `A FEDn Studio account `__ +- `A FEDn Studio account `__ If using pseudo-distributed mode with docker-compose: - `Docker `__ - `Docker Compose `__ + Creating the compute package and seed model ------------------------------------------- -Install fedn: +Install fedn: .. code-block:: @@ -54,13 +55,31 @@ Next, generate a seed model (the first model in a global model trail): fedn run build --path client -This will create a seed model called 'seed.npz' in the root of the project. This step will take a few minutes, depending on hardware and internet connection (builds a virtualenv). +This will create a seed model called 'seed.npz' in the root of the project. This step will take a few minutes, depending on hardware and internet connection (builds a virtualenv). + +Download and Prepare the data +------------------------------------------- + +Install requirements: + +.. code-block:: + + pip install -r requirements.txt + +Download and divide the data into parts. Set the number of +data parts as an arguments python prepare_data.py NR-OF-DATAPARTS. In the +below command we divide the dataset into 10 parts. +.. code-block:: + + python prepare_data.py 10 + + Using FEDn Studio ----------------- Follow the guide here to set up your FEDn Studio project and learn how to connect clients (using token authentication): `Studio guide `__. -On the step "Upload Files", upload 'package.tgz' and 'seed.npz' created above. +On the step "Upload Files", upload 'package.tgz' and 'seed.npz' created above. Connecting clients: =================== @@ -70,23 +89,27 @@ Connecting clients: .. code-block:: export FEDN_PACKAGE_EXTRACT_DIR=package - export FEDN_DATA_PATH=./data/ + export FEDN_DATA_PATH=/data/ export FEDN_CLIENT_SETTINGS_PATH=/client_settings.yaml + export FEDN_DATA_SPLIT_INDEX=0 + fedn client start -in client.yaml --secure=True --force-ssl Connecting clients using Docker: ================================ -For convenience, there is a Docker image hosted on ghrc.io with fedn preinstalled. To start a client using Docker: +For convenience, there is a Docker image hosted on ghrc.io with fedn preinstalled. To start a client using Docker: .. code-block:: docker run \ -v $PWD/client.yaml:/app/client.yaml \ -v $PWD/client_settings.yaml:/app/client_settings.yaml \ + -v $PWD/data:/app/data \ -e FEDN_PACKAGE_EXTRACT_DIR=package \ - -e FEDN_DATA_PATH=./data/ \ + -e FEDN_DATA_PATH=/app/data/ \ -e FEDN_CLIENT_SETTINGS_PATH=/app/client_settings.yaml \ + -e FEDN_DATA_SPLIT_INDEX=0 \ ghcr.io/scaleoutsystems/fedn/fedn:0.9.0 run client -in client.yaml --force-ssl --secure=True @@ -107,8 +130,8 @@ Start a pseudo-distributed FEDn network using docker-compose: -f docker-compose.override.yaml \ up -This starts up local services for MongoDB, Minio, the API Server, one Combiner and two clients. -You can verify the deployment using these urls: +This starts up local services for MongoDB, Minio, the API Server, one Combiner and two clients. +You can verify the deployment using these urls: - API Server: http://localhost:8092/get_controller_status - Minio: http://localhost:9000 @@ -123,18 +146,18 @@ Upload the package and seed model to FEDn controller using the APIClient. In Pyt client.set_active_package("package.tgz", helper="numpyhelper") client.set_active_model("seed.npz") -You can now start a training session with 5 rounds (default): +You can now start a training session with 5 rounds (default): .. code-block:: client.start_session() -Automate experimentation with several clients +Automate experimentation with several clients ============================================= -If you want to scale the number of clients, you can do so by modifying ``docker-compose.override.yaml``. For example, -in order to run with 3 clients, change the environment variable ``FEDN_NUM_DATA_SPLITS`` to 3, and add one more client -by copying ``client1`` and setting ``FEDN_DATA_PATH`` to ``/app/package/data3/`` +If you want to scale the number of clients, you can do so by modifying ``docker-compose.override.yaml``. For example, +in order to run with 3 clients, change the environment variable ``FEDN_NUM_DATA_SPLITS`` to 3, and add one more client +by copying ``client1``. Access message logs and validation data from MongoDB diff --git a/examples/monai-2D-mednist/client/data.py b/examples/monai-2D-mednist/client/data.py index 0a8b5c306..c8a8a4e0b 100644 --- a/examples/monai-2D-mednist/client/data.py +++ b/examples/monai-2D-mednist/client/data.py @@ -1,11 +1,8 @@ import os import random - import numpy as np import PIL import torch -import yaml -from monai.apps import download_and_extract dir_path = os.path.dirname(os.path.realpath(__file__)) abs_path = os.path.abspath(dir_path) @@ -13,54 +10,6 @@ DATA_CLASSES = {"AbdomenCT": 0, "BreastMRI": 1, "CXR": 2, "ChestCT": 3, "Hand": 4, "HeadCT": 5} -def split_data(data_path="data/MedNIST", splits=100, validation_split=0.9): - # create clients - clients = {"client " + str(i): {"train": [], "validation": []} for i in range(splits)} - - for class_ in os.listdir(data_path): - if os.path.isdir(os.path.join(data_path, class_)): - patients_in_class = [os.path.join(class_, patient) for patient in os.listdir(os.path.join(data_path, class_))] - np.random.shuffle(patients_in_class) - chops = np.int32(np.linspace(0, len(patients_in_class), splits + 1)) - for split in range(splits): - p = patients_in_class[chops[split] : chops[split + 1]] - valsplit = np.int32(len(p) * validation_split) - - clients["client " + str(split)]["train"] += p[:valsplit] - clients["client " + str(split)]["validation"] += p[valsplit:] - - with open(os.path.join(os.path.dirname(data_path), "data_splits.yaml"), "w") as file: - yaml.dump(clients, file, default_flow_style=False) - - -def get_data(out_dir="data"): - """Get data from the external repository. - - :param out_dir: Path to data directory. If doesn't - :type data_dir: str - """ - # Make dir if necessary - if not os.path.exists(out_dir): - os.mkdir(out_dir) - - resource = "https://github.com/Project-MONAI/MONAI-extra-test-data/releases/download/0.8.1/MedNIST.tar.gz" - md5 = "0bc7306e7427e00ad1c5526a6677552d" - - compressed_file = os.path.join(out_dir, "MedNIST.tar.gz") - - data_dir = os.path.abspath(out_dir) - print("data_dir:", data_dir) - if os.path.exists(data_dir): - print("path exist.") - if not os.path.exists(compressed_file): - print("compressed file does not exist, downloading and extracting data.") - download_and_extract(resource, compressed_file, data_dir, md5) - else: - print("files already exist.") - - split_data() - - def get_classes(data_path): """Get a list of classes from the dataset @@ -148,6 +97,5 @@ def __getitem__(self, index): return (self.transforms(os.path.join(self.data_path, self.image_files[index])), DATA_CLASSES[os.path.dirname(self.image_files[index])]) -if __name__ == "__main__": - # Prepare data if not already done - get_data() + + diff --git a/examples/monai-2D-mednist/client/python_env.yaml b/examples/monai-2D-mednist/client/python_env.yaml index 7580ffb76..ec39b5084 100644 --- a/examples/monai-2D-mednist/client/python_env.yaml +++ b/examples/monai-2D-mednist/client/python_env.yaml @@ -2,11 +2,11 @@ name: monai-2d-mdnist build_dependencies: - pip - setuptools - - wheel==0.37.1 + - wheel dependencies: - torch==2.2.1 - torchvision==0.17.1 - - fedn==0.9.0 + - fedn - monai-weekly[pillow, tqdm] - - scikit-learn - - tensorboard + - numpy==1.26.4 + - scikit-learn diff --git a/examples/monai-2D-mednist/client/train.py b/examples/monai-2D-mednist/client/train.py index e3cb235c0..2ee922865 100644 --- a/examples/monai-2D-mednist/client/train.py +++ b/examples/monai-2D-mednist/client/train.py @@ -22,7 +22,6 @@ dir_path = os.path.dirname(os.path.realpath(__file__)) sys.path.append(os.path.abspath(dir_path)) - train_transforms = Compose( [ LoadImage(image_only=True), @@ -54,18 +53,16 @@ def train(in_model_path, out_model_path, data_path=None, client_settings_path=No if client_settings_path is None: client_settings_path = os.environ.get("FEDN_CLIENT_SETTINGS_PATH", dir_path + "/client_settings.yaml") - print("client_settings_path: ", client_settings_path) with open(client_settings_path, "r") as fh: # Used by CJG for local training try: client_settings = dict(yaml.safe_load(fh)) except yaml.YAMLError: raise - print("client settings: ", client_settings) batch_size = client_settings["batch_size"] max_epochs = client_settings["local_epochs"] num_workers = client_settings["num_workers"] - split_index = client_settings["split_index"] + split_index = os.environ.get("FEDN_DATA_SPLIT_INDEX") lr = client_settings["lr"] if data_path is None: @@ -76,8 +73,7 @@ def train(in_model_path, out_model_path, data_path=None, client_settings_path=No image_list = clients["client " + str(split_index)]["train"] - train_ds = MedNISTDataset(data_path="data/MedNIST", transforms=train_transforms, image_files=image_list) - + train_ds = MedNISTDataset(data_path=data_path+"/MedNIST/", transforms=train_transforms, image_files=image_list) train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, num_workers=num_workers) # Load parmeters and initialize model diff --git a/examples/monai-2D-mednist/client/validate.py b/examples/monai-2D-mednist/client/validate.py index 74292c34f..ff4eb9263 100644 --- a/examples/monai-2D-mednist/client/validate.py +++ b/examples/monai-2D-mednist/client/validate.py @@ -45,7 +45,7 @@ def validate(in_model_path, out_json_path, data_path=None, client_settings_path= num_workers = client_settings["num_workers"] batch_size = client_settings["batch_size"] - split_index = client_settings["split_index"] + split_index = os.environ.get("FEDN_DATA_SPLIT_INDEX") if data_path is None: data_path = os.environ.get("FEDN_DATA_PATH") @@ -55,7 +55,7 @@ def validate(in_model_path, out_json_path, data_path=None, client_settings_path= image_list = clients["client " + str(split_index)]["validation"] - val_ds = MedNISTDataset(data_path="data/MedNIST", transforms=val_transforms, image_files=image_list) + val_ds = MedNISTDataset(data_path=data_path+"/MedNIST/", transforms=val_transforms, image_files=image_list) val_loader = DataLoader(val_ds, batch_size=batch_size, shuffle=True, num_workers=num_workers) diff --git a/examples/monai-2D-mednist/client_settings.yaml b/examples/monai-2D-mednist/client_settings.yaml index f7bccb303..468c78802 100644 --- a/examples/monai-2D-mednist/client_settings.yaml +++ b/examples/monai-2D-mednist/client_settings.yaml @@ -1,6 +1,5 @@ lr: 0.01 -batch_size: 32 -local_epochs: 10 +batch_size: 8 +local_epochs: 1 num_workers: 1 sample_size: 30 -split_index: 4 diff --git a/examples/monai-2D-mednist/docker-compose.override.yaml b/examples/monai-2D-mednist/docker-compose.override.yaml index afeaf1437..88fda24d8 100644 --- a/examples/monai-2D-mednist/docker-compose.override.yaml +++ b/examples/monai-2D-mednist/docker-compose.override.yaml @@ -15,13 +15,15 @@ services: service: client environment: <<: *defaults - FEDN_DATA_PATH: /app/package/client/data/MedNIST - FEDN_CLIENT_SETTINGS_PATH: /app/client_settings.yaml + FEDN_DATA_PATH: /app/data/MedNIST + FEDN_CLIENT_SETTINGS_PATH: /app/client_settings.yaml + FEDN_DATA_SPLIT_INDEX: 0 deploy: replicas: 1 volumes: - ${HOST_REPO_DIR:-.}/fedn:/app/fedn - - ${HOST_REPO_DIR:-.}/examples/monai-2D-mednist/client_settings.yaml:/app/client_settings.yaml + - ${HOST_REPO_DIR:-.}/examples/monai-2D-mednist/client_settings.yaml:/app/client_settings.yaml + - ${HOST_REPO_DIR:-.}/examples/monai-2D-mednist/data:/app/data client2: extends: @@ -29,8 +31,12 @@ services: service: client environment: <<: *defaults - FEDN_DATA_PATH: /app/package/client/data/MedNIST + FEDN_DATA_PATH: /app/data/MedNIST + FEDN_CLIENT_SETTINGS_PATH: /app/client_settings.yaml + FEDN_DATA_SPLIT_INDEX: 1 deploy: replicas: 1 volumes: - ${HOST_REPO_DIR:-.}/fedn:/app/fedn + - ${HOST_REPO_DIR:-.}/examples/monai-2D-mednist/client_settings.yaml:/app/client_settings.yaml + - ${HOST_REPO_DIR:-.}/examples/monai-2D-mednist/data:/app/data diff --git a/examples/monai-2D-mednist/prepare_data.py b/examples/monai-2D-mednist/prepare_data.py new file mode 100644 index 000000000..80c083549 --- /dev/null +++ b/examples/monai-2D-mednist/prepare_data.py @@ -0,0 +1,66 @@ +import os +import sys +import numpy as np + +import yaml +from monai.apps import download_and_extract + + +def split_data(data_path="data/MedNIST", splits=100, validation_split=0.9): + # create clients + clients = {"client " + str(i): {"train": [], "validation": []} for i in range(splits)} + print("splits: ", splits) + for class_ in os.listdir(data_path): + if os.path.isdir(os.path.join(data_path, class_)): + patients_in_class = [os.path.join(class_, patient) for patient in os.listdir(os.path.join(data_path, class_))] + np.random.shuffle(patients_in_class) + chops = np.int32(np.linspace(0, len(patients_in_class), splits + 1)) + for split in range(splits): + p = patients_in_class[chops[split] : chops[split + 1]] + + valsplit = np.int32(len(p) * validation_split) + + clients["client " + str(split)]["train"] += p[:valsplit] + clients["client " + str(split)]["validation"] += p[valsplit:] + + if split == 0: + print("len p: ", len(p)) + print("valsplit: ", valsplit) + print("p[:valsplit]: ", p[:valsplit]) + print("p[valsplit:]: ", p[valsplit:]) + + with open(os.path.join(os.path.dirname(data_path), "data_splits.yaml"), "w") as file: + yaml.dump(clients, file, default_flow_style=False) + + +def get_data(out_dir="data", data_splits=10): + """Get data from the external repository. + + :param out_dir: Path to data directory. If doesn't + :type data_dir: str + """ + # Make dir if necessary + if not os.path.exists(out_dir): + os.mkdir(out_dir) + + resource = "https://github.com/Project-MONAI/MONAI-extra-test-data/releases/download/0.8.1/MedNIST.tar.gz" + md5 = "0bc7306e7427e00ad1c5526a6677552d" + + compressed_file = os.path.join(out_dir, "MedNIST.tar.gz") + + data_dir = os.path.abspath(out_dir) + print("data_dir:", data_dir) + if os.path.exists(data_dir): + print("path exist.") + if not os.path.exists(compressed_file): + print("compressed file does not exist, downloading and extracting data.") + download_and_extract(resource, compressed_file, data_dir, md5) + else: + print("files already exist.") + + split_data(splits=data_splits) + + +if __name__ == "__main__": + # Prepare data if not already done + get_data(data_splits=int(sys.argv[1])) diff --git a/examples/monai-2D-mednist/requirements.txt b/examples/monai-2D-mednist/requirements.txt new file mode 100644 index 000000000..0e2857824 --- /dev/null +++ b/examples/monai-2D-mednist/requirements.txt @@ -0,0 +1,3 @@ +monai +PyYAML +numpy==1.26.4 \ No newline at end of file