From a12ce6ed3d9eba0fc739418e50231d8be0aaf85f Mon Sep 17 00:00:00 2001 From: jadie1 Date: Mon, 6 Nov 2023 13:06:03 -0700 Subject: [PATCH] Addresing issue 2072 --- ...tting-started-with-data-augmentation.ipynb | 26 +++++++++---------- .../DataAugmentationUtils/__init__.py | 9 ++++++- docs/deep-learning/data-augmentation.md | 24 +++++++++++++---- ...tting-started-with-data-augmentation.ipynb | 24 ++++++++--------- 4 files changed, 50 insertions(+), 33 deletions(-) diff --git a/Examples/Python/notebooks/tutorials/getting-started-with-data-augmentation.ipynb b/Examples/Python/notebooks/tutorials/getting-started-with-data-augmentation.ipynb index feb840b165..0a400b7f17 100644 --- a/Examples/Python/notebooks/tutorials/getting-started-with-data-augmentation.ipynb +++ b/Examples/Python/notebooks/tutorials/getting-started-with-data-augmentation.ipynb @@ -131,11 +131,11 @@ "\n", "# Get particles path list\n", "model_dir = data_dir + \"shape_models/femur/1024/\" \n", - "local_particle_list = []\n", + "world_particle_list = []\n", "for file in os.listdir(model_dir):\n", - " if \"local\" in file:\n", - " local_particle_list.append(model_dir + file)\n", - "local_particle_list = sorted(local_particle_list)\n", + " if \"world\" in file:\n", + " world_particle_list.append(model_dir + file)\n", + "world_particle_list = sorted(world_particle_list)\n", "\n", "print(\"Total shapes in original dataset: \"+ str(len(img_list)))" ] @@ -150,21 +150,19 @@ "\n", "```python\n", "DataAugmentationUtils.runDataAugmentation(out_dir, img_list, \n", - " local_point_list, num_samples, \n", + " world_point_list, num_samples, \n", " num_dim, percent_variability, \n", - " sampler_type, mixture_num,\n", - " world_point_list)\n", + " sampler_type, mixture_num)\n", "```\n", "**Input arguments:**\n", "\n", "* `out_dir`: Path to the directory where augmented data will be stored\n", "* `img_list`: List of paths to images of the original dataset.\n", - "* `local_point_list`: List of paths to local `.particles` 
files of the original dataset. Note, this list should be ordered in correspondence with the `img_list`.\n", + "* `world_point_list`: List of paths to world `.particles` files of the original dataset. Note, this list should be ordered in correspondence with the `img_list`.\n", "* `num_dim`: The number of dimensions to reduce to in PCA embedding. If zero or not specified, the percent_variability option is used to select the numnber of dimensions.\n", "* `percent_variability`: The proportion of variability in the data to be preserved in embedding. Used if `num_dim` is zero or not specified. Default value is 0.95 which preserves 95% of the varibaility in the data.\n", "* `sampler_type`: The type of parametric distribution to fit and sample from. Options: `gaussian`, `mixture`, or `kde`. Default: `kde`.\n", "* `mixture_num`: Only necessary if `sampler_type` is `mixture`. The number of clusters (i.e., mixture components) to be used in fitting a mixture model. If zero or not specified, the optimal number of clusters will be automatically determined using the [elbow method](https://en.wikipedia.org/wiki/Elbow_method_(clustering)).\n", - "* `world_point_list`: List of paths to world `.particles` files of the original dataset. This is optional and should be provided in cases where procrustes was used for the original optimization, resulting in a difference between world and local particle files. 
Note, this list should be ordered in correspondence with the `img_list` and `local_point_list`.\n", "\n", "\n", "In this notebook we will keep most arguments the same and explore the effect of changing the `sampler_type`.\n", @@ -191,7 +189,7 @@ "source": [ "output_directory = '../Output/GaussianAugmentation/'\n", "sampler_type = \"gaussian\"\n", - "embedded_dim = DataAugmentationUtils.runDataAugmentation(output_directory, img_list, local_particle_list, num_samples, num_dim, percent_variability, sampler_type)\n", + "embedded_dim = DataAugmentationUtils.runDataAugmentation(output_directory, img_list, world_particle_list, num_samples, num_dim, percent_variability, sampler_type)\n", "aug_data_csv = output_directory + \"/TotalData.csv\"" ] }, @@ -245,7 +243,7 @@ "source": [ "output_directory = '../Output/MixtureAugmentation/'\n", "sampler_type = \"mixture\"\n", - "embedded_dim = DataAugmentationUtils.runDataAugmentation(output_directory, img_list, local_particle_list, num_samples, num_dim, percent_variability, sampler_type)\n", + "embedded_dim = DataAugmentationUtils.runDataAugmentation(output_directory, img_list, world_particle_list, num_samples, num_dim, percent_variability, sampler_type)\n", "aug_data_csv = output_directory + \"/TotalData.csv\"" ] }, @@ -288,7 +286,7 @@ "source": [ "output_directory = '../Output/KDEAugmentation/'\n", "sampler_type = \"kde\"\n", - "embedded_dim = DataAugmentationUtils.runDataAugmentation(output_directory, img_list, local_particle_list, num_samples, num_dim, percent_variability, sampler_type)\n", + "embedded_dim = DataAugmentationUtils.runDataAugmentation(output_directory, img_list, world_particle_list, num_samples, num_dim, percent_variability, sampler_type)\n", "aug_data_csv = output_directory + \"/TotalData.csv\"" ] }, @@ -319,7 +317,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -333,7 +331,7 @@ "name": "python", 
"nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.8" + "version": "3.9.13" }, "toc": { "base_numbering": 1, diff --git a/Python/DataAugmentationUtilsPackage/DataAugmentationUtils/__init__.py b/Python/DataAugmentationUtilsPackage/DataAugmentationUtils/__init__.py index eb6eefb5d5..b35875880a 100644 --- a/Python/DataAugmentationUtilsPackage/DataAugmentationUtils/__init__.py +++ b/Python/DataAugmentationUtilsPackage/DataAugmentationUtils/__init__.py @@ -17,12 +17,19 @@ ''' -def runDataAugmentation(out_dir, img_list, local_point_list, num_samples=3, num_dim=0, percent_variability=0.95, sampler_type="KDE", mixture_num=0, processes=1, world_point_list=None): +def runDataAugmentation(out_dir, img_list, world_point_list, num_samples=3, num_dim=0, percent_variability=0.95, sampler_type="KDE", mixture_num=0, processes=1): + sw_message("Running point based data augmentation.") + num_dim = DataAugmentation.point_based_aug(out_dir, img_list, world_point_list, num_samples, num_dim, percent_variability, sampler_type, mixture_num, processes) + sw_message("Done.") + return num_dim + +def runLocalDataAugmentation(out_dir, img_list, local_point_list, world_point_list, num_samples=3, num_dim=0, percent_variability=0.95, sampler_type="KDE", mixture_num=0, processes=1): sw_message("Running point based data augmentation.") num_dim = DataAugmentation.point_based_aug(out_dir, img_list, local_point_list, num_samples, num_dim, percent_variability, sampler_type, mixture_num, processes, world_point_list) sw_message("Done.") return num_dim + def visualizeAugmentation(data_csv, viz_type='splom', show=True): if viz_type == 'splom': Visualize.splom(data_csv) diff --git a/docs/deep-learning/data-augmentation.md b/docs/deep-learning/data-augmentation.md index c147b339be..c0eec701c8 100644 --- a/docs/deep-learning/data-augmentation.md +++ b/docs/deep-learning/data-augmentation.md @@ -25,23 +25,37 @@ To run the complete data augmentation process as detailed in [Data 
Augmentation ```python DataAugmentationUtils.runDataAugmentation(out_dir, img_list, - local_point_list, num_samples, + world_point_list, num_samples, num_dim, percent_variability, - sampler_type, mixture_num, - world_point_list) + sampler_type, mixture_num) ``` +This generates image/particle pairs in the world coordinate system and assumes the images in `img_list` are groomed/aligned. + +Local image/particle pairs can also be generated using: + +```python +DataAugmentationUtils.runLocalDataAugmentation(out_dir, img_list, + local_point_list, world_point_list, + num_samples, num_dim, percent_variability, + sampler_type, mixture_num) +``` +This generates image/particle pairs in the local coordinate system and assumes the images in `img_list` are the original/unaligned images. The `world_point_list` needs to be provided in this case so that PCA is done in the world coordinate system. New samples are generated by sampling the world PCA subspace, then mapping it to local points using the transform from world to local of the closest real example. + + **Input arguments:** * `out_dir`: Path to the directory where augmented data will be stored * `img_list`: List of paths to images of the original dataset. -* `local_point_list`: List of paths to local `.particles` files of the original dataset. Note, this list should be ordered in correspondence with the `img_list`. +* `world_point_list`: List of paths to the world `.particles` files of the original dataset. Note, this list should be ordered in correspondence with the `img_list`. * `num_dim`: The number of dimensions to reduce to in PCA embedding. If zero or not specified, the percent_variability option is used to select the numnber of dimensions. * `percent_variability`: The proportion of variability in the data to be preserved in embedding. Used if `num_dim` is zero or not specified. Default value is 0.95 which preserves 95% of the varibaility in the data. 
* `sampler_type`: The type of parametric distribution to fit and sample from. Options: `gaussian`, `mixture`, or `kde`. Default: `kde`. * `mixture_num`: Only necessary if `sampler_type` is `mixture`. The number of clusters (i.e., mixture components) to be used in fitting a mixture model. If zero or not specified, the optimal number of clusters will be automatically determined using the [elbow method](https://en.wikipedia.org/wiki/Elbow_method_(clustering)). -* `world_point_list`: List of paths to world `.particles` files of the original dataset. This is optional and should be provided in cases where procrustes was used for the original optimization, resulting in a difference between world and local particle files. Note, this list should be ordered in correspondence with the `img_list` and `local_point_list`. + +For `runLocalDataAugmentation()` the following argument must also be provided: +* `local_point_list`: List of paths to local `.particles` files of the original dataset. Note, this list should be ordered in correspondence with the `img_list` and `world_point_list`. 
### Visualizing Data Augmentation diff --git a/docs/notebooks/getting-started-with-data-augmentation.ipynb b/docs/notebooks/getting-started-with-data-augmentation.ipynb index 81c37f2e80..3221e04270 100644 --- a/docs/notebooks/getting-started-with-data-augmentation.ipynb +++ b/docs/notebooks/getting-started-with-data-augmentation.ipynb @@ -131,11 +131,11 @@ "\n", "# Get particles path list\n", "model_dir = data_dir + \"shape_models/femur/1024/\" \n", - "local_particle_list = []\n", + "world_particle_list = []\n", "for file in os.listdir(model_dir):\n", - " if \"local\" in file:\n", - " local_particle_list.append(model_dir + file)\n", - "local_particle_list = sorted(local_particle_list)\n", + " if \"world\" in file:\n", + " world_particle_list.append(model_dir + file)\n", + "world_particle_list = sorted(world_particle_list)\n", "\n", "print(\"Total shapes in original dataset: \"+ str(len(img_list)))" ] @@ -150,21 +150,19 @@ "\n", "```python\n", "DataAugmentationUtils.runDataAugmentation(out_dir, img_list, \n", - " local_point_list, num_samples, \n", + " world_point_list, num_samples, \n", " num_dim, percent_variability, \n", - " sampler_type, mixture_num,\n", - " world_point_list)\n", + " sampler_type, mixture_num)\n", "```\n", "**Input arguments:**\n", "\n", "* `out_dir`: Path to the directory where augmented data will be stored\n", "* `img_list`: List of paths to images of the original dataset.\n", - "* `local_point_list`: List of paths to local `.particles` files of the original dataset. Note, this list should be ordered in correspondence with the `img_list`.\n", + "* `world_point_list`: List of paths to world `.particles` files of the original dataset. Note, this list should be ordered in correspondence with the `img_list`.\n", "* `num_dim`: The number of dimensions to reduce to in PCA embedding. 
If zero or not specified, the percent_variability option is used to select the numnber of dimensions.\n", "* `percent_variability`: The proportion of variability in the data to be preserved in embedding. Used if `num_dim` is zero or not specified. Default value is 0.95 which preserves 95% of the varibaility in the data.\n", "* `sampler_type`: The type of parametric distribution to fit and sample from. Options: `gaussian`, `mixture`, or `kde`. Default: `kde`.\n", "* `mixture_num`: Only necessary if `sampler_type` is `mixture`. The number of clusters (i.e., mixture components) to be used in fitting a mixture model. If zero or not specified, the optimal number of clusters will be automatically determined using the [elbow method](https://en.wikipedia.org/wiki/Elbow_method_(clustering)).\n", - "* `world_point_list`: List of paths to world `.particles` files of the original dataset. This is optional and should be provided in cases where procrustes was used for the original optimization, resulting in a difference between world and local particle files. 
Note, this list should be ordered in correspondence with the `img_list` and `local_point_list`.\n", "\n", "\n", "In this notebook we will keep most arguments the same and explore the effect of changing the `sampler_type`.\n", @@ -191,7 +189,7 @@ "source": [ "output_directory = '../Output/GaussianAugmentation/'\n", "sampler_type = \"gaussian\"\n", - "embedded_dim = DataAugmentationUtils.runDataAugmentation(output_directory, img_list, local_particle_list, num_samples, num_dim, percent_variability, sampler_type)\n", + "embedded_dim = DataAugmentationUtils.runDataAugmentation(output_directory, img_list, world_particle_list, num_samples, num_dim, percent_variability, sampler_type)\n", "aug_data_csv = output_directory + \"/TotalData.csv\"" ] }, @@ -245,7 +243,7 @@ "source": [ "output_directory = '../Output/MixtureAugmentation/'\n", "sampler_type = \"mixture\"\n", - "embedded_dim = DataAugmentationUtils.runDataAugmentation(output_directory, img_list, local_particle_list, num_samples, num_dim, percent_variability, sampler_type)\n", + "embedded_dim = DataAugmentationUtils.runDataAugmentation(output_directory, img_list, world_particle_list, num_samples, num_dim, percent_variability, sampler_type)\n", "aug_data_csv = output_directory + \"/TotalData.csv\"" ] }, @@ -319,7 +317,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -333,7 +331,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.8" + "version": "3.9.13" }, "toc": { "base_numbering": 1,