From be011db067803769f9f88d97aa4099bf45fc96ee Mon Sep 17 00:00:00 2001
From: Wei Ji <23487320+weiji14@users.noreply.github.com>
Date: Wed, 24 Jul 2024 10:53:30 +1200
Subject: [PATCH 1/6] Add pre-commit hook for codespell

Automate typo-finding using
[codespell](https://github.com/codespell-project/codespell/tree/v2.3.0?tab=readme-ov-file#pre-commit-hook).
Add LINZ to the ignore list as a start.
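
For anyone trying this out, the hook should be runnable locally with
something like the following (a quick sketch, assuming pre-commit is
already installed):

```bash
pre-commit install                    # run the hooks on every `git commit`
pre-commit run codespell --all-files  # one-off scan of the whole repo
```

When invoked directly as plain `codespell`, the tool should also pick
up the `[tool.codespell]` settings from pyproject.toml.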
---
 .pre-commit-config.yaml | 4 ++++
 pyproject.toml          | 2 ++
 2 files changed, 6 insertions(+)
 create mode 100644 pyproject.toml

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 5c54a585..3bd568ec 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,6 +1,10 @@
 # See https://pre-commit.com for more information
 # See https://pre-commit.com/hooks.html for more hooks
 repos:
+- repo: https://github.com/codespell-project/codespell
+  rev: v2.3.0
+  hooks:
+  - id: codespell
 - repo: https://github.com/pre-commit/pre-commit-hooks
   rev: v4.6.0
   hooks:
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 00000000..5f8d2662
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,2 @@
+[tool.codespell]
+ignore-words-list = "linz"

From 8bbe0549ffdcdec0c7c832861d2c1d993acbad89 Mon Sep 17 00:00:00 2001
From: Wei Ji <23487320+weiji14@users.noreply.github.com>
Date: Wed, 24 Jul 2024 11:48:26 +1200
Subject: [PATCH 2/6] Fix typos in Markdown files
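
These were flagged by the new codespell hook. As a rough sketch, fixes
like this can be generated with codespell itself (each suggestion still
needs manual review, since codespell can misfire):

```bash
codespell docs/                  # list suspected typos under docs/
codespell --write-changes docs/  # apply the suggested fixes in place
```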
---
 docs/clay-v0/data_labels.md          | 2 +-
 docs/clay-v0/model_finetuning.md     | 6 +++---
 docs/clay-v0/run_region.md           | 4 ++--
 docs/clay-v0/specification-v0.md     | 2 +-
 docs/finetune/classify.md            | 4 ++--
 docs/finetune/regression.md          | 2 +-
 docs/release-notes/changelog-v1.0.md | 2 +-
 7 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/docs/clay-v0/data_labels.md b/docs/clay-v0/data_labels.md
index be097d7b..61fcd1ad 100644
--- a/docs/clay-v0/data_labels.md
+++ b/docs/clay-v0/data_labels.md
@@ -26,7 +26,7 @@ then followed the same `datacube` creation logic to generate datacubes with
 Sentinel-1 VV and VH and the Copernicus Digital Elevation Model (DEM). We also
 ensured that the Sentinel-1 data was within a +/- 3 day interval of each
 reference Sentinel-2 scene (same method used by the benchmark dataset authors)
-and that the Sentinel-1 data was indeed already included in the bechmark
+and that the Sentinel-1 data was indeed already included in the benchmark
 dataset's list of granules. The datacubes generated have all three inputs
 matching the exact specs of the Foundation model's training data, at 512x512
 pixels.
diff --git a/docs/clay-v0/model_finetuning.md b/docs/clay-v0/model_finetuning.md
index 13cd046b..e16cb2cd 100644
--- a/docs/clay-v0/model_finetuning.md
+++ b/docs/clay-v0/model_finetuning.md
@@ -2,7 +2,7 @@
 Fine-tuning refers to a process in machine learning where a pre-trained model
 is further trained on a specific dataset to adapt its parameters to a
-downstream task characterized by a relevent domain. It's distinct from training
+downstream task characterized by a relevant domain. It's distinct from training
 a model from scratch using the downstream task dataset exclusively.
 
 Related to finetuning in the field of training Foundation models is linear
@@ -21,7 +21,7 @@ the Foundation model both during its pre-training and afterwards.
 Let's take a look at how we are finetuning on the benchmark datacube-adapted
 [Cloud to Street - Microsoft Flood Dataset](https://beta.source.coop/repositories/c2sms/c2smsfloods).
 As a reminder, that is a downstream
-segmentation task for identifiying water pixels in recorded flood events. It's
+segmentation task for identifying water pixels in recorded flood events. It's
 a binary segmentation problem, specifically.
 
 We process the datacubes into batches formatted in the way the pretrained Clay
@@ -150,7 +150,7 @@ segmentation problem, and on the predictions, we run sigmoid and max functions
 to obtain final segmentation results.
 
 The way we measure relative performance between the finetuned and
-"from scratch" model variants happens through calculation of evalution metrics
+"from scratch" model variants happens through calculation of evaluation metrics
 common for segmentation, such as Dice coefficient, Intersection over Union,
 F1 score, precision and recall.
diff --git a/docs/clay-v0/run_region.md b/docs/clay-v0/run_region.md
index cb9d5826..6a3994d4 100644
--- a/docs/clay-v0/run_region.md
+++ b/docs/clay-v0/run_region.md
@@ -3,7 +3,7 @@
 This section shows in a few simple steps how the clay model can be run for
 custom AOIs and over custom date ranges.
 
-## Prepare folder strucutre for data
+## Prepare folder structure for data
 
 ```bash
 # Move into the model repository
@@ -87,7 +87,7 @@ outside of the AOI specified.
 
 To speed up processing in the example below, we use the subset argument to
 reduce each MGRS tile to a small pixel window. When subsetting, the script
-will only download a fraction of each MGRS tile. This will lead to discontinous
+will only download a fraction of each MGRS tile. This will lead to discontinuous
 datasets and should not be used in a real use case. Remove the subset argument
 when using the script for a real world application, where all the data should
 be downloaded for each MGRS tile.
diff --git a/docs/clay-v0/specification-v0.md b/docs/clay-v0/specification-v0.md
index cbaa4370..8525c1ad 100644
--- a/docs/clay-v0/specification-v0.md
+++ b/docs/clay-v0/specification-v0.md
@@ -19,7 +19,7 @@ The model was trained on AWS on 4 NVIDIA A10G GPUs for 25 epochs (~14h per epoch
 
 Model weights are available on HuggingFace [here](https://huggingface.co/made-with-clay/Clay/).
 
-We also generated embeddings for all trainning data, which can be found on Source Cooperative [here](https://source.coop/).
+We also generated embeddings for all training data, which can be found on Source Cooperative [here](https://source.coop/).
 
 ## Model Architecture
diff --git a/docs/finetune/classify.md b/docs/finetune/classify.md
index 9a698d04..ac334a41 100644
--- a/docs/finetune/classify.md
+++ b/docs/finetune/classify.md
@@ -20,7 +20,7 @@ The `Classifier` class is designed for classification tasks, utilizing the Clay
 
 In this example, we will use the `Classifier` class to classify images from the [EuroSAT MS dataset](https://github.com/phelber/EuroSAT). The implementation includes data preprocessing, data loading, and model training workflow using [PyTorch Lightning](https://lightning.ai/) & [TorchGeo](https://github.com/microsoft/torchgeo).
 
-In this example we freeze the Clay encoder and only train a very simple 2 layer MLP head for classification. The MLP head recieves as input the Clay class token embedding, which already contains the essence of the image as seen by Clay. The model for classification can then be kept very simple while still guaranteeing high quality results.
+In this example we freeze the Clay encoder and only train a very simple 2 layer MLP head for classification. The MLP head receives as input the Clay class token embedding, which already contains the essence of the image as seen by Clay. The model for classification can then be kept very simple while still guaranteeing high quality results.
 
 Notice that the EuroSAT dataset comes without date stamps or location information. The Clay model requires encoded versions of a date stamp and a latitude and longitude information. These values can be set to zero if they are not available, which is what we are doing in the datamodule script.
@@ -72,7 +72,7 @@ data/ds
 ```
 
-### Training the Classifcation Head
+### Training the Classification Head
 
 The model can be run via LightningCLI using configurations in `finetune/classify/configs/classify_eurosat.yaml`.
diff --git a/docs/finetune/regression.md b/docs/finetune/regression.md
index 3b0bd9fa..c72c2419 100644
--- a/docs/finetune/regression.md
+++ b/docs/finetune/regression.md
@@ -157,7 +157,7 @@ Compressed: 729766400
 
 This will take the average of all timesteps available for each tile. The
 time steps for Sentinel-2 are not complete, not all months are
-provided for all tiles. In addtion, the Clay model does not take time
+provided for all tiles. In addition, the Clay model does not take time
 series as input. So aggregating the time element is simplifying but
 ok for the purpose of this example.
diff --git a/docs/release-notes/changelog-v1.0.md b/docs/release-notes/changelog-v1.0.md
index 11169407..f8de7c0d 100644
--- a/docs/release-notes/changelog-v1.0.md
+++ b/docs/release-notes/changelog-v1.0.md
@@ -37,7 +37,7 @@
 * Shorten comment line length by @yellowcap in https://github.com/Clay-foundation/model/pull/261
 * Refactor docs by moving v0 docs into separate section by @yellowcap in https://github.com/Clay-foundation/model/pull/262
 * Docs v1 continued by @yellowcap in https://github.com/Clay-foundation/model/pull/263
-* Documented metadata file for normalization and wavelenghts by @yellowcap in https://github.com/Clay-foundation/model/pull/266
+* Documented metadata file for normalization and wavelengths by @yellowcap in https://github.com/Clay-foundation/model/pull/266
 * [small change] add source.coop link by @brunosan in https://github.com/Clay-foundation/model/pull/137
 * Segmentation on Clay by @srmsoumya in https://github.com/Clay-foundation/model/pull/257

From 4fe4f46746d06eb7448a3485fff6a962494c58e4 Mon Sep 17 00:00:00 2001
From: Wei Ji <23487320+weiji14@users.noreply.github.com>
Date: Wed, 24 Jul 2024 11:48:54 +1200
Subject: [PATCH 3/6] Fix typos in .py files

---
 finetune/classify/classify.py     | 2 +-
 finetune/regression/regression.py | 2 +-
 finetune/segment/segment.py       | 2 +-
 trainer.py                        | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/finetune/classify/classify.py b/finetune/classify/classify.py
index 352afd15..34b5cb7b 100644
--- a/finetune/classify/classify.py
+++ b/finetune/classify/classify.py
@@ -19,7 +19,7 @@
 # %%
 def cli_main():
     """
-    Command-line inteface to run Clasifier model with EuroSATDataModule.
+    Command-line interface to run Classifier model with EuroSATDataModule.
     """
     cli = LightningCLI(EuroSATClassifier, EuroSATDataModule)
     return cli
diff --git a/finetune/regression/regression.py b/finetune/regression/regression.py
index 7ea8ff7e..f691c73d 100644
--- a/finetune/regression/regression.py
+++ b/finetune/regression/regression.py
@@ -19,7 +19,7 @@
 # %%
 def cli_main():
     """
-    Command-line inteface to run Regression with BioMastersDataModule.
+    Command-line interface to run Regression with BioMastersDataModule.
     """
     cli = LightningCLI(
         BioMastersClassifier,
diff --git a/finetune/segment/segment.py b/finetune/segment/segment.py
index 7531b4d8..f24bfe94 100644
--- a/finetune/segment/segment.py
+++ b/finetune/segment/segment.py
@@ -19,7 +19,7 @@
 # %%
 def cli_main():
     """
-    Command-line inteface to run Segmentation Model with ChesapeakeDataModule.
+    Command-line interface to run Segmentation Model with ChesapeakeDataModule.
     """
     cli = LightningCLI(ChesapeakeSegmentor, ChesapeakeDataModule)
     return cli
diff --git a/trainer.py b/trainer.py
index 986574e8..002a24a6 100644
--- a/trainer.py
+++ b/trainer.py
@@ -19,7 +19,7 @@
 # %%
 def cli_main():
     """
-    Command-line inteface to run ClayMAE with ClayDataModule.
+    Command-line interface to run ClayMAE with ClayDataModule.
     """
     cli = LightningCLI(save_config_kwargs={"overwrite": True})
     return cli

From 8cc77653c38bfe30a915b3d1e9a8223b254971e3 Mon Sep 17 00:00:00 2001
From: Wei Ji <23487320+weiji14@users.noreply.github.com>
Date: Wed, 24 Jul 2024 11:56:31 +1200
Subject: [PATCH 4/6] Fix typos in .ipynb files

---
 docs/clay-v0/clay-v0-location-embeddings.ipynb | 4 ++--
 docs/clay-v0/patch_level_cloud_cover.ipynb     | 2 +-
 docs/finetune/finetune-on-embeddings.ipynb     | 2 +-
 docs/tutorials/clay-v1-wall-to-wall.ipynb      | 6 +++---
 4 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/docs/clay-v0/clay-v0-location-embeddings.ipynb b/docs/clay-v0/clay-v0-location-embeddings.ipynb
index ad412cde..ada98b27 100644
--- a/docs/clay-v0/clay-v0-location-embeddings.ipynb
+++ b/docs/clay-v0/clay-v0-location-embeddings.ipynb
@@ -223,7 +223,7 @@
    "id": "3384f479-ef84-420d-a4e9-e3b038f05497",
    "metadata": {},
    "source": [
-    "> Latitude & Longitude map to 768 dimentional vector"
+    "> Latitude & Longitude map to 768 dimensional vector"
    ]
   },
   {
@@ -231,7 +231,7 @@
    "id": "9e419fc9-e7d3-49de-a8ea-72912c365510",
    "metadata": {},
    "source": [
-    "## Preform PCA over the location embeddings to visualize them in 2 dimension"
+    "## Perform PCA over the location embeddings to visualize them in 2 dimensions"
    ]
   },
   {
diff --git a/docs/clay-v0/patch_level_cloud_cover.ipynb b/docs/clay-v0/patch_level_cloud_cover.ipynb
index af970b35..303177bb 100644
--- a/docs/clay-v0/patch_level_cloud_cover.ipynb
+++ b/docs/clay-v0/patch_level_cloud_cover.ipynb
@@ -698,7 +698,7 @@
    "id": "bd3d1cc1-9d79-4059-a1f6-4ac8cf4d2e51",
    "metadata": {},
    "source": [
-    "#### Set up filtered searchs"
+    "#### Set up filtered searches"
    ]
   },
   {
diff --git a/docs/finetune/finetune-on-embeddings.ipynb b/docs/finetune/finetune-on-embeddings.ipynb
index 39dde776..aa9ebf62 100644
--- a/docs/finetune/finetune-on-embeddings.ipynb
+++ b/docs/finetune/finetune-on-embeddings.ipynb
@@ -365,7 +365,7 @@
     "\n",
     "### Choose your example\n",
     "\n",
-    "In the following cell, choose which set of training points to use. The input shoudl be a point dataset\n",
+    "In the following cell, choose which set of training points to use. The input should be a point dataset\n",
     "with a `class` column, containing `1` for positive examples, and `0` for negative examples.\n",
     "\n",
     "Use your own dataset or use one of the two provided ones."
diff --git a/docs/tutorials/clay-v1-wall-to-wall.ipynb b/docs/tutorials/clay-v1-wall-to-wall.ipynb
index d22b4615..1aec37eb 100644
--- a/docs/tutorials/clay-v1-wall-to-wall.ipynb
+++ b/docs/tutorials/clay-v1-wall-to-wall.ipynb
@@ -15,7 +15,7 @@
     "3. Load the model checkpoint\n",
     "4. Prepare data into a format for the model\n",
     "5. Run the model on the imagery\n",
-    "6. Analyise the model embeddings output using PCA\n",
+    "6. Analyse the model embeddings output using PCA\n",
     "7. Train a Support Vector Machines fine tuning head"
    ]
   },
@@ -333,7 +333,7 @@
    "source": [
     "### Prepare band metadata for passing it to the model\n",
     "\n",
-    "This is the most technical part so far. We will take the information in the stack of imagery and convert it into the formate that the model requires. This includes converting the lat/lon and the date of the imagery into normalized values.\n",
+    "This is the most technical part so far. We will take the information in the stack of imagery and convert it into the format that the model requires. This includes converting the lat/lon and the date of the imagery into normalized values.\n",
     "\n",
     "The Clay model will accept any band combination in any order, from different platforms. But for this the model needs to know the wavelength of each band that is passed to it, and normalization parameters for each band as well. It will use that to normalize the data and to interpret each band based on its central wavelength.\n",
     "\n",
@@ -374,7 +374,7 @@
    "source": [
     "### Convert the band pixel data in to the format for the model\n",
     "\n",
-    "We will take the information in the stack of imagery and convert it into the formate that the model requires. This includes converting the lat/lon and the date of the imagery into normalized values."
+    "We will take the information in the stack of imagery and convert it into the format that the model requires. This includes converting the lat/lon and the date of the imagery into normalized values."
    ]
   },
   {

From 35ff6a89605a97aa4d3b20b8f7fdf38467108388 Mon Sep 17 00:00:00 2001
From: Wei Ji <23487320+weiji14@users.noreply.github.com>
Date: Wed, 24 Jul 2024 12:38:03 +1200
Subject: [PATCH 5/6] Ignore some words from LICENSE_MODEL.md and
 CODE_OF_CONDUCT.md

Add a couple of words from these legal documents to the ignore list.
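
For reference, the equivalent one-off check on the command line would
look something like this (illustrative, not part of this change):

```bash
codespell --ignore-words-list="linz,socio-economic,therefrom"
```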
""" cli = LightningCLI( BioMastersClassifier, diff --git a/finetune/segment/segment.py b/finetune/segment/segment.py index 7531b4d8..f24bfe94 100644 --- a/finetune/segment/segment.py +++ b/finetune/segment/segment.py @@ -19,7 +19,7 @@ # %% def cli_main(): """ - Command-line inteface to run Segmentation Model with ChesapeakeDataModule. + Command-line interface to run Segmentation Model with ChesapeakeDataModule. """ cli = LightningCLI(ChesapeakeSegmentor, ChesapeakeDataModule) return cli diff --git a/trainer.py b/trainer.py index 986574e8..002a24a6 100644 --- a/trainer.py +++ b/trainer.py @@ -19,7 +19,7 @@ # %% def cli_main(): """ - Command-line inteface to run ClayMAE with ClayDataModule. + Command-line interface to run ClayMAE with ClayDataModule. """ cli = LightningCLI(save_config_kwargs={"overwrite": True}) return cli From 8cc77653c38bfe30a915b3d1e9a8223b254971e3 Mon Sep 17 00:00:00 2001 From: Wei Ji <23487320+weiji14@users.noreply.github.com> Date: Wed, 24 Jul 2024 11:56:31 +1200 Subject: [PATCH 4/6] Fix typos in .ipynb files --- docs/clay-v0/clay-v0-location-embeddings.ipynb | 4 ++-- docs/clay-v0/patch_level_cloud_cover.ipynb | 2 +- docs/finetune/finetune-on-embeddings.ipynb | 2 +- docs/tutorials/clay-v1-wall-to-wall.ipynb | 6 +++--- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/docs/clay-v0/clay-v0-location-embeddings.ipynb b/docs/clay-v0/clay-v0-location-embeddings.ipynb index ad412cde..ada98b27 100644 --- a/docs/clay-v0/clay-v0-location-embeddings.ipynb +++ b/docs/clay-v0/clay-v0-location-embeddings.ipynb @@ -223,7 +223,7 @@ "id": "3384f479-ef84-420d-a4e9-e3b038f05497", "metadata": {}, "source": [ - "> Latitude & Longitude map to 768 dimentional vector" + "> Latitude & Longitude map to 768 dimensional vector" ] }, { @@ -231,7 +231,7 @@ "id": "9e419fc9-e7d3-49de-a8ea-72912c365510", "metadata": {}, "source": [ - "## Preform PCA over the location embeddings to visualize them in 2 dimension" + "## Perform PCA over the location embeddings to visualize them in 2 dimension" ] }, { diff --git a/docs/clay-v0/patch_level_cloud_cover.ipynb b/docs/clay-v0/patch_level_cloud_cover.ipynb index af970b35..303177bb 100644 --- a/docs/clay-v0/patch_level_cloud_cover.ipynb +++ b/docs/clay-v0/patch_level_cloud_cover.ipynb @@ -698,7 +698,7 @@ "id": "bd3d1cc1-9d79-4059-a1f6-4ac8cf4d2e51", "metadata": {}, "source": [ - "#### Set up filtered searchs" + "#### Set up filtered searches" ] }, { diff --git a/docs/finetune/finetune-on-embeddings.ipynb b/docs/finetune/finetune-on-embeddings.ipynb index 39dde776..aa9ebf62 100644 --- a/docs/finetune/finetune-on-embeddings.ipynb +++ b/docs/finetune/finetune-on-embeddings.ipynb @@ -365,7 +365,7 @@ "\n", "### Choose your example\n", "\n", - "In the following cell, choose which set of training points to use. The input shoudl be a point dataset\n", + "In the following cell, choose which set of training points to use. The input should be a point dataset\n", "with a `class` column, containing `1` for positive examples, and `0` for negative examples.\n", "\n", "Use your own dataset or use one of the two provided ones." diff --git a/docs/tutorials/clay-v1-wall-to-wall.ipynb b/docs/tutorials/clay-v1-wall-to-wall.ipynb index d22b4615..1aec37eb 100644 --- a/docs/tutorials/clay-v1-wall-to-wall.ipynb +++ b/docs/tutorials/clay-v1-wall-to-wall.ipynb @@ -15,7 +15,7 @@ "3. Load the model checkpoint\n", "4. Prepare data into a format for the model\n", "5. Run the model on the imagery\n", - "6. Analyise the model embeddings output using PCA\n", + "6. 
---
 pyproject.toml | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/pyproject.toml b/pyproject.toml
index 81e086b9..0feabc2a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,3 +4,8 @@ ignore-words-list = [
     "socio-economic",
     "therefrom",
 ]
+skip = [
+    "docs/clay-v0/tutorial_digital_earth_pacific_patch_level.ipynb",
+    "docs/clay-v0/partial-inputs.ipynb",
+    "docs/tutorials/v1-inference-simsearch-naip-stacchip.ipynb",
+]