allow annotations_file to be None, for use in data preprocessing

weecology · Feb 7, 2024 · 144854b · 144854b
1 parent 66fc43e
commit 144854b
Show file tree

Hide file tree

Showing 3 changed files with 96 additions and 44 deletions.
diff --git a/HISTORY.rst b/HISTORY.rst
@@ -2,6 +2,10 @@
 DeepForest Change Log
 =====================
 
+**1.3.3**
+
+* Allow annotations_file to be None in split_raster, for use in data preprocessing.
+
 **1.3.0**
 
 * Removed IoU_Callback to better align with pytorch-lightning API, see https://github.com/Lightning-AI/pytorch-lightning/issues/19101

diff --git a/deepforest/preprocess.py b/deepforest/preprocess.py
@@ -122,23 +122,38 @@ def select_annotations(annotations, windows, index, allow_empty=False):
 
 
 def save_crop(base_dir, image_name, index, crop):
-    """Save window crop as image file to be read by PIL.
+    """
+    Save window crop as an image file to be read by PIL.
+
+    Args:
+        base_dir (str): The base directory to save the image file.
+        image_name (str): The name of the original image.
+        index (int): The index of the window crop.
+        crop (numpy.ndarray): The window crop as a NumPy array.
 
-    Filename should match the image_name + window index
+    Returns:
+        str: The filename of the saved image.
     """
-    # create dir if needed
+    # Create directory if needed
     if not os.path.exists(base_dir):
         os.makedirs(base_dir)
 
+    # Convert NumPy array to PIL image
     im = Image.fromarray(crop)
+
+    # Extract the basename of the image
     image_basename = os.path.splitext(image_name)[0]
+
+    # Generate the filename for the saved image
     filename = "{}/{}_{}.png".format(base_dir, image_basename, index)
+
+    # Save the image
     im.save(filename)
 
     return filename
 
 
-def split_raster(annotations_file,
+def split_raster(annotations_file=None,
                  path_to_raster=None,
                  numpy_image=None,
                  base_dir=None,
@@ -153,18 +168,18 @@ def split_raster(annotations_file,
     Args:
         numpy_image: a numpy object to be used as a raster, usually opened from rasterio.open.read(), in order (height, width, channels)
         path_to_raster: (str): Path to a tile that can be read by rasterio on disk
-        annotations_file (str or pd.DataFrame): A pandas dataframe or path to annotations csv file. In the format -> image_path, xmin, ymin, xmax, ymax, label
+        annotations_file (str, pd.DataFrame or None): A pandas dataframe or path to annotations csv file to transform to cropped images. In the format -> image_path, xmin, ymin, xmax, ymax, label. If None, allow_empty is ignored and the function will only return the filenames of the cropped images.
         save_dir (str): Directory to save images
         base_dir (str): Directory to save images
         patch_size (int): Maximum dimensions of square window
         patch_overlap (float): Percent of overlap among windows 0->1
         allow_empty: If True, include images with no annotations
-            to be included in the dataset
+            to be included in the dataset. If annotations_file is None, this is ignored.
         image_name (str): If numpy_image arg is used, what name to give the raster?
 
     Returns:
-        A pandas dataframe with annotations file for training. 
-        A copy of this file is written to save_dir as a side effect.
+        If annotations_file is provided, a pandas dataframe with annotations file for training. A copy of this file is written to save_dir as a side effect.
+        If not, a list of filenames of the cropped images.
     """
     # Set deprecation warning for base_dir and set to save_dir
     if base_dir:
@@ -225,37 +240,41 @@ def split_raster(annotations_file,
         image_name = os.path.basename(path_to_raster)
 
     # Load annotations file and coerce dtype
-    if type(annotations_file) == str:
+    if annotations_file is None:
+        allow_empty = True
+    elif type(annotations_file) == str:
         annotations = pd.read_csv(annotations_file)
     elif type(annotations_file) == pd.DataFrame:
         annotations = annotations_file
     else:
         raise TypeError(
-            "annotations file must either by a path or a pd.Dataframe, found {}".format(
-                type(annotations_file)))
+            "annotations file must either be None, a path or a pd.Dataframe, found {}".
+            format(type(annotations_file)))
 
-    # open annotations file
-    image_annotations = annotations[annotations.image_path == image_name]
+    # Select matching annotations
+    if annotations_file is not None:
+        image_annotations = annotations[annotations.image_path == image_name]
 
     # Sanity checks
-    if image_annotations.empty:
-        raise ValueError(
-            "No image names match between the file:{} and the image_path: {}. "
-            "Reminder that image paths should be the relative "
-            "path (e.g. 'image_name.tif'), not the full path "
-            "(e.g. path/to/dir/image_name.tif)".format(annotations_file, image_name))
-
-    if not all([
-            x in annotations.columns
-            for x in ["image_path", "xmin", "ymin", "xmax", "ymax", "label"]
-    ]):
-        raise ValueError("Annotations file has {} columns, should have "
-                         "format image_path, xmin, ymin, xmax, ymax, label".format(
-                             annotations.shape[1]))
+    if not allow_empty:
+        if image_annotations.empty:
+            raise ValueError(
+                "No image names match between the file:{} and the image_path: {}. "
+                "Reminder that image paths should be the relative "
+                "path (e.g. 'image_name.tif'), not the full path "
+                "(e.g. path/to/dir/image_name.tif)".format(annotations_file, image_name))
+
+        if not all([
+                x in annotations.columns
+                for x in ["image_path", "xmin", "ymin", "xmax", "ymax", "label"]
+        ]):
+            raise ValueError("Annotations file has {} columns, should have "
+                             "format image_path, xmin, ymin, xmax, ymax, label".format(
+                                 annotations.shape[1]))
 
     annotations_files = []
+    crop_filenames = []
     for index, window in enumerate(windows):
-
         # Crop image
         crop = numpy_image[windows[index].indices()]
 
@@ -264,28 +283,37 @@ def split_raster(annotations_file,
             continue
 
         # Find annotations, image_name is the basename of the path
-        crop_annotations = select_annotations(image_annotations, windows, index,
-                                              allow_empty)
+        if annotations_file is not None:
+            crop_annotations = select_annotations(image_annotations, windows, index,
+                                                  allow_empty)
+        else:
+            crop_annotations = None
 
         # If empty images not allowed, select annotations returns None
         if crop_annotations is not None:
             # save annotations
             annotations_files.append(crop_annotations)
 
-            # save image crop
-            save_crop(save_dir, image_name, index, crop)
-    if len(annotations_files) == 0:
-        raise ValueError(
-            "Input file has no overlapping annotations and allow_empty is {}".format(
-                allow_empty))
+        # save image crop
+        if allow_empty or crop_annotations is not None:
+            crop_filename = save_crop(save_dir, image_name, index, crop)
+            crop_filenames.append(crop_filename)
 
-    annotations_files = pd.concat(annotations_files)
+    if annotations_file is not None:
+        # Only concat annotations if they were supplied
+        if len(annotations_files) == 0:
+            raise ValueError(
+                "Input file has no overlapping annotations and allow_empty is {}".format(
+                    allow_empty))
 
-    # Checkpoint csv files, useful for parallelization
-    # Use filename of the raster path to save the annotations
-    image_basename = os.path.splitext(image_name)[0]
-    file_path = image_basename + ".csv"
-    file_path = os.path.join(save_dir, file_path)
-    annotations_files.to_csv(file_path, index=False, header=True)
+        annotations_files = pd.concat(annotations_files)
+
+        # Checkpoint csv files, useful for parallelization. The csv is named after the basename of the raster file.
+        image_basename = os.path.splitext(image_name)[0]
+        file_path = image_basename + ".csv"
+        file_path = os.path.join(save_dir, file_path)
+        annotations_files.to_csv(file_path, index=False, header=True)
 
-    return annotations_files
+        return annotations_files
+    else:
+        return crop_filenames
diff --git a/tests/test_preprocess.py b/tests/test_preprocess.py
@@ -99,6 +99,26 @@ def test_split_raster(config, tmpdir, input_type):
     assert not output_annotations.empty
     assert output_annotations.shape[1] == 6
 
+def test_split_raster_no_annotations(config, tmpdir):
+    """Split raster into crops without annotations; only the list of crop filenames is returned"""
+    raster = get_data("2019_YELL_2_528000_4978000_image_crop2.png")
+    annotations = utilities.xml_to_annotations(
+        get_data("2019_YELL_2_528000_4978000_image_crop2.xml"))
+    annotations.to_csv("{}/example.csv".format(tmpdir), index=False)
+
+    output_crops = preprocess.split_raster(path_to_raster=raster,
+                                           annotations_file=None,
+                                           base_dir=tmpdir,
+                                           patch_size=500,
+                                           patch_overlap=0)
+
+    # Returns a list of 25 crop filenames.
+    assert len(output_crops) == 25
+
+    # Assert that all output_crops exist
+    for crop in output_crops:
+        assert os.path.exists(crop)
+
 
 def test_split_raster_empty_crops(config, tmpdir):
     """Split raster into crops with overlaps to maintain all annotations, allow empty crops"""