Add the "decollage" process for raw microscope output to the package #22

Merged · 12 commits · Aug 27, 2024
184 changes: 184 additions & 0 deletions cyto_ml/data/decollage.py
@@ -0,0 +1,184 @@
# Decollages flowcam images (breaks one large tiff into many small ones)
# Attempts to extract coordinate, date and depth information encoded in the filename
# Adds those properties to the resulting output as EXIF headers
# The file path argument points to the FlowCam data folder, which contains the collage .tifs and the .lst file
# Originally adapted from https://sarigiering.co/posts/extract-individual-particle-images-from-flowcam/
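# An illustrative session folder layout, based on the test fixtures in this PR (not prescriptive):
#   MicrobialMethane_MESO_Tank10_54.0143_-2.7770_04052023_1/
#     *_images_000001.tif  - collage images
#     metadata.lst         - FlowCam index of vignette positions and sizes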
import argparse
import logging
import os
import re
import glob

import pandas as pd
import numpy as np
from skimage.io import imread, imsave
from exiftool import ExifToolHelper
from exiftool.exceptions import ExifToolExecuteError

logging.basicConfig(level=logging.INFO)


def lst_metadata(filename: str) -> pd.DataFrame:
"""
Read the csv-ish ".lst" file from the FlowCam export
Return a pandas dataframe
"""
# The .lst header block lists the field names in its "num-fields" column;
# skip the first line and read the header rows to recover the column names
heads = pd.read_csv(filename, sep="|", nrows=53, skiprows=1)
colNames = list(heads["num-fields"])
# The data itself starts after the header block
meta = pd.read_csv(filename, sep="|", skiprows=55, header=None)
meta.columns = colNames
return meta


def window_slice(
image: np.ndarray, x: int, y: int, height: int, width: int
) -> np.ndarray:
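# Crop a single vignette out of the larger collage image; for example, the
# unit test checks that window_slice(img, 5, 5, 25, 50) returns a crop of
# shape (25, 50, 3) for an RGB collage.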
return image[y : y + height, x : x + width] # noqa: E203


def headers_from_filename(filename: str) -> dict:
"""Attempt to extract lon/lat and date, option of depth, from filename
Return a dict with key-value pairs for use as EXIF headers
"""
headers = {}
pattern = r"_(-?\d+\.\d+)_(-?\d+\.\d+)_(\d{8})(?:_(\d+))?"

match = re.search(pattern, filename)
if match:
lat, lon, date, depth = match.groups()
# https://exiftool.org/TagNames/GPS.html
headers["GPSLatitude"] = lat
headers["GPSLongitude"] = lon
headers["DateTimeOriginal"] = (
date # better to leave as date than pad with zero hours?
)
# TODO most depth matches will be spurious, what are the rules (refer to Kelly?
headers["GPSAltitude"] = (
depth # can we use negative altitude as bathymetric depth?
)
return headers


def write_headers(filename: str, headers: dict) -> bool:
"""
Given a dictionary of EXIF tag keys and their values, write to filename
Returns True if nothing has obviously gone wrong during this process
"""
result = None
try:
with ExifToolHelper() as et:
et.set_tags([filename], tags=headers, params=["-P", "-overwrite_original"])
result = True
# TODO try failures, observe them
except ExifToolExecuteError as err:
logging.warning(err)
result = False
return result


def read_headers(filename: str) -> list:
# Note: ExifToolHelper.get_metadata returns a list of dicts (one per file),
# which is why callers index the result, e.g. meta[0] in the tests
meta = []
with ExifToolHelper() as et:
meta = et.get_metadata(filename)
return meta


class FlowCamSession:
"""
Bundle up all the logic of the decollage script so it can be run
without passing commandline arguments
"""

def __init__(self, directory: str, output_directory: str, experiment_name: str):
"""Implements the work of the decollage script:

directory - path to a directory containing all the images for a FlowCam session
output_directory - path to a directory to write the single images, create if needed
experiment_name - a tag to use on the image files, could be superfluous
"""
self.directory = directory
self.output_directory = output_directory
self.experiment_name = experiment_name

self.read_metadata()
self.output_dir()

self.do_decollage()

def read_metadata(self) -> None:
self.metadata = {}

files = glob.glob(f"{self.directory}/*.lst")
logging.info(files)

if len(files) == 0:
raise FileNotFoundError("no lst file in this directory")
else:
self.metadata = lst_metadata(files[0])

def output_dir(self):
# create a folder to save the output into, if it doesn't already exist
if not os.path.exists(self.output_directory):
os.mkdir(self.output_directory)

def do_decollage(self):
"""Not very lovely single function that replaces the work of the script."""
# Reasonably assume that all images in a session have same spatio-temporal metadata
# extract the coords, date, possibly depth from directory name
collage_headers = headers_from_filename(self.directory)

# decollage - rather than traverse the index and keep rereading large images,
# filter by filename first and traverse that way, should speed up a lot
for collage_file in self.metadata.collage_file.unique():

collage = imread(f"{self.directory}/{collage_file}")

df = self.metadata[self.metadata.collage_file == collage_file]

for i in df.index:
# extract vignette
height = df["image_h"][i]
width = df["image_w"][i]
img_sub = window_slice(
collage,
df["image_x"][i],
df["image_y"][i],
height,
width,
)
# write EXIF metadata into the headers
headers = collage_headers
headers["ImageWidth"] = width
headers["ImageHeight"] = height

# save vignette to the output folder (self.output_directory, created above)
# we probably need to write to the filesystem to then use exiftool
output_file = (
f"{self.output_directory}/{self.experiment_name}_{i}.tif"
)
imsave(output_file, img_sub)
write_headers(output_file, headers)


if __name__ == "__main__":

parser = argparse.ArgumentParser(
prog="FlowCam_DeCollager",
description="Decollages flow cam images. requires pandas (pip install pandas) and cv2 (pip install opencv-python).", # noqa: E501
)
parser.add_argument(
"filePath",
help="path to the flowcam data file which contains collage .tifs and an .lst file",
)
parser.add_argument("experimentName", help="name to append to each decollaged file")
args = parser.parse_args()

# Run the decollage process for a whole session
FlowCamSession(args.filePath, f"{args.filePath}/decollage", args.experimentName)

# TODO consider squirting the output straight into the object store API

# TODO decide whether to do anything with the analytic metadata (circularity etc)
# We could pop it into a sqlite store at this stage, but want the file linkages
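
As a quick illustration (not part of the diff), the session can also be driven directly from Python; the directory below is hypothetical and only needs to contain the collage .tifs and the .lst file:

from cyto_ml.data.decollage import FlowCamSession

# hypothetical FlowCam export folder and experiment tag
session_dir = "data/MicrobialMethane_MESO_Tank10_54.0143_-2.7770_04052023_1"
FlowCamSession(session_dir, f"{session_dir}/decollage", "MicrobialMethane_MESO_Tank10")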
47 changes: 42 additions & 5 deletions cyto_ml/tests/conftest.py
@@ -1,4 +1,5 @@
import os
import shutil
import pytest
from cyto_ml.models.scivision import (
load_model,
@@ -8,13 +9,19 @@


@pytest.fixture
def image_dir():
def fixture_dir():
"""
Existing directory of images
Base directory for the test fixtures (images, metadata)
"""
return os.path.join(
os.path.abspath(os.path.dirname(__file__)), "fixtures/test_images/"
)
return os.path.join(os.path.abspath(os.path.dirname(__file__)), "../../fixtures/")


@pytest.fixture
def image_dir(fixture_dir):
"""
Directory with single plankton images
"""
return os.path.join(fixture_dir, "test_images")


@pytest.fixture
@@ -43,3 +50,33 @@ def env_endpoint():
if endpoint and "https" not in endpoint:
endpoint = None
return endpoint


@pytest.fixture
def lst_file(fixture_dir):
"""Location of a metadata file for a FlowCam image batch"""
return os.path.join(
fixture_dir,
"MicrobialMethane_MESO_Tank10_54.0143_-2.7770_04052023_1/metadata.lst",
)


@pytest.fixture
def collage_file(fixture_dir):
"""Location of a collage file with a FlowCam image batch"""
return os.path.join(
fixture_dir,
"MicrobialMethane_MESO_Tank10_54.0143_-2.7770_04052023_1/MicrobialMethane_MESO_Tank10_54.0143_-2.7770_04052023_1_images_000001.tif", # noqa: E501
) # noqa: E501


@pytest.fixture
def exiftest_file(fixture_dir):
"""This runs in-place so make a copy of the file every time"""
orig = os.path.join(
fixture_dir,
"MicrobialMethane_MESO_Tank10_54.0143_-2.7770_04052023_1/exiftest.tif",
)
temp = orig.replace("exiftest", "temp")
shutil.copyfile(orig, temp)
return temp
41 changes: 41 additions & 0 deletions cyto_ml/tests/test_decollage.py
@@ -0,0 +1,41 @@
import pandas as pd
from skimage.io import imread
from cyto_ml.data.decollage import (
lst_metadata,
window_slice,
headers_from_filename,
write_headers,
read_headers,
)


def test_lst_metadata(lst_file):
df = lst_metadata(lst_file)
assert isinstance(df, pd.DataFrame)


def test_window_slice(collage_file):
img = imread(collage_file)
win = window_slice(img, 5, 5, 25, 50)
assert win.shape == (25, 50, 3)


def test_headers_from_filename(collage_file):
h = headers_from_filename(collage_file)
assert "GPSLatitude" in h and h["GPSLatitude"]


def test_write_headers(exiftest_file):
# Check we don't have a tagged version from a previous run

tags = read_headers(exiftest_file)
# read_headers returns a list with one dict per file
assert "EXIF:GPSLatitude" not in tags[0]

# Note this has to be a "valid" EXIF header
write_headers(exiftest_file, {"GPSLatitude": "42"})

meta = read_headers(exiftest_file)
print(meta[0].keys())
assert "EXIF:GPSLatitude" in meta[0]

meta = write_headers("nonexistent_file.tif", {"GPSLatitude": "42"})
3 changes: 2 additions & 1 deletion environment.yml
@@ -5,7 +5,7 @@ channels:
- defaults
dependencies:
- python=3.9
- pytorch=1.10.0
- pytorch=1.10.0 # pinned here for the scivision model
- mkl=2024.0
- chromadb=0.5.3
- intake-xarray
@@ -18,5 +18,6 @@
- jupytext
- pip
- pip:
- pyexiftool
- scivision
- git+https://github.com/alan-turing-institute/plankton-cefas-scivision@main
Binary file not shown.