Add the "decollage" process for raw microscope output to the package #22

Merged · 12 commits · Aug 27, 2024
184 changes: 184 additions & 0 deletions cyto_ml/data/decollage.py
@@ -0,0 +1,184 @@
# Decollages flowcam images (breaks one large tiff into many small ones)
# Attempts to extract coordinate, date and depth information encoded in the filename
# Adds those properties to the resulting output as EXIF headers
# The file path argument points to the FlowCam data folder, which contains the collage .tifs and the .lst file
# Originally adapted from https://sarigiering.co/posts/extract-individual-particle-images-from-flowcam/
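# An illustrative session folder layout, based on the test fixtures in this PR (not prescriptive):
#   MicrobialMethane_MESO_Tank10_54.0143_-2.7770_04052023_1/
#     *_images_000001.tif  - collage images
#     metadata.lst         - FlowCam index of vignette positions and sizes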
import argparse
import logging
import os
import re
import glob

import pandas as pd
import numpy as np
from skimage.io import imread, imsave
from exiftool import ExifToolHelper
from exiftool.exceptions import ExifToolExecuteError

logging.basicConfig(level=logging.INFO)


def lst_metadata(filename: str) -> pd.DataFrame:
"""
Read the csv-ish ".lst" file from the FlowCam export
Return a pandas dataframe
"""
# The .lst header block lists the field names in its "num-fields" column;
# skip the first line and read the header rows to recover the column names
heads = pd.read_csv(filename, sep="|", nrows=53, skiprows=1)
colNames = list(heads["num-fields"])
# The data itself starts after the header block
meta = pd.read_csv(filename, sep="|", skiprows=55, header=None)
meta.columns = colNames
return meta


def window_slice(
image: np.ndarray, x: int, y: int, height: int, width: int
) -> np.ndarray:
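# Crop a single vignette out of the larger collage image; for example, the
# unit test checks that window_slice(img, 5, 5, 25, 50) returns a crop of
# shape (25, 50, 3) for an RGB collage.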
return image[y : y + height, x : x + width] # noqa: E203


def headers_from_filename(filename: str) -> dict:
"""Attempt to extract lon/lat and date, option of depth, from filename
Return a dict with key-value pairs for use as EXIF headers
"""
headers = {}
pattern = r"_(-?\d+\.\d+)_(-?\d+\.\d+)_(\d{8})(?:_(\d+))?"

match = re.search(pattern, filename)
if match:
lat, lon, date, depth = match.groups()
# https://exiftool.org/TagNames/GPS.html
headers["GPSLatitude"] = lat
headers["GPSLongitude"] = lon
headers["DateTimeOriginal"] = (
date # better to leave as date than pad with zero hours?
)
# TODO most depth matches will be spurious, what are the rules (refer to Kelly?
headers["GPSAltitude"] = (
depth # can we use negative altitude as bathymetric depth?
)
return headers


def write_headers(filename: str, headers: dict) -> bool:
"""
Given a dictionary of EXIF tag keys and their values, write to filename
Returns True if nothing has obviously gone wrong during this process
"""
result = None
try:
with ExifToolHelper() as et:
et.set_tags([filename], tags=headers, params=["-P", "-overwrite_original"])
result = True
# TODO try failures, observe them
except ExifToolExecuteError as err:
logging.warning(err)
result = False
return result


def read_headers(filename: str) -> list:
# Note: ExifToolHelper.get_metadata returns a list of dicts (one per file),
# which is why callers index the result, e.g. meta[0] in the tests
meta = []
with ExifToolHelper() as et:
meta = et.get_metadata(filename)
return meta


class FlowCamSession:
"""
Bundle up all the logic of the decollage script so it can be run
without passing commandline arguments
"""

def __init__(self, directory: str, output_directory: str, experiment_name: str):
"""Implements the work of the decollage script:

directory - path to a directory containing all the images for a FlowCam session
output_directory - path to a directory to write the single images, create if needed
experiment_name - a tag to use on the image files, could be superfluous
"""
self.directory = directory
self.output_directory = output_directory
self.experiment_name = experiment_name

self.read_metadata()
self.output_dir()

self.do_decollage()

def read_metadata(self) -> None:
self.metadata = {}

files = glob.glob(f"{self.directory}/*.lst")
logging.info(files)

if len(files) == 0:
raise FileNotFoundError("no lst file in this directory")
else:
self.metadata = lst_metadata(files[0])

def output_dir(self):
# create a folder to save the output into, if it doesn't already exist
if not os.path.exists(self.output_directory):
os.mkdir(self.output_directory)

def do_decollage(self):
"""Not very lovely single function that replaces the work of the script."""
# Reasonably assume that all images in a session have same spatio-temporal metadata
# extract the coords, date, possibly depth from directory name
collage_headers = headers_from_filename(self.directory)

# decollage - rather than traverse the index and keep rereading large images,
# filter by filename first and traverse that way, should speed up a lot
for collage_file in self.metadata.collage_file.unique():

collage = imread(f"{self.directory}/{collage_file}")

df = self.metadata[self.metadata.collage_file == collage_file]

for i in df.index:
# extract vignette
height = df["image_h"][i]
width = df["image_w"][i]
img_sub = window_slice(
collage,
df["image_x"][i],
df["image_y"][i],
height,
width,
)
# write EXIF metadata into the headers
headers = collage_headers
headers["ImageWidth"] = width
headers["ImageHeight"] = height

# save vignette to the output folder (self.output_directory, created above)
# we probably need to write to the filesystem to then use exiftool
output_file = (
f"{self.output_directory}/{self.experiment_name}_{i}.tif"
)
imsave(output_file, img_sub)
write_headers(output_file, headers)


if __name__ == "__main__":

parser = argparse.ArgumentParser(
prog="FlowCam_DeCollager",
description="Decollages flow cam images. requires pandas (pip install pandas) and cv2 (pip install opencv-python).", # noqa: E501
)
parser.add_argument(
"filePath",
help="path to the flowcam data file which contains collage .tifs and an .lst file",
)
parser.add_argument("experimentName", help="name to append to each decollaged file")
args = parser.parse_args()

# Run the decollage process for a whole session
FlowCamSession(args.filePath, f"{args.filePath}/decollage", args.experimentName)

# TODO consider squirting the output straight into the object store API

# TODO decide whether to do anything with the analytic metadata (circularity etc)
# We could pop it into a sqlite store at this stage, but want the file linkages
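
As a quick illustration (not part of the diff), the session can also be driven directly from Python; the directory below is hypothetical and only needs to contain the collage .tifs and the .lst file:

from cyto_ml.data.decollage import FlowCamSession

# hypothetical FlowCam export folder and experiment tag
session_dir = "data/MicrobialMethane_MESO_Tank10_54.0143_-2.7770_04052023_1"
FlowCamSession(session_dir, f"{session_dir}/decollage", "MicrobialMethane_MESO_Tank10")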
47 changes: 42 additions & 5 deletions cyto_ml/tests/conftest.py
@@ -1,4 +1,5 @@
import os
import shutil
import pytest
from cyto_ml.models.scivision import (
load_model,
@@ -8,13 +9,19 @@


@pytest.fixture
def image_dir():
def fixture_dir():
"""
Existing directory of images
Base directory for the test fixtures (images, metadata)
"""
return os.path.join(
os.path.abspath(os.path.dirname(__file__)), "fixtures/test_images/"
)
return os.path.join(os.path.abspath(os.path.dirname(__file__)), "../../fixtures/")


@pytest.fixture
def image_dir(fixture_dir):
"""
Directory with single plankton images
"""
return os.path.join(fixture_dir, "test_images")


@pytest.fixture
@@ -43,3 +50,33 @@ def env_endpoint():
if endpoint and "https" not in endpoint:
endpoint = None
return endpoint


@pytest.fixture
def lst_file(fixture_dir):
"""Location of a metadata file for a FlowCam image batch"""
return os.path.join(
fixture_dir,
"MicrobialMethane_MESO_Tank10_54.0143_-2.7770_04052023_1/metadata.lst",
)


@pytest.fixture
def collage_file(fixture_dir):
"""Location of a collage file with a FlowCam image batch"""
return os.path.join(
fixture_dir,
"MicrobialMethane_MESO_Tank10_54.0143_-2.7770_04052023_1/MicrobialMethane_MESO_Tank10_54.0143_-2.7770_04052023_1_images_000001.tif", # noqa: E501
) # noqa: E501


@pytest.fixture
def exiftest_file(fixture_dir):
"""This runs in-place so make a copy of the file every time"""
orig = os.path.join(
fixture_dir,
"MicrobialMethane_MESO_Tank10_54.0143_-2.7770_04052023_1/exiftest.tif",
)
temp = orig.replace("exiftest", "temp")
shutil.copyfile(orig, temp)
return temp
41 changes: 41 additions & 0 deletions cyto_ml/tests/test_decollage.py
@@ -0,0 +1,41 @@
import pandas as pd
from skimage.io import imread
from cyto_ml.data.decollage import (
lst_metadata,
window_slice,
headers_from_filename,
write_headers,
read_headers,
)


def test_lst_metadata(lst_file):
df = lst_metadata(lst_file)
assert isinstance(df, pd.DataFrame)


def test_window_slice(collage_file):
img = imread(collage_file)
win = window_slice(img, 5, 5, 25, 50)
assert win.shape == (25, 50, 3)


def test_headers_from_filename(collage_file):
h = headers_from_filename(collage_file)
assert "GPSLatitude" in h and h["GPSLatitude"]


def test_write_headers(exiftest_file):
# Check we don't have a tagged version from a previous run

tags = read_headers(exiftest_file)
# read_headers returns a list with one dict per file
assert "EXIF:GPSLatitude" not in tags[0]

# Note this has to be a "valid" EXIF header
write_headers(exiftest_file, {"GPSLatitude": "42"})

meta = read_headers(exiftest_file)
print(meta[0].keys())
assert "EXIF:GPSLatitude" in meta[0]

meta = write_headers("nonexistent_file.tif", {"GPSLatitude": "42"})
3 changes: 2 additions & 1 deletion environment.yml
@@ -5,7 +5,7 @@ channels:
- defaults
dependencies:
- python=3.9
- pytorch=1.10.0
- pytorch=1.10.0 # pinned here for the scivision model
- mkl=2024.0
- chromadb=0.5.3
- intake-xarray
@@ -18,5 +18,6 @@
- jupytext
- pip
- pip:
- pyexiftool
- scivision
- git+https://github.com/alan-turing-institute/plankton-cefas-scivision@main
Binary file not shown.