From a57aecf63749dd131acdab8012ab8cf7dc34a25c Mon Sep 17 00:00:00 2001
From: gferraro
Date: Wed, 28 Aug 2024 09:12:33 +1200
Subject: [PATCH 001/117] tweak training

---
 src/autobuild.sh               | 10 +++++-----
 src/build.py                   |  2 +-
 src/ml_tools/thermaldataset.py |  8 ++++++++
 src/rebuildDate.py             | 20 ++++++++++++++------
 4 files changed, 28 insertions(+), 12 deletions(-)

diff --git a/src/autobuild.sh b/src/autobuild.sh
index ca7360a4..193a6e6a 100755
--- a/src/autobuild.sh
+++ b/src/autobuild.sh
@@ -3,9 +3,9 @@ set -e
 set -x
 
 config="classifier-thermal.yaml"
-month_ago=$(python3 rebuildDate.py -c $config)
+echo "Saving into $1"
+month_ago=$(python3 rebuildDate.py $1)
 echo $month_ago
-python3 ../../cptv-download/cptv-download.py -l 0 -i 'poor tracking' -i 'untagged' -i 'part' -i 'untagged-by-humans' -i 'unknown' -i 'unidentified' -m 'human-tagged' --start-date "$month_ago" "../clips$month_ago" useremail@email.com userpassword
-echo "Downloading into ../clips$month_ago"
-python3 load.py -target "../clips$month_ago" -c $config
-python3 build.py -c $config
+python3 ../../cptv-download/cptv-download.py -l 0 -i 'poor tracking' -i 'untagged' -i 'part' -i 'untagged-by-humans' -i 'unknown' -i 'unidentified' -m 'human-tagged' --start-date "$month_ago" "$1" useremail@email.com userpassword
+echo "Downloading into $1"
+python3 build.py -c $config --ext ".cptv" $1

diff --git a/src/build.py b/src/build.py
index 5c48a8c2..7b91dad8 100644
--- a/src/build.py
+++ b/src/build.py
@@ -57,7 +57,7 @@ def parse_args():
     )
     parser.add_argument("--split-file", help="Json file defining a split")
     parser.add_argument(
-        "--ext", default=".hdf5", help="Extension of files to load .mp4,.cptv,.hdf5"
+        "--ext", default=".cptv", help="Extension of files to load .mp4,.cptv,.hdf5"
     )
 
     parser.add_argument("-c", "--config-file", help="Path to config file to use")

diff --git a/src/ml_tools/thermaldataset.py b/src/ml_tools/thermaldataset.py
index 728b95f1..14125bc7 100644
--- a/src/ml_tools/thermaldataset.py
+++ b/src/ml_tools/thermaldataset.py
@@ -33,12 +33,20 @@ def get_excluded():
         "pest",
         "pig",
         "sealion",
+        "bat",
+        "mammal",
+        "frog",
     ]
 
 
 def get_remapped(multi_label=False):
     land_bird = "land-bird" if multi_label else "bird"
     return {
+        "echidna": "hedgehog",
+        "grey kangaroo": "wallaby",
+        "sambar deer": "deer",
+        "mouse": "rodent",
+        "rat": "rodent",
         "water": "false-positive",
         "insect": "false-positive",
         "allbirds": "bird",

diff --git a/src/rebuildDate.py b/src/rebuildDate.py
index dadf1a6a..7693842d 100644
--- a/src/rebuildDate.py
+++ b/src/rebuildDate.py
@@ -5,15 +5,23 @@
 from config.config import Config
 from datetime import timedelta
 from datetime import date
+from pathlib import Path
+from dateutil.parser import parse as parse_date
 
 parser = argparse.ArgumentParser()
-parser.add_argument("-c", "--config-file", help="Path to config file to use")
+parser.add_argument("data_dir", help="Directory of hdf5 files")
 args = parser.parse_args()
+args.data_dir = Path(args.data_dir)
+latest_date = None
+for db_clip in args.data_dir.glob(f"**/*.cptv"):
+    file_name = db_clip.name
+    hyphen = file_name.index("-")
+    date_s = file_name[hyphen + 1 : hyphen + 16]
+    cptv_dt = parse_date(date_s)
+    if latest_date is None or cptv_dt > latest_date:
+        latest_date = cptv_dt
 
-config = Config.load_from_file(args.config_file)
-db_file = os.path.join(config.tracks_folder, "dataset.hdf5")
-db = TrackDatabase(db_file)
-latest_date = db.latest_date()
-month_ago = latest_date - timedelta(days=30)
+
+month_ago = latest_date - timedelta(days=30 * 6)
 month_ago = month_ago.strftime("%Y-%m-%d 00:00:00")
 print(month_ago)

From ec115ab792228e69a695ac4a99066092f45cdcaf Mon Sep 17 00:00:00 2001
From: gferraro
Date: Wed, 28 Aug 2024 09:19:40 +1200
Subject: [PATCH 002/117] rain to fp

---
 src/autobuild.sh               | 2 ++
 src/ml_tools/thermaldataset.py | 1 +
 2 files changed, 3 insertions(+)

diff --git a/src/autobuild.sh b/src/autobuild.sh
index 193a6e6a..f81f4cc9 100755
--- a/src/autobuild.sh
+++ b/src/autobuild.sh
@@ -9,3 +9,5 @@ echo $month_ago
 python3 ../../cptv-download/cptv-download.py -l 0 -i 'poor tracking' -i 'untagged' -i 'part' -i 'untagged-by-humans' -i 'unknown' -i 'unidentified' -m 'human-tagged' --start-date "$month_ago" "$1" useremail@email.com userpassword
 echo "Downloading into $1"
 python3 build.py -c $config --ext ".cptv" $1
+dt=$(date '+%d%m%Y-%H%M%S');
+python3 train.py $dt
\ No newline at end of file

diff --git a/src/ml_tools/thermaldataset.py b/src/ml_tools/thermaldataset.py
index 14125bc7..a10b63b3 100644
--- a/src/ml_tools/thermaldataset.py
+++ b/src/ml_tools/thermaldataset.py
@@ -47,6 +47,7 @@ def get_remapped(multi_label=False):
         "sambar deer": "deer",
         "mouse": "rodent",
         "rat": "rodent",
+        "rain": "false-positive",
         "water": "false-positive",
         "insect": "false-positive",
         "allbirds": "bird",

From a5eb2eef1ab0adb192acf4a79bc8cf208739e86e Mon Sep 17 00:00:00 2001
From: gferraro
Date: Wed, 28 Aug 2024 09:21:48 +1200
Subject: [PATCH 003/117] xla

---
 src/autobuild.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/autobuild.sh b/src/autobuild.sh
index f81f4cc9..fe993483 100755
--- a/src/autobuild.sh
+++ b/src/autobuild.sh
@@ -10,4 +10,5 @@ python3 ../../cptv-download/cptv-download.py -l 0 -i 'poor tracking' -i 'untagged' -i 'part' -i 'untagged-by-humans' -i 'unknown' -i 'unidentified' -m 'human-tagged' --start-date "$month_ago" "$1" useremail@email.com userpassword
 echo "Downloading into $1"
 python3 build.py -c $config --ext ".cptv" $1
 dt=$(date '+%d%m%Y-%H%M%S');
+export XLA_FLAGS=--xla_gpu_cuda_data_dir=/home/cp/miniconda3/envs/tf/lib/
 python3 train.py $dt
\ No newline at end of file

From 26857b90cb1e25df2fb14d2055774360b6d5d2e6 Mon Sep 17 00:00:00 2001
From: gferraro
Date: Wed, 28 Aug 2024 09:22:24 +1200
Subject: [PATCH 004/117] add config

---
 src/autobuild.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/autobuild.sh b/src/autobuild.sh
index fe993483..69f4dfac 100755
--- a/src/autobuild.sh
+++ b/src/autobuild.sh
@@ -11,4 +11,4 @@ echo "Downloading into $1"
 python3 build.py -c $config --ext ".cptv" $1
 dt=$(date '+%d%m%Y-%H%M%S');
 export XLA_FLAGS=--xla_gpu_cuda_data_dir=/home/cp/miniconda3/envs/tf/lib/
-python3 train.py $dt
\ No newline at end of file
+python3 train.py -c $config $dt
\ No newline at end of file
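Patch 001 now infers the rebuild cutoff from the newest CPTV file name rather than from the track database. The slice arithmetic is easier to see on a concrete name; a minimal sketch, assuming names of the form <clipid>-<YYYYMMDD>-<HHMMSS>.cptv (the exact naming convention is an assumption here, it is not shown in the patch):

    # Hypothetical file name, for illustration only.
    from dateutil.parser import parse as parse_date

    file_name = "123456-20240828-091233.cptv"
    hyphen = file_name.index("-")                  # first hyphen, after the clip id
    date_s = file_name[hyphen + 1 : hyphen + 16]   # 15 chars: "20240828-091233"
    cptv_dt = parse_date(date_s)                   # -> 2024-08-28 09:12:33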
echo "Saving into $1" month_ago=$(python3 rebuildDate.py $1) From 79dccd1b7d445f0c6a743586e02dddd995f3c8fc Mon Sep 17 00:00:00 2001 From: gferraro Date: Sun, 15 Sep 2024 16:03:45 +0200 Subject: [PATCH 006/117] fix seg width --- src/ml_tools/datasetstructures.py | 2 +- src/ml_tools/thermaldataset.py | 27 +++++++++++++++++---------- src/ml_tools/thermalwriter.py | 10 ++++++---- 3 files changed, 24 insertions(+), 15 deletions(-) diff --git a/src/ml_tools/datasetstructures.py b/src/ml_tools/datasetstructures.py index a48e247f..c31c3159 100644 --- a/src/ml_tools/datasetstructures.py +++ b/src/ml_tools/datasetstructures.py @@ -922,8 +922,8 @@ def get_segments( track_id, start_frame, regions, - segment_frame_spacing=9, segment_width=25, + segment_frame_spacing=9, label=None, segment_min_mass=None, ffc_frames=[], diff --git a/src/ml_tools/thermaldataset.py b/src/ml_tools/thermaldataset.py index a10b63b3..8266685c 100644 --- a/src/ml_tools/thermaldataset.py +++ b/src/ml_tools/thermaldataset.py @@ -12,6 +12,7 @@ from ml_tools.featurenorms import mean_v, std_v from ml_tools.frame import TrackChannels +from pathlib import Path # seed = 1341 # tf.random.set_seed(seed) @@ -308,12 +309,13 @@ def tile_images(images): # test stuff def main(): init_logging() - config = Config.load_from_file() + config = Config.load_from_file("classifier-thermal.yaml") from .tfdataset import get_dataset, get_distribution # file = "/home/gp/cacophony/classifier-data/thermal-training/cp-training/training-meta.json" - file = f"{config.tracks_folder}/training-meta.json" - with open(file, "r") as f: + training_folder = Path(config.base_folder) / "training-data" + meta_f = training_folder / "training-meta.json" + with open(meta_f, "r") as f: meta = json.load(f) labels = meta.get("labels", []) datasets = [] @@ -321,7 +323,7 @@ def main(): resampled_ds, remapped, labels, epoch_size = get_dataset( # dir, load_dataset, - f"{config.tracks_folder}/training-data/test", + training_folder / "test", labels, batch_size=32, image_size=(160, 160), @@ -332,21 +334,24 @@ def main(): remapped_labels=get_remapped(), excluded_labels=get_excluded(), include_track=False, - num_frames=1, + num_frames=25, ) print("Ecpoh size is", epoch_size) - print(get_distribution(resampled_ds, len(labels), extra_meta=False)) + # print(get_distribution(resampled_ds, len(labels), extra_meta=False)) # return # - for e in range(2): + save_dir = Path("./test-images") + save_dir.mkdir(exist_ok=True) + for e in range(1): + batch_i = 0 print("epoch", e) for x, y in resampled_ds: - show_batch(x, y, labels) - + show_batch(x, y, labels, save=save_dir / f"{batch_i}.jpg") + batch_i += 1 # return -def show_batch(image_batch, label_batch, labels): +def show_batch(image_batch, label_batch, labels, save=None): plt.figure(figsize=(10, 10)) print("images in batch", len(image_batch), len(label_batch)) num_images = min(len(image_batch), 25) @@ -365,6 +370,8 @@ def show_batch(image_batch, label_batch, labels): plt.title(labels[np.argmax(label_batch[n])]) plt.axis("off") # return + if save: + plt.savefig(save) plt.show() diff --git a/src/ml_tools/thermalwriter.py b/src/ml_tools/thermalwriter.py index 8a3a290b..68dba9a8 100644 --- a/src/ml_tools/thermalwriter.py +++ b/src/ml_tools/thermalwriter.py @@ -216,10 +216,12 @@ def get_data(clip_samples, extra_args): # GP All assumes we dont have a track over multiple bins (Whcih we probably never want) if extra_args.get("use_segments", True): track.get_segments( - extra_args.get("segment_frame_spacing", 9), - extra_args.get("segment_width", 25), - 
extra_args.get("segment_type"), - extra_args.get("segment_min_avg_mass"), + segment_width=extra_args.get("segment_width", 25), + segment_frame_spacing=extra_args.get( + "segment_frame_spacing", 9 + ), + segment_type=extra_args.get("segment_type"), + segment_min_avg_mass=extra_args.get("segment_min_avg_mass"), max_segments=extra_args.get("max_segments"), dont_filter=extra_args.get("dont_filter_segment", False), skip_ffc=extra_args.get("skip_ffc", True), From 504e747f1966665624ff2b24f1bbea29cb520389 Mon Sep 17 00:00:00 2001 From: gferraro Date: Sun, 15 Sep 2024 16:09:07 +0200 Subject: [PATCH 007/117] fix name --- src/ml_tools/thermalwriter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ml_tools/thermalwriter.py b/src/ml_tools/thermalwriter.py index 68dba9a8..88f96066 100644 --- a/src/ml_tools/thermalwriter.py +++ b/src/ml_tools/thermalwriter.py @@ -221,7 +221,7 @@ def get_data(clip_samples, extra_args): "segment_frame_spacing", 9 ), segment_type=extra_args.get("segment_type"), - segment_min_avg_mass=extra_args.get("segment_min_avg_mass"), + segment_min_mass=extra_args.get("segment_min_avg_mass"), max_segments=extra_args.get("max_segments"), dont_filter=extra_args.get("dont_filter_segment", False), skip_ffc=extra_args.get("skip_ffc", True), From ddaf4fe5fed2beceaee5725ed3b75502f6c84e6f Mon Sep 17 00:00:00 2001 From: gferraro Date: Tue, 17 Sep 2024 15:15:15 +0200 Subject: [PATCH 008/117] debug --- src/ml_tools/thermaldataset.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/src/ml_tools/thermaldataset.py b/src/ml_tools/thermaldataset.py index 8266685c..55686c03 100644 --- a/src/ml_tools/thermaldataset.py +++ b/src/ml_tools/thermaldataset.py @@ -37,6 +37,11 @@ def get_excluded(): "bat", "mammal", "frog", + "grey kangaroo", + "sambar deer" "chicken", + "fox", + "cow", + "wombat", ] @@ -44,8 +49,8 @@ def get_remapped(multi_label=False): land_bird = "land-bird" if multi_label else "bird" return { "echidna": "hedgehog", - "grey kangaroo": "wallaby", - "sambar deer": "deer", + # "grey kangaroo": "wallaby", + # "sambar deer": "deer", "mouse": "rodent", "rat": "rodent", "rain": "false-positive", @@ -59,7 +64,7 @@ def get_remapped(multi_label=False): "pheasant": land_bird, "pukeko": land_bird, "quail": land_bird, - "chicken": land_bird, + # "chicken": land_bird, } From a0a67743a965d4fc4076ed9b0d1814901ca3abc8 Mon Sep 17 00:00:00 2001 From: gferraro Date: Tue, 17 Sep 2024 17:21:47 +0200 Subject: [PATCH 009/117] use rust binding --- src/ml_tools/rawdb.py | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/src/ml_tools/rawdb.py b/src/ml_tools/rawdb.py index 27175e23..9c34c149 100644 --- a/src/ml_tools/rawdb.py +++ b/src/ml_tools/rawdb.py @@ -20,7 +20,7 @@ from ml_tools.datasetstructures import TrackHeader, ClipHeader from track.track import Track from track.cliptrackextractor import is_affected_by_ffc -from cptv import CPTVReader +from cptv_rs_python_bindings import CptvReader from ml_tools.rectangle import Rectangle special_datasets = [ @@ -62,19 +62,23 @@ def load_frames(self): background = None tracker_version = self.meta_data.get("tracker_version") frame_i = 0 - with open(self.file, "rb") as f: - reader = CPTVReader(f) - for frame in reader: - if frame.background_frame: - background = frame.pix - # bug in previous tracker version where background was first frame - if tracker_version >= 10: - continue - ffc = is_affected_by_ffc(frame) - if ffc: - ffc_frames.append(frame_i) - 
cptv_frames.append(frame.pix) - frame_i += 1 + reader = CptvReader(str(self.file)) + header = reader.get_header() + while True: + frame = reader.next_frame() + if frame is None: + break + if frame.background_frame: + background = frame.pix + # bug in previous tracker version where background was first frame + if tracker_version >= 10: + continue + ffc = is_affected_by_ffc(frame) + if ffc: + print("GOT FFC") + ffc_frames.append(frame_i) + cptv_frames.append(frame.pix) + frame_i += 1 frames = np.uint16(cptv_frames) if background is None: background = np.mean(frames, axis=0) From 65d962f36a735a93e3c365aba8619419a0def674 Mon Sep 17 00:00:00 2001 From: gferraro Date: Tue, 17 Sep 2024 17:22:56 +0200 Subject: [PATCH 010/117] remove unneeded --- src/classify/clipclassifier.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/classify/clipclassifier.py b/src/classify/clipclassifier.py index 0600b49d..f9e239db 100644 --- a/src/classify/clipclassifier.py +++ b/src/classify/clipclassifier.py @@ -11,13 +11,8 @@ from track.clip import Clip from track.cliptrackextractor import ClipTrackExtractor, is_affected_by_ffc from ml_tools import tools -from ml_tools.kerasmodel import KerasModel from track.irtrackextractor import IRTrackExtractor from ml_tools.previewer import Previewer -from track.track import Track - -from cptv import CPTVReader -from datetime import datetime from ml_tools.interpreter import get_interpreter From 2586011e8f31d19dd92e9bb66e3e44ae25d0c963 Mon Sep 17 00:00:00 2001 From: gferraro Date: Tue, 17 Sep 2024 17:35:35 +0200 Subject: [PATCH 011/117] remove unneeded --- src/ml_tools/tfdataset.py | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/src/ml_tools/tfdataset.py b/src/ml_tools/tfdataset.py index f791ba07..437ba16c 100644 --- a/src/ml_tools/tfdataset.py +++ b/src/ml_tools/tfdataset.py @@ -61,18 +61,7 @@ def get_distribution(dataset, num_labels, batched=True, one_hot=True, extra_meta def get_dataset(load_function, base_dir, labels, **args): - land_birds = [ - "pukeko", - "california quail", - "brown quail", - "black swan", - "quail", - "pheasant", - "penguin", - "duck", - "chicken", - "rooster", - ] + excluded_labels = args.get("excluded_labels", []) to_remap = args.get("remapped_labels", {}) logging.info("Excluding %s", excluded_labels) @@ -193,7 +182,6 @@ def get_dataset(load_function, base_dir, labels, **args): dataset = dataset.take(epoch_size) else: epoch_size = 1 - dataset = dataset.prefetch(buffer_size=AUTOTUNE) batch_size = args.get("batch_size", None) if batch_size is not None: dataset = dataset.batch(batch_size) From a63e9a7058462273b0769f7716376b087b270b1d Mon Sep 17 00:00:00 2001 From: gferraro Date: Wed, 18 Sep 2024 16:32:54 +0200 Subject: [PATCH 012/117] avoid bad regions --- src/ml_tools/forestmodel.py | 2 +- src/ml_tools/rawdb.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/src/ml_tools/forestmodel.py b/src/ml_tools/forestmodel.py index a8ccd95a..c3e17a66 100644 --- a/src/ml_tools/forestmodel.py +++ b/src/ml_tools/forestmodel.py @@ -228,7 +228,7 @@ def forest_features( for i, frame in enumerate(track_frames): region = regions[i] - if region.blank or region.width == 0 or region.height == 0: + if region.blank or region.width > 0 or region.height > 0: prev_count = 0 continue diff --git a/src/ml_tools/rawdb.py b/src/ml_tools/rawdb.py index 9c34c149..d1d2d681 100644 --- a/src/ml_tools/rawdb.py +++ b/src/ml_tools/rawdb.py @@ -31,6 +31,8 @@ "overlay", ] +FPS = 9 + class RawDatabase: def __init__(self, 
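Patch 009 swaps the pure-Python CPTVReader iterator for the Rust bindings' pull-style API. A self-contained read loop using only the calls that appear in that diff (CptvReader, get_header, next_frame, frame.pix), as a sketch:

    from cptv_rs_python_bindings import CptvReader

    reader = CptvReader("clip.cptv")   # hypothetical path; the diff passes str(self.file)
    header = reader.get_header()
    frame_i = 0
    while True:
        frame = reader.next_frame()
        if frame is None:   # end of stream replaces the old for-loop protocol
            break
        pix = frame.pix     # raw thermal frame values
        frame_i += 1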
From a63e9a7058462273b0769f7716376b087b270b1d Mon Sep 17 00:00:00 2001
From: gferraro
Date: Wed, 18 Sep 2024 16:32:54 +0200
Subject: [PATCH 012/117] avoid bad regions

---
 src/ml_tools/forestmodel.py | 2 +-
 src/ml_tools/rawdb.py       | 2 ++
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/ml_tools/forestmodel.py b/src/ml_tools/forestmodel.py
index a8ccd95a..c3e17a66 100644
--- a/src/ml_tools/forestmodel.py
+++ b/src/ml_tools/forestmodel.py
@@ -228,7 +228,7 @@ def forest_features(
     for i, frame in enumerate(track_frames):
         region = regions[i]
-        if region.blank or region.width == 0 or region.height == 0:
+        if region.blank or region.width > 0 or region.height > 0:
             prev_count = 0
             continue
 
diff --git a/src/ml_tools/rawdb.py b/src/ml_tools/rawdb.py
index 9c34c149..d1d2d681 100644
--- a/src/ml_tools/rawdb.py
+++ b/src/ml_tools/rawdb.py
@@ -31,6 +31,8 @@
     "overlay",
 ]
 
+FPS = 9
+
 
 class RawDatabase:
     def __init__(self, database_filename):

From 91ea9e7f51ec104084b2a5071dfb202b6daaf2fb Mon Sep 17 00:00:00 2001
From: gferraro
Date: Fri, 20 Sep 2024 14:53:36 +0200
Subject: [PATCH 013/117] update python-cptv

---
 pirequirements.txt            |  2 +-
 pyproject.toml                |  2 +-
 requirements.txt              |  2 +-
 src/ml_tools/thermalwriter.py | 10 ++++++++++
 4 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/pirequirements.txt b/pirequirements.txt
index 72fde5a9..62280bdf 100644
--- a/pirequirements.txt
+++ b/pirequirements.txt
@@ -26,4 +26,4 @@ dbus-python==1.3.2
 importlib_resources==5.10.2
 opencv-python==4.8.0.76
 inotify_simple==1.3.5
-python-cptv==0.0.3
\ No newline at end of file
+python-cptv==0.0.5
\ No newline at end of file

diff --git a/pyproject.toml b/pyproject.toml
index 36570e39..c80c9a77 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -48,7 +48,7 @@ dependencies = [
     "importlib_resources==5.10.2",
     "opencv-python==4.8.0.76",
     "inotify_simple==1.3.5",
-    "python-cptv==0.0.3"
+    "python-cptv==0.0.5"
 ]
 
 [project.scripts]

diff --git a/requirements.txt b/requirements.txt
index 8458878a..af94548d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -26,4 +26,4 @@ joblib
 #requires sudo apt-get install libopencv-dev used for ir track extraction on server
 # pybgs==3.2.0.post1 this was used for ir
 inotify_simple==1.3.5
-python-cptv==0.0.3
\ No newline at end of file
+python-cptv==0.0.5
\ No newline at end of file

diff --git a/src/ml_tools/thermalwriter.py b/src/ml_tools/thermalwriter.py
index 88f96066..89c93f56 100644
--- a/src/ml_tools/thermalwriter.py
+++ b/src/ml_tools/thermalwriter.py
@@ -306,6 +306,16 @@ def get_data(clip_samples, extra_args):
                         frame.resize_with_aspect(
                             (32, 32), crop_rectangle, keep_edge=True
                         )
+                    if (
+                        np.amax(frame.thermal) > 40000
+                        or np.amin(frame.thermal) < 1000
+                    ):
+                        logging.error(
+                            "Srange values for %s max %s min %s",
+                            clip_id,
+                            np.amax(frame.thermal),
+                            np.amin(frame.thermal),
+                        )
                     frame.thermal -= temp_median
                     np.clip(frame.thermal, a_min=0, a_max=None, out=frame.thermal)

From 5fb72e986f6e666248f08818dba6edef9c2123ec Mon Sep 17 00:00:00 2001
From: gferraro
Date: Fri, 20 Sep 2024 15:26:07 +0200
Subject: [PATCH 014/117] save some files to test

---
 src/ml_tools/thermalwriter.py | 16 +++++++++++++++-
 src/ml_tools/tools.py         |  4 ++--
 2 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/src/ml_tools/thermalwriter.py b/src/ml_tools/thermalwriter.py
index 89c93f56..3d3a442b 100644
--- a/src/ml_tools/thermalwriter.py
+++ b/src/ml_tools/thermalwriter.py
@@ -23,6 +23,7 @@
 --output_file_prefix="${OUTPUT_DIR/FILE_PREFIX}" \
 --num_shards=100
 """
+import cv2
 from PIL import Image
 from pathlib import Path
 import time
@@ -175,13 +176,19 @@ def get_data(clip_samples, extra_args):
         return None
     data = []
     crop_rectangle = tools.Rectangle(2, 2, 160 - 2 * 2, 140 - 2 * 2)
+
+    out_folder = None
     if clip_samples[0].source_file.suffix == ".hdf5":
         db = TrackDatabase(clip_samples[0].source_file)
+        out_folder = "hdf5"
     else:
         db = RawDatabase(clip_samples[0].source_file)
         db.load_frames()
-        # going to redo segments to get rid of ffc segments
+        out_folder = "raw"
 
+    # going to redo segments to get rid of ffc segments
+    out_folder = Path(out_folder)
+    out_folder.mkdir(exist_ok=True)
     clip_id = clip_samples[0].clip_id
     try:
        background = db.get_clip_background()
@@ -335,6 +342,13 @@ def get_data(clip_samples, extra_args):
                     frame.filtered, min=min_diff, max=max_diff, new_max=255
                 )
 
+                cv2.imwrite(
+                    str(
+                        out_folder / f"{clip_id}-{track_id}-{frame_number}.png"
+                    ),
+                    np.uint8(frame.thermal),
+                )
+
                 if not stats[0]:
                     frame.filtered = np.zeros((frame.filtered.shape))
                 f2 = np.uint8(frame.filtered)

diff --git a/src/ml_tools/tools.py b/src/ml_tools/tools.py
index 73355ca4..519c9e2f 100644
--- a/src/ml_tools/tools.py
+++ b/src/ml_tools/tools.py
@@ -6,7 +6,6 @@
 import numpy as np
 import pickle
 import json
-import dateutil
 import datetime
 import glob
 import cv2
@@ -15,6 +14,7 @@
 from PIL import Image, ImageFont, ImageDraw
 from pathlib import Path
 from ml_tools.rectangle import Rectangle
+from dateutil import parser
 
 EPISON = 1e-5
@@ -92,7 +92,7 @@ def load_clip_metadata(filename):
     # add in some metadata stats
     meta = json.load(t)
     if meta.get("recordingDateTime"):
-        meta["recordingDateTime"] = dateutil.parser.parse(meta["recordingDateTime"])
+        meta["recordingDateTime"] = parser.parse(meta["recordingDateTime"])
     if meta.get("tracks") is None and meta.get("Tracks"):
         meta["tracks"] = meta["Tracks"]
     return meta

From 38228ac466913be57c732b264746866952f381e7 Mon Sep 17 00:00:00 2001
From: gferraro
Date: Sun, 22 Sep 2024 16:50:53 +0200
Subject: [PATCH 015/117] more debugging

---
 src/ml_tools/kerasmodel.py     | 6 ++++++
 src/ml_tools/tfdataset.py      | 3 +++
 src/ml_tools/thermaldataset.py | 1 +
 src/ml_tools/thermalwriter.py  | 2 +-
 4 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/src/ml_tools/kerasmodel.py b/src/ml_tools/kerasmodel.py
index 2313c789..9608f606 100644
--- a/src/ml_tools/kerasmodel.py
+++ b/src/ml_tools/kerasmodel.py
@@ -502,6 +502,9 @@ def train_model(
             multi_label=self.params.multi_label,
             num_frames=self.params.square_width**2,
             channels=self.params.channels,
+            deterministic=True,
+            shuffle=False,
+            epoch_size=1000,
         )
         self.remapped = remapped
         self.validate, remapped, _, _ = get_dataset(
@@ -519,6 +522,9 @@ def train_model(
             multi_label=self.params.multi_label,
             num_frames=self.params.square_width**2,
             channels=self.params.channels,
+            deterministic=True,
+            shuffle=False,
+            epoch_size=250,
             # dist=self.dataset_counts["validation"],
         )

diff --git a/src/ml_tools/tfdataset.py b/src/ml_tools/tfdataset.py
index 437ba16c..b18cf960 100644
--- a/src/ml_tools/tfdataset.py
+++ b/src/ml_tools/tfdataset.py
@@ -149,6 +149,9 @@ def get_dataset(load_function, base_dir, labels, **args):
             stop_on_empty_dataset=True,
             rerandomize_each_iteration=True,
         )
+    if args.get("epoch_size") is not None:
+        dataset = dataset.take(args.get("epoch_size"))
+        logging.info("Setting dataset to %s", args.get("epoch_size"))
     if args.get("cache", False):
         dataset = dataset.cache()
     if (

diff --git a/src/ml_tools/thermaldataset.py b/src/ml_tools/thermaldataset.py
index 55686c03..04a6be4c 100644
--- a/src/ml_tools/thermaldataset.py
+++ b/src/ml_tools/thermaldataset.py
@@ -42,6 +42,7 @@ def get_excluded():
         "fox",
         "cow",
         "wombat",
+        "chicken",
     ]

diff --git a/src/ml_tools/thermalwriter.py b/src/ml_tools/thermalwriter.py
index 3d3a442b..890cb461 100644
--- a/src/ml_tools/thermalwriter.py
+++ b/src/ml_tools/thermalwriter.py
@@ -346,7 +346,7 @@ def get_data(clip_samples, extra_args):
                 cv2.imwrite(
                     str(
                         out_folder / f"{clip_id}-{track_id}-{frame_number}.png"
                     ),
-                    np.uint8(frame.thermal),
+                    np.uint8(frame.filtered),
                 )
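Patches 013-015 funnel every frame through imageprocessing.normalize(..., new_max=255) with a min/max computed across the track. A paraphrase of that linear rescale (a sketch of the call sites' expectations, not the library's actual body):

    import numpy as np

    def normalize_sketch(data, v_min, v_max, new_max=255):
        # Rescale [v_min, v_max] onto [0, new_max]; a degenerate range
        # yields zeros, matching the stats flag the callers check afterwards.
        if v_max == v_min:
            return np.zeros(data.shape), False
        out = (data.astype(np.float32) - v_min) / (v_max - v_min) * new_max
        return out, True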
From f90c6ed786ce44e2964bd47ee8ee8562095468a0 Mon Sep 17 00:00:00 2001
From: gferraro
Date: Sun, 22 Sep 2024 17:27:22 +0200
Subject: [PATCH 016/117] more debugging

---
 src/ml_tools/kerasmodel.py     | 22 ++++++++++++++++++++--
 src/ml_tools/thermaldataset.py | 26 ++++++++++++++++++++++----
 2 files changed, 42 insertions(+), 6 deletions(-)

diff --git a/src/ml_tools/kerasmodel.py b/src/ml_tools/kerasmodel.py
index 9608f606..28965b3d 100644
--- a/src/ml_tools/kerasmodel.py
+++ b/src/ml_tools/kerasmodel.py
@@ -503,8 +503,8 @@ def train_model(
             num_frames=self.params.square_width**2,
             channels=self.params.channels,
             deterministic=True,
-            shuffle=False,
             epoch_size=1000,
+            include_Track=True,
         )
         self.remapped = remapped
         self.validate, remapped, _, _ = get_dataset(
@@ -523,10 +523,28 @@ def train_model(
             num_frames=self.params.square_width**2,
             channels=self.params.channels,
             deterministic=True,
-            shuffle=False,
             epoch_size=250,
+            include_track=True,
             # dist=self.dataset_counts["validation"],
         )
+        logging.info("Saving datasets")
+        save_dir = Path("./train-images")
+        save_dir.mkdir(exist_ok=True)
+        batch_i = 0
+        for x, y in self.train:
+            thermaldataset.show_batch(
+                x, y, self.labels, save=save_dir / f"{batch_i}.jpg", tracks=True
+            )
+            batch_i += 1
+
+        save_dir = Path("./val-images")
+        save_dir.mkdir(exist_ok=True)
+        batch_i = 0
+        for x, y in self.validate:
+            thermaldataset.show_batch(
+                x, y, self.labels, save=save_dir / f"{batch_i}.jpg"
+            )
+            batch_i += 1
 
         if weights is not None:
             self.model.load_weights(weights)

diff --git a/src/ml_tools/thermaldataset.py b/src/ml_tools/thermaldataset.py
index 04a6be4c..51a39258 100644
--- a/src/ml_tools/thermaldataset.py
+++ b/src/ml_tools/thermaldataset.py
@@ -38,11 +38,23 @@ def get_excluded():
         "mammal",
         "frog",
         "grey kangaroo",
-        "sambar deer" "chicken",
+        "sambar deer",
+        "chicken",
         "fox",
         "cow",
         "wombat",
         "chicken",
+        "dog",
+        "sheep" "cat",
+        "duck",
+        "pheasant",
+        "pukeko",
+        "brown quail",
+        "black swan",
+        "quail",
+        "california quail",
+        "sheep",
+        "echidna",
     ]
@@ -361,10 +369,13 @@ def main():
-def show_batch(image_batch, label_batch, labels, save=None):
+def show_batch(image_batch, label_batch, labels, save=None, tracks=False):
     plt.figure(figsize=(10, 10))
     print("images in batch", len(image_batch), len(label_batch))
     num_images = min(len(image_batch), 25)
+    if tracks:
+        track_batch = label_batch[1]
+        label_batch = label_batch[0]
     for n in range(num_images):
         ax = plt.subplot(5, 5, n + 1)
         img = np.uint8(image_batch[n])
@@ -377,11 +388,14 @@ def show_batch(image_batch, label_batch, labels, save=None, tracks=False):
         plt.imshow(img)
-        plt.title("C-" + str(image_batch[n]))
-        plt.title(labels[np.argmax(label_batch[n])])
+        if tracks:
+            plt.title(f"{labels[np.argmax(label_batch[n])]}-{track_batch[n]}")
+        else:
+            plt.title(labels[np.argmax(label_batch[n])])
+
         plt.axis("off")
     # return
     if save:
         plt.savefig(save)

From 127940de3931c93ee68c7051460af722c2b9dd38 Mon Sep 17 00:00:00 2001
From: gferraro
Date: Sun, 22 Sep 2024 17:28:10 +0200
Subject: [PATCH 017/117] double chicken

---
 src/ml_tools/thermaldataset.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/ml_tools/thermaldataset.py b/src/ml_tools/thermaldataset.py
index 51a39258..b321754c 100644
--- a/src/ml_tools/thermaldataset.py
+++ b/src/ml_tools/thermaldataset.py
@@ -43,7 +43,6 @@ def get_excluded():
         "fox",
         "cow",
         "wombat",
-        "chicken",
         "dog",
         "sheep" "cat",
         "duck",

From 31f2e866ce7cbcc89ac12bf468d45f302d7db510 Mon Sep 17 00:00:00 2001
From: gferraro
Date: Mon, 23 Sep 2024 14:42:27 +0200
Subject: [PATCH 018/117] add missing station id

---
 src/ml_tools/rawdb.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/ml_tools/rawdb.py b/src/ml_tools/rawdb.py
index d1d2d681..a997a584 100644
--- a/src/ml_tools/rawdb.py
+++ b/src/ml_tools/rawdb.py
@@ -196,6 +196,7 @@ def get_clip_tracks(self, tag_precedence):
                 human_tags=human_tags,
                 source_file=self.file,
                 mega_missed_regions=track_meta.get("mega_missed_regions"),
+                station_id=clip_header.station_id,
                 # frame_temp_median=frame_temp_median,
             )
             clip_header.tracks.append(header)

From 2d31161e12d0773114465411a7cacb384ef26a0d Mon Sep 17 00:00:00 2001
From: gferraro
Date: Mon, 23 Sep 2024 14:44:22 +0200
Subject: [PATCH 019/117] load small

---
 src/ml_tools/dataset.py       |  2 ++
 src/ml_tools/thermalwriter.py | 12 ++++++------
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/src/ml_tools/dataset.py b/src/ml_tools/dataset.py
index b7690999..e9b75ae2 100644
--- a/src/ml_tools/dataset.py
+++ b/src/ml_tools/dataset.py
@@ -192,6 +192,8 @@ def load_clips(
             counter += 1
             if counter % 50 == 0:
                 logging.debug("Dataset loaded %s", counter)
+            if counter == 500:
+                break
         return [counter, counter]
 
     def load_clip(self, db_clip, dont_filter_segment=False):

diff --git a/src/ml_tools/thermalwriter.py b/src/ml_tools/thermalwriter.py
index 890cb461..7dfbae0e 100644
--- a/src/ml_tools/thermalwriter.py
+++ b/src/ml_tools/thermalwriter.py
@@ -342,12 +342,12 @@ def get_data(clip_samples, extra_args):
                     frame.filtered, min=min_diff, max=max_diff, new_max=255
                 )
 
-                cv2.imwrite(
-                    str(
-                        out_folder / f"{clip_id}-{track_id}-{frame_number}.png"
-                    ),
-                    np.uint8(frame.filtered),
-                )
+                # cv2.imwrite(
+                #     str(
+                #         out_folder / f"{clip_id}-{track_id}-{frame_number}.png"
+                #     ),
+                #     np.uint8(frame.filtered),
+                # )
 
                 if not stats[0]:
                     frame.filtered = np.zeros((frame.filtered.shape))

From 138bdfbce741a3e081c17fa9057b237de321cc61 Mon Sep 17 00:00:00 2001
From: gferraro
Date: Mon, 23 Sep 2024 14:47:41 +0200
Subject: [PATCH 020/117] add check

---
 src/ml_tools/rawdb.py     | 1 -
 src/ml_tools/tfdataset.py | 2 +-
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/ml_tools/rawdb.py b/src/ml_tools/rawdb.py
index a997a584..29921198 100644
--- a/src/ml_tools/rawdb.py
+++ b/src/ml_tools/rawdb.py
@@ -77,7 +77,6 @@ def load_frames(self):
                 continue
             ffc = is_affected_by_ffc(frame)
             if ffc:
-                print("GOT FFC")
                 ffc_frames.append(frame_i)
             cptv_frames.append(frame.pix)
             frame_i += 1

diff --git a/src/ml_tools/tfdataset.py b/src/ml_tools/tfdataset.py
index b18cf960..a77b91c3 100644
--- a/src/ml_tools/tfdataset.py
+++ b/src/ml_tools/tfdataset.py
@@ -75,7 +75,7 @@ def get_dataset(load_function, base_dir, labels, **args):
         if excluded in labels:
             new_labels.remove(excluded)
     for remapped_lbl in to_remap.keys():
-        if remapped_lbl in labels:
+        if remapped_lbl in new_labels:
             new_labels.remove(remapped_lbl)
     for l in labels:
         keys.append(labels.index(l))

From 3dfefc3251d88035f6d732040d67a35ecb8e678e Mon Sep 17 00:00:00 2001
From: gferraro
Date: Mon, 23 Sep 2024 14:56:57 +0200
Subject: [PATCH 021/117] use model lbls

---
 src/modelevaluate.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/modelevaluate.py b/src/modelevaluate.py
index 397b2446..7676a2f4 100644
--- a/src/modelevaluate.py
+++ b/src/modelevaluate.py
@@ -481,9 +481,11 @@ def main():
             threshold=args.threshold,
         )
     elif args.dataset:
+        model_labels = model.labels.copy()
         model.load_training_meta(base_dir)
-        if model.params.multi_label:
-            model.labels.append("land-bird")
+        model.labels = model_labels
+        # if model.params.multi_label:
+        #     model.labels.append("land-bird")
         excluded, remapped = get_excluded(model.data_type)
         files = base_dir / args.dataset
         dataset, _, new_labels, _ = get_dataset(
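Patches 016-021 keep adjusting which labels are excluded or remapped before training and evaluation. The net effect of a get_excluded()/get_remapped() pair on a label list, sketched with hypothetical labels:

    excluded = {"bat", "mammal", "frog"}
    remapped = {"rain": "false-positive", "echidna": "hedgehog"}

    labels = ["bird", "rain", "bat", "echidna", "possum"]
    kept = [remapped.get(l, l) for l in labels if l not in excluded]
    # -> ["bird", "false-positive", "hedgehog", "possum"]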
From cdc6ef637f631367d5b08d1675cfd97280f64b61 Mon Sep 17 00:00:00 2001
From: gferraro
Date: Mon, 23 Sep 2024 15:12:13 +0200
Subject: [PATCH 022/117] remap labels

---
 src/ml_tools/tfdataset.py | 73 ++++++++++++++++++++++++++-------------
 src/modelevaluate.py      |  7 ++--
 2 files changed, 53 insertions(+), 27 deletions(-)

diff --git a/src/ml_tools/tfdataset.py b/src/ml_tools/tfdataset.py
index a77b91c3..f710799d 100644
--- a/src/ml_tools/tfdataset.py
+++ b/src/ml_tools/tfdataset.py
@@ -61,36 +61,61 @@ def get_dataset(load_function, base_dir, labels, **args):
 
 def get_dataset(load_function, base_dir, labels, **args):
+    model_labels = args.get("model_labels")
     excluded_labels = args.get("excluded_labels", [])
     to_remap = args.get("remapped_labels", {})
-    logging.info("Excluding %s", excluded_labels)
     remapped = {}
     keys = []
     values = []
-    # excluded_labels.append("insect")
-    # excluded_labels.append("cat")
-    new_labels = labels.copy()
-    for excluded in excluded_labels:
-        if excluded in labels:
-            new_labels.remove(excluded)
-    for remapped_lbl in to_remap.keys():
-        if remapped_lbl in new_labels:
-            new_labels.remove(remapped_lbl)
-    for l in labels:
-        keys.append(labels.index(l))
-        if l not in new_labels:
-            remapped[l] = -1
-            values.append(-1)
-            logging.info("Excluding %s", l)
-        else:
-            remapped[l] = [l]
-            values.append(new_labels.index(l))
-    for k, v in to_remap.items():
-        if k in labels and v in labels:
-            remapped[v].append(k)
-            values[labels.index(k)] = new_labels.index(v)
-            del remapped[k]
+    if model_labels is not None:
+        logging.info("Mapping DS labels to model labels ")
+        # if we are loading a model with different labels we need to map the dataset labels
+        # to the equivalent model labels
+        for l_i, og_lbl in enumerate(labels):
+            keys.append(l_i)
+            try:
+                lbl = og_lbl
+                if lbl in to_remap:
+                    lbl = to_remap[lbl]
+                    l_i = labels.index(lbl)
+
+                mdl_i = model_labels.index(lbl)
+                if lbl not in remapped:
+                    remapped[lbl] = []
+                remapped[lbl].append(og_lbl)
+                values.append(mdl_i)
+            except:
+                remapped[og_lbl] = -1
+                values.append(-1)
+
+    else:
+
+        logging.info("Excluding %s", excluded_labels)
+
+        # excluded_labels.append("insect")
+        # excluded_labels.append("cat")
+        new_labels = labels.copy()
+        for excluded in excluded_labels:
+            if excluded in labels:
+                new_labels.remove(excluded)
+        for remapped_lbl in to_remap.keys():
+            if remapped_lbl in new_labels:
+                new_labels.remove(remapped_lbl)
+        for l in labels:
+            keys.append(labels.index(l))
+            if l not in new_labels:
+                remapped[l] = -1
+                values.append(-1)
+                logging.info("Excluding %s", l)
+            else:
+                remapped[l] = [l]
+                values.append(new_labels.index(l))
+        for k, v in to_remap.items():
+            if k in labels and v in labels:
+                remapped[v].append(k)
+                values[labels.index(k)] = new_labels.index(v)
+                del remapped[k]
     remap_lookup = tf.lookup.StaticHashTable(
         initializer=tf.lookup.KeyValueTensorInitializer(
             keys=tf.constant(keys),

diff --git a/src/modelevaluate.py b/src/modelevaluate.py
index 7676a2f4..009451c8 100644
--- a/src/modelevaluate.py
+++ b/src/modelevaluate.py
@@ -483,15 +483,16 @@ def main():
     elif args.dataset:
         model_labels = model.labels.copy()
         model.load_training_meta(base_dir)
-        model.labels = model_labels
-        # if model.params.multi_label:
-        #     model.labels.append("land-bird")
+        # model.labels = model_labels
+        if model.params.multi_label:
+            model.labels.append("land-bird")
         excluded, remapped = get_excluded(model.data_type)
         files = base_dir / args.dataset
         dataset, _, new_labels, _ = get_dataset(
             files,
             model.data_type,
             model.labels,
+            model_labels=model_labels,
             batch_size=64,
             image_size=model.params.output_dim[:2],
             preprocess_fn=model.preprocess_fn,

From deb1a2b0677576b9633d886a8c55e3eab83c34c9 Mon Sep 17 00:00:00 2001
From: gferraro
Date: Mon, 23 Sep 2024 15:14:32 +0200
Subject: [PATCH 023/117] fix new

---
 src/ml_tools/tfdataset.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/ml_tools/tfdataset.py b/src/ml_tools/tfdataset.py
index f710799d..6f6ffb84 100644
--- a/src/ml_tools/tfdataset.py
+++ b/src/ml_tools/tfdataset.py
@@ -69,6 +69,8 @@ def get_dataset(load_function, base_dir, labels, **args):
     keys = []
     values = []
     if model_labels is not None:
+        new_labels = model_labels
+
         logging.info("Mapping DS labels to model labels ")
         # if we are loading a model with different labels we need to map the dataset labels
         # to the equivalent model labels

From 6731c5bb0f9da3fb3e0008bf2d50e9ebb40a7135 Mon Sep 17 00:00:00 2001
From: gferraro
Date: Mon, 23 Sep 2024 15:24:22 +0200
Subject: [PATCH 024/117] no need to show

---
 src/ml_tools/kerasmodel.py | 56 +++++++++++++++++++-------------------
 1 file changed, 28 insertions(+), 28 deletions(-)

diff --git a/src/ml_tools/kerasmodel.py b/src/ml_tools/kerasmodel.py
index 28965b3d..3ee4779e 100644
--- a/src/ml_tools/kerasmodel.py
+++ b/src/ml_tools/kerasmodel.py
@@ -527,34 +527,34 @@ def train_model(
             include_track=True,
             # dist=self.dataset_counts["validation"],
         )
-        logging.info("Saving datasets")
-        save_dir = Path("./train-images")
-        save_dir.mkdir(exist_ok=True)
-        batch_i = 0
-        for x, y in self.train:
-            thermaldataset.show_batch(
-                x, y, self.labels, save=save_dir / f"{batch_i}.jpg", tracks=True
-            )
-            batch_i += 1
-
-        save_dir = Path("./val-images")
-        save_dir.mkdir(exist_ok=True)
-        batch_i = 0
-        for x, y in self.validate:
-            thermaldataset.show_batch(
-                x, y, self.labels, save=save_dir / f"{batch_i}.jpg"
-            )
-            batch_i += 1
-
-        if weights is not None:
-            self.model.load_weights(weights)
-        if rebalance:
-            self.class_weights = get_weighting(self.train, self.labels)
-            logging.info(
-                "Training on %s with class weights %s",
-                self.labels,
-                self.class_weights,
-            )
+        # logging.info("Saving datasets")
+        # save_dir = Path("./train-images")
+        # save_dir.mkdir(exist_ok=True)
+        # batch_i = 0
+        # for x, y in self.train:
+        #     thermaldataset.show_batch(
+        #         x, y, self.labels, save=save_dir / f"{batch_i}.jpg", tracks=True
+        #     )
+        #     batch_i += 1
+
+        # save_dir = Path("./val-images")
+        # save_dir.mkdir(exist_ok=True)
+        # batch_i = 0
+        # for x, y in self.validate:
+        #     thermaldataset.show_batch(
+        #         x, y, self.labels, save=save_dir / f"{batch_i}.jpg"
+        #     )
+        #     batch_i += 1
+
+        # if weights is not None:
+        #     self.model.load_weights(weights)
+        # if rebalance:
+        #     self.class_weights = get_weighting(self.train, self.labels)
+        #     logging.info(
+        #         "Training on %s with class weights %s",
+        #         self.labels,
+        #         self.class_weights,
+        #     )
 
         self.save_metadata(run_name)
         self.save(run_name)
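The remap machinery added in patch 022 boils down to a single integer lookup table from dataset label index to model label index, with -1 marking excluded labels. A runnable sketch of just that table, using the same tf.lookup calls as the diff (the example labels are hypothetical):

    import tensorflow as tf

    keys = tf.constant([0, 1, 2])     # e.g. dataset labels ["bird", "rain", "bat"]
    values = tf.constant([0, 1, -1])  # "bat" is excluded, so it maps to -1
    remap_lookup = tf.lookup.StaticHashTable(
        initializer=tf.lookup.KeyValueTensorInitializer(keys=keys, values=values),
        default_value=-1,
    )
    print(remap_lookup.lookup(tf.constant([2, 0])))  # -> [-1  0]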
"val_loss.weights.h5" checkpoint_loss = tf.keras.callbacks.ModelCheckpoint( val_loss, @@ -616,7 +616,7 @@ def checkpoints(self, run_name): save_weights_only=True, mode="auto", ) - val_acc = self.checkpoint_folder / run_name / "val_acc" + val_acc = self.checkpoint_folder / run_name / "val_acc.weights.h5" checkpoint_acc = tf.keras.callbacks.ModelCheckpoint( val_acc, @@ -631,7 +631,7 @@ def checkpoints(self, run_name): mode="max", ) - val_precision = self.checkpoint_folder / run_name / "val_recall" + val_precision = self.checkpoint_folder / run_name / "val_recall.weights.h5" checkpoint_recall = tf.keras.callbacks.ModelCheckpoint( val_precision, From 28808014b9d046d47c8a77bb046b933d3e0bd8a9 Mon Sep 17 00:00:00 2001 From: gferraro Date: Mon, 23 Sep 2024 15:36:15 +0200 Subject: [PATCH 026/117] add mode --- src/ml_tools/kerasmodel.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/ml_tools/kerasmodel.py b/src/ml_tools/kerasmodel.py index a903f92a..00ea6a47 100644 --- a/src/ml_tools/kerasmodel.py +++ b/src/ml_tools/kerasmodel.py @@ -503,8 +503,8 @@ def train_model( num_frames=self.params.square_width**2, channels=self.params.channels, deterministic=True, - epoch_size=1000, - include_Track=True, + # epoch_size=1000, + # include_Track=True, ) self.remapped = remapped self.validate, remapped, _, _ = get_dataset( @@ -523,8 +523,8 @@ def train_model( num_frames=self.params.square_width**2, channels=self.params.channels, deterministic=True, - epoch_size=250, - include_track=True, + # epoch_size=250, + # in2clude_track=True, # dist=self.dataset_counts["validation"], ) # logging.info("Saving datasets") @@ -672,6 +672,7 @@ def checkpoints(self, run_name): if self.params.multi_label else "val_categorical_accuracy" ), + mode = "max" verbose=1, ) return [ From 2657382323ba32204c0026cd043b7d4190103e0c Mon Sep 17 00:00:00 2001 From: gferraro Date: Mon, 23 Sep 2024 15:40:51 +0200 Subject: [PATCH 027/117] add more debug --- src/ml_tools/kerasmodel.py | 1 + src/ml_tools/tfdataset.py | 1 + src/ml_tools/thermaldataset.py | 9 +++++++-- 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/src/ml_tools/kerasmodel.py b/src/ml_tools/kerasmodel.py index 00ea6a47..0c14fd2f 100644 --- a/src/ml_tools/kerasmodel.py +++ b/src/ml_tools/kerasmodel.py @@ -648,6 +648,7 @@ def checkpoints(self, run_name): if self.params.multi_label else "val_categorical_accuracy" ), + mode = "max" ) # havent found much use in this just takes training time # file_writer_cm = tf.summary.create_file_writer( diff --git a/src/ml_tools/tfdataset.py b/src/ml_tools/tfdataset.py index 6f6ffb84..d4790df4 100644 --- a/src/ml_tools/tfdataset.py +++ b/src/ml_tools/tfdataset.py @@ -206,6 +206,7 @@ def get_dataset(load_function, base_dir, labels, **args): logging.info("Setting dataset size to %s", epoch_size) if not args.get("only_features", False): dataset = dataset.repeat(2) + dataset = dataset.take(epoch_size) scale_epoch = args.get("scale_epoch", None) if scale_epoch: epoch_size = epoch_size // scale_epoch diff --git a/src/ml_tools/thermaldataset.py b/src/ml_tools/thermaldataset.py index b321754c..a73438c9 100644 --- a/src/ml_tools/thermaldataset.py +++ b/src/ml_tools/thermaldataset.py @@ -54,6 +54,11 @@ def get_excluded(): "california quail", "sheep", "echidna", + "mouse", + "rodent", + "possum", + "cat", + "dog", ] @@ -63,8 +68,8 @@ def get_remapped(multi_label=False): "echidna": "hedgehog", # "grey kangaroo": "wallaby", # "sambar deer": "deer", - "mouse": "rodent", - "rat": "rodent", + # "mouse": "rodent", + # "rat": 
"rodent", "rain": "false-positive", "water": "false-positive", "insect": "false-positive", From a1a37d70f7c043ad8e3394bb1a470623eac14f4b Mon Sep 17 00:00:00 2001 From: gferraro Date: Mon, 23 Sep 2024 15:44:07 +0200 Subject: [PATCH 028/117] exclude most --- src/ml_tools/thermaldataset.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/ml_tools/thermaldataset.py b/src/ml_tools/thermaldataset.py index a73438c9..1dd0e7ee 100644 --- a/src/ml_tools/thermaldataset.py +++ b/src/ml_tools/thermaldataset.py @@ -59,6 +59,11 @@ def get_excluded(): "possum", "cat", "dog", + "hedgehog", + "kiwi", + "leporidae", + "mustelid", + "wallaby", ] From 61338c77b61deab27fe34b73fa360df1eb1f4142 Mon Sep 17 00:00:00 2001 From: gferraro Date: Mon, 23 Sep 2024 15:44:39 +0200 Subject: [PATCH 029/117] comma --- src/ml_tools/kerasmodel.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ml_tools/kerasmodel.py b/src/ml_tools/kerasmodel.py index 0c14fd2f..42f79714 100644 --- a/src/ml_tools/kerasmodel.py +++ b/src/ml_tools/kerasmodel.py @@ -648,7 +648,7 @@ def checkpoints(self, run_name): if self.params.multi_label else "val_categorical_accuracy" ), - mode = "max" + mode="max", ) # havent found much use in this just takes training time # file_writer_cm = tf.summary.create_file_writer( @@ -673,7 +673,7 @@ def checkpoints(self, run_name): if self.params.multi_label else "val_categorical_accuracy" ), - mode = "max" + mode="max", verbose=1, ) return [ From 513fcf336fb1abd01fccf1324d30f119cf3a626b Mon Sep 17 00:00:00 2001 From: gferraro Date: Mon, 23 Sep 2024 15:45:55 +0200 Subject: [PATCH 030/117] remaining 2 --- src/ml_tools/thermaldataset.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/ml_tools/thermaldataset.py b/src/ml_tools/thermaldataset.py index 1dd0e7ee..f84e5752 100644 --- a/src/ml_tools/thermaldataset.py +++ b/src/ml_tools/thermaldataset.py @@ -64,6 +64,8 @@ def get_excluded(): "leporidae", "mustelid", "wallaby", + "human", + "vehicle", ] From 8ff17710e932ce1db8958f3e6718a52c6e1222c7 Mon Sep 17 00:00:00 2001 From: gferraro Date: Mon, 23 Sep 2024 16:02:54 +0200 Subject: [PATCH 031/117] dont save strange values --- src/ml_tools/thermalwriter.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/ml_tools/thermalwriter.py b/src/ml_tools/thermalwriter.py index 7dfbae0e..ee9b67da 100644 --- a/src/ml_tools/thermalwriter.py +++ b/src/ml_tools/thermalwriter.py @@ -318,11 +318,20 @@ def get_data(clip_samples, extra_args): or np.amin(frame.thermal) < 1000 ): logging.error( - "Srange values for %s max %s min %s", + "Strange values for %s max %s min %s", clip_id, np.amax(frame.thermal), np.amin(frame.thermal), ) + raise Exception( + f"Strange values for {clip_id} - {track_id} #{frame_number}" + ) + # cv2.imwrite( + # str( + # out_folder / f"{clip_id}-{track_id}-{frame_number}.png" + # ), + # np.uint8(frame.filtered), + # ) frame.thermal -= temp_median np.clip(frame.thermal, a_min=0, a_max=None, out=frame.thermal) From ae908717428afebf06b3be356e185a08453eda7c Mon Sep 17 00:00:00 2001 From: gferraro Date: Mon, 23 Sep 2024 16:27:47 +0200 Subject: [PATCH 032/117] weighting --- src/ml_tools/kerasmodel.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/ml_tools/kerasmodel.py b/src/ml_tools/kerasmodel.py index 42f79714..cd0d5469 100644 --- a/src/ml_tools/kerasmodel.py +++ b/src/ml_tools/kerasmodel.py @@ -548,13 +548,13 @@ def train_model( # if weights is not None: # self.model.load_weights(weights) - # if rebalance: - # 
From ae908717428afebf06b3be356e185a08453eda7c Mon Sep 17 00:00:00 2001
From: gferraro
Date: Mon, 23 Sep 2024 16:27:47 +0200
Subject: [PATCH 032/117] weighting

---
 src/ml_tools/kerasmodel.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/ml_tools/kerasmodel.py b/src/ml_tools/kerasmodel.py
index 42f79714..cd0d5469 100644
--- a/src/ml_tools/kerasmodel.py
+++ b/src/ml_tools/kerasmodel.py
@@ -548,13 +548,13 @@ def train_model(
 
         # if weights is not None:
         #     self.model.load_weights(weights)
-        # if rebalance:
-        #     self.class_weights = get_weighting(self.train, self.labels)
-        #     logging.info(
-        #         "Training on %s with class weights %s",
-        #         self.labels,
-        #         self.class_weights,
-        #     )
+        if rebalance:
+            self.class_weights = get_weighting(self.train, self.labels)
+        logging.info(
+                "Training on %s with class weights %s",
+                self.labels,
+                self.class_weights,
+            )
 
         self.save_metadata(run_name)
         self.save(run_name)

From b66fe315aacb39fb39668505d0d157fbf3bc8b3c Mon Sep 17 00:00:00 2001
From: gferraro
Date: Tue, 24 Sep 2024 16:22:26 +0200
Subject: [PATCH 033/117] debugging

---
 src/ml_tools/datasetstructures.py | 10 +++++
 src/ml_tools/imageprocessing.py   | 32 +++++++++++++++-----
 src/ml_tools/kerasmodel.py        |  2 +-
 src/ml_tools/thermaldataset.py    | 11 ------
 src/ml_tools/thermalwriter.py     | 66 ++++++++++++++++++++-----------
 5 files changed, 78 insertions(+), 43 deletions(-)

diff --git a/src/ml_tools/datasetstructures.py b/src/ml_tools/datasetstructures.py
index c31c3159..81f1e64c 100644
--- a/src/ml_tools/datasetstructures.py
+++ b/src/ml_tools/datasetstructures.py
@@ -941,6 +941,13 @@ def get_segments(
     dont_filter=False,
     skip_ffc=True,
 ):
+    logging.info(
+        "Getting segments %s min mass %s max %s ffc %s",
+        segment_type,
+        segment_min_mass,
+        max_segments,
+        ffc_frames,
+    )
     if segment_type == SegmentType.ALL_RANDOM_NOMIN:
         segment_min_mass = None
     if min_frames is None:
@@ -950,6 +957,7 @@ def get_segments(
 
     filtered_stats = {"segment_mass": 0, "too short": 0}
     has_no_mass = np.sum(mass_history) == 0
+    before = len(regions)
     frame_indices = [
         region.frame_number
         for region in regions
@@ -963,6 +971,7 @@ def get_segments(
         and region.width > 0
         and region.height > 0
     ]
+    logging.info("Frames are %s / %s", len(frame_indices), before)
     if len(frame_indices) == 0:
         logging.warn("Nothing to load for %s - %s", clip_id, track_id)
         return [], filtered_stats
@@ -1065,6 +1074,7 @@ def get_segments(
         segment_mass = np.sum(mass_slice)
         segment_avg_mass = segment_mass / len(mass_slice)
         filtered = False
+        logging.info("avg mass is %s mass slice %s %s", segment_avg_mass, mass_slice)
         if segment_min_mass and segment_avg_mass < segment_min_mass:
             if dont_filter:
                 filtered = True

diff --git a/src/ml_tools/imageprocessing.py b/src/ml_tools/imageprocessing.py
index 6b2e4fbf..d9e9f738 100644
--- a/src/ml_tools/imageprocessing.py
+++ b/src/ml_tools/imageprocessing.py
@@ -5,6 +5,7 @@
 from PIL import Image
 from scipy import ndimage
 from PIL import Image
+import logging
 
 
 def resize_and_pad(
@@ -19,10 +20,22 @@ def resize_and_pad(
     extra_v=0,
 ):
     scale_percent = (new_dim[:2] / np.array(frame.shape[:2])).min()
-    width = int(frame.shape[1] * scale_percent)
-    height = int(frame.shape[0] * scale_percent)
+    width = round(frame.shape[1] * scale_percent)
+    height = round(frame.shape[0] * scale_percent)
     width = max(width, 1)
     height = max(height, 1)
+
+    width = min(width, new_dim[0])
+    height = min(height, new_dim[1])
+    logging.info(
+        "Resizing image with dim %s into dim %s height %s and width %s keep edge %s region %s",
+        frame.shape,
+        new_dim,
+        height,
+        width,
+        keep_edge,
+        region,
+    )
     if len(frame.shape) == 3:
         resize_dim = (width, height, frame.shape[2])
     else:
@@ -40,17 +53,20 @@ def resize_and_pad(
     offset_x = (new_dim[1] - frame_width) // 2
     offset_y = (new_dim[0] - frame_height) // 2
     if keep_edge and crop_region is not None:
-        if region.left == crop_region.left:
+        logging.info("Checking region %s against crop %s", region, crop_region)
+        if region.left <= crop_region.left:
             offset_x = 0
-
-        elif region.right == crop_region.right:
+            logging.info("On left offset so setting 0 %s", region)
+        elif region.right >= crop_region.right:
             offset_x = new_dim[1] - frame_width
+            logging.info("On right offset so setting 0 %s", region)
 
-        if region.top == crop_region.top:
+        if region.top <= crop_region.top:
             offset_y = 0
-        elif region.bottom == crop_region.bottom:
+        elif region.bottom >= crop_region.bottom:
             offset_y = new_dim[0] - frame_height
+    logging.info("Offsets are %s %s", offset_x, offset_y)
     if len(resized.shape) == 3:
         resized[
             offset_y : offset_y + frame_height, offset_x : offset_x + frame_width, :
@@ -159,7 +175,7 @@ def normalize(data, min=None, max=None, new_max=1):
         max = np.amax(data)
     if min is None:
         min = np.amin(data)
-    # print("normalizing with", max, min, new_max)
+    print("normalizing with", max, "MIN:", min)
     if max == min:
         if max == 0:
             return np.zeros((data.shape)), (False, max, min)

diff --git a/src/ml_tools/kerasmodel.py b/src/ml_tools/kerasmodel.py
index cd0d5469..32f1f9c7 100644
--- a/src/ml_tools/kerasmodel.py
+++ b/src/ml_tools/kerasmodel.py
@@ -550,7 +550,7 @@ def train_model(
         if rebalance:
             self.class_weights = get_weighting(self.train, self.labels)
-        logging.info(
+            logging.info(
                 "Training on %s with class weights %s",
                 self.labels,
                 self.class_weights,

diff --git a/src/ml_tools/thermaldataset.py b/src/ml_tools/thermaldataset.py
index f84e5752..079ad772 100644
--- a/src/ml_tools/thermaldataset.py
+++ b/src/ml_tools/thermaldataset.py
@@ -307,17 +307,6 @@ def read_tfrecord(
     return rgb_image
 
 
-def decode_image(thermals, filtereds, image_size):
-    deoced_thermals = []
-    decoded_filtered = []
-    for thermal, filtered in zip(thermals, filtereds):
-        image = tf.image.decode_png(image, channels=1)
-        filtered = tf.image.decode_png(filtered, channels=1)
-        decoded_thermal.append(image)
-        decoded_filtered.append(filtered)
-    return decoded_thermal, decoded_filtered
-
-
 def tile_images(images):
     index = 0
     image = None

diff --git a/src/ml_tools/thermalwriter.py b/src/ml_tools/thermalwriter.py
index ee9b67da..bf87e30c 100644
--- a/src/ml_tools/thermalwriter.py
+++ b/src/ml_tools/thermalwriter.py
@@ -277,10 +277,12 @@ def get_data(clip_samples, extra_args):
                 normalize=True,
                 cropped=True,
             )
-
+            return None
             by_frame_number = {}
+            thermal_max_diff = 0
+            thermal_min_diff = None
             max_diff = 0
-            min_diff = 0
+            min_diff = None
             for f in track_frames:
                 if f.region.blank or f.region.width <= 0 or f.region.height <= 0:
                     continue
@@ -290,11 +292,27 @@ def get_data(clip_samples, extra_args):
                 diff_frame = f.thermal - f.region.subimage(background)
                 new_max = np.amax(diff_frame)
                 new_min = np.amin(diff_frame)
-                if new_min < min_diff:
+                if min_diff is None or new_min < min_diff:
                     min_diff = new_min
+                    # min_diff = max(0, new_min)
                 if new_max > max_diff:
                     max_diff = new_max
+                diff_frame = f.thermal - frame_temp_median[f.frame_number]
+                new_max = np.amax(diff_frame)
+                new_min = np.amin(diff_frame)
+                if thermal_min_diff is None or new_min < thermal_min_diff:
+                    thermal_min_diff = new_min
+                    # min_diff = max(0, new_min)
+                if new_max > thermal_max_diff:
+                    thermal_max_diff = new_max
+            logging.info(
+                "Min diff %s max diff %s thermal %s - %s",
+                min_diff,
+                max_diff,
+                thermal_min_diff,
+                thermal_max_diff,
+            )
             # normalize by maximum difference between background and tracked region
             # probably only need to use difference on the frames used for this record
             # also min_diff maybe could just be set to 0 and clip values below 0,
@@ -326,37 +344,39 @@ def get_data(clip_samples, extra_args):
                         raise Exception(
                             f"Strange values for {clip_id} - {track_id} #{frame_number}"
                         )
+                    logging.info(
+                        "Median is %s median in thermal is %s",
+                        temp_median,
+                        np.median(frame.thermal),
+                    )
 
                     frame.thermal -= temp_median
-                    np.clip(frame.thermal, a_min=0, a_max=None, out=frame.thermal)
+                    # np.clip(frame.thermal, a_min=0, a_max=None, out=frame.thermal)
 
                     frame.thermal, stats = imageprocessing.normalize(
-                        frame.thermal, new_max=255
+                        frame.thermal,
+                        min=thermal_min_diff,
+                        max=thermal_max_diff,
+                        new_max=255,
                     )
                     if not stats[0]:
                         frame.thermal = np.zeros((frame.thermal.shape))
-                    # continue
-                    # f2 = frame.filtered.copy()
-                    # frame.filtered, stats = imageprocessing.normalize(
-                    #     frame.filtered, new_max=255
-                    # )
-                    # np.clip(frame.filtered, a_min=min_diff, a_max=None, out=frame.filtered)
                     frame.filtered, stats = imageprocessing.normalize(
                         frame.filtered, min=min_diff, max=max_diff, new_max=255
                     )
+                    np.clip(frame.filtered, a_min=0, a_max=None, out=frame.filtered)
 
+                    logging.info(
+                        "Normalied %s %s",
+                        np.amin(frame.thermal),
+                        np.amax(frame.thermal),
+                    )
+                    cv2.imwrite(
+                        str(
+                            out_folder / f"{clip_id}-{track_id}-{frame_number}.png"
+                        ),
+                        np.uint8(frame.thermal),
+                    )
 
-                    # cv2.imwrite(
-                    #     str(
-                    #         out_folder / f"{clip_id}-{track_id}-{frame_number}.png"
-                    #     ),
-                    #     np.uint8(frame.filtered),
-                    # )
                     if not stats[0]:
                         frame.filtered = np.zeros((frame.filtered.shape))
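The keep_edge change in patch 033 swaps exact equality for <=/>= when deciding whether a region touches the crop boundary. The placement rule it governs, restated as a standalone sketch (paraphrasing the diff, with a minimal region stand-in assumed to expose left/right/top/bottom):

    # A region touching an edge of the crop rectangle is pinned to that
    # edge of the output instead of being centred.
    def paste_offsets(region, crop, frame_w, frame_h, new_w, new_h):
        offset_x = (new_w - frame_w) // 2
        offset_y = (new_h - frame_h) // 2
        if region.left <= crop.left:
            offset_x = 0
        elif region.right >= crop.right:
            offset_x = new_w - frame_w
        if region.top <= crop.top:
            offset_y = 0
        elif region.bottom >= crop.bottom:
            offset_y = new_h - frame_h
        return offset_x, offset_y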
b/src/ml_tools/imageprocessing.py index d9e9f738..4eeebcac 100644 --- a/src/ml_tools/imageprocessing.py +++ b/src/ml_tools/imageprocessing.py @@ -27,15 +27,7 @@ def resize_and_pad( width = min(width, new_dim[0]) height = min(height, new_dim[1]) - logging.info( - "Resizing image with dim %s into dim %s height %s and width %s keep edge %s region %s", - frame.shape, - new_dim, - height, - width, - keep_edge, - region, - ) + if len(frame.shape) == 3: resize_dim = (width, height, frame.shape[2]) else: @@ -53,20 +45,16 @@ def resize_and_pad( offset_x = (new_dim[1] - frame_width) // 2 offset_y = (new_dim[0] - frame_height) // 2 if keep_edge and crop_region is not None: - logging.info("Checking region %s against crop %s", region, crop_region) if region.left <= crop_region.left: offset_x = 0 - logging.info("On left offset so setting 0 %s", region) elif region.right >= crop_region.right: offset_x = new_dim[1] - frame_width - logging.info("On right offset so setting 0 %s", region) if region.top <= crop_region.top: offset_y = 0 elif region.bottom >= crop_region.bottom: offset_y = new_dim[0] - frame_height - logging.info("Offsets are %s %s", offset_x, offset_y) if len(resized.shape) == 3: resized[ offset_y : offset_y + frame_height, offset_x : offset_x + frame_width, : @@ -175,7 +163,6 @@ def normalize(data, min=None, max=None, new_max=1): max = np.amax(data) if min is None: min = np.amin(data) - print("normalizing with", max, "MIN:", min) if max == min: if max == 0: return np.zeros((data.shape)), (False, max, min) diff --git a/src/ml_tools/thermalwriter.py b/src/ml_tools/thermalwriter.py index bf87e30c..345bb9fb 100644 --- a/src/ml_tools/thermalwriter.py +++ b/src/ml_tools/thermalwriter.py @@ -277,7 +277,7 @@ def get_data(clip_samples, extra_args): normalize=True, cropped=True, ) - return None + by_frame_number = {} thermal_max_diff = 0 thermal_min_diff = None @@ -306,13 +306,13 @@ def get_data(clip_samples, extra_args): # min_diff = max(0, new_min) if new_max > thermal_max_diff: thermal_max_diff = new_max - logging.info( - "Min diff %s max diff %s thermal %s - %s", - min_diff, - max_diff, - thermal_min_diff, - thermal_max_diff, - ) + # logging.info( + # "Min diff %s max diff %s thermal %s - %s", + # min_diff, + # max_diff, + # thermal_min_diff, + # thermal_max_diff, + # ) # normalize by maximum difference between background and tracked region # probably only need to use difference on the frames used for this record # also min_diff maybe could just be set to 0 and clip values below 0, @@ -344,11 +344,7 @@ def get_data(clip_samples, extra_args): raise Exception( f"Strange values for {clip_id} - {track_id} #{frame_number}" ) - logging.info( - "Median is %s median in thermal is %s", - temp_median, - np.median(frame.thermal), - ) + frame.thermal -= temp_median # np.clip(frame.thermal, a_min=0, a_max=None, out=frame.thermal) @@ -366,17 +362,12 @@ def get_data(clip_samples, extra_args): ) np.clip(frame.filtered, a_min=0, a_max=None, out=frame.filtered) - logging.info( - "Normalied %s %s", - np.amin(frame.thermal), - np.amax(frame.thermal), - ) - cv2.imwrite( - str( - out_folder / f"{clip_id}-{track_id}-{frame_number}.png" - ), - np.uint8(frame.thermal), - ) + # cv2.imwrite( + # str( + # out_folder / f"{clip_id}-{track_id}-{frame_number}.png" + # ), + # np.uint8(frame.thermal), + # ) if not stats[0]: frame.filtered = np.zeros((frame.filtered.shape)) From 09be2465c12babb6bf9d5cc9f3d1ce53efceca07 Mon Sep 17 00:00:00 2001 From: gferraro Date: Tue, 24 Sep 2024 17:30:41 +0200 Subject: [PATCH 035/117] fix tf --- 
src/ml_tools/tfdataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ml_tools/tfdataset.py b/src/ml_tools/tfdataset.py index d4790df4..027b7b92 100644 --- a/src/ml_tools/tfdataset.py +++ b/src/ml_tools/tfdataset.py @@ -99,7 +99,7 @@ def get_dataset(load_function, base_dir, labels, **args): # excluded_labels.append("cat") new_labels = labels.copy() for excluded in excluded_labels: - if excluded in labels: + if excluded in new_labels: new_labels.remove(excluded) for remapped_lbl in to_remap.keys(): if remapped_lbl in new_labels: new_labels.remove(remapped_lbl) From 79d931eb2be8ed716a12bcc37e1a40b1f01fa392 Mon Sep 17 00:00:00 2001 From: gferraro Date: Tue, 24 Sep 2024 17:59:03 +0200 Subject: [PATCH 036/117] rough balance --- src/build.py | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/src/build.py b/src/build.py index 7b91dad8..879985fd 100644 --- a/src/build.py +++ b/src/build.py @@ -717,6 +717,39 @@ def dump_split_ids(datasets, out_file="datasplit.json"): return +def rough_balance(datasets): + logging.info("ROUGH BALANCE") + print_counts(*datasets) + + for dataset in datasets: + lbl_counts = {} + counts = [] + for label in dataset.labels: + label_count = len(dataset.samples_by_label.get(label, [])) + lbl_counts[label] = label_count + counts.append(label_count) + counts.sort() + std_dev = np.std(counts) + logging.info("Counts are %s std dev %s", counts, std_dev) + if std_dev < 2000: + logging.info("Not balancing") + continue + if len(counts) < 7: + cap_at = counts[-2] + else: + cap_at = counts[-3] + logging.info("Capping dataset %s at %s", dataset.name, cap_at) + for lbl, count in lbl_counts.items(): + if count <= cap_at: + continue + samples_to_remove = count - cap_at + by_labels = dataset.samples_by_label[lbl] + np.random.shuffle(by_labels) + for i in range(samples_to_remove): + dataset.remove_sample(by_labels[i]) + print_counts(*datasets) + + def main(): init_logging() args = parse_args() @@ -782,6 +815,8 @@ def main(): print("Splitting data set into train / validation") datasets = split_randomly(master_dataset, config, args.date, test_clips) + + rough_balance(datasets) validate_datasets(datasets, test_clips, args.date) dump_split_ids(datasets, record_dir / "datasplit.json") From 29f0d69318ae2849808d8174a347f8d677624834 Mon Sep 17 00:00:00 2001 From: gferraro Date: Tue, 24 Sep 2024 20:47:51 +0200 Subject: [PATCH 037/117] more debug --- src/ml_tools/tfwriter.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/ml_tools/tfwriter.py b/src/ml_tools/tfwriter.py index 3d9129ad..677a975e 100644 --- a/src/ml_tools/tfwriter.py +++ b/src/ml_tools/tfwriter.py @@ -101,7 +101,14 @@ def create_tf_records( samples_by_source = dataset.get_samples_by_source() source_files = list(samples_by_source.keys()) np.random.shuffle(source_files) - + lbl_samples = {} + for samples, source in samples_by_source.items(): + for s in samples: + if s.label not in lbl_samples: + lbl_samples[s.label] = 0 + lbl_samples[s.label] += 1 + for lbl, count in lbl_samples.items(): + logging.info("%s samples are %s", lbl, count) num_labels = len(dataset.labels) logging.info( "writing to output path: %s for %s samples", output_path, len(samples_by_source) From 91b196f7a0629a62e218174487d6f0f622b270a7 Mon Sep 17 00:00:00 2001 From: gferraro Date: Tue, 24 Sep 2024 20:59:24 +0200 Subject: [PATCH 038/117] debug source --- src/ml_tools/thermaldataset.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/ml_tools/thermaldataset.py
b/src/ml_tools/thermaldataset.py index 079ad772..9d4f5c27 100644 --- a/src/ml_tools/thermaldataset.py +++ b/src/ml_tools/thermaldataset.py @@ -234,9 +234,9 @@ def read_tfrecord( ) if include_track: + tfrecord_format["image/source_id"] = tf.io.FixedLenFeature((), tf.string) tfrecord_format["image/track_id"] = tf.io.FixedLenFeature((), tf.int64, -1) tfrecord_format["image/avg_mass"] = tf.io.FixedLenFeature((), tf.int64, -1) - if include_features or only_features: tfrecord_format["image/features"] = tf.io.FixedLenSequenceFeature( [36 * 5 + 8], dtype=tf.float32, allow_missing=True @@ -291,9 +291,11 @@ def read_tfrecord( if extra_label_map is not None: label = tf.reduce_max(label, axis=0) if include_track: + + source_id = tf.cast(example["image/source_id"], tf.string) track_id = tf.cast(example["image/track_id"], tf.int32) avg_mass = tf.cast(example["image/avg_mass"], tf.int32) - label = (label, track_id, avg_mass) + label = (label, track_id, avg_mass, source_id) if include_features or only_features: features = tf.squeeze(example["image/features"]) if only_features: @@ -351,7 +353,7 @@ def main(): include_features=False, remapped_labels=get_remapped(), excluded_labels=get_excluded(), - include_track=False, + include_track=True, num_frames=25, ) print("Ecpoh size is", epoch_size) @@ -364,6 +366,10 @@ def main(): batch_i = 0 print("epoch", e) for x, y in resampled_ds: + source = y[3] + for s in source: + print(s) + continue show_batch(x, y, labels, save=save_dir / f"{batch_i}.jpg") batch_i += 1 # return From 03316c039094e2dffd380ce0a067a6d090f9f9ec Mon Sep 17 00:00:00 2001 From: gferraro Date: Tue, 24 Sep 2024 21:03:16 +0200 Subject: [PATCH 039/117] print id --- src/ml_tools/tfwriter.py | 11 ++++++----- src/ml_tools/thermalwriter.py | 2 +- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/src/ml_tools/tfwriter.py b/src/ml_tools/tfwriter.py index 677a975e..e0b6ebaa 100644 --- a/src/ml_tools/tfwriter.py +++ b/src/ml_tools/tfwriter.py @@ -102,13 +102,14 @@ def create_tf_records( source_files = list(samples_by_source.keys()) np.random.shuffle(source_files) lbl_samples = {} - for samples, source in samples_by_source.items(): + for source, samples in samples_by_source.items(): for s in samples: if s.label not in lbl_samples: - lbl_samples[s.label] = 0 - lbl_samples[s.label] += 1 - for lbl, count in lbl_samples.items(): - logging.info("%s samples are %s", lbl, count) + lbl_samples[s.label] = [] + lbl_samples[s.label].append(s) + for lbl, samples in lbl_samples.items(): + logging.info("%s samples are %s", lbl, len(samples)) + logging.info("Unique ids are %s", [s.unique_id for s in samples]) num_labels = len(dataset.labels) logging.info( "writing to output path: %s for %s samples", output_path, len(samples_by_source) diff --git a/src/ml_tools/thermalwriter.py b/src/ml_tools/thermalwriter.py index 345bb9fb..35f75d5f 100644 --- a/src/ml_tools/thermalwriter.py +++ b/src/ml_tools/thermalwriter.py @@ -90,7 +90,7 @@ def create_tf_example(sample, data, features, labels, num_frames): average_dim = int(round(np.mean(average_dim) ** 0.5)) thermals = list(data[0]) filtereds = list(data[1]) - image_id = sample.unique_track_id + image_id = sample.unique_id image_height, image_width = thermals[0].shape while len(thermals) < num_frames: # ensure 25 frames even if 0s From 5478116ecd92fb6265e257d2427e1324ba364356 Mon Sep 17 00:00:00 2001 From: gferraro Date: Tue, 24 Sep 2024 21:30:17 +0200 Subject: [PATCH 040/117] max samples --- src/ml_tools/tfwriter.py | 18 +++++++++--------- src/ml_tools/thermaldataset.py | 
5 +++-- src/ml_tools/thermalwriter.py | 1 + 3 files changed, 13 insertions(+), 11 deletions(-) diff --git a/src/ml_tools/tfwriter.py b/src/ml_tools/tfwriter.py index e0b6ebaa..402071df 100644 --- a/src/ml_tools/tfwriter.py +++ b/src/ml_tools/tfwriter.py @@ -101,15 +101,15 @@ def create_tf_records( samples_by_source = dataset.get_samples_by_source() source_files = list(samples_by_source.keys()) np.random.shuffle(source_files) - lbl_samples = {} - for source, samples in samples_by_source.items(): - for s in samples: - if s.label not in lbl_samples: - lbl_samples[s.label] = [] - lbl_samples[s.label].append(s) - for lbl, samples in lbl_samples.items(): - logging.info("%s samples are %s", lbl, len(samples)) - logging.info("Unique ids are %s", [s.unique_id for s in samples]) + # lbl_samples = {} + # for source, samples in samples_by_source.items(): + # for s in samples: + # if s.label not in lbl_samples: + # lbl_samples[s.label] = [] + # lbl_samples[s.label].append(s) + # for lbl, samples in lbl_samples.items(): + # logging.info("%s samples are %s", lbl, len(samples)) + # logging.info("Unique ids are %s", [s.unique_id for s in samples]) num_labels = len(dataset.labels) logging.info( "writing to output path: %s for %s samples", output_path, len(samples_by_source) diff --git a/src/ml_tools/thermaldataset.py b/src/ml_tools/thermaldataset.py index 9d4f5c27..8508f601 100644 --- a/src/ml_tools/thermaldataset.py +++ b/src/ml_tools/thermaldataset.py @@ -367,8 +367,9 @@ def main(): print("epoch", e) for x, y in resampled_ds: source = y[3] - for s in source: - print(s) + y_b = y[0] + for s, y_s in zip(source, y_b): + print(labels[np.argmax(y_s)], s.numpy().decode("utf-8")) continue show_batch(x, y, labels, save=save_dir / f"{batch_i}.jpg") batch_i += 1 diff --git a/src/ml_tools/thermalwriter.py b/src/ml_tools/thermalwriter.py index 35f75d5f..6e0fd74c 100644 --- a/src/ml_tools/thermalwriter.py +++ b/src/ml_tools/thermalwriter.py @@ -233,6 +233,7 @@ def get_data(clip_samples, extra_args): dont_filter=extra_args.get("dont_filter_segment", False), skip_ffc=extra_args.get("skip_ffc", True), ffc_frames=clip_meta.ffc_frames, + max_segments=len(samples), ) else: filter_by_lq = extra_args.get("filter_by_lq", False) From 5c16876c9ec097308ff488902bf79523ebe71ee0 Mon Sep 17 00:00:00 2001 From: gferraro Date: Tue, 24 Sep 2024 21:38:41 +0200 Subject: [PATCH 041/117] more test --- src/build.py | 2 +- src/ml_tools/thermaldataset.py | 9 ++------- src/ml_tools/thermalwriter.py | 1 - 3 files changed, 3 insertions(+), 9 deletions(-) diff --git a/src/build.py b/src/build.py index 879985fd..155ced26 100644 --- a/src/build.py +++ b/src/build.py @@ -731,7 +731,7 @@ def rough_balance(datasets): counts.sort() std_dev = np.std(counts) logging.info("Counts are %s std dev %s", counts, std_dev) - if std_dev < 2000: + if std_dev < 0: logging.info("Not balancing") continue if len(counts) < 7: diff --git a/src/ml_tools/thermaldataset.py b/src/ml_tools/thermaldataset.py index 8508f601..0ce97e54 100644 --- a/src/ml_tools/thermaldataset.py +++ b/src/ml_tools/thermaldataset.py @@ -351,8 +351,8 @@ def main(): # preprocess_fn=tf.keras.applications.inception_v3.preprocess_input, resample=False, include_features=False, - remapped_labels=get_remapped(), - excluded_labels=get_excluded(), + # remapped_labels=get_remapped(), + # excluded_labels=get_excluded(), include_track=True, num_frames=25, ) @@ -366,11 +366,6 @@ def main(): batch_i = 0 print("epoch", e) for x, y in resampled_ds: - source = y[3] - y_b = y[0] - for s, y_s in zip(source, y_b): - 
print(labels[np.argmax(y_s)], s.numpy().decode("utf-8")) - continue show_batch(x, y, labels, save=save_dir / f"{batch_i}.jpg") batch_i += 1 # return diff --git a/src/ml_tools/thermalwriter.py b/src/ml_tools/thermalwriter.py index 6e0fd74c..2025ff40 100644 --- a/src/ml_tools/thermalwriter.py +++ b/src/ml_tools/thermalwriter.py @@ -229,7 +229,6 @@ def get_data(clip_samples, extra_args): ), segment_type=extra_args.get("segment_type"), segment_min_mass=extra_args.get("segment_min_avg_mass"), - max_segments=extra_args.get("max_segments"), dont_filter=extra_args.get("dont_filter_segment", False), skip_ffc=extra_args.get("skip_ffc", True), ffc_frames=clip_meta.ffc_frames, From 8d9eed746dd47f1c85c600ff582c3ec1c0c3d69a Mon Sep 17 00:00:00 2001 From: gferraro Date: Tue, 24 Sep 2024 21:53:03 +0200 Subject: [PATCH 042/117] add lbls --- src/ml_tools/thermaldataset.py | 55 +++++++++++++++++----------------- 1 file changed, 28 insertions(+), 27 deletions(-) diff --git a/src/ml_tools/thermaldataset.py b/src/ml_tools/thermaldataset.py index 0ce97e54..96c8d9fd 100644 --- a/src/ml_tools/thermaldataset.py +++ b/src/ml_tools/thermaldataset.py @@ -43,29 +43,30 @@ def get_excluded(): "fox", "cow", "wombat", - "dog", - "sheep" "cat", - "duck", - "pheasant", - "pukeko", - "brown quail", - "black swan", - "quail", - "california quail", + # "dog", "sheep", - "echidna", - "mouse", - "rodent", - "possum", - "cat", - "dog", - "hedgehog", - "kiwi", - "leporidae", - "mustelid", - "wallaby", - "human", - "vehicle", + # "cat", + # "duck", + # "pheasant", + # "pukeko", + # "brown quail", + # "black swan", + # "quail", + # "california quail", + # "sheep", + # "echidna", + # "mouse", + # "rodent", + # "possum", + # "cat", + # "dog", + # "hedgehog", + # "kiwi", + # "leporidae", + # "mustelid", + # "wallaby", + # "human", + # "vehicle", ] @@ -73,10 +74,10 @@ def get_remapped(multi_label=False): land_bird = "land-bird" if multi_label else "bird" return { "echidna": "hedgehog", - # "grey kangaroo": "wallaby", - # "sambar deer": "deer", - # "mouse": "rodent", - # "rat": "rodent", + "grey kangaroo": "wallaby", + "sambar deer": "deer", + "mouse": "rodent", + "rat": "rodent", "rain": "false-positive", "water": "false-positive", "insect": "false-positive", @@ -88,7 +89,7 @@ def get_remapped(multi_label=False): "pheasant": land_bird, "pukeko": land_bird, "quail": land_bird, - # "chicken": land_bird, + "chicken": land_bird, } From c13aa4b44674e9b3675b0411270590db269c9575 Mon Sep 17 00:00:00 2001 From: gferraro Date: Wed, 25 Sep 2024 14:52:41 +0200 Subject: [PATCH 043/117] tidy up --- src/build.py | 5 +++-- src/ml_tools/dataset.py | 2 -- src/ml_tools/datasetstructures.py | 2 +- src/ml_tools/kerasmodel.py | 35 ++++--------------------------- src/ml_tools/tfdataset.py | 3 --- src/ml_tools/tfwriter.py | 9 -------- src/ml_tools/thermaldataset.py | 16 +++++++------- src/ml_tools/thermalwriter.py | 25 +++------------------- 8 files changed, 19 insertions(+), 78 deletions(-) diff --git a/src/build.py b/src/build.py index 155ced26..ee86b56f 100644 --- a/src/build.py +++ b/src/build.py @@ -718,7 +718,8 @@ def dump_split_ids(datasets, out_file="datasplit.json"): def rough_balance(datasets): - logging.info("ROUGH BALANCE") + dev_threshold = 2000 + logging.info("Roughly Balancing") print_counts(*datasets) for dataset in datasets: @@ -731,7 +732,7 @@ def rough_balance(datasets): counts.sort() std_dev = np.std(counts) logging.info("Counts are %s std dev %s", counts, std_dev) - if std_dev < 0: + if std_dev < dev_threshold: logging.info("Not balancing") 
continue if len(counts) < 7: diff --git a/src/ml_tools/dataset.py b/src/ml_tools/dataset.py index 30bbb224..15748b1e 100644 --- a/src/ml_tools/dataset.py +++ b/src/ml_tools/dataset.py @@ -192,8 +192,6 @@ def load_clips( counter += 1 if counter % 50 == 0: logging.debug("Dataset loaded %s", counter) - if counter == 500: - break return [counter, counter] def load_clip(self, db_clip, dont_filter_segment=False): diff --git a/src/ml_tools/datasetstructures.py b/src/ml_tools/datasetstructures.py index 57292b92..f1840527 100644 --- a/src/ml_tools/datasetstructures.py +++ b/src/ml_tools/datasetstructures.py @@ -950,7 +950,7 @@ def get_segments( filtered_stats = {"segment_mass": 0, "too short": 0} has_no_mass = np.sum(mass_history) == 0 - before = len(regions) + frame_indices = [ region.frame_number for region in regions diff --git a/src/ml_tools/kerasmodel.py b/src/ml_tools/kerasmodel.py index 32f1f9c7..955f0617 100644 --- a/src/ml_tools/kerasmodel.py +++ b/src/ml_tools/kerasmodel.py @@ -502,9 +502,6 @@ def train_model( multi_label=self.params.multi_label, num_frames=self.params.square_width**2, channels=self.params.channels, - deterministic=True, - # epoch_size=1000, - # include_Track=True, ) self.remapped = remapped self.validate, remapped, _, _ = get_dataset( @@ -522,39 +519,15 @@ def train_model( multi_label=self.params.multi_label, num_frames=self.params.square_width**2, channels=self.params.channels, - deterministic=True, - # epoch_size=250, - # in2clude_track=True, - # dist=self.dataset_counts["validation"], ) - # logging.info("Saving datasets") - # save_dir = Path("./train-images") - # save_dir.mkdir(exist_ok=True) - # batch_i = 0 - # for x, y in self.train: - # thermaldataset.show_batch( - # x, y, self.labels, save=save_dir / f"{batch_i}.jpg", tracks=True - # ) - # batch_i += 1 - - # save_dir = Path("./val-images") - # save_dir.mkdir(exist_ok=True) - # batch_i = 0 - # for x, y in self.validate: - # thermaldataset.show_batch( - # x, y, self.labels, save=save_dir / f"{batch_i}.jpg" - # ) - # batch_i += 1 - # if weights is not None: - # self.model.load_weights(weights) if rebalance: self.class_weights = get_weighting(self.train, self.labels) logging.info( - "Training on %s with class weights %s", - self.labels, - self.class_weights, - ) + "Training on %s with class weights %s", + self.labels, + self.class_weights, + ) self.save_metadata(run_name) self.save(run_name) diff --git a/src/ml_tools/tfdataset.py b/src/ml_tools/tfdataset.py index 027b7b92..33abeb0a 100644 --- a/src/ml_tools/tfdataset.py +++ b/src/ml_tools/tfdataset.py @@ -94,9 +94,6 @@ def get_dataset(load_function, base_dir, labels, **args): else: logging.info("Excluding %s", excluded_labels) - - # excluded_labels.append("insect") - # excluded_labels.append("cat") new_labels = labels.copy() for excluded in excluded_labels: if excluded in new_labels: diff --git a/src/ml_tools/tfwriter.py b/src/ml_tools/tfwriter.py index 402071df..519f4ffb 100644 --- a/src/ml_tools/tfwriter.py +++ b/src/ml_tools/tfwriter.py @@ -101,15 +101,6 @@ def create_tf_records( samples_by_source = dataset.get_samples_by_source() source_files = list(samples_by_source.keys()) np.random.shuffle(source_files) - # lbl_samples = {} - # for source, samples in samples_by_source.items(): - # for s in samples: - # if s.label not in lbl_samples: - # lbl_samples[s.label] = [] - # lbl_samples[s.label].append(s) - # for lbl, samples in lbl_samples.items(): - # logging.info("%s samples are %s", lbl, len(samples)) - # logging.info("Unique ids are %s", [s.unique_id for s in 
samples]) num_labels = len(dataset.labels) logging.info( "writing to output path: %s for %s samples", output_path, len(samples_by_source) diff --git a/src/ml_tools/thermaldataset.py b/src/ml_tools/thermaldataset.py index 96c8d9fd..096de9ff 100644 --- a/src/ml_tools/thermaldataset.py +++ b/src/ml_tools/thermaldataset.py @@ -37,14 +37,14 @@ def get_excluded(): "bat", "mammal", "frog", - "grey kangaroo", - "sambar deer", - "chicken", + # "grey kangaroo", + # "sambar deer", + # "chicken", "fox", - "cow", + # "cow", "wombat", # "dog", - "sheep", + # "sheep", # "cat", # "duck", # "pheasant", @@ -352,8 +352,8 @@ def main(): # preprocess_fn=tf.keras.applications.inception_v3.preprocess_input, resample=False, include_features=False, - # remapped_labels=get_remapped(), - # excluded_labels=get_excluded(), + remapped_labels=get_remapped(), + excluded_labels=get_excluded(), include_track=True, num_frames=25, ) @@ -367,7 +367,7 @@ def main(): batch_i = 0 print("epoch", e) for x, y in resampled_ds: - show_batch(x, y, labels, save=save_dir / f"{batch_i}.jpg") + show_batch(x, y, labels, save=save_dir / f"{batch_i}.jpg", tracks=True) batch_i += 1 # return diff --git a/src/ml_tools/thermalwriter.py b/src/ml_tools/thermalwriter.py index 2025ff40..c6a8f40b 100644 --- a/src/ml_tools/thermalwriter.py +++ b/src/ml_tools/thermalwriter.py @@ -177,18 +177,13 @@ def get_data(clip_samples, extra_args): data = [] crop_rectangle = tools.Rectangle(2, 2, 160 - 2 * 2, 140 - 2 * 2) - out_folder = None if clip_samples[0].source_file.suffix == ".hdf5": db = TrackDatabase(clip_samples[0].source_file) - out_folder = "hdf5" else: db = RawDatabase(clip_samples[0].source_file) db.load_frames() - out_folder = "raw" - # going to redo segments to get rid of ffc segments - out_folder = Path(out_folder) - out_folder.mkdir(exist_ok=True) + # going to redo segments to get rid of ffc segments clip_id = clip_samples[0].clip_id try: background = db.get_clip_background() @@ -303,16 +298,9 @@ def get_data(clip_samples, extra_args): new_min = np.amin(diff_frame) if thermal_min_diff is None or new_min < thermal_min_diff: thermal_min_diff = new_min - # min_diff = max(0, new_min) if new_max > thermal_max_diff: thermal_max_diff = new_max - # logging.info( - # "Min diff %s max diff %s thermal %s - %s", - # min_diff, - # max_diff, - # thermal_min_diff, - # thermal_max_diff, - # ) + # normalize by maximum difference between background and tracked region # probably only need to use difference on the frames used for this record # also min_diff maybe could just be set to 0 and clip values below 0, @@ -332,7 +320,7 @@ def get_data(clip_samples, extra_args): (32, 32), crop_rectangle, keep_edge=True ) if ( - np.amax(frame.thermal) > 40000 + np.amax(frame.thermal) > 50000 or np.amin(frame.thermal) < 1000 ): logging.error( @@ -362,13 +350,6 @@ def get_data(clip_samples, extra_args): ) np.clip(frame.filtered, a_min=0, a_max=None, out=frame.filtered) - # cv2.imwrite( - # str( - # out_folder / f"{clip_id}-{track_id}-{frame_number}.png" - # ), - # np.uint8(frame.thermal), - # ) - if not stats[0]: frame.filtered = np.zeros((frame.filtered.shape)) f2 = np.uint8(frame.filtered) From 670ec4bb02a5d43df4ef164c3d20ce98499a93d3 Mon Sep 17 00:00:00 2001 From: gferraro Date: Wed, 25 Sep 2024 15:09:44 +0200 Subject: [PATCH 044/117] remove load config --- src/build.py | 10 ++++++---- src/classify/clipclassifier.py | 2 +- src/config/buildconfig.py | 32 ++++++++++++++++++++++++++++++++ src/config/config.py | 7 +++---- src/config/loadconfig.py | 24 +----------------------- 
src/ml_tools/dataset.py | 8 ++++---- src/ml_tools/thermalwriter.py | 31 ++++++++++++++++++------------- src/ml_tools/tools.py | 3 +++ src/mldataset/makedataset.py | 4 ++-- src/modelevaluate.py | 5 ++--- 10 files changed, 72 insertions(+), 54 deletions(-) diff --git a/src/build.py b/src/build.py index ee86b56f..3105ec25 100644 --- a/src/build.py +++ b/src/build.py @@ -19,8 +19,8 @@ from ml_tools.tfwriter import create_tf_records from ml_tools.irwriter import save_data as save_ir_data from ml_tools.thermalwriter import save_data as save_thermal_data - - +from ml_tools.tools import CustomJSONEncoder +import attrs import numpy as np from pathlib import Path @@ -890,8 +890,9 @@ def main(): "max_segments": master_dataset.max_segments, "dont_filter_segment": True, "skip_ffc": True, - "tag_precedence": config.load.tag_precedence, + "tag_precedence": config.build.tag_precedence, "min_mass": master_dataset.min_frame_mass, + "thermal_diff_norm": config.build.thermal_diff_norm, } ) create_tf_records( @@ -915,10 +916,11 @@ def main(): "type": config.train.type, "counts": dataset_counts, "by_label": False, + "config": attrs.asdict(config), } with open(meta_filename, "w") as f: - json.dump(meta_data, f, indent=4) + json.dump(meta_data, f, indent=4, cls=CustomJSONEncoder) if __name__ == "__main__": diff --git a/src/classify/clipclassifier.py b/src/classify/clipclassifier.py index f9e239db..9bcaa135 100644 --- a/src/classify/clipclassifier.py +++ b/src/classify/clipclassifier.py @@ -129,7 +129,7 @@ def process_file(self, filename, cache=None, reuse_frames=None): clip = Clip(track_extractor.config, filename) clip.load_metadata( meta_data, - self.config.load.tag_precedence, + self.config.build.tag_precedence, ) track_extractor.parse_clip(clip) diff --git a/src/config/buildconfig.py b/src/config/buildconfig.py index 0d203f95..045de438 100644 --- a/src/config/buildconfig.py +++ b/src/config/buildconfig.py @@ -34,6 +34,32 @@ class BuildConfig(DefaultConfig): min_frame_mass = attr.ib() filter_by_lq = attr.ib() max_segments = attr.ib() + thermal_diff_norm = attr.ib() + tag_precedence = attr.ib() + excluded_tags = attr.ib() + + EXCLUDED_TAGS = ["poor tracking", "part", "untagged", "unidentified"] + + DEFAULT_GROUPS = { + 0: [ + "bird", + "false-positive", + "hedgehog", + "possum", + "rodent", + "mustelid", + "cat", + "kiwi", + "dog", + "leporidae", + "human", + "insect", + "pest", + ], + 1: ["unidentified", "other"], + 2: ["part", "bad track"], + 3: ["default"], + } @classmethod def load(cls, build): @@ -46,6 +72,9 @@ def load(cls, build): min_frame_mass=build["min_frame_mass"], filter_by_lq=build["filter_by_lq"], max_segments=build["max_segments"], + thermal_diff_norm=build["thermal_diff_norm"], + tag_precedence=build["tag_precedence"], + excluded_tags=build["excluded_tags"], ) @classmethod @@ -59,6 +88,9 @@ def get_defaults(cls): min_frame_mass=10, filter_by_lq=False, max_segments=5, + thermal_diff_norm=True, + tag_precedence=BuildConfig.DEFAULT_GROUPS, + excluded_tags=BuildConfig.EXCLUDED_TAGS, ) def validate(self): diff --git a/src/config/config.py b/src/config/config.py index e78feb40..d99cadc9 100644 --- a/src/config/config.py +++ b/src/config/config.py @@ -5,7 +5,6 @@ import logging import yaml -from .loadconfig import LoadConfig from .trackingconfig import TrackingConfig from .trainconfig import TrainConfig from .classifyconfig import ClassifyConfig @@ -31,7 +30,7 @@ class Config(DefaultConfig): "wallaby", ] base_folder = attr.ib() - load = attr.ib() + # load = attr.ib() labels = attr.ib() build = 
attr.ib() tracking = attr.ib() @@ -66,7 +65,7 @@ def load_from_stream(cls, stream): return cls( base_folder=Path(base_folder), tracking=TrackingConfig.load(raw["tracking"]), - load=LoadConfig.load(raw["load"]), + # load=LoadConfig.load(raw["load"]), train=TrainConfig.load(raw["train"], base_folder), classify=ClassifyConfig.load(raw["classify"]), reprocess=raw["reprocess"], @@ -89,7 +88,7 @@ def get_defaults(cls): worker_threads=0, build=BuildConfig.get_defaults(), tracking=TrackingConfig.get_defaults(), - load=LoadConfig.get_defaults(), + # load=LoadConfig.get_defaults(), train=TrainConfig.get_defaults(), classify=ClassifyConfig.get_defaults(), debug=False, diff --git a/src/config/loadconfig.py b/src/config/loadconfig.py index bb28f7d3..4d42fba6 100644 --- a/src/config/loadconfig.py +++ b/src/config/loadconfig.py @@ -24,29 +24,7 @@ @attr.s class LoadConfig(DefaultConfig): - EXCLUDED_TAGS = ["poor tracking", "part", "untagged", "unidentified"] - - DEFAULT_GROUPS = { - 0: [ - "bird", - "false-positive", - "hedgehog", - "possum", - "rodent", - "mustelid", - "cat", - "kiwi", - "dog", - "leporidae", - "human", - "insect", - "pest", - ], - 1: ["unidentified", "other"], - 2: ["part", "bad track"], - 3: ["default"], - } - + enable_compression = attr.ib() include_filtered_channel = attr.ib() preview = attr.ib() diff --git a/src/ml_tools/dataset.py b/src/ml_tools/dataset.py index 15748b1e..dcb58fb0 100644 --- a/src/ml_tools/dataset.py +++ b/src/ml_tools/dataset.py @@ -20,7 +20,7 @@ from ml_tools import tools from track.region import Region import json -from config.loadconfig import LoadConfig +from config.buildconfig import BuildConfig from pathlib import Path @@ -64,7 +64,7 @@ def __init__( self.label_caps = {} self.use_segments = True if config: - self.tag_precedence = config.load.tag_precedence + self.tag_precedence = config.build.tag_precedence self.type = config.train.type if config.train.type == "IR": self.use_segments = False @@ -80,13 +80,13 @@ def __init__( self.banned_clips = config.build.banned_clips self.included_labels = config.labels self.segment_min_avg_mass = config.build.segment_min_avg_mass - self.excluded_tags = config.load.excluded_tags + self.excluded_tags = config.build.excluded_tags self.min_frame_mass = config.build.min_frame_mass self.filter_by_lq = config.build.filter_by_lq self.segment_type = SegmentType.ALL_RANDOM self.max_segments = config.build.max_segments else: - self.tag_precedence = LoadConfig.DEFAULT_GROUPS + self.tag_precedence = BuildConfig.DEFAULT_GROUPS self.filter_by_lq = False # number of seconds each segment should be if self.use_segments: diff --git a/src/ml_tools/thermalwriter.py b/src/ml_tools/thermalwriter.py index c6a8f40b..603698e4 100644 --- a/src/ml_tools/thermalwriter.py +++ b/src/ml_tools/thermalwriter.py @@ -274,10 +274,13 @@ def get_data(clip_samples, extra_args): ) by_frame_number = {} - thermal_max_diff = 0 + thermal_max_diff = None thermal_min_diff = None - max_diff = 0 + max_diff = None min_diff = None + + thermal_diff_norm = extra_args.get("thermal_diff_norm", False) + for f in track_frames: if f.region.blank or f.region.width <= 0 or f.region.height <= 0: continue @@ -290,16 +293,16 @@ def get_data(clip_samples, extra_args): if min_diff is None or new_min < min_diff: min_diff = new_min # min_diff = max(0, new_min) - if new_max > max_diff: + if max_diff is None or new_max > max_diff: max_diff = new_max - - diff_frame = f.thermal - frame_temp_median[f.frame_number] - new_max = np.amax(diff_frame) - new_min = np.amin(diff_frame) - if 
thermal_min_diff is None or new_min < thermal_min_diff: - thermal_min_diff = new_min - if new_max > thermal_max_diff: - thermal_max_diff = new_max + if thermal_diff_norm: + diff_frame = f.thermal - frame_temp_median[f.frame_number] + new_max = np.amax(diff_frame) + new_min = np.amin(diff_frame) + if thermal_min_diff is None or new_min < thermal_min_diff: + thermal_min_diff = new_min + if thermal_max_diff is None or new_max > thermal_max_diff: + thermal_max_diff = new_max # normalize by maximum difference between background and tracked region # probably only need to use difference on the frames used for this record @@ -334,8 +337,10 @@ def get_data(clip_samples, extra_args): ) frame.thermal -= temp_median - - # np.clip(frame.thermal, a_min=0, a_max=None, out=frame.thermal) + if not thermal_diff_norm: + np.clip( + frame.thermal, a_min=0, a_max=None, out=frame.thermal + ) frame.thermal, stats = imageprocessing.normalize( frame.thermal, min=thermal_min_diff, diff --git a/src/ml_tools/tools.py b/src/ml_tools/tools.py index 519c9e2f..bdfb51de 100644 --- a/src/ml_tools/tools.py +++ b/src/ml_tools/tools.py @@ -52,6 +52,9 @@ def default(self, obj): return obj.isoformat() elif isinstance(obj, Rectangle): return obj.meta_dictionary() + elif isinstance(obj, Path): + return str(obj) + # Let the base class default method raise the TypeError return json.JSONEncoder.default(self, obj) diff --git a/src/mldataset/makedataset.py b/src/mldataset/makedataset.py index 65368843..cbb7e75a 100644 --- a/src/mldataset/makedataset.py +++ b/src/mldataset/makedataset.py @@ -131,7 +131,7 @@ def process_file(self, filename, out_dir, config): clip = Clip(config.tracking["thermal"], filename) clip.load_metadata( metadata, - config.load.tag_precedence, + config.build.tag_precedence, ) with h5py.File(out_file, "w") as f: @@ -263,7 +263,7 @@ def process_file(self, filename, out_dir, config): node_attrs["id"] = track_id tags = track.get("tags", []) tag = Track.get_best_human_tag( - tags, self.config.load.tag_precedence, 0 + tags, self.config.build.tag_precedence, 0 ) master_tag = [ diff --git a/src/modelevaluate.py b/src/modelevaluate.py index 009451c8..e3ff9d79 100644 --- a/src/modelevaluate.py +++ b/src/modelevaluate.py @@ -44,7 +44,7 @@ from ml_tools.frame import Frame from ml_tools import imageprocessing import cv2 -from config.loadconfig import LoadConfig +from config.buildconfig import BuildConfig from sklearn.metrics import confusion_matrix from multiprocessing import Pool @@ -255,7 +255,6 @@ def evalute_prod_confusion(dir, confusion_file): tag.get("what") for tag in tags if tag.get("automatic") == False - # and tag.get("what", "") not in LoadConfig.EXCLUDED_TAGS ] human_tags = set(human_tags) if len(human_tags) > 1: @@ -299,7 +298,7 @@ def load_clip_data(cptv_file): # for clip in dataset.clips: reason = {} clip_db = RawDatabase(cptv_file) - clip = clip_db.get_clip_tracks(LoadConfig.DEFAULT_GROUPS) + clip = clip_db.get_clip_tracks(BuildConfig.DEFAULT_GROUPS) if clip is None: logging.warn("No clip for %s", cptv_file) return None From 1531bec6ce8604d2ba642e3579ab908740c6323c Mon Sep 17 00:00:00 2001 From: gferraro Date: Wed, 25 Sep 2024 15:09:51 +0200 Subject: [PATCH 045/117] delete load config --- src/config/loadconfig.py | 61 ---------------------------------------- 1 file changed, 61 deletions(-) delete mode 100644 src/config/loadconfig.py diff --git a/src/config/loadconfig.py b/src/config/loadconfig.py deleted file mode 100644 index 4d42fba6..00000000 --- a/src/config/loadconfig.py +++ /dev/null @@ -1,61 +0,0 @@ -""" 
-classifier-pipeline - this is a server side component that manipulates cptv -files and to create a classification model of animals present -Copyright (C) 2018, The Cacophony Project - -This program is free software: you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation, either version 3 of the License, or -(at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program. If not, see . -""" - -import attr - -from .defaultconfig import DefaultConfig - - -@attr.s -class LoadConfig(DefaultConfig): - - enable_compression = attr.ib() - include_filtered_channel = attr.ib() - preview = attr.ib() - tag_precedence = attr.ib() - cache_to_disk = attr.ib() - high_quality_optical_flow = attr.ib() - excluded_tags = attr.ib() - - @classmethod - def load(cls, config): - return cls( - enable_compression=config["enable_compression"], - include_filtered_channel=config["include_filtered_channel"], - preview=config["preview"], - tag_precedence=config["tag_precedence"], - cache_to_disk=config["cache_to_disk"], - high_quality_optical_flow=config["high_quality_optical_flow"], - excluded_tags=config["excluded_tags"], - ) - - @classmethod - def get_defaults(cls): - return cls( - enable_compression=False, - include_filtered_channel=True, - preview=None, - tag_precedence=LoadConfig.DEFAULT_GROUPS, - cache_to_disk=False, - high_quality_optical_flow=True, - excluded_tags=LoadConfig.EXCLUDED_TAGS, - ) - - def validate(self): - return True From c5f2106ba9d526e0c321baaa115bd915618cc237 Mon Sep 17 00:00:00 2001 From: gferraro Date: Wed, 25 Sep 2024 15:19:19 +0200 Subject: [PATCH 046/117] remove load --- requirements.txt | 2 +- src/config/config.py | 4 ---- src/ml_tools/kerasmodel.py | 3 ++- 3 files changed, 3 insertions(+), 6 deletions(-) diff --git a/requirements.txt b/requirements.txt index af94548d..ac2d76b0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -tensorflow~=2.14.0 +tensorflow~=2.17.0 matplotlib~=3.0 pytz cptv~=1.5.4 diff --git a/src/config/config.py b/src/config/config.py index d99cadc9..78ca23be 100644 --- a/src/config/config.py +++ b/src/config/config.py @@ -30,7 +30,6 @@ class Config(DefaultConfig): "wallaby", ] base_folder = attr.ib() - # load = attr.ib() labels = attr.ib() build = attr.ib() tracking = attr.ib() @@ -65,7 +64,6 @@ def load_from_stream(cls, stream): return cls( base_folder=Path(base_folder), tracking=TrackingConfig.load(raw["tracking"]), - # load=LoadConfig.load(raw["load"]), train=TrainConfig.load(raw["train"], base_folder), classify=ClassifyConfig.load(raw["classify"]), reprocess=raw["reprocess"], @@ -88,7 +86,6 @@ def get_defaults(cls): worker_threads=0, build=BuildConfig.get_defaults(), tracking=TrackingConfig.get_defaults(), - # load=LoadConfig.get_defaults(), train=TrainConfig.get_defaults(), classify=ClassifyConfig.get_defaults(), debug=False, @@ -100,7 +97,6 @@ def validate(self): self.build.validate() for tracker in self.tracking.values(): tracker.validate() - self.load.validate() self.train.validate() self.classify.validate() return True diff --git a/src/ml_tools/kerasmodel.py b/src/ml_tools/kerasmodel.py index 955f0617..4d684489 100644 --- a/src/ml_tools/kerasmodel.py +++ 
b/src/ml_tools/kerasmodel.py @@ -520,7 +520,8 @@ def train_model( num_frames=self.params.square_width**2, channels=self.params.channels, ) - + if weights is not None: + self.model.load_weights(weights) if rebalance: self.class_weights = get_weighting(self.train, self.labels) logging.info( From 40d88744c663969e0308a2b12a311011950e22fa Mon Sep 17 00:00:00 2001 From: gferraro Date: Wed, 25 Sep 2024 15:19:49 +0200 Subject: [PATCH 047/117] fix base_training default --- src/ml_tools/hyperparams.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ml_tools/hyperparams.py b/src/ml_tools/hyperparams.py index 946f4454..cbe05d11 100644 --- a/src/ml_tools/hyperparams.py +++ b/src/ml_tools/hyperparams.py @@ -105,7 +105,7 @@ def label_smoothing(self): @property def base_training(self): - return self.get("base_training", False) + return self.get("base_training", True) @property def retrain_layer(self): From f38b83ac8a43f36e6266aa698698530f883c4d8b Mon Sep 17 00:00:00 2001 From: gferraro Date: Wed, 25 Sep 2024 15:22:34 +0200 Subject: [PATCH 048/117] labels --- src/ml_tools/thermaldataset.py | 32 ++------------------------------ 1 file changed, 2 insertions(+), 30 deletions(-) diff --git a/src/ml_tools/thermaldataset.py b/src/ml_tools/thermaldataset.py index 096de9ff..afbbbbe1 100644 --- a/src/ml_tools/thermaldataset.py +++ b/src/ml_tools/thermaldataset.py @@ -37,36 +37,8 @@ def get_excluded(): "bat", "mammal", "frog", - # "grey kangaroo", - # "sambar deer", - # "chicken", - "fox", - # "cow", - "wombat", - # "dog", - # "sheep", - # "cat", - # "duck", - # "pheasant", - # "pukeko", - # "brown quail", - # "black swan", - # "quail", - # "california quail", - # "sheep", - # "echidna", - # "mouse", - # "rodent", - # "possum", - # "cat", - # "dog", - # "hedgehog", - # "kiwi", - # "leporidae", - # "mustelid", - # "wallaby", - # "human", - # "vehicle", + # "fox", + # "wombat", ] From f8fffb6df495f05d1d1fa7a0ddb9118aaa7c3aed Mon Sep 17 00:00:00 2001 From: gferraro Date: Wed, 25 Sep 2024 15:23:29 +0200 Subject: [PATCH 049/117] adjusted defaults --- src/ml_tools/hyperparams.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/ml_tools/hyperparams.py b/src/ml_tools/hyperparams.py index cbe05d11..cbeab17a 100644 --- a/src/ml_tools/hyperparams.py +++ b/src/ml_tools/hyperparams.py @@ -24,9 +24,8 @@ def insert_defaults(self): self["square_width"] = self.square_width self["frame_size"] = self.frame_size self["segment_width"] = self.segment_width - self["segment_type"] = self.segment_type - self["multi_label"] = False + self["multi_label"] = True self["diff_norm"] = self.diff_norm self["smooth_predictions"] = self.smooth_predictions self["channels"] = self.channels From 7720d5918b81b97b7b6137dab50975e2b09ca578 Mon Sep 17 00:00:00 2001 From: gferraro Date: Wed, 25 Sep 2024 16:03:54 +0200 Subject: [PATCH 050/117] remove source id --- src/ml_tools/kerasmodel.py | 10 +--------- src/ml_tools/thermaldataset.py | 5 ++--- 2 files changed, 3 insertions(+), 12 deletions(-) diff --git a/src/ml_tools/kerasmodel.py b/src/ml_tools/kerasmodel.py index 4d684489..ed3c39fe 100644 --- a/src/ml_tools/kerasmodel.py +++ b/src/ml_tools/kerasmodel.py @@ -803,15 +803,7 @@ def confusion_tracks(self, dataset, filename, threshold=0.8): new_smooth = pred.predictions * masses new_smooth = np.sum(new_smooth, axis=0) new_smooth /= np.sum(masses) - # logging.info( - # "Smoothing %s with masses %s", np.round(100 * pred.predictions), masses - # ) - # logging.info( - # "N smooth %s old %s new %s", - # np.round(100 * 
no_smoothing), - # np.round(100 * old_smoothing), - # np.round(100 * new_smooth), - # ) + for i, pred_type in enumerate([no_smoothing, old_smoothing, new_smooth]): best_pred = np.argmax(pred_type) confidence = pred_type[best_pred] diff --git a/src/ml_tools/thermaldataset.py b/src/ml_tools/thermaldataset.py index afbbbbe1..d4a7e9e7 100644 --- a/src/ml_tools/thermaldataset.py +++ b/src/ml_tools/thermaldataset.py @@ -37,6 +37,7 @@ def get_excluded(): "bat", "mammal", "frog", + "cow", # "fox", # "wombat", ] @@ -207,7 +208,6 @@ def read_tfrecord( ) if include_track: - tfrecord_format["image/source_id"] = tf.io.FixedLenFeature((), tf.string) tfrecord_format["image/track_id"] = tf.io.FixedLenFeature((), tf.int64, -1) tfrecord_format["image/avg_mass"] = tf.io.FixedLenFeature((), tf.int64, -1) if include_features or only_features: @@ -265,10 +265,9 @@ def read_tfrecord( label = tf.reduce_max(label, axis=0) if include_track: - source_id = tf.cast(example["image/source_id"], tf.string) track_id = tf.cast(example["image/track_id"], tf.int32) avg_mass = tf.cast(example["image/avg_mass"], tf.int32) - label = (label, track_id, avg_mass, source_id) + label = (label, track_id, avg_mass) if include_features or only_features: features = tf.squeeze(example["image/features"]) if only_features: From 1ac2e23f2708f35b909c1552e3098ae6b411e35e Mon Sep 17 00:00:00 2001 From: gferraro Date: Wed, 25 Sep 2024 17:26:15 +0200 Subject: [PATCH 051/117] add config to split data by location --- src/build.py | 6 ++++-- src/config/buildconfig.py | 14 ++++++++++++++ src/ml_tools/dataset.py | 32 +++++++++++++++++++++++++++++--- src/ml_tools/rawdb.py | 11 +++++++++-- src/ml_tools/rectangle.py | 4 ++++ 5 files changed, 60 insertions(+), 7 deletions(-) diff --git a/src/build.py b/src/build.py index 3105ec25..ff1edac9 100644 --- a/src/build.py +++ b/src/build.py @@ -732,10 +732,12 @@ def rough_balance(datasets): counts.sort() std_dev = np.std(counts) logging.info("Counts are %s std dev %s", counts, std_dev) - if std_dev < dev_threshold: + if std_dev < dev_threshold or len(counts) == 0: logging.info("Not balancing") continue - if len(counts) < 7: + if len(counts) <= 2: + cap_at = counts[0] + elif len(counts) < 7: cap_at = counts[-2] else: cap_at = counts[-3] diff --git a/src/config/buildconfig.py b/src/config/buildconfig.py index 045de438..31d2aa01 100644 --- a/src/config/buildconfig.py +++ b/src/config/buildconfig.py @@ -22,6 +22,7 @@ import logging from os import path from .defaultconfig import DefaultConfig +from ml_tools.rectangle import Rectangle @attr.s @@ -37,9 +38,20 @@ class BuildConfig(DefaultConfig): thermal_diff_norm = attr.ib() tag_precedence = attr.ib() excluded_tags = attr.ib() + country = attr.ib() EXCLUDED_TAGS = ["poor tracking", "part", "untagged", "unidentified"] + # country bounding boxes + COUNTRY_LOCATIONS = { + "AU": Rectangle.from_ltrb( + 113.338953078, -10.6681857235, 153.569469029, -43.6345972634 + ), + "NZ": Rectangle.from_ltrb( + 166.509144322, -34.4506617165, 178.517093541, -46.641235447 + ), + } + DEFAULT_GROUPS = { 0: [ "bird", @@ -75,6 +87,7 @@ def load(cls, build): thermal_diff_norm=build["thermal_diff_norm"], tag_precedence=build["tag_precedence"], excluded_tags=build["excluded_tags"], + country=build["country"], ) @classmethod @@ -91,6 +104,7 @@ def get_defaults(cls): thermal_diff_norm=True, tag_precedence=BuildConfig.DEFAULT_GROUPS, excluded_tags=BuildConfig.EXCLUDED_TAGS, + country="NZ", ) def validate(self): diff --git a/src/ml_tools/dataset.py b/src/ml_tools/dataset.py index dcb58fb0..290f0f1a
100644 --- a/src/ml_tools/dataset.py +++ b/src/ml_tools/dataset.py @@ -85,7 +85,9 @@ def __init__( self.filter_by_lq = config.build.filter_by_lq self.segment_type = SegmentType.ALL_RANDOM self.max_segments = config.build.max_segments + self.country = config.build.country else: + self.country = "NZ" self.tag_precedence = BuildConfig.DEFAULT_GROUPS self.filter_by_lq = False # number of seconds each segment should be @@ -98,6 +100,13 @@ def __init__( self.segment_min_avg_mass = 10 self.min_frame_mass = 16 self.segment_type = SegmentType.ALL_RANDOM + + self.country_rectangle = BuildConfig.COUNTRY_LOCATIONS.get(self.country) + logging.info( + "Filtering by country %s with bounding box %s", + self.country, + self.country_rectangle, + ) self.max_frame_mass = None self.filtered_stats = { "confidence": 0, @@ -204,7 +213,12 @@ def load_clip(self, db_clip, dont_filter_segment=False): except: logging.error("Could not load %s", db_clip, exc_info=True) return 0 - if clip_header is None or filter_clip(clip_header): + if clip_header is None or filter_clip( + clip_header, + clip_header.location, + self.country_rectangle, + self.filtered_stats, + ): return 0 filtered = 0 added = 0 @@ -616,12 +630,24 @@ def filter_track(track_header, excluded_tags, filtered_stats={}): return False -def filter_clip(clip, filtered_stats={}): +def filter_clip(clip, location, location_bounds, filtered_stats=None): # remove tracks of trapped animals if (clip.events is not None and "trap" in clip.events.lower()) or ( clip.trap is not None and "trap" in clip.trap.lower() ): - self.filtered_stats["trap"] += 1 + if filtered_stats is not None: + if "trap" in filtered_stats: + filtered_stats["trap"] += 1 + else: + filtered_stats["trap"] = 1 logging.info("Filtered because in trap") return True + + if location_bounds is not None and not location_bounds.contains(*location): + if filtered_stats is not None: + if "location" in filtered_stats: + filtered_stats["location"] += 1 + else: + filtered_stats["location"] = 1 + return True return False diff --git a/src/ml_tools/rawdb.py b/src/ml_tools/rawdb.py index 29921198..2b41b30e 100644 --- a/src/ml_tools/rawdb.py +++ b/src/ml_tools/rawdb.py @@ -113,12 +113,19 @@ def get_clip_tracks(self, tag_precedence): self.crop_rectangle = Rectangle( edge_pixels, edge_pixels, resx - edge_pixels * 2, resy - edge_pixels * 2 ) - + location = metadata.get("location") + lat = None + lng = None + try: + lat = location.get("lat") + lng = location.get("lng") + except: + pass clip_header = ClipHeader( clip_id=int(metadata["id"]), station_id=metadata.get("stationId"), source_file=self.file, - location=metadata.get("location"), + location=None if lat is None or lng is None else (lng, lat), camera=metadata.get("deviceId"), rec_time=parse_date(metadata["recordingDateTime"]), frames_per_second=10 if self.file.suffix == "mp4" else 9, diff --git a/src/ml_tools/rectangle.py b/src/ml_tools/rectangle.py index 225a754f..4191654c 100644 --- a/src/ml_tools/rectangle.py +++ b/src/ml_tools/rectangle.py @@ -106,6 +106,10 @@ def enlarge(self, border, max=None): if max: self.crop(max) + def contains(self, x, y): + """Is this point contained in the rectangle""" + return self.left <= x and self.right >= x and self.top >= y and self.bottom <= y + @property def area(self): return int(self.width) * self.height From b50918eaf27787d6771cac67a0d96ed5f983f646 Mon Sep 17 00:00:00 2001 From: gferraro Date: Wed, 25 Sep 2024 17:56:27 +0200 Subject: [PATCH 052/117] up requirements --- pirequirements.txt | 2 +- requirements.txt | 2 +- 2 files changed,
2 insertions(+), 2 deletions(-) diff --git a/pirequirements.txt b/pirequirements.txt index 62280bdf..48625de4 100644 --- a/pirequirements.txt +++ b/pirequirements.txt @@ -9,7 +9,7 @@ scipy==1.9.3 python-dateutil scikit-learn==1.1.3 tables==3.8.0 -h5py==3.8.0 +h5py==3.10.0 pyyaml==6.0 pillow==10.0.1 attrs==19.2.0 diff --git a/requirements.txt b/requirements.txt index ac2d76b0..3988be34 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,7 +7,7 @@ scipy python-dateutil scikit-learn tables~=3.8.0 -h5py~=3.9.0 +h5py~=3.10.0 pyyaml>=4.2b1 pillow~=10.0.1 attrs~=19.1 From a174fc5e8d74c51b958efba9e093e7fa14f360cc Mon Sep 17 00:00:00 2001 From: gferraro Date: Thu, 26 Sep 2024 09:21:10 +0200 Subject: [PATCH 053/117] none location check --- src/ml_tools/dataset.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/ml_tools/dataset.py b/src/ml_tools/dataset.py index 290f0f1a..505dc039 100644 --- a/src/ml_tools/dataset.py +++ b/src/ml_tools/dataset.py @@ -643,7 +643,11 @@ def filter_clip(clip, location, location_bounds, filtered_stats=None): logging.info("Filtered because in trap") return True - if location_bounds is not None and not location_bounds.contains(*location): + if ( + location is not None + and location_bounds is not None + and not location_bounds.contains(*location) + ): if filtered_stats is not None: if "location" in filtered_stats: filtered_stats["location"] += 1 From d40f7e6a7cefaeaa1ba3b2e074a99b73eb990e03 Mon Sep 17 00:00:00 2001 From: gferraro Date: Sun, 29 Sep 2024 19:34:04 +0200 Subject: [PATCH 054/117] added fine tune option --- src/ml_tools/hyperparams.py | 8 ++++ src/ml_tools/kerasmodel.py | 78 ++++++++++++++++++++++++++++++++++--- src/ml_tools/tfdataset.py | 7 ++++ src/train.py | 2 + src/train/train.py | 4 +- 5 files changed, 93 insertions(+), 6 deletions(-) diff --git a/src/ml_tools/hyperparams.py b/src/ml_tools/hyperparams.py index cbeab17a..b4b57055 100644 --- a/src/ml_tools/hyperparams.py +++ b/src/ml_tools/hyperparams.py @@ -50,6 +50,14 @@ def output_dim(self): def smooth_predictions(self): return self.get("smooth_predictions", True) + @property + def excluded_labels(self): + return self.get("excluded_labels", None) + + @property + def remapped_labels(self): + return self.get("remapped_labels", None) + @property def diff_norm(self): return self.get("diff_norm", True) diff --git a/src/ml_tools/kerasmodel.py b/src/ml_tools/kerasmodel.py index ed3c39fe..937ce9c9 100644 --- a/src/ml_tools/kerasmodel.py +++ b/src/ml_tools/kerasmodel.py @@ -362,6 +362,59 @@ def build_model( ], ) + def adjust_final_layer(self): + # Adjust final layer to a new set of labels, by removing it and re adding + # new_model = tf.keras.models.Sequential(self.model.layers[:-3]) + self.model = tf.keras.Model( + inputs=self.model.input, outputs=self.model.layers[-2].output + ) + + # model = tf.keras.Model(inputs=self.model.input, outputs=x) + + activation = "softmax" + if self.params.multi_label: + activation = "sigmoid" + + retrain_from = self.params.retrain_layer + if retrain_from: + for i, layer in enumerate(self.model.layers): + if isinstance(layer, tf.keras.layers.BatchNormalization): + # apparently this shouldn't matter as we set base_training = False + layer.trainable = False + logging.info("dont train %s %s", i, layer.name) + else: + layer.trainable = i >= retrain_from + else: + self.model.trainable = self.params.base_training + + # add final layer after as always want this trainable + logging.info( + "Adding new final layer with %s activation and %s labels ", + 
activation, + len(self.labels), + ) + preds = tf.keras.layers.Dense( + len(self.labels), activation=activation, name="prediction" + )(self.model.output) + + self.model = tf.keras.models.Model(self.model.inputs, outputs=preds) + if self.params.multi_label: + acc = tf.metrics.binary_accuracy + else: + acc = tf.metrics.categorical_accuracy + logging.info("Using acc %s", acc) + self.model.summary() + self.model.compile( + optimizer=optimizer(self.params), + loss=loss(self.params), + metrics=[ + acc, + tf.keras.metrics.AUC(), + tf.keras.metrics.Recall(), + tf.keras.metrics.Precision(), + ], + ) + def load_model(self, model_file, training=False, weights=None): model_file = Path(model_file) super().__init__(model_file) @@ -450,14 +503,26 @@ def close(self): gc.collect() def train_model( - self, epochs, run_name, weights=None, rebalance=False, resample=False + self, + epochs, + run_name, + weights=None, + rebalance=False, + resample=False, + fine_tune=None, ): logging.info( "%s Training model for %s epochs with weights %s", run_name, epochs, weights ) - self.excluded_labels, self.remapped_labels = get_excluded( - self.data_type, self.params.multi_label - ) + + if self.params.excluded_labels is None: + self.excluded_labels, self.remapped_labels = get_excluded( + self.data_type, self.params.multi_label + ) + if self.params.remapped_labels is None: + self.remapped_labels, self.remapped_labels = get_excluded( + self.data_type, self.params.multi_label + ) train_files = self.data_dir / "train" validate_files = self.data_dir / "validation" logging.info( @@ -475,8 +540,11 @@ def train_model( self.labels.remove(l) self.log_dir = self.log_base / run_name self.log_dir.mkdir(parents=True, exist_ok=True) + if fine_tune is not None: + self.load_model(fine_tune, weights=weights) + self.adjust_final_layer() - if not self.model: + elif not self.model: self.build_model( dense_sizes=self.params.dense_sizes, retrain_from=self.params.retrain_layer, diff --git a/src/ml_tools/tfdataset.py b/src/ml_tools/tfdataset.py index 33abeb0a..bcb29027 100644 --- a/src/ml_tools/tfdataset.py +++ b/src/ml_tools/tfdataset.py @@ -94,6 +94,8 @@ def get_dataset(load_function, base_dir, labels, **args): else: logging.info("Excluding %s", excluded_labels) + + # get new labels after excluding and removing remapped labels new_labels = labels.copy() for excluded in excluded_labels: if excluded in new_labels: @@ -101,6 +103,8 @@ def get_dataset(load_function, base_dir, labels, **args): for remapped_lbl in to_remap.keys(): if remapped_lbl in new_labels: new_labels.remove(remapped_lbl) + + # initialize remapped dictionary, setting labels that have been removed to -1, these values will be filtered later for l in labels: keys.append(labels.index(l)) if l not in new_labels: @@ -110,11 +114,14 @@ def get_dataset(load_function, base_dir, labels, **args): else: remapped[l] = [l] values.append(new_labels.index(l)) + + # add the remapped labels to the correct place for k, v in to_remap.items(): if k in labels and v in labels: remapped[v].append(k) values[labels.index(k)] = new_labels.index(v) del remapped[k] + remap_lookup = tf.lookup.StaticHashTable( initializer=tf.lookup.KeyValueTensorInitializer( keys=tf.constant(keys), diff --git a/src/train.py b/src/train.py index 5091b78b..16677575 100644 --- a/src/train.py +++ b/src/train.py @@ -45,6 +45,7 @@ def load_config(): parser.add_argument("-w", "--weights", help="Fine tune using these weights") parser.add_argument("-i", "--ignore", help="Ignore clips in this file") parser.add_argument("-e", "--epochs", 
type=int, help="Epochs to train") + parser.add_argument("-f", "--fine_tune", help="Model to fine tune") parser.add_argument( "name", @@ -67,6 +68,7 @@ def main(): weights=args.weights, ignore=args.ignore, epochs=args.epochs, + fine_tune=args.fine_tune, ) diff --git a/src/train/train.py b/src/train/train.py index 880678b0..60af5ca8 100644 --- a/src/train/train.py +++ b/src/train/train.py @@ -28,7 +28,7 @@ def remove_fp_segments(datasets, ignore_file): print("deleting segment", segment.unique_track_id) for delete in delete_me: try: - datset.remove_track(delete.track_id) + dataset.remove_track(delete.track_id) except: pass dataset.segments.remove(delete) @@ -44,6 +44,7 @@ def train_model( do_grid_search=None, ignore=None, epochs=None, + fine_tune=None, ): init_logging() """Trains a model with the given hyper parameters.""" @@ -77,6 +78,7 @@ def train_model( weights=weights, resample=False, rebalance=False, + fine_tune=fine_tune, ) except KeyboardInterrupt: pass From 4ab73d86d4522866cd9c813d585bdff7203eebe3 Mon Sep 17 00:00:00 2001 From: gferraro Date: Sun, 29 Sep 2024 19:42:51 +0200 Subject: [PATCH 055/117] fix load --- src/ml_tools/kerasmodel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ml_tools/kerasmodel.py b/src/ml_tools/kerasmodel.py index 937ce9c9..c4c18dd2 100644 --- a/src/ml_tools/kerasmodel.py +++ b/src/ml_tools/kerasmodel.py @@ -427,7 +427,7 @@ def load_model(self, model_file, training=False, weights=None): self.model.trainable = training if weights is not None: - self.model.load_weights(weights).expect_partial() + self.model.load_weights(weights) logging.info("Loaded weight %s", weights) # print(self.model.summary()) From 552b788b8c9dc6d864a9227a1ea17bc3c157bdb4 Mon Sep 17 00:00:00 2001 From: gferraro Date: Tue, 1 Oct 2024 13:47:03 +0200 Subject: [PATCH 056/117] add date filtering --- src/ml_tools/dataset.py | 11 +++++- src/modelevaluate.py | 83 +++++++++++++++++++++-------------------- 2 files changed, 52 insertions(+), 42 deletions(-) diff --git a/src/ml_tools/dataset.py b/src/ml_tools/dataset.py index 505dc039..4c4b0b63 100644 --- a/src/ml_tools/dataset.py +++ b/src/ml_tools/dataset.py @@ -630,7 +630,7 @@ def filter_track(track_header, excluded_tags, filtered_stats={}): return False -def filter_clip(clip, location, location_bounds, filtered_stats=None): +def filter_clip(clip, location, location_bounds, filtered_stats=None, after_date=None): # remove tracks of trapped animals if (clip.events is not None and "trap" in clip.events.lower()) or ( clip.trap is not None and "trap" in clip.trap.lower() @@ -654,4 +654,13 @@ def filter_clip(clip, location, location_bounds, filtered_stats=None): else: filtered_stats["location"] = 1 return True + + if after_date is not None and clip.rec_time <= after_date: + if filtered_stats is not None: + if "date" in filtered_stats: + filtered_stats["date"] += 1 + else: + filtered_stats["date"] = 1 + return True + return False diff --git a/src/modelevaluate.py b/src/modelevaluate.py index e3ff9d79..e07ed872 100644 --- a/src/modelevaluate.py +++ b/src/modelevaluate.py @@ -163,6 +163,9 @@ def load_args(): parser.add_argument("-d", "--date", help="Use clips after this") parser.add_argument("--split-file", help="Use split for evaluation") + parser.add_argument( + "--confusion-from-meta", help="Use metadata to produce a confusion matrix" + ) parser.add_argument( "confusion", @@ -213,32 +216,14 @@ def filter_diffs(track_frames, background): return min_diff, max_diff -def evalute_prod_confusion(dir, confusion_file): +# evaluate a 
confusion matrix from metadata of files, already evaluated by our current model on browse + + +def metadata_confusion(dir, confusion_file): with open("label_paths.json", "r") as f: label_paths = json.load(f) label_mapping = get_mappings(label_paths) - - labels = [ - "bird", - "cat", - "deer", - "dog", - "false-positive", - "hedgehog", - "human", - "kiwi", - "leporidae", - "mustelid", - "penguin", - "possum", - "rodent", - "sheep", - "vehicle", - "wallaby", - "land-bird", - "None", - "unidentified", - ] + labels = set() y_true = [] y_pred = [] dir = Path(dir) @@ -252,9 +237,7 @@ def evalute_prod_confusion(dir, confusion_file): for track in meta_data.get("Tracks", []): tags = track.get("tags", []) human_tags = [ - tag.get("what") - for tag in tags - if tag.get("automatic") == False + tag.get("what") for tag in tags if tag.get("automatic") == False ] human_tags = set(human_tags) if len(human_tags) > 1: @@ -264,6 +247,7 @@ def evalute_prod_confusion(dir, confusion_file): continue human_tag = human_tags.pop() human_tag = label_mapping.get(human_tag, human_tag) + labels.add(human_tag) ai_tag = [ tag.get("what") for tag in tags @@ -273,9 +257,13 @@ def evalute_prod_confusion(dir, confusion_file): y_true.append(human_tag) if len(ai_tag) == 0: y_pred.append("None") + labels.add("None") else: + labels.add(ai_tag[0]) y_pred.append(ai_tag[0]) - + labels = list(labels) + labels.sort() + logging.info("Using labels %s",labels) cm = confusion_matrix(y_true, y_pred, labels=labels) # Log the confusion matrix as an image summary. figure = plot_confusion_matrix(cm, class_names=labels) @@ -287,11 +275,13 @@ def evalute_prod_confusion(dir, confusion_file): EXCLUDED_TAGS = ["poor tracking", "part", "untagged", "unidentified"] worker_model = None +after_date = None -def init_worker(model): - global worker_model +def init_worker(model, date): + global worker_model, after_date worker_model = model + after_date = date def load_clip_data(cptv_file): @@ -303,7 +293,7 @@ def load_clip_data(cptv_file): logging.warn("No clip for %s", cptv_file) return None - if filter_clip(clip, reason): + if filter_clip(clip, reason, after_date=after_date): logging.info("Filtering %s", cptv_file) return None clip.tracks = [ @@ -349,6 +339,7 @@ def evaluate_dir( split_file=None, split_dataset="test", threshold=0.5, + after_date=None, ): logging.info("Evaluating cptv files in %s with threshold %s", dir, threshold) @@ -374,7 +365,14 @@ def evaluate_dir( # files = files[:8] start = time.time() # quite faster with just one process for loading and using main process for predicting - with Pool(processes=1, initializer=init_worker, initargs=(model,)) as pool: + with Pool( + processes=1, + initializer=init_worker, + initargs=( + model, + after_date, + ), + ) as pool: for clip_data in pool.imap_unordered(load_clip_data, files): if clip_data is None: continue @@ -468,17 +466,20 @@ def main(): model = KerasModel(train_config=config.train) model.load_model(model_file, training=False, weights=weights) - if args.evaluate_dir: - evaluate_dir( - model, - Path(args.evaluate_dir), - config, - args.confusion, - args.split_file, - args.dataset, - threshold=args.threshold, - ) + if args.confusion_from_meta: + evalute_prod_confusion(Path(args.evaluate_dir), args.confusion) + else: + evaluate_dir( + model, + Path(args.evaluate_dir), + config, + args.confusion, + args.split_file, + args.dataset, + threshold=args.threshold, + after_date=args.date, + ) elif args.dataset: model_labels = model.labels.copy() model.load_training_meta(base_dir) From 
c52246d2bef9750d684a0e91544b5d2ce6373a29 Mon Sep 17 00:00:00 2001 From: gferraro Date: Tue, 1 Oct 2024 13:49:46 +0200 Subject: [PATCH 057/117] count action --- src/modelevaluate.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/modelevaluate.py b/src/modelevaluate.py index e07ed872..0836a206 100644 --- a/src/modelevaluate.py +++ b/src/modelevaluate.py @@ -155,6 +155,7 @@ def load_args(): parser.add_argument( "--evaluate-dir", + actoun="count", help="Evalute directory of cptv files", ) @@ -263,7 +264,7 @@ def metadata_confusion(dir, confusion_file): y_pred.append(ai_tag[0]) labels = list(labels) labels.sort() - logging.info("Using labels %s",labels) + logging.info("Using labels %s", labels) cm = confusion_matrix(y_true, y_pred, labels=labels) # Log the confusion matrix as an image summary. figure = plot_confusion_matrix(cm, class_names=labels) From a24daded5d6a235ecd6b79b7e39f5073539290a9 Mon Sep 17 00:00:00 2001 From: gferraro Date: Tue, 1 Oct 2024 14:00:15 +0200 Subject: [PATCH 058/117] adjust --- src/modelevaluate.py | 100 ++++++++++++++++++++++--------------------- 1 file changed, 52 insertions(+), 48 deletions(-) diff --git a/src/modelevaluate.py b/src/modelevaluate.py index 0836a206..f3b445b9 100644 --- a/src/modelevaluate.py +++ b/src/modelevaluate.py @@ -72,8 +72,9 @@ def model_score(cm, labels): cm = np.around(cm.astype("float") / cm.sum(axis=1)[:, np.newaxis], decimals=2) cm = np.nan_to_num(cm) - - fp_index = labels.index("false-positive") + fp_index = None + if "false-positive" in labels: + fp_index = labels.index("false-positive") none_index = None unid_index = None if "None" in labels: @@ -82,7 +83,9 @@ def model_score(cm, labels): unid_index = labels.index("unidentified") score = 0 for l_i, l in enumerate(labels): - fp_acc = cm[l_i][fp_index] + fp_acc = 0 + if fp_index is not None: + fp_acc = cm[l_i][fp_index] none_acc = 0 unid_acc = 0 accuracy = cm[l_i][l_i] @@ -155,7 +158,6 @@ def load_args(): parser.add_argument( "--evaluate-dir", - actoun="count", help="Evalute directory of cptv files", ) @@ -165,7 +167,9 @@ def load_args(): parser.add_argument("--split-file", help="Use split for evaluation") parser.add_argument( - "--confusion-from-meta", help="Use metadata to produce a confusion matrix" + "--confusion-from-meta", + action="count", + help="Use metadata to produce a confusion matrix", ) parser.add_argument( @@ -253,7 +257,7 @@ def metadata_confusion(dir, confusion_file): tag.get("what") for tag in tags if tag.get("automatic") is True - and tag.get("data", {}).get("name") == "Inc3 RF" + and tag.get("data", {}).get("name") == "Master" ] y_true.append(human_tag) if len(ai_tag) == 0: @@ -464,13 +468,13 @@ def main(): if args.weights: weights = model_file / args.weights base_dir = Path(config.base_folder) / "training-data" + if args.evaluate_dir and args.confusion_from_meta: + metadata_confusion(Path(args.evaluate_dir), args.confusion) + else: - model = KerasModel(train_config=config.train) - model.load_model(model_file, training=False, weights=weights) - if args.evaluate_dir: - if args.confusion_from_meta: - evalute_prod_confusion(Path(args.evaluate_dir), args.confusion) - else: + model = KerasModel(train_config=config.train) + model.load_model(model_file, training=False, weights=weights) + if args.evaluate_dir: evaluate_dir( model, Path(args.evaluate_dir), @@ -481,42 +485,42 @@ def main(): threshold=args.threshold, after_date=args.date, ) - elif args.dataset: - model_labels = model.labels.copy() - model.load_training_meta(base_dir) - # model.labels = 
model_labels - if model.params.multi_label: - model.labels.append("land-bird") - excluded, remapped = get_excluded(model.data_type) - files = base_dir / args.dataset - dataset, _, new_labels, _ = get_dataset( - files, - model.data_type, - model.labels, - model_labels=model_labels, - batch_size=64, - image_size=model.params.output_dim[:2], - preprocess_fn=model.preprocess_fn, - augment=False, - resample=False, - include_features=model.params.mvm, - one_hot=True, - deterministic=True, - shuffle=False, - excluded_labels=excluded, - remapped_labels=remapped, - multi_label=model.params.multi_label, - include_track=True, - cache=True, - channels=model.params.channels, - ) - model.labels = new_labels - logging.info( - "Dataset loaded %s, using labels %s", - args.dataset, - model.labels, - ) - model.confusion_tracks(dataset, args.confusion, threshold=args.threshold) + elif args.dataset: + model_labels = model.labels.copy() + model.load_training_meta(base_dir) + # model.labels = model_labels + if model.params.multi_label: + model.labels.append("land-bird") + excluded, remapped = get_excluded(model.data_type) + files = base_dir / args.dataset + dataset, _, new_labels, _ = get_dataset( + files, + model.data_type, + model.labels, + model_labels=model_labels, + batch_size=64, + image_size=model.params.output_dim[:2], + preprocess_fn=model.preprocess_fn, + augment=False, + resample=False, + include_features=model.params.mvm, + one_hot=True, + deterministic=True, + shuffle=False, + excluded_labels=excluded, + remapped_labels=remapped, + multi_label=model.params.multi_label, + include_track=True, + cache=True, + channels=model.params.channels, + ) + model.labels = new_labels + logging.info( + "Dataset loaded %s, using labels %s", + args.dataset, + model.labels, + ) + model.confusion_tracks(dataset, args.confusion, threshold=args.threshold) if __name__ == "__main__": From 5d0ab220cf7347c1d153f3de9e25416dbb5d79c9 Mon Sep 17 00:00:00 2001 From: gferraro Date: Tue, 1 Oct 2024 14:06:14 +0200 Subject: [PATCH 059/117] fix confusion --- src/modelevaluate.py | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/src/modelevaluate.py b/src/modelevaluate.py index f3b445b9..fef33862 100644 --- a/src/modelevaluate.py +++ b/src/modelevaluate.py @@ -253,19 +253,23 @@ def metadata_confusion(dir, confusion_file): human_tag = human_tags.pop() human_tag = label_mapping.get(human_tag, human_tag) labels.add(human_tag) - ai_tag = [ - tag.get("what") - for tag in tags - if tag.get("automatic") is True - and tag.get("data", {}).get("name") == "Master" - ] + ai_tags = [] + for tag in tags: + if tag.get("automatic") is True: + data = tag.get("data", {}) + if isinstance(data, str): + if data == "Master": + ai_tags.append(tag["what"]) + elif data.get("name") == "Master": + ai_tags.append(tag["what"]) + y_true.append(human_tag) - if len(ai_tag) == 0: + if len(ai_tags) == 0: y_pred.append("None") labels.add("None") else: - labels.add(ai_tag[0]) - y_pred.append(ai_tag[0]) + labels.add(ai_tags[0]) + y_pred.append(ai_tags[0]) labels = list(labels) labels.sort() logging.info("Using labels %s", labels) From 34d072aa512c501cbc07a00299c85562b566960f Mon Sep 17 00:00:00 2001 From: gferraro Date: Tue, 1 Oct 2024 14:14:03 +0200 Subject: [PATCH 060/117] add date filter --- src/modelevaluate.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/src/modelevaluate.py b/src/modelevaluate.py index fef33862..3f0beb0d 100644 --- a/src/modelevaluate.py +++ b/src/modelevaluate.py @@ -47,6 
+47,7 @@ from config.buildconfig import BuildConfig from sklearn.metrics import confusion_matrix from multiprocessing import Pool +from dateutil.parser import parse as parse_date root_logger = logging.getLogger() @@ -224,7 +225,7 @@ def filter_diffs(track_frames, background): # evaluate a confusion matrix from metadata of files, already evaluated by our current model on browse -def metadata_confusion(dir, confusion_file): +def metadata_confusion(dir, confusion_file, after_date=None): with open("label_paths.json", "r") as f: label_paths = json.load(f) label_mapping = get_mappings(label_paths) @@ -238,7 +239,9 @@ def metadata_confusion(dir, confusion_file): with open(meta_f, "r") as t: # add in some metadata stats meta_data = json.load(t) - + rec_time = parse_date(meta_data["recordingDateTime"]) + if after_date is not None and rec_time <= after_date: + continue for track in meta_data.get("Tracks", []): tags = track.get("tags", []) human_tags = [ @@ -248,7 +251,7 @@ def metadata_confusion(dir, confusion_file): if len(human_tags) > 1: print("Conflicting tags for ", track.get("id"), cptv_file) if len(human_tags) == 0: - print("No humans in ", tags) + print("No humans in ", meta_f) continue human_tag = human_tags.pop() human_tag = label_mapping.get(human_tag, human_tag) @@ -270,8 +273,12 @@ def metadata_confusion(dir, confusion_file): else: labels.add(ai_tags[0]) y_pred.append(ai_tags[0]) + if len(labels) == 0: + logging.info("No data found") + return labels = list(labels) labels.sort() + logging.info("Using labels %s", labels) cm = confusion_matrix(y_true, y_pred, labels=labels) # Log the confusion matrix as an image summary. @@ -473,7 +480,7 @@ def main(): weights = model_file / args.weights base_dir = Path(config.base_folder) / "training-data" if args.evaluate_dir and args.confusion_from_meta: - metadata_confusion(Path(args.evaluate_dir), args.confusion) + metadata_confusion(Path(args.evaluate_dir), args.confusion, args.date) else: model = KerasModel(train_config=config.train) From 5ccea12927f18ebbe5c4f59aece9553ec02c1780 Mon Sep 17 00:00:00 2001 From: gferraro Date: Tue, 1 Oct 2024 14:57:12 +0200 Subject: [PATCH 061/117] add loading of metadata --- src/modelevaluate.py | 67 +++++++++++++++++++++++++++++++++++++------- 1 file changed, 57 insertions(+), 10 deletions(-) diff --git a/src/modelevaluate.py b/src/modelevaluate.py index 3f0beb0d..f032f123 100644 --- a/src/modelevaluate.py +++ b/src/modelevaluate.py @@ -161,7 +161,10 @@ def load_args(): "--evaluate-dir", help="Evalute directory of cptv files", ) - + parser.add_argument( + "--model-metadata", + help="Meta data file for model, used with confusion from meta", + ) parser.add_argument("-c", "--config-file", help="Path to config file to use") parser.add_argument("-d", "--date", help="Use clips after this") @@ -225,11 +228,51 @@ def filter_diffs(track_frames, background): # evaluate a confusion matrix from metadata of files, already evaluated by our current model on browse -def metadata_confusion(dir, confusion_file, after_date=None): +def metadata_confusion(dir, confusion_file, after_date=None, model_metadata=None): with open("label_paths.json", "r") as f: label_paths = json.load(f) label_mapping = get_mappings(label_paths) - labels = set() + if model_metadata is not None and Path(model_metadata).exists(): + with open(model_metadata, "r") as t: + # add in some metadata stats + model_meta = json.load(t) + labels = model_meta.get("labels", []) + excluded_labels = model_meta.get("excluded_labels", {}) + remapped = 
model_meta.get("remapped_labels", {}) + remapped_labels = {} + # slightly different format than from thermaldataset + for mapped_to, mapped_labels in remapped.items(): + for mapped_label in mapped_labels: + remapped_labels[mapped_label] = mapped_to + else: + labels = [ + "bird", + "cat", + "deer", + "dog", + "falsepositive", + "hedgehog", + "human", + "kiwi", + "leporidae", + "mustelid", + "penguin", + "possum", + "rodent", + "sheep", + "vehicle", + "wallaby", + "landbird", + "None", + "unidentified", + ] + excluded_labels, remapped_labels = get_excluded("thermal") + logging.info( + "Labels are %s excluded %s remapped %s", + labels, + excluded_labels, + remapped_labels, + ) y_true = [] y_pred = [] dir = Path(dir) @@ -255,7 +298,14 @@ def metadata_confusion(dir, confusion_file, after_date=None): continue human_tag = human_tags.pop() human_tag = label_mapping.get(human_tag, human_tag) - labels.add(human_tag) + if human_tag in excluded_labels: + logging.info("Excluding %s", human_tag) + continue + if human_tag in remapped_labels: + logging.info( + "Remapping %s to %s", human_tag, remapped_labels[human_tag] + ) + human_tag = remapped_labels[human_tag] ai_tags = [] for tag in tags: if tag.get("automatic") is True: @@ -269,17 +319,12 @@ def metadata_confusion(dir, confusion_file, after_date=None): y_true.append(human_tag) if len(ai_tags) == 0: y_pred.append("None") - labels.add("None") else: - labels.add(ai_tags[0]) y_pred.append(ai_tags[0]) if len(labels) == 0: logging.info("No data found") return - labels = list(labels) - labels.sort() - logging.info("Using labels %s", labels) cm = confusion_matrix(y_true, y_pred, labels=labels) # Log the confusion matrix as an image summary. figure = plot_confusion_matrix(cm, class_names=labels) @@ -480,7 +525,9 @@ def main(): weights = model_file / args.weights base_dir = Path(config.base_folder) / "training-data" if args.evaluate_dir and args.confusion_from_meta: - metadata_confusion(Path(args.evaluate_dir), args.confusion, args.date) + metadata_confusion( + Path(args.evaluate_dir), args.confusion, args.date, args.model_metadata + ) else: model = KerasModel(train_config=config.train) From 166c25234724e18e7851463a65ebe0e5359f5b39 Mon Sep 17 00:00:00 2001 From: gferraro Date: Tue, 1 Oct 2024 14:59:18 +0200 Subject: [PATCH 062/117] adjust --- src/modelevaluate.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/src/modelevaluate.py b/src/modelevaluate.py index f032f123..b0d30bed 100644 --- a/src/modelevaluate.py +++ b/src/modelevaluate.py @@ -238,12 +238,7 @@ def metadata_confusion(dir, confusion_file, after_date=None, model_metadata=None model_meta = json.load(t) labels = model_meta.get("labels", []) excluded_labels = model_meta.get("excluded_labels", {}) - remapped = model_meta.get("remapped_labels", {}) - remapped_labels = {} - # slightly different format than from thermaldataset - for mapped_to, mapped_labels in remapped.items(): - for mapped_label in mapped_labels: - remapped_labels[mapped_label] = mapped_to + remapped_labels = model_meta.get("remapped_labels", {}) else: labels = [ "bird", From 68bc3b8a7560bbeb2777647eec235e3fbaf74d61 Mon Sep 17 00:00:00 2001 From: gferraro Date: Tue, 1 Oct 2024 15:00:36 +0200 Subject: [PATCH 063/117] no land bird --- src/modelevaluate.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/modelevaluate.py b/src/modelevaluate.py index b0d30bed..51e2a7b5 100644 --- a/src/modelevaluate.py +++ b/src/modelevaluate.py @@ -239,6 +239,9 @@ def metadata_confusion(dir, confusion_file, after_date=None, 
model_metadata=None
 labels = model_meta.get("labels", [])
 excluded_labels = model_meta.get("excluded_labels", {})
 remapped_labels = model_meta.get("remapped_labels", {})
+ for k, v in remapped_labels.items():
+ if v == "land-bird":
+ remapped_labels[k] = "bird"
 else:
 labels = [
 "bird",

From 2d1eb3f087a432ed34b50afe90fed11d319e381f Mon Sep 17 00:00:00 2001
From: gferraro
Date: Tue, 1 Oct 2024 15:05:04 +0200
Subject: [PATCH 064/117] ignore no meta

---
 src/modelevaluate.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/modelevaluate.py b/src/modelevaluate.py
index 51e2a7b5..b6ab3ba9 100644
--- a/src/modelevaluate.py
+++ b/src/modelevaluate.py
@@ -276,6 +276,8 @@ def metadata_confusion(dir, confusion_file, after_date=None, model_metadata=None
 dir = Path(dir)
 for cptv_file in dir.glob(f"**/*cptv"):
 meta_f = cptv_file.with_suffix(".txt")
+ if not meta_f.exists():
+ continue
 meta_data = None
 with open(meta_f, "r") as t:
 # add in some metadata stats

From 70a4ff89846dd76a2658c10f3083dd0210267063 Mon Sep 17 00:00:00 2001
From: gferraro
Date: Tue, 1 Oct 2024 15:23:12 +0200
Subject: [PATCH 065/117] add none and unid

---
 src/modelevaluate.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/modelevaluate.py b/src/modelevaluate.py
index b6ab3ba9..0cde9f5a 100644
--- a/src/modelevaluate.py
+++ b/src/modelevaluate.py
@@ -242,6 +242,10 @@ def metadata_confusion(dir, confusion_file, after_date=None, model_metadata=None
 for k, v in remapped_labels.items():
 if v == "land-bird":
 remapped_labels[k] = "bird"
+ if "None" not in labels:
+ labels.append("None")
+ if "unidentified" not in labels:
+ labels.append("unidentified")
 else:
 labels = [

From e6eb8b83342c73f177c9cea4275e4edee4dc0e6e Mon Sep 17 00:00:00 2001
From: gferraro
Date: Tue, 1 Oct 2024 15:31:05 +0200
Subject: [PATCH 066/117] catch non-existent labels

---
 src/modelevaluate.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/modelevaluate.py b/src/modelevaluate.py
index 0cde9f5a..cf2a7682 100644
--- a/src/modelevaluate.py
+++ b/src/modelevaluate.py
@@ -321,13 +321,14 @@ def metadata_confusion(dir, confusion_file, after_date=None, model_metadata=None
 ai_tags.append(tag["what"])

 y_true.append(human_tag)
+ if human_tag not in labels:
+ labels.append(human_tag)
 if len(ai_tags) == 0:
 y_pred.append("None")
 else:
 y_pred.append(ai_tags[0])
- if len(labels) == 0:
- logging.info("No data found")
- return
+ if ai_tags[0] not in labels:
+ labels.append(ai_tags[0])
 cm = confusion_matrix(y_true, y_pred, labels=labels)
 # Log the confusion matrix as an image summary.
 figure = plot_confusion_matrix(cm, class_names=labels)
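
Note: the metadata-driven confusion matrix that patches 056 through 066 iterate on reduces to pairing each track's human tag with its "Master" AI tag and handing both lists to scikit-learn. The sketch below is a minimal illustration of that idea, not code from this repository: the function name sketch_metadata_confusion is invented for this note, it assumes every clip's JSON metadata sits in a .txt file beside the .cptv, that the automatic tag's data field is a dict whose name is "Master" (patch 059 above also tolerates a plain-string data field), and it omits the label remapping and exclusion the real code applies.

    import json
    from pathlib import Path

    from sklearn.metrics import confusion_matrix

    def sketch_metadata_confusion(clip_dir, labels):
        # Pair each track's single human tag with the "Master" AI tag taken
        # from the JSON metadata files sitting alongside the .cptv clips.
        y_true = []
        y_pred = []
        for meta_file in Path(clip_dir).glob("**/*.txt"):
            meta = json.loads(meta_file.read_text())
            for track in meta.get("Tracks", []):
                tags = track.get("tags", [])
                human = {t["what"] for t in tags if t.get("automatic") is False}
                ai = [
                    t["what"]
                    for t in tags
                    if t.get("automatic") is True
                    and isinstance(t.get("data"), dict)
                    and t["data"].get("name") == "Master"
                ]
                if len(human) != 1:
                    # untagged or conflicting human tags: skip, as the patches do
                    continue
                y_true.append(human.pop())
                # "None" stands in for tracks the model never tagged
                y_pred.append(ai[0] if ai else "None")
        return confusion_matrix(y_true, y_pred, labels=labels)
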
From d7e3c102696ad032b1e8a0282f805426d65949f5 Mon Sep 17 00:00:00 2001 From: gferraro Date: Tue, 1 Oct 2024 15:35:32 +0200 Subject: [PATCH 067/117] use logging --- src/modelevaluate.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/modelevaluate.py b/src/modelevaluate.py index cf2a7682..ccfbb604 100644 --- a/src/modelevaluate.py +++ b/src/modelevaluate.py @@ -470,16 +470,12 @@ def evaluate_dir( predicted_tag = ",".join(predicted_labels) y_pred.append(predicted_tag) if y_pred[-1] != y_true[-1]: - print( + logging.info("%s predicted %s but should be %s with confidence %s" data[0], - "Got a prediction of", y_pred[-1], - " should be ", label, np.round(100 * prediction.class_best_score), ) - # if predicted_tag not in model.labels: - # model.labels.append(predicted_tag) model.labels.append("None") model.labels.append("unidentified") cm = confusion_matrix(y_true, y_pred, labels=model.labels) From c6c6abb84c3f69a43173fb9e368fe81826231604 Mon Sep 17 00:00:00 2001 From: gferraro Date: Tue, 1 Oct 2024 15:43:56 +0200 Subject: [PATCH 068/117] exclude unknown tag --- src/modelevaluate.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/modelevaluate.py b/src/modelevaluate.py index ccfbb604..2f9478e6 100644 --- a/src/modelevaluate.py +++ b/src/modelevaluate.py @@ -310,6 +310,9 @@ def metadata_confusion(dir, confusion_file, after_date=None, model_metadata=None "Remapping %s to %s", human_tag, remapped_labels[human_tag] ) human_tag = remapped_labels[human_tag] + if human_tag not in labels: + logging.info("Excluding %s", human_tag) + ai_tags = [] for tag in tags: if tag.get("automatic") is True: @@ -321,8 +324,6 @@ def metadata_confusion(dir, confusion_file, after_date=None, model_metadata=None ai_tags.append(tag["what"]) y_true.append(human_tag) - if human_tag not in labels: - labels.append(human_tag) if len(ai_tags) == 0: y_pred.append("None") else: From 4131892522a247c20a35585c501b79c079e72c3a Mon Sep 17 00:00:00 2001 From: gferraro Date: Tue, 1 Oct 2024 15:48:29 +0200 Subject: [PATCH 069/117] correct method --- src/modelevaluate.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/modelevaluate.py b/src/modelevaluate.py index 2f9478e6..3bfb8cd6 100644 --- a/src/modelevaluate.py +++ b/src/modelevaluate.py @@ -310,8 +310,8 @@ def metadata_confusion(dir, confusion_file, after_date=None, model_metadata=None "Remapping %s to %s", human_tag, remapped_labels[human_tag] ) human_tag = remapped_labels[human_tag] - if human_tag not in labels: - logging.info("Excluding %s", human_tag) + # if human_tag not in labels: + # logging.info("Excluding %s", human_tag) ai_tags = [] for tag in tags: @@ -324,6 +324,8 @@ def metadata_confusion(dir, confusion_file, after_date=None, model_metadata=None ai_tags.append(tag["what"]) y_true.append(human_tag) + if human_tag not in labels: + labels.append(human_tag) if len(ai_tags) == 0: y_pred.append("None") else: @@ -360,7 +362,7 @@ def load_clip_data(cptv_file): logging.warn("No clip for %s", cptv_file) return None - if filter_clip(clip, reason, after_date=after_date): + if filter_clip(clip, None,None,reason, after_date=after_date): logging.info("Filtering %s", cptv_file) return None clip.tracks = [ From f56fca7a2ef77dfa35389122db0aaaf6c591d8fe Mon Sep 17 00:00:00 2001 From: gferraro Date: Tue, 1 Oct 2024 15:49:11 +0200 Subject: [PATCH 070/117] comma --- src/modelevaluate.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/modelevaluate.py b/src/modelevaluate.py index 
3bfb8cd6..5eace7f2 100644
--- a/src/modelevaluate.py
+++ b/src/modelevaluate.py
@@ -311,8 +311,8 @@ def metadata_confusion(dir, confusion_file, after_date=None, model_metadata=None
 )
 human_tag = remapped_labels[human_tag]
 # if human_tag not in labels:
- # logging.info("Excluding %s", human_tag)
-
+ # logging.info("Excluding %s", human_tag)
+
 ai_tags = []
 for tag in tags:
 if tag.get("automatic") is True:
@@ -362,7 +362,7 @@ def load_clip_data(cptv_file):
 logging.warn("No clip for %s", cptv_file)
 return None
- if filter_clip(clip, None,None,reason, after_date=after_date):
+ if filter_clip(clip, None, None, reason, after_date=after_date):
 logging.info("Filtering %s", cptv_file)
 return None
 clip.tracks = [
@@ -473,7 +473,8 @@ def evaluate_dir(
 predicted_tag = ",".join(predicted_labels)
 y_pred.append(predicted_tag)
 if y_pred[-1] != y_true[-1]:
- logging.info("%s predicted %s but should be %s with confidence %s"
+ logging.info(
+ "%s predicted %s but should be %s with confidence %s",
 data[0],
 y_pred[-1],
 label,

From 2b88fe402b7c3943922abda421edf2dedd78607c Mon Sep 17 00:00:00 2001
From: gferraro
Date: Tue, 1 Oct 2024 15:57:06 +0200
Subject: [PATCH 071/117] add get id

---
 src/ml_tools/rawdb.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/ml_tools/rawdb.py b/src/ml_tools/rawdb.py
index 2b41b30e..e671462e 100644
--- a/src/ml_tools/rawdb.py
+++ b/src/ml_tools/rawdb.py
@@ -208,6 +208,9 @@ def get_clip_tracks(self, tag_precedence):
 clip_header.tracks.append(header)
 return clip_header
+ def get_id(self):
+ return self.meta_data_file
+
 def get_clip_meta(self, tag_precedence):
 return self.get_clip_tracks(tag_precedence)
 #

From f160436850eb4ad08021c12092583449a0e3d1f3 Mon Sep 17 00:00:00 2001
From: gferraro
Date: Tue, 1 Oct 2024 15:57:35 +0200
Subject: [PATCH 072/117] remove debug logging

---
 src/modelevaluate.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/modelevaluate.py b/src/modelevaluate.py
index 5eace7f2..8e2280d9 100644
--- a/src/modelevaluate.py
+++ b/src/modelevaluate.py
@@ -363,7 +363,7 @@ def load_clip_data(cptv_file):
 return None

 if filter_clip(clip, None, None, reason, after_date=after_date):
- logging.info("Filtering %s", cptv_file)
+ # logging.info("Filtering %s", cptv_file)
 return None
 clip.tracks = [
 track for track in clip.tracks if not filter_track(track, EXCLUDED_TAGS, reason)

From ae4a81c1234ed63766750b4f9402203e7b9fcacf Mon Sep 17 00:00:00 2001
From: gferraro
Date: Tue, 1 Oct 2024 16:08:55 +0200
Subject: [PATCH 073/117] catch exception

---
 src/modelevaluate.py | 26 +++++++++++++++-----------
 1 file changed, 15 insertions(+), 11 deletions(-)

diff --git a/src/modelevaluate.py b/src/modelevaluate.py
index 8e2280d9..858556e8 100644
--- a/src/modelevaluate.py
+++ b/src/modelevaluate.py
@@ -379,18 +379,22 @@ def load_clip_data(cptv_file):
 thermal_medians = np.uint16(thermal_medians)
 data = []
 for track in clip.tracks:
- frames, preprocessed, masses = worker_model.preprocess(
- clip_db, track, frames_per_classify=25, dont_filter=True
- )
- data.append(
- (
- f"{track.clip_id}-{track.get_id()}",
- track.label,
- frames,
- preprocessed,
- masses,
+ try:
+ frames, preprocessed, masses = worker_model.preprocess(
+ clip_db, track, frames_per_classify=25, dont_filter=True
 )
- )
+
+ data.append(
+ (
+ f"{track.clip_id}-{track.get_id()}",
+ track.label,
+ frames,
+ preprocessed,
+ masses,
+ )
+ )
+ except:
+ logging.error("Could not load %s", clip.clip_id, exc_info=True)
 return data

From 23eed72933dbf42b69664cb857ea846a1298c77 Mon Sep 17 00:00:00 2001
From: gferraro
Date: 
Tue, 1 Oct 2024 16:29:38 +0200
Subject: [PATCH 074/117] add support for thermal norm

---
 src/ml_tools/hyperparams.py | 6 +++
 src/ml_tools/interpreter.py | 96 ++++++++++++++++++++++++++-----------
 src/ml_tools/preprocess.py | 13 +++--
 3 files changed, 85 insertions(+), 30 deletions(-)

diff --git a/src/ml_tools/hyperparams.py b/src/ml_tools/hyperparams.py
index b4b57055..90bcc362 100644
--- a/src/ml_tools/hyperparams.py
+++ b/src/ml_tools/hyperparams.py
@@ -27,6 +27,8 @@ def insert_defaults(self):
 self["segment_type"] = self.segment_type
 self["multi_label"] = True
 self["diff_norm"] = self.diff_norm
+ self["thermal_diff_norm"] = self.thermal_diff_norm
+ self["smooth_predictions"] = self.smooth_predictions
 self["channels"] = self.channels
@@ -58,6 +60,10 @@ def excluded_labels(self):
 def remapped_labels(self):
 return self.get("remapped_labels", None)

+ @property
+ def thermal_diff_norm(self):
+ return self.get("thermal_diff_norm", False)
+
 @property
 def diff_norm(self):
 return self.get("diff_norm", True)
diff --git a/src/ml_tools/interpreter.py b/src/ml_tools/interpreter.py
index 2763957b..e4c0a99e 100644
--- a/src/ml_tools/interpreter.py
+++ b/src/ml_tools/interpreter.py
@@ -183,9 +183,12 @@ def preprocess_frames(
 data = []
 frames_used = []
 filtered_norm_limits = None
- if self.params.diff_norm:
+ thermal_norm_limits = None
+ if self.params.diff_norm or self.params.thermal_diff_norm:
 min_diff = None
 max_diff = 0
+ thermal_max_diff = None
+ thermal_min_diff = None
 for i, region in enumerate(reversed(track.bounds_history)):
 if region.blank:
 continue
@@ -201,16 +204,30 @@
 continue

 f.float_arrays()
- diff_frame = region.subimage(f.thermal) - region.subimage(
- clip.background
- )
- new_max = np.amax(diff_frame)
- new_min = np.amin(diff_frame)
- if min_diff is None or new_min < min_diff:
- min_diff = new_min
- if new_max > max_diff:
- max_diff = new_max
- filtered_norm_limits = (min_diff, max_diff)
+
+ if self.params.thermal_diff_norm:
+ diff_frame = f.thermal - np.median(f.thermal)
+ new_max = np.amax(diff_frame)
+ new_min = np.amin(diff_frame)
+ if thermal_min_diff is None or new_min < thermal_min_diff:
+ thermal_min_diff = new_min
+ if thermal_max_diff is None or new_max > thermal_max_diff:
+ thermal_max_diff = new_max
+ if self.params.diff_norm:
+ diff_frame = region.subimage(f.thermal) - region.subimage(
+ clip.background
+ )
+ new_max = np.amax(diff_frame)
+ new_min = np.amin(diff_frame)
+ if min_diff is None or new_min < min_diff:
+ min_diff = new_min
+ if new_max > max_diff:
+ max_diff = new_max
+ if self.params.thermal_diff_norm:
+ thermal_norm_limits = (thermal_min_diff, thermal_max_diff)
+
+ if self.params.diff_norm:
+ filtered_norm_limits = (min_diff, max_diff)
 for i, region in enumerate(reversed(track.bounds_history)):
 if region.blank:
 continue
@@ -249,6 +266,7 @@
 clip.background,
 clip.crop_rectangle,
 filtered_norm_limits=filtered_norm_limits,
+ thermal_norm_limits=thermal_norm_limits,
 )
 preprocessed = preprocess_single_frame(
 cropped_frame,
@@ -293,30 +311,52 @@ def preprocess_segments(
 # should really be over whole track buts let just do the indices we predict of
 # seems to make little different to just doing a min max normalization
+ thermal_norm_limits = None
 filtered_norm_limits = None
- if self.params.diff_norm:
+ if self.params.diff_norm or self.params.thermal_diff_norm:
 min_diff = None
 max_diff = 0
- for frame_index in frame_indices:
- region = track.bounds_history[frame_index - track.start_frame]
- f = 
clip.get_frame(region.frame_number) - if f is None: - logging.warn("Could not get frame {}", region.frame_number) + thermal_max_diff = None + thermal_min_diff = None + for i, region in enumerate(reversed(track.bounds_history)): + if region.blank: continue + if region.width == 0 or region.height == 0: + logging.warn( + "No width or height for frame %s regoin %s", + region.frame_number, + region, + ) + continue + f = clip.get_frame(region.frame_number) if region.blank or region.width <= 0 or region.height <= 0: continue f.float_arrays() - diff_frame = region.subimage(f.thermal) - region.subimage( - clip.background - ) - new_max = np.amax(diff_frame) - new_min = np.amin(diff_frame) - if min_diff is None or new_min < min_diff: - min_diff = new_min - if new_max > max_diff: - max_diff = new_max - filtered_norm_limits = (min_diff, max_diff) + + if self.params.thermal_diff_norm: + diff_frame = f.thermal - np.median(f.thermal) + new_max = np.amax(diff_frame) + new_min = np.amin(diff_frame) + if thermal_min_diff is None or new_min < thermal_min_diff: + thermal_min_diff = new_min + if thermal_max_diff is None or new_max > thermal_max_diff: + thermal_max_diff = new_max + if self.params.diff_norm: + diff_frame = region.subimage(f.thermal) - region.subimage( + clip.background + ) + new_max = np.amax(diff_frame) + new_min = np.amin(diff_frame) + if min_diff is None or new_min < min_diff: + min_diff = new_min + if new_max > max_diff: + max_diff = new_max + if self.params.thermal_diff_norm: + thermal_norm_limits = (thermal_min_diff, thermal_max_diff) + + if self.params.diff_norm: + filtered_norm_limits = (min_diff, max_diff) for frame_index in frame_indices: region = track.bounds_history[frame_index - track.start_frame] @@ -341,6 +381,7 @@ def preprocess_segments( clip.background, clip.crop_rectangle, filtered_norm_limits=filtered_norm_limits, + thermal_norm_limits=thermal_norm_limits, ) track_data[frame.frame_number] = cropped_frame features = None @@ -365,6 +406,7 @@ def preprocess_segments( self.params.frame_size, self.params.channels, self.preprocess_fn, + sample=f"{clip.get_id()}-{track.get_id()}", ) if frames is None: logging.warn("No frames to predict on") diff --git a/src/ml_tools/preprocess.py b/src/ml_tools/preprocess.py index b3186127..89214614 100644 --- a/src/ml_tools/preprocess.py +++ b/src/ml_tools/preprocess.py @@ -61,6 +61,7 @@ def preprocess_frame( crop_rectangle=None, calculate_filtered=True, filtered_norm_limits=None, + thermal_norm_limits=None, ): median = np.median(frame.thermal) cropped_frame = frame.crop_by_region(region, only_thermal=True) @@ -79,7 +80,8 @@ def preprocess_frame( True, ) cropped_frame.thermal -= median - np.clip(cropped_frame.thermal, 0, None, out=cropped_frame.thermal) + if thermal_norm_limits is None: + np.clip(cropped_frame.thermal, 0, None, out=cropped_frame.thermal) if calculate_filtered and filtered_norm_limits is not None: cropped_frame.filtered, stats = imageprocessing.normalize( cropped_frame.filtered, @@ -88,8 +90,13 @@ def preprocess_frame( new_max=255, ) if frame.thermal is not None: + thermal_min = None + thermal_max = None + if thermal_norm_limits is not None: + thermal_min, thermal_max = thermal_norm_limits + logging.info("Using therml min max %s, %s", thermal_min, thermal_max) cropped_frame.thermal, _ = imageprocessing.normalize( - cropped_frame.thermal, new_max=255 + cropped_frame.thermal, min=thermal_min, max=thermal_max, new_max=255 ) else: cropped_frame.normalize() @@ -161,7 +168,7 @@ def preprocess_movement( # index += 1 # 
tools.saveclassify_image(
 # data,
- # f"samples/{index}",
+ # f"samples/{sample}-{index}",
 # )

 if preprocess_fn:
 data = preprocess_fn(data)

From 1e67f11893a94dc6aa2cccc6863c8df1e24810ae Mon Sep 17 00:00:00 2001
From: gferraro
Date: Tue, 1 Oct 2024 16:30:39 +0200
Subject: [PATCH 075/117] remove test logging

---
 src/ml_tools/preprocess.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/ml_tools/preprocess.py b/src/ml_tools/preprocess.py
index 89214614..bcb94da3 100644
--- a/src/ml_tools/preprocess.py
+++ b/src/ml_tools/preprocess.py
@@ -94,7 +94,6 @@ def preprocess_frame(
 thermal_max = None
 if thermal_norm_limits is not None:
 thermal_min, thermal_max = thermal_norm_limits
- logging.info("Using therml min max %s, %s", thermal_min, thermal_max)
 cropped_frame.thermal, _ = imageprocessing.normalize(
 cropped_frame.thermal, min=thermal_min, max=thermal_max, new_max=255
 )

From c3ce590f9c51f789a4b2038e319ddb672d7fa6e7 Mon Sep 17 00:00:00 2001
From: gferraro
Date: Tue, 1 Oct 2024 16:38:35 +0200
Subject: [PATCH 076/117] add smoothing

---
 src/modelevaluate.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/modelevaluate.py b/src/modelevaluate.py
index 858556e8..191f5491 100644
--- a/src/modelevaluate.py
+++ b/src/modelevaluate.py
@@ -462,7 +462,7 @@ def evaluate_dir(
 # top_score = len(output)
 # smoothed = output
 # else:
- # smoothed = output * output * masses
+ smoothed = output * output * masses
 prediction.classified_clip(output, output, data[2], top_score=top_score)
 y_true.append(label_mapping.get(label, label))
 predicted_labels = [prediction.predicted_tag()]

From 8f77f5a7e8ab51f6caadadeeedb67ceb6663344b Mon Sep 17 00:00:00 2001
From: gferraro
Date: Tue, 1 Oct 2024 16:41:49 +0200
Subject: [PATCH 077/117] don't square predictions

---
 src/modelevaluate.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/modelevaluate.py b/src/modelevaluate.py
index 191f5491..286dfb04 100644
--- a/src/modelevaluate.py
+++ b/src/modelevaluate.py
@@ -462,7 +462,7 @@ def evaluate_dir(
 # top_score = len(output)
 # smoothed = output
 # else:
- smoothed = output * output * masses
+ smoothed = output * masses
 prediction.classified_clip(output, output, data[2], top_score=top_score)
 y_true.append(label_mapping.get(label, label))
 predicted_labels = [prediction.predicted_tag()]

From 34f2f210a01899871f5d8cf496932e58c04d8674 Mon Sep 17 00:00:00 2001
From: gferraro
Date: Wed, 2 Oct 2024 16:37:22 +0200
Subject: [PATCH 078/117] build frames dataset

---
 src/build.py | 6 +++---
 src/config/buildconfig.py | 4 +++-
 src/ml_tools/dataset.py | 2 +-
 src/ml_tools/preprocess.py | 12 ++++++------
 src/ml_tools/tools.py | 3 ++-
 5 files changed, 15 insertions(+), 12 deletions(-)

diff --git a/src/build.py b/src/build.py
index ff1edac9..f614d764 100644
--- a/src/build.py
+++ b/src/build.py
@@ -736,11 +736,11 @@ def rough_balance(datasets):
 logging.info("Not balancing")
 continue
 if len(counts) <= 2:
- cap_at = counts[0]
+ cap_at = counts[-1]
 elif len(counts) < 7:
- cap_at = counts[-2]
+ cap_at = counts[-1]
 else:
- cap_at = counts[-3]
+ cap_at = counts[-1]
 logging.info("Capping dataset %s at %s", dataset.name, cap_at)
 for lbl, count in lbl_counts.items():
 if count <= cap_at:
diff --git a/src/config/buildconfig.py b/src/config/buildconfig.py
index 31d2aa01..c659ca75 100644
--- a/src/config/buildconfig.py
+++ b/src/config/buildconfig.py
@@ -39,7 +39,7 @@ class BuildConfig(DefaultConfig):
 tag_precedence = attr.ib()
 excluded_tags = attr.ib()
 country = attr.ib()
-
+ use_segments = attr.ib()
 EXCLUDED_TAGS = ["poor tracking", "part", "untagged", 
"unidentified"] # country bounding boxs @@ -88,6 +88,7 @@ def load(cls, build): tag_precedence=build["tag_precedence"], excluded_tags=build["excluded_tags"], country=build["country"], + use_segments=build["use_segments"], ) @classmethod @@ -105,6 +106,7 @@ def get_defaults(cls): tag_precedence=BuildConfig.DEFAULT_GROUPS, excluded_tags=BuildConfig.EXCLUDED_TAGS, country="NZ", + use_segments=True, ) def validate(self): diff --git a/src/ml_tools/dataset.py b/src/ml_tools/dataset.py index 4c4b0b63..3e422076 100644 --- a/src/ml_tools/dataset.py +++ b/src/ml_tools/dataset.py @@ -70,7 +70,7 @@ def __init__( self.use_segments = False self.segment_length = 1 else: - self.use_segments = config.train.hyper_params.get("use_segments", True) + self.use_segments = config.build.use_segments if self.use_segments: self.segment_length = config.build.segment_length else: diff --git a/src/ml_tools/preprocess.py b/src/ml_tools/preprocess.py index bcb94da3..1600425c 100644 --- a/src/ml_tools/preprocess.py +++ b/src/ml_tools/preprocess.py @@ -163,12 +163,12 @@ def preprocess_movement( # # # # # # for testing - # global index - # index += 1 - # tools.saveclassify_image( - # data, - # f"samples/{sample}-{index}", - # ) + global index + index += 1 + tools.saveclassify_image( + data, + f"samples/{sample}-{index}", + ) if preprocess_fn: data = preprocess_fn(data) diff --git a/src/ml_tools/tools.py b/src/ml_tools/tools.py index bdfb51de..38dd9e90 100644 --- a/src/ml_tools/tools.py +++ b/src/ml_tools/tools.py @@ -193,7 +193,8 @@ def saveclassify_image(data, filename): Path(filename).parent.mkdir(parents=True, exist_ok=True) r = Image.fromarray(np.uint8(data[:, :, 0])) g = Image.fromarray(np.uint8(data[:, :, 1])) - b = Image.fromarray(np.uint8(data[:, :, 2])) + b = g + # b = Image.fromarray(np.uint8(data[:, :, 2])) concat = np.concatenate((r, g, b), axis=1) # horizontally img = Image.fromarray(np.uint8(concat)) img.save(filename + ".png") From a34b20544550da8c69b7ce04209934b077670eaa Mon Sep 17 00:00:00 2001 From: gferraro Date: Wed, 2 Oct 2024 17:25:33 +0200 Subject: [PATCH 079/117] support for frames model --- src/ml_tools/hyperparams.py | 7 +++++++ src/ml_tools/kerasmodel.py | 3 +++ src/ml_tools/thermaldataset.py | 7 +++++-- 3 files changed, 15 insertions(+), 2 deletions(-) diff --git a/src/ml_tools/hyperparams.py b/src/ml_tools/hyperparams.py index 90bcc362..cd6ddb79 100644 --- a/src/ml_tools/hyperparams.py +++ b/src/ml_tools/hyperparams.py @@ -164,6 +164,13 @@ def square_width(self): def frame_size(self): return self.get("frame_size", 32) + def set_use_segments(self, use_segments): + self["use_segments"] = use_segments + if use_segments: + self["square_width"] = 5 + else: + self["square_width"] = 1 + # # @property # def red_type(self): diff --git a/src/ml_tools/kerasmodel.py b/src/ml_tools/kerasmodel.py index c4c18dd2..f722dcf3 100644 --- a/src/ml_tools/kerasmodel.py +++ b/src/ml_tools/kerasmodel.py @@ -81,6 +81,9 @@ def load_training_meta(self, base_dir): self.ds_by_label = meta.get("by_label", True) self.excluded_labels = meta.get("excluded_labels") self.remapped_labels = meta.get("remapped_labels") + self.params.set_use_segments( + meta.get("config").get("build", {}).get("use_segments", True) + ) def shape(self): if self.model is None: diff --git a/src/ml_tools/thermaldataset.py b/src/ml_tools/thermaldataset.py index d4a7e9e7..6604be7c 100644 --- a/src/ml_tools/thermaldataset.py +++ b/src/ml_tools/thermaldataset.py @@ -130,7 +130,9 @@ def load_dataset(filenames, remap_lookup, labels, args): 
extra_label_map=extra_label_map, include_track=args.get("include_track", False), num_frames=args.get("num_frames", 25), - channels=args.get("channels", [TrackChannels.thermal.name]), + channels=args.get( + "channels", [TrackChannels.thermal.name, TrackChannels.filtered.name] + ), ), num_parallel_calls=AUTOTUNE, deterministic=deterministic, @@ -183,7 +185,7 @@ def read_tfrecord( channels=[TrackChannels.thermal.name, TrackChannels.filtered.name], ): logging.info( - "Read tf record with image %s lbls %s labeld %s aug %s prepr %s only features %s one hot %s include fetures %s", + "Read tf record with image %s lbls %s labeld %s aug %s prepr %s only features %s one hot %s include fetures %s num frames %s", image_size, num_labels, labeled, @@ -192,6 +194,7 @@ def read_tfrecord( only_features, one_hot, include_features, + num_frames, ) load_images = not only_features tfrecord_format = { From 8d122397273d55d6518f3dedddb622c17b746adc Mon Sep 17 00:00:00 2001 From: gferraro Date: Thu, 3 Oct 2024 20:43:13 +0200 Subject: [PATCH 080/117] skip frames on edge --- src/build.py | 1 + src/ml_tools/datasetstructures.py | 55 ++++++++++++++++++++++--------- src/ml_tools/thermaldataset.py | 2 +- 3 files changed, 41 insertions(+), 17 deletions(-) diff --git a/src/build.py b/src/build.py index f614d764..7638f7a8 100644 --- a/src/build.py +++ b/src/build.py @@ -895,6 +895,7 @@ def main(): "tag_precedence": config.build.tag_precedence, "min_mass": master_dataset.min_frame_mass, "thermal_diff_norm": config.build.thermal_diff_norm, + "filter_by_lq": master_dataset.filter_by_lq, } ) create_tf_records( diff --git a/src/ml_tools/datasetstructures.py b/src/ml_tools/datasetstructures.py index f1840527..871b4b79 100644 --- a/src/ml_tools/datasetstructures.py +++ b/src/ml_tools/datasetstructures.py @@ -144,6 +144,7 @@ def __init__( human_tags=None, remapped_lbl=None, mega_missed_regions=None, + skip_ffc=True, ): # regions that megadetector found nothing in self.mega_missed_regions = mega_missed_regions @@ -173,12 +174,8 @@ def __init__( self.frame_crop = None self.num_frames = num_frames self.important_predicted = 0 - - mass_history = np.uint16( - [region.mass for region in self.regions_by_frame.values()] - ) mass_history = [ - region.frame_number + region.mass for region in self.regions_by_frame.values() if region.mass > 0 and ( @@ -243,29 +240,55 @@ def add_sample(self, sample): def calculate_sample_frames( self, min_mass=None, max_mass=None, ffc_frames=None, skip_last=None ): + crop_rectangle = Rectangle(2, 2, 160 - 2 * 2, 140 - 2 * 2) + + logging.debug( + "Calculating sample with min %s and max %s ffc %s and skip %s", + min_mass, + max_mass, + ffc_frames, + skip_last, + ) frame_numbers = list(self.regions_by_frame.keys()) + previous_mass = None + if skip_last is not None: skip_x = int(len(frame_numbers) * skip_last) frame_numbers = frame_numbers[:-skip_x] - frame_numbers = [ - frame - for frame in frame_numbers - if (ffc_frames is None or frame not in ffc_frames) - and ( - self.mega_missed_regions is None - or frame not in self.mega_missed_regions - ) - ] - frame_numbers.sort() + frame_numbers.sort() for frame_num in frame_numbers: region = self.regions_by_frame[frame_num] - if region.mass == 0 or region.blank: + + if ( + region.mass == 0 + or region.blank + or region.width <= 0 + or region.height <= 0 + ): + continue + if ffc_frames is not None and frame_num in ffc_frames: continue + + if ( + self.mega_missed_regions is not None + and frame_num in self.mega_missed_regions + ): + continue + if min_mass is not None and 
region.mass < min_mass: continue if max_mass is not None and region.mass > max_mass: continue + # dont use regions on the edge if the mass deviates too much from the last known good mass + region.set_is_along_border(crop_rectangle) + if region.is_along_border: + if previous_mass is not None: + previous_mass_thresh = previous_mass * 0.1 + if (abs(previous_mass - region.mass)) >= previous_mass_thresh: + continue + else: + previous_mass = region.mass f = FrameSample( self.clip_id, self.track_id, diff --git a/src/ml_tools/thermaldataset.py b/src/ml_tools/thermaldataset.py index 6604be7c..94f1d46c 100644 --- a/src/ml_tools/thermaldataset.py +++ b/src/ml_tools/thermaldataset.py @@ -329,7 +329,7 @@ def main(): remapped_labels=get_remapped(), excluded_labels=get_excluded(), include_track=True, - num_frames=25, + num_frames=1, ) print("Ecpoh size is", epoch_size) # print(get_distribution(resampled_ds, len(labels), extra_meta=False)) From 86fad5953015c4421e44a8077cc77a9bed025309 Mon Sep 17 00:00:00 2001 From: gferraro Date: Thu, 3 Oct 2024 21:06:12 +0200 Subject: [PATCH 081/117] tweak a few defaults and min mass filtering --- src/config/buildconfig.py | 4 ++-- src/ml_tools/datasetstructures.py | 31 ++++++++++++++++++------------- src/ml_tools/thermalwriter.py | 1 + 3 files changed, 21 insertions(+), 15 deletions(-) diff --git a/src/config/buildconfig.py b/src/config/buildconfig.py index c659ca75..52d9f085 100644 --- a/src/config/buildconfig.py +++ b/src/config/buildconfig.py @@ -101,11 +101,11 @@ def get_defaults(cls): segment_min_avg_mass=10, min_frame_mass=10, filter_by_lq=False, - max_segments=5, + max_segments=3, thermal_diff_norm=True, tag_precedence=BuildConfig.DEFAULT_GROUPS, excluded_tags=BuildConfig.EXCLUDED_TAGS, - country="NZ", + country=None, use_segments=True, ) diff --git a/src/ml_tools/datasetstructures.py b/src/ml_tools/datasetstructures.py index 871b4b79..51b26c6f 100644 --- a/src/ml_tools/datasetstructures.py +++ b/src/ml_tools/datasetstructures.py @@ -362,12 +362,14 @@ def get_segments( location=None, segment_frames=None, from_last=None, + frame_min_mass=None, ): if segment_frames is not None: raise Exception("Have not implement this path") - min_frames = segment_width - if self.label == "vehicle" or self.label == "human": - min_frames = segment_width / 4.0 + min_frames = segment_width / 4.0 + if self.label in ["stoat", "mustelid", "weasel", "ferret"]: + # try and always get one for these + min_frames = 0 # in python3.7+ can just take the values and it guarantees order it was added to dict regions = self.bounds_history @@ -390,6 +392,7 @@ def get_segments( source_file=self.source_file, dont_filter=dont_filter, skip_ffc=skip_ffc, + frame_min_mass=frame_min_mass, ) # GP could get this from the tracks when writing # but might be best to keep samples independent for ease @@ -963,11 +966,12 @@ def get_segments( source_file=None, dont_filter=False, skip_ffc=True, + frame_min_mass=None, ): if segment_type == SegmentType.ALL_RANDOM_NOMIN: segment_min_mass = None if min_frames is None: - min_frames = 25 + min_frames = segment_width / 4.0 segments = [] mass_history = np.uint16([region.mass for region in regions]) filtered_stats = {"segment_mass": 0, "too short": 0} @@ -986,6 +990,7 @@ def get_segments( and not region.blank and region.width > 0 and region.height > 0 + and ((has_no_mass or frame_min_mass is None) or region.mass >= frame_min_mass) ] if len(frame_indices) == 0: logging.warn("Nothing to load for %s - %s", clip_id, track_id) @@ -1024,9 +1029,9 @@ def get_segments( 
segment_min_mass, source_file=source_file, ) - # if len(frame_indices) < min_frames: - # filtered_stats["too short"] += 1 - # return segments, filtered_stats + if len(frame_indices) < min_frames: + filtered_stats["too short"] += 1 + return segments, filtered_stats frame_indices = np.array(frame_indices) segment_count = max(1, len(frame_indices) // segment_frame_spacing) segment_count = int(segment_count) @@ -1047,12 +1052,12 @@ def get_segments( # random_frames and not random_sections: np.random.shuffle(frame_indices) for i in range(segment_count): - # always get atleast one segmnet - if i > 0: - if (len(frame_indices) < segment_width and len(segments) > 1) or len( - frame_indices - ) < (segment_width / 4.0): - break + # always get atleast one segment, not doing annymore + # if i > 0: + if (len(frame_indices) < segment_width and len(segments) > 1) or len( + frame_indices + ) < segment_width / 4: + break if segment_type == SegmentType.ALL_SECTIONS: # random frames from section 2.2 * segment_width diff --git a/src/ml_tools/thermalwriter.py b/src/ml_tools/thermalwriter.py index 603698e4..4c7d0713 100644 --- a/src/ml_tools/thermalwriter.py +++ b/src/ml_tools/thermalwriter.py @@ -228,6 +228,7 @@ def get_data(clip_samples, extra_args): skip_ffc=extra_args.get("skip_ffc", True), ffc_frames=clip_meta.ffc_frames, max_segments=len(samples), + frame_min_mass=extra_args.get("min_mass"), ) else: filter_by_lq = extra_args.get("filter_by_lq", False) From a9044d6aae52cf1a02ae5cb278d302a1bf2d0bb5 Mon Sep 17 00:00:00 2001 From: gferraro Date: Thu, 3 Oct 2024 21:14:33 +0200 Subject: [PATCH 082/117] add max samples --- src/build.py | 1 + src/config/buildconfig.py | 4 ++++ src/ml_tools/dataset.py | 2 ++ src/ml_tools/datasetstructures.py | 9 ++++++++- src/ml_tools/thermalwriter.py | 1 + 5 files changed, 16 insertions(+), 1 deletion(-) diff --git a/src/build.py b/src/build.py index 7638f7a8..76ef6e09 100644 --- a/src/build.py +++ b/src/build.py @@ -896,6 +896,7 @@ def main(): "min_mass": master_dataset.min_frame_mass, "thermal_diff_norm": config.build.thermal_diff_norm, "filter_by_lq": master_dataset.filter_by_lq, + "max_frames": master_dataset.max_frames, } ) create_tf_records( diff --git a/src/config/buildconfig.py b/src/config/buildconfig.py index 52d9f085..3ff3bdb5 100644 --- a/src/config/buildconfig.py +++ b/src/config/buildconfig.py @@ -40,6 +40,8 @@ class BuildConfig(DefaultConfig): excluded_tags = attr.ib() country = attr.ib() use_segments = attr.ib() + max_frames = attr.ib() + EXCLUDED_TAGS = ["poor tracking", "part", "untagged", "unidentified"] # country bounding boxs @@ -89,6 +91,7 @@ def load(cls, build): excluded_tags=build["excluded_tags"], country=build["country"], use_segments=build["use_segments"], + max_frames=build["max_frames"], ) @classmethod @@ -107,6 +110,7 @@ def get_defaults(cls): excluded_tags=BuildConfig.EXCLUDED_TAGS, country=None, use_segments=True, + max_frames=75, ) def validate(self): diff --git a/src/ml_tools/dataset.py b/src/ml_tools/dataset.py index 3e422076..8556c5f9 100644 --- a/src/ml_tools/dataset.py +++ b/src/ml_tools/dataset.py @@ -86,6 +86,7 @@ def __init__( self.segment_type = SegmentType.ALL_RANDOM self.max_segments = config.build.max_segments self.country = config.build.country + self.max_frames = config.build.max_frames else: self.country = "NZ" self.tag_precedence = BuildConfig.DEFAULT_GROUPS @@ -100,6 +101,7 @@ def __init__( self.segment_min_avg_mass = 10 self.min_frame_mass = 16 self.segment_type = SegmentType.ALL_RANDOM + self.max_frames = 75 
self.country_rectangle = BuildConfig.COUNTRY_LOCATIONS.get(self.country) logging.info( diff --git a/src/ml_tools/datasetstructures.py b/src/ml_tools/datasetstructures.py index 51b26c6f..890b0209 100644 --- a/src/ml_tools/datasetstructures.py +++ b/src/ml_tools/datasetstructures.py @@ -238,7 +238,12 @@ def add_sample(self, sample): self.samples.append(sample) def calculate_sample_frames( - self, min_mass=None, max_mass=None, ffc_frames=None, skip_last=None + self, + min_mass=None, + max_mass=None, + ffc_frames=None, + skip_last=None, + max_frames=None, ): crop_rectangle = Rectangle(2, 2, 160 - 2 * 2, 140 - 2 * 2) @@ -302,6 +307,8 @@ def calculate_sample_frames( track_median_mass=self.median_mass, ) self.samples.append(f) + if max_frames is not None and len(self.samples) > max_frames: + self.samples = np.random.choice(self.samples, max_frames, replace=False) def remove_sample(self, f): self.samples.remove(f) diff --git a/src/ml_tools/thermalwriter.py b/src/ml_tools/thermalwriter.py index 4c7d0713..67d04188 100644 --- a/src/ml_tools/thermalwriter.py +++ b/src/ml_tools/thermalwriter.py @@ -244,6 +244,7 @@ def get_data(clip_samples, extra_args): else track.upper_mass ), ffc_frames=clip_meta.ffc_frames, + max_frames =extra_args.get("max_frames") ) samples = track.samples frame_temp_median = {} From 29dc459cfd2295992b93a68c8291b1af04e9e9c7 Mon Sep 17 00:00:00 2001 From: gferraro Date: Thu, 3 Oct 2024 21:19:13 +0200 Subject: [PATCH 083/117] fix cap --- src/build.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/build.py b/src/build.py index 76ef6e09..2251974a 100644 --- a/src/build.py +++ b/src/build.py @@ -732,15 +732,15 @@ def rough_balance(datasets): counts.sort() std_dev = np.std(counts) logging.info("Counts are %s std dev %s", counts, std_dev) - if std_dev < dev_threshold or len(counts) == 0: + if std_dev < dev_threshold or len(counts) <= 1: logging.info("Not balancing") continue if len(counts) <= 2: - cap_at = counts[-1] + cap_at = counts[-2] elif len(counts) < 7: - cap_at = counts[-1] + cap_at = counts[-2] else: - cap_at = counts[-1] + cap_at = counts[-2] logging.info("Capping dataset %s at %s", dataset.name, cap_at) for lbl, count in lbl_counts.items(): if count <= cap_at: From 64e283a32f49a23458102067228a51fcfeb4492e Mon Sep 17 00:00:00 2001 From: gferraro Date: Fri, 4 Oct 2024 08:58:55 +0200 Subject: [PATCH 084/117] load fp or animal model --- src/config/buildconfig.py | 2 +- src/ml_tools/kerasmodel.py | 18 ++++++++++++------ 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/src/config/buildconfig.py b/src/config/buildconfig.py index 3ff3bdb5..ae5e9baf 100644 --- a/src/config/buildconfig.py +++ b/src/config/buildconfig.py @@ -105,7 +105,7 @@ def get_defaults(cls): min_frame_mass=10, filter_by_lq=False, max_segments=3, - thermal_diff_norm=True, + thermal_diff_norm=False, tag_precedence=BuildConfig.DEFAULT_GROUPS, excluded_tags=BuildConfig.EXCLUDED_TAGS, country=None, diff --git a/src/ml_tools/kerasmodel.py b/src/ml_tools/kerasmodel.py index f722dcf3..546a83ad 100644 --- a/src/ml_tools/kerasmodel.py +++ b/src/ml_tools/kerasmodel.py @@ -66,8 +66,8 @@ def __init__(self, train_config=None, labels=None, data_dir=None): self.label_probabilities = None self.class_weights = None self.ds_by_label = True - self.excluded_labels = [] - self.remapped_labels = [] + self.excluded_labels = None + self.remapped_labels = None self.orig_labels = None def load_training_meta(self, base_dir): @@ -517,12 +517,15 @@ def train_model( logging.info( "%s Training model for 
%s epochs with weights %s", run_name, epochs, weights ) - - if self.params.excluded_labels is None: + if self.params.excluded_labels is not None: + self.excluded_labels = self.params.excluded_labels + else: self.excluded_labels, self.remapped_labels = get_excluded( self.data_type, self.params.multi_label ) - if self.params.remapped_labels is None: + if self.params.remapped_labels is not None: + self.remapped_labels = self.params.remapped_labels + else: self.remapped_labels, self.remapped_labels = get_excluded( self.data_type, self.params.multi_label ) @@ -531,7 +534,10 @@ def train_model( logging.info( "Excluding %s remapping %s", self.excluded_labels, self.remapped_labels ) - + for lbl in self.remapped_labels.values(): + if lbl not in self.labels: + self.labels.append(lbl) + self.labels.sort() if self.params.multi_label: self.labels.append("land-bird") self.orig_labels = self.labels.copy() From acb90bd66bfd5ba2f668a9a5f61a7d4eef3c09cd Mon Sep 17 00:00:00 2001 From: gferraro Date: Sun, 6 Oct 2024 16:37:20 +0200 Subject: [PATCH 085/117] fix variable name --- src/ml_tools/interpreter.py | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/src/ml_tools/interpreter.py b/src/ml_tools/interpreter.py index e4c0a99e..c485ead5 100644 --- a/src/ml_tools/interpreter.py +++ b/src/ml_tools/interpreter.py @@ -49,35 +49,31 @@ def get_preprocess_fn(self): else: import tensorflow as tf - if pretrained_model == "resnet": + if model_name == "resnet": return tf.keras.applications.resnet.preprocess_input - elif pretrained_model == "nasnet": + elif model_name == "nasnet": return tf.keras.applications.nasnet.preprocess_input - elif pretrained_model == "resnetv2": + elif model_name == "resnetv2": return tf.keras.applications.resnet_v2.preprocess_input - elif pretrained_model == "resnet152": + elif model_name == "resnet152": return tf.keras.applications.resnet.preprocess_input - elif pretrained_model == "vgg16": + elif model_name == "vgg16": return tf.keras.applications.vgg16.preprocess_input - elif pretrained_model == "vgg19": + elif model_name == "vgg19": return tf.keras.applications.vgg19.preprocess_input - elif pretrained_model == "mobilenet": + elif model_name == "mobilenet": return tf.keras.applications.mobilenet_v2.preprocess_input - elif pretrained_model == "densenet121": + elif model_name == "densenet121": return tf.keras.applications.densenet.preprocess_input - elif pretrained_model == "inceptionresnetv2": + elif model_name == "inceptionresnetv2": return tf.keras.applications.inception_resnet_v2.preprocess_input - logging.warn( - "pretrained model %s has no preprocessing function", pretrained_model - ) - return None - logging.info("No preprocess defined for %s", model_name) + logging.warn("pretrained model %s has no preprocessing function", model_name) return None def preprocess(self, clip, track, **args): From f254e405563bf25ee672f56b40e92f7e52e118a5 Mon Sep 17 00:00:00 2001 From: gferraro Date: Sun, 6 Oct 2024 16:49:54 +0200 Subject: [PATCH 086/117] fix variable --- src/classify/trackprediction.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/classify/trackprediction.py b/src/classify/trackprediction.py index 464569d5..482a6b5b 100644 --- a/src/classify/trackprediction.py +++ b/src/classify/trackprediction.py @@ -162,7 +162,6 @@ def classified_frames(self, frame_numbers, predictions, mass): self.class_best_score += smoothed_prediction def classified_frame(self, frame_number, predictions, mass): - self.prediction_frames.append([frame_number]) 
self.last_frame_classified = frame_number self.num_frames_classified += 1 self.masses.append(mass) From 7d87815b51c39408de9e499366dcb4bdfe50171b Mon Sep 17 00:00:00 2001 From: gferraro Date: Sun, 6 Oct 2024 17:01:10 +0200 Subject: [PATCH 087/117] fix confusion --- src/classify/trackprediction.py | 2 +- src/ml_tools/kerasmodel.py | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/src/classify/trackprediction.py b/src/classify/trackprediction.py index 482a6b5b..04dcf886 100644 --- a/src/classify/trackprediction.py +++ b/src/classify/trackprediction.py @@ -165,7 +165,7 @@ def classified_frame(self, frame_number, predictions, mass): self.last_frame_classified = frame_number self.num_frames_classified += 1 self.masses.append(mass) - smoothed_prediction = prediction * prediction * mass + smoothed_prediction = predictions**2 * mass prediction = Prediction( predictions, diff --git a/src/ml_tools/kerasmodel.py b/src/ml_tools/kerasmodel.py index 546a83ad..6d721c44 100644 --- a/src/ml_tools/kerasmodel.py +++ b/src/ml_tools/kerasmodel.py @@ -874,10 +874,12 @@ def confusion_tracks(self, dataset, filename, threshold=0.8): ] for y, pred in pred_per_track.values(): pred.normalize_score() - no_smoothing = np.mean(pred.predictions, axis=0) + preds = np.array([p.prediction for p in pred.predictions]) + + no_smoothing = np.mean(preds, axis=0) masses = np.array(pred.masses)[:, None] old_smoothing = pred.class_best_score - new_smooth = pred.predictions * masses + new_smooth = preds * masses new_smooth = np.sum(new_smooth, axis=0) new_smooth /= np.sum(masses) From 51ef0a39bde02a7e83fdcce46d205a219e0fffdd Mon Sep 17 00:00:00 2001 From: gferraro Date: Sun, 6 Oct 2024 17:02:23 +0200 Subject: [PATCH 088/117] add num frames --- src/modelevaluate.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/modelevaluate.py b/src/modelevaluate.py index 286dfb04..4c7b7796 100644 --- a/src/modelevaluate.py +++ b/src/modelevaluate.py @@ -580,6 +580,7 @@ def main(): include_track=True, cache=True, channels=model.params.channels, + num_frames=self.params.square_width**2, ) model.labels = new_labels logging.info( From 602d8e0064aad0b785e1a5f64e55a78f5b691fea Mon Sep 17 00:00:00 2001 From: gferraro Date: Sun, 6 Oct 2024 17:16:54 +0200 Subject: [PATCH 089/117] load params properly --- src/modelevaluate.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/modelevaluate.py b/src/modelevaluate.py index 4c7b7796..803e8a89 100644 --- a/src/modelevaluate.py +++ b/src/modelevaluate.py @@ -559,6 +559,13 @@ def main(): if model.params.multi_label: model.labels.append("land-bird") excluded, remapped = get_excluded(model.data_type) + + if model.params.excluded_labels is not None: + excluded = model.params.excluded_labels + + if model.params.remapped_labels is not None: + remapped = model.params.remapped_labels + files = base_dir / args.dataset dataset, _, new_labels, _ = get_dataset( files, @@ -580,7 +587,7 @@ def main(): include_track=True, cache=True, channels=model.params.channels, - num_frames=self.params.square_width**2, + num_frames=model.params.square_width**2, ) model.labels = new_labels logging.info( From 4ecf68451d109137a1f65cbe97003defc32d6e11 Mon Sep 17 00:00:00 2001 From: gferraro Date: Sun, 6 Oct 2024 20:21:23 +0200 Subject: [PATCH 090/117] set shuffle based on number of frames --- src/ml_tools/tfdataset.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/src/ml_tools/tfdataset.py b/src/ml_tools/tfdataset.py index bcb29027..8bb2c4de 100644 --- 
From 4ecf68451d109137a1f65cbe97003defc32d6e11 Mon Sep 17 00:00:00 2001
From: gferraro
Date: Sun, 6 Oct 2024 20:21:23 +0200
Subject: [PATCH 090/117] set shuffle based on number of frames

---
 src/ml_tools/tfdataset.py | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/src/ml_tools/tfdataset.py b/src/ml_tools/tfdataset.py
index bcb29027..8bb2c4de 100644
--- a/src/ml_tools/tfdataset.py
+++ b/src/ml_tools/tfdataset.py
@@ -68,10 +68,13 @@ def get_dataset(load_function, base_dir, labels, **args):
     remapped = {}
     keys = []
     values = []
+    shuffle_size = 4096
+    if args.get("num_frames", 25) == 1:
+        shuffle_size *= 25

     if model_labels is not None:
         new_labels = model_labels
-        logging.info("Mapping DS labels to model labels ")
+        logging.info("Mapping DS labels %s to model labels %s", labels, model_labels)
         # if we are loading a model with different labels we need to map the dataset labels
         # to the equivalent model labels
         for l_i, og_lbl in enumerate(labels):
@@ -80,7 +83,6 @@ def get_dataset(load_function, base_dir, labels, **args):
             lbl = og_lbl
             if lbl in to_remap:
                 lbl = to_remap[lbl]
-                l_i = labels.index(lbl)
             mdl_i = model_labels.index(lbl)

             if lbl not in remapped:
@@ -171,7 +173,9 @@ def get_dataset(load_function, base_dir, labels, **args):
             l_filter = lambda x, y: tf.math.reduce_all(tf.math.equal(y, l_mask))

             l_dataset = dataset.filter(l_filter)
-            l_dataset = l_dataset.shuffle(40096, reshuffle_each_iteration=True)
+            l_dataset = l_dataset.shuffle(
+                shuffle_size * 10, reshuffle_each_iteration=True
+            )

             label_ds.append(l_dataset)
         dataset = tf.data.Dataset.sample_from_datasets(
@@ -190,9 +194,9 @@ def get_dataset(load_function, base_dir, labels, **args):
         and args.get("shuffle", True)
         and not args.get("resample")
     ):
-        logging.info("shuffling data")
+        logging.info("shuffling data with buffer %s", shuffle_size)
         dataset = dataset.shuffle(
-            4096, reshuffle_each_iteration=args.get("reshuffle", True)
+            shuffle_size, reshuffle_each_iteration=args.get("reshuffle", True)
         )
     # tf refuses to run if epoch sizes change so we must decide a constant epoch size even though with reject res
     # it will change each epoch, to ensure this take this repeat data and always take epoch_size elements

From 5b7732c1eb25df2fb14d2055774360b6d5d2e687 Mon Sep 17 00:00:00 2001
From: gferraro
Date: Tue, 8 Oct 2024 16:45:44 +0200
Subject: [PATCH 091/117] do not sort

---
 src/ml_tools/kerasmodel.py |  2 +-
 src/ml_tools/preprocess.py | 19 +++++++++++++------
 2 files changed, 14 insertions(+), 7 deletions(-)

diff --git a/src/ml_tools/kerasmodel.py b/src/ml_tools/kerasmodel.py
index 6d721c44..d07ed2f2 100644
--- a/src/ml_tools/kerasmodel.py
+++ b/src/ml_tools/kerasmodel.py
@@ -537,7 +537,7 @@ def train_model(
         for lbl in self.remapped_labels.values():
             if lbl not in self.labels:
                 self.labels.append(lbl)
-        self.labels.sort()
+
         if self.params.multi_label:
             self.labels.append("land-bird")
         self.orig_labels = self.labels.copy()
diff --git a/src/ml_tools/preprocess.py b/src/ml_tools/preprocess.py
index 1600425c..fe02199a 100644
--- a/src/ml_tools/preprocess.py
+++ b/src/ml_tools/preprocess.py
@@ -121,6 +121,13 @@ def preprocess_single_frame(
         data,
         axis=2,
     )
+    # global index
+    # index += 1
+    # tools.saveclassify_image(
+    #     image,
+    #     f"samples/{save_info}-{index}",
+    # )
+
     if preprocess_fn:
         image = preprocess_fn(image)
     return image
@@ -163,12 +170,12 @@ def preprocess_movement(
    #
    #
    #
    # # # for testing
-    global index
-    index += 1
-    tools.saveclassify_image(
-        data,
-        f"samples/{sample}-{index}",
-    )
+    # global index
+    # index += 1
+    # tools.saveclassify_image(
+    #     data,
+    #     f"samples/{sample}-{index}",
+    # )

    if preprocess_fn:
        data = preprocess_fn(data)
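Aside: the idea behind patch 090 is that a shuffle buffer's cost scales with sample size, so single-frame datasets can afford a buffer roughly 25x larger than 25-frame segment datasets. A small sketch of that sizing rule, under the assumption the segment width is 25 (the helper is hypothetical):

# Rough sketch of the buffer sizing in patch 090: single-frame samples are
# ~25x smaller than full segments, so the buffer can grow accordingly.
import tensorflow as tf

def shuffled(dataset, num_frames, base_buffer=4096, reshuffle=True):
    buffer_size = base_buffer * (25 if num_frames == 1 else 1)
    return dataset.shuffle(buffer_size, reshuffle_each_iteration=reshuffle)

ds = shuffled(tf.data.Dataset.range(100_000), num_frames=1)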
From 708afadd255f31587a7c764bad9ee19fe5ff51aa Mon Sep 17 00:00:00 2001
From: gferraro
Date: Tue, 8 Oct 2024 17:06:33 +0200
Subject: [PATCH 092/117] dont resample some labels evenly

---
 src/ml_tools/tfdataset.py | 21 +++++++++++++------
 1 file changed, 15 insertions(+), 6 deletions(-)

diff --git a/src/ml_tools/tfdataset.py b/src/ml_tools/tfdataset.py
index 8bb2c4de..cadbc359 100644
--- a/src/ml_tools/tfdataset.py
+++ b/src/ml_tools/tfdataset.py
@@ -70,7 +70,7 @@ def get_dataset(load_function, base_dir, labels, **args):
     values = []
     shuffle_size = 4096
     if args.get("num_frames", 25) == 1:
-        shuffle_size *= 25
+        shuffle_size *= 20

     if model_labels is not None:
         new_labels = model_labels
@@ -166,6 +166,8 @@ def get_dataset(load_function, base_dir, labels, **args):
         logging.info("RESAMPLING")
         # seems the only way to get even distribution
         label_ds = []
+        unbalanced_ds = []
+        dont_balance = ["vehicle"]
         for i, l in enumerate(new_labels):
             l_mask = np.zeros((len(new_labels)))
             l_mask[i] = 1
@@ -173,17 +175,24 @@ def get_dataset(load_function, base_dir, labels, **args):
             l_filter = lambda x, y: tf.math.reduce_all(tf.math.equal(y, l_mask))

             l_dataset = dataset.filter(l_filter)
-            l_dataset = l_dataset.shuffle(
-                shuffle_size * 10, reshuffle_each_iteration=True
-            )
-
-            label_ds.append(l_dataset)
+            l_dataset = l_dataset.shuffle(shuffle_size, reshuffle_each_iteration=True)
+            if l in dont_balance:
+                unbalanced_ds.append(l_dataset)
+            else:
+                label_ds.append(l_dataset)
         dataset = tf.data.Dataset.sample_from_datasets(
             label_ds,
             # weights=[1 / len(new_labels)] * len(new_labels),
             stop_on_empty_dataset=True,
             rerandomize_each_iteration=True,
         )
+        dont_balance.append(dataset)
+        dataset = tf.data.Dataset.sample_from_datasets(
+            dont_balance,
+            # weights=[1 / len(new_labels)] * len(new_labels),
+            stop_on_empty_dataset=False,
+            rerandomize_each_iteration=True,
+        )
     if args.get("epoch_size") is not None:
         dataset = dataset.take(args.get("epoch_size"))
         logging.info("Setting dataset to %s", args.get("epoch_size"))
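Aside: patch 092's scheme is one per-label dataset each, sampled evenly, with some labels ("vehicle" here) mixed back in at their natural frequency. A simplified, self-contained sketch of that pattern follows; it is an illustration of the technique, not the repository's function (the one-hot labels and helper name are assumptions):

# Simplified sketch of even resampling with an unbalanced side-channel.
# Assumes (features, one_hot_label) elements; names are illustrative.
import tensorflow as tf

def resample(dataset, labels, dont_balance=("vehicle",)):
    balanced, unbalanced = [], []
    for i, label in enumerate(labels):
        per_label = dataset.filter(lambda x, y, i=i: tf.equal(tf.argmax(y), i))
        (unbalanced if label in dont_balance else balanced).append(per_label)
    # even mix of the balanced labels, stopping when any runs dry
    mixed = tf.data.Dataset.sample_from_datasets(balanced, stop_on_empty_dataset=True)
    # then fold the unbalanced labels back in at their natural rate
    return tf.data.Dataset.sample_from_datasets(
        [mixed, *unbalanced], stop_on_empty_dataset=False
    )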
From f81ee53da9817a35516cb2bdda2cd594e14484a2 Mon Sep 17 00:00:00 2001
From: gferraro
Date: Thu, 10 Oct 2024 18:47:50 +0200
Subject: [PATCH 093/117] add fp_frames

---
 src/classify/trackprediction.py   | 3 +++
 src/ml_tools/datasetstructures.py | 2 ++
 src/ml_tools/interpreter.py       | 3 ++-
 src/ml_tools/rawdb.py             | 1 +
 4 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/src/classify/trackprediction.py b/src/classify/trackprediction.py
index 04dcf886..94afe43b 100644
--- a/src/classify/trackprediction.py
+++ b/src/classify/trackprediction.py
@@ -79,6 +79,9 @@ def clarity(self):
         best = np.argsort(self.prediction)
         return self.prediction[best[-1]] - self.prediction[best[-2]]

+    def __str__(self):
+        return f"{self.frames} conf: {np.round(100*self.prediction)}"
+

 class TrackPrediction:
     """
diff --git a/src/ml_tools/datasetstructures.py b/src/ml_tools/datasetstructures.py
index 890b0209..76f24e34 100644
--- a/src/ml_tools/datasetstructures.py
+++ b/src/ml_tools/datasetstructures.py
@@ -145,7 +145,9 @@ def __init__(
         remapped_lbl=None,
         mega_missed_regions=None,
         skip_ffc=True,
+        fp_frames=None,
     ):
+        self.fp_frames = fp_frames
         # regions that megadetector found nothing in
         self.mega_missed_regions = mega_missed_regions
         self.station_id = station_id
diff --git a/src/ml_tools/interpreter.py b/src/ml_tools/interpreter.py
index c485ead5..aae59708 100644
--- a/src/ml_tools/interpreter.py
+++ b/src/ml_tools/interpreter.py
@@ -18,7 +18,7 @@ def load_json(self, filename):
         filename = filename.with_suffix(".json")
         logging.info("Loading metadata from %s", filename)
         metadata = json.load(open(filename, "r"))
-
+        self.version = metadata.get("version", None)
         self.labels = metadata["labels"]
         self.params = HyperParams()
         self.params.update(metadata.get("hyperparams", {}))
@@ -224,6 +224,7 @@ def preprocess_frames(
         if self.params.diff_norm:
             filtered_norm_limits = (min_diff, max_diff)
+
         for i, region in enumerate(reversed(track.bounds_history)):
             if region.blank:
                 continue
diff --git a/src/ml_tools/rawdb.py b/src/ml_tools/rawdb.py
index e671462e..4fd53754 100644
--- a/src/ml_tools/rawdb.py
+++ b/src/ml_tools/rawdb.py
@@ -203,6 +203,7 @@ def get_clip_tracks(self, tag_precedence):
                 source_file=self.file,
                 mega_missed_regions=track_meta.get("mega_missed_regions"),
                 station_id=clip_header.station_id,
+                fp_frames=track_meta.get("fp_model_predictions"),
                 # frame_temp_median=frame_temp_median,
             )
             clip_header.tracks.append(header)

From 10762a07100c8a2a45bf993ecf3bb4f586d2ac1d Mon Sep 17 00:00:00 2001
From: gferraro
Date: Mon, 14 Oct 2024 18:04:44 +0200
Subject: [PATCH 094/117] read fp model predictions

---
 src/build.py                      |  2 ++
 src/ml_tools/datasetstructures.py | 11 +++++++++++
 src/ml_tools/rawdb.py             | 17 ++++++++++++++++-
 src/ml_tools/thermalwriter.py     |  3 ++-
 4 files changed, 31 insertions(+), 2 deletions(-)

diff --git a/src/build.py b/src/build.py
index 2251974a..c8c51cb8 100644
--- a/src/build.py
+++ b/src/build.py
@@ -899,6 +899,8 @@ def main():
                 "max_frames": master_dataset.max_frames,
             }
         )
+        # dont filter the test set,
+        extra_args["filter_by_fp"] = dataset.name != "test"
         create_tf_records(
             dataset,
             dir,
diff --git a/src/ml_tools/datasetstructures.py b/src/ml_tools/datasetstructures.py
index 76f24e34..1b1a90fa 100644
--- a/src/ml_tools/datasetstructures.py
+++ b/src/ml_tools/datasetstructures.py
@@ -17,6 +17,9 @@ FRAME_SIZE = 32
 MIN_SIZE = 4

+# hard coded for now
+FP_LABELS = ["other", "unidentified", "rain", "false-positive", "water", "insect"]
+

 class SegmentType(Enum):
     IMPORTANT_RANDOM = 0
@@ -147,7 +150,9 @@ def __init__(
         skip_ffc=True,
         fp_frames=None,
     ):
+        self.fp_frames = fp_frames
+
         # regions that megadetector found nothing in
         self.mega_missed_regions = mega_missed_regions
         self.station_id = station_id
@@ -372,6 +377,7 @@ def get_segments(
         segment_frames=None,
         from_last=None,
         frame_min_mass=None,
+        filter_by_fp=True,
     ):
@@ -402,6 +408,7 @@ def get_segments(
             dont_filter=dont_filter,
             skip_ffc=skip_ffc,
             frame_min_mass=frame_min_mass,
+            fp_frames=self.fp_frames if filter_by_fp else None,
         )
         # GP could get this from the tracks when writing
         # but might be best to keep samples independent for ease
@@ -976,6 +983,7 @@ def get_segments(
     dont_filter=False,
     skip_ffc=True,
     frame_min_mass=None,
+    fp_frames=None,
 ):
     if segment_type == SegmentType.ALL_RANDOM_NOMIN:
         segment_min_mass = None
@@ -1001,6 +1009,9 @@ def get_segments(
         and ((has_no_mass or frame_min_mass is None) or region.mass >= frame_min_mass)
     ]
+    if fp_frames is not None and label not in FP_LABELS:
+        frame_indices = [f for f in frame_indices if f not in fp_frames]
+        logging.info("Filtering with fp frames %s", fp_frames)
     if len(frame_indices) == 0:
         logging.warn("Nothing to load for %s - %s", clip_id, track_id)
         return [], filtered_stats
diff --git a/src/ml_tools/rawdb.py b/src/ml_tools/rawdb.py
index 4fd53754..4b6a93cc 100644
--- a/src/ml_tools/rawdb.py
+++ b/src/ml_tools/rawdb.py
@@ -135,6 +135,10 @@ def get_clip_tracks(self, tag_precedence):
             ffc_frames=self.ffc_frames,
         )
         tracks = metadata.get("Tracks", [])
+        fp_labels = metadata.get("fp_model_labels")
+        fp_index = None
+        if fp_labels is not None:
+            fp_index = fp_labels.index("false-positive")
         meta = []
         for track_meta in tracks:
             tags = track_meta.get("tags", [])
@@ -191,6 +195,17 @@ def get_clip_tracks(self, tag_precedence):
                 if start is None:
                     start = region.frame_number
                 end = region.frame_number
+
+            fp_meta = track_meta.get("fp_model_predictions")
+            fp_frames = None
+            if fp_meta is not None:
+                fp_frames = []
+                for pred in fp_meta.get("predictions", []):
+                    scores = pred["prediction"]
+                    best_arg = np.argmax(scores)
+                    confidence = scores[best_arg]
+                    if best_arg == fp_index and confidence > 75:
+                        fp_frames.append(pred["frames"][0])
             header = TrackHeader(
                 clip_id=clip_header.clip_id,
                 track_id=int(track_meta["id"]),
@@ -203,7 +218,7 @@ def get_clip_tracks(self, tag_precedence):
                 source_file=self.file,
                 mega_missed_regions=track_meta.get("mega_missed_regions"),
                 station_id=clip_header.station_id,
-                fp_frames=track_meta.get("fp_model_predictions"),
+                fp_frames=fp_frames,
                 # frame_temp_median=frame_temp_median,
             )
             clip_header.tracks.append(header)
diff --git a/src/ml_tools/thermalwriter.py b/src/ml_tools/thermalwriter.py
index 67d04188..fafbc505 100644
--- a/src/ml_tools/thermalwriter.py
+++ b/src/ml_tools/thermalwriter.py
@@ -229,6 +229,7 @@ def get_data(clip_samples, extra_args):
                     ffc_frames=clip_meta.ffc_frames,
                     max_segments=len(samples),
                     frame_min_mass=extra_args.get("min_mass"),
+                    filter_by_fp=extra_args.get("filter_by_fp"),
                 )
             else:
                 filter_by_lq = extra_args.get("filter_by_lq", False)
@@ -244,7 +245,7 @@ def get_data(clip_samples, extra_args):
                         else track.upper_mass
                     ),
                     ffc_frames=clip_meta.ffc_frames,
-                    max_frames =extra_args.get("max_frames")
+                    max_frames=extra_args.get("max_frames"),
                 )
                 samples = track.samples
             frame_temp_median = {}

From 9c654521ef351cfd6ac7d2ab3a534a21049d755c Mon Sep 17 00:00:00 2001
From: gferraro
Date: Tue, 15 Oct 2024 21:20:12 +0200
Subject: [PATCH 095/117] check for int

---
 src/ml_tools/rawdb.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/ml_tools/rawdb.py b/src/ml_tools/rawdb.py
index 4b6a93cc..4a76a725 100644
--- a/src/ml_tools/rawdb.py
+++ b/src/ml_tools/rawdb.py
@@ -205,7 +205,11 @@ def get_clip_tracks(self, tag_precedence):
                     best_arg = np.argmax(scores)
                     confidence = scores[best_arg]
                     if best_arg == fp_index and confidence > 75:
-                        fp_frames.append(pred["frames"][0])
+                        frame_i = pred["frames"]
+                        if isinstance(frame_i, int):
+                            fp_frames.append(frame_i)
+                        else:
+                            fp_frames.append(frame_i[0])
             header = TrackHeader(
                 clip_id=clip_header.clip_id,
                 track_id=int(track_meta["id"]),

From 1ec5202d20d11dacf324cc6c921fc5b28b868f14 Mon Sep 17 00:00:00 2001
From: gferraro
Date: Tue, 15 Oct 2024 21:21:02 +0200
Subject: [PATCH 096/117] remove log

---
 src/ml_tools/datasetstructures.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/ml_tools/datasetstructures.py b/src/ml_tools/datasetstructures.py
index 1b1a90fa..2de71838 100644
--- a/src/ml_tools/datasetstructures.py
+++ b/src/ml_tools/datasetstructures.py
@@ -1011,7 +1011,6 @@ def get_segments(
     ]
     if fp_frames is not None and label not in FP_LABELS:
         frame_indices = [f for f in frame_indices if f not in fp_frames]
-        logging.info("Filtering with fp frames %s", fp_frames)
     if len(frame_indices) == 0:
         logging.warn("Nothing to load for %s - %s", clip_id, track_id)
         return [], filtered_stats
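Aside: patches 094-095 boil down to collecting the frame numbers a false-positive model is confident about. A distilled sketch of that parsing, using the metadata field names shown in the diffs and the committed threshold of 75 (the function name is an assumption for illustration):

# Illustrative distillation of the fp-frame parsing from patches 094-095.
import numpy as np

def fp_frames_from_meta(fp_meta, fp_index, threshold=75):
    if fp_meta is None or fp_index is None:
        return None
    fp_frames = []
    for pred in fp_meta.get("predictions", []):
        scores = pred["prediction"]
        best_arg = int(np.argmax(scores))
        if best_arg == fp_index and scores[best_arg] > threshold:
            frame = pred["frames"]
            # "frames" may be a single int or a list (patch 095's fix)
            fp_frames.append(frame if isinstance(frame, int) else frame[0])
    return fp_frames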
From 39754a41838a838a65066db3636c0d5f4068a7de Mon Sep 17 00:00:00 2001
From: gferraro
Date: Wed, 16 Oct 2024 09:17:56 +0200
Subject: [PATCH 097/117] add country code into tf records

---
 src/ml_tools/datasetstructures.py |  1 +
 src/ml_tools/rawdb.py             | 10 ++++++++++
 src/ml_tools/thermalwriter.py     | 13 +++++++++----
 3 files changed, 20 insertions(+), 4 deletions(-)

diff --git a/src/ml_tools/datasetstructures.py b/src/ml_tools/datasetstructures.py
index 2de71838..bc42aec0 100644
--- a/src/ml_tools/datasetstructures.py
+++ b/src/ml_tools/datasetstructures.py
@@ -117,6 +117,7 @@ class ClipHeader:
     trap = attr.ib()
     tracks = attr.ib()
     ffc_frames = attr.ib()
+    country_code = attr.ib()
     frame_temp_median = attr.ib(default=None)

     def get_samples(self):
diff --git a/src/ml_tools/rawdb.py b/src/ml_tools/rawdb.py
index 4a76a725..f3e5557c 100644
--- a/src/ml_tools/rawdb.py
+++ b/src/ml_tools/rawdb.py
@@ -22,6 +22,7 @@ from track.cliptrackextractor import is_affected_by_ffc
 from cptv_rs_python_bindings import CptvReader
 from ml_tools.rectangle import Rectangle
+from config.buildconfig import BuildConfig

 special_datasets = [
     "tag_frames",
@@ -116,11 +117,19 @@ def get_clip_tracks(self, tag_precedence):
         location = metadata.get("location")
         lat = None
         lng = None
+        country_code = None
         try:
             lat = location.get("lat")
             lng = location.get("lng")
+            if lat is not None and lng is not None:
+                for country, location in BuildConfig.COUNTRY_LOCATIONS.items():
+                    if location.contains(lng, lat):
+                        country_code = country
+                        break
         except:
+            logging.error("Could not parse lat lng", exc_info=True)
             pass
+
         clip_header = ClipHeader(
             clip_id=int(metadata["id"]),
             station_id=metadata.get("stationId"),
@@ -133,6 +142,7 @@ def get_clip_tracks(self, tag_precedence):
             trap=metadata.get("trap", ""),
             tracks=[],
             ffc_frames=self.ffc_frames,
+            country_code=country_code,
         )
         tracks = metadata.get("Tracks", [])
         fp_labels = metadata.get("fp_model_labels")
diff --git a/src/ml_tools/thermalwriter.py b/src/ml_tools/thermalwriter.py
index fafbc505..7a123460 100644
--- a/src/ml_tools/thermalwriter.py
+++ b/src/ml_tools/thermalwriter.py
@@ -55,7 +55,7 @@ from functools import lru_cache

-def create_tf_example(sample, data, features, labels, num_frames):
+def create_tf_example(sample, data, features, labels, num_frames, country_code):
     """Converts image and annotations to a tf.Example proto.
     Args:
@@ -128,6 +128,9 @@ def create_tf_example(sample, data, features, labels, num_frames, country_code):
         "image/format": tfrecord_util.bytes_feature("jpeg".encode("utf8")),
         "image/class/text": tfrecord_util.bytes_feature(sample.label.encode("utf8")),
         "image/class/label": tfrecord_util.int64_feature(labels.index(sample.label)),
+        "image/country_id": tfrecord_util.bytes_feature(
+            str(country_code).encode("utf8")
+        ),
     }

     example = tf.train.Example(features=tf.train.Features(feature=feature_dict))
@@ -157,9 +160,11 @@ def save_data(samples, writer, labels, extra_args):
         return 0
     saved = 0
     try:
-        for data in sample_data:
+        country_code = sample_data[1]
+        sample_data = sample_data[0]
+        for sample, images, features in sample_data:
             tf_example = create_tf_example(
-                data[0], data[1], data[2], labels, extra_args["num_frames"]
+                sample, images, features, labels, extra_args["num_frames"], country_code
             )
             writer.write(tf_example.SerializeToString())
             saved += 1
@@ -372,4 +377,4 @@ def get_data(clip_samples, extra_args):
             "Cant get Samples for %s", clip_samples[0].source_file, exc_info=True
         )
         return None
-    return data
+    return (data, clip_meta.country_code)

From 7550768f76249a41c3cac39a487f55337c757b6d Mon Sep 17 00:00:00 2001
From: gferraro
Date: Thu, 17 Oct 2024 18:18:11 +0200
Subject: [PATCH 098/117] remove some

---
 src/ml_tools/thermaldataset.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/src/ml_tools/thermaldataset.py b/src/ml_tools/thermaldataset.py
index 94f1d46c..bd4da773 100644
--- a/src/ml_tools/thermaldataset.py
+++ b/src/ml_tools/thermaldataset.py
@@ -38,7 +38,13 @@ def get_excluded():
         "mammal",
         "frog",
         "cow",
-        # "fox",
+        # added gp for retrain
+        "wombat",
+        "gray kangaroo",
+        "echidna",
+        "fox",
+        "deer",
+        "sheep",
         # "wombat",
     ]

From c008df55b24a81fa66a6c9b8754c41360c786760 Mon Sep 17 00:00:00 2001
From: gferraro
Date: Thu, 17 Oct 2024 18:20:02 +0200
Subject: [PATCH 099/117] dont filter by fp

---
 src/ml_tools/interpreter.py | 1 +
 src/modelevaluate.py        | 4 +++-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/ml_tools/interpreter.py b/src/ml_tools/interpreter.py
index aae59708..12a5c57b 100644
--- a/src/ml_tools/interpreter.py
+++ b/src/ml_tools/interpreter.py
@@ -299,6 +299,7 @@ def preprocess_segments(
             from_last=predict_from_last,
             max_segments=max_segments,
             dont_filter=dont_filter,
+            filter_by_fp = False,
         )
         frame_indices = set()
         for segment in segments:
diff --git a/src/modelevaluate.py b/src/modelevaluate.py
index 803e8a89..7c11df91 100644
--- a/src/modelevaluate.py
+++ b/src/modelevaluate.py
@@ -463,7 +463,9 @@ def evaluate_dir(
             # smoothed = output
             # else:
             smoothed = output * masses
-            prediction.classified_clip(output, output, data[2], top_score=top_score)
+            prediction.classified_clip(
+                output, smoothed, data[2], top_score=top_score
+            )
             y_true.append(label_mapping.get(label, label))
             predicted_labels = [prediction.predicted_tag()]
             confidence = prediction.max_score
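Aside: patch 097 above stores the country code as a UTF-8 bytes feature in each tf.Example. A minimal standalone sketch of that encoding, using raw tf.train types rather than the project's tfrecord_util helpers (the function names here are illustrative):

# Minimal standalone version of the country-code feature from patch 097.
import tensorflow as tf

def bytes_feature(value: bytes) -> tf.train.Feature:
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

def int64_feature(value: int) -> tf.train.Feature:
    return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))

def example_with_country(label: str, label_id: int, country_code) -> tf.train.Example:
    feature_dict = {
        "image/class/text": bytes_feature(label.encode("utf8")),
        "image/class/label": int64_feature(label_id),
        # str() keeps a missing country as the literal "None", as in the patch
        "image/country_id": bytes_feature(str(country_code).encode("utf8")),
    }
    return tf.train.Example(features=tf.train.Features(feature=feature_dict))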
From 88aa20572e94dd7410b323a83608a6095bdccdff Mon Sep 17 00:00:00 2001
From: gferraro
Date: Thu, 17 Oct 2024 18:26:42 +0200
Subject: [PATCH 100/117] fix excluded

---
 src/ml_tools/tfdataset.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/ml_tools/tfdataset.py b/src/ml_tools/tfdataset.py
index cadbc359..f0299f79 100644
--- a/src/ml_tools/tfdataset.py
+++ b/src/ml_tools/tfdataset.py
@@ -110,7 +110,7 @@ def get_dataset(load_function, base_dir, labels, **args):
         for l in labels:
             keys.append(labels.index(l))
             if l not in new_labels:
-                remapped[l] = -1
+                remapped[l] = [-1]
                 values.append(-1)
                 logging.info("Excluding %s", l)
             else:
@@ -119,7 +119,9 @@ def get_dataset(load_function, base_dir, labels, **args):

     # add the remapped labels to the correct place
     for k, v in to_remap.items():
-        if k in labels and v in labels:
+        if k in excluded_labels:
+            continue
+        if k in labels and v in new_labels and k in new_labels:
             remapped[v].append(k)
             values[labels.index(k)] = new_labels.index(v)
             del remapped[k]

From a8717ec7b199eaeeee0e8594af562636682c5919 Mon Sep 17 00:00:00 2001
From: gferraro
Date: Fri, 18 Oct 2024 09:48:44 +0200
Subject: [PATCH 101/117] fix fine tune

---
 src/ml_tools/kerasmodel.py | 50 ++++++++++++++++++--------------------
 src/ml_tools/tfdataset.py  |  4 +--
 2 files changed, 26 insertions(+), 28 deletions(-)

diff --git a/src/ml_tools/kerasmodel.py b/src/ml_tools/kerasmodel.py
index d07ed2f2..c385b3f3 100644
--- a/src/ml_tools/kerasmodel.py
+++ b/src/ml_tools/kerasmodel.py
@@ -534,34 +534,12 @@ def train_model(
         logging.info(
             "Excluding %s remapping %s", self.excluded_labels, self.remapped_labels
         )
-        for lbl in self.remapped_labels.values():
-            if lbl not in self.labels:
-                self.labels.append(lbl)
-
-        if self.params.multi_label:
+        if self.params.multi_label and "land-bird" not in self.labels:
             self.labels.append("land-bird")
         self.orig_labels = self.labels.copy()
-        for l in self.excluded_labels:
-            if l in self.labels:
-                self.labels.remove(l)
-        for l in self.remapped_labels.keys():
-            if l in self.labels:
-                self.labels.remove(l)
-        self.log_dir = self.log_base / run_name
-        self.log_dir.mkdir(parents=True, exist_ok=True)
-        if fine_tune is not None:
-            self.load_model(fine_tune, weights=weights)
-            self.adjust_final_layer()
-
-        elif not self.model:
-            self.build_model(
-                dense_sizes=self.params.dense_sizes,
-                retrain_from=self.params.retrain_layer,
-                dropout=self.params.dropout,
-                run_name=run_name,
-            )
-        self.model.summary()
+
+        self.preprocess_fn = self.get_preprocess_fn()
         self.train, remapped, new_labels, epoch_size = get_dataset(
             train_files,
             self.data_type,
@@ -580,6 +558,28 @@ def train_model(
             num_frames=self.params.square_width**2,
             channels=self.params.channels,
         )
+        self.labels = new_labels
+
+        self.log_dir = self.log_base / run_name
+        self.log_dir.mkdir(parents=True, exist_ok=True)
+        if fine_tune is not None:
+            self.load_model(fine_tune, weights=weights)
+            self.adjust_final_layer()
+        else:
+
+            if not self.model:
+                self.build_model(
+                    dense_sizes=self.params.dense_sizes,
+                    retrain_from=self.params.retrain_layer,
+                    dropout=self.params.dropout,
+                    run_name=run_name,
+                )
+
+            if weights is not None:
+                self.model.load_weights(weights)
+
+        self.model.summary()
+
         self.remapped = remapped
         self.validate, remapped, _, _ = get_dataset(
             validate_files,
@@ -597,8 +597,6 @@ def train_model(
             num_frames=self.params.square_width**2,
             channels=self.params.channels,
         )
-        if weights is not None:
-            self.model.load_weights(weights)
         if rebalance:
             self.class_weights = get_weighting(self.train, self.labels)
             logging.info(
diff --git a/src/ml_tools/tfdataset.py b/src/ml_tools/tfdataset.py
index f0299f79..4914ea5b 100644
--- a/src/ml_tools/tfdataset.py
+++ b/src/ml_tools/tfdataset.py
@@ -121,7 +121,7 @@ def get_dataset(load_function, base_dir, labels, **args):
     for k, v in to_remap.items():
         if k in excluded_labels:
             continue
-        if k in labels and v in new_labels and k in new_labels:
+        if k in labels and v in new_labels:
             remapped[v].append(k)
             values[labels.index(k)] = new_labels.index(v)
             del remapped[k]
@@ -135,7 +135,7 @@ def get_dataset(load_function, base_dir, labels, **args):
         name="remapped_y",
     )
     num_labels = len(new_labels)
-    logging.info("New labels are %s", new_labels)
+    logging.info("New labels are %s from original %s", new_labels, labels)
     for k, v in zip(keys, values):
         logging.info(
             "Mapping %s to %s", labels[k], new_labels[v] if v >= 0 else "nothing"
         )

From 84a5da6be6b0467784eabcd6d2dd986634d7c819 Mon Sep 17 00:00:00 2001
From: gferraro
Date: Mon, 21 Oct 2024 09:16:58 +0200
Subject: [PATCH 102/117] add parsing

---
 src/ml_tools/kerasmodel.py |  5 ++++-
 src/ml_tools/rawdb.py      | 23 ++++++++++++-----------
 src/modelevaluate.py       |  6 ++++++
 3 files changed, 22 insertions(+), 12 deletions(-)

diff --git a/src/ml_tools/kerasmodel.py b/src/ml_tools/kerasmodel.py
index c385b3f3..4e5acf8a 100644
--- a/src/ml_tools/kerasmodel.py
+++ b/src/ml_tools/kerasmodel.py
@@ -563,7 +563,10 @@ def train_model(
         self.log_dir = self.log_base / run_name
         self.log_dir.mkdir(parents=True, exist_ok=True)
         if fine_tune is not None:
-            self.load_model(fine_tune, weights=weights)
+            self.load_model(fine_tune, weights=weights, training=True)
+            # load model loads old labels
+            self.labels = new_labels
+
             self.adjust_final_layer()
         else:
diff --git a/src/ml_tools/rawdb.py b/src/ml_tools/rawdb.py
index f3e5557c..00c14593 100644
--- a/src/ml_tools/rawdb.py
+++ b/src/ml_tools/rawdb.py
@@ -118,17 +118,18 @@ def get_clip_tracks(self, tag_precedence):
         lat = None
         lng = None
         country_code = None
-        try:
-            lat = location.get("lat")
-            lng = location.get("lng")
-            if lat is not None and lng is not None:
-                for country, location in BuildConfig.COUNTRY_LOCATIONS.items():
-                    if location.contains(lng, lat):
-                        country_code = country
-                        break
-        except:
-            logging.error("Could not parse lat lng", exc_info=True)
-            pass
+        if location is not None:
+            try:
+                lat = location.get("lat")
+                lng = location.get("lng")
+                if lat is not None and lng is not None:
+                    for country, location in BuildConfig.COUNTRY_LOCATIONS.items():
+                        if location.contains(lng, lat):
+                            country_code = country
+                            break
+            except:
+                logging.error("Could not parse lat lng", exc_info=True)
+                pass

         clip_header = ClipHeader(
             clip_id=int(metadata["id"]),
diff --git a/src/modelevaluate.py b/src/modelevaluate.py
index 7c11df91..94ae85e9 100644
--- a/src/modelevaluate.py
+++ b/src/modelevaluate.py
@@ -452,7 +452,13 @@ def evaluate_dir(
         for data in clip_data:
             label = data[1]
             preprocessed = data[3]
+            if len(preprocessed) == 0:
+                logging.info("No data found for %s", data[0])
+                y_true.append(label_mapping.get(label, label))
+                y_pred.append("None")
+                continue
             output = model.predict(preprocessed)
+
             prediction = TrackPrediction(data[0], model.labels)
             masses = np.array(data[4])
             masses = masses[:, None]
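Aside: the tfdataset changes in patches 100-101 maintain an index-to-index remapping that is ultimately baked into a tf.lookup table. A toy sketch of that mechanism, with made-up labels purely for illustration (-1 marking an excluded class, as in the diffs):

# Toy version of the label-index remapping table used by tfdataset.
import tensorflow as tf

labels = ["bird", "cat", "insect"]       # order stored in the tfrecords
new_labels = ["bird", "false-positive"]  # order the model expects
values = [0, -1, 1]                      # cat excluded, insect -> false-positive

table = tf.lookup.StaticHashTable(
    tf.lookup.KeyValueTensorInitializer(
        tf.range(len(labels), dtype=tf.int64),
        tf.constant(values, dtype=tf.int64),
    ),
    default_value=-1,
    name="remapped_y",
)
print(table.lookup(tf.constant([0, 1, 2], dtype=tf.int64)))  # [0, -1, 1]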
From 4def7a8e405563bf25ee672f56b40e92f7e52e11 Mon Sep 17 00:00:00 2001
From: gferraro
Date: Fri, 25 Oct 2024 13:49:47 +0200
Subject: [PATCH 103/117] try limit memory

---
 src/ml_tools/tfwriter.py | 92 +++++++++++++++++++++++-----------------
 1 file changed, 52 insertions(+), 40 deletions(-)

diff --git a/src/ml_tools/tfwriter.py b/src/ml_tools/tfwriter.py
index 519f4ffb..9b51b29a 100644
--- a/src/ml_tools/tfwriter.py
+++ b/src/ml_tools/tfwriter.py
@@ -39,12 +39,12 @@ import math

-def process_job(queue, labels, base_dir, save_data, extra_args):
+def process_job(queue, labels, base_dir, save_data, writer_i, extra_args):
     import gc

     pid = os.getpid()
-    writer_i = 1
+    # writer_i = 1
     name = f"{writer_i}-{pid}.tfrecord"

     options = tf.io.TFRecordOptions(compression_type="GZIP")
     writer = tf.io.TFRecordWriter(str(base_dir / name), options=options)
@@ -66,14 +66,14 @@ def process_job(queue, labels, base_dir, save_data, writer_i, extra_args):
             saved += save_data(samples, writer, labels, extra_args)
             files += 1
             del samples
-            if saved > 250000 / num_frames:
-                logging.info("Closing old writer")
-                writer.close()
-                writer_i += 1
-                name = f"{writer_i}-{pid}.tfrecord"
-                logging.info("Opening %s", name)
-                saved = 0
-                writer = tf.io.TFRecordWriter(str(base_dir / name), options=options)
+            # if saved > 250000 / num_frames:
+            #     logging.info("Closing old writer")
+            #     writer.close()
+            #     writer_i += 1
+            #     name = f"{writer_i}-{pid}.tfrecord"
+            #     logging.info("Opening %s", name)
+            #     saved = 0
+            #     writer = tf.io.TFRecordWriter(str(base_dir / name), options=options)
             if i % int(25000 / num_frames) == 0:
                 logging.info("Saved %s ", files)
                 gc.collect()
                 writer.flush()
@@ -106,37 +106,49 @@ def create_tf_records(
         "writing to output path: %s for %s samples", output_path, len(samples_by_source)
     )
     num_processes = 8
+    writer_i = 0
+    index = 0
+    jobs_per_process = 300 * num_processes
     try:
-        job_queue = Queue()
-        processes = []
-        for i in range(num_processes):
-            p = Process(
-                target=process_job,
-                args=(job_queue, labels, output_path, save_data, extra_args),
-            )
-            processes.append(p)
-            p.start()
-        added = 0
-        for source_file in source_files:
-            job_queue.put((samples_by_source[source_file]))
-            added += 1
-            while job_queue.qsize() > num_processes * 10:
-                logging.info("Sleeping for %s", 10)
-                # give it a chance to catch up
-                time.sleep(10)
-
-        logging.info("Processing %d", job_queue.qsize())
-        for i in range(len(processes)):
-            job_queue.put(("DONE"))
-        for process in processes:
-            try:
-                process.join()
-            except KeyboardInterrupt:
-                logging.info("KeyboardInterrupt, terminating.")
-                for process in processes:
-                    process.terminate()
-                exit()
-        logging.info("Saved %s", len(dataset.samples_by_id))
+        while index < len(source_files):
+            job_queue = Queue()
+            processes = []
+            for i in range(num_processes):
+                p = Process(
+                    target=process_job,
+                    args=(
+                        job_queue,
+                        labels,
+                        output_path,
+                        save_data,
+                        writer_i,
+                        extra_args,
+                    ),
+                )
+                processes.append(p)
+                p.start()
+            added = 0
+            writer_i += 1
+            for source_file in source_files[index : index + jobs_per_process]:
+                job_queue.put((samples_by_source[source_file]))
+                added += 1
+                while job_queue.qsize() > num_processes * 10:
+                    logging.info("Sleeping for %s", 10)
+                    # give it a chance to catch up
+                    time.sleep(10)
+            index += jobs_per_process
+            logging.info("Processing %d", job_queue.qsize())
+            for i in range(len(processes)):
+                job_queue.put(("DONE"))
+            for process in processes:
+                try:
+                    process.join()
+                except KeyboardInterrupt:
+                    logging.info("KeyboardInterrupt, terminating.")
+                    for process in processes:
+                        process.terminate()
+                    exit()
+            logging.info("Saved %s", len(dataset.samples_by_id))
     except:
         logging.error("Error saving track info", exc_info=True)
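Aside: the memory-limiting idea in patch 103 is to process source files in fixed-size chunks and tear the worker processes down between chunks so their memory is returned to the OS. A stripped-down sketch of that loop, omitting the TFRecord plumbing (names here are illustrative, not the repository's API):

# Stripped-down sketch of chunked multiprocessing from patch 103.
from multiprocessing import Process, Queue

def worker(queue):
    while True:
        job = queue.get()
        if job == "DONE":
            break
        # ... serialise this job's samples here ...

def run_in_chunks(jobs, num_processes=8, jobs_per_chunk=2400):
    index = 0
    while index < len(jobs):
        queue = Queue()
        procs = [Process(target=worker, args=(queue,)) for _ in range(num_processes)]
        for p in procs:
            p.start()
        for job in jobs[index : index + jobs_per_chunk]:
            queue.put(job)
        index += jobs_per_chunk
        for _ in procs:
            queue.put("DONE")
        for p in procs:
            p.join()  # workers (and their memory) are released every chunk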
From 50faca5b73aa579094c88c7159f76ef6dd884e88 Mon Sep 17 00:00:00 2001
From: gferraro
Date: Mon, 28 Oct 2024 14:39:18 +0100
Subject: [PATCH 104/117] dont validate bins for after date test clips

---
 src/build.py             | 13 +++++++++++--
 src/ml_tools/tfwriter.py |  4 ++--
 2 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/src/build.py b/src/build.py
index c8c51cb8..8a53f26a 100644
--- a/src/build.py
+++ b/src/build.py
@@ -571,7 +571,7 @@ def add_samples(
     dataset.add_samples(samples)

-def validate_datasets(datasets, test_bins, date):
+def validate_datasets(datasets, test_bins, after_date):
     # check that clips are only in one dataset
     # that only test set has clips after date
     # that test set is the only dataset with test_clips
     # for track in dataset.tracks:
     #     assert track.start_time < date
@@ -580,7 +580,7 @@ def validate_datasets(datasets, test_bins, after_date):
-    for i, dataset in enumerate(datasets):
+    for i, dataset in enumerate(datasets[:2]):
         dont_check = set(
             [
                 sample.bin_id
@@ -608,6 +608,15 @@ def validate_datasets(datasets, test_bins, after_date):
                 if sample.label in split_by_clip
             ]
         )
+        if dataset.name == "test" and after_date is not None:
+            dont_check_other = set(
+                [
+                    sample.bin_id
+                    for sample in other.samples_by_id.values()
+                    if sample.rec_time > after_date
+                ]
+            )
+            dont_check = dont_check + dont_check_other
         other_bins = set([sample.bin_id for sample in other.samples_by_id.values()])
         other_bins = other_bins - dont_check
         other_clips = set(
diff --git a/src/ml_tools/tfwriter.py b/src/ml_tools/tfwriter.py
index 9b51b29a..8658fb18 100644
--- a/src/ml_tools/tfwriter.py
+++ b/src/ml_tools/tfwriter.py
@@ -46,7 +46,7 @@ def process_job(queue, labels, base_dir, save_data, writer_i, extra_args):
     # writer_i = 1
     name = f"{writer_i}-{pid}.tfrecord"
-
+    logging.info("Writing to %s", name)
     options = tf.io.TFRecordOptions(compression_type="GZIP")
     writer = tf.io.TFRecordWriter(str(base_dir / name), options=options)
@@ -108,7 +108,7 @@ def create_tf_records(
     num_processes = 8
     writer_i = 0
     index = 0
-    jobs_per_process = 300 * num_processes
+    jobs_per_process = 3000 * num_processes
     try:
         while index < len(source_files):

From 37c68e1291c953f7d85abd17c2578d9795078c5f Mon Sep 17 00:00:00 2001
From: gferraro
Date: Mon, 28 Oct 2024 14:50:21 +0100
Subject: [PATCH 105/117] added start time

---
 src/build.py                      | 2 +-
 src/ml_tools/datasetstructures.py | 5 +++--
 src/ml_tools/rawdb.py             | 2 ++
 3 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/src/build.py b/src/build.py
index 8a53f26a..b61ea270 100644
--- a/src/build.py
+++ b/src/build.py
@@ -608,7 +608,7 @@ def validate_datasets(datasets, test_bins, after_date):
                 if sample.label in split_by_clip
             ]
         )
-        if dataset.name == "test" and after_date is not None:
+        if other.name == "test" and after_date is not None:
             dont_check_other = set(
                 [
diff --git a/src/ml_tools/datasetstructures.py b/src/ml_tools/datasetstructures.py
index bc42aec0..731e65a2 100644
--- a/src/ml_tools/datasetstructures.py
+++ b/src/ml_tools/datasetstructures.py
@@ -141,7 +141,7 @@ def __init__(
         ffc_frames=None,
         sample_frames_indices=None,
         station_id=None,
-        rec_time=None,
+        start_time=None,
         source_file=None,
         camera=None,
         confidence=None,
@@ -153,7 +153,7 @@ def __init__(

         self.fp_frames = fp_frames
-
+        self.start_time = start_time
         # regions that megadetector found nothing in
         self.mega_missed_regions = mega_missed_regions
         self.station_id = station_id
diff --git a/src/ml_tools/rawdb.py b/src/ml_tools/rawdb.py
index 00c14593..e99200f6 100644
--- a/src/ml_tools/rawdb.py
+++ b/src/ml_tools/rawdb.py
@@ -23,6 +23,7 @@ from track.cliptrackextractor import is_affected_by_ffc
 from cptv_rs_python_bindings import CptvReader
 from ml_tools.rectangle import Rectangle
 from config.buildconfig import BuildConfig
+from datetime import timedelta

 special_datasets = [
     "tag_frames",
@@ -234,6 +235,7 @@ def get_clip_tracks(self, tag_precedence):
                 mega_missed_regions=track_meta.get("mega_missed_regions"),
                 station_id=clip_header.station_id,
                 fp_frames=fp_frames,
+                start_time=clip_header.rec_time + timedelta(seconds=start / FPS),
                 # frame_temp_median=frame_temp_median,
             )
             clip_header.tracks.append(header)
From 600af583dd4ec0b294506d7e8f1e25e156608458 Mon Sep 17 00:00:00 2001
From: gferraro
Date: Tue, 29 Oct 2024 08:36:30 +0100
Subject: [PATCH 106/117] union set

---
 src/build.py             | 2 +-
 src/ml_tools/tfwriter.py | 5 +----
 2 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/src/build.py b/src/build.py
index b61ea270..ef416f2c 100644
--- a/src/build.py
+++ b/src/build.py
@@ -616,7 +616,7 @@ def validate_datasets(datasets, test_bins, after_date):
                     if sample.rec_time > after_date
                 ]
             )
-            dont_check = dont_check + dont_check_other
+            dont_check = dont_check | dont_check_other
         other_bins = set([sample.bin_id for sample in other.samples_by_id.values()])
         other_bins = other_bins - dont_check
         other_clips = set(
diff --git a/src/ml_tools/tfwriter.py b/src/ml_tools/tfwriter.py
index 8658fb18..18f628e1 100644
--- a/src/ml_tools/tfwriter.py
+++ b/src/ml_tools/tfwriter.py
@@ -132,10 +132,7 @@ def create_tf_records(
             for source_file in source_files[index : index + jobs_per_process]:
                 job_queue.put((samples_by_source[source_file]))
                 added += 1
-                while job_queue.qsize() > num_processes * 10:
-                    logging.info("Sleeping for %s", 10)
-                    # give it a chance to catch up
-                    time.sleep(10)
+
             index += jobs_per_process
             logging.info("Processing %d", job_queue.qsize())
             for i in range(len(processes)):

From 43180b7de38b65066db3636c0d5f4068a7de0083 Mon Sep 17 00:00:00 2001
From: gferraro
Date: Tue, 5 Nov 2024 20:25:35 +0100
Subject: [PATCH 107/117] repeat frames at random rather than only last frame

---
 src/classify/clipclassifier.py    | 12 +++++++++-
 src/classify/trackprediction.py   | 13 ++++++----
 src/ml_tools/datasetstructures.py |  5 ++--
 src/ml_tools/imageprocessing.py   | 12 ++--------
 src/ml_tools/interpreter.py       |  5 +++-
 src/ml_tools/preprocess.py        |  8 +++++++
 src/ml_tools/previewer.py         |  8 +++---
 src/track/clip.py                 |  1 -
 src/track/track.py                | 40 ++++++++++++++++-------------
 9 files changed, 64 insertions(+), 40 deletions(-)

diff --git a/src/classify/clipclassifier.py b/src/classify/clipclassifier.py
index 9bcaa135..9e7dd279 100644
--- a/src/classify/clipclassifier.py
+++ b/src/classify/clipclassifier.py
@@ -245,7 +245,17 @@ def save_metadata(
             prediction = predictions.prediction_for(track.get_id())
             if prediction is None:
                 continue
-
+            # DEBUGGING STUFF REMOVE ME
+            # logging.info("Track predictions %s", track)
+            # for p in prediction.predictions:
+            #     logging.info(
+            #         "Have %s sum %s smoothed %s mass %s",
+            #         p,
+            #         np.sum(p.prediction),
+            #         np.round(p.smoothed_prediction),
+            #         p.mass,
+            #     )
+            # logging.info("smoothed %s", np.round(100 * prediction.class_best_score))
             prediction_meta = prediction.get_metadata()
             prediction_meta["model_id"] = model_id
             prediction_info.append(prediction_meta)
diff --git a/src/classify/trackprediction.py b/src/classify/trackprediction.py
index 94afe43b..a9af5056 100644
--- a/src/classify/trackprediction.py
+++ b/src/classify/trackprediction.py
@@ -110,18 +110,23 @@ def __init__(self, track_id, labels, keep_all=True, start_frame=None):
         self.masses = []

     def classified_clip(
-        self, predictions, smoothed_predictions, prediction_frames, top_score=None
+        self,
+        predictions,
+        smoothed_predictions,
+        prediction_frames,
+        masses,
+        top_score=None,
     ):
         self.num_frames_classified = len(predictions)
-        for prediction, smoothed_prediction, frames in zip(
-            predictions, smoothed_predictions, prediction_frames
+        for prediction, smoothed_prediction, frames, mass in zip(
+            predictions, smoothed_predictions, prediction_frames, masses
         ):
             prediction = Prediction(
                 prediction,
                 smoothed_prediction,
                 frames,
                 np.amax(frames),
-                None,
+                mass,
             )
             self.predictions.append(prediction)
diff --git a/src/ml_tools/datasetstructures.py b/src/ml_tools/datasetstructures.py
index 731e65a2..6a7fbd61 100644
--- a/src/ml_tools/datasetstructures.py
+++ b/src/ml_tools/datasetstructures.py
@@ -1058,6 +1058,7 @@ def get_segments(
     segment_count = int(segment_count)
     if max_segments is not None:
         segment_count = min(max_segments, segment_count)
+
     # take any segment_width frames, this could be done each epoch
     whole_indices = frame_indices
     random_frames = segment_type in [
@@ -1074,8 +1075,7 @@ def get_segments(
             np.random.shuffle(frame_indices)
         for i in range(segment_count):
             # always get at least one segment, not doing anymore
-            # if i > 0:
-            if (len(frame_indices) < segment_width and len(segments) > 1) or len(
+            if (len(frame_indices) < segment_width / 2.0 and len(segments) > 1) or len(
                 frame_indices
             ) < segment_width / 4:
                 break
@@ -1089,6 +1089,7 @@ def get_segments(
                 frames = section[indices]
+                # might need to change that gp 11/05 - 2024
                 frame_indices = frame_indices[segment_frame_spacing:]
             elif random_frames:
                 # frame indices already randomized so just need to grab some
diff --git a/src/ml_tools/imageprocessing.py b/src/ml_tools/imageprocessing.py
index 4eeebcac..42312dee 100644
--- a/src/ml_tools/imageprocessing.py
+++ b/src/ml_tools/imageprocessing.py
@@ -34,8 +34,6 @@ def resize_and_pad(
         resize_dim = (width, height)
     if pad is None:
         pad = np.min(frame)
-    else:
-        pad = 0

     resized = np.full(new_dim, pad, dtype=frame.dtype)
     offset_x = 0
@@ -80,20 +78,14 @@ def resize_cv(image, dim, interpolation=cv2.INTER_LINEAR, extra_h=0, extra_v=0):
     )

-def square_clip(data, frames_per_row, tile_dim, normalize=True):
+def square_clip(data, frames_per_row, tile_dim, frame_samples, normalize=True):
     # lay each frame out side by side in rows
     new_frame = np.zeros((frames_per_row * tile_dim[0], frames_per_row * tile_dim[1]))
     i = 0
     success = False
     for x in range(frames_per_row):
         for y in range(frames_per_row):
-            if i >= len(data):
-                frame = data[-1]
-            else:
-                frame = data[i]
-
-            # cv2.imshow("frame", np.uint8(frame))
-            # cv2.waitKey(0)
+            frame = data[frame_samples[i]]
             if normalize:
                 frame, stats = normalize(frame, new_max=255)
                 if not stats[0]:
diff --git a/src/ml_tools/interpreter.py b/src/ml_tools/interpreter.py
index 12a5c57b..2b299181 100644
--- a/src/ml_tools/interpreter.py
+++ b/src/ml_tools/interpreter.py
@@ -146,6 +146,7 @@ def classify_track(self, clip, track, segment_frames=None):
         # self.model.predict(preprocessed)
         top_score = None
         smoothed_predictions = None
+
         if self.params.smooth_predictions:
             masses = np.array(masses)
             top_score = np.sum(masses)
@@ -155,6 +156,7 @@ def classify_track(self, clip, track, segment_frames=None):
             output,
             smoothed_predictions,
             prediction_frames,
+            masses,
             top_score=top_score,
         )
         track_prediction.classify_time = time.time() - start
@@ -213,6 +215,7 @@ def preprocess_frames(
                 diff_frame = region.subimage(f.thermal) - region.subimage(
                     clip.background
                 )
+
                 new_max = np.amax(diff_frame)
                 new_min = np.amin(diff_frame)
                 if min_diff is None or new_min < min_diff:
@@ -299,7 +302,7 @@ def preprocess_segments(
             from_last=predict_from_last,
             max_segments=max_segments,
             dont_filter=dont_filter,
-            filter_by_fp = False,
+            filter_by_fp=False,
         )
         frame_indices = set()
         for segment in segments:
diff --git a/src/ml_tools/preprocess.py b/src/ml_tools/preprocess.py
index fe02199a..9ab61c8f 100644
--- a/src/ml_tools/preprocess.py
+++ b/src/ml_tools/preprocess.py
@@ -147,6 +147,13 @@ def preprocess_movement(
 ):
     frame_types = {}
     data = []
+    frame_samples = list(np.arange(len(preprocess_frames)))
+    if len(preprocess_frames) < frames_per_row * 5:
+        extra_samples = np.random.choice(
+            frame_samples, frames_per_row * 5 - len(preprocess_frames)
+        )
+        frame_samples.extend(extra_samples)
+        frame_samples.sort()
     for channel in channels:
         if isinstance(channel, str):
             channel = TrackChannels[channel]
@@ -158,6 +165,7 @@ def preprocess_movement(
             channel_segment,
             frames_per_row,
             (frame_size, frame_size),
+            frame_samples,
             normalize=False,
         )
         # already done normalization
diff --git a/src/ml_tools/previewer.py b/src/ml_tools/previewer.py
index ddd203ea..34255047 100644
--- a/src/ml_tools/previewer.py
+++ b/src/ml_tools/previewer.py
@@ -91,8 +91,8 @@ def export_clip_preview(self, filename, clip: Clip, predictions=None):
         if self.debug:
             footer = Previewer.stats_footer(clip.stats)
         if predictions and (
-            self.preview_type == self.PREVIEW_CLASSIFIED
-            or self.preview_type == self.PREVIEW_TRACKING
+            self.preview_type == PREVIEW_CLASSIFIED
+            or self.preview_type == PREVIEW_TRACKING
         ):
             self.create_track_descriptions(clip, predictions)

@@ -103,14 +103,14 @@ def export_clip_preview(self, filename, clip: Clip, predictions=None):
         res_x = clip.res_x
         res_y = clip.res_y
-        if self.preview_type == self.PREVIEW_TRACKING:
+        if self.preview_type == PREVIEW_TRACKING:
             res_x *= 2
             res_y *= 2

         mpeg = MPEGCreator(str(filename))
         frame_scale = 4
         for frame_number, frame in enumerate(clip.frame_buffer):
-            if self.preview_type == self.PREVIEW_RAW:
+            if self.preview_type == PREVIEW_RAW:
                 image = self.convert_and_resize(
                     frame.thermal, clip.stats.min_temp, clip.stats.max_temp, clip.type
                 )
diff --git a/src/track/clip.py b/src/track/clip.py
index e21ccbc8..72cc489d 100644
--- a/src/track/clip.py
+++ b/src/track/clip.py
@@ -185,7 +185,6 @@ def calculate_background(self, frame_reader):
                 self.update_background(frame.pix)
             self._background_calculated()
             return
-        first_frame = frame

         initial_frames = None
         initial_diff = None
diff --git a/src/track/track.py b/src/track/track.py
index 5550391c..165ee39d 100644
--- a/src/track/track.py
+++ b/src/track/track.py
@@ -444,6 +444,7 @@ def get_segments(
         max_segments=None,
         ffc_frames=None,
         dont_filter=False,
+        filter_by_fp=False,
     ):
         if from_last is not None:
             if from_last == 0:
@@ -476,23 +477,28 @@ def get_segments(
                 )
                 segments.append(segment)
         else:
-            segments, _ = get_segments(
-                self.clip_id,
-                self._id,
-                start_frame,
-                segment_frame_spacing=segment_frame_spacing,
-                segment_width=segment_width,
-                regions=regions,
-                ffc_frames=ffc_frames,
-                repeats=repeats,
-                # frame_temp_median=frame_temp_median,
-                min_frames=min_frames,
-                segment_frames=None,
-                segment_type=segment_type,
-                max_segments=max_segments,
-                dont_filter=dont_filter,
-            )
-        return segments
+            all_segments = []
+            for seg_type in [SegmentType.ALL_RANDOM, SegmentType.ALL_SECTIONS]:
+                segments, _ = get_segments(
+                    self.clip_id,
+                    self._id,
+                    start_frame,
+                    segment_frame_spacing=segment_frame_spacing,
+                    segment_width=segment_width,
+                    regions=regions,
+                    ffc_frames=ffc_frames,
+                    repeats=repeats,
+                    # frame_temp_median=frame_temp_median,
+                    min_frames=min_frames,
+                    segment_frames=None,
+                    segment_type=seg_type,
+                    max_segments=max_segments,
+                    dont_filter=dont_filter,
+                    # segment_type=seg_type,
+                )
+                all_segments.extend(segments)
+
+            return all_segments

     @classmethod
     def from_region(cls, clip, region, tracker_version=None, tracking_config=None):
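Aside: the core of patch 107 is that a segment shorter than the target width is padded by re-sampling random frames rather than by repeating the last frame. A small numpy sketch of that padding, as an illustration (the function name is hypothetical):

# Sketch of patch 107's padding: fill short segments with randomly
# re-sampled frame indices instead of copies of the final frame.
import numpy as np

def pad_segment(frame_indices, segment_width):
    frames = list(frame_indices)
    if len(frames) < segment_width:
        extra = np.random.choice(frames, segment_width - len(frames))
        frames.extend(extra)
    frames.sort()
    return frames

print(pad_segment([3, 7, 11], 9))  # e.g. [3, 3, 3, 7, 7, 7, 7, 11, 11]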
From 7ca8817b43f7f67784eabcd6d2dd986634d7c819 Mon Sep 17 00:00:00 2001
From: gferraro
Date: Tue, 5 Nov 2024 20:26:58 +0100
Subject: [PATCH 108/117] less jobs

---
 src/ml_tools/tfwriter.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/ml_tools/tfwriter.py b/src/ml_tools/tfwriter.py
index 18f628e1..d40cf8ac 100644
--- a/src/ml_tools/tfwriter.py
+++ b/src/ml_tools/tfwriter.py
@@ -108,7 +108,7 @@ def create_tf_records(
     num_processes = 8
     writer_i = 0
     index = 0
-    jobs_per_process = 3000 * num_processes
+    jobs_per_process = 600 * num_processes
     try:
         while index < len(source_files):

From 46b431b641a6e746abb835c3874a119b0071ef14 Mon Sep 17 00:00:00 2001
From: gferraro
Date: Thu, 7 Nov 2024 18:10:18 +0100
Subject: [PATCH 109/117] add multiple segment type option

---
 src/build.py                      |   3 +-
 src/ml_tools/dataset.py           |  46 +---
 src/ml_tools/datasetstructures.py | 343 ++++++++++++++++--------------
 src/ml_tools/hyperparams.py       |  16 +-
 src/ml_tools/interpreter.py       |   3 +-
 src/ml_tools/kerasmodel.py        |   1 -
 src/ml_tools/tfwriter.py          |  31 +--
 src/ml_tools/thermalwriter.py     |   2 +-
 src/ml_tools/tools.py             |   4 +-
 src/modelevaluate.py              |   8 +-
 src/track/track.py                |  38 ++--
 11 files changed, 226 insertions(+), 269 deletions(-)

diff --git a/src/build.py b/src/build.py
index ef416f2c..3c766af5 100644
--- a/src/build.py
+++ b/src/build.py
@@ -896,7 +896,7 @@ def main():
             {
                 "segment_frame_spacing": master_dataset.segment_spacing * 9,
                 "segment_width": master_dataset.segment_length,
-                "segment_type": master_dataset.segment_type,
+                "segment_types": master_dataset.segment_types,
                 "segment_min_avg_mass": master_dataset.segment_min_avg_mass,
                 "max_segments": master_dataset.max_segments,
                 "dont_filter_segment": True,
@@ -932,6 +932,7 @@ def main():
         "counts": dataset_counts,
         "by_label": False,
        "config": attrs.asdict(config),
+        "segment_types": master_dataset.segment_types,
     }

     with open(meta_filename, "w") as f:
diff --git a/src/ml_tools/dataset.py b/src/ml_tools/dataset.py
index 8556c5f9..7e633b32 100644
--- a/src/ml_tools/dataset.py
+++ b/src/ml_tools/dataset.py
@@ -83,7 +83,7 @@ def __init__(
         self.excluded_tags = config.build.excluded_tags
         self.min_frame_mass = config.build.min_frame_mass
         self.filter_by_lq = config.build.filter_by_lq
-        self.segment_type = SegmentType.ALL_RANDOM
+        self.segment_types = [SegmentType.ALL_RANDOM]
         self.max_segments = config.build.max_segments
         self.country = config.build.country
         self.max_frames = config.build.max_frames
@@ -100,7 +100,7 @@ def __init__(
             self.segment_spacing = 1
             self.segment_min_avg_mass = 10
             self.min_frame_mass = 16
-            self.segment_type = SegmentType.ALL_RANDOM
+            self.segment_types = [SegmentType.ALL_RANDOM]
             self.max_frames = 75
         self.country_rectangle = BuildConfig.COUNTRY_LOCATIONS.get(self.country)
@@ -244,7 +244,7 @@ def load_clip(self, db_clip, dont_filter_segment=False):
                 track_header.get_segments(
                     segment_width,
                     segment_frame_spacing,
-                    self.segment_type,
+                    self.segment_types,
                     self.segment_min_avg_mass,
                     max_segments=self.max_segments,
                     dont_filter=dont_filter_segment,
@@ -504,46 +504,6 @@ def regroup(

     def has_data(self):
         return len(self.samples_by_id) > 0

-    #
-    # def recalculate_segments(self, segment_type=SegmentType.ALL_RANDOM):
-    #     self.samples_by_bin.clear()
-    #     self.samples_by_label.clear()
-    #     del self.samples[:]
-    #     del self.samples
-    #     self.samples = []
-    #     self.samples_by_label = {}
-    #     self.samples_by_bin = {}
-    #     logging.info("%s generating segments type %s", self.name, segment_type)
-    #     start = time.time()
-    #     empty_tracks = []
-    #     filtered_stats = 0
-    #
-    #     for track in self.tracks:
-    #         segment_frame_spacing = int(
-    #             round(self.segment_spacing * track.frames_per_second)
-    #         )
-    #         segment_width = self.segment_length
-    #         track.calculate_segments(
-    #             segment_frame_spacing,
-    #             segment_width,
-    #             segment_type,
-    #             segment_min_mass=segment_min_avg_mass,
-    #         )
-    #         filtered_stats = filtered_stats + track.filtered_stats["segment_mass"]
-    #         if len(track.segments) == 0:
-    #             empty_tracks.append(track)
-    #             continue
-    #         for sample in track.segments:
-    #             self.add_clip_sample_mappings(sample)
-    #
-    #     self.rebuild_cdf()
-    #     logging.info(
-    #         "%s #segments %s filtered stats are %s took %s",
-    #         self.name,
-    #         len(self.samples),
-    #         filtered_stats,
-    #         time.time() - start,
-    #     )
     def remove_sample_by_id(self, id, bin_id):
         del self.samples_by_id[id]
         try:
diff --git a/src/ml_tools/datasetstructures.py b/src/ml_tools/datasetstructures.py
index 6a7fbd61..daa41741 100644
--- a/src/ml_tools/datasetstructures.py
+++ b/src/ml_tools/datasetstructures.py
@@ -367,7 +367,7 @@ def get_segments(
         self,
         segment_width,
         segment_frame_spacing=9,
-        segment_type=SegmentType.ALL_RANDOM,
+        segment_types=[SegmentType.ALL_RANDOM],
         segment_min_mass=None,
         repeats=1,
         max_segments=None,
@@ -389,7 +389,7 @@ def get_segments(
         # in python3.7+ can just take the values and it guarantees order it was added to dict
         regions = self.bounds_history
-        self.samples, self.filtered_stats = get_segments(
+        self.samples, filtered_stats = get_segments(
             self.clip_id,
             self.track_id,
             self.start_frame,
@@ -402,7 +402,7 @@ def get_segments(
             lower_mass=self.lower_mass,
             repeats=repeats,
             min_frames=min_frames,
-            segment_type=segment_type,
+            segment_types=segment_types,
             max_segments=max_segments,
             station_id=self.station_id,
             source_file=self.source_file,
@@ -412,6 +412,7 @@ def get_segments(
             fp_frames=self.fp_frames if filter_by_fp else None,
             rec_time=self.start_time,
         )
+        self.filtered_stats.update(filtered_stats)
         # GP could get this from the tracks when writing
         # but might be best to keep samples independent for ease
         for s in self.samples:
@@ -974,8 +975,7 @@ def get_segments(
     lower_mass=0,
     repeats=1,
     min_frames=None,
-    segment_frames=None,
-    segment_type=SegmentType.ALL_RANDOM,
+    segment_types=[SegmentType.ALL_RANDOM],
     max_segments=None,
     location=None,
     station_id=None,
@@ -986,9 +986,8 @@ def get_segments(
     skip_ffc=True,
     frame_min_mass=None,
     fp_frames=None,
+    repeat_frame_indices=True,
 ):
-    if segment_type == SegmentType.ALL_RANDOM_NOMIN:
-        segment_min_mass = None
     if min_frames is None:
         min_frames = segment_width / 4.0
     segments = []
@@ -997,163 +996,189 @@ def get_segments(

     has_no_mass = np.sum(mass_history) == 0

-    frame_indices = [
-        region.frame_number
-        for region in regions
-        if (has_no_mass or region.mass > 0)
-        and (
-            ffc_frames is None
-            or skip_ffc is False
-            or region.frame_number not in ffc_frames
-        )
-        and not region.blank
-        and region.width > 0
-        and region.height > 0
-        and ((has_no_mass or frame_min_mass is None) or region.mass >= frame_min_mass)
-    ]
-    if fp_frames is not None and label not in FP_LABELS:
-        frame_indices = [f for f in frame_indices if f not in fp_frames]
-    if len(frame_indices) == 0:
-        logging.warn("Nothing to load for %s - %s", clip_id, track_id)
-        return [], filtered_stats
-    if segment_min_mass is not None:
-        segment_min_mass = min(
-            segment_min_mass,
-            np.median(mass_history[frame_indices - start_frame]),
-        )
-    else:
-        segment_min_mass = 1
-    # remove blank frames
-
-    if segment_type == SegmentType.TOP_RANDOM:
-        # take top 50 mass frames
-        frame_indices = sorted(
-            frame_indices,
-            key=lambda f_i: mass_history[f_i - start_frame],
-            reverse=True,
-        )
-        frame_indices = frame_indices[:50]
-        frame_indices.sort()
-    if segment_type == SegmentType.TOP_SEQUENTIAL:
-        return get_top_mass_segments(
-            clip_id,
-            track_id,
-            label,
-            camera,
-            segment_width,
-            segment_frame_spacing,
-            mass_history,
-            ffc_frames,
-            regions,
-            start_frame,
-            lower_mass,
-            segment_min_mass,
-            source_file=source_file,
-        )
-    if len(frame_indices) < min_frames:
-        filtered_stats["too short"] += 1
-        return segments, filtered_stats
-    frame_indices = np.array(frame_indices)
-    segment_count = max(1, len(frame_indices) // segment_frame_spacing)
-    segment_count = int(segment_count)
-    if max_segments is not None:
-        segment_count = min(max_segments, segment_count)
-
-    # take any segment_width frames, this could be done each epoch
-    whole_indices = frame_indices
-    random_frames = segment_type in [
-        SegmentType.IMPORTANT_RANDOM,
-        SegmentType.ALL_RANDOM,
-        SegmentType.ALL_RANDOM_NOMIN,
-        SegmentType.TOP_RANDOM,
-        None,
-    ]
-    for _ in range(repeats):
-        frame_indices = whole_indices.copy()
-        if random_frames:
-            # random_frames and not random_sections:
-            np.random.shuffle(frame_indices)
-        for i in range(segment_count):
-            # always get at least one segment, not doing anymore
-            if (len(frame_indices) < segment_width / 2.0 and len(segments) > 1) or len(
-                frame_indices
-            ) < segment_width / 4:
-                break
+    for segment_type in segment_types:
+        s_min_mass = segment_min_mass
+        if segment_type == SegmentType.ALL_RANDOM_NOMIN:
+            s_min_mass = None
+
+        frame_indices = [
+            region.frame_number
+            for region in regions
+            if (has_no_mass or region.mass > 0)
+            and (
+                ffc_frames is None
+                or skip_ffc is False
+                or region.frame_number not in ffc_frames
+            )
+            and not region.blank
+            and region.width > 0
+            and region.height > 0
+            and (
+                (has_no_mass or frame_min_mass is None) or region.mass >= frame_min_mass
+            )
+        ]
+        if fp_frames is not None and label not in FP_LABELS:
+            frame_indices = [f for f in frame_indices if f not in fp_frames]
+        if len(frame_indices) == 0:
+            logging.warn("Nothing to load for %s - %s", clip_id, track_id)
+            return [], filtered_stats
+        if s_min_mass is not None:
+            s_min_mass = min(
+                s_min_mass,
+                np.median(mass_history[frame_indices - start_frame]),
+            )
+        else:
+            s_min_mass = 1
+        # remove blank frames
+
+        if segment_type == SegmentType.TOP_RANDOM:
+            # take top 50 mass frames
+            frame_indices = sorted(
+                frame_indices,
+                key=lambda f_i: mass_history[f_i - start_frame],
+                reverse=True,
+            )
+            frame_indices = frame_indices[:50]
+            frame_indices.sort()
+        if segment_type == SegmentType.TOP_SEQUENTIAL:
+            new_segments, filtered = get_top_mass_segments(
+                clip_id,
+                track_id,
+                label,
+                camera,
+                segment_width,
+                segment_frame_spacing,
+                mass_history,
+                ffc_frames,
+                regions,
+                start_frame,
+                lower_mass,
+                s_min_mass,
+                source_file=source_file,
+            )
+            segments.extend(new_segments)
+            filtered_stats.merge(filtered)
+            continue
+        if len(frame_indices) < min_frames:
+            filtered_stats["too short"] += 1
+            continue
+
+        frame_indices = np.array(frame_indices)
+        segment_count = max(1, len(frame_indices) // segment_frame_spacing)
+        segment_count = int(segment_count)
+        # probably only counts for all random
+        if max_segments is not None and segment_type not in [SegmentType.ALL_SECTIONS]:
+            segment_count = min(max_segments, segment_count)
+
+        # take any segment_width frames, this could be done each epoch
+        whole_indices = frame_indices
+        random_frames = segment_type in [
+            SegmentType.IMPORTANT_RANDOM,
+            SegmentType.ALL_RANDOM,
+            SegmentType.ALL_RANDOM_NOMIN,
+            SegmentType.TOP_RANDOM,
+            None,
+        ]
+        for _ in range(repeats):
+            frame_indices = whole_indices.copy()
+            if random_frames:
+                # random_frames and not random_sections:
+                np.random.shuffle(frame_indices)
+            for i in range(segment_count):
+                # always get at least one segment, not doing anymore
+                if (
+                    len(frame_indices) < segment_width / 2.0 and len(segments) > 1
+                ) or len(frame_indices) < segment_width / 4:
+                    break
+
+                if segment_type == SegmentType.ALL_SECTIONS:
+                    # random frames from section 2.2 * segment_width
+                    section = frame_indices[: int(segment_width * 2.2)]
+
+                    indices = np.random.choice(
+                        len(section),
+                        min(segment_width, len(section)),
+                        replace=False,
+                    )
+                    frames = section[indices]
+                    # might need to change that gp 11/05 - 2024
+                    frame_indices = frame_indices[segment_width:]
+                elif random_frames:
+                    # frame indices already randomized so just need to grab some
+                    frames = frame_indices[:segment_width]
+                    frame_indices = frame_indices[segment_width:]
+                else:
+                    segment_start = i * segment_frame_spacing
+                    segment_end = segment_start + segment_width
+                    segment_end = min(len(frame_indices), segment_end)
+                    frames = frame_indices[segment_start:segment_end]
+
+                remaining = segment_width - len(frames)
+                # sample another same frames again if need be
+                if remaining > 0:
+                    extra_frames = np.random.choice(
+                        frames,
+                        min(remaining, len(frames)),
+                        replace=False,
+                    )
+                    frames = np.concatenate([frames, extra_frames])
+                frames.sort()
+                relative_frames = frames - start_frame
+                mass_slice = mass_history[relative_frames]
+                segment_mass = np.sum(mass_slice)
segment_avg_mass = segment_mass / len(mass_slice) + filtered = False + if s_min_mass and segment_avg_mass < s_min_mass: + if dont_filter: + filtered = True + else: + filtered_stats["segment_mass"] += 1 + continue + + # temp_slice = frame_temp_median[relative_frames] + region_slice = regions[relative_frames] + movement_data = None + if segment_avg_mass < 50: + segment_weight_factor = 0.75 + elif segment_avg_mass < 100: + segment_weight_factor = 1 + else: + segment_weight_factor = 1.2 + + for z, f in enumerate(frames): + assert region_slice[z].frame_number == f + + if repeat_frame_indices: + # i think this can be default, means we dont need to handle + # short segments elsewhere + if len(frames) < segment_width: + extra_samples = np.random.choice( + frames, segment_width - len(frames) + ) + frames = list(frames) + frames.extend(extra_samples) + frames.sort() + + segment = SegmentHeader( + clip_id, + track_id, + start_frame=start_frame, + frames=segment_width, + weight=segment_weight_factor, + mass=segment_mass, + label=label, + regions=region_slice, + frame_indices=frames, + movement_data=movement_data, + camera=camera, + location=location, + station_id=station_id, + rec_time=rec_time, + source_file=source_file, + filtered=filtered, + ) + segments.append(segment) + return segments, filtered_stats diff --git a/src/ml_tools/hyperparams.py b/src/ml_tools/hyperparams.py index cd6ddb79..b1868fd0 100644 --- a/src/ml_tools/hyperparams.py +++ b/src/ml_tools/hyperparams.py @@ -24,7 +24,7 @@ def insert_defaults(self): self["square_width"] = self.square_width self["frame_size"] = self.frame_size self["segment_width"] = self.segment_width - self["segment_type"] = self.segment_type + self["segment_types"] = self.segment_types self["multi_label"] = True self["diff_norm"] = self.diff_norm self["thermal_diff_norm"] = self.thermal_diff_norm @@ -89,12 +89,14 @@ def segment_width(self): return self.get("segment_width", 25 if self.use_segments else 1) @property - def segment_type(self): - segment_type = self.get("segment_type", SegmentType.ALL_RANDOM.name) - if isinstance(segment_type, str): - return SegmentType[segment_type] - else: - return segment_type + def segment_types(self): + + segment_types = self.get("segment_type", [SegmentType.ALL_RANDOM]) + # convert string to enum type + if isinstance(segment_types[0], str): + for i in range(len(segment_types)): + segment_types[i] = SegmentType[segment_types[i]] + return segment_types @property def mvm(self): diff --git a/src/ml_tools/interpreter.py b/src/ml_tools/interpreter.py index 2b299181..bdac4f53 100644 --- a/src/ml_tools/interpreter.py +++ b/src/ml_tools/interpreter.py @@ -21,6 +21,7 @@ def load_json(self, filename): self.version = metadata.get("version", None) self.labels = metadata["labels"] self.params = HyperParams() + print("Hypers are ", metadata.get("hyperparams", {})) self.params.update(metadata.get("hyperparams", {})) self.data_type = metadata.get("type", "thermal") @@ -298,7 +299,7 @@ def preprocess_segments( ffc_frames=[] if dont_filter else clip.ffc_frames, repeats=1, segment_frames=segment_frames, - segment_type=self.params.segment_type, + segment_types=self.params.segment_types, from_last=predict_from_last, max_segments=max_segments, dont_filter=dont_filter, diff --git a/src/ml_tools/kerasmodel.py b/src/ml_tools/kerasmodel.py index 4e5acf8a..45cbb466 100644 --- a/src/ml_tools/kerasmodel.py +++ b/src/ml_tools/kerasmodel.py @@ -1083,7 +1083,6 @@ def plot_confusion_matrix(cm, class_names): counts = cm.copy() threshold = counts.max() / 2.0 - 
print("Threshold is", threshold, " for ", cm.max()) # Normalize the confusion matrix. cm = np.around(cm.astype("float") / cm.sum(axis=1)[:, np.newaxis], decimals=2) diff --git a/src/ml_tools/tfwriter.py b/src/ml_tools/tfwriter.py index d40cf8ac..983308f2 100644 --- a/src/ml_tools/tfwriter.py +++ b/src/ml_tools/tfwriter.py @@ -12,31 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -from PIL import Image from pathlib import Path from multiprocessing import Process, Queue - -import collections -import hashlib -import io -import json -import multiprocessing import os -import time -from absl import app -from absl import flags from absl import logging import numpy as np -from PIL import Image, ImageOps - import tensorflow as tf -from . import tfrecord_util -from ml_tools import tools -from ml_tools.imageprocessing import normalize, rotate -from track.cliptracker import get_diff_back_filtered -import cv2 -import random -import math def process_job(queue, labels, base_dir, save_data, writer_i, extra_args): @@ -44,7 +25,6 @@ def process_job(queue, labels, base_dir, save_data, writer_i, extra_args): pid = os.getpid() - # writer_i = 1 name = f"{writer_i}-{pid}.tfrecord" logging.info("Writing to %s", name) options = tf.io.TFRecordOptions(compression_type="GZIP") @@ -66,15 +46,8 @@ def process_job(queue, labels, base_dir, save_data, writer_i, extra_args): saved += save_data(samples, writer, labels, extra_args) files += 1 del samples - # if saved > 250000 / num_frames: - # logging.info("Closing old writer") - # writer.close() - # writer_i += 1 - # name = f"{writer_i}-{pid}.tfrecord" - # logging.info("Opening %s", name) - # saved = 0 - # writer = tf.io.TFRecordWriter(str(base_dir / name), options=options) - if i % int(25000 / num_frames) == 0: + + if i % int(2500 / num_frames) == 0: logging.info("Saved %s ", files) gc.collect() writer.flush() diff --git a/src/ml_tools/thermalwriter.py b/src/ml_tools/thermalwriter.py index 7a123460..891edb54 100644 --- a/src/ml_tools/thermalwriter.py +++ b/src/ml_tools/thermalwriter.py @@ -227,7 +227,7 @@ def get_data(clip_samples, extra_args): segment_frame_spacing=extra_args.get( "segment_frame_spacing", 9 ), - segment_type=extra_args.get("segment_type"), + segment_types=extra_args.get("segment_types"), segment_min_mass=extra_args.get("segment_min_avg_mass"), dont_filter=extra_args.get("dont_filter_segment", False), skip_ffc=extra_args.get("skip_ffc", True), diff --git a/src/ml_tools/tools.py b/src/ml_tools/tools.py index 38dd9e90..ce604906 100644 --- a/src/ml_tools/tools.py +++ b/src/ml_tools/tools.py @@ -15,6 +15,7 @@ from pathlib import Path from ml_tools.rectangle import Rectangle from dateutil import parser +from enum import Enum EPISON = 1e-5 @@ -54,7 +55,8 @@ def default(self, obj): return obj.meta_dictionary() elif isinstance(obj, Path): return str(obj) - + elif isinstance(obj, Enum): + return str(obj.name) # Let the base class default method raise the TypeError return json.JSONEncoder.default(self, obj) diff --git a/src/modelevaluate.py b/src/modelevaluate.py index 94ae85e9..fb07124f 100644 --- a/src/modelevaluate.py +++ b/src/modelevaluate.py @@ -463,14 +463,14 @@ def evaluate_dir( masses = np.array(data[4]) masses = masses[:, None] top_score = None - # if model.params.multi_label is True: - # # every label could be 1 for each prediction - # top_score = len(output) + if model.params.multi_label is True: + # # every 
label could be 1 for each prediction + top_score = np.sum(masses) # smoothed = output # else: smoothed = output * masses prediction.classified_clip( - output, smoothed, data[2], top_score=top_score + output, smoothed, data[2], masses, top_score=top_score ) y_true.append(label_mapping.get(label, label)) predicted_labels = [prediction.predicted_tag()] diff --git a/src/track/track.py b/src/track/track.py index 165ee39d..b8264c35 100644 --- a/src/track/track.py +++ b/src/track/track.py @@ -439,7 +439,7 @@ def get_segments( repeats=1, min_frames=0, segment_frames=None, - segment_type=SegmentType.ALL_RANDOM, + segment_types=[SegmentType.ALL_RANDOM], from_last=None, max_segments=None, ffc_frames=None, @@ -477,28 +477,22 @@ def get_segments( ) segments.append(segment) else: - all_segments = [] - for seg_type in [SegmentType.ALL_RANDOM, SegmentType.ALL_SECTIONS]: - segments, _ = get_segments( - self.clip_id, - self._id, - start_frame, - segment_frame_spacing=segment_frame_spacing, - segment_width=segment_width, - regions=regions, - ffc_frames=ffc_frames, - repeats=repeats, - # frame_temp_median=frame_temp_median, - min_frames=min_frames, - segment_frames=None, - segment_type=seg_type, - max_segments=max_segments, - dont_filter=dont_filter, - # segment_type=seg_type, - ) - all_segments.extend(segments) + segments, _ = get_segments( + self.clip_id, + self._id, + start_frame, + segment_frame_spacing=segment_frame_spacing, + segment_width=segment_width, + regions=regions, + ffc_frames=ffc_frames, + repeats=repeats, + min_frames=min_frames, + segment_types=segment_types, + max_segments=max_segments, + dont_filter=dont_filter, + ) - return all_segments + return segments @classmethod def from_region(cls, clip, region, tracker_version=None, tracking_config=None): From e9b51a977237cd848df3a58cf6048f6292240857 Mon Sep 17 00:00:00 2001 From: gferraro Date: Mon, 11 Nov 2024 18:19:42 +0100 Subject: [PATCH 110/117] try random section --- src/classify/clipclassifier.py | 2 ++ src/ml_tools/datasetstructures.py | 37 ++++++++++++++++++++++--------- src/ml_tools/interpreter.py | 1 + src/track/track.py | 2 ++ 4 files changed, 32 insertions(+), 10 deletions(-) diff --git a/src/classify/clipclassifier.py b/src/classify/clipclassifier.py index 9e7dd279..4de7ce0c 100644 --- a/src/classify/clipclassifier.py +++ b/src/classify/clipclassifier.py @@ -182,6 +182,8 @@ def classify_clip(self, clip, model, meta_data, reuse_frames=None): predictions.model_load_time = time.time() - start for i, track in enumerate(clip.tracks): + logging.info("Track id is %s", track.get_id()) + segment_frames = None if reuse_frames: tracks = meta_data.get("tracks") diff --git a/src/ml_tools/datasetstructures.py b/src/ml_tools/datasetstructures.py index daa41741..554fc7d5 100644 --- a/src/ml_tools/datasetstructures.py +++ b/src/ml_tools/datasetstructures.py @@ -987,6 +987,7 @@ def get_segments( frame_min_mass=None, fp_frames=None, repeat_frame_indices=True, + min_segments=None, ): if min_frames is None: min_frames = segment_width / 4.0 @@ -1059,7 +1060,9 @@ def get_segments( segments.extend(new_segments) filtered_stats.merge(filtered) continue - if len(frame_indices) < min_frames: + if len(frame_indices) < min_frames and ( + min_segments == 0 or min_segments is None + ): filtered_stats["too short"] += 1 continue @@ -1069,7 +1072,6 @@ def get_segments( # probably only counts for all random if max_segments is not None and segment_type not in [SegmentType.ALL_SECTIONS]: segment_count = min(max_segments, segment_count) - # take any segment_width frames, 
this could be done each epoch
 whole_indices = frame_indices
 random_frames = segment_type in [
@@ -1079,17 +1081,31 @@ def get_segments(
 SegmentType.TOP_RANDOM,
 None,
 ]
+ random_mask = True
 for _ in range(repeats):
- frame_indices = whole_indices.copy()
- if random_frames:
- # random_frames and not random_sections:
- np.random.shuffle(frame_indices)
+ used_indices = []
+ if not random_mask:
+ frame_indices = whole_indices.copy()
+
+ if random_frames:
+ # random_frames and not random_sections:
+ np.random.shuffle(frame_indices)
+
 for i in range(segment_count):
+ if random_mask:
+ mask_start = i * 25
+ frame_indices = list(whole_indices[0:mask_start].copy())
+ frame_indices.extend(whole_indices[mask_start + 25 :].copy())
+ frame_indices = [f for f in frame_indices if f not in used_indices]
+ frame_indices = np.uint32(frame_indices)
+ np.random.shuffle(frame_indices)
+
 # always get at least one segment, not doing that anymore
- if (
- len(frame_indices) < segment_width / 2.0 and len(segments) > 1
- ) or len(frame_indices) < segment_width / 4:
- break
+ if len(frame_indices) == 0 or len(segments) >= min_segments:
+ if (
+ len(frame_indices) < segment_width / 2.0 and len(segments) > 1
+ ) or len(frame_indices) < segment_width / 4:
+ break

 if segment_type == SegmentType.ALL_SECTIONS:
 # random frames from section 2.2 * segment_width
@@ -1106,6 +1122,7 @@ def get_segments(
 elif random_frames:
 # frame indices already randomized so just need to grab some
 frames = frame_indices[:segment_width]
+ used_indices.extend(frames)
 frame_indices = frame_indices[segment_width:]
 else:
 segment_start = i * segment_frame_spacing
diff --git a/src/ml_tools/interpreter.py b/src/ml_tools/interpreter.py
index bdac4f53..b2dc166d 100644
--- a/src/ml_tools/interpreter.py
+++ b/src/ml_tools/interpreter.py
@@ -304,6 +304,7 @@ def preprocess_segments(
 max_segments=max_segments,
 dont_filter=dont_filter,
 filter_by_fp=False,
+ min_segments=1,
 )
 frame_indices = set()
 for segment in segments:
diff --git a/src/track/track.py b/src/track/track.py
index b8264c35..f265b014 100644
--- a/src/track/track.py
+++ b/src/track/track.py
@@ -445,6 +445,7 @@ def get_segments(
 ffc_frames=None,
 dont_filter=False,
 filter_by_fp=False,
+ min_segments=1,
 ):
 if from_last is not None:
 if from_last == 0:
@@ -490,6 +491,7 @@ def get_segments(
 segment_types=segment_types,
 max_segments=max_segments,
 dont_filter=dont_filter,
+ min_segments=min_segments,
 )

 return segments

From 25ae03b70aa43203dd120a26dafcfc9a7ec8d391 Mon Sep 17 00:00:00 2001
From: gferraro
Date: Mon, 11 Nov 2024 18:23:17 +0100
Subject: [PATCH 111/117] add min path

---
 src/ml_tools/datasetstructures.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/ml_tools/datasetstructures.py b/src/ml_tools/datasetstructures.py
index 554fc7d5..dcb6e200 100644
--- a/src/ml_tools/datasetstructures.py
+++ b/src/ml_tools/datasetstructures.py
@@ -379,6 +379,7 @@ def get_segments(
 from_last=None,
 frame_min_mass=None,
 filter_by_fp=True,
+ min_segments=None,
 ):
 if segment_frames is not None:
 raise Exception("Have not implement this path")
@@ -411,6 +412,7 @@ def get_segments(
 frame_min_mass=frame_min_mass,
 fp_frames=self.fp_frames if filter_by_fp else None,
 rec_time=self.start_time,
+ min_segments=min_segments,
 )
 self.filtered_stats.update(filtered_stats)
 # GP could get this from the tracks when writing

From 7cd7efc5ffd82c36c929b94ee834ac8df4791e8e Mon Sep 17 00:00:00 2001
From: gferraro
Date: Mon, 11 Nov 2024 18:37:24 +0100
Subject: [PATCH 112/117] fix too-small tracks

---
 src/ml_tools/datasetstructures.py | 15 
++++++++++----- src/ml_tools/interpreter.py | 4 +++- src/modelevaluate.py | 3 +-- 3 files changed, 14 insertions(+), 8 deletions(-) diff --git a/src/ml_tools/datasetstructures.py b/src/ml_tools/datasetstructures.py index dcb6e200..ad4b1d1b 100644 --- a/src/ml_tools/datasetstructures.py +++ b/src/ml_tools/datasetstructures.py @@ -1095,11 +1095,16 @@ def get_segments( for i in range(segment_count): if random_mask: - mask_start = i * 25 - frame_indices = list(whole_indices[0:mask_start].copy()) - frame_indices.extend(whole_indices[mask_start + 25 :].copy()) - frame_indices = [f for f in frame_indices if f not in used_indices] - frame_indices = np.uint32(frame_indices) + if len(whole_indices) < 40: + frame_indices = whole_indices.copy() + else: + mask_start = i * 25 + frame_indices = list(whole_indices[0:mask_start].copy()) + frame_indices.extend(whole_indices[mask_start + 25 :].copy()) + frame_indices = [ + f for f in frame_indices if f not in used_indices + ] + frame_indices = np.uint32(frame_indices) np.random.shuffle(frame_indices) # always get atleast one segment, not doing annymore diff --git a/src/ml_tools/interpreter.py b/src/ml_tools/interpreter.py index b2dc166d..44426163 100644 --- a/src/ml_tools/interpreter.py +++ b/src/ml_tools/interpreter.py @@ -126,6 +126,7 @@ def preprocess(self, clip, track, **args): predict_from_last, segment_frames=segment_frames, dont_filter=args.get("dont_filter", False), + min_segments=args.get("min_segments"), ) else: frames, preprocessed, masses = self.preprocess_frames( @@ -290,6 +291,7 @@ def preprocess_segments( predict_from_last=None, segment_frames=None, dont_filter=False, + min_segments=None, ): from ml_tools.preprocess import preprocess_frame, preprocess_movement @@ -304,7 +306,7 @@ def preprocess_segments( max_segments=max_segments, dont_filter=dont_filter, filter_by_fp=False, - min_segments=1, + min_segments=min_segments, ) frame_indices = set() for segment in segments: diff --git a/src/modelevaluate.py b/src/modelevaluate.py index fb07124f..b665972c 100644 --- a/src/modelevaluate.py +++ b/src/modelevaluate.py @@ -381,9 +381,8 @@ def load_clip_data(cptv_file): for track in clip.tracks: try: frames, preprocessed, masses = worker_model.preprocess( - clip_db, track, frames_per_classify=25, dont_filter=True + clip_db, track, frames_per_classify=25, dont_filter=True, min_segments=1 ) - data.append( ( f"{track.clip_id}-{track.get_id()}", From 0ca2c9344b9a29d05a366cc0456310eeea0e8468 Mon Sep 17 00:00:00 2001 From: gferraro Date: Tue, 12 Nov 2024 09:52:15 +0100 Subject: [PATCH 113/117] added mask segment type as default --- src/ml_tools/dataset.py | 2 +- src/ml_tools/datasetstructures.py | 34 +++++++++++++++++++++---------- 2 files changed, 24 insertions(+), 12 deletions(-) diff --git a/src/ml_tools/dataset.py b/src/ml_tools/dataset.py index 7e633b32..87e8ccaa 100644 --- a/src/ml_tools/dataset.py +++ b/src/ml_tools/dataset.py @@ -83,7 +83,7 @@ def __init__( self.excluded_tags = config.build.excluded_tags self.min_frame_mass = config.build.min_frame_mass self.filter_by_lq = config.build.filter_by_lq - self.segment_types = [SegmentType.ALL_RANDOM] + self.segment_types = [SegmentType.ALL_RANDOM_MASKED] self.max_segments = config.build.max_segments self.country = config.build.country self.max_frames = config.build.max_frames diff --git a/src/ml_tools/datasetstructures.py b/src/ml_tools/datasetstructures.py index ad4b1d1b..57d04dcc 100644 --- a/src/ml_tools/datasetstructures.py +++ b/src/ml_tools/datasetstructures.py @@ -30,6 +30,7 @@ class 
SegmentType(Enum):
 ALL_SECTIONS = 5
 TOP_RANDOM = 6
 ALL_RANDOM_NOMIN = 7
+ ALL_RANDOM_MASKED = 8


 class BaseSample(ABC):
@@ -1071,9 +1072,13 @@ def get_segments(
 frame_indices = np.array(frame_indices)
 segment_count = max(1, len(frame_indices) // segment_frame_spacing)
 segment_count = int(segment_count)
+ mask_length = 25
+
 # probably only counts for all random
 if max_segments is not None and segment_type not in [SegmentType.ALL_SECTIONS]:
 segment_count = min(max_segments, segment_count)
+ # adjust size of mask if we take fewer segments
+ mask_length = max(mask_length, len(frame_indices) // segment_count)
 # take any segment_width frames, this could be done each epoch
 whole_indices = frame_indices
 random_frames = segment_type in [
@@ -1081,12 +1086,13 @@ def get_segments(
 SegmentType.ALL_RANDOM,
 SegmentType.ALL_RANDOM_NOMIN,
 SegmentType.TOP_RANDOM,
+ SegmentType.ALL_RANDOM_MASKED,
 None,
 ]
- random_mask = True
+
 for _ in range(repeats):
 used_indices = []
- if not random_mask:
+ if segment_type != SegmentType.ALL_RANDOM_MASKED or len(whole_indices) < 40:
 frame_indices = whole_indices.copy()

 if random_frames:
@@ -1094,21 +1100,27 @@ def get_segments(
 np.random.shuffle(frame_indices)

 for i in range(segment_count):
- if random_mask:
- if len(whole_indices) < 40:
- frame_indices = whole_indices.copy()
- else:
- mask_start = i * 25
- frame_indices = list(whole_indices[0:mask_start].copy())
- frame_indices.extend(whole_indices[mask_start + 25 :].copy())
+ if segment_type == SegmentType.ALL_RANDOM_MASKED:
+ if len(whole_indices) > 40:
+ mask_start = i * mask_length
+ frame_indices = whole_indices[0:mask_start]
+ frame_indices = np.concatenate(
+ [frame_indices, whole_indices[mask_start + mask_length :]],
+ axis=0,
+ )
+ # maybe some faster way of doing this...
 frame_indices = [
 f for f in frame_indices if f not in used_indices
 ]
 frame_indices = np.uint32(frame_indices)
- np.random.shuffle(frame_indices)
+ np.random.shuffle(frame_indices)

 # always get at least one segment, not doing that anymore
- if len(frame_indices) == 0 or len(segments) >= min_segments:
+ if (
+ len(frame_indices) == 0
+ or min_segments is None
+ or len(segments) >= min_segments
+ ):
 if (
 len(frame_indices) < segment_width / 2.0 and len(segments) > 1
 ) or len(frame_indices) < segment_width / 4:
 break

From d39902ec044e1d1b6e427c9b00baac99afffdadd Mon Sep 17 00:00:00 2001
From: gferraro
Date: Tue, 12 Nov 2024 10:03:12 +0100
Subject: [PATCH 114/117] tidy up

---
 src/classify/clipclassifier.py | 13 -------------
 src/ml_tools/hyperparams.py | 1 -
 src/ml_tools/tools.py | 14 +++++++++++---
 src/rebuildDate.py | 2 +-
 4 files changed, 12 insertions(+), 18 deletions(-)

diff --git a/src/classify/clipclassifier.py b/src/classify/clipclassifier.py
index 4de7ce0c..9f6945a7 100644
--- a/src/classify/clipclassifier.py
+++ b/src/classify/clipclassifier.py
@@ -182,8 +182,6 @@ def classify_clip(self, clip, model, meta_data, reuse_frames=None):
 predictions.model_load_time = time.time() - start

 for i, track in enumerate(clip.tracks):
- logging.info("Track id is %s", track.get_id())
-
 segment_frames = None
 if reuse_frames:
 tracks = meta_data.get("tracks")
@@ -247,17 +245,6 @@ def save_metadata(
 prediction = predictions.prediction_for(track.get_id())
 if prediction is None:
 continue
- # DEBUGGING STUFF REMOVE ME
- # logging.info("Track predictions %s", track)
- # for p in prediction.predictions:
- # logging.info(
- # "Have %s sum %s smoothed %s mass %s",
- # p,
- # np.sum(p.prediction),
- # np.round(p.smoothed_prediction),
- # p.mass,
- # )
- # logging.info("smoothed 
%s", np.round(100 * prediction.class_best_score)) prediction_meta = prediction.get_metadata() prediction_meta["model_id"] = model_id prediction_info.append(prediction_meta) diff --git a/src/ml_tools/hyperparams.py b/src/ml_tools/hyperparams.py index b1868fd0..db558eff 100644 --- a/src/ml_tools/hyperparams.py +++ b/src/ml_tools/hyperparams.py @@ -90,7 +90,6 @@ def segment_width(self): @property def segment_types(self): - segment_types = self.get("segment_type", [SegmentType.ALL_RANDOM]) # convert string to enum type if isinstance(segment_types[0], str): diff --git a/src/ml_tools/tools.py b/src/ml_tools/tools.py index ce604906..cad64667 100644 --- a/src/ml_tools/tools.py +++ b/src/ml_tools/tools.py @@ -194,9 +194,17 @@ def saveclassify_image(data, filename): # saves image channels side by side, expected data to be values in the range of 0->1 Path(filename).parent.mkdir(parents=True, exist_ok=True) r = Image.fromarray(np.uint8(data[:, :, 0])) - g = Image.fromarray(np.uint8(data[:, :, 1])) - b = g - # b = Image.fromarray(np.uint8(data[:, :, 2])) + _, _, channels = data.shape + + if channels == 1: + g = r + else: + g = Image.fromarray(np.uint8(data[:, :, 1])) + + if channels == 2: + b = r + else: + b = Image.fromarray(np.uint8(data[:, :, 2])) concat = np.concatenate((r, g, b), axis=1) # horizontally img = Image.fromarray(np.uint8(concat)) img.save(filename + ".png") diff --git a/src/rebuildDate.py b/src/rebuildDate.py index 7693842d..661e2d60 100644 --- a/src/rebuildDate.py +++ b/src/rebuildDate.py @@ -9,7 +9,7 @@ from dateutil.parser import parse as parse_date parser = argparse.ArgumentParser() -parser.add_argument("data_dir", help="Directory of hdf5 files") +parser.add_argument("data_dir", help="Directory of cptv files") args = parser.parse_args() args.data_dir = Path(args.data_dir) latest_date = None From 48a25e3731fb9e5e98d1621abb0ee639cae729e4 Mon Sep 17 00:00:00 2001 From: gferraro Date: Wed, 13 Nov 2024 16:07:36 +0100 Subject: [PATCH 115/117] add check for none --- src/ml_tools/interpreter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ml_tools/interpreter.py b/src/ml_tools/interpreter.py index 44426163..140aef35 100644 --- a/src/ml_tools/interpreter.py +++ b/src/ml_tools/interpreter.py @@ -334,7 +334,7 @@ def preprocess_segments( ) continue f = clip.get_frame(region.frame_number) - if region.blank or region.width <= 0 or region.height <= 0: + if region.blank or region.width <= 0 or region.height <= 0 or f is None: continue f.float_arrays() From 52bfd995dfec9cbcf73ad7436524a675afcfab50 Mon Sep 17 00:00:00 2001 From: gferraro Date: Wed, 13 Nov 2024 16:22:00 +0100 Subject: [PATCH 116/117] tidy up --- src/config/buildconfig.py | 2 +- src/ml_tools/datasetstructures.py | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/src/config/buildconfig.py b/src/config/buildconfig.py index ae5e9baf..cf3812a9 100644 --- a/src/config/buildconfig.py +++ b/src/config/buildconfig.py @@ -43,7 +43,7 @@ class BuildConfig(DefaultConfig): max_frames = attr.ib() EXCLUDED_TAGS = ["poor tracking", "part", "untagged", "unidentified"] - + NO_MIN_FRAMES = ["stoat", "mustelid", "weasel", "ferret"] # country bounding boxs COUNTRY_LOCATIONS = { "AU": Rectangle.from_ltrb( diff --git a/src/ml_tools/datasetstructures.py b/src/ml_tools/datasetstructures.py index 57d04dcc..9a96db8d 100644 --- a/src/ml_tools/datasetstructures.py +++ b/src/ml_tools/datasetstructures.py @@ -6,6 +6,7 @@ from track.region import Region from abc import ABC, abstractmethod from ml_tools.rectangle import 
Rectangle +from config.buildconfig import BuildConfig from ml_tools import imageprocessing from enum import Enum import attr @@ -385,9 +386,11 @@ def get_segments( if segment_frames is not None: raise Exception("Have not implement this path") min_frames = segment_width / 4.0 - if self.label in ["stoat", "mustelid", "weasel", "ferret"]: + if self.label in BuildConfig.NO_MIN_FRAMES: # try and always get one for these min_frames = 0 + if min_segments is None: + min_segments = 1 # in python3.7+ can just take the values and it guarantees order it was added to dict regions = self.bounds_history From 092280fdde4ddb7e98bd4543044ab57a2666ac56 Mon Sep 17 00:00:00 2001 From: gferraro Date: Wed, 13 Nov 2024 16:36:56 +0100 Subject: [PATCH 117/117] fix segment type load for old meta --- src/ml_tools/hyperparams.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/ml_tools/hyperparams.py b/src/ml_tools/hyperparams.py index db558eff..6ed2ba1d 100644 --- a/src/ml_tools/hyperparams.py +++ b/src/ml_tools/hyperparams.py @@ -92,7 +92,10 @@ def segment_width(self): def segment_types(self): segment_types = self.get("segment_type", [SegmentType.ALL_RANDOM]) # convert string to enum type - if isinstance(segment_types[0], str): + if isinstance(segment_types, str): + # old metadata + segment_types = [SegmentType[segment_types]] + elif isinstance(segment_types[0], str): for i in range(len(segment_types)): segment_types[i] = SegmentType[segment_types[i]] return segment_types
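
A note on the sampling scheme introduced in patches 110-113: with ALL_RANDOM_MASKED, each candidate segment draws its frames at random from the track, but with a different contiguous window of frame indices masked out and with frames consumed by earlier draws excluded, so repeated segments from the same track overlap less. Below is a minimal, self-contained sketch of that loop, not the pipeline's implementation: it assumes a plain array of already-filtered frame indices and omits the mass/FFC filtering, repeats, and per-label minimums that the real get_segments applies.

    import numpy as np

    def masked_random_segments(whole_indices, segment_width=25, spacing=9, mask_min=25):
        # whole_indices: sorted frame numbers with usable regions (assumed
        # pre-filtered; the real code also drops FFC, blank and low-mass frames)
        whole_indices = np.asarray(whole_indices, dtype=np.uint32)
        segment_count = max(1, len(whole_indices) // spacing)
        # widen the mask when fewer segments are taken than the track allows
        mask_length = max(mask_min, len(whole_indices) // segment_count)
        used, segments = [], []
        for i in range(segment_count):
            if len(whole_indices) > 40:
                # mask out the i-th contiguous window, then drop frames that
                # earlier segments already consumed
                mask_start = i * mask_length
                pool = np.concatenate(
                    [whole_indices[:mask_start], whole_indices[mask_start + mask_length:]]
                )
                pool = np.uint32([f for f in pool if f not in used])
            else:
                # too short to spare a window: sample from the whole track
                pool = whole_indices.copy()
            np.random.shuffle(pool)
            if len(pool) < segment_width / 4:
                break
            frames = np.sort(pool[:segment_width])
            used.extend(frames.tolist())
            segments.append(frames)
        return segments

    # e.g. a 200-frame track starting at frame 10:
    # segs = masked_random_segments(np.arange(10, 210))

Masking a different window per segment is what keeps repeated samples of a long track from converging on the same high-mass frames; the len > 40 guard simply skips masking on tracks too short to spare one.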