From ec6295942c7b03032631e01fe35af98df0359e38 Mon Sep 17 00:00:00 2001
From: Jonathan Karlsen <JONAK@equinor.com>
Date: Tue, 3 Dec 2024 15:21:27 +0100
Subject: [PATCH] Have snapshots overwrite fm status if next fm starts

This commit makes snapshots overwrite non-finalized states for
forwardmodels if the next forwardmodel has started. This will avoid
forwardmodels being stuck in running/pending if an event is lost. This
is a safe assumption as no forwardmodel will start running before the
previous one has finished successfully.
---
 .../forward_model_runner/reporting/event.py   |  2 +-
 src/ert/ensemble_evaluator/snapshot.py        | 45 +++++++++++++++-
 .../ensemble_evaluator/test_snapshot.py       | 53 +++++++++++++++++++
 tests/ert/unit_tests/test_tracking.py         |  2 +-
 4 files changed, 98 insertions(+), 4 deletions(-)

diff --git a/src/_ert/forward_model_runner/reporting/event.py b/src/_ert/forward_model_runner/reporting/event.py
index f4f140232e1..3d55665c024 100644
--- a/src/_ert/forward_model_runner/reporting/event.py
+++ b/src/_ert/forward_model_runner/reporting/event.py
@@ -67,7 +67,6 @@ def __init__(self, evaluator_url, token=None, cert_path=None):
                 self._cert = f.read()
         else:
             self._cert = None
-
         self._statemachine = StateMachine()
         self._statemachine.add_handler((Init,), self._init_handler)
         self._statemachine.add_handler((Start, Running, Exited), self._job_handler)
@@ -163,6 +162,7 @@ def _job_handler(self, msg: Union[Start, Running, Exited]):
             if msg.success():
                 logger.debug(f"Job {job_name} exited successfully")
                 self._dump_event(ForwardModelStepSuccess(**job_msg))
+
             else:
                 logger.error(
                     _JOB_EXIT_FAILED_STRING.format(
diff --git a/src/ert/ensemble_evaluator/snapshot.py b/src/ert/ensemble_evaluator/snapshot.py
index f2027f4e964..6ca43e4cb0d 100644
--- a/src/ert/ensemble_evaluator/snapshot.py
+++ b/src/ert/ensemble_evaluator/snapshot.py
@@ -276,7 +276,6 @@ def update_from_event(
     ) -> "EnsembleSnapshot":
         e_type = type(event)
         timestamp = event.time
-
         if source_snapshot is None:
             source_snapshot = EnsembleSnapshot()
         if e_type in get_args(RealizationEvent):
@@ -364,7 +363,9 @@ def update_from_event(
                 event.fm_step,
                 fm,
             )
-
+            self._make_sure_previous_fm_is_not_running(
+                event.real, event.fm_step, source_snapshot
+            )
         elif e_type in get_args(EnsembleEvent):
             event = cast(EnsembleEvent, event)
             self._ensemble_state = _ENSEMBLE_TYPE_EVENT_TO_STATUS[type(event)]
@@ -385,6 +386,46 @@ def update_fm_step(
         self._fm_step_snapshots[real_id, fm_step_id].update(fm_step)
         return self
 
+    def _make_sure_previous_fm_is_not_running(
+        self, real_id: str, fm_step_id: str, source_snapshot: "EnsembleSnapshot"
+    ) -> None:
+        if fm_step_id == "0":
+            return
+        previous_fm_step_id = str(int(fm_step_id) - 1)
+        previous_fm_step = source_snapshot._fm_step_snapshots[
+            real_id, previous_fm_step_id
+        ]
+        current_fm_start_time_from_source_snapshot = source_snapshot._fm_step_snapshots[
+            real_id, fm_step_id
+        ].get("start_time")
+        current_fm_start_time_from_update_snapshot = self._fm_step_snapshots[
+            real_id, fm_step_id
+        ].get("start_time")
+        if "status" in previous_fm_step and previous_fm_step["status"] not in [
+            "Failed",
+            "Finished",
+        ]:
+            logger.error(
+                (
+                    f"Did not get finished event for {real_id=} {previous_fm_step_id=}, "
+                    "but next fm was started so we assume it finished successfully and carry on."
+                )
+            )
+            self._fm_step_snapshots[real_id, previous_fm_step_id]["status"] = "Finished"
+            self._fm_step_snapshots[real_id, previous_fm_step_id]["end_time"] = (
+                current_fm_start_time_from_source_snapshot
+                or current_fm_start_time_from_update_snapshot
+            )
+            previous_fm_step["status"] = "Finished"
+            previous_fm_step["end_time"] = (
+                current_fm_start_time_from_source_snapshot
+                or current_fm_start_time_from_update_snapshot
+            )
+            # might affect many of the previous FMs
+            self._make_sure_previous_fm_is_not_running(
+                real_id, previous_fm_step_id, source_snapshot
+            )
+
 
 class FMStepSnapshot(TypedDict, total=False):
     status: Optional[str]
diff --git a/tests/ert/unit_tests/ensemble_evaluator/test_snapshot.py b/tests/ert/unit_tests/ensemble_evaluator/test_snapshot.py
index f0492b1bb92..035bee277db 100644
--- a/tests/ert/unit_tests/ensemble_evaluator/test_snapshot.py
+++ b/tests/ert/unit_tests/ensemble_evaluator/test_snapshot.py
@@ -1,8 +1,10 @@
+import logging
 from datetime import datetime
 
 from _ert.events import (
     ForwardModelStepFailure,
     ForwardModelStepRunning,
+    ForwardModelStepStart,
     ForwardModelStepSuccess,
     RealizationSuccess,
 )
@@ -103,3 +105,54 @@ def test_that_realization_success_message_updates_state():
     assert (
         snapshot.to_dict()["reals"]["0"]["status"] == state.REALIZATION_STATE_FINISHED
     )
+
+
+def test_fm_updates_previous_fm_if_it_is_stuck_in_nonfinalized_state(caplog):
+    """In case a snapshot event is lost (if connection is dropped), a forwardmodel can be stuck in a non-finalized state.
+    This should be handled, and it is safe to assume it has already finished if the next forwardmodel has started.
+    We also don't know the end_time of the prior forwardmodel if we have lost a finished/exited event,
+    so we set it to the start_time of the next forwardmodel."""
+    caplog.set_level(logging.ERROR)
+
+    main_snapshot = EnsembleSnapshot()
+    main_snapshot._fm_step_snapshots["0", "0"] = FMStepSnapshot(
+        status="Pending",
+        index="0",
+        start_time=datetime.fromtimestamp(757575),
+        name="forward_model0",
+    )
+    main_snapshot._fm_step_snapshots["0", "1"] = FMStepSnapshot(
+        status="Running",
+        index="1",
+        start_time=datetime.fromtimestamp(939393),
+        name="forward_model1",
+    )
+
+    update_snapshot = EnsembleSnapshot()
+    update_snapshot.update_from_event(
+        ForwardModelStepStart(
+            real="0", fm_step="2", time=datetime.fromtimestamp(10101010)
+        ),
+        main_snapshot,
+    )
+    update_snapshot.update_from_event(
+        ForwardModelStepRunning(real="0", fm_step="2"), main_snapshot
+    )
+    main_snapshot.merge_snapshot(update_snapshot)
+
+    affected_snapshots = [main_snapshot, update_snapshot]
+    for snapshot in affected_snapshots:
+        assert snapshot.get_fm_step(real_id="0", fm_step_id="0")["status"] == "Finished"
+        assert snapshot.get_fm_step(real_id="0", fm_step_id="0")[
+            "end_time"
+        ] == datetime.fromtimestamp(939393)
+
+        assert snapshot.get_fm_step(real_id="0", fm_step_id="1")["status"] == "Finished"
+        assert snapshot.get_fm_step(real_id="0", fm_step_id="1")[
+            "end_time"
+        ] == datetime.fromtimestamp(10101010)
+
+        assert snapshot.get_fm_step(real_id="0", fm_step_id="2")["status"] == "Running"
+        assert "end_time" not in snapshot.get_fm_step(real_id="0", fm_step_id="2")
+
+    assert "Did not get finished event for" in caplog.text
diff --git a/tests/ert/unit_tests/test_tracking.py b/tests/ert/unit_tests/test_tracking.py
index cf031d8f1f6..1076989bb33 100644
--- a/tests/ert/unit_tests/test_tracking.py
+++ b/tests/ert/unit_tests/test_tracking.py
@@ -118,7 +118,7 @@ def check_expression(original, path_expression, expected, msg_start):
         ),
         pytest.param(
             "",
-            '    import os\n    if os.getcwd().split("/")[-2].split("-")[1] == "0": sys.exit(1)',
+            '    import os, sys\n    if os.getcwd().split("/")[-2].split("-")[1] == "0": sys.exit(1)',
             [
                 ENSEMBLE_SMOOTHER_MODE,
                 "--realizations",