hapifhir · jamesagnew · Jan 3, 2025 · Jan 3, 2025 · Jan 3, 2025 · Jan 3, 2025
diff --git a/...ocs/src/main/resources/ca/uhn/hapi/fhir/changelog/7_8_0/6583-fix-batch2-intermittent.yaml b/...ocs/src/main/resources/ca/uhn/hapi/fhir/changelog/7_8_0/6583-fix-batch2-intermittent.yaml
@@ -0,0 +1,4 @@
+---
+type: fix
+issue: 6583
+title: "When processing a batch2 job under heavy load, a race condition could occasionally cause the final reducer step to fail if it started immediately after a batch2 maintenance task had run. This has been corrected."
diff --git a/hapi-fhir-jpaserver-base/src/main/java/ca/uhn/fhir/jpa/batch2/JpaJobPersistenceImpl.java b/hapi-fhir-jpaserver-base/src/main/java/ca/uhn/fhir/jpa/batch2/JpaJobPersistenceImpl.java
@@ -542,8 +542,20 @@ public Page<WorkChunkMetadata> fetchAllWorkChunkMetadataForJobInStates(
 
 	@Override
 	public boolean updateInstance(String theInstanceId, JobInstanceUpdateCallback theModifier) {
-		Batch2JobInstanceEntity instanceEntity =
-				myEntityManager.find(Batch2JobInstanceEntity.class, theInstanceId, LockModeType.PESSIMISTIC_WRITE);
+		/*
+		 * We may already have a copy of the entity in the L1 cache, and it may be
+		 * stale if the scheduled maintenance service has touched it recently. So
+		 * we fetch it without a lock and then refresh it under a pessimistic
+		 * write lock, which reloads the current database state. The refresh is
+		 * skipped when find() returned null, so that a missing instance reaches
+		 * the "No instance found" check below instead of failing in refresh().
+		 */
+		Batch2JobInstanceEntity instanceEntity = myEntityManager.find(Batch2JobInstanceEntity.class, theInstanceId);
+		if (null != instanceEntity) {
+			myEntityManager.refresh(instanceEntity, LockModeType.PESSIMISTIC_WRITE);
+		}
+
 		if (null == instanceEntity) {
 			ourLog.error("No instance found with Id {}", theInstanceId);
 			return false;