diff --git a/amlb/benchmark.py b/amlb/benchmark.py
index 18bf3cc4d..2683a0335 100644
--- a/amlb/benchmark.py
+++ b/amlb/benchmark.py
@@ -193,12 +193,18 @@ def setup(self, mode: SetupMode):
         self._mark_setup_start()
 
         if hasattr(self.framework_module, "setup"):
-            self.framework_module.setup(
-                *self.framework_def.setup_args,
-                _shell_=False,  # prevents #arg from being interpreted as comment
-                _live_output_=rconfig().setup.live_output,
-                _activity_timeout_=rconfig().setup.activity_timeout,
-            )
+            try:
+                self.framework_module.setup(
+                    *self.framework_def.setup_args,
+                    _shell_=False,  # prevents #arg from being interpreted as comment
+                    _exit_immediately_=True,  # NOTE(review): presumably consumed by run_cmd as exit_immediately - confirm
+                    _live_output_=rconfig().setup.live_output,
+                    _activity_timeout_=rconfig().setup.activity_timeout,
+                )
+            except Exception as e:  # any failure of the framework's setup hook is reported uniformly
+                raise JobError(
+                    f"Setup of framework {self.framework_name} failed."
+                ) from e  # chaining keeps the original error reachable via __cause__
 
         if self.framework_def.setup_script is not None:
             run_script(
diff --git a/amlb/utils/process.py b/amlb/utils/process.py
index 97c557a86..5f40f0b02 100644
--- a/amlb/utils/process.py
+++ b/amlb/utils/process.py
@@ -245,6 +245,7 @@ def run_cmd(cmd, *args, **kwargs):
         activity_timeout=None,
         log_level=logging.INFO,
         monitor=None,
+        exit_immediately=False,  # when true, the command is run through a shell with `-e` (non-Windows only)
     )
     for k, v in params:
         kk = "_" + k + "_"
@@ -253,6 +254,11 @@ def run_cmd(cmd, *args, **kwargs):
             del kwargs[kk]
     cmd_args = as_cmd_args(*args, **kwargs)
     full_cmd = flatten([cmd]) + cmd_args
+    if params.exit_immediately and platform.system() != "Windows":
+        # Prefix the command with a shell invoked with `-e` so that a script
+        # stops at its first failing command instead of carrying on.
+        # SHELL is not guaranteed to be set, so fall back to /bin/sh.
+        full_cmd = [os.environ.get("SHELL", "/bin/sh"), "-e"] + full_cmd
     str_cmd = " ".join(full_cmd)
     log.log(params.log_level, "Running cmd `%s`", str_cmd)
     log.debug("Running cmd `%s` with input: %s", str_cmd, params.input_str)
diff --git a/tests/conftest.py b/tests/conftest.py
index 04d7cf67b..6ea684a27 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -12,6 +12,9 @@ def load_default_resources(tmp_path):
         os.path.join(default_dirs.root_dir, "resources", "config.yaml")
     )
     config_default_dirs = default_dirs
+    config_test = Namespace(  # test-only config layer naming the tests' framework definitions file
+        frameworks=Namespace(definition_file=["{root}/tests/resources/frameworks.yaml"])
+    )
     # allowing config override from user_dir: useful to define custom benchmarks and frameworks for example.
     config_user = Namespace()
     # config listing properties set by command line
@@ -30,7 +33,7 @@ def load_default_resources(tmp_path):
     config_args = Namespace({k: v for k, v in config_args if v is not None})
     # merging all configuration files and saving to the global variable
     resources.from_configs(
-        config_default, config_default_dirs, config_user, config_args
+        config_default, config_test, config_default_dirs, config_user, config_args  # NOTE(review): config_test sits right after the defaults; merge precedence is not visible here - confirm
     )
 
 
diff --git a/tests/resources/frameworks.yaml b/tests/resources/frameworks.yaml
new file mode 100644
index 000000000..c1ff7207b
--- /dev/null
+++ b/tests/resources/frameworks.yaml
@@ -0,0 +1,3 @@
+setup_fail:  # fixture framework: the benchmark unit test expects its setup to fail
+  description: "used for tests"
+  module: tests.resources.frameworks.setup_fail
diff --git a/tests/unit/amlb/benchmarks/test_benchmark.py b/tests/unit/amlb/benchmarks/test_benchmark.py
index 866deae50..b088c1700 100644
--- a/tests/unit/amlb/benchmarks/test_benchmark.py
+++ b/tests/unit/amlb/benchmarks/test_benchmark.py
@@ -1,8 +1,10 @@
 from pathlib import Path
+from subprocess import SubprocessError
 
 import pytest
 
 from amlb import Benchmark, SetupMode, resources, DockerBenchmark, SingularityBenchmark
+from amlb.job import JobError
 from amlb.utils import Namespace
 
 
@@ -112,3 +114,23 @@ def test_singularity_image_name_as_docker(
         as_docker_image=True,
     )
     assert result == expected
+
+
+def test_benchmark_setup_errors_if_framework_does_not_install(
+    load_default_resources,
+) -> None:
+    """Setup failure surfaces as JobError caused by the subprocess error."""
+    benchmark = Benchmark(
+        framework_name="setup_fail",
+        benchmark_name="test",
+        constraint_name="test",
+        job_history=None,
+    )
+
+    with pytest.raises(JobError) as exc_info:
+        benchmark.setup(SetupMode.force)
+    # Check the message text: a bare "setup" also matches the framework name.
+    assert "Setup of framework" in str(exc_info.value)
+    cause = exc_info.value.__cause__
+    assert isinstance(cause, SubprocessError)
+    assert "command_that_fails" in cause.stderr