diff --git a/pghoard/restore.py b/pghoard/restore.py
index 4cfe839f..bbfc8b99 100644
--- a/pghoard/restore.py
+++ b/pghoard/restore.py
@@ -657,7 +657,17 @@ def run(self, args=None):
 
 
 class BasebackupFetcher:
-    def __init__(self, *, app_config, debug, site, pgdata, tablespaces, data_files: List[FileInfo], status_output_file=None):
+    def __init__(
+        self,
+        *,
+        app_config,
+        debug,
+        site,
+        pgdata,
+        tablespaces,
+        data_files: List[FileInfo],
+        status_output_file=None,
+    ):
         self.log = logging.getLogger(self.__class__.__name__)
         self.completed_jobs: Set[str] = set()
         self.config = app_config
@@ -698,6 +708,9 @@ def fetch_all(self):
             except TimeoutError:
                 self.pending_jobs.clear()
                 self.last_progress_ts = time.monotonic()
+
+                # Increase the timeout and retry
+                self.max_stale_seconds *= 2
 
             if self.errors:
                 break
diff --git a/test/test_restore.py b/test/test_restore.py
index f5f18648..5fe51683 100644
--- a/test/test_restore.py
+++ b/test/test_restore.py
@@ -348,6 +348,7 @@ def test_real_processing_with_threading_retries_on_timeout_fails_after_3(self):
 
     def real_processing_with_threading_retries_on_timeout(self, fetcher, restore_dir, max_fails):
         fail_counter = [0]
+        base_max_stale_seconds = 2
 
         class FailingChunkFetcher(ChunkFetcher):
             def _fetch_and_extract_one_backup(self, metadata, file_size, fetch_fn):
@@ -357,9 +358,14 @@ def _fetch_and_extract_one_backup(self, metadata, file_size, fetch_fn):
                     # Corrupt the file to test that retrying failed basebackup chunk yields sensible results
                     with open(os.path.join(restore_dir, "pg_notify", "0000"), "w") as f:
                         f.write("foo")
-                    time.sleep(4)
-                    fetcher.max_stale_seconds = 2
+                    # ensure we sleep long enough to timeout based on the number of retries
+                    sleep_seconds = base_max_stale_seconds * (
+                        2 ** max_fails
+                    ) if max_fails < MAX_RETRIES else base_max_stale_seconds
+                    time.sleep(sleep_seconds)
+
+                    fetcher.max_stale_seconds = base_max_stale_seconds
 
         with patch("pghoard.restore.ChunkFetcher", new=FailingChunkFetcher):
            if max_fails < MAX_RETRIES:
                 fetcher.fetch_all()
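
For context on the behaviour change: after this patch, BasebackupFetcher.fetch_all doubles max_stale_seconds each time it catches a TimeoutError, so every retry waits longer for stalled jobs before giving up. Below is a minimal sketch of that backoff pattern in isolation; fetch_with_backoff, wait_for_jobs and the retry budget are hypothetical names for this example only, not pghoard APIs, and only the doubling of max_stale_seconds mirrors the patched code.

# Illustrative sketch of the retry/backoff pattern added above. The names
# fetch_with_backoff and wait_for_jobs are hypothetical; only the doubling of
# max_stale_seconds after a TimeoutError mirrors BasebackupFetcher.fetch_all.
import logging

RETRY_BUDGET = 3  # assumed for this sketch; pghoard's MAX_RETRIES may differ


def fetch_with_backoff(wait_for_jobs, max_stale_seconds=2.0):
    """Retry a stalled fetch, doubling the staleness timeout after each attempt."""
    for attempt in range(RETRY_BUDGET):
        try:
            # wait_for_jobs is expected to raise TimeoutError when no progress
            # is observed within max_stale_seconds.
            wait_for_jobs(max_stale_seconds)
            return
        except TimeoutError:
            logging.warning("attempt %d stalled after %.0fs", attempt + 1, max_stale_seconds)
            # Increase the timeout and retry, as in the patch above
            max_stale_seconds *= 2
    raise TimeoutError("fetch did not complete within %d attempts" % RETRY_BUDGET)

Because the timeout grows on every retry, the updated test scales its artificial delay with the number of expected failures (base_max_stale_seconds * 2 ** max_fails) instead of sleeping a fixed 4 seconds.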