Merge pull request #631 from orange-kao/orange-retry-based-on-backup-size

Better retry for base backup restore
alexole authored Oct 30, 2024
2 parents 69d1115 + 4869d84 commit 5e22c85
Showing 2 changed files with 38 additions and 10 deletions.
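The patch replaces the single MAX_RETRIES constant with two budgets: SINGLE_FILE_MAX_RETRIES still caps retries of an individual file, while the stall budget for the whole restore now scales with the encrypted backup size ("total-size-enc"), tolerating one stall per 64GiB with a floor of STALL_MIN_RETRIES. A minimal standalone sketch of that sizing rule, with illustrative backup sizes that are not from the commit:

    # Sketch of the stall-retry sizing rule introduced in this commit.
    STALL_MIN_RETRIES = 6

    def stall_retry_budget(total_size_enc: int) -> int:
        # One stall tolerated per 64 GiB of encrypted backup, floor of 6.
        return max(STALL_MIN_RETRIES, int(total_size_enc / (64 * 2 ** 30)))

    for size_gib in (10, 122, 1024):  # hypothetical backup sizes
        print(size_gib, "GiB ->", stall_retry_budget(size_gib * 2 ** 30), "retries")
    # 10 GiB -> 6, 122 GiB -> 6, 1024 GiB -> 16
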
33 changes: 27 additions & 6 deletions pghoard/restore.py
@@ -41,7 +41,8 @@
 from . import common, config, logutil, version
 from .postgres_command import PGHOARD_HOST, PGHOARD_PORT
 
-MAX_RETRIES = 6
+STALL_MIN_RETRIES = 6  # minimum retry for stalled download, for the whole basebackup restore
+SINGLE_FILE_MAX_RETRIES = 6  # maximum retry for a single file
 
 
 class RestoreError(Error):
@@ -605,6 +606,10 @@ def _get_basebackup(
         os.makedirs(dirname, exist_ok=True)
         os.chmod(dirname, 0o700)
 
+        # Based on limited samples, there could be one stalled download per 122GiB of transfer
+        # So we tolerate one stall for every 64GiB of transfer (or STALL_MIN_RETRIES for smaller backup)
+        stall_max_retries = max(STALL_MIN_RETRIES, int(int(metadata.get("total-size-enc", 0)) / (64 * 2 ** 30)))
+
         fetcher = BasebackupFetcher(
             app_config=self.config,
             data_files=basebackup_data_files,
@@ -613,6 +618,7 @@
             pgdata=pgdata,
             site=site,
             tablespaces=tablespaces,
+            stall_max_retries=stall_max_retries,
         )
         fetcher.fetch_all()
 
@@ -657,7 +663,18 @@ def run(self, args=None):
 
 
 class BasebackupFetcher:
-    def __init__(self, *, app_config, debug, site, pgdata, tablespaces, data_files: List[FileInfo], status_output_file=None):
+    def __init__(
+        self,
+        *,
+        app_config,
+        debug,
+        site,
+        pgdata,
+        tablespaces,
+        data_files: List[FileInfo],
+        stall_max_retries: int,
+        status_output_file=None
+    ):
         self.log = logging.getLogger(self.__class__.__name__)
         self.completed_jobs: Set[str] = set()
         self.config = app_config
@@ -681,9 +698,10 @@ def __init__(self, *, app_config, debug, site, pgdata, tablespaces, data_files:
         self.tablespaces = tablespaces
         self.total_download_size = 0
         self.retry_per_file: Dict[str, int] = {}
+        self.stall_max_retries = stall_max_retries
 
     def fetch_all(self):
-        for retry in range(MAX_RETRIES):
+        for retry in range(self.stall_max_retries):
             try:
                 with self.manager_class() as manager:
                     self._setup_progress_tracking(manager)
@@ -701,8 +719,11 @@ def fetch_all(self):
             if self.errors:
                 break
 
-            if retry == MAX_RETRIES - 1:
-                self.log.error("Download stalled despite retries, aborting")
+            if retry == self.stall_max_retries - 1:
+                self.log.error(
+                    "Download stalled despite retries, aborting"
+                    " (reached maximum retry %r)", self.stall_max_retries
+                )
                 self.errors = 1
                 break
 
@@ -781,7 +802,7 @@ def job_failed(self, key, exception):
         retries = self.retry_per_file.get(key, 0) + 1
         self.retry_per_file[key] = retries
         self.pending_jobs.remove(key)
-        if retries < MAX_RETRIES:
+        if retries < SINGLE_FILE_MAX_RETRIES:
             self.jobs_to_retry.add(key)
             return
         self.errors += 1
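Taken together, fetch_all() retries whole download passes up to the size-scaled stall budget, while job_failed() re-queues each failing file up to SINGLE_FILE_MAX_RETRIES. A simplified, self-contained sketch of how the two budgets interact (run_one_pass() and the chunk name are placeholders, not pghoard's actual control flow):

    # Simplified sketch of the two retry budgets after this commit.
    SINGLE_FILE_MAX_RETRIES = 6   # per-file failures (job_failed)
    stall_max_retries = 16        # e.g. sized for a ~1 TiB backup

    retry_per_file = {}

    def should_requeue(key):
        # A file is re-queued until it has failed SINGLE_FILE_MAX_RETRIES times.
        retry_per_file[key] = retry_per_file.get(key, 0) + 1
        return retry_per_file[key] < SINGLE_FILE_MAX_RETRIES

    def run_one_pass():
        return False  # placeholder: True would mean the pass stalled

    assert should_requeue("chunk-0")  # first failure of a hypothetical file is retried

    for retry in range(stall_max_retries):
        if not run_one_pass():
            break  # finished (or failed hard) without stalling
        if retry == stall_max_retries - 1:
            raise RuntimeError("Download stalled despite retries, aborting")
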
15 changes: 11 additions & 4 deletions test/test_restore.py
@@ -21,7 +21,7 @@
 
 from pghoard.common import TAR_METADATA_FILENAME, write_json_file
 from pghoard.restore import (
-    MAX_RETRIES, BasebackupFetcher, ChunkFetcher, FileDataInfo, FileInfoType, FilePathInfo, Restore, RestoreError,
+    STALL_MIN_RETRIES, BasebackupFetcher, ChunkFetcher, FileDataInfo, FileInfoType, FilePathInfo, Restore, RestoreError,
     create_recovery_conf
 )
 
@@ -195,7 +195,8 @@ def test_progress_tracking_and_error_handling(self):
             status_output_file=status_output_file,
             pgdata=pgdata,
             site=site,
-            tablespaces=tablespaces
+            tablespaces=tablespaces,
+            stall_max_retries=STALL_MIN_RETRIES,
         )
         manager, pool, manager_enter = MagicMock(), MagicMock(), MagicMock()
         fetcher.manager_class = lambda: manager
@@ -361,7 +362,7 @@ def _fetch_and_extract_one_backup(self, metadata, file_size, fetch_fn):
 
         fetcher.max_stale_seconds = 2
         with patch("pghoard.restore.ChunkFetcher", new=FailingChunkFetcher):
-            if max_fails < MAX_RETRIES:
+            if max_fails < STALL_MIN_RETRIES:
                 fetcher.fetch_all()
                 self.check_sha256(
                     os.path.join(restore_dir, "pg_notify", "0000"),
@@ -484,7 +485,13 @@ def run_restore_test(self, path, tar_executable, logic, tablespaces=None, files=
         config["tar_executable"] = tar_executable
         site = next(iter(config["backup_sites"]))
         fetcher = BasebackupFetcher(
-            app_config=config, data_files=files, debug=True, pgdata=restore_dir, site=site, tablespaces=tablespaces or {}
+            app_config=config,
+            data_files=files,
+            debug=True,
+            pgdata=restore_dir,
+            site=site,
+            tablespaces=tablespaces or {},
+            stall_max_retries=STALL_MIN_RETRIES
         )
         try:
             logic(fetcher, restore_dir)
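The tests pass stall_max_retries=STALL_MIN_RETRIES explicitly, pinning the loop bound to the old fixed value of 6 so the existing expectations keep holding; for instance, the boundary check above only calls fetch_all() when the injected failure count stays under that budget. A hypothetical illustration of the boundary being relied on:

    # Illustrative only; max_fails here is not a value from the tests.
    from pghoard.restore import STALL_MIN_RETRIES

    max_fails = 3
    assert max_fails < STALL_MIN_RETRIES  # below budget: fetch_all() should succeed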
