From 9d7d1397fde2eac5ee1026701eaa0ec54e6e6e6b Mon Sep 17 00:00:00 2001
From: Fallen_Breath
Date: Sun, 1 Dec 2024 12:19:55 +0800
Subject: [PATCH] Add backup config `reuse_stat_unchanged_file` for speed lovers

---
 docs/config.md                              |  23 +++++
 docs/config.zh.md                           |  23 +++++
 prime_backup/action/create_backup_action.py | 101 ++++++++++++++++----
 prime_backup/config/backup_config.py        |   1 +
 prime_backup/db/session.py                  |   5 +
 5 files changed, 132 insertions(+), 21 deletions(-)

diff --git a/docs/config.md b/docs/config.md
index 0912e41..83e3645 100644
--- a/docs/config.md
+++ b/docs/config.md
@@ -216,6 +216,7 @@ Configs on how the backup is made
         "**/session.lock"
     ],
     "follow_target_symlink": false,
+    "reuse_stat_unchanged_file": false,
     "hash_method": "xxh128",
     "compress_method": "zstd",
     "compress_threshold": 64
@@ -305,6 +306,28 @@ Prime Backup will save not only the `world` symbolic link, but also the `foo` sy
 - Type: `bool`
 - Default: `false`
 
+#### reuse_stat_unchanged_file
+
+When enabled, during backup creation, Prime Backup will try to directly reuse file information from the previous backup
+for files whose stat (size, mtime, mode, etc.) has not changed. No file hash checking will be done on these stat-unchanged files
+
+If you want the maximum possible backup creation speed, you can try enabling this option.
+However, this also introduces the potential risk of incomplete backups
+
+!!! warning
+
+    Please only enable this option after ensuring that the server's operating system and file system are functioning properly and stably.
+    Otherwise, if issues such as system time rollback or abnormal file system metadata occur, some files might
+    have their content changed while their stat stays unchanged, and Prime Backup will then create an incomplete backup
+
+!!! tip
+
+    Unless you really need this backup speed boost, or the disk read performance of your system is very poor, it is not recommended to enable this option.
+    Prime Backup is already fast enough
+
+- Type: `bool`
+- Default: `false`
+
 #### hash_method
 
 The algorithm to hash the files. Available options: `"xxh128"`, `"sha256"`, `"blake3"`
diff --git a/docs/config.zh.md b/docs/config.zh.md
index d936042..73ff6d7 100644
--- a/docs/config.zh.md
+++ b/docs/config.zh.md
@@ -216,6 +216,7 @@ Prime Backup 在创建备份时的操作时序如下:
         "**/session.lock"
     ],
     "follow_target_symlink": false,
+    "reuse_stat_unchanged_file": false,
     "hash_method": "xxh128",
     "compress_method": "zstd",
     "compress_threshold": 64
@@ -305,6 +306,28 @@ Prime Backup 除了会保存 `world` 这个符号链接外,还会保存 `foo`
 - 类型:`bool`
 - 默认值:`false`
 
+#### reuse_stat_unchanged_file
+
+启用时,在创建备份过程中,Prime Backup 将尝试对 stat(如大小、修改时间、权限等)未发生变化的文件,直接复用之前备份中的文件信息。
+对于这些 stat 未变的文件,将不会进行文件哈希检查
+
+如果你想获得尽可能快的备份创建速度,可以尝试启用此选项。
+不过,这也会引入潜在的备份不完整的风险
+
+!!! warning
+
+    请在确保服务器的操作系统和文件系统正常且稳定运行后,再启用此选项。
+    否则,如果出现系统时间回退或文件系统元数据异常等问题,可能会造成某些文件内容已变化但 stat 保持不变的情况,
+    从而导致 Prime Backup 创建出不完整的备份
+
+!!! tip
+
+    除非你确实需要这一备份速度增益,或者系统磁盘读取性能过低,否则不建议启用此选项。
+    Prime Backup 的速度已经足够快了
+
+- 类型:`bool`
+- 默认值:`false`
+
 #### hash_method
 
 对文件进行哈希时所使用的算法。可用选项:`"xxh128"`、`"sha256"`、`"blake3"`
diff --git a/prime_backup/action/create_backup_action.py b/prime_backup/action/create_backup_action.py
index ad1bf5a..9cad3cc 100644
--- a/prime_backup/action/create_backup_action.py
+++ b/prime_backup/action/create_backup_action.py
@@ -198,6 +198,7 @@ class _ScanResult:
 class _PreCalculationResult:
 	stats: Dict[Path, os.stat_result] = dataclasses.field(default_factory=dict)
 	hashes: Dict[Path, str] = dataclasses.field(default_factory=dict)
+	reused_files: Dict[Path, schema.File] = dataclasses.field(default_factory=dict)
 
 
 class CreateBackupAction(CreateBackupActionBase):
@@ -221,6 +222,9 @@ def __init__(self, creator: Operator, comment: str, *, tags: Optional[BackupTags
 
 		self.__source_path: Path = source_path or self.config.source_path
 
+	def __file_path_to_db_path(self, path: Path) -> str:
+		return path.relative_to(self.__source_path).as_posix()
+
 	def __scan_files(self) -> _ScanResult:
 		ignore_patterns = pathspec.GitIgnoreSpec.from_lines(self.config.backup.ignore_patterns)
 		result = _ScanResult()
@@ -284,32 +288,70 @@ def __pre_calculate_stats(self, scan_result: _ScanResult):
 		for file_entry in scan_result.all_files:
 			stats[file_entry.path] = file_entry.stat
 
+	def __reuse_unchanged_files(self, session: DbSession, scan_result: _ScanResult):
+		backup = session.get_last_backup()
+		if backup is None:
+			return
+
+		@dataclasses.dataclass(frozen=True)
+		class StatKey:
+			path: str
+			size: Optional[int]  # it shouldn't be None, but just in case
+			mode: int
+			uid: int
+			gid: int
+			mtime: int
+
+		stat_to_files: Dict[StatKey, schema.File] = {}
+		for file in session.get_backup_files(backup.id):
+			if stat.S_ISREG(file.mode):
+				key = StatKey(
+					path=file.path,
+					size=file.blob_raw_size,
+					mode=file.mode,
+					uid=file.uid,
+					gid=file.gid,
+					mtime=file.mtime_ns,
+				)
+				stat_to_files[key] = file
+
+		for file_entry in scan_result.all_files:
+			if file_entry.is_file():
+				key = StatKey(
+					path=self.__file_path_to_db_path(file_entry.path),
+					size=file_entry.stat.st_size,
+					mode=file_entry.stat.st_mode,
+					uid=file_entry.stat.st_uid,
+					gid=file_entry.stat.st_gid,
+					mtime=file_entry.stat.st_mtime_ns
+				)
+				if (file := stat_to_files.get(key)) is not None:
+					self.__pre_calc_result.reused_files[file_entry.path] = file
+
 	def __pre_calculate_hash(self, session: DbSession, scan_result: _ScanResult):
 		hashes = self.__pre_calc_result.hashes
 		hashes.clear()
 
-		sizes: Set[int] = set()
-		for file_entry in scan_result.all_files:
-			if file_entry.is_file():
-				sizes.add(file_entry.stat.st_size)
+		file_entries_to_hash: List[_ScanResultEntry] = [
+			file_entry
+			for file_entry in scan_result.all_files
+			if file_entry.is_file() and file_entry.path not in self.__pre_calc_result.reused_files
+		]
 
-		hash_dict_lock = threading.Lock()
-		existence = session.has_blob_with_size_batched(list(sizes))
-		self.__blob_by_size_cache.update(existence)
+		all_sizes: Set[int] = {file_entry.stat.st_size for file_entry in file_entries_to_hash}
+		existed_sizes = session.has_blob_with_size_batched(list(all_sizes))
+		self.__blob_by_size_cache.update(existed_sizes)
 
 		def hash_worker(pth: Path):
-			h = hash_utils.calc_file_hash(pth)
-			with hash_dict_lock:
-				hashes[pth] = h
+			hashes[pth] = hash_utils.calc_file_hash(pth)
 
 		with FailFastBlockingThreadPool(name='hasher') as pool:
-			for file_entry in scan_result.all_files:
-				if file_entry.is_file():
-					if existence[file_entry.stat.st_size]:
-						# we need to hash the file, sooner or later
-						pool.submit(hash_worker, file_entry.path)
-					else:
-						pass # will use hash_once policy
+			for file_entry in file_entries_to_hash:
+				if existed_sizes[file_entry.stat.st_size]:
+					# we need to hash the file, sooner or later
+					pool.submit(hash_worker, file_entry.path)
+				else:
+					pass # will use hash_once policy
 
 	@functools.cached_property
 	def __temp_path(self) -> Path:
@@ -510,7 +552,21 @@ def bp_rba(h: str) -> Path:
 			raise VolatileBlobFile('blob file {} keeps changing'.format(src_path_str))
 
 	def __create_file(self, session: DbSession, path: Path) -> Generator[Any, Any, schema.File]:
-		related_path = path.relative_to(self.__source_path)
+		if (reused_file := self.__pre_calc_result.reused_files.get(path)) is not None:
+			# make a copy
+			return session.create_file(
+				path=reused_file.path,
+				role=FileRole.unknown.value,
+				mode=reused_file.mode,
+				content=reused_file.content,
+				blob_hash=reused_file.blob_hash,
+				blob_compress=reused_file.blob_compress,
+				blob_raw_size=reused_file.blob_raw_size,
+				blob_stored_size=reused_file.blob_stored_size,
+				uid=reused_file.uid,
+				gid=reused_file.gid,
+				mtime_ns=reused_file.mtime_ns,
+			)
 
 		if (st := self.__pre_calc_result.stats.pop(path, None)) is None:
 			st = path.lstat()
@@ -530,16 +586,16 @@ def __create_file(self, session: DbSession, path: Path) -> Generator[Any, Any, s
 		elif stat.S_ISDIR(st.st_mode):
 			pass
 		elif stat.S_ISLNK(st.st_mode):
-			content = path.readlink().as_posix().encode('utf8')
+			content = os.readlink(path).encode('utf8')
 		else:
 			raise UnsupportedFileFormat(st.st_mode)
 
 		return session.create_file(
-			path=related_path.as_posix(),
-			content=content,
+			path=self.__file_path_to_db_path(path),
 			role=FileRole.unknown.value,
 			mode=st.st_mode,
+			content=content,
 			uid=st.st_uid,
 			gid=st.st_gid,
 			mtime_ns=st.st_mtime_ns,
@@ -573,6 +629,9 @@ def run(self) -> BackupInfo:
 			))
 
 		self.__pre_calculate_stats(scan_result)
+		if self.config.backup.reuse_stat_unchanged_file:
+			self.__reuse_unchanged_files(session, scan_result)
+			self.logger.info('Reused {} / {} unchanged files'.format(len(self.__pre_calc_result.reused_files), len(scan_result.all_files)))
 		if self.config.get_effective_concurrency() > 1:
 			self.__pre_calculate_hash(session, scan_result)
 			self.logger.info('Pre-calculate all file hash done')
diff --git a/prime_backup/config/backup_config.py b/prime_backup/config/backup_config.py
index 9ad20f9..41b1695 100644
--- a/prime_backup/config/backup_config.py
+++ b/prime_backup/config/backup_config.py
@@ -17,6 +17,7 @@ class BackupConfig(Serializable):
 		'**/session.lock',
 	]
 	follow_target_symlink: bool = False
+	reuse_stat_unchanged_file: bool = False
 	hash_method: HashMethod = HashMethod.xxh128
 	compress_method: CompressMethod = CompressMethod.zstd
 	compress_threshold: int = 64
diff --git a/prime_backup/db/session.py b/prime_backup/db/session.py
index 8c9e533..7e928a1 100644
--- a/prime_backup/db/session.py
+++ b/prime_backup/db/session.py
@@ -589,6 +589,11 @@ def get_backup_ids_by_blob_hashes(self, hashes: List[str]) -> List[int]:
 		fileset_ids = self.get_fileset_ids_by_blob_hashes(hashes)
 		return self.get_backup_ids_by_fileset_ids(fileset_ids)
 
+	def get_last_backup(self) -> Optional[schema.Backup]:
+		s = select(schema.Backup).order_by(desc(schema.Backup.id)).limit(1)
+		backups = _list_it(self.session.execute(s).scalars().all())
+		return backups[0] if backups else None
+
 	def list_backup(self, backup_filter: Optional[BackupFilter] = None, limit: Optional[int] = None, offset: Optional[int] = None) -> List[schema.Backup]:
 		s = select(schema.Backup)
 		if backup_filter is not None:
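
For reviewers: the reuse check added in `__reuse_unchanged_files` boils down to an exact match on a per-file stat key (path, size, mode, uid, gid, mtime_ns) against the latest backup. Below is a minimal standalone sketch of that idea; `PrevFile` and `find_reusable_files` are illustrative names, not part of this patch or of Prime Backup's API.

```python
import stat
from pathlib import Path
from typing import Dict, Iterable, NamedTuple, Tuple


class PrevFile(NamedTuple):
	# Hypothetical stand-in for the stored file row (schema.File in the patch)
	path: str      # path relative to the backup source, POSIX style
	size: int      # raw (uncompressed) blob size
	mode: int      # st_mode, including the file type bits
	uid: int
	gid: int
	mtime_ns: int  # nanosecond mtime; an exact match is required


def find_reusable_files(
		prev_files: Iterable[PrevFile],
		source_root: Path,
		scanned_paths: Iterable[Path],
) -> Dict[Path, PrevFile]:
	"""Map each scanned path whose stat exactly matches the previous backup to that backup's entry."""
	# Index the previous backup's regular files by their full stat key
	index: Dict[Tuple, PrevFile] = {
		(f.path, f.size, f.mode, f.uid, f.gid, f.mtime_ns): f
		for f in prev_files
		if stat.S_ISREG(f.mode)  # only regular files can skip hashing
	}

	reused: Dict[Path, PrevFile] = {}
	for p in scanned_paths:
		st = p.lstat()
		if not stat.S_ISREG(st.st_mode):
			continue
		key = (p.relative_to(source_root).as_posix(), st.st_size, st.st_mode, st.st_uid, st.st_gid, st.st_mtime_ns)
		if (match := index.get(key)) is not None:
			# stat is identical, so the stored blob info can be copied without re-hashing the file
			reused[p] = match
	return reused
```

Files that do not appear in the returned mapping still go through the normal hash-and-store path, which is why a stale mtime only ever causes a stale copy, never a crash.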