From 2d4ad43b5e3e82fbc872179a37c604fa785cd2f0 Mon Sep 17 00:00:00 2001
From: ricolin
Date: Wed, 26 Jul 2023 23:42:26 +0800
Subject: [PATCH 01/30] Update hack/stack.sh

--- hack/stack.sh | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/hack/stack.sh b/hack/stack.sh
index 4f6012c..b3ba372 100755
--- a/hack/stack.sh
+++ b/hack/stack.sh
@@ -56,7 +56,7 @@ backup_service_period = 1200
retention_service_period = 1200
backup_cycle_timout = 5min
retention_time = 2w3d
-backup_metadata_key="__automated_backup"
+backup_metadata_key="__staffeln_backup"
retention_metadata_key="__staffeln_retention"
full_backup_depth = 4

@@ -77,9 +77,9 @@ pip install -U setuptools pip
"${HOME}"/.local/bin/pip3 install -e .

# Start staffeln conductor
-staffeln-db-manage create_schema
-#staffeln-db-manage upgrade head
-set +x
-source /opt/stack/openrc admin admin
-set -x
-staffeln-conductor &
+"${HOME}"/.local/bin/staffeln-db-manage --config-file /etc/staffeln/staffeln.conf create_schema
+ #staffeln-db-manage upgrade head
+
+echo You can fetch authorization with command: source /opt/stack/openrc admin admin
+echo You can now run staffeln conductor with: "${HOME}"/.local/bin/staffeln-conductor --config-file /etc/staffeln/staffeln.conf
+echo You can now run staffeln api with: "${HOME}"/.local/bin/staffeln-api --config-file /etc/staffeln/staffeln.conf

From cc4a0f04057855bc58dd1b98df836d0209a4e4dc Mon Sep 17 00:00:00 2001
From: ricolin
Date: Thu, 3 Aug 2023 11:27:22 +0800
Subject: [PATCH 02/30] Add image builder action to staffeln

--- .github/workflows/build.yml | 67 +++++++++++++++++++++++++++++++++++++ Dockerfile | 18 ++++++++++ 2 files changed, 85 insertions(+) create mode 100644 .github/workflows/build.yml create mode 100644 Dockerfile

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
new file mode 100644
index 0000000..39f1e15
--- /dev/null
+++ b/.github/workflows/build.yml
@@ -0,0 +1,67 @@
+name: build
+concurrency:
+ group: ${{ github.head_ref || github.run_id }}
+ cancel-in-progress: true
+on:
+ workflow_dispatch:
+ inputs:
+ image_push:
+ type: boolean
+ description: 'Push images to Container Registry'
+ required: false
+ default: false
+ pull_request:
+ types:
+ - opened
+ - synchronize
+ - reopened
+ push:
+ branches:
+ - main
+jobs:
+ image:
+ runs-on: ubuntu-latest
+ strategy:
+ matrix:
+ from:
+ - jammy
+ release:
+ - "2023.1"
+ steps:
+ - name: Install QEMU static binaries
+ uses: docker/setup-qemu-action@v2
+ - name: Configure Buildkit
+ uses: docker/setup-buildx-action@v2
+ - name: Checkout project
+ uses: actions/checkout@v3
+ - name: Setup environment variables
+ run: echo "PROJECT_REF=${{ github.sha }}" >> "$GITHUB_ENV"
+ - name: Authenticate with Quay.io
+ uses: docker/login-action@v2
+ if: ${{ github.event_name == 'push' || (github.event_name == 'workflow_dispatch' && inputs.image_push == true) }}
+ with:
+ password: ${{ secrets.QUAY_ROBOT_TOKEN }}
+ registry: quay.io
+ username: ${{ secrets.QUAY_USERNAME }}
+ - name: Build image
+ uses: docker/build-push-action@v3
+ with:
+ build-args: |-
+ BUILDER_IMAGE=quay.io/vexxhost/openstack-builder-${{ matrix.from }}
+ RUNTIME_IMAGE=quay.io/vexxhost/openstack-runtime-${{ matrix.from }}
+ RELEASE=${{ matrix.release }}
+ PROJECT=staffeln
+ PROJECT_REPO=https://github.com/vexxhost/staffeln
+ PROJECT_REF=${{ env.PROJECT_REF }}
+ cache-from: type=gha,scope=${{ matrix.from }}-${{ matrix.release }}
+ cache-to: type=gha,mode=max,scope=${{ matrix.from }}-${{ matrix.release }}
+ context: .
+ platforms: linux/amd64
+ push: ${{ github.event_name == 'push' || inputs.image_push == true }}
+ tags: quay.io/vexxhost/staffeln:${{ env.PROJECT_REF }}-${{ matrix.from }}
+ - name: Promote image
+ uses: akhilerm/tag-push-action@v2.0.0
+ if: github.event_name == 'push' && ((matrix.from == 'focal') || (matrix.from == 'jammy' && matrix.release != 'yoga'))
+ with:
+ dst: quay.io/vexxhost/staffeln:${{ matrix.release }}
+ src: quay.io/vexxhost/staffeln:${{ env.PROJECT_REF }}-${{ matrix.from }}

diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..b21c967
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,18 @@
+# syntax=docker/dockerfile-upstream:master-labs
+
+ARG BUILDER_IMAGE=quay.io/vexxhost/openstack-builder-focal
+ARG RUNTIME_IMAGE=quay.io/vexxhost/openstack-runtime-focal
+
+# hadolint ignore=DL3007
+FROM quay.io/vexxhost/bindep-loci:latest AS bindep
+
+FROM ${BUILDER_IMAGE}:ced4522d9a10ba7172f373289af6dace06be3b36 AS builder
+COPY --from=bindep --link /runtime-pip-packages /runtime-pip-packages
+
+FROM ${RUNTIME_IMAGE}:a391e31bb33041611e2aa2797debcb21e6f221cd AS runtime
+COPY --from=bindep --link /runtime-dist-packages /runtime-dist-packages
+COPY --from=builder --link /var/lib/openstack /var/lib/openstack
+# hadolint ignore=DL3022
+COPY --from=docker.io/alpine/helm:3.11.2 /usr/bin/helm /usr/local/bin/helm
+# hadolint ignore=DL3022
+COPY --from=gcr.io/go-containerregistry/crane /ko-app/crane /usr/local/bin/crane

From 4c82df06404bd9231625b10c96bc75c7cbb5d1e6 Mon Sep 17 00:00:00 2001
From: ricolin
Date: Thu, 14 Sep 2023 11:34:48 +0800
Subject: [PATCH 03/30] Add BACKUP_INIT task status for queue task

This allows us to re-pull the task status and check whether it still
needs to trigger an action.

--- staffeln/common/constants.py | 1 + staffeln/conductor/backup.py | 7 +++++++ staffeln/conductor/manager.py | 7 ++++++- staffeln/db/sqlalchemy/api.py | 2 +- staffeln/objects/queue.py | 2 +- 5 files changed, 16 insertions(+), 3 deletions(-)

diff --git a/staffeln/common/constants.py b/staffeln/common/constants.py
index 20f462e..b7d6d09 100644
--- a/staffeln/common/constants.py
+++ b/staffeln/common/constants.py
@@ -1,3 +1,4 @@
+BACKUP_INIT = 4
BACKUP_FAILED = 3
BACKUP_COMPLETED = 2
BACKUP_WIP = 1

diff --git a/staffeln/conductor/backup.py b/staffeln/conductor/backup.py
index 19659c2..05f5cfd 100755
--- a/staffeln/conductor/backup.py
+++ b/staffeln/conductor/backup.py
@@ -107,6 +107,13 @@ def get_queues(self, filters=None):
)
return queues

+ def get_queue_task_by_id(self, task_id):
+ """Get a single volume queue task from the queue_data table"""
+ queue = objects.Queue.get_by_id( # pylint: disable=E1120
+ context=self.ctx, id=task_id
+ )
+ return queue
+
def create_queue(self, old_tasks):
"""
Create the queue of all the volumes for backup

diff --git a/staffeln/conductor/manager.py b/staffeln/conductor/manager.py
index 0f876c6..6a3ae7c 100755
--- a/staffeln/conductor/manager.py
+++ b/staffeln/conductor/manager.py
@@ -112,7 +112,12 @@ def _process_todo_tasks(self):
for task in tasks_to_start:
with lock.Lock(self.lock_mgt, task.volume_id) as t_lock:
if t_lock.acquired:
- self.controller.create_volume_backup(task)
+ # Re-pulling status and make it's up-to-date
+ task = self.controller.get_queue_task_by_id(task_id=task.id)
+ if task.backup_status == constants.BACKUP_PLANNED:
+ task.backup_status = constants.BACKUP_INIT
+ task.save()
+ self.controller.create_volume_backup(task)

# Refresh the task queue
def _update_task_queue(self):

diff --git a/staffeln/db/sqlalchemy/api.py b/staffeln/db/sqlalchemy/api.py
index ff2fcfd..3cda5e7 100644
--- a/staffeln/db/sqlalchemy/api.py
+++ b/staffeln/db/sqlalchemy/api.py
@@ -314,7 +314,7 @@ def update_queue(self, id, values):
LOG.error("Queue resource not found.")

def get_queue_by_id(self, context, id):
- """Get the column from queue_data with matching backup_id"""
+ """Get the column from queue_data with matching id"""
return self._get_queue(context, fieldname="id", value=id)

def _get_queue(self, context, fieldname, value):

diff --git a/staffeln/objects/queue.py b/staffeln/objects/queue.py
index c6b1177..db49c21 100644
--- a/staffeln/objects/queue.py
+++ b/staffeln/objects/queue.py
@@ -36,7 +36,7 @@ def list(cls, context, filters=None): # pylint: disable=E0213

@base.remotable_classmethod
def get_by_id(cls, context, id): # pylint: disable=E0213
- """Find a backup based on backup_id
+ """Find a queue task based on id

:param context: Security context. NOTE: This should only be used
internally by the indirection_api. Unfortunately, RPC requires
context as the first

From caf714ba1b5ee8005993629747439148aa2ba0d0 Mon Sep 17 00:00:00 2001
From: ricolin
Date: Thu, 14 Sep 2023 12:56:37 +0800
Subject: [PATCH 04/30] Enable debug for devstack by default

--- hack/stack.sh | 2 ++ 1 file changed, 2 insertions(+)

diff --git a/hack/stack.sh b/hack/stack.sh
index b3ba372..6830d57 100755
--- a/hack/stack.sh
+++ b/hack/stack.sh
@@ -49,6 +49,8 @@ EOF

# Create staffeln configuration file
cat <<EOF > /etc/staffeln/staffeln.conf
+[DEFAULT]
+debug = True
[conductor]
backup_workers = 1
rotation_workers = 1

From cf0312981ec3448bf0b26dfb418e5eb0f4598804 Mon Sep 17 00:00:00 2001
From: ricolin
Date: Mon, 18 Sep 2023 15:57:42 +0800
Subject: [PATCH 05/30] Add tooz file driver support

--- hack/stack.sh | 5 +- staffeln/common/lock.py | 116 +++++++++++++++++++++++++++++++--- staffeln/conductor/manager.py | 4 +- staffeln/conf/conductor.py | 20 +++++- staffeln/conf/database.py | 1 - staffeln/exception.py | 83 ++++++++++++++++++++++++ 6 files changed, 216 insertions(+), 13 deletions(-) create mode 100644 staffeln/exception.py

diff --git a/hack/stack.sh b/hack/stack.sh
index 6830d57..497b29a 100755
--- a/hack/stack.sh
+++ b/hack/stack.sh
@@ -18,6 +18,7 @@ else
fi

# Create DevStack configuration file
+ sudo mkdir /etc/staffeln sudo chown -R "${USER}".
/etc/staffeln
cat <<EOF > /opt/stack/local.conf
@@ -65,8 +66,10 @@ full_backup_depth = 4

[database]
backend = sqlalchemy
connection = "mysql+pymysql://staffeln:password@localhost:3306/staffeln"
-tooz_connection = "mysql://staffeln:password@localhost:3306/staffeln"
mysql_engine = InnoDB
+
+[coordination]
+backend_url = "file:///tmp/staffeln_locks"
EOF

# Create staffeln database

diff --git a/staffeln/common/lock.py b/staffeln/common/lock.py
index 159f60a..8006fd1 100644
--- a/staffeln/common/lock.py
+++ b/staffeln/common/lock.py
@@ -1,18 +1,27 @@
-import staffeln.conf
+import errno
+import glob
+import os
+import re
+import sys
+from typing import Optional # noqa: H301
+import uuid
+
+from oslo_config import cfg
from oslo_log import log
+from oslo_utils import timeutils
from oslo_utils import uuidutils
from tooz import coordination

-CONF = staffeln.conf.CONF
+from staffeln import conf
+from staffeln import exception
+
+CONF = conf.CONF
LOG = log.getLogger(__name__)

class LockManager(object):
- def __init__(self, node_id=None):
- self.db_url = CONF.database.tooz_connection
- self.node_id = uuidutils.generate_uuid() if node_id is None else node_id
- # get_coordinator(backend_url, member_id)
- self.coordinator = coordination.get_coordinator(self.db_url, node_id)
+ def __init__(self):
+ self.coordinator = COORDINATOR

def __enter__(self):
self.coordinator.start()
@@ -23,19 +32,110 @@ def __exit__(self, exc_type, exc_val, exc_tb):

class Lock(object):
- def __init__(self, lock_manager, lock_name):
+ def __init__(self, lock_manager, lock_name, remove_lock=False):
self.lock_manager = lock_manager
self.lock_name = lock_name
self.lock = None
self.acquired = False
+ self.remove_lock = remove_lock

def __enter__(self):
self.lock = self.lock_manager.coordinator.get_lock(self.lock_name)
self.acquired = self.lock.acquire(blocking=False)
if not self.acquired:
LOG.debug(f"Failed to lock for {self.lock_name}")
+ LOG.debug(f"acquired lock for {self.lock_name}")
return self

def __exit__(self, exc_type, exc_val, exc_tb):
if self.acquired:
self.lock.release()
+ LOG.debug(f"released lock for {self.lock_name}")
+ if self.remove_lock:
+ self.lock_manager.coordinator.remove_lock(self.lock_name)
+ LOG.debug(f"removed lock file (if any) for {self.lock_name}")
+
+
+class Coordinator(object):
+ """Tooz coordination wrapper.
+
+ Coordination member id is created from concatenated
+ `prefix` and `agent_id` parameters.
+
+ :param str agent_id: Agent identifier
+ :param str prefix: Used to provide member identifier with a
+ meaningful prefix.
+ """ + + def __init__(self, agent_id: Optional[str] = None, prefix: str = ''): + self.coordinator = None + self.agent_id = agent_id or str(uuid.uuid4()) + self.started = False + self.prefix = prefix + self._file_path = None + + def _get_file_path(self, backend_url): + if backend_url.startswith('file://'): + path = backend_url[7:] + # Copied from TooZ's _normalize_path to get the same path they use + if sys.platform == 'win32': + path = re.sub(r'\\(?=\w:\\)', '', os.path.normpath(path)) + return os.path.abspath(os.path.join(path, self.prefix)) + return None + + def start(self) -> None: + if self.started: + return + + backend_url = CONF.coordination.backend_url + + # member_id should be bytes + member_id = (self.prefix + self.agent_id).encode('ascii') + self.coordinator = coordination.get_coordinator(backend_url, member_id) + assert self.coordinator is not None + self.coordinator.start(start_heart=True) + self._file_path = self._get_file_path(backend_url) + self.started = True + + def stop(self) -> None: + """Disconnect from coordination backend and stop heartbeat.""" + if self.started: + if self.coordinator is not None: + self.coordinator.stop() + self.coordinator = None + self.started = False + + def get_lock(self, name: str): + """Return a Tooz backend lock. + + :param str name: The lock name that is used to identify it + across all nodes. + """ + # lock name should be bytes + lock_name = (self.prefix + name).encode('ascii') + if self.coordinator is not None: + return self.coordinator.get_lock(lock_name) + else: + raise exception.LockCreationFailed('Coordinator uninitialized.') + + def remove_lock(self, glob_name): + # Most locks clean up on release, but not the file lock, so we manually + # clean them. + + def _err(file_name: str, exc: Exception) -> None: + LOG.warning('Failed to cleanup lock %(name)s: %(exc)s', + {'name': file_name, 'exc': exc}) + + if self._file_path: + files = glob.glob(self._file_path + glob_name) + for file_name in files: + try: + os.remove(file_name) + except OSError as exc: + if (exc.errno != errno.ENOENT): + _err(file_name, exc) + except Exception as exc: + _err(file_name, exc) + + +COORDINATOR = Coordinator(prefix='staffeln-') diff --git a/staffeln/conductor/manager.py b/staffeln/conductor/manager.py index 6a3ae7c..6e04e2a 100755 --- a/staffeln/conductor/manager.py +++ b/staffeln/conductor/manager.py @@ -60,7 +60,7 @@ def _process_wip_tasks(self): LOG.debug( f"try to get lock and run task for volume: {queue.volume_id}." 
) - with lock.Lock(self.lock_mgt, queue.volume_id) as q_lock: + with lock.Lock(self.lock_mgt, queue.volume_id, remove_lock=True) as q_lock: if q_lock.acquired: self.controller.check_volume_backup_status(queue) else: # time out @@ -110,7 +110,7 @@ def _process_todo_tasks(self): ) if len(tasks_to_start) != 0: for task in tasks_to_start: - with lock.Lock(self.lock_mgt, task.volume_id) as t_lock: + with lock.Lock(self.lock_mgt, task.volume_id, remove_lock=True) as t_lock: if t_lock.acquired: # Re-pulling status and make it's up-to-date task = self.controller.get_queue_task_by_id(task_id=task.id) diff --git a/staffeln/conf/conductor.py b/staffeln/conf/conductor.py index 8a2d409..33160cf 100755 --- a/staffeln/conf/conductor.py +++ b/staffeln/conf/conductor.py @@ -106,6 +106,19 @@ ), ] + +coordination_group = cfg.OptGroup( + "coordination", + title="Coordination Options", + help=_("Options under this group are used to define Coordination's configuration."), +) + + +coordination_opts = [ + cfg.StrOpt("backend_url", default="", help=_("lock coordination connection backend URL.")), +] + + CONDUCTOR_OPTS = (backup_opts, rotation_opts) @@ -113,7 +126,12 @@ def register_opts(conf): conf.register_group(conductor_group) conf.register_opts(backup_opts, group=conductor_group) conf.register_opts(rotation_opts, group=conductor_group) + conf.register_opts(coordination_opts, group=coordination_group) def list_opts(): - return {"DEFAULT": rotation_opts, conductor_group: backup_opts} + return { + "DEFAULT": rotation_opts, + conductor_group: backup_opts, + coordination_group: coordination_opts + } diff --git a/staffeln/conf/database.py b/staffeln/conf/database.py index e06bf75..761aa15 100644 --- a/staffeln/conf/database.py +++ b/staffeln/conf/database.py @@ -15,7 +15,6 @@ SQL_OPTS = [ cfg.StrOpt("mysql_engine", default="InnoDB", help=_("MySQL engine to use.")), - cfg.StrOpt("tooz_connection", default="", help=_("Tooz MySQL connection URL.")), ] diff --git a/staffeln/exception.py b/staffeln/exception.py new file mode 100644 index 0000000..2faf951 --- /dev/null +++ b/staffeln/exception.py @@ -0,0 +1,83 @@ +# Copyright 2010 United States Government as represented by the +# Administrator of the National Aeronautics and Space Administration. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +"""Staffeln base exception handling.""" + +from typing import Optional, Union # noqa: H301 +from oslo_log import log as logging + + +LOG = logging.getLogger(__name__) + + +class StaffelnException(Exception): + """Base Staffeln Exception + + To correctly use this class, inherit from it and define + a 'message' property. That message will get printf'd + with the keyword arguments provided to the constructor. + + """ + message = "An unknown exception occurred." 
+ code = 500 + headers: dict = {} + safe = False + + def __init__(self, message: Optional[Union[str, tuple]] = None, **kwargs): + self.kwargs = kwargs + self.kwargs['message'] = message + + if 'code' not in self.kwargs: + try: + self.kwargs['code'] = self.code + except AttributeError: + pass + + for k, v in self.kwargs.items(): + if isinstance(v, Exception): + self.kwargs[k] = str(v) + + if self._should_format(): + try: + message = self.message % kwargs + except Exception: + self._log_exception() + message = self.message + elif isinstance(message, Exception): + message = str(message) + + self.msg = message + super(StaffelnException, self).__init__(message) + # Oslo.messaging use the argument 'message' to rebuild exception + # directly at the rpc client side, therefore we should not use it + # in our keyword arguments, otherwise, the rebuild process will fail + # with duplicate keyword exception. + self.kwargs.pop('message', None) + + def _log_exception(self) -> None: + # kwargs doesn't match a variable in the message + # log the issue and the kwargs + LOG.exception('Exception in string format operation:') + for name, value in self.kwargs.items(): + LOG.error("%(name)s: %(value)s", + {'name': name, 'value': value}) + + def _should_format(self) -> bool: + return self.kwargs['message'] is None or '%(message)' in self.message + + +class LockCreationFailed(StaffelnException): + message = "Unable to create lock. Coordination backend not started." From b6641c37c825ca1b657110868e9edb63500659bb Mon Sep 17 00:00:00 2001 From: ricolin Date: Mon, 18 Sep 2023 16:21:21 +0800 Subject: [PATCH 06/30] Fix lint --- staffeln/common/lock.py | 30 ++++++++++++------------------ staffeln/conductor/manager.py | 8 ++++++-- staffeln/conf/conductor.py | 6 ++++-- staffeln/exception.py | 22 +++++++++++----------- 4 files changed, 33 insertions(+), 33 deletions(-) diff --git a/staffeln/common/lock.py b/staffeln/common/lock.py index 8006fd1..5568bfe 100644 --- a/staffeln/common/lock.py +++ b/staffeln/common/lock.py @@ -3,18 +3,13 @@ import os import re import sys -from typing import Optional # noqa: H301 import uuid +from typing import Optional # noqa: H301 -from oslo_config import cfg from oslo_log import log -from oslo_utils import timeutils -from oslo_utils import uuidutils +from staffeln import conf, exception from tooz import coordination -from staffeln import conf -from staffeln import exception - CONF = conf.CONF LOG = log.getLogger(__name__) @@ -67,7 +62,7 @@ class Coordinator(object): meaningful prefix. 
""" - def __init__(self, agent_id: Optional[str] = None, prefix: str = ''): + def __init__(self, agent_id: Optional[str] = None, prefix: str = ""): self.coordinator = None self.agent_id = agent_id or str(uuid.uuid4()) self.started = False @@ -75,11 +70,11 @@ def __init__(self, agent_id: Optional[str] = None, prefix: str = ''): self._file_path = None def _get_file_path(self, backend_url): - if backend_url.startswith('file://'): + if backend_url.startswith("file://"): path = backend_url[7:] # Copied from TooZ's _normalize_path to get the same path they use - if sys.platform == 'win32': - path = re.sub(r'\\(?=\w:\\)', '', os.path.normpath(path)) + if sys.platform == "win32": + path = re.sub(r"\\(?=\w:\\)", "", os.path.normpath(path)) return os.path.abspath(os.path.join(path, self.prefix)) return None @@ -90,7 +85,7 @@ def start(self) -> None: backend_url = CONF.coordination.backend_url # member_id should be bytes - member_id = (self.prefix + self.agent_id).encode('ascii') + member_id = (self.prefix + self.agent_id).encode("ascii") self.coordinator = coordination.get_coordinator(backend_url, member_id) assert self.coordinator is not None self.coordinator.start(start_heart=True) @@ -112,19 +107,18 @@ def get_lock(self, name: str): across all nodes. """ # lock name should be bytes - lock_name = (self.prefix + name).encode('ascii') + lock_name = (self.prefix + name).encode("ascii") if self.coordinator is not None: return self.coordinator.get_lock(lock_name) else: - raise exception.LockCreationFailed('Coordinator uninitialized.') + raise exception.LockCreationFailed("Coordinator uninitialized.") def remove_lock(self, glob_name): # Most locks clean up on release, but not the file lock, so we manually # clean them. def _err(file_name: str, exc: Exception) -> None: - LOG.warning('Failed to cleanup lock %(name)s: %(exc)s', - {'name': file_name, 'exc': exc}) + LOG.warning(f"Failed to cleanup lock {file_name}: {exc}") if self._file_path: files = glob.glob(self._file_path + glob_name) @@ -132,10 +126,10 @@ def _err(file_name: str, exc: Exception) -> None: try: os.remove(file_name) except OSError as exc: - if (exc.errno != errno.ENOENT): + if exc.errno != errno.ENOENT: _err(file_name, exc) except Exception as exc: _err(file_name, exc) -COORDINATOR = Coordinator(prefix='staffeln-') +COORDINATOR = Coordinator(prefix="staffeln-") diff --git a/staffeln/conductor/manager.py b/staffeln/conductor/manager.py index 6e04e2a..380c2f7 100755 --- a/staffeln/conductor/manager.py +++ b/staffeln/conductor/manager.py @@ -60,7 +60,9 @@ def _process_wip_tasks(self): LOG.debug( f"try to get lock and run task for volume: {queue.volume_id}." 
) - with lock.Lock(self.lock_mgt, queue.volume_id, remove_lock=True) as q_lock: + with lock.Lock( + self.lock_mgt, queue.volume_id, remove_lock=True + ) as q_lock: if q_lock.acquired: self.controller.check_volume_backup_status(queue) else: # time out @@ -110,7 +112,9 @@ def _process_todo_tasks(self): ) if len(tasks_to_start) != 0: for task in tasks_to_start: - with lock.Lock(self.lock_mgt, task.volume_id, remove_lock=True) as t_lock: + with lock.Lock( + self.lock_mgt, task.volume_id, remove_lock=True + ) as t_lock: if t_lock.acquired: # Re-pulling status and make it's up-to-date task = self.controller.get_queue_task_by_id(task_id=task.id) diff --git a/staffeln/conf/conductor.py b/staffeln/conf/conductor.py index 33160cf..ff39f13 100755 --- a/staffeln/conf/conductor.py +++ b/staffeln/conf/conductor.py @@ -115,7 +115,9 @@ coordination_opts = [ - cfg.StrOpt("backend_url", default="", help=_("lock coordination connection backend URL.")), + cfg.StrOpt( + "backend_url", default="", help=_("lock coordination connection backend URL.") + ), ] @@ -133,5 +135,5 @@ def list_opts(): return { "DEFAULT": rotation_opts, conductor_group: backup_opts, - coordination_group: coordination_opts + coordination_group: coordination_opts, } diff --git a/staffeln/exception.py b/staffeln/exception.py index 2faf951..e561506 100644 --- a/staffeln/exception.py +++ b/staffeln/exception.py @@ -17,8 +17,8 @@ """Staffeln base exception handling.""" from typing import Optional, Union # noqa: H301 -from oslo_log import log as logging +from oslo_log import log as logging LOG = logging.getLogger(__name__) @@ -27,10 +27,11 @@ class StaffelnException(Exception): """Base Staffeln Exception To correctly use this class, inherit from it and define - a 'message' property. That message will get printf'd + a "message" property. That message will get printf'd with the keyword arguments provided to the constructor. """ + message = "An unknown exception occurred." code = 500 headers: dict = {} @@ -38,11 +39,11 @@ class StaffelnException(Exception): def __init__(self, message: Optional[Union[str, tuple]] = None, **kwargs): self.kwargs = kwargs - self.kwargs['message'] = message + self.kwargs["message"] = message - if 'code' not in self.kwargs: + if "code" not in self.kwargs: try: - self.kwargs['code'] = self.code + self.kwargs["code"] = self.code except AttributeError: pass @@ -61,22 +62,21 @@ def __init__(self, message: Optional[Union[str, tuple]] = None, **kwargs): self.msg = message super(StaffelnException, self).__init__(message) - # Oslo.messaging use the argument 'message' to rebuild exception + # Oslo.messaging use the argument "message" to rebuild exception # directly at the rpc client side, therefore we should not use it # in our keyword arguments, otherwise, the rebuild process will fail # with duplicate keyword exception. 
- self.kwargs.pop('message', None) + self.kwargs.pop("message", None) def _log_exception(self) -> None: # kwargs doesn't match a variable in the message # log the issue and the kwargs - LOG.exception('Exception in string format operation:') + LOG.exception("Exception in string format operation:") for name, value in self.kwargs.items(): - LOG.error("%(name)s: %(value)s", - {'name': name, 'value': value}) + LOG.error(f"{name}: {value}") def _should_format(self) -> bool: - return self.kwargs['message'] is None or '%(message)' in self.message + return self.kwargs["message"] is None or "%(message)" in self.message class LockCreationFailed(StaffelnException): From 82c0ae01df7759e96530c25c1aad4f0d80809898 Mon Sep 17 00:00:00 2001 From: Mohammed Naser Date: Mon, 18 Sep 2023 17:50:24 +0000 Subject: [PATCH 07/30] chore: clean-up and minify image --- Dockerfile | 30 ++++++++++++------------------ 1 file changed, 12 insertions(+), 18 deletions(-) diff --git a/Dockerfile b/Dockerfile index b21c967..59f0e97 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,18 +1,12 @@ -# syntax=docker/dockerfile-upstream:master-labs - -ARG BUILDER_IMAGE=quay.io/vexxhost/openstack-builder-focal -ARG RUNTIME_IMAGE=quay.io/vexxhost/openstack-runtime-focal - -# hadolint ignore=DL3007 -FROM quay.io/vexxhost/bindep-loci:latest AS bindep - -FROM ${BUILDER_IMAGE}:ced4522d9a10ba7172f373289af6dace06be3b36 AS builder -COPY --from=bindep --link /runtime-pip-packages /runtime-pip-packages - -FROM ${RUNTIME_IMAGE}:a391e31bb33041611e2aa2797debcb21e6f221cd AS runtime -COPY --from=bindep --link /runtime-dist-packages /runtime-dist-packages -COPY --from=builder --link /var/lib/openstack /var/lib/openstack -# hadolint ignore=DL3022 -COPY --from=docker.io/alpine/helm:3.11.2 /usr/bin/helm /usr/local/bin/helm -# hadolint ignore=DL3022 -COPY --from=gcr.io/go-containerregistry/crane /ko-app/crane /usr/local/bin/crane +# syntax=docker/dockerfile:1.5 + +FROM python:3.10 AS builder +RUN python3 -m venv /venv +ENV PATH=/venv/bin:$PATH +ADD . 
/src +RUN --mount=type=cache,target=/root/.cache \ + pip install /src + +FROM python:3.10-slim AS runtime +ENV PATH=/venv/bin:$PATH +COPY --from=builder /venv /venv From ff534a095a9be2674a62bf7dcb8fba01f184785f Mon Sep 17 00:00:00 2001 From: Mohammed Naser Date: Mon, 18 Sep 2023 17:57:24 +0000 Subject: [PATCH 08/30] ci: chore: build images to ghcr --- .github/workflows/build.yml | 75 ++++++++++++------------------------- 1 file changed, 23 insertions(+), 52 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 39f1e15..d0afe6b 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -1,67 +1,38 @@ name: build + concurrency: group: ${{ github.head_ref || github.run_id }} cancel-in-progress: true + on: workflow_dispatch: - inputs: - image_push: - type: boolean - description: 'Push images to Container Registry' - required: false - default: false - pull_request: - types: - - opened - - synchronize - - reopened push: branches: - - main + - 'master' + tags: + - 'v*' + pull_request: + branches: + - 'master' + jobs: image: runs-on: ubuntu-latest - strategy: - matrix: - from: - - jammy - release: - - "2023.1" steps: - - name: Install QEMU static binaries - uses: docker/setup-qemu-action@v2 - - name: Configure Buildkit - uses: docker/setup-buildx-action@v2 - - name: Checkout project - uses: actions/checkout@v3 - - name: Setup environment variables - run: echo "PROJECT_REF=${{ github.sha }}" >> "$GITHUB_ENV" - - name: Authenticate with Quay.io - uses: docker/login-action@v2 - if: ${{ github.event_name == 'push' || (github.event_name == 'workflow_dispatch' && inputs.image_push == true) }} + - uses: docker/setup-qemu-action@v3 + - uses: docker/setup-buildx-action@v3 + - uses: docker/metadata-action@v5 + id: meta with: - password: ${{ secrets.QUAY_ROBOT_TOKEN }} - registry: quay.io - username: ${{ secrets.QUAY_USERNAME }} - - name: Build image - uses: docker/build-push-action@v3 + images: ghcr.io/vexxhost/staffeln + - uses: docker/login-action@v3 + if: github.event_name != 'pull_request' with: - build-args: |- - BUILDER_IMAGE=quay.io/vexxhost/openstack-builder-${{ matrix.from }} - RUNTIME_IMAGE=quay.io/vexxhost/openstack-runtime-${{ matrix.from }} - RELEASE=${{ matrix.release }} - PROJECT=staffln - PROJECT_REPO=https://github.com/vexxhost/staffeln - PROJECT_REF=${{ env.PROJECT_REF }} - cache-from: type=gha,scope=${{ matrix.from }}-${{ matrix.release }} - cache-to: type=gha,mode=max,scope=${{ matrix.from }}-${{ matrix.release }} - context: . 
- platforms: linux/amd64 - push: ${{ github.event_name == 'push' || inputs.image_push == true }} - tags: quay.io/vexxhost/staffeln:${{ env.PROJECT_REF }}-${{ matrix.from }} - - name: Promote image - uses: akhilerm/tag-push-action@v2.0.0 - if: github.event_name == 'push' && ((matrix.from == 'focal') || (matrix.from == 'jammy' && matrix.release != 'yoga')) + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + - uses: docker/build-push-action@v5 with: - dst: quay.io/vexxhost/staffeln:${{ matrix.release }} - src: quay.io/vexxhost/staffeln:${{ env.PROJECT_REF }}-${{ matrix.from }} + push: ${{ github.event_name != 'pull_request' }} + labels: ${{ steps.meta.outputs.labels }} + tags: ${{ steps.meta.outputs.tags }} From d87ecd1c81213f592f9f367d02759dd9e336e569 Mon Sep 17 00:00:00 2001 From: Mohammed Naser Date: Mon, 18 Sep 2023 17:58:08 +0000 Subject: [PATCH 09/30] ci: fix branch name --- .github/workflows/build.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index d0afe6b..79c8a20 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -8,12 +8,12 @@ on: workflow_dispatch: push: branches: - - 'master' + - 'main' tags: - 'v*' pull_request: branches: - - 'master' + - 'main' jobs: image: From 77aacf092f39cca08ae08dcacd0a87f3a1d064b4 Mon Sep 17 00:00:00 2001 From: Mohammed Naser Date: Mon, 18 Sep 2023 18:01:52 +0000 Subject: [PATCH 10/30] ci: add checkout --- .github/workflows/build.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 79c8a20..5ca0154 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -21,6 +21,7 @@ jobs: steps: - uses: docker/setup-qemu-action@v3 - uses: docker/setup-buildx-action@v3 + - uses: actions/checkout@v4 - uses: docker/metadata-action@v5 id: meta with: @@ -33,6 +34,7 @@ jobs: password: ${{ secrets.GITHUB_TOKEN }} - uses: docker/build-push-action@v5 with: + context: . push: ${{ github.event_name != 'pull_request' }} labels: ${{ steps.meta.outputs.labels }} tags: ${{ steps.meta.outputs.tags }} From a74b04b3956d2ba3ca68379f8595f9dd7e9b266c Mon Sep 17 00:00:00 2001 From: ricolin Date: Mon, 25 Sep 2023 17:47:37 +0800 Subject: [PATCH 11/30] Using K8s lease for lock Using sherlock lib instead of tooz to support Kubernetes lease. 
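As a rough sketch (not part of this patch) of the locking pattern the
sherlock library enables, with an illustrative lock name, namespace, and
timeouts; only the configure() and KubernetesLock() calls mirror this series:

    # Illustrative sketch only; the values here are assumptions.
    import sherlock

    # Configure lease expiry and acquire timeout globally.
    sherlock.configure(expire=3600, timeout=10)

    # One Kubernetes Lease-backed lock per volume id.
    lock = sherlock.KubernetesLock("staffeln-<volume-id>", "staffeln")
    if lock.acquire(blocking=False):
        try:
            pass  # run the backup task for this volume
        finally:
            lock.release()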
--- requirements.txt | 1 + staffeln/common/lock.py | 43 +++++++++++++++++++++++++++++++++++++++-- 2 files changed, 42 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 8a52b2b..32d5e37 100755 --- a/requirements.txt +++ b/requirements.txt @@ -18,5 +18,6 @@ openstacksdk>0.28.0 pymysql parse tooz # Apache-2.0 +sherlock>=0.4.1 # MIT # email # smtplib diff --git a/staffeln/common/lock.py b/staffeln/common/lock.py index 5568bfe..88772ab 100644 --- a/staffeln/common/lock.py +++ b/staffeln/common/lock.py @@ -7,6 +7,7 @@ from typing import Optional # noqa: H301 from oslo_log import log +import sherlock from staffeln import conf, exception from tooz import coordination @@ -15,8 +16,9 @@ class LockManager(object): - def __init__(self): - self.coordinator = COORDINATOR + def __init__(self, lock_backend="k8s"): + self.coordinator = COORDINATOR if ( + lock_backend == "tooz") else K8SCOORDINATOR def __enter__(self): self.coordinator.start() @@ -131,5 +133,42 @@ def _err(file_name: str, exc: Exception) -> None: except Exception as exc: _err(file_name, exc) +class K8sCoordinator(object): + """Sherlock kubernetes coordination wrapper. + + :param int expire: Set lock expire seconds + :param int timeout: Set lock acquire action timeout seconds + :param str namespace: Set lock namespace. + """ + + def __init__(self, expire: int = 3600, timeout: int = 10, + namespace: str = "staffeln"): + self.timeout = timeout + self.expire = expire + self.namespace = namespace + self.started = False + + def start(self) -> None: + if self.started: + return + sherlock.configure(expire=self.expire, timeout=self.timeout) + self.started = True + + def stop(self) -> None: + """Disconnect from coordination backend and stop heartbeat.""" + pass + + def get_lock(self, name: str): + """Return a kubernetes lease lock. + + :param str name: The lock name that is used to identify it + across all nodes. + """ + return sherlock.KubernetesLock(name, self.namespace) + + def remove_lock(self, glob_name): + pass + COORDINATOR = Coordinator(prefix="staffeln-") +K8SCOORDINATOR = K8sCoordinator() From d72edee12c625f4c13874d9f38b670545144880b Mon Sep 17 00:00:00 2001 From: ricolin Date: Mon, 25 Sep 2023 22:03:49 +0800 Subject: [PATCH 12/30] Fix lint --- .hadolint.yaml | 2 ++ staffeln/common/lock.py | 13 +++++++------ 2 files changed, 9 insertions(+), 6 deletions(-) create mode 100644 .hadolint.yaml diff --git a/.hadolint.yaml b/.hadolint.yaml new file mode 100644 index 0000000..ef55db3 --- /dev/null +++ b/.hadolint.yaml @@ -0,0 +1,2 @@ +ignored: + - DL3020 diff --git a/staffeln/common/lock.py b/staffeln/common/lock.py index 88772ab..244fa72 100644 --- a/staffeln/common/lock.py +++ b/staffeln/common/lock.py @@ -6,8 +6,8 @@ import uuid from typing import Optional # noqa: H301 -from oslo_log import log import sherlock +from oslo_log import log from staffeln import conf, exception from tooz import coordination @@ -16,9 +16,8 @@ class LockManager(object): - def __init__(self, lock_backend="k8s"): - self.coordinator = COORDINATOR if ( - lock_backend == "tooz") else K8SCOORDINATOR + def __init__(self, backend="k8s"): + self.coordinator = COORDINATOR if backend == "tooz" else K8SCOORDINATOR def __enter__(self): self.coordinator.start() @@ -133,6 +132,7 @@ def _err(file_name: str, exc: Exception) -> None: except Exception as exc: _err(file_name, exc) + class K8sCoordinator(object): """Sherlock kubernetes coordination wrapper. @@ -141,8 +141,9 @@ class K8sCoordinator(object): :param str namespace: Set lock namespace. 
""" - def __init__(self, expire: int = 3600, timeout: int = 10, - namespace: str = "staffeln"): + def __init__( + self, expire: int = 3600, timeout: int = 10, namespace: str = "staffeln" + ): self.timeout = timeout self.expire = expire self.namespace = namespace From 108b68678f86217241b056babddb91120c3f0d6f Mon Sep 17 00:00:00 2001 From: ricolin Date: Mon, 25 Sep 2023 22:19:06 +0800 Subject: [PATCH 13/30] Temprory disable docker hadolint --- .github/workflows/linters.yaml | 3 ++- .hadolint.yaml | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/linters.yaml b/.github/workflows/linters.yaml index 193eabe..6745ffe 100644 --- a/.github/workflows/linters.yaml +++ b/.github/workflows/linters.yaml @@ -1,5 +1,5 @@ name: linters -on: push +on: push jobs: super-lint: @@ -11,5 +11,6 @@ jobs: DEFAULT_BRANCH: main GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} VALIDATE_ALL_CODEBASE: true + VALIDATE_DOCKERFILE_HADOLINT: false VALIDATE_PYTHON_MYPY: false VALIDATE_JSCPD: false diff --git a/.hadolint.yaml b/.hadolint.yaml index ef55db3..f09cb61 100644 --- a/.hadolint.yaml +++ b/.hadolint.yaml @@ -1,2 +1,3 @@ +--- ignored: - DL3020 From d834dcb399b4f158e5dcff0099386e2b29895b24 Mon Sep 17 00:00:00 2001 From: ricolin Date: Mon, 25 Sep 2023 22:39:20 +0800 Subject: [PATCH 14/30] Judge lock backend by config backend_url --- staffeln/common/lock.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/staffeln/common/lock.py b/staffeln/common/lock.py index 244fa72..d5d6134 100644 --- a/staffeln/common/lock.py +++ b/staffeln/common/lock.py @@ -16,8 +16,11 @@ class LockManager(object): - def __init__(self, backend="k8s"): - self.coordinator = COORDINATOR if backend == "tooz" else K8SCOORDINATOR + def __init__(self): + backend_url = CONF.coordination.backend_url + # This is for now using to check if any backend_url setup + # for tooz backends as K8s should not need one.any + self.coordinator = COORDINATOR if backend_url else K8SCOORDINATOR def __enter__(self): self.coordinator.start() From 067a6ca066d974178a98c65f026204f57893a925 Mon Sep 17 00:00:00 2001 From: ricolin Date: Tue, 26 Sep 2023 14:59:22 +0800 Subject: [PATCH 15/30] Add kubernetes to requirements --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 32d5e37..3789929 100755 --- a/requirements.txt +++ b/requirements.txt @@ -19,5 +19,6 @@ pymysql parse tooz # Apache-2.0 sherlock>=0.4.1 # MIT +kubernetes # Apache-2.0 # email # smtplib From 9eb7e113f1c8d22e145d9c4fdff77559e5ce8b25 Mon Sep 17 00:00:00 2001 From: ricolin Date: Wed, 27 Sep 2023 16:25:01 +0800 Subject: [PATCH 16/30] Fix debug log --- staffeln/common/lock.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/staffeln/common/lock.py b/staffeln/common/lock.py index d5d6134..4d60ab1 100644 --- a/staffeln/common/lock.py +++ b/staffeln/common/lock.py @@ -43,7 +43,8 @@ def __enter__(self): self.acquired = self.lock.acquire(blocking=False) if not self.acquired: LOG.debug(f"Failed to lock for {self.lock_name}") - LOG.debug(f"acquired lock for {self.lock_name}") + else: + LOG.debug(f"acquired lock for {self.lock_name}") return self def __exit__(self, exc_type, exc_val, exc_tb): From 8f98ab0bbecd2b602949a1bb2793967a7ba062d6 Mon Sep 17 00:00:00 2001 From: ricolin Date: Fri, 29 Sep 2023 23:08:21 +0800 Subject: [PATCH 17/30] Using `openstack` namespace for k8s lease --- staffeln/common/lock.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/staffeln/common/lock.py 
b/staffeln/common/lock.py index 4d60ab1..4c05626 100644 --- a/staffeln/common/lock.py +++ b/staffeln/common/lock.py @@ -146,12 +146,13 @@ class K8sCoordinator(object): """ def __init__( - self, expire: int = 3600, timeout: int = 10, namespace: str = "staffeln" + self, expire: int = 3600, timeout: int = 10, namespace: str = "openstack" ): self.timeout = timeout self.expire = expire self.namespace = namespace self.started = False + self.prefix = "staffeln-" def start(self) -> None: if self.started: @@ -169,7 +170,7 @@ def get_lock(self, name: str): :param str name: The lock name that is used to identify it across all nodes. """ - return sherlock.KubernetesLock(name, self.namespace) + return sherlock.KubernetesLock(self.prefix + name, self.namespace) def remove_lock(self, glob_name): pass From 9812442b4abf4370530f40d1aa1bc8bf27a6c0b9 Mon Sep 17 00:00:00 2001 From: Michiel Piscaer Date: Tue, 14 Nov 2023 08:52:07 +0100 Subject: [PATCH 18/30] correct typo in the backup_service_period help message --- staffeln/conf/conductor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/staffeln/conf/conductor.py b/staffeln/conf/conductor.py index ff39f13..ab8e258 100755 --- a/staffeln/conf/conductor.py +++ b/staffeln/conf/conductor.py @@ -21,7 +21,7 @@ "backup_service_period", default=1800, min=60, - help=_("The time of bakup period, the unit is one second."), + help=_("The time of backup period, the unit is one second."), ), cfg.IntOpt( "backup_min_interval", From 9cc29ff05c9b7160332260b6ba55993bee439375 Mon Sep 17 00:00:00 2001 From: ricolin Date: Wed, 15 Nov 2023 00:54:34 +0800 Subject: [PATCH 19/30] Update README --- README.md | 357 ++++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 255 insertions(+), 102 deletions(-) diff --git a/README.md b/README.md index f707938..88aa5f1 100755 --- a/README.md +++ b/README.md @@ -2,111 +2,264 @@ ## Project Description -This solution is a volume-level scheduled backup to implement a non-intrusive automatic backup for Openstack VMs. +This solution is a volume-level scheduled backup to implement a non-intrusive +automatic backup for Openstack VMs. All volumes attached to the specified VMs are backed up periodically. -File-level backup will not be provided. The volume can be restored and attached to the target VM to restore any needed files. Users can restore through Horizon or the cli in self-service. - -## Functions - -### Function Overview - -The solution backs up all volumes attached to VMs which have a predefined metadata set, for -example, `backup=yes`. -First, it gets the list of VMs which have backup metadata and the list of volumes attached to the -VMs in the given project by consuming the Openstack API (nova-api and cinder-api). Once the -volume list is prepared, then it consumes cinder-backup API to perform the backup. -Once the backup is successful, the backup time is updated in the metadata - `last-backup-time` of -the VM. - -* *Filtering volumes:* It skips specific volumes if the volume metadata includes a specific -`skip-volume-backup` flag. -* *Limitation:* The number of volumes which users can backup is limited. Once the backup -count exceeds the quota which is defined per project, the backup job would fail. -* *Naming convention:* The backup volume name would be -{VOLUME_NAME}-{BACKUP_DATE}. -* Compression: all backup volumes are compressed at the ceph level. The compression -mode, compression algorithm and required parameters are configured by the user. +File-level backup will not be provided. 
The volume can be restored and attached
+to the target VM to restore any needed files. Users can restore through Horizon
+or the CLI in self-service.
+
+## Staffeln Conductor Functions
+
+The Staffeln conductor manages all periodic tasks such as backup, retention,
+and notification. It is possible to have multiple staffeln conductor services
+running, but only one service at a time pulls volume and server information
+from OpenStack and schedules backups. All conductors, on the other hand, can
+take scheduled backup tasks, run the backups, and check for backups to
+complete. For a single volume, only one backup task is generated, and only one
+staffeln conductor service can pick up that task at the same time. The same
+applies to retention tasks.
+
+### Backup
+
+Staffeln is a service that helps perform backups. With the provided
+authorization, Staffeln builds a volume list by going through the instance
+list from OpenStack and finding instances that have `backup_metadata_key`
+(configured under the `[conductor]` section in `/etc/staffeln/staffeln.conf`)
+defined in their metadata and volumes attached. It collects those attached
+volumes into a list, uses that volume list to generate volume backup tasks in
+Staffeln, and then runs the backups and checks work-in-progress backups
+accordingly. With role control, only one Staffeln service can perform volume
+collection and backup task scheduling at any given time, but all services can
+run backup actions and check progress in parallel. Backup schedule trigger
+times are controlled by periodic jobs separately across all Staffeln nodes. In
+Staffeln V1 it was possible for a follow-up backup plan to start from a
+different node shortly after a previous successful backup (less than
+`backup_service_period` apart), but this is fixed by the `backup_min_interval`
+config. In either case, the full and incremental backup order (configured with
+`full_backup_depth`) is still honored. `backup_min_interval` is the minimum
+number of seconds you want between backups of the same volume from Staffeln.
+The configuration `full_backup_depth` under the `[conductor]` section in
+`/etc/staffeln/staffeln.conf` decides how incremental backups are performed.
+If `full_backup_depth` is set to 1, each full backup is followed by only one
+incremental backup, and by two incrementals if `full_backup_depth` is set to
+2. Set it to `0` if you want only full backups.
+
+To avoid long-stuck backup actions, the `backup_cycle_timout` config (the
+option name is spelled this way upstream) should be set to a reasonable time:
+long enough for backups to complete, but short enough to judge that a backup
+process is stuck. When a backup process reaches this timeout, Staffeln removes
+the backup task and tries to delete the volume backup. A follow-up backup
+object (marked as not completed) is created with its creation time set to 10
+years in the past, so the removal progress is observed and retried on the next
+retention job.
+
+`backup_service_period` is no longer the only factor that determines how often
+a volume is backed up. It is recommended to set `backup_min_interval` and
+`report_period` (see the Report part) and to configure a comparatively shorter
+`backup_service_period`; an illustrative config sketch follows at the end of
+this section. For example, if we set `backup_min_interval` to 3600 seconds and
+`backup_service_period` to 600 seconds, the backup job will trigger roughly
+every 10 minutes, but only create a new backup when the previous backup of the
+same volume was created more than 1 hour ago.

### Retention

-Based on the configured retention policy, the volumes are removed.
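As a rough illustration (not from the docs above) of how the Backup options
just described fit together, a `/etc/staffeln/staffeln.conf` fragment might
look like the following; all values are examples only:

```ini
[conductor]
# Wake the backup scheduler every 10 minutes.
backup_service_period = 600
# Back up the same volume at most once per hour.
backup_min_interval = 3600
# Each full backup is followed by up to two incrementals; 0 means full-only.
full_backup_depth = 2
# Treat a backup as stuck after this long (option name as spelled upstream).
backup_cycle_timout = 5min
```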
-Openstack API access policies are customized to make only the retention service be able to delete -the backups and users not. - -### Scaling - -Cinder backup service is running on the dedicated backup host and it can be scaled across multiple -backup hosts. - -### Notification - -Once the backup is finished, the results are notified to the specified users by email regardless of -whether it was successful or not (the email will be one digest of all backups). -Backup result HTML Template -- Backup time -- Current quota usage(Quota/used number/percentage) with proper colors - - 50% <= Quota usage : Green - - 80% > Quota > 50% usage : Yellow - - Quota usage > 80% : Red -- Volume list -- Success/fail: true/false with proper colors - - Fail: Red - - Success: Green -- Fail reason - -### Settings - -Users can configure the settings to control the backup process. The parameters are; -- Backup period -- Volume filtering tag -- Volume skip filter metadata tag -- Volume limit number -- Retention time -- Archival rules -- Compression mode, algorithm and parameters -- Notification receiver list -- Notification email HTML template -- Openstack Credential - -### User Interface - -- Users can get the list of backup volumes on the Horizon cinder-backup panel. This panel -has filtering and pagination functions which are not default ones of Horizon. -- Users cannot delete the volumes on the UI. “Delete Volume Backup” button is disabled on -the cinder-backup panel. - -## Dependencies - -* openstacksdk (API calls) -* Flask (HTTP API) -* oslo.service (long-running daemon) -* pbr (using setup.cfg for build tooling) -* oslo.db (database connections) -* oslo.config (configuration files) - - -## Architecture - -### HTTP API (staffeln-api) - -This project will need a basic HTTP API. The primary reason for this is because when a user will attempt to delete a backup, we will use [oslo.policy via HTTP](https://docs.openstack.org/oslo.policy/victoria/user/plugins.html) to make sure that the backup they are attempting to delete is not an automated backup. - -This API will be unauthenticated and stateless, due to the fact that it is simply going to return the plain-text string True or fail with 401 Unauthorized. Because of the simplicity of this API, [Flask](https://flask.palletsprojects.com/en/1.1.x/) is an excellent tool to be able to build it out. - -The flow of the HTTP call will look like the following: - -1. HTTP request received through oslo.policy when backup being deleted with ID -2. Server look up backup ID using OpenStack API -3. If backup metadata contains `__automated_backup=True` then deny, otherwise allow. - -With that flow, we’ll be able to protect automated backups from being deleted automatically. In order to build a proper architecture, this application will be delivered as a WSGI application so it can be hosted via something like uWSGI later. - -### Daemon (staffeln-conductor) - -The conductor will be an independent daemon that will essentially scan all the virtual machines (grouped by project) which are marked to have automatic backups and then automatically start queueing up backups for them to be executed by Cinder. - -Once backups for a project are done, it should be able to start running the rotation policy that is configured on all the existing volumes and then send out a notification email afterwards to the user. - -The daemon should be stateful and ensure that it has its own state which is stored inside of a database. 
+On retention, backups whose creation time is older than the retention time
+(defined by `retention_time` from `/etc/staffeln/staffeln.conf` or by the
+`retention_metadata_key` added to the metadata of instances) are put in a list
+that Staffeln then tries to delete. Note: the actual key value of
+`retention_metadata_key` is customizable. As in the test doc, you can see the
+following property added to an instance: ` --property
+__staffeln_retention=20min`. A customized `retention_metadata_key` takes
+priority over `retention_time`; if no `retention_metadata_key` is defined for
+an instance, `retention_time` is used. When incremental backups exist,
+retention honors the full and incremental backup order. That means some
+backups might stay longer than their designed retention time while incremental
+backups still depend on earlier backups; the chain stops when the next full
+backup is created. Retention only deletes a backup object from the Staffeln DB
+when the backup is not found in the Cinder backup service.
+
+To honor backup dependencies, once the retention list for one volume is
+collected, retention starts by deleting the most recently created backup and
+works through that order down to the earliest created one. However, since
+Cinder might not honor the delete request order, it is possible that some
+delete requests in that situation fail. Staffeln will try to delete those
+failed requests on the next periodic run.
+
+It is recommended to configure `retention_time` according to your default
+retention needs, and to set up `retention_metadata_key` and update instance
+metadata to schedule the actual retention for the volumes of each instance.
+`retention_service_period` only triggers the check for whether any backups
+should be deleted, so there is no need to set it to a very long period of
+time.
+
+### Report
+
+The report process is part of the backup cron job. When one Staffeln service
+holds the backup schedule role and the backup scheduling, triggering, and
+work-in-progress checks are done for this period, it checks whether any
+succeeded or failed backup task has stayed unreported for `report_period`
+seconds after it was created; if so, it triggers the report process.
+`report_period` is defined under `[conductor]` with seconds as the unit. The
+report generates an HTML-formatted string with the quota and the succeeded and
+failed backup task lists, with proper HTML color formatting, for each specific
+project that has a succeeded or failed backup to report. How the report is
+sent depends on your config and environment.
+
+If email sending fails, Staffeln does not send that report but logs a message
+about the email failure, and it tries to regenerate and resend the report on
+the next periodic cycle. On the other hand, you can leave `sender_email` unset
+and make the report go directly to the logs. If you have specific email
+addresses you wish to send to instead of using the project name, you can
+provide the `receiver` config so all project reports are sent to the receiver
+list instead. If neither `receiver` nor `project_receiver_domain` is set, the
+project report will try to fetch the project member list and gather user
+emails to send the report to. If no user email can be found among the project
+members, Staffeln skips this report cycle and retries on the next one. Note
+that, to improve Staffeln performance and to reduce old backup results
+lingering in the Staffeln DB, properly configuring email is recommended.
+Otherwise, leaving the sender information unset and sending the reports to the
+logs can be considered.
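A sketch of the reporting options mentioned above; the values are
illustrative, and the exact layout of the email options is an assumption, so
check the option definitions under `staffeln/conf` for the authoritative names
and sections:

```ini
[conductor]
# A succeeded or failed task is reported once it has stayed unreported
# for this many seconds.
report_period = 86400
# Assumed email settings; leaving sender_email unset sends reports to
# the logs instead (verify against staffeln/conf).
# sender_email = staffeln@cloud.example.com
# receiver = ops@example.com
```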
+When a report is successfully sent to email or logs for a specific project,
+all succeeded/failed tasks for that project are purged from Staffeln.
+
+The report interval might run a bit longer than `report_period`, depending on
+the backup service interval and previous backup work. For example, each time
+the backup schedule role is granted, the service starts all the backup
+schedule work and also checks backup-in-progress tasks with the other staffeln
+services. Counting the cron job sleep interval, the report time might take
+longer than what is configured in `report_period`, but it will never come
+earlier than `report_period`.
+
+As for the report format: it is written in HTML and categorized by project.
+Information from all projects is collected into one report, which is sent
+through email or directly to the log. For each project, it provides the
+project name, quota status, backup succeeded list, and backup failed list,
+followed by the second project and so on.
+
+### Staffeln-API
+
+The Staffeln API service lets us define a Cinder policy check to make sure
+Cinder volume backups are deleted only when the backup is not managed by
+Staffeln. Once the staffeln api service is up, you can define a policy similar
+to the following in `/etc/cinder/policy.yaml`: "backup:delete" :
+"rule:admin_api or (project_id:%(project_id)s and
+http://Staffeln-api-url:8808/v1/backup?backup_id=%(id)s)"
+
+When a backup does not exist in staffeln, that API returns TRUE and makes the
+policy allow the backup delete; otherwise it returns False, and in the case
+above the backup delete is then only allowed for admins.
+
+## Settings
+
+Users can configure settings to control the backup process. Most functions are
+controlled through configuration. You can find all configuration options under
+https://github.com/vexxhost/staffeln/tree/main/staffeln/conf
+
+Define them in `/etc/staffeln/staffeln.conf` before restarting the
+staffeln-conductor service.
+
+## User Interface
+
+Users can get the list of backup volumes on the Horizon cinder-backup panel.
+This panel has filtering and pagination functions which are not default ones
+of Horizon. Users cannot delete the volumes in the UI if the “Delete Volume
+Backup” button is disabled on the cinder-backup panel in Horizon.
+
+## Service dependencies
+
+* openstacksdk that can reach Cinder, Nova, and Keystone
+
+  Staffeln heavily depends on Cinder backup, so make sure that the Cinder
+  Backup service is stable. On the other hand, the number of backup create or
+  delete requests can get high when staffeln processes a large amount of
+  volume backups; it is possible that an API request is not processed well or
+  that the request order gets mixed up. For backup deletion, Staffeln might
+  not be able to delete a backup right away if any step failed (for example,
+  the full backup delete request was sent to Cinder but a dependent
+  incremental backup delete request was not), but it will keep that backup
+  resource in Staffeln and try to delete it again in a later periodic job.
+  Avoiding unnecessarily frequent backup/retention intervals will help
+  maintain the overall performance of Cinder.
+
+  Make sure the metadata keys configured through `backup_metadata_key` and
+  `retention_metadata_key` do not conflict with any other services or users
+  using Nova metadata.
+
+* kubernetes lease (default lock backend)
+
+  Staffeln depends on the kubernetes lease, which allows multiple services to
+  work together.
+
+## Authentication dependencies
+
+Staffeln by default uses regular openstack authentication methods; the file
+`/etc/staffeln/openrc` is usually the authentication file.
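An illustrative `/etc/staffeln/openrc` sketch; every value below is a
placeholder, not a real endpoint or credential:

```shell
# Placeholder credentials: substitute your own cloud's values.
export OS_AUTH_URL=https://keystone.example.com:5000/v3
export OS_PROJECT_NAME=admin
export OS_USERNAME=staffeln
export OS_PASSWORD=secret
export OS_USER_DOMAIN_NAME=Default
export OS_PROJECT_DOMAIN_NAME=Default
```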
+Staffeln heavily depends on authentication. Make sure the authentication
+method you provide carries the following authorizations in OpenStack:
+
+* token authentication
+* get user id
+* set authentication project
+* get project list
+* get server list
+* get volume
+* get backup
+* create backup
+* create barbican secret (this might be required for backup create)
+* delete backup
+* delete barbican secret (this might be required for backup delete)
+* get backup quota
+* get volume quota
+* get user
+* get role assignments
+
+Notice that all the authorization required by the above operations in
+OpenStack services might also need to be granted to the login user. It is
+possible to switch authentication when restarting the Staffeln service, but
+doing so might lead to unstoppable backup failures (with unauthorized
+warnings) that will not block the services from running. You can resolve the
+warning by manually deleting the backup.
+
+Note: don’t use different authorizations for multiple Staffeln services
+across nodes. As with all other OpenStack services, that has a chance of
+leading to unexpected behavior. For example, Staffeln on one node is done
+with the backup schedule plan and Staffeln on another node picks it up and
+proceeds with it; that might be followed by a create failure from Cinder and
+a warning log pop-up with nothing achieved.
+
+## Commands
+
+List of available commands:
+
+* staffeln-conductor: start the main Staffeln backup service
+* staffeln-api: start the Staffeln API service
+* staffeln-db-manage create_schema
+* staffeln-db-manage upgrade head
+
+## Simple verify
+
+After Staffeln is installed, the first thing to do is to check the Staffeln
+service logs to see that it is running well.
+
+Next we need something to back up. In the test scenario we will use cirros or
+any other small image to observe the behavior. Prepare your test OpenStack
+environment with the following steps: make sure the cinder backup service is
+running, and make sure the openrc used by Staffeln (usually
+`/etc/staffeln/openrc`) provides the required authorization shown in the
+`Authentication` section. Then run:
+
+openstack volume create --size 1 --image {IMAGE_ID} test-volume
+openstack server create --flavor {FLAVOR_ID} --volume {VOLUME_ID} \
+  --property __staffeln_backup=true --property __staffeln_retention=20min \
+  --network {NETWORK_ID} staffeln-test
+openstack volume create --size 1 --image {IMAGE_ID} test-volume-no-retention
+openstack server create --flavor {FLAVOR_ID} --volume {VOLUME_ID} \
+  --property __staffeln_backup=true --network {NETWORK_ID} \
+  staffeln-test-no-retention
+
+Now you can watch the result with `watch openstack volume backup list` to
+check and observe how the backup is going.
+
+Staffeln largely depends on the configuration and authentication provided.
+When testing, make sure to reference the configurations listed above in each
+section. A service restart is required to make a configuration or
+authentication change take effect. If using systemd, you can use this
+example: `systemctl restart staffeln-conductor staffeln-api`
+
+Be aware that if you have multiple nodes running Staffeln, the backup or
+retention check might look a bit random at times, because it depends entirely
+on how the periodic period is configured on each node, and also on how long
+each node spent processing the previous cron job.
+
+Email report testing heavily depends on how your email system works. Staffeln
+might behave differently, or even raise errors, if your system does not
+support its current process. The email part can be tested directly against
+Gmail if you like, using an application password to let Python send mail
+through Google’s SMTP; a standalone sketch of that follows.
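+
+A minimal standalone sketch for checking that SMTP delivery works at all.
+This is not Staffeln's internal email code; the hostnames, addresses, and the
+application password are placeholders:
+
+```python
+# Sends a small HTML test mail through Gmail's SMTP with an app password.
+import smtplib
+from email.mime.text import MIMEText
+
+msg = MIMEText("<b>Staffeln report test</b>", "html")
+msg["Subject"] = "Staffeln report test"
+msg["From"] = "sender@gmail.com"
+msg["To"] = "receiver@example.com"
+
+with smtplib.SMTP_SSL("smtp.gmail.com", 465) as server:
+    server.login("sender@gmail.com", "app-password")  # Gmail app password
+    server.send_message(msg)
+```
+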
+To test the actual sending path, you can also directly import the email
+module from Staffeln and use it as a testing method.
+
+To verify the setting for staffeln-api, you can use direct API calls to check
+that the backup check is running properly, through `curl -X POST
+Staffeln-api-url:8808/v1/backup?backup_id=BACKUP_ID` or `wget --method=POST
+Staffeln-api-url:8808/v1/backup?backup_id=BACKUP_ID`.
+
+It should return TRUE when BACKUP_ID does not exist in Staffeln, else FALSE.

From dce1cae3ce272e9d77b78185412e83159898cde0 Mon Sep 17 00:00:00 2001
From: ricolin
Date: Wed, 15 Nov 2023 01:17:37 +0800
Subject: [PATCH 20/30] fix readme lint

---
 README.md | 121 +++++++++++++++++++++++++++++-------------------------
 1 file changed, 65 insertions(+), 56 deletions(-)

diff --git a/README.md b/README.md
index 88aa5f1..772009c 100755
--- a/README.md
+++ b/README.md
@@ -14,13 +14,13 @@ or the cli in self-service.

## Staffeln Conductor Functions

Staffeln conductor manage all perodic tasks like backup, retention, and
-notification. It's possible to have multiple staffeln conductor services
-running. There will only be one service pulling volume and server information
-from OpenStack and schedule backups. All conductor on the other hand, will be
+notification. It's possible to have multiple staffeln conductor services
+running. There will only be one service pulling volume and server information
+from OpenStack and schedule backups. All conductor on the other hand, will be
able to take scheduled backup tasks and run backups and also check for backup
-to completed. For single volume, only one backup task will be generated, and
+to completed. For single volume, only one backup task will be generated, and
only one of staffeln conductor service will be able to pick up that task that
-the same time. Same as retention tasks.
+the same time. Same as retention tasks.

### Backup

Staffeln is a service to help perform backup. What it does is with provided
authorization, Staffeln find a volume list with go through instance list from
OpenStack, and find instances has `backup_metadata_key` (which configured under
`[conductor]` section in `/etc/staffeln/staffeln.conf`) defined in metadata and
-got volume attached. Collect those attached volumes into a list. Follow by
-using that volume list to generate volume backup tasks in Staffeln. And do
-backups, check work-in-progress backups accordingly. With role control, there
+got volume attached. Collect those attached volumes into a list. Follow by
+using that volume list to generate volume backup tasks in Staffeln. And do
+backups, check work-in-progress backups accordingly. With role control, there
is only one Staffeln service that can perform volume collection and backup task
schedules at the same time. But all services can do backup action, and check
-progress in parallel. Backup schedule trigger time is controlled by periodic
-jobs separately across all Staffeln nodes. It’s possible a following backup
+progress in parallel. Backup schedule trigger time is controlled by periodic
+jobs separately across all Staffeln nodes. It’s possible a following backup
plan starts from a different node near previous success backup (with less than
`backup_service_period` of time) in Staffeln V1, but it’s fixed with
`backup_min_interval` config. And in either case, the Full and Incremental
backup order (config with `full_backup_depth`) is still be honored.
`backup_min_interval` is value that you config for how many seconds you like as -minimum interval between backups for same volume from Staffeln. The -configuration `full_backup_depth` under `[conductor]` section in +minimum interval between backups for same volume from Staffeln. The +configuration `full_backup_depth` under `[conductor]` section in `/etc/staffeln/staffeln.conf` will decide how incremental backups are going to perform. If `full_backup_depth` is set to 1. For each Full backup will follow by only one incremental backup(not counting ). And 2 incremental if @@ -68,24 +68,25 @@ backup for same volume created for more than 1 hours ago. On retention, backups which has creation time longer than retention time (defined by `retention_time` from `/etc/staffeln/staffeln.conf` or `retention_metadata_key` which added to metadata of instances) will put in list -and try to delete by Staffeln. Note: the actual key value of +and try to delete by Staffeln. Note: the actual key-value of `retention_metadata_key` is customizable. Like in test doc, you can see -following property been added to instance ` --property -__staffeln_retention=20min`. Customized `retention_metadata_key` has larger +following property been added to instance +`--property __staffeln_retention=20min`. +Customized `retention_metadata_key` has larger priority than `retention_time`. If no `retention_metadata_key` defined for -instance, `retention_time` will be used. With incremental backup exist, -retention will honored full and incremental backup order. That means some +instance, `retention_time` will be used. With incremental backup exist, +retention will honored full and incremental backup order. That means some backups might stay longer than it’s designed retention time as there are incremental backups depends on earlier backups. The chain will stop when next -full backup created. Now retention only delete backup object from Staffeln DB +full backup created. Now retention only delete backup object from Staffeln DB when backup not found in Cinder backup service. For honor backups dependencies. When collected retention list for one volumes, retention will start delete the later created one. And go through that order -till the very early created one. However, as Cinder might not honor the delete +till the very early created one. However, as Cinder might not honor the delete request order. It’s possible that some of delete request in that situation might failed. In Staffeln, will try to delete those failed request in next -periodic time. +periodic time. It’s recommended to config `retention_time` according your default retention needs, and well setup `retention_metadata_key` and update instance metadata to @@ -99,25 +100,25 @@ Report process is part of backup cron job. When one of Staffeln service got backup schedule role and finish with backup schedule, trigger, and check work in progress backup are done in this period. It will check if any successed or failed backup task has not been reported for `report_period` seconds after it -created. It will trigger the report process. `report_period` is defined under -`[conductor]` with unit to seconds. Report will generate an html format of -string with quota, success and failed backup task list with proper html color +created. It will trigger the report process. `report_period` is defined under +`[conductor]` with unit to seconds. 
Report will generate an HTML format of +string with quota, success and failed backup task list with proper HTML color format for each specific project that has success or failed backup to report. As for how the report will sent is base on your config and environment. And if email sending failed, it will not send that report but provide message for email failed in log. Staffeln will try to regenerate and resent report on -next periodic cycle. On the other hand, you can avoid config `sender_email` -from above, and make the report goes to logs directly. If you have specific +next periodic cycle. On the other hand, you can avoid config `sender_email` +from above, and make the report goes to logs directly. If you have specific email addresses you wish to send to instead of using project name. You can provide `receiver` config so it will send all project report to receiver list -instead. And if neither `recveiver` or `project_receiver_domain` are set, the +instead. And if neither `recveiver` or `project_receiver_domain` are set, the project report will try to grap project member list and gather user emails to send report to. If no user email can be found from project member, Staffeln -will ignore this report cycle and retry the next cycle. Notice that, to +will ignore this report cycle and retry the next cycle. Notice that, to improve Staffeln performance and to reduce old backup result exist in Staffeln DB, properly config email is recommended. Otherwise, not config any sender -information and make the reports goes to logs can be considered. When report +information and make the reports goes to logs can be considered. When report successfully sent to email or logs for specific project. all success/failed tasks for that project will be purged from Staffeln. @@ -128,7 +129,7 @@ in progress tasks with other staffeln services. And counting cron job sleep interval, the report time might take longer than what configed in `report_period`. But it will never goes earlier than `report_period`. -For report format. It’s written in html format and categorized by projects. +For report format. It’s written in HTML format and categorized by projects. Collect information from all projects into one report, and sent it through email or directly to log. And in each project, will provide information about project name, quote status, backup succeeded list, and backup failed list. And @@ -138,21 +139,22 @@ follow by second project and so on. Staffeln API service allows we defined cinder policy check and make sure all Cinder volume backups are deleted only when that backup is not makaged by -Staffeln. Once staffeln api service is up. You can define similar policy as -following to `/etc/cinder/policy.yaml`: "backup:delete" : "rule:admin_api or -(project_id:%(project_id)s and +Staffeln. Once staffeln API service is up. You can define similar policy as +following to `/etc/cinder/policy.yaml`: +``` +"backup:delete" : "rule:admin_api or (project_id:%(project_id)s and http://Staffeln-api-url:8808/v1/backup?backup_id=%(id)s)" +``` And when backup not exist in staffeln, that API will return TRUE and make the -policy allows the backup delete. Else will return False and only allow backup +policy allows the backup delete. Else will return False and only allow backup delete when it's admin in above case. ## Settings -Users can configure the settings to control the backup process. Most of -functions are controlled through configurations. 
You will be able to find all -configurations under -https://github.com/vexxhost/staffeln/tree/main/staffeln/conf +Users can configure the settings to control the backup process. Most of +functions are controlled through configurations. You will be able to find all +configurations under `staffeln/conf/*` And defined them in `/etc/staffeln/staffeln.conf` before restart staffeln-conductor service. @@ -161,7 +163,7 @@ staffeln-conductor service. Users can get the list of backup volumes on the Horizon cinder-backup panel. This panel has filtering and pagination functions which are not default ones of -Horizon. Users cannot delete the volumes on the UI if “Delete Volume Backup” +Horizon. Users cannot delete the volumes on the UI if “Delete Volume Backup” button is disabled on the cinder-backup panel from horizon. ## Service dependencies @@ -169,14 +171,14 @@ button is disabled on the cinder-backup panel from horizon. * openstacksdk that can reach to Cinder, Nova, and Keystone Staffeln heavily depends on Cinder backup. So need to make sure that Cinder - Backup service is stable. On the other hand, as backup create or delete + Backup service is stable. On the other hand, as backup create or delete request amount might goes high when staffeln processed with large amount of volume backup. It’s possible API request is not well processed or the request order is mixed. For delete backup, Staffeln might not be able to delete a backup right away if any process failed (like full backup delete request sent to Cinder, but it’s depends incremental backup delete request still not), but will keep that backup resource in Staffeln, and try to delete it again in - later periodic job. Avoid unnecessary frequent of backup/retention interval + later periodic job. Avoid unnecessary frequent of backup/retention interval will help to maintain the overall performance of Cinder. Make sure the metadata key that config through `backup_metadata_key` and @@ -194,7 +196,7 @@ Staffeln by default uses regular openstack authentication methods. File `/etc/staffeln/openrc` is usually the authentication file. Staffeln heavily depends on authentication. Make sure the authentication method you provide contains the following authorization in OpenStack: -* token authentication get user id set authentication project get project list +* token authentication get user ID set authentication project get project list * get server list get volume get backup create backup create barbican secret * (this might required for backup create) delete backup delete barbican secret * (this might required for backup delete) get backup quota get volume quota get @@ -216,10 +218,9 @@ with no action achieved. ## Commands List of available commands: staffeln-conductor: trigger major staffeln backup -service. staffeln-api: trigger staffeln api service staffeln-db-manage +service. staffeln-api: trigger staffeln API service staffeln-db-manage create_schema staffeln-db-manage upgrade head - ## Simple verify After Staffeln well installed. First thing is to check Staffeln service logs to @@ -227,21 +228,30 @@ see it’s well running. 
First we need is something to backup on: In test scenario we will use cirros or
any smaller image to observe behavor Prepare your test OpenStack environment
-with following steps: Make sure cinder backup service is running Make sure your
-openrc under `/etc/staffeln/staffeln.conf` provide required authorization shows
-in `Authentication` section openstack volume create --size 1 --image {IMAGE_ID}
-test-volume openstack server create --flavor {FLAVOR_ID} --volume {VOLUME_ID}
---property __staffeln_backup=true --property __staffeln_retention=20min
---network {NETWROK_ID} staffeln-test openstack volume create --size 1 --image
-{IMAGE_ID} test-volume-no-retention openstack server create --flavor
-{FLAVOR_ID} --volume {VOLUME_ID} --property __staffeln_backup=true --network
-{NETWROK_ID} staffeln-test-no-retention Now you can watch the result with
+with the following steps:
+
+Make sure the cinder backup service is running.
+Make sure your openrc (usually `/etc/staffeln/openrc`)
+provides the required authorization shown in the `Authentication` section.
+Run
+
+```
+openstack volume create --size 1 --image {IMAGE_ID} test-volume
+openstack server create --flavor {FLAVOR_ID} --volume {VOLUME_ID} \
+--property __staffeln_backup=true --property __staffeln_retention=20min \
+--network {NETWORK_ID} staffeln-test
+openstack volume create --size 1 --image {IMAGE_ID} test-volume-no-retention
+openstack server create --flavor {FLAVOR_ID} --volume {VOLUME_ID} \
+--property __staffeln_backup=true --network {NETWORK_ID} staffeln-test-no-retention
+```
+
+Now you can watch the result with
`watch openstack volume backup list` to check and observe how backup going.

Staffeln majorly depends on how it’s configuration and authentication provides.
When testing, make sure reference configurations list above in each section.
And it required a service restart to make the configuration/authentication
-change works. If using systemd, you can use this example: `systemctl restart
+change works. If using systemd, you can use this example: `systemctl restart
staffeln-conductor staffeln-api`

And awared that if you have multiple nodes that running staffeln on, the backup
or retention check might goes a bit randomly some time, because it’s totally
depends on how the periodic period config in each node, and also depends on how
long the node been process previous cron job.

The email report testing is heavily depends on how your email system works.
Staffeln might behave differently or even raise error if your system didn’t
-support it’s current process. The email part can directly tests against gmail
+support it’s current process. The email part can directly tests against gmail
if you like. You can use application password for allow python sent email with
google’s smtp. To directly test email sending process. You can directly import
email from staffeln and use it as directly testing method.

-
-To verify the setting for staffeln-api. you can directly using api calls to
+To verify the setting for staffeln-api, you can directly use API calls to
check if backup check is properly running through `curl -X POST
Staffeln-api-url:8808/v1/backup?backup_id=BACKUP_ID` or `wget --method=POST
Staffeln-api-url:8808/v1/backup?backup_id=BACKUP_ID`.
From d9ca1955c00c00f4673193bb8b09073777a25ea1 Mon Sep 17 00:00:00 2001 From: ricolin Date: Wed, 15 Nov 2023 01:23:53 +0800 Subject: [PATCH 21/30] fix: add code block info --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 772009c..7850642 100755 --- a/README.md +++ b/README.md @@ -141,7 +141,7 @@ Staffeln API service allows we defined cinder policy check and make sure all Cinder volume backups are deleted only when that backup is not makaged by Staffeln. Once staffeln API service is up. You can define similar policy as following to `/etc/cinder/policy.yaml`: -``` +```yaml "backup:delete" : "rule:admin_api or (project_id:%(project_id)s and http://Staffeln-api-url:8808/v1/backup?backup_id=%(id)s)" ``` @@ -235,7 +235,7 @@ Make sure your openrc under `/etc/staffeln/staffeln.conf` provide required authorization shows in `Authentication` section. Run -``` +```shell openstack volume create --size 1 --image {IMAGE_ID} test-volume openstack server create --flavor {FLAVOR_ID} --volume {VOLUME_ID} \ --property __staffeln_backup=true --property __staffeln_retention=20min \ From aa29d00dd33ef15255df1825c318aa87bbb44058 Mon Sep 17 00:00:00 2001 From: Michiel Piscaer Date: Wed, 15 Nov 2023 11:59:04 +0100 Subject: [PATCH 22/30] use backup_cycle_timeout as value --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 7850642..9f0cfc4 100755 --- a/README.md +++ b/README.md @@ -47,7 +47,7 @@ perform. If `full_backup_depth` is set to 1. For each Full backup will follow by only one incremental backup(not counting ). And 2 incremental if `full_backup_depth` set to 2. Set to `0` if want all full backups. -To avoid long stucking backup action, config `backup_cycle_timout` should be +To avoid long stucking backup action, config `backup_cycle_timeout` should be set with a reasonable time that long enough for backups to complete but good enough to judge the backup process is stucking. When a backup process reach this timeout, it will remove the backup task and try to delete the volume From 86fe7b3124706815a446b6414a99ba9b575f5063 Mon Sep 17 00:00:00 2001 From: Michiel Piscaer Date: Wed, 15 Nov 2023 12:00:06 +0100 Subject: [PATCH 23/30] fix typo --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 9f0cfc4..2c8f8ee 100755 --- a/README.md +++ b/README.md @@ -56,6 +56,7 @@ set the create time to 10 years old so the remove progress will be observe and retry on next retention job. `backup_service_period` is no longer the only fector that reflect how long +`backup_service_period` is no longer the only factor that reflect how long volume should backup. It’s recommended to set `backup_min_interval` and `report_period`(see in Report part) and config a related shorter `backup_service_period`. For example if we set `backup_min_interval` to 3600 From d3f01ca7ef0ee90df14ca6fc189d5db7897cfbbd Mon Sep 17 00:00:00 2001 From: Michiel Piscaer Date: Wed, 15 Nov 2023 12:01:30 +0100 Subject: [PATCH 24/30] improve text --- README.md | 40 +++++++++++++++++++++------------------- 1 file changed, 21 insertions(+), 19 deletions(-) diff --git a/README.md b/README.md index 2c8f8ee..0a9918b 100755 --- a/README.md +++ b/README.md @@ -16,30 +16,33 @@ or the cli in self-service. Staffeln conductor manage all perodic tasks like backup, retention, and notification. It's possible to have multiple staffeln conductor services running. 
There will only be one service pulling volume and server information
-from OpenStack and schedule backups. All conductor on the other hand, will be
-able to take scheduled backup tasks and run backups and also check for backup
-to completed. For single volume, only one backup task will be generated, and
-only one of staffeln conductor service will be able to pick up that task that
-the same time. Same as retention tasks.
+from OpenStack and do the backup scheduling. All conductors will be
+able to take scheduled backup tasks and run backups. They will also check for
+backups to be completed. For a single volume, only one backup task will be
+generated, and only one of the staffeln conductor services will be able to pick up
+that task at the same time. Same as retention tasks.

### Backup

-Staffeln is a service to help perform backup. What it does is with provided
-authorization, Staffeln find a volume list with go through instance list from
-OpenStack, and find instances has `backup_metadata_key` (which configured under
-`[conductor]` section in `/etc/staffeln/staffeln.conf`) defined in metadata and
-got volume attached. Collect those attached volumes into a list. Follow by
-using that volume list to generate volume backup tasks in Staffeln. And do
-backups, check work-in-progress backups accordingly. With role control, there
+Staffeln is a service to help perform backups. With the provided
+authorization, Staffeln uses the cinder volume list from OpenStack and searches
+for instances that have `backup_metadata_key` defined in metadata and have volumes attached.
+> this is configured under the `[conductor]` section in `/etc/staffeln/staffeln.conf`
+
+From these attached volumes it will generate volume backup tasks in Staffeln.
+It checks work-in-progress backups accordingly. With role control, there
is only one Staffeln service that can perform volume collection and backup task
-schedules at the same time. But all services can do backup action, and check
-progress in parallel. Backup schedule trigger time is controlled by periodic
-jobs separately across all Staffeln nodes. It’s possible a following backup
-plan starts from a different node near previous success backup (with less than
+scheduling at the same time.
+
+But all services can do backup action, and check progress in parallel. Backup
+schedule trigger time is controlled by periodic jobs separately across
+all Staffeln nodes. It’s possible that a following backup
+plan starts from a different node near a previous succeeded backup (with less than
`backup_service_period` of time) in Staffeln V1, but it’s fixed with
-`backup_min_interval` config. And in either case, the Full and Incremental
+`backup_min_interval` config. And in either case, the Full and Incremental
backup order (config with `full_backup_depth`) is still be honored.
-`backup_min_interval` is value that you config for how many seconds you like as
+
+With `backup_min_interval` you can configure how many seconds you want as the
minimum interval between backups for same volume from Staffeln. The
configuration `full_backup_depth` under `[conductor]` section in
`/etc/staffeln/staffeln.conf` will decide how incremental backups are going to
perform. If `full_backup_depth` is set to 1. For each Full backup will follow
by only one incremental backup(not counting ). And 2 incremental if
`full_backup_depth` set to 2. Set to `0` if want all full backups.

To avoid long stucking backup action, config `backup_cycle_timeout` should be
set with a reasonable time that long enough for backups to complete but good
enough to judge the backup process is stucking. When a backup process reach
this timeout, it will remove the backup task and try to delete the volume
backup. A followup backup object (marked as not completed) will be create and
set the create time to 10 years old so the remove progress will be observe and
retry on next retention job.
-`backup_service_period` is no longer the only fector that reflect how long `backup_service_period` is no longer the only factor that reflect how long volume should backup. It’s recommended to set `backup_min_interval` and `report_period`(see in Report part) and config a related shorter From 22d4c702bafd2035b1569703cdc8f483c38cc1d4 Mon Sep 17 00:00:00 2001 From: Michiel Piscaer Date: Wed, 15 Nov 2023 12:16:35 +0100 Subject: [PATCH 25/30] Write Staffeln with a capital, it's a name. --- README.md | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 0a9918b..e59454b 100755 --- a/README.md +++ b/README.md @@ -14,12 +14,12 @@ or the cli in self-service. ## Staffeln Conductor Functions Staffeln conductor manage all perodic tasks like backup, retention, and -notification. It's possible to have multiple staffeln conductor services +notification. It's possible to have multiple Staffeln conductor services running. There will only be one service pulling volume and server information from OpenStack and do the backup scheduling. All conductors, will be able to take scheduled backup tasks and run backups. They will also check for backup to be completed. For single a volume, only one backup task will be -generated, and only one of the staffeln conductor service will be able to pick up +generated, and only one of the Staffeln conductor service will be able to pick up that task at the same time. Same as retention tasks. ### Backup @@ -175,7 +175,7 @@ button is disabled on the cinder-backup panel from horizon. Staffeln heavily depends on Cinder backup. So need to make sure that Cinder Backup service is stable. On the other hand, as backup create or delete - request amount might goes high when staffeln processed with large amount of + request amount might goes high when Staffeln processed with large amount of volume backup. It’s possible API request is not well processed or the request order is mixed. For delete backup, Staffeln might not be able to delete a backup right away if any process failed (like full backup delete request sent @@ -207,21 +207,21 @@ contains the following authorization in OpenStack: Notice all authorization required by above operation in OpenStack services might need to be also granted to login user. It’s possible to switch -authentication when restarting staffeln service, but which work might lead to +authentication when restarting Staffeln service, but which work might lead to unstoppable backup failure (with unauthorized warning) that will not block services to run. You can resolve the warning by manually deleting the backup. -Note: Don’t use different authorizations for multiple staffeln services across +Note: Don’t use different authorizations for multiple Staffeln services across nodes. That will be chances lead to unexpected behavior like all other -OpenStack services. For example, staffeln on one node is done with backup -schedule plan and staffeln on another node picks it up and proceeds with it. +OpenStack services. For example, Staffeln on one node is done with backup +schedule plan and Staffeln on another node picks it up and proceeds with it. That might follow with Create failed from Cinder and lead to warning log pop-up with no action achieved. ## Commands -List of available commands: staffeln-conductor: trigger major staffeln backup -service. staffeln-api: trigger staffeln API service staffeln-db-manage +List of available commands: staffeln-conductor: trigger major Staffeln backup +service. 
staffeln-api: trigger Staffeln API service staffeln-db-manage
create_schema staffeln-db-manage upgrade head

## Simple verify
@@ -257,7 +257,7 @@ change works. If using systemd, you can use this example: `systemctl restart
staffeln-conductor staffeln-api`

-And awared that if you have multiple nodes that running staffeln on, the backup
+And be aware that if you have multiple nodes running Staffeln, the backup
or retention check might goes a bit randomly some time, because it’s totally
depends on how the periodic period config in each node, and also depends on how
long the node been process previous cron job.
@@ -267,11 +267,11 @@ Staffeln might behave differently or even raise error if your system didn’t
support it’s current process. The email part can directly tests against gmail
if you like. You can use application password for allow python sent email with
google’s smtp. To directly test email sending process. You can directly import
-email from staffeln and use it as directly testing method.
+email from Staffeln and use it directly as a testing method.

To verify the setting for staffeln-api, you can directly use API calls to
check if backup check is properly running through `curl -X POST
Staffeln-api-url:8808/v1/backup?backup_id=BACKUP_ID` or `wget --method=POST
Staffeln-api-url:8808/v1/backup?backup_id=BACKUP_ID`.

-It should return TRUE when BACKUP_ID is not exist in staffeln, else FALSE.
+It should return TRUE when BACKUP_ID does not exist in Staffeln, else FALSE.

From 112af07a2598c2376aa274988b586b5e42248371 Mon Sep 17 00:00:00 2001
From: Michiel Piscaer
Date: Wed, 15 Nov 2023 12:17:11 +0100
Subject: [PATCH 26/30] fix typo.

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index e59454b..0591f48 100755
--- a/README.md
+++ b/README.md
@@ -116,7 +116,7 @@ from above, and make the report goes to logs directly. If you have specific
email addresses you wish to send to instead of using project name. You can
provide `receiver` config so it will send all project report to receiver list
instead. And if neither `recveiver` or `project_receiver_domain` are set, the
-project report will try to grap project member list and gather user emails to
+project report will try to grab the project member list and gather user emails to
send report to. If no user email can be found from project member, Staffeln
will ignore this report cycle and retry the next cycle. Notice that, to
improve Staffeln performance and to reduce old backup result exist in Staffeln
DB, properly config email is recommended.

From 8df2d188d78a21b226d41cc2e03ad8ada7c87f56 Mon Sep 17 00:00:00 2001
From: Michiel Piscaer
Date: Wed, 15 Nov 2023 12:17:42 +0100
Subject: [PATCH 27/30] improve text

---
 README.md | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index 0591f48..6b2d93d 100755
--- a/README.md
+++ b/README.md
@@ -71,9 +71,10 @@ backup for same volume created for more than 1 hours ago.
On retention, backups which has creation time longer than retention time
(defined by `retention_time` from `/etc/staffeln/staffeln.conf` or
`retention_metadata_key` which added to metadata of instances) will put in list
-and try to delete by Staffeln. Note: the actual key-value of
-`retention_metadata_key` is customizable. Like in test doc, you can see
-following property been added to instance
+and try to delete by Staffeln.
+> The actual key used as `retention_metadata_key` is customizable.
+
+Like in the test doc, you can see the following property added to an instance
`--property __staffeln_retention=20min`.
Customized `retention_metadata_key` has larger
priority than `retention_time`. If no `retention_metadata_key` defined for
@@ -99,9 +100,9 @@ backups should be delete. So no need to set it to a too long period of time.

### Report

-Report process is part of backup cron job. When one of Staffeln service got
-backup schedule role and finish with backup schedule, trigger, and check work
-in progress backup are done in this period. It will check if any successed or
+The report process is part of the backup cron job. When one of the Staffeln services has
+the backup schedule role and the backup schedule, trigger, and check of work
+in progress backups are done for this period, it will check if any succeeded or
failed backup task has not been reported for `report_period` seconds after it
created. It will trigger the report process. `report_period` is defined under
`[conductor]` with unit to seconds. Report will generate an HTML format of

From 4d3d3318904e25e30611723ef61f6c5a85d80e97 Mon Sep 17 00:00:00 2001
From: Michiel Piscaer
Date: Wed, 15 Nov 2023 12:23:23 +0100
Subject: [PATCH 28/30] improve text

---
 README.md | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index 6b2d93d..19641d8 100755
--- a/README.md
+++ b/README.md
@@ -141,8 +141,8 @@ follow by second project and so on.

### Staffeln-API

-Staffeln API service allows we defined cinder policy check and make sure all
-Cinder volume backups are deleted only when that backup is not makaged by
+The Staffeln API service allows a defined Cinder policy check to make sure all
+Cinder volume backups are deleted only when that backup is not managed by
Staffeln. Once staffeln API service is up. You can define similar policy as
following to `/etc/cinder/policy.yaml`:
```yaml
@@ -150,9 +150,9 @@ following to `/etc/cinder/policy.yaml`:
http://Staffeln-api-url:8808/v1/backup?backup_id=%(id)s)"
```

-And when backup not exist in staffeln, that API will return TRUE and make the
-policy allows the backup delete. Else will return False and only allow backup
-delete when it's admin in above case.
+And when the backup does not exist in Staffeln, the API will return TRUE and make the
+policy allow the backup delete; if the API returns False, then only an admin
+is allowed to delete it.

## Settings

From 30b1bdc420bb0e84484b8409350e1cc40a092fea Mon Sep 17 00:00:00 2001
From: Michiel Piscaer
Date: Wed, 15 Nov 2023 12:41:18 +0100
Subject: [PATCH 29/30] type in the help text for receiver

---
 staffeln/conf/notify.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/staffeln/conf/notify.py b/staffeln/conf/notify.py
index b5a2246..21c67e8 100644
--- a/staffeln/conf/notify.py
+++ b/staffeln/conf/notify.py
@@ -12,7 +12,7 @@
        "receiver",
        default=[],
        help=_(
-            "The receivers of the bakcup result by email."
+            "The receivers of the backup result by email."
            "A list of addresses to receive backup result emails to. A bare"
            " string will be treated as a list with 1 address."
        ),

From 053b0cb913a0b94cecc7e869e091d1b4baed7103 Mon Sep 17 00:00:00 2001
From: Michiel Piscaer
Date: Wed, 15 Nov 2023 15:32:42 +0100
Subject: [PATCH 30/30] Revert "use backup_cycle_timeout as value"

This reverts commit aa29d00dd33ef15255df1825c318aa87bbb44058.

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 19641d8..a561d26 100755
--- a/README.md
+++ b/README.md
@@ -50,7 +50,7 @@ perform.
If `full_backup_depth` is set to 1. For each Full backup will follow by only one incremental backup(not counting ). And 2 incremental if `full_backup_depth` set to 2. Set to `0` if want all full backups. -To avoid long stucking backup action, config `backup_cycle_timeout` should be +To avoid long stucking backup action, config `backup_cycle_timout` should be set with a reasonable time that long enough for backups to complete but good enough to judge the backup process is stucking. When a backup process reach this timeout, it will remove the backup task and try to delete the volume