Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Experimental standby cluster #317

Closed
wants to merge 8 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions actions.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,23 @@ list-backups:
description: Lists backups in s3 storage.
pre-upgrade-check:
description: Run necessary pre-upgrade checks and preparations before executing a charm refresh.
promote-standby-cluster:
description: Promotes the standby cluster of choice to a leader. Must be run against the charm unit leader of the standby cluster.
params:
force:
type: boolean
default: False
description: |
WARNING: this option set to True WILL WIPE OUT your current primary cluster!
If both this option and "force-really-really-mean-it" are set to true, then this unit will take over the primary role.
It only works in the case of cross-cluster replication, where both clusters are connected to each other in the async-primary.
force-really-really-mean-it:
type: boolean
default: False
description: |
WARNING: this option set to True WILL WIPE OUT your current primary cluster!
If both this option and "force" are set to true, then this unit will take over the primary role.
It only works in the case of cross-cluster replication, where both clusters are connected to each other in the async-primary.
restore:
description: Restore a database backup using pgBackRest.
S3 credentials are retrieved from a relation with the S3 integrator charm.
Expand Down
5 changes: 5 additions & 0 deletions metadata.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@ peers:
interface: upgrade

provides:
async-primary:
interface: async_replication
database:
interface: postgresql_client
db:
Expand All @@ -37,6 +39,9 @@ provides:
limit: 1

requires:
async-replica:
interface: async_replication
limit: 1
certificates:
interface: tls-certificates
limit: 1
Expand Down
1 change: 0 additions & 1 deletion poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 3 additions & 0 deletions src/charm.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@
USER,
USER_PASSWORD_KEY,
)
from relations.async_replication import PostgreSQLAsyncReplication
from relations.db import EXTENSIONS_BLOCKING_MESSAGE, DbProvides
from relations.postgresql_provider import PostgreSQLProvider
from upgrade import PostgreSQLUpgrade, get_postgresql_dependencies_model
Expand Down Expand Up @@ -182,6 +183,7 @@ def __init__(self, *args):
],
log_slots=[f"{POSTGRESQL_SNAP_NAME}:logs"],
)
self.async_manager = PostgreSQLAsyncReplication(self)

def patroni_scrape_config(self) -> List[Dict]:
"""Generates scrape config for the Patroni metrics endpoint."""
Expand Down Expand Up @@ -676,6 +678,7 @@ def _hosts(self) -> set:
def _patroni(self) -> Patroni:
"""Returns an instance of the Patroni object."""
return Patroni(
self,
self._unit_ip,
self.cluster_name,
self._member_name,
Expand Down
79 changes: 72 additions & 7 deletions src/cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,10 +47,22 @@
RUNNING_STATES = ["running", "streaming"]


class ClusterNotPromotedError(Exception):
    """Signals that a cluster was not promoted."""


class NotReadyError(Exception):
    """Signals that some cluster member is unhealthy or has not finished initial sync."""


class EndpointNotReadyError(Exception):
    """Signals that an endpoint is not ready."""


class StandbyClusterAlreadyPromotedError(Exception):
    """Signals that a standby cluster has already been promoted."""


class RemoveRaftMemberFailedError(Exception):
    """Signals that removing a raft member failed."""

Expand All @@ -68,6 +80,7 @@ class Patroni:

def __init__(
self,
charm,
unit_ip: str,
cluster_name: str,
member_name: str,
Expand All @@ -81,6 +94,7 @@ def __init__(
"""Initialize the Patroni class.

Args:
charm: PostgreSQL charm instance.
unit_ip: IP address of the current unit
cluster_name: name of the cluster
member_name: name of the member inside the cluster
Expand All @@ -91,6 +105,7 @@ def __init__(
rewind_password: password for the user used on rewinds
tls_enabled: whether TLS is enabled
"""
self.charm = charm
self.unit_ip = unit_ip
self.cluster_name = cluster_name
self.member_name = member_name
Expand Down Expand Up @@ -241,6 +256,32 @@ def get_primary(self, unit_name_pattern=False) -> str:
primary = "/".join(primary.rsplit("-", 1))
return primary

def get_standby_leader(self, unit_name_pattern=False) -> str:
    """Get standby leader instance.

    Queries the Patroni cluster status endpoint (retrying on failure, with the
    URL chosen per attempt by ``_get_alternative_patroni_url``) and looks for
    the member whose role is ``standby_leader``.

    Args:
        unit_name_pattern: whether to convert pod name to unit name
            (the last dash of the member name is replaced by a slash).

    Returns:
        standby leader pod or unit name, or None when no member of the
        cluster reports the ``standby_leader`` role.
    """
    # NOTE(review): when every attempt fails, the error presumably surfaces as
    # tenacity's RetryError (as caught elsewhere in this class) - confirm.
    # Request info from cluster endpoint (which returns all members of the cluster).
    for attempt in Retrying(stop=stop_after_attempt(2 * len(self.peers_ips) + 1)):
        with attempt:
            url = self._get_alternative_patroni_url(attempt)
            cluster_status = requests.get(
                f"{url}/{PATRONI_CLUSTER_STATUS_ENDPOINT}",
                verify=self.verify,
                timeout=API_REQUEST_TIMEOUT,
            )
            for member in cluster_status.json()["members"]:
                if member["role"] != "standby_leader":
                    continue
                standby_leader = member["name"]
                if unit_name_pattern:
                    # Change the last dash to / in order to match unit name pattern.
                    standby_leader = "/".join(standby_leader.rsplit("-", 1))
                return standby_leader
    # The request succeeded but no member holds the standby leader role:
    # make the "not found" result explicit instead of relying on fall-through.
    return None

def get_sync_standby_names(self) -> List[str]:
"""Get the list of sync standby unit names."""
sync_standbys = []
Expand Down Expand Up @@ -296,12 +337,12 @@ def are_all_members_ready(self) -> bool:
except RetryError:
return False

# Check if all members are running and one of them is a leader (primary),
# because sometimes there may exist (for some period of time) only
# replicas after a failed switchover.
# Check if all members are running and one of them is a leader (primary) or
# a standby leader, because sometimes there may exist (for some period of time)
# only replicas after a failed switchover.
return all(
member["state"] in RUNNING_STATES for member in cluster_status.json()["members"]
) and any(member["role"] == "leader" for member in cluster_status.json()["members"])
) and any(member["role"] in ["leader", "standby_leader"] for member in cluster_status.json()["members"])

def get_patroni_health(self) -> Dict[str, str]:
"""Gets, retires and parses the Patroni health endpoint."""
Expand Down Expand Up @@ -423,6 +464,19 @@ def is_member_isolated(self) -> bool:

return len(cluster_status.json()["members"]) == 0

def promote_standby_cluster(self) -> None:
    """Promote a standby cluster to be a regular cluster.

    Reads the Patroni configuration from the ``/config`` endpoint. When it
    contains a ``standby_cluster`` key, that key is patched to null and the
    method then polls ``get_primary()`` every 3 seconds, for up to 60 seconds,
    until it returns something other than None.

    Raises:
        StandbyClusterAlreadyPromotedError: if the configuration has no
            ``standby_cluster`` key.
        ClusterNotPromotedError: raised inside each polling attempt while
            ``get_primary()`` still returns None.
            NOTE(review): once the 60 seconds elapse this presumably reaches
            the caller wrapped in tenacity's RetryError - confirm.
    """
    config_url = f"{self._patroni_url}/config"
    # Always bound the HTTP calls: without a timeout an unresponsive Patroni
    # API would block this call (and the hook running it) forever.
    config_response = requests.get(
        config_url,
        verify=self.verify,
        timeout=API_REQUEST_TIMEOUT,
    )
    if "standby_cluster" not in config_response.json():
        raise StandbyClusterAlreadyPromotedError("standby cluster is already promoted")
    requests.patch(
        config_url,
        verify=self.verify,
        json={"standby_cluster": None},
        timeout=API_REQUEST_TIMEOUT,
    )
    # Wait for the promotion to take effect, i.e. for a primary to show up.
    for attempt in Retrying(stop=stop_after_delay(60), wait=wait_fixed(3)):
        with attempt:
            if self.get_primary() is None:
                raise ClusterNotPromotedError("cluster not promoted")

def render_file(self, path: str, content: str, mode: int) -> None:
"""Write a content rendered from a template to a file.

Expand Down Expand Up @@ -465,6 +519,9 @@ def render_patroni_yml_file(
# Open the template patroni.yml file.
with open("templates/patroni.yml.j2", "r") as file:
template = Template(file.read())

primary = self.charm.async_manager.get_primary_data()

# Render the template file with the correct values.
rendered = template.render(
conf_path=PATRONI_CONF_PATH,
Expand All @@ -480,8 +537,12 @@ def render_patroni_yml_file(
scope=self.cluster_name,
self_ip=self.unit_ip,
superuser=USER,
superuser_password=self.superuser_password,
replication_password=self.replication_password,
superuser_password=primary["superuser-password"]
if primary
else self.superuser_password,
replication_password=primary["replication-password"]
if primary
else self.replication_password,
rewind_user=REWIND_USER,
rewind_password=self.rewind_password,
enable_pgbackrest=stanza is not None,
Expand All @@ -492,8 +553,12 @@ def render_patroni_yml_file(
version=self.get_postgresql_version().split(".")[0],
minority_count=self.planned_units // 2,
pg_parameters=parameters,
standby_cluster_endpoint=primary["endpoint"] if primary else None,
extra_replication_endpoints={"{}/32".format(primary["endpoint"])}
if primary
else self.charm.async_manager.standby_endpoints(),
)
self.render_file(f"{PATRONI_CONF_PATH}/patroni.yaml", rendered, 0o600)
self.render_file(f"{PATRONI_CONF_PATH}/patroni.yaml", rendered, 0o644)

def start_patroni(self) -> bool:
"""Start Patroni service using snap.
Expand Down
Loading
Loading