Skip to content

Commit

Permalink
Merge branch 'main' into dpe-3064-reusable-workflow
Browse files Browse the repository at this point in the history
  • Loading branch information
dragomirp committed Nov 30, 2023
2 parents eaa5beb + 31ca568 commit e3f0fd0
Show file tree
Hide file tree
Showing 9 changed files with 304 additions and 44 deletions.
29 changes: 27 additions & 2 deletions src/charm.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
MaintenanceStatus,
Relation,
Unit,
UnknownStatus,
WaitingStatus,
)
from ops.pebble import ChangeError, Layer, PathError, ProtocolError, ServiceStatus
Expand Down Expand Up @@ -498,12 +499,14 @@ def enable_disable_extensions(self, database: str = None) -> None:
continue
extension = plugins_exception.get(extension, extension)
extensions[extension] = enable
self.unit.status = WaitingStatus("Updating extensions")
if not isinstance(original_status, UnknownStatus):
self.unit.status = WaitingStatus("Updating extensions")
try:
self.postgresql.enable_disable_extensions(extensions, database)
except PostgreSQLEnableDisableExtensionError as e:
logger.exception("failed to change plugins: %s", str(e))
self.unit.status = original_status
if not isinstance(original_status, UnknownStatus):
self.unit.status = original_status

def _add_members(self, event) -> None:
"""Add new cluster members.
Expand Down Expand Up @@ -603,6 +606,23 @@ def _on_leader_elected(self, event: LeaderElectedEvent) -> None:
self.set_secret(APP_SCOPE, MONITORING_PASSWORD_KEY, new_password())

self._cleanup_old_cluster_resources()
client = Client()
try:
endpoint = client.get(Endpoints, name=self.cluster_name, namespace=self._namespace)
if "leader" not in endpoint.metadata.annotations:
patch = {
"metadata": {
"annotations": {"leader": self._unit_name_to_pod_name(self._unit)}
}
}
client.patch(
Endpoints, name=self.cluster_name, namespace=self._namespace, obj=patch
)
self.app_peer_data.pop("cluster_initialised", None)
except ApiError as e:
# Ignore the error only when the resource doesn't exist.
if e.status.code != 404:
raise e

# Create resources and add labels needed for replication.
try:
Expand Down Expand Up @@ -931,6 +951,11 @@ def _on_get_primary(self, event: ActionEvent) -> None:
logger.error(f"failed to get primary with error {e}")

def _on_stop(self, _):
# Remove data from the drive when scaling down to zero to prevent
# the cluster from getting stuck when scaling back up.
if self.app.planned_units() == 0:
self.unit_peer_data.clear()

# Patch the services to remove them when the StatefulSet is deleted
# (i.e. application is removed).
try:
Expand Down
26 changes: 13 additions & 13 deletions src/grafana_dashboards/postgresql-metrics.json
Original file line number Diff line number Diff line change
Expand Up @@ -1673,7 +1673,7 @@
"current": true,
"max": true,
"min": false,
"rightSide": true,
"rightSide": false,
"show": true,
"sort": "current",
"sortDesc": true,
Expand Down Expand Up @@ -1772,7 +1772,7 @@
"current": true,
"max": false,
"min": false,
"rightSide": true,
"rightSide": false,
"show": true,
"total": true,
"values": true
Expand Down Expand Up @@ -1873,7 +1873,7 @@
"current": true,
"max": false,
"min": false,
"rightSide": true,
"rightSide": false,
"show": true,
"sideWidth": null,
"sort": "current",
Expand Down Expand Up @@ -1971,7 +1971,7 @@
"current": true,
"max": false,
"min": false,
"rightSide": true,
"rightSide": false,
"show": true,
"sort": "current",
"sortDesc": true,
Expand Down Expand Up @@ -2068,7 +2068,7 @@
"current": true,
"max": false,
"min": false,
"rightSide": true,
"rightSide": false,
"show": true,
"sort": "current",
"sortDesc": true,
Expand Down Expand Up @@ -2167,7 +2167,7 @@
"hideEmpty": false,
"max": false,
"min": false,
"rightSide": true,
"rightSide": false,
"show": true,
"sort": "current",
"sortDesc": true,
Expand Down Expand Up @@ -2265,7 +2265,7 @@
"current": true,
"max": false,
"min": false,
"rightSide": true,
"rightSide": false,
"show": true,
"sort": "total",
"sortDesc": true,
Expand Down Expand Up @@ -2362,7 +2362,7 @@
"current": true,
"max": true,
"min": false,
"rightSide": true,
"rightSide": false,
"show": true,
"sort": "current",
"sortDesc": false,
Expand Down Expand Up @@ -2459,7 +2459,7 @@
"current": true,
"max": false,
"min": false,
"rightSide": true,
"rightSide": false,
"show": true,
"sort": "current",
"sortDesc": true,
Expand Down Expand Up @@ -2556,7 +2556,7 @@
"current": true,
"max": false,
"min": false,
"rightSide": true,
"rightSide": false,
"show": true,
"total": false,
"values": true
Expand Down Expand Up @@ -2647,7 +2647,7 @@
"current": true,
"max": true,
"min": true,
"rightSide": true,
"rightSide": false,
"show": true,
"total": false,
"values": true
Expand Down Expand Up @@ -2766,7 +2766,7 @@
"current": true,
"max": false,
"min": false,
"rightSide": true,
"rightSide": false,
"show": true,
"total": true,
"values": true
Expand Down Expand Up @@ -2864,7 +2864,7 @@
"current": true,
"max": false,
"min": false,
"rightSide": true,
"rightSide": false,
"show": true,
"total": true,
"values": true
Expand Down
11 changes: 11 additions & 0 deletions src/relations/db.py
Original file line number Diff line number Diff line change
Expand Up @@ -244,6 +244,13 @@ def _on_relation_departed(self, event: RelationDepartedEvent) -> None:
event.defer()
return

# Set a flag to avoid deleting database users when this unit
# is removed and receives relation broken events from related applications.
# This is needed because of https://bugs.launchpad.net/juju/+bug/1979811.
if event.departing_unit == self.charm.unit:
self.charm._peers.data[self.charm.unit].update({"departing": "True"})
return

if not self.charm.unit.is_leader():
return

Expand Down Expand Up @@ -275,6 +282,10 @@ def _on_relation_broken(self, event: RelationBrokenEvent) -> None:
event.defer()
return

if "departing" in self.charm._peers.data[self.charm.unit]:
logger.debug("Early exit on_relation_broken: Skipping departing unit")
return

if not self.charm.unit.is_leader():
return

Expand Down
17 changes: 16 additions & 1 deletion src/relations/postgresql_provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
PostgreSQLDeleteUserError,
PostgreSQLGetPostgreSQLVersionError,
)
from ops.charm import CharmBase, RelationBrokenEvent
from ops.charm import CharmBase, RelationBrokenEvent, RelationDepartedEvent
from ops.framework import Object
from ops.model import ActiveStatus, BlockedStatus, Relation

Expand Down Expand Up @@ -45,6 +45,9 @@ def __init__(self, charm: CharmBase, relation_name: str = "database") -> None:
self.relation_name = relation_name

super().__init__(charm, self.relation_name)
self.framework.observe(
charm.on[self.relation_name].relation_departed, self._on_relation_departed
)
self.framework.observe(
charm.on[self.relation_name].relation_broken, self._on_relation_broken
)
Expand Down Expand Up @@ -128,6 +131,14 @@ def _on_database_requested(self, event: DatabaseRequestedEvent) -> None:
else f"Failed to initialize {self.relation_name} relation"
)

def _on_relation_departed(self, event: RelationDepartedEvent) -> None:
"""Set a flag to avoid deleting database users when not wanted."""
# Set a flag to avoid deleting database users when this unit
# is removed and receives relation broken events from related applications.
# This is needed because of https://bugs.launchpad.net/juju/+bug/1979811.
if event.departing_unit == self.charm.unit:
self.charm._peers.data[self.charm.unit].update({"departing": "True"})

def _on_relation_broken(self, event: RelationBrokenEvent) -> None:
"""Remove the user created for this relation."""
# Check for some conditions before trying to access the PostgreSQL instance.
Expand All @@ -144,6 +155,10 @@ def _on_relation_broken(self, event: RelationBrokenEvent) -> None:

self._update_unit_status(event.relation)

if "departing" in self.charm._peers.data[self.charm.unit]:
logger.debug("Early exit on_relation_broken: Skipping departing unit")
return

if not self.charm.unit.is_leader():
return

Expand Down
42 changes: 42 additions & 0 deletions tests/integration/ha_tests/test_self_healing.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
get_password,
get_unit_address,
run_command_on_unit,
scale_application,
)
from .helpers import (
are_all_db_processes_down,
Expand Down Expand Up @@ -394,3 +395,44 @@ async def test_network_cut(
), "Connection is not possible after network restore"

await is_cluster_updated(ops_test, primary_name)


@pytest.mark.group(1)
async def test_scaling_to_zero(ops_test: OpsTest, continuous_writes) -> None:
"""Scale the database to zero units and scale up again."""
# Locate primary unit.
app = await app_name(ops_test)

# Start an application that continuously writes data to the database.
await start_continuous_writes(ops_test, app)

# Scale the database to zero units.
logger.info("scaling database to zero units")
await scale_application(ops_test, app, 0)

# Scale the database to three units.
logger.info("scaling database to three units")
await scale_application(ops_test, app, 3)

# Verify all units are up and running.
logger.info("waiting for the database service to start in all units")
for unit in ops_test.model.applications[app].units:
assert await is_postgresql_ready(
ops_test, unit.name
), f"unit {unit.name} not restarted after cluster restart."

logger.info("checking whether writes are increasing")
await are_writes_increasing(ops_test)

# Verify that all units are part of the same cluster.
logger.info("checking whether all units are part of the same cluster")
member_ips = await fetch_cluster_members(ops_test)
ip_addresses = [
await get_unit_address(ops_test, unit.name)
for unit in ops_test.model.applications[app].units
]
assert set(member_ips) == set(ip_addresses), "not all units are part of the same cluster."

# Verify that no writes to the database were missed after stopping the writes.
logger.info("checking whether no writes to the database were missed after stopping the writes")
await check_writes(ops_test)
18 changes: 12 additions & 6 deletions tests/integration/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -638,12 +638,18 @@ async def scale_application(ops_test: OpsTest, application_name: str, scale: int
scale: The number of units to scale to
"""
await ops_test.model.applications[application_name].scale(scale)
await ops_test.model.wait_for_idle(
apps=[application_name],
status="active",
timeout=1000,
wait_for_exact_units=scale,
)
if scale == 0:
await ops_test.model.block_until(
lambda: len(ops_test.model.applications[DATABASE_APP_NAME].units) == scale,
timeout=1000,
)
else:
await ops_test.model.wait_for_idle(
apps=[application_name],
status="active",
timeout=1000,
wait_for_exact_units=scale,
)


async def set_password(
Expand Down
Loading

0 comments on commit e3f0fd0

Please sign in to comment.