From 24ff49fb374c9d2c7954a6a1250d26cfa0c9413e Mon Sep 17 00:00:00 2001 From: phvalguima Date: Tue, 13 Aug 2024 13:48:17 +0200 Subject: [PATCH] [DPE-4860][DPE-4982] `get_relation(PeerRelationName)` returns `None` if `remove-application` has been requested (#361) The main issue here is the `storage-detaching` after remove-application. That gets more complex if we consider the last unit going away. In this scenario, we cannot consider that relations will be available anymore, such as peer relation databag. For example, the `get_host_ip` uses the `charm.model.get_binding(peer_relation_name)`, which returns a valid object even if we have a single unit. The `unit_ip` is also called outside of the `helper_networking.py`, on `opensearch_distro`. However, all these methods have previous checks, i.e. we never call `unit_ip` for another unit that its own outside of `unit_ips`. Also, we skip acquiring the lock in this case, as the application is going away. ## The `get_relation` behavior ### Before `remove-application` with single unit Even if we have a single unit, we still have the peer relation databag available. The databag will be used by that single unit across the unit's lifecycle. We can see it with: ``` root@juju-941823-0:/var/lib/juju/agents/unit-opensearch-0/charm# nano src/charm.py root@juju-941823-0:/var/lib/juju/agents/unit-opensearch-0/charm# nano lib/charms/opensearch/v0/opensearch_base_charm.py root@juju-941823-0:/var/lib/juju/agents/unit-opensearch-0/charm# ./dispatch 2024-07-26 14:27:56,864 DEBUG ops 2.14.1 up and running. --Return-- > /var/lib/juju/agents/unit-opensearch-0/charm/lib/charms/opensearch/v0/opensearch_base_charm.py(250)__init__()->None -> import pdb; pdb.set_trace() (Pdb) p self.model.get_relation(PeerRelationName) ``` ### After `remove-application` After remove-application has been received, we can see that relation object will not be retrieved anymore: ``` root@juju-6884bf-2:/var/lib/juju/agents/unit-opensearch-1/charm# ./dispatch 2024-08-05 18:38:42,246 DEBUG ops 2.15.0 up and running. --Return-- > /var/lib/juju/agents/unit-opensearch-1/charm/lib/charms/opensearch/v0/opensearch_base_charm.py(250)__init_ _()->None -> import pdb; pdb.set_trace() (Pdb) p self.model.get_relation(PeerRelationName) None (Pdb) ``` Closes #360 Closes #378 Closes #379 --------- Co-authored-by: Mehdi Bendriss --- lib/charms/opensearch/v0/helper_networking.py | 2 + .../opensearch/v0/opensearch_base_charm.py | 10 +++-- .../opensearch/v0/opensearch_internal_data.py | 7 ++- tests/integration/test_charm.py | 43 +++++++++++++++++-- 4 files changed, 51 insertions(+), 11 deletions(-) diff --git a/lib/charms/opensearch/v0/helper_networking.py b/lib/charms/opensearch/v0/helper_networking.py index 5b1ed3ae4..8c144d5af 100644 --- a/lib/charms/opensearch/v0/helper_networking.py +++ b/lib/charms/opensearch/v0/helper_networking.py @@ -71,6 +71,8 @@ def unit_ip(charm: CharmBase, unit: Unit, peer_relation_name: str) -> str: def units_ips(charm: CharmBase, peer_relation_name: str) -> Dict[str, str]: """Returns the mapping "unit id / ip address" of all units.""" unit_ip_map = {} + if not charm.model.get_relation(peer_relation_name): + return unit_ip_map for unit in charm.model.get_relation(peer_relation_name).units: unit_id = unit.name.split("/")[1] diff --git a/lib/charms/opensearch/v0/opensearch_base_charm.py b/lib/charms/opensearch/v0/opensearch_base_charm.py index 8bb981141..385003814 100644 --- a/lib/charms/opensearch/v0/opensearch_base_charm.py +++ b/lib/charms/opensearch/v0/opensearch_base_charm.py @@ -472,7 +472,8 @@ def _on_opensearch_data_storage_detaching(self, _: StorageDetachingEvent): # no "Removing units during an upgrade is not supported. The charm may be in a broken, unrecoverable state" ) # acquire lock to ensure only 1 unit removed at a time - if not self.node_lock.acquired: + # Closes canonical/opensearch-operator#378 + if self.app.planned_units() > 1 and not self.node_lock.acquired: # Raise uncaught exception to prevent Juju from removing unit raise Exception("Unable to acquire lock: Another unit is starting or stopping.") @@ -486,7 +487,7 @@ def _on_opensearch_data_storage_detaching(self, _: StorageDetachingEvent): # no if node.name != self.unit_name ] self._compute_and_broadcast_updated_topology(remaining_nodes) - elif self.app.planned_units() == 0: + elif self.app.planned_units() == 0 and self.model.get_relation(PeerRelationName): self.peers_data.delete(Scope.APP, "bootstrap_contributors_count") self.peers_data.delete(Scope.APP, "nodes_config") @@ -512,8 +513,9 @@ def _on_opensearch_data_storage_detaching(self, _: StorageDetachingEvent): # no else: raise OpenSearchHAError(ClusterHealthUnknown) finally: - # release lock - self.node_lock.release() + if self.app.planned_units() > 1 and (self.opensearch.is_node_up() or self.alt_hosts): + # release lock + self.node_lock.release() def _on_update_status(self, event: UpdateStatusEvent): """On update status event. diff --git a/lib/charms/opensearch/v0/opensearch_internal_data.py b/lib/charms/opensearch/v0/opensearch_internal_data.py index b4befbe6f..4e060315e 100644 --- a/lib/charms/opensearch/v0/opensearch_internal_data.py +++ b/lib/charms/opensearch/v0/opensearch_internal_data.py @@ -91,7 +91,7 @@ def cast(str_val: str) -> Union[bool, int, float, str]: def put_or_delete(data: Dict[str, str], key: str, value: Optional[str]): """Put data into the key/val data store or delete if value is None.""" if value is None: - del data[key] + data.pop(key, None) return data.update({key: str(value)}) @@ -111,7 +111,6 @@ def put(self, scope: Scope, key: str, value: Optional[Union[any]]) -> None: raise ValueError("Scope undefined.") data = self._get_relation_data(scope) - self.put_or_delete(data, key, value) @override @@ -142,7 +141,7 @@ def has(self, scope: Scope, key: str): if scope is None: raise ValueError("Scope undefined.") - return key in self._get_relation_data(scope) + return key in (self._get_relation_data(scope) or {}) @override def get( @@ -189,7 +188,7 @@ def _get_relation_data(self, scope: Scope) -> Dict[str, str]: relation_scope = self._charm.app if scope == Scope.APP else self._charm.unit - return relation.data[relation_scope] + return relation.data.get(relation_scope) @staticmethod def _default_encoder(o: Any) -> Any: diff --git a/tests/integration/test_charm.py b/tests/integration/test_charm.py index 5e5a15231..c9c705cd2 100644 --- a/tests/integration/test_charm.py +++ b/tests/integration/test_charm.py @@ -38,7 +38,35 @@ @pytest.mark.runner(["self-hosted", "linux", "X64", "jammy", "large"]) @pytest.mark.group(1) @pytest.mark.abort_on_fail -@pytest.mark.skip_if_deployed +async def test_deploy_and_remove_single_unit(ops_test: OpsTest) -> None: + """Build and deploy OpenSearch with a single unit and remove it.""" + my_charm = await ops_test.build_charm(".") + await ops_test.model.set_config(MODEL_CONFIG) + + await ops_test.model.deploy( + my_charm, + num_units=1, + series=SERIES, + ) + # Deploy TLS Certificates operator. + config = {"ca-common-name": "CN_CA"} + await ops_test.model.deploy(TLS_CERTIFICATES_APP_NAME, channel="stable", config=config) + # Relate it to OpenSearch to set up TLS. + await ops_test.model.integrate(APP_NAME, TLS_CERTIFICATES_APP_NAME) + await wait_until( + ops_test, + apps=[APP_NAME], + apps_statuses=["active"], + units_statuses=["active"], + wait_for_exact_units=1, + ) + assert len(ops_test.model.applications[APP_NAME].units) == 1 + await ops_test.model.remove_application(APP_NAME, block_until_done=True) + await ops_test.model.remove_application(TLS_CERTIFICATES_APP_NAME, block_until_done=True) + + +@pytest.mark.group(1) +@pytest.mark.abort_on_fail async def test_build_and_deploy(ops_test: OpsTest) -> None: """Build and deploy a couple of OpenSearch units.""" my_charm = await ops_test.build_charm(".") @@ -63,8 +91,10 @@ async def test_build_and_deploy(ops_test: OpsTest) -> None: @pytest.mark.abort_on_fail async def test_actions_get_admin_password(ops_test: OpsTest) -> None: """Test the retrieval of admin secrets.""" + leader_id = await get_leader_unit_id(ops_test) + # 1. run the action prior to finishing the config of TLS - result = await run_action(ops_test, 0, "get-password") + result = await run_action(ops_test, leader_id, "get-password") assert result.status == "failed" # Deploy TLS Certificates operator. @@ -94,7 +124,7 @@ async def test_actions_get_admin_password(ops_test: OpsTest) -> None: assert http_resp_code == 200 # 3. test retrieving password from non-supported user - result = await run_action(ops_test, 0, "get-password", {"username": "non-existent"}) + result = await run_action(ops_test, leader_id, "get-password", {"username": "non-existent"}) assert result.status == "failed" @@ -290,3 +320,10 @@ async def test_all_units_have_internal_users_synced(ops_test: OpsTest) -> None: for unit in ops_test.model.applications[APP_NAME].units: unit_conf = get_conf_as_dict(ops_test, unit.name, filename) assert leader_conf == unit_conf + + +@pytest.mark.group(1) +@pytest.mark.abort_on_fail +async def test_remove_application(ops_test: OpsTest) -> None: + """Removes the application with two units.""" + await ops_test.model.remove_application(APP_NAME, block_until_done=True)