Skip to content

Commit

Permalink
[DPE-4860][DPE-4982] get_relation(PeerRelationName) returns None
Browse files Browse the repository at this point in the history
…if `remove-application` has been requested (#361)

The main issue here is the `storage-detaching` after remove-application.
That gets more complex if we consider the last unit going away. In this
scenario, we cannot consider that relations will be available anymore,
such as peer relation databag.

For example, the `get_host_ip` uses the
`charm.model.get_binding(peer_relation_name)`, which returns a valid
object even if we have a single unit.

The `unit_ip` is also called outside of the `helper_networking.py`, on
`opensearch_distro`. However, all these methods have previous checks,
i.e. we never call `unit_ip` for another unit that its own outside of
`unit_ips`.

Also, we skip acquiring the lock in this case, as the application is
going away.

## The `get_relation` behavior

### Before `remove-application` with single unit

Even if we have a single unit, we still have the peer relation databag
available. The databag will be used by that single unit across the
unit's lifecycle. We can see it with:
```
root@juju-941823-0:/var/lib/juju/agents/unit-opensearch-0/charm# nano src/charm.py 
root@juju-941823-0:/var/lib/juju/agents/unit-opensearch-0/charm# nano lib/charms/opensearch/v0/opensearch_base_charm.py 
root@juju-941823-0:/var/lib/juju/agents/unit-opensearch-0/charm# ./dispatch 
2024-07-26 14:27:56,864 DEBUG    ops 2.14.1 up and running.
--Return--
> /var/lib/juju/agents/unit-opensearch-0/charm/lib/charms/opensearch/v0/opensearch_base_charm.py(250)__init__()->None
-> import pdb; pdb.set_trace()
(Pdb) p self.model.get_relation(PeerRelationName)
<ops.model.Relation opensearch-peers:0>
```

### After `remove-application`

After remove-application has been received, we can see that relation
object will not be retrieved anymore:
```
root@juju-6884bf-2:/var/lib/juju/agents/unit-opensearch-1/charm# ./dispatch                                 
2024-08-05 18:38:42,246 DEBUG    ops 2.15.0 up and running.                                                 
--Return--                                                                                                  
> /var/lib/juju/agents/unit-opensearch-1/charm/lib/charms/opensearch/v0/opensearch_base_charm.py(250)__init_
_()->None                                                                                                   
-> import pdb; pdb.set_trace()                                                                              
(Pdb) p self.model.get_relation(PeerRelationName)                                                           
None                                                                                                        
(Pdb)                                                                                                       
```

Closes #360
Closes #378
Closes #379

---------

Co-authored-by: Mehdi Bendriss <[email protected]>
  • Loading branch information
phvalguima and Mehdi-Bendriss authored Aug 13, 2024
1 parent 088db63 commit 24ff49f
Show file tree
Hide file tree
Showing 4 changed files with 51 additions and 11 deletions.
2 changes: 2 additions & 0 deletions lib/charms/opensearch/v0/helper_networking.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,8 @@ def unit_ip(charm: CharmBase, unit: Unit, peer_relation_name: str) -> str:
def units_ips(charm: CharmBase, peer_relation_name: str) -> Dict[str, str]:
"""Returns the mapping "unit id / ip address" of all units."""
unit_ip_map = {}
if not charm.model.get_relation(peer_relation_name):
return unit_ip_map

for unit in charm.model.get_relation(peer_relation_name).units:
unit_id = unit.name.split("/")[1]
Expand Down
10 changes: 6 additions & 4 deletions lib/charms/opensearch/v0/opensearch_base_charm.py
Original file line number Diff line number Diff line change
Expand Up @@ -472,7 +472,8 @@ def _on_opensearch_data_storage_detaching(self, _: StorageDetachingEvent): # no
"Removing units during an upgrade is not supported. The charm may be in a broken, unrecoverable state"
)
# acquire lock to ensure only 1 unit removed at a time
if not self.node_lock.acquired:
# Closes canonical/opensearch-operator#378
if self.app.planned_units() > 1 and not self.node_lock.acquired:
# Raise uncaught exception to prevent Juju from removing unit
raise Exception("Unable to acquire lock: Another unit is starting or stopping.")

Expand All @@ -486,7 +487,7 @@ def _on_opensearch_data_storage_detaching(self, _: StorageDetachingEvent): # no
if node.name != self.unit_name
]
self._compute_and_broadcast_updated_topology(remaining_nodes)
elif self.app.planned_units() == 0:
elif self.app.planned_units() == 0 and self.model.get_relation(PeerRelationName):
self.peers_data.delete(Scope.APP, "bootstrap_contributors_count")
self.peers_data.delete(Scope.APP, "nodes_config")

Expand All @@ -512,8 +513,9 @@ def _on_opensearch_data_storage_detaching(self, _: StorageDetachingEvent): # no
else:
raise OpenSearchHAError(ClusterHealthUnknown)
finally:
# release lock
self.node_lock.release()
if self.app.planned_units() > 1 and (self.opensearch.is_node_up() or self.alt_hosts):
# release lock
self.node_lock.release()

def _on_update_status(self, event: UpdateStatusEvent):
"""On update status event.
Expand Down
7 changes: 3 additions & 4 deletions lib/charms/opensearch/v0/opensearch_internal_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ def cast(str_val: str) -> Union[bool, int, float, str]:
def put_or_delete(data: Dict[str, str], key: str, value: Optional[str]):
"""Put data into the key/val data store or delete if value is None."""
if value is None:
del data[key]
data.pop(key, None)
return

data.update({key: str(value)})
Expand All @@ -111,7 +111,6 @@ def put(self, scope: Scope, key: str, value: Optional[Union[any]]) -> None:
raise ValueError("Scope undefined.")

data = self._get_relation_data(scope)

self.put_or_delete(data, key, value)

@override
Expand Down Expand Up @@ -142,7 +141,7 @@ def has(self, scope: Scope, key: str):
if scope is None:
raise ValueError("Scope undefined.")

return key in self._get_relation_data(scope)
return key in (self._get_relation_data(scope) or {})

@override
def get(
Expand Down Expand Up @@ -189,7 +188,7 @@ def _get_relation_data(self, scope: Scope) -> Dict[str, str]:

relation_scope = self._charm.app if scope == Scope.APP else self._charm.unit

return relation.data[relation_scope]
return relation.data.get(relation_scope)

@staticmethod
def _default_encoder(o: Any) -> Any:
Expand Down
43 changes: 40 additions & 3 deletions tests/integration/test_charm.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,35 @@
@pytest.mark.runner(["self-hosted", "linux", "X64", "jammy", "large"])
@pytest.mark.group(1)
@pytest.mark.abort_on_fail
@pytest.mark.skip_if_deployed
async def test_deploy_and_remove_single_unit(ops_test: OpsTest) -> None:
"""Build and deploy OpenSearch with a single unit and remove it."""
my_charm = await ops_test.build_charm(".")
await ops_test.model.set_config(MODEL_CONFIG)

await ops_test.model.deploy(
my_charm,
num_units=1,
series=SERIES,
)
# Deploy TLS Certificates operator.
config = {"ca-common-name": "CN_CA"}
await ops_test.model.deploy(TLS_CERTIFICATES_APP_NAME, channel="stable", config=config)
# Relate it to OpenSearch to set up TLS.
await ops_test.model.integrate(APP_NAME, TLS_CERTIFICATES_APP_NAME)
await wait_until(
ops_test,
apps=[APP_NAME],
apps_statuses=["active"],
units_statuses=["active"],
wait_for_exact_units=1,
)
assert len(ops_test.model.applications[APP_NAME].units) == 1
await ops_test.model.remove_application(APP_NAME, block_until_done=True)
await ops_test.model.remove_application(TLS_CERTIFICATES_APP_NAME, block_until_done=True)


@pytest.mark.group(1)
@pytest.mark.abort_on_fail
async def test_build_and_deploy(ops_test: OpsTest) -> None:
"""Build and deploy a couple of OpenSearch units."""
my_charm = await ops_test.build_charm(".")
Expand All @@ -63,8 +91,10 @@ async def test_build_and_deploy(ops_test: OpsTest) -> None:
@pytest.mark.abort_on_fail
async def test_actions_get_admin_password(ops_test: OpsTest) -> None:
"""Test the retrieval of admin secrets."""
leader_id = await get_leader_unit_id(ops_test)

# 1. run the action prior to finishing the config of TLS
result = await run_action(ops_test, 0, "get-password")
result = await run_action(ops_test, leader_id, "get-password")
assert result.status == "failed"

# Deploy TLS Certificates operator.
Expand Down Expand Up @@ -94,7 +124,7 @@ async def test_actions_get_admin_password(ops_test: OpsTest) -> None:
assert http_resp_code == 200

# 3. test retrieving password from non-supported user
result = await run_action(ops_test, 0, "get-password", {"username": "non-existent"})
result = await run_action(ops_test, leader_id, "get-password", {"username": "non-existent"})
assert result.status == "failed"


Expand Down Expand Up @@ -290,3 +320,10 @@ async def test_all_units_have_internal_users_synced(ops_test: OpsTest) -> None:
for unit in ops_test.model.applications[APP_NAME].units:
unit_conf = get_conf_as_dict(ops_test, unit.name, filename)
assert leader_conf == unit_conf


@pytest.mark.group(1)
@pytest.mark.abort_on_fail
async def test_remove_application(ops_test: OpsTest) -> None:
"""Removes the application with two units."""
await ops_test.model.remove_application(APP_NAME, block_until_done=True)

0 comments on commit 24ff49f

Please sign in to comment.