Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Integrate runtime updates for etcd #71

Merged
merged 11 commits into from
Apr 22, 2024
28 changes: 24 additions & 4 deletions charms/worker/k8s/lib/charms/k8s/v0/k8sd_api_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -292,6 +292,24 @@ class UserFacingClusterConfig(BaseModel):
cloud_provider: str = Field(None, alias="cloud-provider")


class UserFacingDatastoreConfig(BaseModel, allow_population_by_field_name=True): # type: ignore[call-arg]
    """Aggregated configuration model for the user-facing datastore aspects of a cluster.

    Attributes:
        type: Type of the datastore. For runtime updates, this needs to be "external".
        servers: Server addresses of the external datastore.
        ca_crt: CA certificate of the external datastore cluster in PEM format.
        client_crt: client certificate of the external datastore cluster in PEM format.
        client_key: client key of the external datastore cluster in PEM format.
    """

    # All fields default to None so a partially-populated instance can be
    # serialized with dict(exclude_none=True) and omit unset keys; the
    # hyphenated aliases match the k8sd wire format (sent with by_alias=True).
    type: str = Field(None)
    servers: List[str] = Field(None)
    ca_crt: str = Field(None, alias="ca-crt")
    client_crt: str = Field(None, alias="client-crt")
    client_key: str = Field(None, alias="client-key")
bschimke95 marked this conversation as resolved.
Show resolved Hide resolved


class BootstrapConfig(BaseModel):
"""Configuration model for bootstrapping a Canonical K8s cluster.

Expand Down Expand Up @@ -341,21 +359,23 @@ class UpdateClusterConfigRequest(BaseModel):

Attributes:
config (Optional[UserFacingClusterConfig]): The cluster configuration.
datastore (Optional[UserFacingDatastoreConfig]): The cluster's datastore configuration.
"""

config: UserFacingClusterConfig
config: UserFacingClusterConfig = Field(None)
datastore: UserFacingDatastoreConfig = Field(None)
bschimke95 marked this conversation as resolved.
Show resolved Hide resolved


class DatastoreStatus(BaseModel):
"""Information regarding the active datastore.

Attributes:
datastore_type (str): external or k8s-dqlite datastore
external_url: (str): list of external_urls
servers: (List(str)): list of server addresses of the external datastore cluster.
"""

datastore_type: str = Field(None, alias="type")
external_url: str = Field(None, alias="external-url")
servers: List[str] = Field(None, alias="servers")


class ClusterStatus(BaseModel):
Expand Down Expand Up @@ -627,7 +647,7 @@ def update_cluster_config(self, config: UpdateClusterConfigRequest):
config (UpdateClusterConfigRequest): The cluster configuration.
"""
endpoint = "/1.0/k8sd/cluster/config"
body = config.dict(exclude_none=True)
body = config.dict(exclude_none=True, by_alias=True)
self._send_request(endpoint, "PUT", EmptyResponse, body)

def get_cluster_status(self) -> GetClusterStatusResponse:
Expand Down
32 changes: 32 additions & 0 deletions charms/worker/k8s/src/charm.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
UnixSocketConnectionFactory,
UpdateClusterConfigRequest,
UserFacingClusterConfig,
UserFacingDatastoreConfig,
)
from charms.kubernetes_libs.v0.etcd import EtcdReactiveRequires
from charms.node_base import LabelMaker
Expand Down Expand Up @@ -361,6 +362,36 @@ def _enable_functionalities(self):

self.api_manager.update_cluster_config(update_request)

@on_error(
    WaitingStatus("Ensure that the cluster configuration is up-to-date"),
    InvalidResponseError,
    K8sdConnectionError,
)
def _ensure_cluster_config(self):
    """Push the desired cluster configuration to k8sd.

    The snap diffs the submitted configuration itself and performs only the
    necessary steps, so the charm does not need to track changes locally.
    """
    status.add(ops.MaintenanceStatus("Ensure cluster config"))
    log.info("Ensure cluster-config")

    request = UpdateClusterConfigRequest()

    # TODO: Ensure other configs here as well.

    if self.config.get("datastore") == "etcd":
        # Assemble the external datastore settings from the etcd relation.
        credentials = self.etcd.get_client_credentials()
        servers = self.etcd.get_connection_string().split(",")
        request.datastore = UserFacingDatastoreConfig(
            type="external",
            servers=servers,
            ca_crt=credentials.get("client_ca", ""),
            client_crt=credentials.get("client_cert", ""),
            client_key=credentials.get("client_key", ""),
        )

    self.api_manager.update_cluster_config(request)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I continue to encounter this error when the third etcd unit tries to post its update:

charms.k8s.v0.k8sd_api_manager.InvalidResponseError: Error status 500
	method=PUT
	endpoint=/1.0/k8sd/cluster/config
	reason=Internal Server Error
	body={"type":"error","status":"","status_code":0,"operation":"","error_code":500,"error":"failed to reconcile network: failed to refresh network: failed to refresh network component: failed to upgrade component 'network': another operation (install/upgrade/rollback) is in progress","metadata":null}

Copy link
Contributor

@addyess addyess Apr 20, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Here are the k8sd and charm logs around the same moment of this failure
https://pastebin.canonical.com/p/njZJqx55yX/

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

api server logs around the same moment
https://pastebin.canonical.com/p/QzMpFCmQCV/

is it possible that adding extra etcd units in rapid succession caused the API server to restart when the first datastore came online, and when we published the second datastore URL -- the config wouldn't take?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hey Adam,
Thanks for the review. I think the issue you were facing was before canonical/k8s-snap#356 was merged.

I cannot reproduce this issue and the CI also seems to be happy.


def _get_scrape_jobs(self):
"""Retrieve the Prometheus Scrape Jobs.

Expand Down Expand Up @@ -474,6 +505,7 @@ def _reconcile(self, event):
self._create_cos_tokens()
self._apply_cos_requirements()
self._revoke_cluster_tokens()
self._ensure_cluster_config()
self._join_cluster()
self._configure_cos_integration()
self._update_status()
Expand Down
1 change: 1 addition & 0 deletions charms/worker/k8s/tests/unit/test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ def mock_reconciler_handlers(harness):
"_apply_cos_requirements",
"_copy_internal_kubeconfig",
"_revoke_cluster_tokens",
"_ensure_cluster_config",
"_expose_ports",
}

Expand Down
21 changes: 19 additions & 2 deletions charms/worker/k8s/tests/unit/test_k8sd_api_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
UnixSocketHTTPConnection,
UpdateClusterConfigRequest,
UserFacingClusterConfig,
UserFacingDatastoreConfig,
)


Expand Down Expand Up @@ -262,13 +263,29 @@ def test_update_cluster_config(self, mock_send_request):

dns_config = DNSConfig(enabled=True)
user_config = UserFacingClusterConfig(dns=dns_config)
request = UpdateClusterConfigRequest(config=user_config)
datastore = UserFacingDatastoreConfig(
type="external",
servers=["localhost:123"],
ca_crt="ca-crt",
bschimke95 marked this conversation as resolved.
Show resolved Hide resolved
client_crt="client-crt",
client_key="client-key",
)
request = UpdateClusterConfigRequest(config=user_config, datastore=datastore)
self.api_manager.update_cluster_config(request)
mock_send_request.assert_called_once_with(
"/1.0/k8sd/cluster/config",
"PUT",
EmptyResponse,
{"config": {"dns": {"enabled": True}}},
{
"config": {"dns": {"enabled": True}},
"datastore": {
"type": "external",
"servers": ["localhost:123"],
"ca-crt": "ca-crt",
"client-crt": "client-crt",
"client-key": "client-key",
},
},
)

@patch("lib.charms.k8s.v0.k8sd_api_manager.K8sdAPIManager._send_request")
Expand Down
24 changes: 23 additions & 1 deletion tests/integration/test_etcd.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,4 +40,26 @@ async def test_etcd_datastore(kubernetes_cluster: model.Model):
status = json.loads(result.results["stdout"])
assert status["ready"], "Cluster isn't ready"
assert status["datastore"]["type"] == "external", "Not bootstrapped against etcd"
assert status["datastore"]["external-url"] == f"https://{etcd.public_address}:{etcd_port}"
assert status["datastore"]["servers"] == [f"https://{etcd.public_address}:{etcd_port}"]


@pytest.mark.abort_on_fail
async def test_update_etcd_cluster(kubernetes_cluster: model.Model):
    """Verify that newly added etcd units are propagated to the k8s cluster."""
    k8s: unit.Unit = kubernetes_cluster.applications["k8s"].units[0]
    etcd = kubernetes_cluster.applications["etcd"]

    # Scale etcd by two units; the charm should publish the new server
    # addresses to k8sd on its next reconcile.
    await etcd.add_unit()
    await etcd.add_unit()

    expected_servers = {
        f"https://{member.public_address}:{member.safe_data['ports'][0]['number']}"
        for member in etcd.units
    }

    action = await k8s.run("k8s status --output-format json")
    outcome = await action.wait()
    status = json.loads(outcome.results["stdout"])
    assert status["ready"], "Cluster isn't ready"
    assert status["datastore"]["type"] == "external", "Not bootstrapped against etcd"
    assert set(status["datastore"]["servers"]) == expected_servers
Loading