From 2e492f86d46a92bf9e540bfab692770c1c97053d Mon Sep 17 00:00:00 2001 From: Lucas Gameiro Date: Thu, 7 Nov 2024 11:27:51 -0300 Subject: [PATCH 1/8] add tls and tls-ca fields to databag (#666) --- src/charm.py | 4 ++++ src/relations/postgresql_provider.py | 23 +++++++++++++++++++++++ tests/unit/test_postgresql_provider.py | 3 +++ 3 files changed, 30 insertions(+) diff --git a/src/charm.py b/src/charm.py index b228e15ad0..16390b2385 100755 --- a/src/charm.py +++ b/src/charm.py @@ -1709,6 +1709,9 @@ def update_config(self, is_creating_backup: bool = False) -> bool: # in a bundle together with the TLS certificates operator. This flag is used to # know when to call the Patroni API using HTTP or HTTPS. self.unit_peer_data.update({"tls": "enabled" if enable_tls else ""}) + self.postgresql_client_relation.update_tls_flag( + "True" if self.is_tls_enabled else "False" + ) logger.debug("Early exit update_config: Workload not started yet") return True @@ -1784,6 +1787,7 @@ def _handle_postgresql_restart_need(self, enable_tls: bool) -> None: # Ignore the error, as it happens only to indicate that the configuration has not changed. pass self.unit_peer_data.update({"tls": "enabled" if enable_tls else ""}) + self.postgresql_client_relation.update_tls_flag("True" if self.is_tls_enabled else "False") # Restart PostgreSQL if TLS configuration has changed # (so the both old and new connections use the configuration). 
diff --git a/src/relations/postgresql_provider.py b/src/relations/postgresql_provider.py index 0ad7198368..3b10f801c8 100644 --- a/src/relations/postgresql_provider.py +++ b/src/relations/postgresql_provider.py @@ -108,6 +108,17 @@ def _on_database_requested(self, event: DatabaseRequestedEvent) -> None: # Set the database name self.database_provides.set_database(event.relation.id, database) + # Set TLS flag + self.database_provides.set_tls( + event.relation.id, + "True" if self.charm.is_tls_enabled else "False", + ) + + # Set TLS CA + if self.charm.is_tls_enabled: + _, ca, _ = self.charm.tls.get_tls_files() + self.database_provides.set_tls_ca(event.relation.id, ca) + # Update the read/write and read-only endpoints. self.update_endpoints(event) @@ -215,6 +226,18 @@ def update_endpoints(self, event: DatabaseRequestedEvent = None) -> None: f"postgresql://{user}:{password}@{self.charm.primary_endpoint}:{DATABASE_PORT}/{database}", ) + def update_tls_flag(self, tls: str) -> None: + """Update TLS flag and CA in relation databag.""" + relations = self.model.relations[self.relation_name] + if tls == "True": + _, ca, _ = self.charm.tls.get_tls_files() + else: + ca = "" + + for relation in relations: + self.database_provides.set_tls(relation.id, tls) + self.database_provides.set_tls_ca(relation.id, ca) + def _check_multiple_endpoints(self) -> bool: """Checks if there are relations with other endpoints.""" relation_names = {relation.name for relation in self.charm.client_relations} diff --git a/tests/unit/test_postgresql_provider.py b/tests/unit/test_postgresql_provider.py index 8560be5f7a..0a77509742 100644 --- a/tests/unit/test_postgresql_provider.py +++ b/tests/unit/test_postgresql_provider.py @@ -142,6 +142,7 @@ def test_on_database_requested(harness): "password": "test-password", "version": POSTGRESQL_VERSION, "database": f"{DATABASE}", + "tls": "False", } # Assert no BlockedStatus was set. 
@@ -153,6 +154,7 @@ def test_on_database_requested(harness): # No data is set in the databag by the database. assert harness.get_relation_data(rel_id, harness.charm.app.name) == { "data": f'{{"database": "{DATABASE}", "extra-user-roles": "{EXTRA_USER_ROLES}"}}', + "tls": "False", } # BlockedStatus due to a PostgreSQLCreateDatabaseError. @@ -161,6 +163,7 @@ def test_on_database_requested(harness): # No data is set in the databag by the database. assert harness.get_relation_data(rel_id, harness.charm.app.name) == { "data": f'{{"database": "{DATABASE}", "extra-user-roles": "{EXTRA_USER_ROLES}"}}', + "tls": "False", } # BlockedStatus due to a PostgreSQLGetPostgreSQLVersionError. From 1acd0eef8ed1a960e250a76024ef723663f8ca04 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Mon, 11 Nov 2024 11:54:55 +0200 Subject: [PATCH 2/8] Sync docs from Discourse (#648) Co-authored-by: GitHub Actions <41898282+github-actions[bot]@users.noreply.github.com> --- docs/how-to/h-configure-s3-radosgw.md | 9 +- docs/how-to/h-create-backup.md | 10 +- docs/how-to/h-deploy-azure.md | 347 ++++++++++++++++++ docs/how-to/h-deploy-multi-az.md | 188 ++++++++++ docs/how-to/h-enable-alert-rules.md | 5 +- docs/how-to/h-integrate.md | 2 +- docs/how-to/h-restore-backup.md | 4 +- docs/how-to/h-rollback-minor.md | 19 +- docs/how-to/h-scale.md | 4 +- docs/how-to/h-upgrade-minor.md | 13 +- docs/overview.md | 13 +- docs/reference/r-alert-rules.md | 54 +++ .../e-statuses.md => reference/r-statuses.md} | 5 +- 13 files changed, 637 insertions(+), 36 deletions(-) create mode 100644 docs/how-to/h-deploy-azure.md create mode 100644 docs/how-to/h-deploy-multi-az.md create mode 100644 docs/reference/r-alert-rules.md rename docs/{explanation/e-statuses.md => reference/r-statuses.md} (84%) diff --git a/docs/how-to/h-configure-s3-radosgw.md b/docs/how-to/h-configure-s3-radosgw.md index d13a40d37b..d07d4b34ba 100644 --- 
a/docs/how-to/h-configure-s3-radosgw.md +++ b/docs/how-to/h-configure-s3-radosgw.md @@ -6,9 +6,14 @@ If you are using an earlier version, check the [Juju 3.0 Release Notes](https:// # Configure S3 for RadosGW -A Charmed PostgreSQL backup can be stored on any S3-compatible storage. S3 access and configurations are managed with the [s3-integrator charm](https://charmhub.io/s3-integrator). +A PostgreSQL backup can be stored on any S3-compatible storage. S3 access and configurations are managed with the [s3-integrator charm](https://charmhub.io/s3-integrator). -This guide will teach you how to deploy and configure the s3-integrator charm on Ceph via [RadosGW](https://docs.ceph.com/en/quincy/man/8/radosgw/), send the configuration to a Charmed PostgreSQL application, and update it. (To configure S3 for AWS, see [this guide](/t/9681)) +This guide will teach you how to deploy and configure the s3-integrator charm on Ceph via [RadosGW](https://docs.ceph.com/en/quincy/man/8/radosgw/), send the configuration to a Charmed PostgreSQL application, and update it. +> For AWS, see the guide [How to configure S3 for AWS](/t/9681) + +[note] +The Charmed PostgreSQL backup tool ([pgBackRest](https://pgbackrest.org/)) can currently only interact with S3-compatible storages if they work with [SSL/TLS](https://github.com/pgbackrest/pgbackrest/issues/2340) (backup via the plain HTTP is currently not supported). +[/note] ## Configure s3-integrator First, install the MinIO client and create a bucket: diff --git a/docs/how-to/h-create-backup.md b/docs/how-to/h-create-backup.md index 8d9325c1cf..de5a558953 100644 --- a/docs/how-to/h-create-backup.md +++ b/docs/how-to/h-create-backup.md @@ -9,9 +9,9 @@ If you are using an earlier version, check the [Juju 3.0 Release Notes](https:// This guide contains recommended steps and useful commands for creating and managing backups to ensure smooth restores. 
## Prerequisites -* A cluster with at [least three nodes](/t/charmed-postgresql-how-to-manage-units/9689?channel=14/stable) deployed +* A cluster with at [least three nodes](/t/9689?channel=14/stable) deployed * Access to S3 storage -* [Configured settings for S3 storage](/t/charmed-postgresql-how-to-configure-s3/9681?channel=14/stable) +* [Configured settings for S3 storage](/t/9681?channel=14/stable) ## Summary - [Save your current cluster credentials](#heading--save-credentials), as you'll need them for restoring @@ -38,7 +38,7 @@ Once Charmed PostgreSQL is `active` and `idle`, you can create your first backup ```shell juju run postgresql/leader create-backup ``` -By default, backups created with command above will be **full** backups: a copy of *all* your data will be stored in S3. There are 2 other supported types of backups (available in revision 416+, currently in channel `14/edge` only): +By default, backups created with the command above will be **full** backups: a copy of *all* your data will be stored in S3. There are 2 other supported types of backups (available in revision 416+, currently in channel `14/edge` only): * Differential: Only modified files since the last full backup will be stored. * Incremental: Only modified files since the last successful backup (of any type) will be stored. @@ -48,8 +48,8 @@ juju run postgresql/leader create-backup type={full|differential|incremental} ``` **Tip**: To avoid unnecessary service downtime, always use non-primary units for the action `create-backup`. Keep in mind that: -* TLS enabled: disables the command from running on *primary units*. -* TLS **not** enabled: disables the command from running on *non-primary units*. +* When TLS is enabled, `create-backup` can only run on replicas (non-primary) +* When TLS is **not** enabled, `create-backup` can only run in the primary unit

List backups

You can list your available, failed, and in progress backups by running the `list-backups` command: diff --git a/docs/how-to/h-deploy-azure.md b/docs/how-to/h-deploy-azure.md new file mode 100644 index 0000000000..dd266a6916 --- /dev/null +++ b/docs/how-to/h-deploy-azure.md @@ -0,0 +1,347 @@ +# How to deploy on Azure + +[Azure](https://azure.com/) is a cloud computing platform developed by Microsoft. It provides management, access, and development of applications and services to individuals, companies, and governments through its global infrastructure. Access the Azure web console at [portal.azure.com](https://portal.azure.com/). + +## Summary +* [Set up Juju and Azure tooling](#set-up-juju-and-azure-tooling) + * [Install Juju and Azure CLI](#install-juju-and-azure-cli) + * [Authenticate](#authenticate) + * [Bootstrap Juju controller on Azure](#bootstrap-juju-controller) +* [Deploy charms](#deploy-charms) +* [Expose database (optional)](#expose-database-optional) +* [Clean up](#clean-up) + +--- + +## Set up Juju and Azure tooling +[note type="caution"] +**Warning**: The `Azure interactive` method (with web browser authentication `service-principal-secret-via-browser`) described here is only supported starting with Juju 3.6-rc1! +[/note] +### Install Juju and Azure CLI +Install Juju via snap: +```shell +sudo snap install juju --channel 3.6/edge +``` + +Follow the installation guides for: +* [Azure CLI](https://learn.microsoft.com/en-us/cli/azure/install-azure-cli-linux?pivots=apt) - the Azure CLI for Linux + +To check they are all correctly installed, you can run the commands demonstrated below with sample outputs: + +```console +> juju version +3.6-rc1-genericlinux-amd64 + +> az --version +azure-cli 2.65.0 +core 2.65.0 +telemetry 1.1.0 + +Dependencies: +msal 1.31.0 +azure-mgmt-resource 23.1.1 +... + +Your CLI is up-to-date. 
+``` + +### Authenticate + +Please follow [the official Juju Azure documentation](https://juju.is/docs/juju/microsoft-azure) and check [the extra explanation about possible options](/t/15219). Choose the authentication method which fits you best. + +We are describing here the currently recommended `interactive` method with web browser authentication `service-principal-secret-via-browser`. This method does not require logging in with the Azure CLI locally, but it **requires an Azure subscription**. + +The first mandatory step is to [create an Azure subscription](https://learn.microsoft.com/en-us/azure/cost-management-billing/manage/create-subscription) - you will need the Azure subscription ID for Juju. + +Once you have it, add Azure credentials to Juju: +```none +juju add-credential azure +``` +This will start a script that will help you set up the credentials, where you will be asked to fill in a set of parameters: +* `credential-name`: Fill this with a sensible name that will help you identify the credential set, say `<CREDENTIAL_NAME>` +* `region`: Select any default region that is more convenient for you to deploy your controller and applications. Note that credentials are not region-specific. +* `auth type`: select `interactive`, which is the recommended way to authenticate to Azure using Juju +* `subscription_id`: Use the value `<subscription_id>` from the Azure subscription created in the previous step. +* `application_name`: Generate a random string to avoid collision with other users or applications +* `role-definition-name`: Generate a random string to avoid collision with other users or applications, and store it as `<AZURE_ROLE>` + +After providing this information, you will be asked to authenticate the requests via web browser, as shown in the following example outputs: + +```shell +To sign in, use a web browser to open the page https://microsoft.com/devicelogin and enter the code <YOUR_CODE> to authenticate. 
+``` + +In the browser, open the [authentication page](https://microsoft.com/devicelogin) and enter the code `<YOUR_CODE>` provided in the output. + +You will be asked to authenticate twice, to allow the creation of two different resources in Azure. + +If successful, you will see a confirmation that the credentials have been correctly added locally: + +```shell +Credential added locally for cloud "azure". +``` + +[details=Full sample output of `juju add-credential azure`] +```shell +> juju add-credential azure + +This operation can be applied to both a copy on this client and to the one on a controller. +No current controller was detected and there are no registered controllers on this client: either bootstrap one or register one. +Enter credential name: azure-test-credentials1 + +Regions + centralus + eastus + ... + +Select region [any region, credential is not region specific]: eastus + +Auth Types + interactive + service-principal-secret + managed-identity + +Select auth type [interactive]: interactive + +Enter subscription-id: [USE-YOUR-REAL-AZURE-SUBSCRIPTION-ID] + +Enter application-name (optional): azure-test-name1 + +Enter role-definition-name (optional): azure-test-role1 + +Note: your user account needs to have a role assignment to the +Azure Key Vault application (....). +You can do this from the Azure portal or using the az cli: + az ad sp create --id ... + +Initiating interactive authentication. + +To sign in, use a web browser to open the page https://microsoft.com/devicelogin and enter the code HIDDEN to authenticate. +To sign in, use a web browser to open the page https://microsoft.com/devicelogin and enter the code HIDDEN to authenticate. +Credential "azure-test-credentials1" added locally for cloud "azure". 
+``` +[/details] + +### Bootstrap Juju controller + +Once successfully completed, bootstrap the new Juju controller on Azure: +```shell +> juju bootstrap azure azure + +Creating Juju controller "azure" on azure/centralus +Looking for packaged Juju agent version 3.6-rc1 for amd64 +No packaged binary found, preparing local Juju agent binary +Launching controller instance(s) on azure/centralus... + - juju-aeb5ea-0 (arch=amd64 mem=3.5G cores=1) +Installing Juju agent on bootstrap instance +Waiting for address +Attempting to connect to 192.168.16.4:22 +Attempting to connect to 172.170.35.99:22 +Connected to 172.170.35.99 +Running machine configuration script... +Bootstrap agent now started +Contacting Juju controller at 192.168.16.4 to verify accessibility... + +Bootstrap complete, controller "azure" is now available +Controller machines are in the "controller" model + +Now you can run + juju add-model <model-name> +to create a new model to deploy workloads. +``` + +You can check the [Azure instances availability](https://portal.azure.com/#browse/Microsoft.Compute%2FVirtualMachines): + +![image|689x313](upload://bB5lCMIHtL1KToftKQVv7z86aoi.png) + + +## Deploy charms + +Create a new Juju model if you don't have one already: +```shell +juju add-model welcome +``` +> (Optional) Increase the debug level if you are troubleshooting charms: +> ```shell +> juju model-config logging-config='<root>=INFO;unit=DEBUG' +> ``` + +The following commands deploy PostgreSQL and [Data Integrator](https://charmhub.io/data-integrator), a charm that can be used to request a test database: + +```shell +juju deploy postgresql +juju deploy data-integrator --config database-name=test123 +juju integrate postgresql data-integrator +``` +Check the status: +```shell +> juju status --relations + +Model Controller Cloud/Region Version SLA Timestamp +welcome azure azure/centralus 3.6-rc1.1 unsupported 12:56:16+02:00 + +App Version Status Scale Charm Channel Rev Exposed Message +data-integrator active 1 data-integrator 
latest/stable 41 no +postgresql 14.12 active 1 postgresql 14/stable 468 no + +Unit Workload Agent Machine Public address Ports Message +data-integrator/0* active idle 1 172.170.35.131 +postgresql/0* active idle 0 172.170.35.199 5432/tcp Primary + +Machine State Address Inst id Base AZ Message +0 started 172.170.35.199 juju-491ebe-0 ubuntu@22.04 +1 started 172.170.35.131 juju-491ebe-1 ubuntu@22.04 + +Integration provider Requirer Interface Type Message +data-integrator:data-integrator-peers data-integrator:data-integrator-peers data-integrator-peers peer +postgresql:database data-integrator:postgresql postgresql_client regular +postgresql:database-peers postgresql:database-peers postgresql_peers peer +postgresql:restart postgresql:restart rolling_op peer +postgresql:upgrade postgresql:upgrade upgrade peer +``` + +Once deployed, request the credentials for your newly bootstrapped PostgreSQL database: +```shell +juju run data-integrator/leader get-credentials +``` + +Example output: +```shell +postgresql: + data: '{"database": "test123", "external-node-connectivity": "true", "requested-secrets": + "[\"username\", \"password\", \"tls\", \"tls-ca\", \"uris\"]"}' + database: test123 + endpoints: 192.168.0.5:5432 + password: Jqi0QckCAADOFagl + uris: postgresql://relation-4:Jqi0QckCAADOFagl@192.168.0.5:5432/test123 + username: relation-4 + version: "14.12" +``` + +At this point, you can access your DB inside Azure VM using the internal IP address. All further Juju applications will use the database through the internal network: +```shell +> psql postgresql://relation-4:Jqi0QckCAADOFagl@192.168.0.5:5432/test123 + +psql (14.12 (Ubuntu 14.12-0ubuntu0.22.04.1)) +Type "help" for help. + +test123=> +``` + +From here you can begin to use your newly deployed PostgreSQL. Learn more about operations like scaling, enabling TLS, managing users and passwords, and more in the [Charmed PostgreSQL tutorial](/t/9707). 
+ +## Expose database (optional) + +If it is necessary to access the database from outside of Azure, open the Azure firewall using the simple [juju expose](https://juju.is/docs/juju/juju-expose) functionality: +```shell +juju expose postgresql +``` +> Be wary that [opening ports to the public is risky](https://www.beyondtrust.com/blog/entry/what-is-an-open-port-what-are-the-security-implications). + +Once exposed, you can connect your database using the same credentials as above. This time use the Azure VM public IP assigned to the PostgreSQL instance. You can see this with `juju status`: +```shell +> juju status postgresql + +... +Model Controller Cloud/Region Version SLA Timestamp +welcome azure azure/centralus 3.6-rc1.1 unsupported 13:11:26+02:00 + +App Version Status Scale Charm Channel Rev Exposed Message +data-integrator active 1 data-integrator latest/stable 41 no +postgresql 14.12 active 1 postgresql 14/stable 468 yes + +Unit Workload Agent Machine Public address Ports Message +data-integrator/0* active idle 1 172.170.35.131 +postgresql/0* active idle 0 172.170.35.199 5432/tcp Primary + +Machine State Address Inst id Base AZ Message +0 started 172.170.35.199 juju-491ebe-0 ubuntu@22.04 +1 started 172.170.35.131 juju-491ebe-1 ubuntu@22.04 + +Integration provider Requirer Interface Type Message +data-integrator:data-integrator-peers data-integrator:data-integrator-peers data-integrator-peers peer +postgresql:database data-integrator:postgresql postgresql_client regular +postgresql:database-peers postgresql:database-peers postgresql_peers peer +postgresql:restart postgresql:restart rolling_op peer +postgresql:upgrade postgresql:upgrade upgrade peer +... +``` +Note the IP and port (`172.170.35.199:5432`) and connect via `psql`: +``` +> psql postgresql://relation-4:Jqi0QckCAADOFagl@172.170.35.199:5432/test123 + +psql (14.12 (Ubuntu 14.12-0ubuntu0.22.04.1)) +Type "help" for help. 
+ +test123=> +``` +To close public access, run: +```shell +juju unexpose postgresql +``` + +## Clean up + +[note type="caution"] +Always clean Azure resources that are no longer necessary - they could be costly! +[/note] + +See all controllers in your machine with the following command: +``` +> juju controllers +... +Controller Model User Access Cloud/Region Models Nodes HA Version +azure* welcome admin superuser azure/centralus 2 1 none 3.6-rc1.1 +``` + +To destroy the `azure` Juju controller and remove the Azure instance, run the command below. **All your data will be permanently removed.** +```shell +juju destroy-controller azure --destroy-all-models --destroy-storage --force +``` + +Next, check and manually delete all unnecessary Azure VM instances and resources. To show the list of all your Azure VMs, run the following command (make sure no running resources are left): +```shell +az vm list +az resource list +``` + +List your Juju credentials: +```shell +> juju credentials + +... +Client Credentials: +Cloud Credentials +azure azure-test-name1 +... +``` +Remove Azure CLI credentials from Juju: +```shell +juju remove-credential azure azure-test-name1 +``` + +After deleting the credentials, the `interactive` process may still leave the role resource and its assignment hanging around. +We recommend you to check if these are still present with: + +```shell +az role definition list --name azure-test-role1 +``` +> Use it without specifying the `--name` argument to get the full list. 
+ +You can also check whether you still have a role assignment bound to `azure-test-role1` registered using: + +```shell +az role assignment list --role azure-test-role1 +``` + +If this is the case, you can remove the role assignment first and then the role itself with the following commands: + +```shell +az role assignment delete --role azure-test-role1 +az role definition delete --name azure-test-role1 +``` + +Finally, log out of the Azure CLI user credentials to prevent any credential leakage: +```shell +az logout +``` \ No newline at end of file diff --git a/docs/how-to/h-deploy-multi-az.md b/docs/how-to/h-deploy-multi-az.md new file mode 100644 index 0000000000..5cb4ffeda2 --- /dev/null +++ b/docs/how-to/h-deploy-multi-az.md @@ -0,0 +1,188 @@ +# Deploy on multiple availability zones (AZ) + +During the deployment to hardware/VMs, it is important to spread all the +database copies (Juju units) to different hardware servers, +or even better, to the different [availability zones](https://en.wikipedia.org/wiki/Availability_zone) (AZ). This will guarantee no shared service-critical components across the DB cluster (eliminate the case with all eggs in the same basket). + +This guide will take you through deploying a PostgreSQL cluster on GCE using 3 available zones. All Juju units will be set up to sit in their dedicated zones only, which effectively guarantees database copy survival across all available AZs. + +[note] +This documentation assumes that your cloud supports and provides availability zones concepts. This is enabled by default on EC2/GCE and supported by LXD/MicroCloud. + +See the [Additional resources](#additional-resources) section for more details about AZ on specific clouds. 
+[/note] + +## Summary +* [Set up GCE on Google Cloud](#set-up-gce-on-google-cloud) +* [Deploy PostgreSQL with Juju zones constraints](#deploy-postgresql-with-juju-zones-constraints) + * [Simulation: A node gets lost](#simulation-a-node-gets-lost) +* [Additional resources](#additional-resources) +--- + +## Set up GCE on Google Cloud + +Let's deploy the [PostgreSQL Cluster on GCE (us-east1)](/t/11237) using all 3 zones there (`us-east1-b`, `us-east1-c`, `us-east1-d`) and make sure all units always sit in their dedicated zones only. + +[note type="caution"] +**Warning**: Creating the following GCE resources may cost you money - be sure to monitor your GCloud costs. +[/note] + +Log into Google Cloud and [bootstrap GCE on Google Cloud](/t/15722): +```shell +gcloud auth login +gcloud iam service-accounts keys create sa-private-key.json --iam-account=juju-gce-account@[your-gcloud-project-12345].iam.gserviceaccount.com +sudo mv sa-private-key.json /var/snap/juju/common/sa-private-key.json +sudo chmod a+r /var/snap/juju/common/sa-private-key.json + +juju add-credential google +juju bootstrap google gce +juju add-model mymodel +``` + +## Deploy PostgreSQL with Juju zones constraints + +Juju provides support for availability zones using **constraints**. Read more about zones in [Juju documentation](https://juju.is/docs/juju/constraint#heading--zones). 
+ +The command below demonstrates how Juju automatically deploys Charmed PostgreSQL VM using [Juju constraints](https://juju.is/docs/juju/constraint#heading--zones): + +```shell +juju deploy postgresql -n 3 \ + --constraints zones=us-east1-b,us-east1-c,us-east1-d +``` + +After a successful deployment, `juju status` will show an active application: +```shell +Model Controller Cloud/Region Version SLA Timestamp +mymodel gce google/us-east1 3.5.4 unsupported 00:16:52+02:00 + +App Version Status Scale Charm Channel Rev Exposed Message +postgresql 14.12 active 3 postgresql 14/stable 468 no + +Unit Workload Agent Machine Public address Ports Message +postgresql/0 active idle 0 34.148.44.51 5432/tcp +postgresql/1 active idle 1 34.23.202.220 5432/tcp +postgresql/2* active idle 2 34.138.167.85 5432/tcp Primary + +Machine State Address Inst id Base AZ Message +0 started 34.148.44.51 juju-e7c0db-0 ubuntu@22.04 us-east1-d RUNNING +1 started 34.23.202.220 juju-e7c0db-1 ubuntu@22.04 us-east1-c RUNNING +2 started 34.138.167.85 juju-e7c0db-2 ubuntu@22.04 us-east1-b RUNNING +``` + +and each unit/vm will sit in the separate AZ out of the box: +```shell +> gcloud compute instances list +NAME ZONE MACHINE_TYPE PREEMPTIBLE INTERNAL_IP EXTERNAL_IP STATUS +juju-a82dd9-0 us-east1-b n1-highcpu-4 10.142.0.30 34.23.252.144 RUNNING # Juju Controller +juju-e7c0db-2 us-east1-b n2-highcpu-2 10.142.0.32 34.138.167.85 RUNNING # postgresql/2 +juju-e7c0db-1 us-east1-c n2-highcpu-2 10.142.0.33 34.23.202.220 RUNNING # postgresql/1 +juju-e7c0db-0 us-east1-d n2-highcpu-2 10.142.0.31 34.148.44.51 RUNNING # postgresql/0 +``` + +### Simulation: A node gets lost +Let's destroy a GCE node and recreate it using the same AZ: +```shell +> gcloud compute instances delete juju-e7c0db-1 +No zone specified. Using zone [us-east1-c] for instance: [juju-e7c0db-1]. +The following instances will be deleted. 
Any attached disks configured to be auto-deleted will be deleted unless they are attached to any other instances or the `--keep-disks` flag is given and specifies them for keeping. Deleting a disk is +irreversible and any data on the disk will be lost. + - [juju-e7c0db-1] in [us-east1-c] + +Do you want to continue (Y/n)? Y + +Deleted [https://www.googleapis.com/compute/v1/projects/data-platform-testing-354909/zones/us-east1-c/instances/juju-e7c0db-1]. +``` + +```shell +Model Controller Cloud/Region Version SLA Timestamp +mymodel gce google/us-east1 3.5.4 unsupported 00:25:14+02:00 + +App Version Status Scale Charm Channel Rev Exposed Message +postgresql 14.12 active 2/3 postgresql 14/stable 468 no + +Unit Workload Agent Machine Public address Ports Message +postgresql/0 active idle 0 34.148.44.51 5432/tcp +postgresql/1 unknown lost 1 34.23.202.220 5432/tcp agent lost, see 'juju show-status-log postgresql/1' +postgresql/2* active idle 2 34.138.167.85 5432/tcp Primary + +Machine State Address Inst id Base AZ Message +0 started 34.148.44.51 juju-e7c0db-0 ubuntu@22.04 us-east1-d RUNNING +1 down 34.23.202.220 juju-e7c0db-1 ubuntu@22.04 us-east1-c RUNNING +2 started 34.138.167.85 juju-e7c0db-2 ubuntu@22.04 us-east1-b RUNNING +``` + +Here we should remove the no-longer-available `server/vm/GCE` node and add a new one. Juju will create it in the same AZ `us-east1-c`: +```shell +> juju remove-unit postgresql/1 --force --no-wait +WARNING This command will perform the following actions: +will remove unit postgresql/1 + +Continue [y/N]? 
y +``` + +The command `juju status` shows the machines in a healthy state, but PostgreSQL HA recovery is necessary: +```shell +Model Controller Cloud/Region Version SLA Timestamp +mymodel gce google/us-east1 3.5.4 unsupported 00:30:09+02:00 + +App Version Status Scale Charm Channel Rev Exposed Message +postgresql 14.12 active 2 postgresql 14/stable 468 no + +Unit Workload Agent Machine Public address Ports Message +postgresql/0 active idle 0 34.148.44.51 5432/tcp +postgresql/2* active idle 2 34.138.167.85 5432/tcp Primary + +Machine State Address Inst id Base AZ Message +0 started 34.148.44.51 juju-e7c0db-0 ubuntu@22.04 us-east1-d RUNNING +2 started 34.138.167.85 juju-e7c0db-2 ubuntu@22.04 us-east1-b RUNNING +``` + +Request Juju to add a new unit in the proper AZ: +```shell +juju add-unit postgresql -n 1 +``` + +Juju uses the right AZ where the node is missing. Run `juju status`: +```shell +Model Controller Cloud/Region Version SLA Timestamp +mymodel gce google/us-east1 3.5.4 unsupported 00:30:42+02:00 + +App Version Status Scale Charm Channel Rev Exposed Message +postgresql active 2/3 postgresql 14/stable 468 no + +Unit Workload Agent Machine Public address Ports Message +postgresql/0 active idle 0 34.148.44.51 5432/tcp +postgresql/2* active idle 2 34.138.167.85 5432/tcp Primary +postgresql/3 waiting allocating 3 waiting for machine + +Machine State Address Inst id Base AZ Message +0 started 34.148.44.51 juju-e7c0db-0 ubuntu@22.04 us-east1-d RUNNING +2 started 34.138.167.85 juju-e7c0db-2 ubuntu@22.04 us-east1-b RUNNING +3 pending juju-e7c0db-3 ubuntu@22.04 us-east1-c starting +``` + +## Remove GCE setup + +[note type="caution"] +**Warning**: Do not forget to remove your test setup - it can be costly! 
+[/note] + +Check the list of currently running GCE instances: +```shell +> gcloud compute instances list +NAME ZONE MACHINE_TYPE PREEMPTIBLE INTERNAL_IP EXTERNAL_IP STATUS +juju-a82dd9-0 us-east1-b n1-highcpu-4 10.142.0.30 34.23.252.144 RUNNING +juju-e7c0db-2 us-east1-b n2-highcpu-2 10.142.0.32 34.138.167.85 RUNNING +juju-e7c0db-3 us-east1-c n2d-highcpu-2 10.142.0.34 34.23.202.220 RUNNING +juju-e7c0db-0 us-east1-d n2-highcpu-2 10.142.0.31 34.148.44.51 RUNNING +``` + +Request Juju to clean all GCE resources: +```shell +juju destroy-controller gce --no-prompt --force --destroy-all-models +``` + +Re-check that there are no running GCE instances left (it should be empty): +```shell +gcloud compute instances list +``` \ No newline at end of file diff --git a/docs/how-to/h-enable-alert-rules.md b/docs/how-to/h-enable-alert-rules.md index 5f034055d7..52638adcb8 100644 --- a/docs/how-to/h-enable-alert-rules.md +++ b/docs/how-to/h-enable-alert-rules.md @@ -2,7 +2,7 @@ This guide will show how to set up [Pushover](https://pushover.net/) to receive alert notifications from the COS Alert Manager with [Awesome Alert Rules](https://samber.github.io/awesome-prometheus-alerts/). -Charmed PostgreSQL VM ships a pre-configured and pre-enabled [list of Awesome Alert Rules](https://github.com/canonical/postgresql-operator/tree/main/src/prometheus_alert_rules). +Charmed PostgreSQL VM ships a pre-configured and pre-enabled [list of Awesome Alert Rules].
Screenshot of alert rules in the Grafana web interface @@ -73,4 +73,5 @@ Do you have questions? [Contact us]! [Contact us]: /t/11852 [Charmed PostgreSQL VM operator]: /t/9697 -[COS Monitoring]: /t/10600 \ No newline at end of file +[COS Monitoring]: /t/10600 +[list of Awesome Alert Rules]: /t/15841 \ No newline at end of file diff --git a/docs/how-to/h-integrate.md b/docs/how-to/h-integrate.md index 10e07bc872..54a9761380 100644 --- a/docs/how-to/h-integrate.md +++ b/docs/how-to/h-integrate.md @@ -30,7 +30,7 @@ Integrations with charmed applications are supported via the modern [`postgresql ### Modern `postgresql_client` interface To integrate with a charmed application that supports the `postgresql_client` interface, run ```shell -juju integrate postgresql +juju integrate postgresql:database ``` To remove the integration, run diff --git a/docs/how-to/h-restore-backup.md b/docs/how-to/h-restore-backup.md index c1bbb00c07..1807f66571 100644 --- a/docs/how-to/h-restore-backup.md +++ b/docs/how-to/h-restore-backup.md @@ -78,4 +78,6 @@ However, if the user needs to restore to a specific point in time between differ juju run postgresql/leader restore restore-to-time="YYYY-MM-DDTHH:MM:SSZ" ``` -Your restore will then be in progress. \ No newline at end of file +Your restore will then be in progress. + +It’s also possible to restore to the latest point from a specific timeline by passing the ID of a backup taken on that timeline and `restore-to-time=latest` when requesting a restore. 
\ No newline at end of file diff --git a/docs/how-to/h-rollback-minor.md b/docs/how-to/h-rollback-minor.md index 5989080da7..e7d7754ff5 100644 --- a/docs/how-to/h-rollback-minor.md +++ b/docs/how-to/h-rollback-minor.md @@ -10,14 +10,13 @@ If you are using an earlier version, check the [Juju 3.0 Release Notes](https:// After a `juju refresh`, if there are any version incompatibilities in charm revisions, its dependencies, or any other unexpected failure in the upgrade process, the process will be halted and enter a failure state. -Even if the underlying PostgreSQL cluster continues to work, it’s important to roll back the charm to -a previous revision so that an update can be attempted after further inspection of the failure. +Even if the underlying PostgreSQL cluster continues to work, it’s important to roll back the charm to a previous revision so that an update can be attempted after further inspection of the failure. [note type="caution"] **Warning:** Do NOT trigger `rollback` during the running `upgrade` action! It may cause an unpredictable PostgreSQL cluster state! [/note] -## Summary +## Summary of the rollback steps 1. **Prepare** the Charmed PostgreSQL VM application for the in-place rollback. 2. **Rollback**. Once started, all units in a cluster will be executed sequentially. The rollback will be aborted (paused) if the unit rollback has failed. 3. **Check**. Make sure the charm and cluster are in a healthy state again. @@ -26,7 +25,7 @@ a previous revision so that an update can be attempted after further inspection To execute a rollback, we use a similar procedure to the upgrade. The difference is the charm revision to upgrade to. In this guide's example, we will refresh the charm back to revision `182`. 
-It is necessary to re-run `pre-upgrade-check` action on the leader unit, to enter the upgrade recovery state: +It is necessary to re-run `pre-upgrade-check` action on the leader unit in order to enter the upgrade recovery state: ```shell juju run postgresql/leader pre-upgrade-check ``` @@ -38,16 +37,16 @@ When using a charm from charmhub: juju refresh postgresql --revision=182 ``` -When deploying from a local charm file, one must have the previous revision charm file and run: - -``` +When deploying from a local charm file, one must have the previous revision charm file and run the following command: +```shell juju refresh postgresql --path=./postgresql_ubuntu-22.04-amd64.charm ``` - -Where `postgresql_ubuntu-22.04-amd64.charm` is the previous revision charm file. +> where `postgresql_ubuntu-22.04-amd64.charm` is the previous revision charm file. The first unit will be rolled out and should rejoin the cluster after settling down. After the refresh command, the juju controller revision for the application will be back in sync with the running Charmed PostgreSQL revision. ## Step 3: Check -Future [improvements are planned](https://warthogs.atlassian.net/browse/DPE-2621) to check the state on pods/clusters on a low level. At the moment check `juju status` to make sure the cluster [state](/t/10844) is OK. \ No newline at end of file +Future [improvements are planned](https://warthogs.atlassian.net/browse/DPE-2621) to check the state on pods/clusters on a low level. + +For now, check `juju status` to make sure the cluster [state](/t/10844) is OK. \ No newline at end of file diff --git a/docs/how-to/h-scale.md b/docs/how-to/h-scale.md index a2abf2319a..ec2a0e08e2 100644 --- a/docs/how-to/h-scale.md +++ b/docs/how-to/h-scale.md @@ -6,7 +6,7 @@ If you are using an earlier version, check the [Juju 3.0 Release Notes](https:// # How to scale units -Replication in PostgreSQL is the process of creating copies of the stored data. 
This provides redundancy, which means the application can provide self-healing capabilities in case one replica fails. In this context, each replica is equivalent one juju unit. +Replication in PostgreSQL is the process of creating copies of the stored data. This provides redundancy, which means the application can provide self-healing capabilities in case one replica fails. In this context, each replica is equivalent to one juju unit. This guide will show you how to establish and change the amount of juju units used to replicate your data. @@ -16,6 +16,7 @@ To deploy PostgreSQL with multiple replicas, specify the number of desired units ```shell juju deploy postgresql --channel 14/stable -n ``` +> It is recommended to use an odd number to prevent a [split-brain](https://en.wikipedia.org/wiki/Split-brain_(computing)) scenario. ### Primary vs. leader unit @@ -27,6 +28,7 @@ To retrieve the juju unit that corresponds to the PostgreSQL primary, use the ac ```shell juju run postgresql/leader get-primary ``` + Similarly, the primary replica is displayed as a status message in `juju status`. However, one should note that this hook gets called on regular time intervals and the primary may be outdated if the status hook has not been called recently. [note] diff --git a/docs/how-to/h-upgrade-minor.md b/docs/how-to/h-upgrade-minor.md index ffcf34ff92..ec1ea6402d 100644 --- a/docs/how-to/h-upgrade-minor.md +++ b/docs/how-to/h-upgrade-minor.md @@ -7,9 +7,9 @@ If you are using an earlier version, check the [Juju 3.0 Release Notes](https:// # Perform a minor upgrade **Example**: PostgreSQL 14.8 -> PostgreSQL 14.9
-(including simple charm revision bump: from revision 193 to revision 196). +(including charm revision bump: e.g. Revision 193 -> Revision 196) -This guide is part of [Charmed PostgreSQL Upgrades](/t/12086). Please refer to this page for more information and an overview of the content. +This guide is part of [Charmed PostgreSQL Upgrades](/t/12086). Refer to this page for more information and an overview of the content. ## Summary - [**Pre-upgrade checks**](#pre-upgrade-checks): Important information to consider before starting an upgrade. @@ -39,6 +39,7 @@ Some examples are operations like (but not limited to) the following: * Upgrading other connected/related/integrated applications simultaneously Concurrency with other operations is not supported, and it can lead the cluster into inconsistent states. + ### Backups **Make sure to have a backup of your data when running any type of upgrade.** @@ -57,7 +58,7 @@ This step is only valid when deploying from [charmhub](https://charmhub.io/). If a [local charm](https://juju.is/docs/sdk/deploy-a-charm) is deployed (revision is small, e.g. 0-10), make sure the proper/current local revision of the `.charm` file is available BEFORE going further. You might need it for a rollback. [/note] -The first step is to record the revision of the running application as a safety measure for a rollback action. To accomplish this, simply run the `juju status` command and look for the deployed Charmed PostgreSQL revision in the command output, e.g.: +The first step is to record the revision of the running application as a safety measure for a rollback action. To accomplish this, run the `juju status` command and look for the deployed Charmed PostgreSQL revision in the command output, e.g.: ```shell Model Controller Cloud/Region Version SLA Timestamp @@ -115,7 +116,7 @@ All units will be refreshed (i.e. 
receive new charm content), and the upgrade wi First the `replica` units, then the `sync-standby` units, and lastly, the `leader`(or `primary`) unit. [/note] - `juju status` will look like: + `juju status` will look similar to the output below: ```shell Model Controller Cloud/Region Version SLA Timestamp @@ -170,7 +171,9 @@ After a `juju refresh`, if there are any version incompatibilities in charm revi The step must be skipped if the upgrade went well! -Although the underlying PostgreSQL Cluster continues to work, it’s important to roll back the charm to a previous revision so that an update can be attempted after further inspection of the failure. Please switch to the dedicated [minor rollback](/t/12090) tutorial if necessary. +Although the underlying PostgreSQL Cluster continues to work, it’s important to roll back the charm to a previous revision so that an update can be attempted after further inspection of the failure. + +> See: [How to perform a minor rollback](/t/12090) ## Post-upgrade check diff --git a/docs/overview.md b/docs/overview.md index b2a68da5e3..69b9e37c86 100644 --- a/docs/overview.md +++ b/docs/overview.md @@ -1,3 +1,5 @@ +> This is an **IAAS/VM** operator. To deploy on Kubernetes, see [Charmed PostgreSQL K8s](https://charmhub.io/postgresql-k8s). + # Charmed PostgreSQL documentation Charmed PostgreSQL is an open-source software operator designed to deploy and operate object-relational databases on IAAS/VM. It packages the powerful database management system [PostgreSQL](https://www.postgresql.org/) into a charmed operator for deployment with [Juju](https://juju.is/docs/juju). @@ -7,12 +9,6 @@ This charm offers automated operations management from day 0 to day 2. It is equ Charmed PostgreSQL meets the need of deploying PostgreSQL in a structured and consistent manner while providing flexibility in configuration.
It simplifies deployment, scaling, configuration and management of relational databases in large-scale production environments reliably. This charmed operator is made for anyone looking for a comprehensive database management interface, whether for operating a complex production environment or simply as a playground to learn more about databases and charms. - -[note] -This operator is built for **IAAS/VM**. - -For deployments in **Kubernetes** environments, see [Charmed PostgreSQL K8s](https://charmhub.io/postgresql-k8s). -[/note] +[info]: https://img.shields.io/badge/info-blue +[warning]: https://img.shields.io/badge/warning-yellow +[critical]: https://img.shields.io/badge/critical-red \ No newline at end of file diff --git a/docs/explanation/e-statuses.md b/docs/reference/r-statuses.md similarity index 84% rename from docs/explanation/e-statuses.md rename to docs/reference/r-statuses.md index a35965e00f..20dc754ba7 100644 --- a/docs/explanation/e-statuses.md +++ b/docs/reference/r-statuses.md @@ -1,6 +1,6 @@ -# Charm Statuses Explanations +# Charm statuses -> :warning: **WARNING** : it is an work-in-progress article. Do NOT use it in production! Contact [Canonical Data Platform team](https://chat.charmhub.io/charmhub/channels/data-platform) if you are interested in the topic. +> :warning: **WARNING** : This is a work-in-progress article. Do NOT use it in production! Contact [Canonical Data Platform team](https://chat.charmhub.io/charmhub/channels/data-platform) if you are interested in the topic. The charm follows [standard Juju applications statuses](https://juju.is/docs/olm/status-values#heading--application-status). Here you can find the expected end-users reaction on different statuses: @@ -17,6 +17,7 @@ The charm follows [standard Juju applications statuses](https://juju.is/docs/olm | **blocked** | failed to start Patroni | TODO: error/retry?
| | | **blocked** | Failed to create postgres user | The charm couldn't create the default `postgres` database user due to connection problems | Connect to the database using the `operator` user and the password from the `get-password` action, then run `CREATE ROLE postgres WITH LOGIN SUPERUSER;` | | **blocked** | Failed to restore backup | The database couldn't start after the restore | The charm needs fix in the code to recover from this status and enable a new restore to be requested | +| **blocked** | Please choose one endpoint to use. No need to relate all of them simultaneously! | [The modern / legacy interfaces](https://charmhub.io/postgresql/docs/e-legacy-charm) should not be used simultaneously. | Remove modern or legacy relation. Choose one to use at a time. | | **error** | any | An unhanded internal error happened | Read the message hint. Execute `juju resolve ` after addressing the root of the error state | | **terminated** | any | The unit is gone and will be cleaned by Juju soon | No actions possible | | **unknown** | any | Juju doesn't know the charm app/unit status. Possible reason: K8s charm termination in progress. 
| Manual investigation required if status is permanent | \ No newline at end of file From 08ba852335a6080b8e928299d9efb9bd8f430559 Mon Sep 17 00:00:00 2001 From: Dragomir Penev <6687393+dragomirp@users.noreply.github.com> Date: Tue, 12 Nov 2024 19:17:34 +0200 Subject: [PATCH 3/8] [MISC] Merge update_tls_flag into update_endpoints (#669) * Merge update_tls_flag into update_endpoints * No peer data --- src/charm.py | 8 +++---- src/relations/postgresql_provider.py | 30 +++++++------------------- tests/unit/test_postgresql_provider.py | 9 +++++--- 3 files changed, 18 insertions(+), 29 deletions(-) diff --git a/src/charm.py b/src/charm.py index 16390b2385..37fbfb1552 100755 --- a/src/charm.py +++ b/src/charm.py @@ -822,6 +822,8 @@ def _units_ips(self) -> set[str]: @property def members_ips(self) -> set[str]: """Returns the list of IPs addresses of the current members of the cluster.""" + if not self._peers: + return set() return set(json.loads(self._peers.data[self.app].get("members_ips", "[]"))) def _add_to_members_ips(self, ip: str) -> None: @@ -1709,9 +1711,7 @@ def update_config(self, is_creating_backup: bool = False) -> bool: # in a bundle together with the TLS certificates operator. This flag is used to # know when to call the Patroni API using HTTP or HTTPS. self.unit_peer_data.update({"tls": "enabled" if enable_tls else ""}) - self.postgresql_client_relation.update_tls_flag( - "True" if self.is_tls_enabled else "False" - ) + self.postgresql_client_relation.update_endpoints() logger.debug("Early exit update_config: Workload not started yet") return True @@ -1787,7 +1787,7 @@ def _handle_postgresql_restart_need(self, enable_tls: bool) -> None: # Ignore the error, as it happens only to indicate that the configuration has not changed. 
pass self.unit_peer_data.update({"tls": "enabled" if enable_tls else ""}) - self.postgresql_client_relation.update_tls_flag("True" if self.is_tls_enabled else "False") + self.postgresql_client_relation.update_endpoints() # Restart PostgreSQL if TLS configuration has changed # (so the both old and new connections use the configuration). diff --git a/src/relations/postgresql_provider.py b/src/relations/postgresql_provider.py index 3b10f801c8..6b462124ba 100644 --- a/src/relations/postgresql_provider.py +++ b/src/relations/postgresql_provider.py @@ -108,17 +108,6 @@ def _on_database_requested(self, event: DatabaseRequestedEvent) -> None: # Set the database name self.database_provides.set_database(event.relation.id, database) - # Set TLS flag - self.database_provides.set_tls( - event.relation.id, - "True" if self.charm.is_tls_enabled else "False", - ) - - # Set TLS CA - if self.charm.is_tls_enabled: - _, ca, _ = self.charm.tls.get_tls_files() - self.database_provides.set_tls_ca(event.relation.id, ca) - # Update the read/write and read-only endpoints. 
self.update_endpoints(event) @@ -201,6 +190,12 @@ def update_endpoints(self, event: DatabaseRequestedEvent = None) -> None: else "" ) + tls = "True" if self.charm.is_tls_enabled else "False" + if tls == "True": + _, ca, _ = self.charm.tls.get_tls_files() + else: + ca = "" + for relation_id in rel_data: user = f"relation-{relation_id}" database = rel_data[relation_id].get("database") @@ -226,17 +221,8 @@ def update_endpoints(self, event: DatabaseRequestedEvent = None) -> None: f"postgresql://{user}:{password}@{self.charm.primary_endpoint}:{DATABASE_PORT}/{database}", ) - def update_tls_flag(self, tls: str) -> None: - """Update TLS flag and CA in relation databag.""" - relations = self.model.relations[self.relation_name] - if tls == "True": - _, ca, _ = self.charm.tls.get_tls_files() - else: - ca = "" - - for relation in relations: - self.database_provides.set_tls(relation.id, tls) - self.database_provides.set_tls_ca(relation.id, ca) + self.database_provides.set_tls(relation_id, tls) + self.database_provides.set_tls_ca(relation_id, ca) def _check_multiple_endpoints(self) -> bool: """Checks if there are relations with other endpoints.""" diff --git a/tests/unit/test_postgresql_provider.py b/tests/unit/test_postgresql_provider.py index 0a77509742..ab8cf5d00c 100644 --- a/tests/unit/test_postgresql_provider.py +++ b/tests/unit/test_postgresql_provider.py @@ -142,7 +142,6 @@ def test_on_database_requested(harness): "password": "test-password", "version": POSTGRESQL_VERSION, "database": f"{DATABASE}", - "tls": "False", } # Assert no BlockedStatus was set. @@ -154,7 +153,6 @@ def test_on_database_requested(harness): # No data is set in the databag by the database. assert harness.get_relation_data(rel_id, harness.charm.app.name) == { "data": f'{{"database": "{DATABASE}", "extra-user-roles": "{EXTRA_USER_ROLES}"}}', - "tls": "False", } # BlockedStatus due to a PostgreSQLCreateDatabaseError. 
@@ -163,7 +161,6 @@ def test_on_database_requested(harness): # No data is set in the databag by the database. assert harness.get_relation_data(rel_id, harness.charm.app.name) == { "data": f'{{"database": "{DATABASE}", "extra-user-roles": "{EXTRA_USER_ROLES}"}}', - "tls": "False", } # BlockedStatus due to a PostgreSQLGetPostgreSQLVersionError. @@ -256,6 +253,7 @@ def test_update_endpoints_with_event(harness): "endpoints": "1.1.1.1:5432", "read-only-endpoints": "2.2.2.2:5432", "uris": "postgresql://relation-2:test_password@1.1.1.1:5432/test_db", + "tls": "False", } assert harness.get_relation_data(another_rel_id, harness.charm.app.name) == {} _fetch_my_relation_data.assert_called_once_with([2], ["password"]) @@ -265,6 +263,7 @@ def test_update_endpoints_with_event(harness): assert harness.get_relation_data(rel_id, harness.charm.app.name) == { "endpoints": "1.1.1.1:5432", "uris": "postgresql://relation-2:test_password@1.1.1.1:5432/test_db", + "tls": "False", } assert harness.get_relation_data(another_rel_id, harness.charm.app.name) == {} @@ -331,11 +330,13 @@ def test_update_endpoints_without_event(harness): "endpoints": "1.1.1.1:5432", "read-only-endpoints": "2.2.2.2:5432", "uris": "postgresql://relation-2:test_password@1.1.1.1:5432/test_db", + "tls": "False", } assert harness.get_relation_data(another_rel_id, harness.charm.app.name) == { "endpoints": "1.1.1.1:5432", "read-only-endpoints": "2.2.2.2:5432", "uris": "postgresql://relation-3:test_password@1.1.1.1:5432/test_db2", + "tls": "False", } _fetch_my_relation_data.assert_called_once_with(None, ["password"]) @@ -344,8 +345,10 @@ def test_update_endpoints_without_event(harness): assert harness.get_relation_data(rel_id, harness.charm.app.name) == { "endpoints": "1.1.1.1:5432", "uris": "postgresql://relation-2:test_password@1.1.1.1:5432/test_db", + "tls": "False", } assert harness.get_relation_data(another_rel_id, harness.charm.app.name) == { "endpoints": "1.1.1.1:5432", "uris": 
"postgresql://relation-3:test_password@1.1.1.1:5432/test_db2", + "tls": "False", } From 3f31bbf7f965f95d5ec9c1ed068d9faa91fb65c9 Mon Sep 17 00:00:00 2001 From: "renovate[bot]" <29139614+renovate[bot]@users.noreply.github.com> Date: Fri, 15 Nov 2024 18:36:32 +0100 Subject: [PATCH 4/8] Migrate config .github/renovate.json5 (#673) Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com> --- .github/renovate.json5 | 58 ++++++++++++++++++++++++++---------------- 1 file changed, 36 insertions(+), 22 deletions(-) diff --git a/.github/renovate.json5 b/.github/renovate.json5 index 3a11766c18..34085c9225 100644 --- a/.github/renovate.json5 +++ b/.github/renovate.json5 @@ -1,30 +1,44 @@ { - "$schema": "https://docs.renovatebot.com/renovate-schema.json", - "extends": ["github>canonical/data-platform//renovate_presets/charm.json5"], - "reviewers": ["team:data-platform-postgresql"], - "packageRules": [ - // Later rules override earlier rules + $schema: 'https://docs.renovatebot.com/renovate-schema.json', + extends: [ + 'github>canonical/data-platform//renovate_presets/charm.json5', + ], + reviewers: [ + 'team:data-platform-postgresql', + ], + packageRules: [ { - "matchPackageNames": ["pydantic"], - "allowedVersions": "<2.0.0" + matchPackageNames: [ + 'pydantic', + ], + allowedVersions: '<2.0.0', }, { - "matchManagers": ["regex"], - "matchDepNames": ["juju"], - "matchDatasources": ["pypi"], - "allowedVersions": "<3", - "groupName": "Juju agents" - } + matchManagers: [ + 'custom.regex', + ], + matchDepNames: [ + 'juju', + ], + matchDatasources: [ + 'pypi', + ], + allowedVersions: '<3', + groupName: 'Juju agents', + }, ], - "regexManagers": [ + customManagers: [ { - "fileMatch": ["^\\.github/workflows/[^/]+\\.ya?ml$"], - "matchStrings": [ - "(libjuju: )==(?.*?) +# renovate: latest libjuju 2" + customType: 'regex', + fileMatch: [ + '^\\.github/workflows/[^/]+\\.ya?ml$', + ], + matchStrings: [ + '(libjuju: )==(?.*?) 
+# renovate: latest libjuju 2', ], - "depNameTemplate": "juju", - "datasourceTemplate": "pypi", - "versioningTemplate": "loose" - } - ] + depNameTemplate: 'juju', + datasourceTemplate: 'pypi', + versioningTemplate: 'loose', + }, + ], } From 3951e95e64d7b7598f0ccb850c06f766985b5223 Mon Sep 17 00:00:00 2001 From: "renovate[bot]" <29139614+renovate[bot]@users.noreply.github.com> Date: Mon, 18 Nov 2024 10:03:24 -0300 Subject: [PATCH 5/8] Update data-platform-workflows to v23.0.5 (#676) Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com> --- .github/workflows/ci.yaml | 6 +++--- .github/workflows/release.yaml | 4 ++-- .github/workflows/sync_docs.yaml | 2 +- poetry.lock | 18 +++++++++--------- pyproject.toml | 8 ++++---- 5 files changed, 19 insertions(+), 19 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 2907e3eb59..45bcaa710e 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -23,7 +23,7 @@ on: jobs: lint: name: Lint - uses: canonical/data-platform-workflows/.github/workflows/lint.yaml@v23.0.4 + uses: canonical/data-platform-workflows/.github/workflows/lint.yaml@v23.0.5 unit-test: name: Unit test charm @@ -45,7 +45,7 @@ jobs: build: name: Build charm - uses: canonical/data-platform-workflows/.github/workflows/build_charm.yaml@v23.0.4 + uses: canonical/data-platform-workflows/.github/workflows/build_charm.yaml@v23.0.5 with: cache: true @@ -77,7 +77,7 @@ jobs: - lint - unit-test - build - uses: canonical/data-platform-workflows/.github/workflows/integration_test_charm.yaml@v23.0.4 + uses: canonical/data-platform-workflows/.github/workflows/integration_test_charm.yaml@v23.0.5 with: artifact-prefix: ${{ needs.build.outputs.artifact-prefix }} architecture: ${{ matrix.architecture }} diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index b356a84476..f709bf43b1 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -25,14 +25,14 @@ 
jobs: build: name: Build charm - uses: canonical/data-platform-workflows/.github/workflows/build_charm.yaml@v23.0.4 + uses: canonical/data-platform-workflows/.github/workflows/build_charm.yaml@v23.0.5 release: name: Release charm needs: - ci-tests - build - uses: canonical/data-platform-workflows/.github/workflows/release_charm.yaml@v23.0.4 + uses: canonical/data-platform-workflows/.github/workflows/release_charm.yaml@v23.0.5 with: channel: 14/edge artifact-prefix: ${{ needs.build.outputs.artifact-prefix }} diff --git a/.github/workflows/sync_docs.yaml b/.github/workflows/sync_docs.yaml index cc6cfbc480..3a41cc31cc 100644 --- a/.github/workflows/sync_docs.yaml +++ b/.github/workflows/sync_docs.yaml @@ -10,7 +10,7 @@ on: jobs: sync-docs: name: Sync docs from Discourse - uses: canonical/data-platform-workflows/.github/workflows/sync_docs.yaml@v23.0.4 + uses: canonical/data-platform-workflows/.github/workflows/sync_docs.yaml@v23.0.5 with: reviewers: a-velasco,izmalk permissions: diff --git a/poetry.lock b/poetry.lock index a298423bd0..7b4d2ac24f 100644 --- a/poetry.lock +++ b/poetry.lock @@ -31,8 +31,8 @@ pytest = "*" [package.source] type = "git" url = "https://github.com/canonical/data-platform-workflows" -reference = "v23.0.4" -resolved_reference = "60f088b7f0f967a8e35d45339f5123a6e74786f7" +reference = "v23.0.5" +resolved_reference = "e3f522c648375decee87fc0982c012e46ffb0b98" subdirectory = "python/pytest_plugins/allure_pytest_collection_report" [[package]] @@ -1802,8 +1802,8 @@ develop = false [package.source] type = "git" url = "https://github.com/canonical/data-platform-workflows" -reference = "v23.0.4" -resolved_reference = "60f088b7f0f967a8e35d45339f5123a6e74786f7" +reference = "v23.0.5" +resolved_reference = "e3f522c648375decee87fc0982c012e46ffb0b98" subdirectory = "python/pytest_plugins/github_secrets" [[package]] @@ -1840,8 +1840,8 @@ pyyaml = "*" [package.source] type = "git" url = "https://github.com/canonical/data-platform-workflows" -reference = 
"v23.0.4" -resolved_reference = "60f088b7f0f967a8e35d45339f5123a6e74786f7" +reference = "v23.0.5" +resolved_reference = "e3f522c648375decee87fc0982c012e46ffb0b98" subdirectory = "python/pytest_plugins/pytest_operator_cache" [[package]] @@ -1859,8 +1859,8 @@ pytest = "*" [package.source] type = "git" url = "https://github.com/canonical/data-platform-workflows" -reference = "v23.0.4" -resolved_reference = "60f088b7f0f967a8e35d45339f5123a6e74786f7" +reference = "v23.0.5" +resolved_reference = "e3f522c648375decee87fc0982c012e46ffb0b98" subdirectory = "python/pytest_plugins/pytest_operator_groups" [[package]] @@ -2533,4 +2533,4 @@ type = ["pytest-mypy"] [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "be84825d8bc3d6716d62a2c7f283a49f386445927c12dea73df65e317df7b3d9" +content-hash = "a24006bb8af98b161cd722b73b93b3ce7fbc5f44e46ee2d4faa24e438c09e0de" diff --git a/pyproject.toml b/pyproject.toml index 500cffe99d..ea24e76a47 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -61,10 +61,10 @@ optional = true [tool.poetry.group.integration.dependencies] pytest = "^8.3.3" -pytest-github-secrets = {git = "https://github.com/canonical/data-platform-workflows", tag = "v23.0.4", subdirectory = "python/pytest_plugins/github_secrets"} +pytest-github-secrets = {git = "https://github.com/canonical/data-platform-workflows", tag = "v23.0.5", subdirectory = "python/pytest_plugins/github_secrets"} pytest-operator = "^0.38.0" -pytest-operator-cache = {git = "https://github.com/canonical/data-platform-workflows", tag = "v23.0.4", subdirectory = "python/pytest_plugins/pytest_operator_cache"} -pytest-operator-groups = {git = "https://github.com/canonical/data-platform-workflows", tag = "v23.0.4", subdirectory = "python/pytest_plugins/pytest_operator_groups"} +pytest-operator-cache = {git = "https://github.com/canonical/data-platform-workflows", tag = "v23.0.5", subdirectory = "python/pytest_plugins/pytest_operator_cache"} +pytest-operator-groups = {git = 
"https://github.com/canonical/data-platform-workflows", tag = "v23.0.5", subdirectory = "python/pytest_plugins/pytest_operator_groups"} # renovate caret doesn't work: https://github.com/renovatebot/renovate/issues/26940 juju = "<=3.5.0.0" boto3 = "*" @@ -73,7 +73,7 @@ landscape-api-py3 = "^0.9.0" mailmanclient = "^3.3.5" psycopg2-binary = "^2.9.10" allure-pytest = "^2.13.5" -allure-pytest-collection-report = {git = "https://github.com/canonical/data-platform-workflows", tag = "v23.0.4", subdirectory = "python/pytest_plugins/allure_pytest_collection_report"} +allure-pytest-collection-report = {git = "https://github.com/canonical/data-platform-workflows", tag = "v23.0.5", subdirectory = "python/pytest_plugins/allure_pytest_collection_report"} # Testing tools configuration [tool.coverage.run] From 84f381ea636121d4fd6e81a22a3db30db43d4e97 Mon Sep 17 00:00:00 2001 From: "renovate[bot]" <29139614+renovate[bot]@users.noreply.github.com> Date: Mon, 18 Nov 2024 10:59:00 -0300 Subject: [PATCH 6/8] Update codecov/codecov-action action to v5 (#674) Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com> --- .github/workflows/ci.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 45bcaa710e..8d160e58dc 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -39,7 +39,7 @@ jobs: - name: Run tests run: tox run -e unit - name: Upload Coverage to Codecov - uses: codecov/codecov-action@v4 + uses: codecov/codecov-action@v5 env: CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} From 1b6a748c71109f2ccf98f5cc44fc6102afe66b60 Mon Sep 17 00:00:00 2001 From: shayancanonical <99665202+shayancanonical@users.noreply.github.com> Date: Mon, 18 Nov 2024 15:46:45 -0500 Subject: [PATCH 7/8] Test against juju 3.6/candidate + upgrade dpw to v23.0.5 (#675) * Test against juju 3.6/candidate + upgrade dpw to v23.0.5 * Update 3.6 another set of nightly tests to run against 3.6/candidate 
instead of 3.6/beta --- .github/workflows/ci.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 8d160e58dc..b76a889f3c 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -59,7 +59,7 @@ jobs: allure_on_amd64: false - agent: 3.4.6 # renovate: juju-agent-pin-minor allure_on_amd64: true - - snap_channel: 3.6/beta + - snap_channel: 3.6/candidate allure_on_amd64: false architecture: - amd64 @@ -69,7 +69,7 @@ jobs: allure_on_amd64: true architecture: arm64 - juju: - snap_channel: 3.6/beta + snap_channel: 3.6/candidate allure_on_amd64: false architecture: arm64 name: Integration | ${{ matrix.juju.agent || matrix.juju.snap_channel }} | ${{ matrix.architecture }} From f50d3732b9a755f5891dca993aa5866c49623e09 Mon Sep 17 00:00:00 2001 From: Dragomir Penev <6687393+dragomirp@users.noreply.github.com> Date: Tue, 19 Nov 2024 12:08:58 +0200 Subject: [PATCH 8/8] Bump libs (#677) --- lib/charms/postgresql_k8s/v0/postgresql.py | 16 +- .../tempo_coordinator_k8s/v0/charm_tracing.py | 389 ++++++++++++++++-- 2 files changed, 370 insertions(+), 35 deletions(-) diff --git a/lib/charms/postgresql_k8s/v0/postgresql.py b/lib/charms/postgresql_k8s/v0/postgresql.py index 2f2b2f9990..4d8d6dc30c 100644 --- a/lib/charms/postgresql_k8s/v0/postgresql.py +++ b/lib/charms/postgresql_k8s/v0/postgresql.py @@ -36,7 +36,7 @@ # Increment this PATCH version before using `charmcraft publish-lib` or reset # to 0 if you are raising the major API version -LIBPATCH = 37 +LIBPATCH = 39 INVALID_EXTRA_USER_ROLE_BLOCKING_MESSAGE = "invalid role(s) for extra user roles" @@ -244,7 +244,7 @@ def create_user( privilege for privilege in privileges if privilege not in valid_privileges ] if len(invalid_privileges) > 0: - logger.error(f'Invalid extra user roles: {", ".join(privileges)}') + logger.error(f"Invalid extra user roles: {', '.join(privileges)}") raise 
PostgreSQLCreateUserError(INVALID_EXTRA_USER_ROLE_BLOCKING_MESSAGE) with self._connect_to_database() as connection, connection.cursor() as cursor: @@ -256,7 +256,7 @@ def create_user( user_definition = "CREATE ROLE {}" user_definition += f"WITH {'NOLOGIN' if user == 'admin' else 'LOGIN'}{' SUPERUSER' if admin else ''} ENCRYPTED PASSWORD '{password}'{'IN ROLE admin CREATEDB' if admin_role else ''}" if privileges: - user_definition += f' {" ".join(privileges)}' + user_definition += f" {' '.join(privileges)}" cursor.execute(sql.SQL("BEGIN;")) cursor.execute(sql.SQL("SET LOCAL log_statement = 'none';")) cursor.execute(sql.SQL(f"{user_definition};").format(sql.Identifier(user))) @@ -375,8 +375,12 @@ def _generate_database_privileges_statements( UNION SELECT 2 AS index,'ALTER SEQUENCE '|| sequence_schema || '."' || sequence_name ||'" OWNER TO {};' AS statement FROM information_schema.sequences WHERE NOT sequence_schema IN ('pg_catalog', 'information_schema') UNION SELECT 3 AS index,'ALTER FUNCTION '|| nsp.nspname || '."' || p.proname ||'"('||pg_get_function_identity_arguments(p.oid)||') OWNER TO {};' AS statement -FROM pg_proc p JOIN pg_namespace nsp ON p.pronamespace = nsp.oid WHERE NOT nsp.nspname IN ('pg_catalog', 'information_schema') -UNION SELECT 4 AS index,'ALTER VIEW '|| schemaname || '."' || viewname ||'" OWNER TO {};' AS statement +FROM pg_proc p JOIN pg_namespace nsp ON p.pronamespace = nsp.oid WHERE NOT nsp.nspname IN ('pg_catalog', 'information_schema') AND p.prokind = 'f' +UNION SELECT 4 AS index,'ALTER PROCEDURE '|| nsp.nspname || '."' || p.proname ||'"('||pg_get_function_identity_arguments(p.oid)||') OWNER TO {};' AS statement +FROM pg_proc p JOIN pg_namespace nsp ON p.pronamespace = nsp.oid WHERE NOT nsp.nspname IN ('pg_catalog', 'information_schema') AND p.prokind = 'p' +UNION SELECT 5 AS index,'ALTER AGGREGATE '|| nsp.nspname || '."' || p.proname ||'"('||pg_get_function_identity_arguments(p.oid)||') OWNER TO {};' AS statement +FROM pg_proc p JOIN 
pg_namespace nsp ON p.pronamespace = nsp.oid WHERE NOT nsp.nspname IN ('pg_catalog', 'information_schema') AND p.prokind = 'a' +UNION SELECT 6 AS index,'ALTER VIEW '|| schemaname || '."' || viewname ||'" OWNER TO {};' AS statement FROM pg_catalog.pg_views WHERE NOT schemaname IN ('pg_catalog', 'information_schema')) AS statements ORDER BY index) LOOP EXECUTE format(r.statement); END LOOP; @@ -386,6 +390,8 @@ def _generate_database_privileges_statements( sql.Identifier(user), sql.Identifier(user), sql.Identifier(user), + sql.Identifier(user), + sql.Identifier(user), ) ) statements.append( diff --git a/lib/charms/tempo_coordinator_k8s/v0/charm_tracing.py b/lib/charms/tempo_coordinator_k8s/v0/charm_tracing.py index 1e7ff8405a..cf8def11ac 100644 --- a/lib/charms/tempo_coordinator_k8s/v0/charm_tracing.py +++ b/lib/charms/tempo_coordinator_k8s/v0/charm_tracing.py @@ -69,6 +69,9 @@ def my_tracing_endpoint(self) -> Optional[str]: - every event as a span (including custom events) - every charm method call (except dunders) as a span +We recommend that you scale up your tracing provider and relate it to an ingress so that your tracing requests +go through the ingress and get load balanced across all units. Otherwise, if the provider's leader goes down, your tracing goes down. + ## TLS support If your charm integrates with a TLS provider which is also trusted by the tracing provider (the Tempo charm), @@ -114,6 +117,57 @@ def get_tracer(self) -> opentelemetry.trace.Tracer: See the official opentelemetry Python SDK documentation for usage: https://opentelemetry-python.readthedocs.io/en/latest/ + +## Caching traces +The `trace_charm` machinery will buffer any traces collected during charm execution and store them +to a file on the charm container until a tracing backend becomes available. At that point, it will +flush them to the tracing receiver. 
+ +By default, the buffer is configured to start dropping old traces if any of these conditions apply: + +- the storage size exceeds 10 MiB +- the number of buffered events exceeds 100 + +You can configure this by, for example: + +```python +@trace_charm( + tracing_endpoint="my_tracing_endpoint", + server_cert="_server_cert", + # only cache up to 42 events + buffer_max_events=42, + # only cache up to 42 MiB + buffer_max_size_mib=42, # minimum 10! +) +class MyCharm(CharmBase): + ... +``` + +Note that setting `buffer_max_events` to 0 will effectively disable the buffer. + +The path of the buffer file is by default in the charm's execution root, which for k8s charms means +that in case of pod churn, the cache will be lost. The recommended solution is to use an existing storage +(or add a new one) such as: + +```yaml +storage: + data: + type: filesystem + location: /charm-traces +``` + +and then configure the `@trace_charm` decorator to use it as path for storing the buffer: +```python +@trace_charm( + tracing_endpoint="my_tracing_endpoint", + server_cert="_server_cert", + # store traces to a PVC so they're not lost on pod restart. + buffer_path="/charm-traces/buffer.file", +) +class MyCharm(CharmBase): + ... +``` + ## Upgrading from `v0` If you are upgrading from `charm_tracing` v0, you need to take the following steps (assuming you already @@ -171,6 +225,12 @@ def my_tracing_endpoint(self) -> Optional[str]: 3) If you were passing a certificate (str) using `server_cert`, you need to change it to provide an *absolute* path to the certificate file instead. """ +import typing + +from opentelemetry.exporter.otlp.proto.common._internal.trace_encoder import ( + encode_spans, +) +from opentelemetry.sdk.trace.export.in_memory_span_exporter import InMemorySpanExporter def _remove_stale_otel_sdk_packages(): @@ -222,6 +282,9 @@ def _remove_stale_otel_sdk_packages(): otel_logger.debug("Successfully applied _remove_stale_otel_sdk_packages patch. 
") +# apply hacky patch to remove stale opentelemetry sdk packages on upgrade-charm. +# it could be trouble if someone ever decides to implement their own tracer parallel to +# ours and before the charm has inited. We assume they won't. _remove_stale_otel_sdk_packages() import functools @@ -235,6 +298,7 @@ def _remove_stale_otel_sdk_packages(): Any, Callable, Generator, + List, Optional, Sequence, Type, @@ -247,8 +311,12 @@ def _remove_stale_otel_sdk_packages(): import ops from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter from opentelemetry.sdk.resources import Resource -from opentelemetry.sdk.trace import Span, TracerProvider -from opentelemetry.sdk.trace.export import BatchSpanProcessor +from opentelemetry.sdk.trace import ReadableSpan, Span, TracerProvider +from opentelemetry.sdk.trace.export import ( + BatchSpanProcessor, + SpanExporter, + SpanExportResult, +) from opentelemetry.trace import INVALID_SPAN, Tracer from opentelemetry.trace import get_current_span as otlp_get_current_span from opentelemetry.trace import ( @@ -269,7 +337,7 @@ def _remove_stale_otel_sdk_packages(): # Increment this PATCH version before using `charmcraft publish-lib` or reset # to 0 if you are raising the major API version -LIBPATCH = 2 +LIBPATCH = 4 PYDEPS = ["opentelemetry-exporter-otlp-proto-http==1.21.0"] @@ -277,7 +345,7 @@ def _remove_stale_otel_sdk_packages(): dev_logger = logging.getLogger("tracing-dev") # set this to 0 if you are debugging/developing this library source -dev_logger.setLevel(logging.CRITICAL) +dev_logger.setLevel(logging.ERROR) _CharmType = Type[CharmBase] # the type CharmBase and any subclass thereof _C = TypeVar("_C", bound=_CharmType) @@ -287,6 +355,186 @@ def _remove_stale_otel_sdk_packages(): _GetterType = Union[Callable[[_CharmType], Optional[str]], property] CHARM_TRACING_ENABLED = "CHARM_TRACING_ENABLED" +BUFFER_DEFAULT_CACHE_FILE_NAME = ".charm_tracing_buffer.raw" +# we store the buffer as raw otlp-native protobuf 
(bytes) since it's hard to serialize/deserialize it in +# any portable format. Json dumping is supported, but loading isn't. +# cfr: https://github.com/open-telemetry/opentelemetry-python/issues/1003 + +BUFFER_DEFAULT_CACHE_FILE_SIZE_LIMIT_MiB = 10 +_BUFFER_CACHE_FILE_SIZE_LIMIT_MiB_MIN = 10 +BUFFER_DEFAULT_MAX_EVENT_HISTORY_LENGTH = 100 +_MiB_TO_B = 2**20 # megabyte to byte conversion rate +_OTLP_SPAN_EXPORTER_TIMEOUT = 1 +"""Timeout in seconds that the OTLP span exporter has to push traces to the backend.""" + + +class _Buffer: + """Handles buffering for spans emitted while no tracing backend is configured or available. + + Use the max_event_history_length_buffering param of @trace_charm to tune + the amount of memory that this will hog on your units. + + The buffer is formatted as a bespoke byte dump (protobuf limitation). + We cannot store them as json because that is not well-supported by the sdk + (see https://github.com/open-telemetry/opentelemetry-python/issues/3364). + """ + + _SPANSEP = b"__CHARM_TRACING_BUFFER_SPAN_SEP__" + + def __init__(self, db_file: Path, max_event_history_length: int, max_buffer_size_mib: int): + self._db_file = db_file + self._max_event_history_length = max_event_history_length + self._max_buffer_size_mib = max(max_buffer_size_mib, _BUFFER_CACHE_FILE_SIZE_LIMIT_MiB_MIN) + + # set by caller + self.exporter: Optional[OTLPSpanExporter] = None + + def save(self, spans: typing.Sequence[ReadableSpan]): + """Save the spans collected by this exporter to the cache file. + + This method should be as fail-safe as possible. 
+ """ + if self._max_event_history_length < 1: + dev_logger.debug("buffer disabled: max history length < 1") + return + + current_history_length = len(self.load()) + new_history_length = current_history_length + len(spans) + if (diff := self._max_event_history_length - new_history_length) < 0: + self.drop(diff) + self._save(spans) + + def _serialize(self, spans: Sequence[ReadableSpan]) -> bytes: + # encode because otherwise we can't json-dump them + return encode_spans(spans).SerializeToString() + + def _save(self, spans: Sequence[ReadableSpan], replace: bool = False): + dev_logger.debug(f"saving {len(spans)} new spans to buffer") + old = [] if replace else self.load() + new = self._serialize(spans) + + try: + # if the buffer exceeds the size limit, we start dropping old spans until it does + + while len((new + self._SPANSEP.join(old))) > (self._max_buffer_size_mib * _MiB_TO_B): + if not old: + # if we've already dropped all spans and still we can't get under the + # size limit, we can't save this span + logger.error( + f"span exceeds total buffer size limit ({self._max_buffer_size_mib}MiB); " + f"buffering FAILED" + ) + return + + old = old[1:] + logger.warning( + f"buffer size exceeds {self._max_buffer_size_mib}MiB; dropping older spans... " + f"Please increase the buffer size, disable buffering, or ensure the spans can be flushed." + ) + + self._db_file.write_bytes(new + self._SPANSEP.join(old)) + except Exception: + logger.exception("error buffering spans") + + def load(self) -> List[bytes]: + """Load currently buffered spans from the cache file. + + This method should be as fail-safe as possible. + """ + if not self._db_file.exists(): + dev_logger.debug("buffer file not found. 
buffer empty.") + return [] + try: + spans = self._db_file.read_bytes().split(self._SPANSEP) + except Exception: + logger.exception(f"error parsing {self._db_file}") + return [] + return spans + + def drop(self, n_spans: Optional[int] = None): + """Drop some currently buffered spans from the cache file.""" + current = self.load() + if n_spans: + dev_logger.debug(f"dropping {n_spans} spans from buffer") + new = current[n_spans:] + else: + dev_logger.debug("emptying buffer") + new = [] + + self._db_file.write_bytes(self._SPANSEP.join(new)) + + def flush(self) -> Optional[bool]: + """Export all buffered spans to the given exporter, then clear the buffer. + + Returns whether the flush was successful, and None if there was nothing to flush. + """ + if not self.exporter: + dev_logger.debug("no exporter set; skipping buffer flush") + return False + + buffered_spans = self.load() + if not buffered_spans: + dev_logger.debug("nothing to flush; buffer empty") + return None + + errors = False + for span in buffered_spans: + try: + out = self.exporter._export(span) # type: ignore + if not (200 <= out.status_code < 300): + # take any 2xx status code as a success + errors = True + except ConnectionError: + dev_logger.debug( + "failed exporting buffered span; backend might be down or still starting" + ) + errors = True + except Exception: + logger.exception("unexpected error while flushing span batch from buffer") + errors = True + + if not errors: + self.drop() + else: + logger.error("failed flushing spans; buffer preserved") + return not errors + + @property + def is_empty(self): + """Utility to check whether the buffer has any stored spans. + + This is more efficient than attempting a load() given how large the buffer might be. 
+ """ + return (not self._db_file.exists()) or (self._db_file.stat().st_size == 0) + + +class _OTLPSpanExporter(OTLPSpanExporter): + """Subclass of OTLPSpanExporter to configure the max retry timeout, so that it fails a bit faster.""" + + # The issue we're trying to solve is that the model takes AGES to settle if e.g. tls is misconfigured, + # as every hook of a charm_tracing-instrumented charm takes about a minute to exit, as the charm can't + # flush the traces and keeps retrying for 'too long' + + _MAX_RETRY_TIMEOUT = 4 + # we give the exporter 4 seconds in total to succeed pushing the traces to tempo + # if it fails, we'll be caching the data in the buffer and flush it the next time, so there's no data loss risk. + # this means 2/3 retries (hard to guess from the implementation) and up to ~7 seconds total wait + + +class _BufferedExporter(InMemorySpanExporter): + def __init__(self, buffer: _Buffer) -> None: + super().__init__() + self._buffer = buffer + + def export(self, spans: typing.Sequence[ReadableSpan]) -> SpanExportResult: + self._buffer.save(spans) + return super().export(spans) + + def force_flush(self, timeout_millis: int = 0) -> bool: + # parent implementation is fake, so the timeout_millis arg is not doing anything. 
+ result = super().force_flush(timeout_millis) + self._buffer.save(self.get_finished_spans()) + return result def is_enabled() -> bool: @@ -423,7 +671,10 @@ def _setup_root_span_initializer( charm_type: _CharmType, tracing_endpoint_attr: str, server_cert_attr: Optional[str], - service_name: Optional[str] = None, + service_name: Optional[str], + buffer_path: Optional[Path], + buffer_max_events: int, + buffer_max_size_mib: int, ): """Patch the charm's initializer.""" original_init = charm_type.__init__ @@ -442,18 +693,11 @@ def wrap_init(self: CharmBase, framework: Framework, *args, **kwargs): logger.info("Tracing DISABLED: skipping root span initialization") return - # already init some attrs that will be reinited later by calling original_init: - # self.framework = framework - # self.handle = Handle(None, self.handle_kind, None) - original_event_context = framework._event_context # default service name isn't just app name because it could conflict with the workload service name _service_name = service_name or f"{self.app.name}-charm" unit_name = self.unit.name - # apply hacky patch to remove stale opentelemetry sdk packages on upgrade-charm. - # it could be trouble if someone ever decides to implement their own tracer parallel to - # ours and before the charm has inited. We assume they won't. resource = Resource.create( attributes={ "service.name": _service_name, @@ -471,33 +715,60 @@ def wrap_init(self: CharmBase, framework: Framework, *args, **kwargs): # if anything goes wrong with retrieving the endpoint, we let the exception bubble up. tracing_endpoint = _get_tracing_endpoint(tracing_endpoint_attr, self, charm_type) + buffer_only = False + # whether we're only exporting to buffer, or also to the otlp exporter. 
+ if not tracing_endpoint: # tracing is off if tracing_endpoint is None - return + # however we can buffer things until tracing comes online + buffer_only = True server_cert: Optional[Union[str, Path]] = ( _get_server_cert(server_cert_attr, self, charm_type) if server_cert_attr else None ) - if tracing_endpoint.startswith("https://") and not server_cert: + if (tracing_endpoint and tracing_endpoint.startswith("https://")) and not server_cert: logger.error( "Tracing endpoint is https, but no server_cert has been passed." "Please point @trace_charm to a `server_cert` attr. " "This might also mean that the tracing provider is related to a " "certificates provider, but this application is not (yet). " "In that case, you might just have to wait a bit for the certificates " - "integration to settle. " + "integration to settle. This span will be buffered." ) - return + buffer_only = True - exporter = OTLPSpanExporter( - endpoint=tracing_endpoint, - certificate_file=str(Path(server_cert).absolute()) if server_cert else None, - timeout=2, + buffer = _Buffer( + db_file=buffer_path or Path() / BUFFER_DEFAULT_CACHE_FILE_NAME, + max_event_history_length=buffer_max_events, + max_buffer_size_mib=buffer_max_size_mib, ) + previous_spans_buffered = not buffer.is_empty + + exporters: List[SpanExporter] = [] + if buffer_only: + # we have to buffer because we're missing necessary backend configuration + dev_logger.debug("buffering mode: ON") + exporters.append(_BufferedExporter(buffer)) + + else: + dev_logger.debug("buffering mode: FALLBACK") + # in principle, we have the right configuration to be pushing traces, + # but if we fail for whatever reason, we will put everything in the buffer + # and retry the next time + otlp_exporter = _OTLPSpanExporter( + endpoint=tracing_endpoint, + certificate_file=str(Path(server_cert).absolute()) if server_cert else None, + timeout=_OTLP_SPAN_EXPORTER_TIMEOUT, # give individual requests 1 second to succeed + ) + exporters.append(otlp_exporter) + 
exporters.append(_BufferedExporter(buffer)) + buffer.exporter = otlp_exporter + + for exporter in exporters: + processor = BatchSpanProcessor(exporter) + provider.add_span_processor(processor) - processor = BatchSpanProcessor(exporter) - provider.add_span_processor(processor) set_tracer_provider(provider) _tracer = get_tracer(_service_name) # type: ignore _tracer_token = tracer.set(_tracer) @@ -521,7 +792,7 @@ def wrap_init(self: CharmBase, framework: Framework, *args, **kwargs): @contextmanager def wrap_event_context(event_name: str): - dev_logger.info(f"entering event context: {event_name}") + dev_logger.debug(f"entering event context: {event_name}") # when the framework enters an event context, we create a span. with _span("event: " + event_name) as event_context_span: if event_context_span: @@ -535,12 +806,50 @@ def wrap_event_context(event_name: str): @functools.wraps(original_close) def wrap_close(): - dev_logger.info("tearing down tracer and flushing traces") + dev_logger.debug("tearing down tracer and flushing traces") span.end() opentelemetry.context.detach(span_token) # type: ignore tracer.reset(_tracer_token) tp = cast(TracerProvider, get_tracer_provider()) - tp.force_flush(timeout_millis=1000) # don't block for too long + flush_successful = tp.force_flush(timeout_millis=1000) # don't block for too long + + if buffer_only: + # if we're in buffer_only mode, it means we couldn't even set up the exporter for + # tempo as we're missing some data. + # so attempting to flush the buffer doesn't make sense + dev_logger.debug("tracing backend unavailable: all spans pushed to buffer") + + else: + dev_logger.debug("tracing backend found: attempting to flush buffer...") + + # if we do have an exporter for tempo, and we could send traces to it, + # we can attempt to flush the buffer as well. 
+ if not flush_successful: + logger.error("flushing FAILED: unable to push traces to backend.") + else: + dev_logger.debug("flush succeeded.") + + # the backend has accepted the spans generated during this event, + if not previous_spans_buffered: + # if the buffer was empty to begin with, any spans we collected now can be discarded + buffer.drop() + dev_logger.debug("buffer dropped: this trace has been sent already") + else: + # if the buffer was nonempty, we can attempt to flush it + dev_logger.debug("attempting buffer flush...") + buffer_flush_successful = buffer.flush() + if buffer_flush_successful: + dev_logger.debug("buffer flush OK") + elif buffer_flush_successful is None: + # TODO is this even possible? + dev_logger.debug("buffer flush OK; empty: nothing to flush") + else: + # this situation is pretty weird, I'm not even sure it can happen, + # because it would mean that we did manage + # to push traces directly to the tempo exporter (flush_successful), + # but the buffer flush failed to push to the same exporter! + logger.error("buffer flush FAILED") + tp.shutdown() original_close() @@ -555,6 +864,9 @@ def trace_charm( server_cert: Optional[str] = None, service_name: Optional[str] = None, extra_types: Sequence[type] = (), + buffer_max_events: int = BUFFER_DEFAULT_MAX_EVENT_HISTORY_LENGTH, + buffer_max_size_mib: int = BUFFER_DEFAULT_CACHE_FILE_SIZE_LIMIT_MiB, + buffer_path: Optional[Union[str, Path]] = None, ) -> Callable[[_T], _T]: """Autoinstrument the decorated charm with tracing telemetry. @@ -596,6 +908,10 @@ def trace_charm( Defaults to the juju application name this charm is deployed under. :param extra_types: pass any number of types that you also wish to autoinstrument. For example, charm libs, relation endpoint wrappers, workload abstractions, ... + :param buffer_max_events: max number of events to save in the buffer. Set to 0 to disable buffering. + :param buffer_max_size_mib: max size of the buffer file. When exceeded, spans will be dropped. 
+ Minimum 10MiB. + :param buffer_path: path to buffer file to use for saving buffered spans. """ def _decorator(charm_type: _T) -> _T: @@ -606,6 +922,9 @@ def _decorator(charm_type: _T) -> _T: server_cert_attr=server_cert, service_name=service_name, extra_types=extra_types, + buffer_path=Path(buffer_path) if buffer_path else None, + buffer_max_size_mib=buffer_max_size_mib, + buffer_max_events=buffer_max_events, ) return charm_type @@ -618,6 +937,9 @@ def _autoinstrument( server_cert_attr: Optional[str] = None, service_name: Optional[str] = None, extra_types: Sequence[type] = (), + buffer_max_events: int = BUFFER_DEFAULT_MAX_EVENT_HISTORY_LENGTH, + buffer_max_size_mib: int = BUFFER_DEFAULT_CACHE_FILE_SIZE_LIMIT_MiB, + buffer_path: Optional[Path] = None, ) -> _T: """Set up tracing on this charm class. @@ -650,13 +972,20 @@ def _autoinstrument( Defaults to the juju application name this charm is deployed under. :param extra_types: pass any number of types that you also wish to autoinstrument. For example, charm libs, relation endpoint wrappers, workload abstractions, ... + :param buffer_max_events: max number of events to save in the buffer. Set to 0 to disable buffering. + :param buffer_max_size_mib: max size of the buffer file. When exceeded, spans will be dropped. + Minimum 10MiB. + :param buffer_path: path to buffer file to use for saving buffered spans. """ - dev_logger.info(f"instrumenting {charm_type}") + dev_logger.debug(f"instrumenting {charm_type}") _setup_root_span_initializer( charm_type, tracing_endpoint_attr, server_cert_attr=server_cert_attr, service_name=service_name, + buffer_path=buffer_path, + buffer_max_events=buffer_max_events, + buffer_max_size_mib=buffer_max_size_mib, ) trace_type(charm_type) for type_ in extra_types: @@ -672,12 +1001,12 @@ def trace_type(cls: _T) -> _T: It assumes that this class is only instantiated after a charm type decorated with `@trace_charm` has been instantiated. 
""" - dev_logger.info(f"instrumenting {cls}") + dev_logger.debug(f"instrumenting {cls}") for name, method in inspect.getmembers(cls, predicate=inspect.isfunction): - dev_logger.info(f"discovered {method}") + dev_logger.debug(f"discovered {method}") if method.__name__.startswith("__"): - dev_logger.info(f"skipping {method} (dunder)") + dev_logger.debug(f"skipping {method} (dunder)") continue # the span title in the general case should be: @@ -723,7 +1052,7 @@ def trace_function(function: _F, name: Optional[str] = None) -> _F: def _trace_callable(callable: _F, qualifier: str, name: Optional[str] = None) -> _F: - dev_logger.info(f"instrumenting {callable}") + dev_logger.debug(f"instrumenting {callable}") # sig = inspect.signature(callable) @functools.wraps(callable)