From b30b7d35bc71dd3c336e4b85d9f7cd8b49570fcc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lu=C3=ADs=20Duarte?= Date: Tue, 23 Jul 2024 23:54:09 +0100 Subject: [PATCH 1/3] INCIDENT-001: postgres doesn't have a Point of Recoverability so it might fail when there are no replicas left Also upgrade cnpg version while I'm at it. We now back up to an R2 bucket that holds the postgres backups. --- .../postgresql/cnpg-backup-secrets.yaml | 10 ++++++ .../databases/postgresql/cnpg-cluster.yaml | 33 ++++++++++++++++++- .../databases/postgresql/cnpg-secrets.yaml | 4 +++ .../databases/postgresql/deploy-cnpg-dev.sh | 3 +- .../databases/postgresql/deploy-cnpg-prod.sh | 5 ++- .../longhorn-strict-local-no-backup.yaml | 17 ++++++++++ 6 files changed, 69 insertions(+), 3 deletions(-) create mode 100644 services/databases/postgresql/cnpg-backup-secrets.yaml create mode 100644 services/storage/longhorn/storageClasses/longhorn-strict-local-no-backup.yaml diff --git a/services/databases/postgresql/cnpg-backup-secrets.yaml b/services/databases/postgresql/cnpg-backup-secrets.yaml new file mode 100644 index 0000000..7bfaa41 --- /dev/null +++ b/services/databases/postgresql/cnpg-backup-secrets.yaml @@ -0,0 +1,10 @@ +--- +apiVersion: v1 +kind: Secret +metadata: + name: cnpg-backup-secret + namespace: pg +type: Opaque +stringData: + ACCESS_KEY_ID: + ACCESS_SECRET_KEY: diff --git a/services/databases/postgresql/cnpg-cluster.yaml b/services/databases/postgresql/cnpg-cluster.yaml index 35c62c4..83b4c36 100644 --- a/services/databases/postgresql/cnpg-cluster.yaml +++ b/services/databases/postgresql/cnpg-cluster.yaml @@ -3,6 +3,7 @@ apiVersion: postgresql.cnpg.io/v1 kind: Cluster metadata: name: cnpg-cluster + namespace: pg spec: instances: 3 @@ -36,4 +37,34 @@ spec: storage: size: 10Gi - storageClass: longhorn-strict-local-retain + #backups are handled by cloudnative postgres + storageClass: longhorn-strict-local-no-backup + + backup: + barmanObjectStore: + destinationPath: s3://niployments-postgres-backup/ + 
endpointURL: https://52d22ed664e31a094229250acd87ccfb.eu.r2.cloudflarestorage.com + s3Credentials: + accessKeyId: + name: cnpg-backup-secret + key: ACCESS_KEY_ID + secretAccessKey: + name: cnpg-backup-secret + key: ACCESS_SECRET_KEY + wal: + compression: gzip + retentionPolicy: "15d" +--- +apiVersion: postgresql.cnpg.io/v1 +kind: ScheduledBackup +metadata: + name: cluster-backup-object-store + namespace: pg +spec: + cluster: + name: cnpg-cluster + method: barmanObjectStore + # Run at midnight on Sundays, Tuesdays and Thursdays (six-field cron: the first field is seconds) + schedule: '0 0 0 * * 0,2,4' + backupOwnerReference: cluster + immediate: true diff --git a/services/databases/postgresql/cnpg-secrets.yaml b/services/databases/postgresql/cnpg-secrets.yaml index e649291..b252df4 100644 --- a/services/databases/postgresql/cnpg-secrets.yaml +++ b/services/databases/postgresql/cnpg-secrets.yaml @@ -6,6 +6,7 @@ stringData: kind: Secret metadata: name: tts-secret + namespace: pg type: kubernetes.io/basic-auth --- apiVersion: v1 @@ -15,6 +16,7 @@ stringData: kind: Secret metadata: name: ni-secret + namespace: pg type: kubernetes.io/basic-auth --- apiVersion: v1 @@ -24,6 +26,7 @@ stringData: kind: Secret metadata: name: plausible-secret + namespace: pg type: kubernetes.io/basic-auth --- apiVersion: v1 @@ -33,4 +36,5 @@ stringData: kind: Secret metadata: name: sinf-website-2023-secret + namespace: pg type: kubernetes.io/basic-auth \ No newline at end of file diff --git a/services/databases/postgresql/deploy-cnpg-dev.sh b/services/databases/postgresql/deploy-cnpg-dev.sh index cd83d8c..b8d685a 100755 --- a/services/databases/postgresql/deploy-cnpg-dev.sh +++ b/services/databases/postgresql/deploy-cnpg-dev.sh @@ -7,10 +7,11 @@ port=5432 # Define the desired port here cnpg_dir='./services/databases/postgresql' pods=$(cat $cnpg_dir/cnpg-cluster.yaml | awk '{if ($1 == "instances:") print $2}') -kubectl apply --server-side -f https://raw.githubusercontent.com/cloudnative-pg/cloudnative-pg/release-1.22/releases/cnpg-1.22.2.yaml 
+kubectl apply --server-side --force-conflicts -f https://raw.githubusercontent.com/cloudnative-pg/cloudnative-pg/release-1.23/releases/cnpg-1.23.2.yaml kubectl wait --for=condition=available=true -n cnpg-system deployment/cnpg-controller-manager --timeout=120s kubectl create namespace pg +kubectl apply -f $(dirname $0)/cnpg-backup-secrets.yaml -n pg kubectl apply -f $(dirname $0)/cnpg-secrets.yaml -n pg kubectl apply -f $(dirname $0)/cnpg-cluster.yaml -n pg sleep 5 # Wait a little bit for first pod to be created diff --git a/services/databases/postgresql/deploy-cnpg-prod.sh b/services/databases/postgresql/deploy-cnpg-prod.sh index 5e0b1a1..a3beb46 100755 --- a/services/databases/postgresql/deploy-cnpg-prod.sh +++ b/services/databases/postgresql/deploy-cnpg-prod.sh @@ -4,10 +4,13 @@ pods=$(cat $(dirname $0)/cnpg-cluster.yaml | awk '{if ($1 == "instances:") print $2}') -kubectl apply --server-side -f https://raw.githubusercontent.com/cloudnative-pg/cloudnative-pg/release-1.22/releases/cnpg-1.22.2.yaml +# NOTE(luisd): https://cloudnative-pg.io/documentation/1.23/installation_upgrade/#server-side-apply-of-manifests +# they recommend --force-conflicts because conflict errors might happen when upgrading the controller +kubectl apply --server-side --force-conflicts -f https://raw.githubusercontent.com/cloudnative-pg/cloudnative-pg/release-1.23/releases/cnpg-1.23.2.yaml kubectl wait --for=condition=available=true -n cnpg-system deployment/cnpg-controller-manager --timeout=120s kubectl create namespace pg +kubectl apply -f $(dirname $0)/cnpg-backup-secrets.yaml -n pg kubectl apply -f $(dirname $0)/cnpg-secrets.yaml -n pg kubectl apply -f $(dirname $0)/cnpg-cluster.yaml -n pg sleep 5 # Wait a little bit for first pod to be created diff --git a/services/storage/longhorn/storageClasses/longhorn-strict-local-no-backup.yaml b/services/storage/longhorn/storageClasses/longhorn-strict-local-no-backup.yaml new file mode 100644 index 0000000..c4cb8e0 --- /dev/null +++ 
b/services/storage/longhorn/storageClasses/longhorn-strict-local-no-backup.yaml @@ -0,0 +1,17 @@ +kind: StorageClass +apiVersion: storage.k8s.io/v1 +metadata: + name: longhorn-strict-local-no-backup +provisioner: driver.longhorn.io +allowVolumeExpansion: true +reclaimPolicy: "Delete" +volumeBindingMode: Immediate +parameters: + numberOfReplicas: "1" + staleReplicaTimeout: "720" + fromBackup: "" + fsType: "ext4" + dataLocality: "strict-local" + replicaAutoBalance: "ignored" +# diskSelector: "ssd,fast" +# nodeSelector: "storage,fast" From 23b525af42655c34af87927f9dd06855f9ae72dc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lu=C3=ADs=20Duarte?= Date: Sun, 28 Jul 2024 10:48:07 +0100 Subject: [PATCH 2/3] double size of volumes due to WALs --- services/databases/postgresql/cnpg-cluster.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/databases/postgresql/cnpg-cluster.yaml b/services/databases/postgresql/cnpg-cluster.yaml index 83b4c36..0cd43c8 100644 --- a/services/databases/postgresql/cnpg-cluster.yaml +++ b/services/databases/postgresql/cnpg-cluster.yaml @@ -36,7 +36,7 @@ spec: name: sinf-website-2023-secret storage: - size: 10Gi + size: 20Gi #backups are handled by cloudnative postgres storageClass: longhorn-strict-local-no-backup From 99543ef9484b01ec4fd0c7d880dffdecb0d59da5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lu=C3=ADs=20Duarte?= Date: Thu, 8 Aug 2024 22:53:13 +0100 Subject: [PATCH 3/3] Add max wal keep size --- services/databases/postgresql/cnpg-cluster.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/services/databases/postgresql/cnpg-cluster.yaml b/services/databases/postgresql/cnpg-cluster.yaml index 0cd43c8..dc74c20 100644 --- a/services/databases/postgresql/cnpg-cluster.yaml +++ b/services/databases/postgresql/cnpg-cluster.yaml @@ -40,6 +40,10 @@ spec: #backups are handled by cloudnative postgres storageClass: longhorn-strict-local-no-backup + postgresql: + parameters: + max_slot_wal_keep_size: "10GB" + backup: 
barmanObjectStore: destinationPath: s3://niployments-postgres-backup/