From 40784389618dccbf8f7f4c56f19c9b4c74d4759f Mon Sep 17 00:00:00 2001 From: "D. Ror." Date: Mon, 21 Oct 2024 16:47:13 -0400 Subject: [PATCH] [deploy] Fix helm config and maintenance scripts (#3404) Update deploy_qa.yml workflow's step for removing old images Restore pullSecretName in deploy/helm/thecombine/values.yaml Remove one-shot maintenance script db_update_audio_type.py Restore python3 shebang in maintenance scripts Update kubernetes documentation, including prep for OpenTelemetry --- .github/workflows/deploy_qa.yml | 16 +++-- .gitignore | 2 +- .../templates/job-create-admin-user.yaml | 4 +- deploy/helm/thecombine/values.yaml | 2 +- deploy/scripts/helm_utils.py | 2 +- deploy/scripts/setup_combine.py | 2 + docs/deploy/README.md | 44 ++++++++++---- docs/deploy/kubernetes_design/README.md | 14 +++-- maintenance/scripts/add_user_to_proj.py | 2 +- maintenance/scripts/combine_backup.py | 2 +- maintenance/scripts/combine_restore.py | 2 +- maintenance/scripts/db_update_audio_type.py | 59 ------------------- maintenance/scripts/get_fonts.py | 2 +- maintenance/scripts/monitor.py | 2 +- maintenance/scripts/rm_project.py | 2 +- maintenance/scripts/update_cert.py | 2 +- 16 files changed, 67 insertions(+), 92 deletions(-) delete mode 100755 maintenance/scripts/db_update_audio_type.py diff --git a/.github/workflows/deploy_qa.yml b/.github/workflows/deploy_qa.yml index 30cc2b46df..e1b7bcce2f 100644 --- a/.github/workflows/deploy_qa.yml +++ b/.github/workflows/deploy_qa.yml @@ -64,9 +64,6 @@ jobs: build_component: ${{ matrix.component }} clean_ecr_repo: needs: build - env: - RM_PATTERN_1: \d+\.\d+\.\d+-master\.\d+ - RM_PATTERN_2: \d+\.\d+\.\d+-[a-z]+\.\d+-master\.\d+ runs-on: ubuntu-latest steps: # See https://docs.stepsecurity.io/harden-runner/getting-started/ for instructions on @@ -89,7 +86,18 @@ jobs: aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} aws-region: ${{ secrets.AWS_DEFAULT_REGION }} - name: Remove old AWS ECR images - run: scripts/clean_aws_repo.py 
combine_frontend combine_backend combine_maint combine_database --keep ${{ needs.build.outputs.image_tag }} --remove "${{ env.RM_PATTERN_1 }}" "${{ env.RM_PATTERN_2 }}" --verbose + # Remove all images for previous version numbers. + # Example: for tag beginning with v1.2.5-, remove all images with tag v1.2.4-* + # Example: for tag beginning with v2.4.0-, remove all images with tag v2.3.* + run: | + TAG=${{ needs.build.outputs.image_tag }} + if [[ $TAG =~ ^v([0-9]+)\.([0-9]+)\.([0-9]+)-.* ]]; then + VA=${BASH_REMATCH[1]}; VB=${BASH_REMATCH[2]}; VC=${BASH_REMATCH[3]} + if [[ $VC > 0 ]]; then REM="v${VA}\.${VB}\.$((VC - 1))-.*" + elif [[ $VB > 0 ]]; then REM="v${VA}\.$((VB - 1))\..*" + else REM="v$((VA - 1))\..*"; fi + scripts/clean_aws_repo.py combine_frontend combine_backend combine_maint combine_database --remove "${REM}" --verbose + fi deploy_update: needs: build # Only push to the QA server when built on the master branch diff --git a/.gitignore b/.gitignore index 835a807bf8..6eeb9a3afc 100644 --- a/.gitignore +++ b/.gitignore @@ -33,7 +33,7 @@ scripts/*.js !scripts/createBackendLicenses.js !scripts/printVersion.js !scripts/setRelease.js - +setup_cluster.log npm-debug.log* yarn-debug.log* yarn-error.log* diff --git a/deploy/helm/create-admin-user/templates/job-create-admin-user.yaml b/deploy/helm/create-admin-user/templates/job-create-admin-user.yaml index 0022f1df25..7c60ca5e2c 100644 --- a/deploy/helm/create-admin-user/templates/job-create-admin-user.yaml +++ b/deploy/helm/create-admin-user/templates/job-create-admin-user.yaml @@ -9,7 +9,9 @@ metadata: "helm.sh/hook": post-install, post-upgrade "helm.sh/hook-delete-policy": before-hook-creation spec: - ttlSecondsAfterFinished: 300 + # keep completed jobs for 24 hrs so that logs are + # available in case of issues + ttlSecondsAfterFinished: 86400 template: metadata: creationTimestamp: null diff --git a/deploy/helm/thecombine/values.yaml b/deploy/helm/thecombine/values.yaml index 7f4ad5b282..99638a66d0 100644 --- 
a/deploy/helm/thecombine/values.yaml +++ b/deploy/helm/thecombine/values.yaml @@ -37,7 +37,7 @@ global: # Define the image registry to use (may be blank for local images) imageRegistry: awsEcr imageTag: "latest" - pullSecretName: "None" + pullSecretName: aws-login-credentials # Update strategy should be "Recreate" or "Rolling Update" updateStrategy: Recreate diff --git a/deploy/scripts/helm_utils.py b/deploy/scripts/helm_utils.py index 8241901691..8da82a4854 100644 --- a/deploy/scripts/helm_utils.py +++ b/deploy/scripts/helm_utils.py @@ -45,7 +45,7 @@ def create_secrets( def get_installed_charts(helm_cmd: List[str], helm_namespace: str) -> List[str]: """Create a list of the helm charts that are already installed on the target.""" - lookup_results = run_cmd(helm_cmd + ["list", "-n", helm_namespace, "-o", "yaml"]) + lookup_results = run_cmd(helm_cmd + ["list", "-a", "-n", helm_namespace, "-o", "yaml"]) chart_info: List[Dict[str, str]] = yaml.safe_load(lookup_results.stdout) chart_list: List[str] = [] for chart in chart_info: diff --git a/deploy/scripts/setup_combine.py b/deploy/scripts/setup_combine.py index e871486d50..f51bf63f8d 100755 --- a/deploy/scripts/setup_combine.py +++ b/deploy/scripts/setup_combine.py @@ -176,8 +176,10 @@ def main() -> None: chart_namespace = config["charts"][chart]["namespace"] logging.debug(f"Namespace: {chart_namespace}") if add_namespace(chart_namespace, kube_env.get_kubectl_cmd()): + logging.debug(f"Namespace '{chart_namespace}' created") installed_charts: List[str] = [] else: + logging.debug(f"Namespace '{chart_namespace}' already exists") # Get list of charts in target namespace installed_charts = get_installed_charts(helm_cmd, chart_namespace) logging.debug(f"Installed charts: {installed_charts}") diff --git a/docs/deploy/README.md b/docs/deploy/README.md index ccbc089065..d21356c2a4 100644 --- a/docs/deploy/README.md +++ b/docs/deploy/README.md @@ -62,6 +62,7 @@ separate organization. 
The characteristics of these systems are: - the namespace `thecombine` is created - the TLS certificate for the server is installed in `thecombine` namespace as a `kubernetes.io/tls` secret with the name `thecombine-app-tls` + - PersistentVolumeClaims for `backend-data`, `database-data`, and `font-data` - The QA server has services to login to a private AWS Elastic Container Registry to run private images for _The Combine_. In contrast, the Production server only runs public images. @@ -284,10 +285,25 @@ setup automatically by the Ansible playbook run in the previous section. For the Production or QA server, -1. login to the Rancher Dashboard for the Production (or QA) server. You need to have an account on the server that was +1. Login to the Rancher Dashboard for the Production (or QA) server. You need to have an account on the server that was created by the operations group. 2. Copy your `kubectl` configuration to the clipboard and paste it into a file on your host machine, e.g. `${HOME}/.kube/prod/config` for the production server. +3. Check that the PVCs are annotated and labeled: + - Get the full list of `<pvc>`s with `kubectl [--context <context>] -n thecombine get pvc` + - Check the content of a `<pvc>` with `kubectl [--context <context>] -n thecombine get pvc <pvc> -o yaml` + - For all of them, make sure that `metadata:` includes the following lines: + ``` + annotations: + meta.helm.sh/release-name: thecombine + meta.helm.sh/release-namespace: thecombine + ``` + and + ``` + labels: + app.kubernetes.io/managed-by: Helm + ``` + - You can edit a `<pvc>` with `kubectl [--context <context>] -n thecombine edit pvc <pvc>` ### Setup Environment @@ -308,6 +324,7 @@ deployments (NUC): - COMBINE_CAPTCHA_SECRET_KEY - COMBINE_SMTP_USERNAME - COMBINE_SMTP_PASSWORD +- HONEYCOMB_API_KEY You may also set the KUBECONFIG environment variable to the location of the `kubectl` configuration file. This is not necessary if the configuration file is at `${HOME}/.kube/config`.
@@ -343,7 +360,8 @@ If using the Docker image, ## Install Helm Charts Required by _The Combine_ -This step sets up the NGINX Ingress Controller and the Certificate Manager, [cert-manager.io](https://cert-manager.io/). +This step sets up the NGINX Ingress Controller, the Certificate Manager ([cert-manager.io](https://cert-manager.io/)), +and the OpenTelemetry analytics collector. If using the Docker image, [open the Docker image terminal](#open-docker-image-terminal) and run: @@ -358,6 +376,10 @@ cd /deploy/scripts ./setup_cluster.py ``` +Note: This script is not used for the QA/Production deployments. If you need to do a completely fresh install for either +of those, you can see all the cluster setup steps by executing `setup_cluster.py` with +`--type development --debug 2> setup_cluster.log`. + ## Install _The Combine_ This step installs _The Combine_ application itself. @@ -397,19 +419,13 @@ Notes: ### Maintenance Scripts for Kubernetes -There are several maintenance scripts that can be run in the kubernetes cluster: - -- `combine-backup-job.sh` - performs a backup of _The Combine_ database and backend files, pushes the backup to AWS S3 - storage and then removes old backups keeping the latest 3 backups. -- `combine_backup.py` - just performs the backup and pushes the result to AWS S3 storage. -- `combine-clean-aws.py` - removes the oldest backups, keeping up to `max_backups`. The default for `max_backups` is 3. -- `combine_restore.py` - restores _The Combine_ database and backend files from one of the backups in AWS S3 storage. +There are several maintenance scripts that can be run in the kubernetes cluster; they are listed in +[./kubernetes_design/README.md#combine_maint-image](./kubernetes_design/README.md#combine_maint-image). The `combine-backup-job.sh` is currently being run daily on _The Combine_ QA and Production servers as a Kubernetes CronJob. -In addition to the daily backup, any of the scripts can be run on-demand using the `kubectl` command. 
Using the `kubectl` command takes the form: +In addition to the daily backup, any of the scripts can be run on-demand using the `kubectl` command as follows: ```bash kubectl [--kubeconfig=<path-to-kubernetes-file>] [-n thecombine] exec -it deployment/maintenance --