From b24d62128ad35cd5be4f7b19b3bfe6b68ce67224 Mon Sep 17 00:00:00 2001 From: t4lz Date: Sat, 17 Aug 2024 13:14:54 +0300 Subject: [PATCH] Add CRDs and permissions for SQS (#84) Co-authored-by: Dmitry Dodzin --- .gitignore | 2 + mirrord-operator/Chart.yaml | 4 +- mirrord-operator/README.md | 30 +- mirrord-operator/templates/cluster-role.yaml | 47 +++ mirrord-operator/templates/crd.yaml | 354 ++++++++++++++++++ mirrord-operator/templates/deployment.yaml | 2 + .../templates/service-account.yaml | 4 + mirrord-operator/values.yaml | 5 + 8 files changed, 443 insertions(+), 5 deletions(-) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..13ed353 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +license.pem + diff --git a/mirrord-operator/Chart.yaml b/mirrord-operator/Chart.yaml index f89b6e6..4233b4d 100644 --- a/mirrord-operator/Chart.yaml +++ b/mirrord-operator/Chart.yaml @@ -15,10 +15,10 @@ type: application # This is the chart version. This version number should be incremented each time you make changes # to the chart and its templates, including the app version. # Versions are expected to follow Semantic Versioning (https://semver.org/) -version: 1.6.5 +version: 1.6.6 # This is the version number of the application being deployed. This version number should be # incremented each time you make changes to the application. Versions are not expected to # follow Semantic Versioning. They should reflect the version the application is using. # It is recommended to use it with quotes. 
-appVersion: "3.91.0" +appVersion: "3.92.0" diff --git a/mirrord-operator/README.md b/mirrord-operator/README.md index 05e3051..9ed1f81 100644 --- a/mirrord-operator/README.md +++ b/mirrord-operator/README.md @@ -8,15 +8,39 @@ If you have a license key (usually obtained from https://app.metalbear.co) you c * Or you can create a secret with key `OPERATOR_LICENSE_KEY` and set the given key as value, then use `license.keyRef` to reference that secret. If you have a certificate license (usually part of Enterprise offering) you can: -* Add license file to `license.file.secret.data.license.pem` in `values.yaml` +* Add the contents of your license file to `license.file.secret.data.license.pem` in `values.yaml` * Or you can create a secret with the following format: ```yaml apiVersion: v1 kind: Secret metadata: - name: secret - namespace: mirrord + name: mirrord-operator-license-pem + namespace: mirrord stringData: license.pem: LICENSE_CONTENT ``` then reference it using `license.pemRef` in `values.yaml` + + +### SQS queue splitting + +#### IAM Role for the operator's service account + +For mirrord's SQS queue splitting feature, the operator has to be able to create, read from, write to, and delete SQS queues. +If the queue messages are encrypted, the operator also needs the `kms:Encrypt`, `kms:Decrypt` and `kms:GenerateDataKey` permissions. + +For that, an IAM role with an appropriate policy has to be assigned to the operator's service account. +Follow AWS's documentation on how to do that: + +https://docs.aws.amazon.com/eks/latest/userguide/associate-service-account-role.html + +Pass the ARN of the role in `sa.roleArn` in `values.yaml` or via `--set sa.roleArn=arn:aws:iam::$account_id:role/mirrord-operator-role`. + +#### Permissions for target workloads + +In order to be targeted with SQS queue splitting, a workload has to be able to read from queues that are created by mirrord. 
+Any temporary queues created by mirrord are created with the same policy as the original queues they are splitting (with the single change of the queue name in the policy), so if a queue has a policy that allows the target workload to call `ReceiveMessage` on it, that is enough. +However, if the workload gets its access to the queue by an IAM policy (and not an SQS policy, see [SQS docs](https://docs.aws.amazon.com/AWSSimpleQueueService/latest/SQSDeveloperGuide/sqs-using-identity-based-policies.html#sqs-using-sqs-and-iam-policies)) that grants access to that specific queue by its exact name, you would have to add a policy that would allow that workload to also read from new temporary queues created by mirrord on the fly. + + +> **Note:** the names of all queues created and deleted by mirrord begin with "mirrord-". diff --git a/mirrord-operator/templates/cluster-role.yaml b/mirrord-operator/templates/cluster-role.yaml index ecbb2b0..6ea78a2 100644 --- a/mirrord-operator/templates/cluster-role.yaml +++ b/mirrord-operator/templates/cluster-role.yaml @@ -27,6 +27,21 @@ rules: - get - list - watch +{{- if .Values.operator.sqsSplitting }} +# For patching target workloads to use a different queue. 
+- apiGroups: + - apps + resources: + - deployments + verbs: + - patch +- apiGroups: + - argoproj.io + resources: + - rollouts + verbs: + - patch +{{- end }} - apiGroups: - "" - batch @@ -76,6 +91,38 @@ rules: verbs: - list - get +{{- if .Values.operator.sqsSplitting }} +- apiGroups: + - queues.mirrord.metalbear.co + resources: + - mirrordworkloadqueueregistries + verbs: + - list +- apiGroups: + - queues.mirrord.metalbear.co + resources: + - mirrordworkloadqueueregistries/status + verbs: + - update +- apiGroups: + - queues.mirrord.metalbear.co + resources: + - mirrordsqssessions + verbs: + - create + - watch + - list + - get + - delete + - deletecollection + - patch +- apiGroups: + - queues.mirrord.metalbear.co + resources: + - mirrordsqssessions/status + verbs: + - update +{{- end }} --- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole diff --git a/mirrord-operator/templates/crd.yaml b/mirrord-operator/templates/crd.yaml index 7ffd57e..5662f0b 100644 --- a/mirrord-operator/templates/crd.yaml +++ b/mirrord-operator/templates/crd.yaml @@ -83,4 +83,358 @@ spec: served: true storage: true subresources: {} +{{ if .Values.operator.sqsSplitting }} +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + name: mirrordworkloadqueueregistries.queues.mirrord.metalbear.co +spec: + group: queues.mirrord.metalbear.co + names: + categories: [] + kind: MirrordWorkloadQueueRegistry + plural: mirrordworkloadqueueregistries + shortNames: + - qs + singular: mirrordworkloadqueueregistry + scope: Namespaced + versions: + - additionalPrinterColumns: [] + name: v1alpha + schema: + openAPIV3Schema: + description: Auto-generated derived type for MirrordWorkloadQueueRegistrySpec via `CustomResource` + properties: + spec: + description: |- + Defines a Custom Resource that holds a central configuration for splitting queues for a QueueConsumer (a target workload for which queues should be split). 
+ This means there should be 1 such resource per queue splitting target. + properties: + consumer: + description: The resource (deployment or Argo rollout) that reads from the queues. + properties: + container: + description: If a container is not specified, the workload queue registry will apply to every run that targets any of the workload's containers. + nullable: true + type: string + name: + type: string + workload_type: + description: A workload that is a consumer of a queue that is being split. + enum: + - Deployment + - Rollout + type: string + required: + - name + - workload_type + type: object + queues: + additionalProperties: + description: The details of a queue that should be split. + oneOf: + - required: + - nameSource + - queueType + properties: + nameSource: + description: Where the application gets the queue name from. Will be used to read messages from that queue and distribute them to the output queues. When running with mirrord and splitting this queue, applications will get a modified name from that source. + oneOf: + - required: + - envVar + properties: + envVar: + type: string + type: object + queueType: + enum: + - SQS + type: string + tags: + additionalProperties: + type: string + description: These tags will be set for all temporary SQS queues created by mirrord for queues defined in this MirrordWorkloadQueueRegistry, alongside with the original tags of the respective original queue. In case of a collision, the temporary queue will get the value from the tag passed in here. + nullable: true + type: object + type: object + description: A map of the queues that should be split. The key is used by users to associate filters to the right queues. + type: object + required: + - consumer + - queues + type: object + status: + nullable: true + properties: + sqsDetails: + description: Optional even though it's currently the only field, because in the future there will be fields for other queue types. 
+ nullable: true + properties: + direct_env_vars: + additionalProperties: + type: string + description: Names of env vars that contain the queue name directly in the pod template, without config map refs, mapped to their queue id. + type: object + env_updates: + additionalProperties: + properties: + original_name: + type: string + output_name: + type: string + required: + - original_name + - output_name + type: object + type: object + queue_names: + additionalProperties: + properties: + original_name: + type: string + output_name: + type: string + required: + - original_name + - output_name + type: object + description: For each queue_id, the actual queue name as retrieved from the target's pod spec or config map, together with the name of its temporary output queue. + type: object + required: + - direct_env_vars + - env_updates + - queue_names + type: object + type: object + required: + - spec + title: MirrordWorkloadQueueRegistry + type: object + served: true + storage: true + subresources: + status: {} +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + name: mirrordsqssessions.queues.mirrord.metalbear.co +spec: + group: queues.mirrord.metalbear.co + names: + categories: [] + kind: MirrordSQSSession + plural: mirrordsqssessions + shortNames: [] + singular: mirrordsqssession + scope: Namespaced + versions: + - additionalPrinterColumns: [] + name: v1alpha + schema: + openAPIV3Schema: + description: Auto-generated derived type for MirrordSqsSessionSpec via `CustomResource` + properties: + spec: + description: The operator creates this object when a user runs mirrord against a target that is a queue consumer. + properties: + queueConsumer: + description: The target of this session. + properties: + container: + description: If a container is not specified, the workload queue registry will apply to every run that targets any of the workload's containers. 
+ nullable: true + type: string + name: + type: string + workload_type: + description: A workload that is a consumer of a queue that is being split. + enum: + - Deployment + - Rollout + type: string + required: + - name + - workload_type + type: object + queueFilters: + additionalProperties: + additionalProperties: + type: string + type: object + description: For each queue_id, a mapping from attribute name, to attribute value regex. The queue_id for a queue is determined at the queue registry. It is not (necessarily) The name of the queue on AWS. + type: object + sessionId: + description: The id of the mirrord exec session, from the operator. + type: string + required: + - queueConsumer + - queueFilters + - sessionId + type: object + status: + nullable: true + oneOf: + - required: + - Starting + - required: + - RegisteringFilters + - required: + - Ready + - required: + - StartError + - required: + - CleanupError + properties: + CleanupError: + properties: + details: + nullable: true + properties: + envUpdates: + additionalProperties: + properties: + original_name: + type: string + output_name: + type: string + required: + - original_name + - output_name + type: object + description: Env var name -> old and new queue names. + type: object + queueNames: + additionalProperties: + properties: + original_name: + type: string + output_name: + type: string + required: + - original_name + - output_name + type: object + description: Queue ID -> old and new queue names. + type: object + required: + - envUpdates + - queueNames + type: object + error: + description: Representation of Sqs errors for the status of SQS session resources. + properties: + reason: + description: Human-readable explanation of what went wrong. + type: string + statusCode: + description: HTTP code for operator response. 
+ format: uint16 + minimum: 0.0 + type: integer + required: + - reason + - statusCode + type: object + required: + - error + type: object + Ready: + properties: + envUpdates: + additionalProperties: + properties: + original_name: + type: string + output_name: + type: string + required: + - original_name + - output_name + type: object + description: Env var name -> old and new queue names. + type: object + queueNames: + additionalProperties: + properties: + original_name: + type: string + output_name: + type: string + required: + - original_name + - output_name + type: object + description: Queue ID -> old and new queue names. + type: object + required: + - envUpdates + - queueNames + type: object + RegisteringFilters: + description: SQS operator sets this status before it starts registering filters, so that if anything fails during the registration of filters, we have all the queues we need to delete on cleanup. + properties: + envUpdates: + additionalProperties: + properties: + original_name: + type: string + output_name: + type: string + required: + - original_name + - output_name + type: object + description: Env var name -> old and new queue names. + type: object + queueNames: + additionalProperties: + properties: + original_name: + type: string + output_name: + type: string + required: + - original_name + - output_name + type: object + description: Queue ID -> old and new queue names. + type: object + required: + - envUpdates + - queueNames + type: object + StartError: + description: Representation of Sqs errors for the status of SQS session resources. + properties: + reason: + description: Human-readable explanation of what went wrong. + type: string + statusCode: + description: HTTP code for operator response. 
+ format: uint16 + minimum: 0.0 + type: integer + required: + - reason + - statusCode + type: object + Starting: + properties: + start_time_utc: + type: string + required: + - start_time_utc + type: object + type: object + required: + - spec + title: MirrordSqsSession + type: object + served: true + storage: true + subresources: + status: {} +{{ end }} diff --git a/mirrord-operator/templates/deployment.yaml b/mirrord-operator/templates/deployment.yaml index 0fedbf5..2654041 100644 --- a/mirrord-operator/templates/deployment.yaml +++ b/mirrord-operator/templates/deployment.yaml @@ -53,6 +53,8 @@ spec: - name: OPERATOR_MAX_SESSION_TIME_SECONDS value: {{ .Values.operator.maxSessionTimeSeconds | quote }} {{- end }} + - name: OPERATOR_SQS_SPLITTING + value: {{ .Values.operator.sqsSplitting | ternary "true" "false" | quote }} - name: OPERATOR_JSON_LOG value: {{ .Values.operator.jsonLog | ternary "true" "false" | quote }} - name: OPERATOR_AGENT_CONFIG diff --git a/mirrord-operator/templates/service-account.yaml b/mirrord-operator/templates/service-account.yaml index 98676f1..ef3cb6d 100644 --- a/mirrord-operator/templates/service-account.yaml +++ b/mirrord-operator/templates/service-account.yaml @@ -1,6 +1,10 @@ apiVersion: v1 kind: ServiceAccount metadata: + {{- if .Values.sa.roleArn }} + annotations: + eks.amazonaws.com/role-arn: {{ .Values.sa.roleArn }} + {{- end }} labels: app: mirrord-operator {{- include "mirrord-operator.labels" . | nindent 4 }} diff --git a/mirrord-operator/values.yaml b/mirrord-operator/values.yaml index 4c3d335..82bb051 100644 --- a/mirrord-operator/values.yaml +++ b/mirrord-operator/values.yaml @@ -18,6 +18,8 @@ operator: podAnnotations: {} podLabels: {} jsonLog: false + # Has to be set to `true` in order to use the SQS queue splitting feature. 
+ sqsSplitting: false # imagePullSecrets: # - name: value @@ -82,6 +84,9 @@ service: sa: name: mirrord-operator + ## aws role arn to annotate for eks iam assumption + # roleArn: arn:aws:iam::111122223333:role/mirrord-operator-role + tls: secret: mirrord-operator-tls