From d1f1ceb82086c61940d4fd558855915eded464d0 Mon Sep 17 00:00:00 2001 From: cyturney Date: Wed, 14 Aug 2024 11:15:29 -0700 Subject: [PATCH 01/11] Resiliency PR initial commit --- cluster/eksctl/cluster.yaml | 3 +- .../modules/resiliency/.workshop/cleanup.sh | 98 +++++++ .../resiliency/.workshop/terraform/main.tf | 256 ++++++++++++++++++ .../resiliency/.workshop/terraform/outputs.tf | 10 + .../resiliency/.workshop/terraform/vars.tf | 43 +++ .../config/kustomization.yaml | 8 + .../config/scale_and_affinity_patch.yaml | 27 ++ .../rbac/chaos-mesh-role.yaml | 12 + .../rbac/chaos-mesh-rolebinding.yaml | 13 + .../resiliency/scripts/get-pods-by-az.sh | 25 ++ .../resiliency/scripts/node-failure.sh | 25 ++ .../modules/resiliency/scripts/pod-failure.sh | 26 ++ .../resiliency/scripts/verify-cluster.sh | 95 +++++++ .../resiliency/high-availability/01-setup.md | 90 ++++++ .../high-availability/02-pod-failure.md | 50 ++++ .../03-node-failure-no-fis.md | 82 ++++++ .../04-node-failure-partial-fis.md | 82 ++++++ .../05-node-failure-complete-fis.md | 65 +++++ .../high-availability/06-az-failure.md | 134 +++++++++ .../resiliency/high-availability/index.md | 49 ++++ website/docs/resiliency/index.md | 54 ++++ website/docusaurus.config.js | 6 + website/sidebars.js | 1 + 23 files changed, 1253 insertions(+), 1 deletion(-) create mode 100755 manifests/modules/resiliency/.workshop/cleanup.sh create mode 100644 manifests/modules/resiliency/.workshop/terraform/main.tf create mode 100644 manifests/modules/resiliency/.workshop/terraform/outputs.tf create mode 100644 manifests/modules/resiliency/.workshop/terraform/vars.tf create mode 100644 manifests/modules/resiliency/high-availability/config/kustomization.yaml create mode 100644 manifests/modules/resiliency/high-availability/config/scale_and_affinity_patch.yaml create mode 100644 manifests/modules/resiliency/high-availability/rbac/chaos-mesh-role.yaml create mode 100644 manifests/modules/resiliency/high-availability/rbac/chaos-mesh-rolebinding.yaml create mode 100755 manifests/modules/resiliency/scripts/get-pods-by-az.sh create mode 100755 manifests/modules/resiliency/scripts/node-failure.sh create mode 100755 manifests/modules/resiliency/scripts/pod-failure.sh create mode 100755 manifests/modules/resiliency/scripts/verify-cluster.sh create mode 100644 website/docs/resiliency/high-availability/01-setup.md create mode 100644 website/docs/resiliency/high-availability/02-pod-failure.md create mode 100644 website/docs/resiliency/high-availability/03-node-failure-no-fis.md create mode 100644 website/docs/resiliency/high-availability/04-node-failure-partial-fis.md create mode 100644 website/docs/resiliency/high-availability/05-node-failure-complete-fis.md create mode 100644 website/docs/resiliency/high-availability/06-az-failure.md create mode 100644 website/docs/resiliency/high-availability/index.md create mode 100644 website/docs/resiliency/index.md diff --git a/cluster/eksctl/cluster.yaml b/cluster/eksctl/cluster.yaml index 4c78c034d..a22a4a127 100644 --- a/cluster/eksctl/cluster.yaml +++ b/cluster/eksctl/cluster.yaml @@ -31,7 +31,8 @@ managedNodeGroups: maxSize: 6 instanceType: m5.large privateNetworking: true - releaseVersion: "1.30.0-20240625" + # had to remove use make create + #releaseVersion: "1.30.0-20240625" updateConfig: maxUnavailablePercentage: 50 labels: diff --git a/manifests/modules/resiliency/.workshop/cleanup.sh b/manifests/modules/resiliency/.workshop/cleanup.sh new file mode 100755 index 000000000..d4040bbde --- /dev/null +++ 
b/manifests/modules/resiliency/.workshop/cleanup.sh @@ -0,0 +1,98 @@ +#!/bin/bash + +set -e + +# Delete Ingress +kubectl delete ingress -n ui ui --ignore-not-found +kubectl delete ingress ui -n ui --ignore-not-found + +# Delete Deployments +kubectl delete deployment -n ui ui --ignore-not-found +kubectl delete deployment ui -n ui --ignore-not-found + +# Delete Services +kubectl delete service -n ui ui-nlb --ignore-not-found + +# Delete Roles and RoleBindings +kubectl delete role chaos-mesh-role -n ui --ignore-not-found +kubectl delete rolebinding chaos-mesh-rolebinding -n ui --ignore-not-found + +# Uninstall Helm chart +if command -v helm &> /dev/null; then + echo "Uninstalling aws-load-balancer-controller Helm chart" + helm uninstall aws-load-balancer-controller -n kube-system || true + + echo "Uninstalling Chaos Mesh Helm chart" + helm uninstall chaos-mesh -n chaos-mesh || true + + # Wait for resources to be cleaned up + echo "Waiting for resources to be cleaned up..." + sleep 30 +else + echo "Helm command not found. Skipping Helm chart uninstallations." +fi + +kubectl delete namespace chaos-mesh --ignore-not-found + +# Delete IAM Roles and Policies +ROLE_PREFIX="fis-execution-role-eks-workshop" +POLICY_PREFIX="eks-resiliency-fis-policy" + +# List and delete roles +for role in $(aws iam list-roles --query "Roles[?starts_with(RoleName, '${ROLE_PREFIX}')].RoleName" --output text); do + echo "Detaching policies and deleting role: $role" + # Detach managed policies + aws iam detach-role-policy --role-name $role --policy-arn arn:aws:iam::aws:policy/service-role/AWSFaultInjectionSimulatorEKSAccess || true + aws iam detach-role-policy --role-name $role --policy-arn arn:aws:iam::aws:policy/service-role/AWSFaultInjectionSimulatorNetworkAccess || true + + # Detach and delete inline policies + for policy in $(aws iam list-role-policies --role-name $role --query PolicyNames --output text); do + aws iam delete-role-policy --role-name $role --policy-name $policy || true + done + + # Delete the role + aws iam delete-role --role-name $role || true +done + +# List and delete policies +for policy_arn in $(aws iam list-policies --scope Local --query "Policies[?starts_with(PolicyName, '${POLICY_PREFIX}')].Arn" --output text); do + echo "Deleting policy: $policy_arn" + + # Detach policy from all attached roles + for role in $(aws iam list-entities-for-policy --policy-arn $policy_arn --entity-filter Role --query 'PolicyRoles[*].RoleName' --output text); do + aws iam detach-role-policy --role-name $role --policy-arn $policy_arn + done + + # Delete the policy + aws iam delete-policy --policy-arn $policy_arn +done + +# Delete any leftover ALBs +ALB_ARN=$(aws elbv2 describe-load-balancers --query "LoadBalancers[?starts_with(LoadBalancerName, 'k8s-ui-ui-') || starts_with(LoadBalancerName, 'k8s-default-ui-')].LoadBalancerArn" --output text) +if [ ! -z "$ALB_ARN" ]; then + echo "Deleting leftover ALB: $ALB_ARN" + aws elbv2 delete-load-balancer --load-balancer-arn $ALB_ARN +else + echo "No leftover ALB found." 
+fi + +# Delete S3 bucket +BUCKET_PREFIX="eks-workshop-canary-artifacts-" +for bucket in $(aws s3api list-buckets --query "Buckets[?starts_with(Name, '${BUCKET_PREFIX}')].Name" --output text); do + echo "Deleting S3 bucket: $bucket" + # First, remove all objects from the bucket + aws s3 rm s3://$bucket --recursive + # Then delete the bucket + aws s3api delete-bucket --bucket $bucket --region us-west-2 +done + +# Delete CloudWatch Synthetics canary +CANARY_NAME="eks-workshop-canary" +if aws synthetics get-canary --name $CANARY_NAME --region us-west-2 &> /dev/null; then + echo "Deleting CloudWatch Synthetics canary: $CANARY_NAME" + aws synthetics delete-canary --name $CANARY_NAME --region us-west-2 +else + echo "CloudWatch Synthetics canary $CANARY_NAME not found." +fi + +echo "Cleanup completed successfully." \ No newline at end of file diff --git a/manifests/modules/resiliency/.workshop/terraform/main.tf b/manifests/modules/resiliency/.workshop/terraform/main.tf new file mode 100644 index 000000000..7e039cbdf --- /dev/null +++ b/manifests/modules/resiliency/.workshop/terraform/main.tf @@ -0,0 +1,256 @@ +module "eks_blueprints_addons" { + source = "aws-ia/eks-blueprints-addons/aws" + version = "1.16.3" + + cluster_name = var.addon_context.eks_cluster_id + cluster_endpoint = var.addon_context.aws_eks_cluster_endpoint + cluster_version = var.eks_cluster_version + oidc_provider_arn = var.addon_context.eks_oidc_provider_arn + + enable_aws_load_balancer_controller = true + create_kubernetes_resources = false + +} + + +// ALB creation +resource "kubernetes_manifest" "ui_alb" { + manifest = { + "apiVersion" = "networking.k8s.io/v1" + "kind" = "Ingress" + "metadata" = { + "name" = "ui" + "namespace" = "ui" + "annotations" = { + "alb.ingress.kubernetes.io/scheme" = "internet-facing" + "alb.ingress.kubernetes.io/target-type" = "ip" + "alb.ingress.kubernetes.io/healthcheck-path" = "/actuator/health/liveness" + } + } + "spec" = { + ingressClassName = "alb", + "rules" = [{ + "http" = { + paths = [{ + path = "/" + pathType = "Prefix" + "backend" = { + service = { + name = "ui" + port = { + number = 80 + } + } + } + }] + } + }] + } + } +} + +// Create RBAC and Rolebinding +resource "kubernetes_role" "chaos_mesh_role" { + metadata { + name = "chaos-mesh-role" + namespace = "ui" + } + + rule { + api_groups = ["chaos-mesh.org"] + resources = ["podchaos"] + verbs = ["create", "delete", "get", "list", "patch", "update", "watch"] + } + + rule { + api_groups = [""] + resources = ["pods"] + verbs = ["get", "list", "watch"] + } +} + +data "aws_caller_identity" "current" {} + +resource "kubernetes_role_binding" "chaos_mesh_rolebinding" { + metadata { + name = "chaos-mesh-rolebinding" + namespace = "ui" + } + + role_ref { + api_group = "rbac.authorization.k8s.io" + kind = "Role" + name = kubernetes_role.chaos_mesh_role.metadata[0].name + } + + subject { + kind = "User" + name = data.aws_caller_identity.current.arn + namespace = "ui" + } +} + +// Add AWS Load Balancer controller +resource "helm_release" "aws_load_balancer_controller" { + name = "aws-load-balancer-controller" + repository = "https://aws.github.io/eks-charts" + chart = "aws-load-balancer-controller" + namespace = "kube-system" + version = var.load_balancer_controller_chart_version + + set { + name = "clusterName" + value = var.addon_context.eks_cluster_id + } + + set { + name = "serviceAccount.name" + value = "aws-load-balancer-controller-sa" + } + + set { + name = "serviceAccount.annotations.eks\\.amazonaws\\.com/role-arn" + value = 
module.eks_blueprints_addons.aws_load_balancer_controller.iam_role_arn + } +} + + +// Chaos Mesh Helm Release +resource "helm_release" "chaos_mesh" { + name = "chaos-mesh" + repository = "https://charts.chaos-mesh.org" + chart = "chaos-mesh" + namespace = "chaos-mesh" + version = "2.5.1" + + create_namespace = true +} + +// FIS IAM role +resource "random_id" "suffix" { + byte_length = 8 +} + +resource "aws_iam_role" "fis_role" { + name = "fis-execution-role-${var.addon_context.eks_cluster_id}-${random_id.suffix.hex}" + + assume_role_policy = jsonencode({ + Version = "2012-10-17" + Statement = [ + { + Effect = "Allow" + Principal = { + Service = "fis.amazonaws.com" + } + Action = "sts:AssumeRole" + }, + { + Effect = "Allow" + Principal = { + Federated = var.addon_context.eks_oidc_provider_arn + } + Action = "sts:AssumeRoleWithWebIdentity" + Condition = { + StringEquals = { + "${trimprefix(var.addon_context.eks_oidc_provider_arn, "arn:aws:iam::${data.aws_caller_identity.current.account_id}:oidc-provider/")}:sub" = [ + "system:serviceaccount:ui:chaos-mesh-sa" + ] + } + } + }, + { + Effect = "Allow" + Principal = { + Service = "ssm.amazonaws.com" + } + Action = "sts:AssumeRole" + } + ] + }) + + lifecycle { + create_before_destroy = true + } + + depends_on = [kubernetes_role_binding.chaos_mesh_rolebinding] +} + +// Attach FIS Access Policy +resource "aws_iam_role_policy_attachment" "fis_eks_access" { + policy_arn = "arn:aws:iam::aws:policy/service-role/AWSFaultInjectionSimulatorEKSAccess" + role = aws_iam_role.fis_role.name +} + +resource "aws_iam_role_policy_attachment" "fis_network_access" { + policy_arn = "arn:aws:iam::aws:policy/service-role/AWSFaultInjectionSimulatorNetworkAccess" + role = aws_iam_role.fis_role.name +} + +// Policy for creating FIS experiment templates +resource "aws_iam_policy" "eks_resiliency_fis_policy" { + name = "eks-resiliency-fis-policy-${random_id.suffix.hex}" + path = "/" + description = "Custom policy for EKS resiliency FIS experiments" + + policy = jsonencode({ + Version = "2012-10-17" + Statement = [ + { + Effect = "Allow" + Action = [ + // FIS + "fis:CreateExperimentTemplate", + "fis:GetExperimentTemplate", + "fis:ListExperimentTemplates", + "fis:DeleteExperimentTemplate", + "fis:UpdateExperimentTemplate", + "fis:TagResource", + "fis:UntagResource", + "fis:StartExperiment", + "fis:GetExperiment", + "fis:ListExperiments", + "ec2:DescribeInstances", + "ec2:DescribeInstanceStatus", + "ec2:TerminateInstances", + "eks:DescribeCluster", + "eks:ListNodegroups", + "eks:DescribeNodegroup", + "autoscaling:DescribeAutoScalingGroups", + "autoscaling:DescribeAutoScalingInstances", + "autoscaling:SetDesiredCapacity", + "logs:CreateLogDelivery", + "logs:GetLogDelivery", + "logs:UpdateLogDelivery", + "logs:DeleteLogDelivery", + "logs:ListLogDeliveries", + // Synthetic Canary + "synthetics:CreateCanary", + "synthetics:DeleteCanary", + "synthetics:DescribeCanaries", + "synthetics:StartCanary", + "synthetics:StopCanary", + "synthetics:UpdateCanary", + "s3:PutObject", + "s3:GetBucketLocation", + "s3:ListAllMyBuckets", + "cloudwatch:PutMetricData", + "logs:CreateLogGroup", + "logs:CreateLogStream", + "logs:PutLogEvents" + ] + Resource = "*" + }, + { + Effect = "Allow" + Action = "iam:PassRole" + Resource = aws_iam_role.fis_role.arn + } + ] + }) +} + +// Attach custom policy to the role +resource "aws_iam_role_policy_attachment" "eks_resiliency_fis_policy_attachment" { + policy_arn = aws_iam_policy.eks_resiliency_fis_policy.arn + role = aws_iam_role.fis_role.name +} diff --git 
a/manifests/modules/resiliency/.workshop/terraform/outputs.tf b/manifests/modules/resiliency/.workshop/terraform/outputs.tf new file mode 100644 index 000000000..413de0df1 --- /dev/null +++ b/manifests/modules/resiliency/.workshop/terraform/outputs.tf @@ -0,0 +1,10 @@ +output "environment_variables" { + description = "Environment variables to be added to the IDE shell" + value = { + LBC_CHART_VERSION = var.load_balancer_controller_chart_version + LBC_ROLE_ARN = module.eks_blueprints_addons.aws_load_balancer_controller.iam_role_arn + FIS_ROLE_ARN = aws_iam_role.fis_role.arn + RANDOM_SUFFIX = random_id.suffix.hex + SCRIPT_DIR = "/manifests/modules/resiliency/scripts" + } +} diff --git a/manifests/modules/resiliency/.workshop/terraform/vars.tf b/manifests/modules/resiliency/.workshop/terraform/vars.tf new file mode 100644 index 000000000..42bd4d060 --- /dev/null +++ b/manifests/modules/resiliency/.workshop/terraform/vars.tf @@ -0,0 +1,43 @@ +# tflint-ignore: terraform_unused_declarations +variable "eks_cluster_id" { + description = "EKS cluster name" + type = string +} + +# tflint-ignore: terraform_unused_declarations +variable "eks_cluster_version" { + description = "EKS cluster version" + type = string +} + +# tflint-ignore: terraform_unused_declarations +variable "cluster_security_group_id" { + description = "EKS cluster security group ID" + type = any +} + +# tflint-ignore: terraform_unused_declarations +variable "addon_context" { + description = "Addon context that can be passed directly to blueprints addon modules" + type = any +} + +# tflint-ignore: terraform_unused_declarations +variable "tags" { + description = "Tags to apply to AWS resources" + type = any +} + +# tflint-ignore: terraform_unused_declarations +variable "resources_precreated" { + description = "Have expensive resources been created already" + type = bool +} + +variable "load_balancer_controller_chart_version" { + description = "The chart version of aws-load-balancer-controller to use" + type = string + # renovate-helm: depName=aws-load-balancer-controller + default = "1.8.1" +} + diff --git a/manifests/modules/resiliency/high-availability/config/kustomization.yaml b/manifests/modules/resiliency/high-availability/config/kustomization.yaml new file mode 100644 index 000000000..b71687089 --- /dev/null +++ b/manifests/modules/resiliency/high-availability/config/kustomization.yaml @@ -0,0 +1,8 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +resources: + - ../../../../../manifests/base-application/ui + +patches: + - path: scale_and_affinity_patch.yaml diff --git a/manifests/modules/resiliency/high-availability/config/scale_and_affinity_patch.yaml b/manifests/modules/resiliency/high-availability/config/scale_and_affinity_patch.yaml new file mode 100644 index 000000000..c84b9a056 --- /dev/null +++ b/manifests/modules/resiliency/high-availability/config/scale_and_affinity_patch.yaml @@ -0,0 +1,27 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: ui + namespace: ui +spec: + replicas: 5 + selector: + matchLabels: + app: ui + template: + metadata: + labels: + app: ui + spec: + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app + operator: In + values: + - ui + topologyKey: "kubernetes.io/hostname" diff --git a/manifests/modules/resiliency/high-availability/rbac/chaos-mesh-role.yaml b/manifests/modules/resiliency/high-availability/rbac/chaos-mesh-role.yaml new file mode 100644 index 
000000000..5e5981a82 --- /dev/null +++ b/manifests/modules/resiliency/high-availability/rbac/chaos-mesh-role.yaml @@ -0,0 +1,12 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + namespace: ui + name: chaos-mesh-role +rules: + - apiGroups: ["chaos-mesh.org"] + resources: ["podchaos"] + verbs: ["create", "delete", "get", "list", "patch", "update", "watch"] + - apiGroups: [""] + resources: ["pods"] + verbs: ["get", "list", "watch"] diff --git a/manifests/modules/resiliency/high-availability/rbac/chaos-mesh-rolebinding.yaml b/manifests/modules/resiliency/high-availability/rbac/chaos-mesh-rolebinding.yaml new file mode 100644 index 000000000..338d88c3b --- /dev/null +++ b/manifests/modules/resiliency/high-availability/rbac/chaos-mesh-rolebinding.yaml @@ -0,0 +1,13 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: chaos-mesh-rolebinding + namespace: ui +subjects: + - kind: User + name: PLACEHOLDER + namespace: ui +roleRef: + kind: Role + name: chaos-mesh-role + apiGroup: rbac.authorization.k8s.io diff --git a/manifests/modules/resiliency/scripts/get-pods-by-az.sh b/manifests/modules/resiliency/scripts/get-pods-by-az.sh new file mode 100755 index 000000000..8063f1094 --- /dev/null +++ b/manifests/modules/resiliency/scripts/get-pods-by-az.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +# Credit to "Disaster recovery, high availability, and resiliency on Amazon EKS" +# https://catalog.us-east-1.prod.workshops.aws/workshops/6140457f-53b2-48b8-a007-2d4be06ba2fc + +GREEN='\033[0;32m' +RED='\033[0;31m' +NC='\033[0m' # No Color + +CURRENT_CONTEXT=$(kubectl config current-context) +REGION=$(kubectl config view -o jsonpath="{.contexts[?(@.name == \"$CURRENT_CONTEXT\")].context.cluster}" | cut -d : -f 4) + +for az in a b c +do + AZ=$REGION$az + echo -n "------" + echo -n -e "${GREEN}$AZ${NC}" + echo "------" + for node in $(kubectl get nodes -l topology.kubernetes.io/zone=$AZ --no-headers | grep -v NotReady | cut -d " " -f1) + do + echo -e " ${RED}$node:${NC}" + kubectl get pods -n ui --no-headers --field-selector spec.nodeName=${node} 2>&1 | while read line; do echo " ${line}"; done + done + echo "" +done \ No newline at end of file diff --git a/manifests/modules/resiliency/scripts/node-failure.sh b/manifests/modules/resiliency/scripts/node-failure.sh new file mode 100755 index 000000000..80d3fc3b9 --- /dev/null +++ b/manifests/modules/resiliency/scripts/node-failure.sh @@ -0,0 +1,25 @@ +#!/bin/bash +# node-failure.sh - Simulates node failure by stopping an EC2 instance with running pods + +# Get a list of nodes with running pods +node_with_pods=$(kubectl get pods --all-namespaces -o wide | awk 'NR>1 {print $8}' | sort | uniq) + +if [ -z "$node_with_pods" ]; then + echo "No nodes with running pods found. Please run this script: $SCRIPT_DIR/verify-cluster.sh" + exit 1 +fi + +# Select a random node from the list +selected_node=$(echo "$node_with_pods" | shuf -n 1) + +# Get the EC2 instance ID for the selected node +instance_id=$(aws ec2 describe-instances \ + --filters "Name=private-dns-name,Values=$selected_node" \ + --query "Reservations[*].Instances[*].InstanceId" \ + --output text) + +# Stop the instance to simulate a node failure +echo "Stopping instance: $instance_id (Node: $selected_node)" +aws ec2 stop-instances --instance-ids $instance_id + +echo "Instance $instance_id is being stopped. Monitoring pod distribution..." 
diff --git a/manifests/modules/resiliency/scripts/pod-failure.sh b/manifests/modules/resiliency/scripts/pod-failure.sh new file mode 100755 index 000000000..3ed7df813 --- /dev/null +++ b/manifests/modules/resiliency/scripts/pod-failure.sh @@ -0,0 +1,26 @@ +#!/bin/bash +# pod-failure.sh - Simulates pod failure using Chaos Mesh + +# Generates a unique identifier for the pod failure experiment +unique_id=$(date +%s) + +# Create a YAML configuration for the PodChaos resource +cat << EOF > pod-failure.yaml +apiVersion: chaos-mesh.org/v1alpha1 +kind: PodChaos +metadata: + name: pod-failure-$unique_id + namespace: ui +spec: + action: pod-kill + mode: one + selector: + namespaces: + - ui + labelSelectors: + "app.kubernetes.io/name": "ui" + duration: "60s" +EOF + +# Apply the PodChaos configuration to simulate the failure +kubectl apply -f pod-failure.yaml diff --git a/manifests/modules/resiliency/scripts/verify-cluster.sh b/manifests/modules/resiliency/scripts/verify-cluster.sh new file mode 100755 index 000000000..56e2844df --- /dev/null +++ b/manifests/modules/resiliency/scripts/verify-cluster.sh @@ -0,0 +1,95 @@ +#!/bin/bash +# verify-cluster.sh - Verifies cluster state and corrects replica count + +DESIRED_REPLICAS=5 +MAX_WAIT_TIME=300 # 5 minutes +POLL_INTERVAL=10 # 10 seconds +NAMESPACE="ui" + +print_header() { + echo -e "\n==== $1 ====\n" +} + +wait_for_condition() { + local end_time=$((SECONDS + MAX_WAIT_TIME)) + while [ $SECONDS -lt $end_time ]; do + if eval "$1"; then + return 0 + fi + echo -n "." + sleep $POLL_INTERVAL + done + echo " Timeout!" + return 1 +} + +print_header "Checking Current Pod Distribution" +$SCRIPT_DIR/get-pods-by-az.sh + +print_header "Node Information" +kubectl get nodes -o wide + +print_header "Verifying Cluster State" +node_count=$(kubectl get nodes --no-headers | grep " Ready " | grep -vc "SchedulingDisabled") +current_pod_count=$(kubectl get pods -n $NAMESPACE -l app=ui --no-headers | grep -v Terminating | wc -l) + +echo "Ready and schedulable nodes: $node_count" +echo "Current active ui pods: $current_pod_count" +echo "Desired ui pods: $DESIRED_REPLICAS" + +if [ $current_pod_count -ne $DESIRED_REPLICAS ]; then + print_header "Adjusting Replica Count" + echo "Scaling deployment to $DESIRED_REPLICAS replicas..." + kubectl scale deployment ui -n $NAMESPACE --replicas=$DESIRED_REPLICAS + + echo -n "Waiting for pod count to stabilize" + if wait_for_condition "[ \$(kubectl get pods -n $NAMESPACE -l app=ui --no-headers | grep -v Terminating | wc -l) -eq $DESIRED_REPLICAS ]"; then + echo -e "\n✅ Pod count has reached the desired number." + else + echo -e "\n⚠️ Warning: Failed to reach desired pod count within the timeout period." + fi +else + echo "✅ Number of replicas is correct." +fi + +print_header "Checking Pod Distribution" +if [ $node_count -gt 0 ]; then + max_pods_per_node=$((DESIRED_REPLICAS / node_count + 1)) + uneven_distribution=false + + for node in $(kubectl get nodes -o name | grep -v "SchedulingDisabled"); do + pods_on_node=$(kubectl get pods -n $NAMESPACE -l app=ui --field-selector spec.nodeName=${node#node/} --no-headers | grep -v Terminating | wc -l) + if [ $pods_on_node -gt $max_pods_per_node ]; then + uneven_distribution=true + break + fi + done + + if $uneven_distribution; then + echo "⚠️ Pod distribution is uneven. Rebalancing..."
+ kubectl scale deployment ui -n $NAMESPACE --replicas=0 + sleep $POLL_INTERVAL + kubectl scale deployment ui -n $NAMESPACE --replicas=$DESIRED_REPLICAS + + echo -n "Waiting for pods to be ready" + if wait_for_condition "[ \$(kubectl get pods -n $NAMESPACE -l app=ui --no-headers | grep Running | wc -l) -eq $DESIRED_REPLICAS ]"; then + echo -e "\n✅ Pods are ready and balanced." + else + echo -e "\n⚠️ Warning: Pods did not reach ready state within the timeout period." + fi + else + echo "✅ Pod distribution is balanced." + fi +else + echo "⚠️ Warning: No Ready and schedulable nodes found. Cannot check pod distribution." +fi + +print_header "Final Pod Distribution" +$SCRIPT_DIR/get-pods-by-az.sh + +echo +if [ $node_count -gt 0 ] && [ $current_pod_count -eq $DESIRED_REPLICAS ]; then + echo "✅ Cluster verification and correction complete." +else + echo "⚠️ Cluster verification complete, but some issues may require attention." +fi \ No newline at end of file diff --git a/website/docs/resiliency/high-availability/01-setup.md b/website/docs/resiliency/high-availability/01-setup.md new file mode 100644 index 000000000..03b327af8 --- /dev/null +++ b/website/docs/resiliency/high-availability/01-setup.md @@ -0,0 +1,90 @@ +--- +title: "Scaling and Pod Anti-Affinity for UI Service" +sidebar_position: 1 +description: "Learn how to scale your pods, add Pod Anti-Affinity configurations, and use a helper script to visualize pod distribution." +--- + +TODO: + +- Update Name +- Update/Remove Verification + +This guide outlines steps to enhance the resilience of a UI service by implementing high availability practices. We'll cover scaling the UI service, implementing pod anti-affinity, and using a helper script to visualize pod distribution across availability zones. + +## Scaling and Pod Anti-Affinity + +We use a Kustomize patch to modify the UI deployment, scaling it to 5 replicas and adding pod anti-affinity rules. This ensures UI pods are distributed across different nodes, reducing the impact of node failures. + +Here's the content of our patch file: + +```file +manifests/modules/resiliency/high-availability/config/scale_and_affinity_patch.yaml +``` + +Apply the changes using the Kustomize patch and the following commands: + + + +```bash +$ kubectl delete deployment ui -n ui +$ kubectl apply -k /manifests/modules/resiliency/high-availability/config/ +``` + +## Create Helper Script: Get Pods by AZ + +The `get-pods-by-az.sh` script helps visualize the distribution of Kubernetes pods across different availability zones in the terminal. You can view the script file at `manifests/modules/resiliency/scripts/get-pods-by-az.sh`. + + + +To make this script executable: + +```bash +$ chmod +x $SCRIPT_DIR/get-pods-by-az.sh +``` + +### Script Execution + +To run the script and see the distribution of pods across availability zones, execute: + +```bash +$ $SCRIPT_DIR/get-pods-by-az.sh +``` + +:::tip +Use this to quickly assess the distribution of your pods across multiple zones. +::: + +## Verification + +After applying these changes, verify the setup: + +1. Check for 5 running UI pods: + +```bash +$ kubectl get pods -n ui +``` + +2. Verify pod distribution across nodes: + +```bash +$ kubectl get pods -n ui -o=jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.spec.nodeName}{"\n"}' +``` + +3. Check that AWS Load Balancer Controller is installed and working: + +```bash +$ kubectl get pods -n kube-system | grep aws-load-balancer-controller +$ kubectl get ingress --all-namespaces +``` + +4. 
Ensure the load balancer is working and that you can access the retail store URL: + +```bash +$ wait-for-lb $(kubectl get ingress -n ui -o jsonpath='{.items[0].status.loadBalancer.ingress[0].hostname}') +``` + +:::info +For more information on these changes, check out these sections: + +- [Pod Affinity and Anti-Affinity](/docs/fundamentals/managed-node-groups/basics/affinity/) + ::: diff --git a/website/docs/resiliency/high-availability/02-pod-failure.md b/website/docs/resiliency/high-availability/02-pod-failure.md new file mode 100644 index 000000000..cbde69d2c --- /dev/null +++ b/website/docs/resiliency/high-availability/02-pod-failure.md @@ -0,0 +1,50 @@ +--- +title: "Simulating Pod Failure" +sidebar_position: 2 +description: "Simulate pod failure in your environment using ChaosMesh to test the resiliency of your application." +--- + +## Overview + +TODO: + +- fix file visual? +- add more information about this lab and a conclusion +- Note that this experiment is repeatable +- Note that retail store should still work even when the pod fails + +In this experiment, you'll simulate a pod failure within your Kubernetes environment to observe how the system responds. The `pod-failure.sh` script will simulate a pod failure using Chaos Mesh. This is the script we will be using: + +```file +manifests/modules/resiliency/scripts/pod-failure.sh +``` + +To make this script executable: + +```bash +$ chmod +x $SCRIPT_DIR/pod-failure.sh +``` + +## Running the Experiment + +Run the experiment and monitor the effects on pod distribution: + +```bash +$ $SCRIPT_DIR/pod-failure.sh && SECONDS=0; while [ $SECONDS -lt 30 ]; do clear; $SCRIPT_DIR/get-pods-by-az.sh; sleep 1; done +``` + +This command initiates the pod failure and monitors the pod distribution for 30 seconds to observe how the system handles the failure. You should see one pod disappear and then reappear. + +Check the status of pods in the `ui` namespace: + +```bash +$ kubectl get pods -n ui -o wide +``` + +## Verify Retail Store Availability + +To ensure that the retail store is operational, check its availability using the URL fetched by this command: + +```bash +$ wait-for-lb $(kubectl get ingress -n ui -o jsonpath='{.items[0].status.loadBalancer.ingress[0].hostname}') +``` diff --git a/website/docs/resiliency/high-availability/03-node-failure-no-fis.md b/website/docs/resiliency/high-availability/03-node-failure-no-fis.md new file mode 100644 index 000000000..7e154f2b0 --- /dev/null +++ b/website/docs/resiliency/high-availability/03-node-failure-no-fis.md @@ -0,0 +1,82 @@ +--- +title: "Simulating Node Failure without FIS" +sidebar_position: 3 +description: "Manually simulate a node failure in your Kubernetes environment to test the resilience of your applications without using AWS FIS." +--- + +# Simulating Node Failure without FIS + +TODO: + +- add information and concluding thoughts +- note that this is repeatable +- should see node failure after about a minute, pods return shortly after to the remaining working nodes, node comes back online after about 2 minutes +- should I make more things following the verify-cluster.sh visual? +- Load balancer does not appear to work although it should +- Rather than seeing the whole script, show expected output? +- Update script to wait for 3 nodes online + +## Overview + +This experiment simulates a node failure manually in your Kubernetes cluster to understand the impact on your deployed applications, particularly focusing on the retail store application's availability. 
The `node-failure.sh` script will manually stop an EC2 instance to simulate node failure. Here is the script we will use: + +```file +manifests/modules/resiliency/scripts/node-failure.sh +``` + +To make this script executable: + +```bash +$ chmod +x $SCRIPT_DIR/node-failure.sh +``` + +## Running the Experiment + +Run the node failure experiment and monitor the effects on pod distribution: + +```bash +$ $SCRIPT_DIR/node-failure.sh && SECONDS=0; while [ $SECONDS -lt 120 ]; do clear; $SCRIPT_DIR/get-pods-by-az.sh; sleep 1; done +``` + +This command will stop the selected EC2 instance and monitor the pod distribution for 2 minutes, observing how the system redistributes workloads. + +During the experiment, you should observe the following: + +- One node disappears from the list +- Kubernetes detects the node failure and reschedules the pods that were running on the failed node +- These pods are redistributed to the remaining healthy nodes +- The failed node comes back online + +The total number of running pods should remain constant, ensuring application availability. + +## Verify Retail Store Availability + +After simulating the node failure, verify that the retail store application remains accessible: + +```bash +$ wait-for-lb $(kubectl get ingress -n ui -o jsonpath='{.items[0].status.loadBalancer.ingress[0].hostname}') +``` + +## Verifying Cluster Recovery + +After simulating the node failure, we'll verify the cluster's self-healing and potentially rebalance the pod distribution if necessary. Since the cluster often recovers on its own, we'll focus on checking the current state and ensuring an optimal distribution of pods. + +Use the following script to verify the cluster state and rebalance pods: + +```bash +$ chmod +x $SCRIPT_DIR/verify-cluster.sh +$ $SCRIPT_DIR/verify-cluster.sh +``` + +This script will: + +- Count the number of nodes and ui pods +- Check whether the pods are evenly distributed across the nodes + +## Conclusion + +add concluding thoughts diff --git a/website/docs/resiliency/high-availability/04-node-failure-partial-fis.md b/website/docs/resiliency/high-availability/04-node-failure-partial-fis.md new file mode 100644 index 000000000..4b9091fd5 --- /dev/null +++ b/website/docs/resiliency/high-availability/04-node-failure-partial-fis.md @@ -0,0 +1,82 @@ +--- +title: "Simulating Partial Node Failure with FIS" +sidebar_position: 4 +description: "Simulate a partial node failure in your Kubernetes environment using AWS Fault Injection Simulator to test application resiliency." +--- + +# Simulating Partial Node Failure with FIS + +TODO: + +- More FIS info? +- More information about the experiment +- Explain what FIS is doing differently, what the experiment is doing +- should see 1 node failing after about a minute, pods coming back up after about 2.5 minutes, and the node coming back up after that +- check to make sure retail app stays up +- retail app appears to not work -> need to fix load balancer configs +- A conclusion / learning from experiment +- Note that FIS can allow automatic testing for failure and whatever else is cool + +## AWS Fault Injection Simulator (FIS) Overview + +AWS Fault Injection Simulator is a fully managed service that helps you perform fault injection experiments on your AWS workloads. In the context of EKS, FIS allows us to simulate various failure scenarios, which is crucial for: + +1. Validating high availability configurations +2. Testing auto-scaling and self-healing capabilities +3. Identifying potential single points of failure +4. 
Improving incident response procedures + +By using FIS, you can: + +- Discover hidden bugs and performance bottlenecks +- Observe how your systems behave under stress +- Implement and validate automated recovery procedures + +In our FIS experiment, we'll simulate a partial node failure in our EKS cluster and observe how our application responds, providing practical insights into building resilient systems. + +:::info +For more information on AWS FIS check out: + +- [What is AWS Fault Injection Service?](https://docs.aws.amazon.com/fis/latest/userguide/what-is.html) + ::: + +## Creating the Node Failure Experiment + +Create a new AWS FIS experiment template to simulate the node failure: + +```bash +$ NODE_EXP_ID=$(aws fis create-experiment-template --cli-input-json '{"description":"NodeDeletion","targets":{"Nodegroups-Target-1":{"resourceType":"aws:eks:nodegroup","resourceTags":{"eksctl.cluster.k8s.io/v1alpha1/cluster-name":"eks-workshop"},"selectionMode":"COUNT(2)"}},"actions":{"nodedeletion":{"actionId":"aws:eks:terminate-nodegroup-instances","parameters":{"instanceTerminationPercentage":"66"},"targets":{"Nodegroups":"Nodegroups-Target-1"}}},"stopConditions":[{"source":"none"}],"roleArn":"'$FIS_ROLE_ARN'","tags":{"ExperimentSuffix": "'$RANDOM_SUFFIX'"}}' --output json | jq -r '.experimentTemplate.id') +``` + +## Running the Experiment + +Execute the FIS experiment to simulate the node failure and monitor the response: + +```bash +$ aws fis start-experiment --experiment-template-id $NODE_EXP_ID --output json && SECONDS=0; while [ $SECONDS -lt 300 ]; do clear; $SCRIPT_DIR/get-pods-by-az.sh; sleep 1; done +``` + +This will trigger the node failure and begin monitoring the pods for 5 minutes, observing how the cluster responds to losing part of its capacity. + +## Verifying Retail Store Availability + +After simulating the node failure, check if the retail store application remains operational: + +```bash +$ wait-for-lb $(kubectl get ingress -n ui -o jsonpath='{.items[0].status.loadBalancer.ingress[0].hostname}') +``` + +Despite a partial node failure, the retail store continues to serve traffic, demonstrating the resilience of your deployment setup. + +:::caution +Partial node failures test the limits of your application's failover capabilities. Monitor and determine how well your applications and services recover from such events. +::: + +:::note +To verify clusters and rebalance pods you can run: + +```bash +$ $SCRIPT_DIR/verify-cluster.sh +``` + +::: diff --git a/website/docs/resiliency/high-availability/05-node-failure-complete-fis.md b/website/docs/resiliency/high-availability/05-node-failure-complete-fis.md new file mode 100644 index 000000000..ab5cbdd95 --- /dev/null +++ b/website/docs/resiliency/high-availability/05-node-failure-complete-fis.md @@ -0,0 +1,65 @@ +--- +title: "Simulating Complete Node Failure with FIS" +sidebar_position: 5 +description: "Demonstrates the impact of a complete node failure on a Kubernetes environment using AWS Fault Injection Simulator." +--- + +# Simulating Complete Node Failure with FIS + +TODO: + +- Fix script to mimic last experiment again +- Why is this different than last experiment +- Explain what is happening in more detail +- Note timings +- Concluding Statement +- You should see all nodes and pods dissapear rather quickly then after about 2 minutes will start to see 1 node and pods coming online, after 4 minutes a second node will come online and 3 more pods. 
+ +## Overview + +This experiment is an extensive test that isn't necessary but demonstrates the robust capabilities of AWS Fault Injection Simulator by simulating a complete node failure in a Kubernetes cluster. + +:::info Important +This test showcases how FIS can be used to simulate worst-case scenarios to help validate the resilience and recovery strategies of your applications. +::: + +## Creating the Node Failure Experiment + +Create a new AWS FIS experiment template to simulate the complete failure of all nodes in a specific node group: + +```bash +$ FULL_NODE_EXP_ID=$(aws fis create-experiment-template --cli-input-json '{"description":"NodeDeletion","targets":{"Nodegroups-Target-1":{"resourceType":"aws:eks:nodegroup","resourceTags":{"eksctl.cluster.k8s.io/v1alpha1/cluster-name":"eks-workshop"},"selectionMode":"ALL"}},"actions":{"nodedeletion":{"actionId":"aws:eks:terminate-nodegroup-instances","parameters":{"instanceTerminationPercentage":"100"},"targets":{"Nodegroups":"Nodegroups-Target-1"}}},"stopConditions":[{"source":"none"}],"roleArn":"'$FIS_ROLE_ARN'","tags":{"ExperimentSuffix": "'$RANDOM_SUFFIX'"}}' --output json | jq -r '.experimentTemplate.id') +``` + +## Running the Experiment + +Execute the FIS experiment to simulate the complete node failure: + +```bash +$ aws fis start-experiment --experiment-template-id $FULL_NODE_EXP_ID --output json && SECONDS=0; while [ $SECONDS -lt 300 ]; do clear; $SCRIPT_DIR/get-pods-by-az.sh; sleep 1; done +``` + +Monitor the cluster as it loses all node resources temporarily, observing how the Kubernetes system and your application respond. + +## Verifying Retail Store Availability + +After simulating the node failure, check if the retail store application is still operational: + +```bash +$ wait-for-lb $(kubectl get ingress -n ui -o jsonpath='{.items[0].status.loadBalancer.ingress[0].hostname}') +``` + +This command helps confirm that despite complete node failure, the application begins to recover as the Kubernetes cluster auto-scales back up. + +:::caution +This test can cause significant disruption, so it's recommended for use only in controlled environments where recovery mechanisms are thoroughly tested. +::: + +:::note +To verify clusters and rebalance pods you can run: + +```bash +$ $SCRIPT_DIR/verify-cluster.sh +``` + +::: diff --git a/website/docs/resiliency/high-availability/06-az-failure.md b/website/docs/resiliency/high-availability/06-az-failure.md new file mode 100644 index 000000000..1091b41e7 --- /dev/null +++ b/website/docs/resiliency/high-availability/06-az-failure.md @@ -0,0 +1,134 @@ +--- +title: "Simulating AZ Failure" +sidebar_position: 6 +description: "This experiment simulates an Availability Zone failure to test the resilience of your Kubernetes environment hosted on AWS EKS." +--- + +# Simulating AZ Failure + +TODO: + +- Fix canary +- Check AZ failure still works +- add specific cloudwatch iam role +- add conclustion + +## Overview + +This experiment simulates an Availability Zone (AZ) failure, demonstrating how robust your application is when faced with significant disruptions. It leverages AWS Fault Injection Simulator (FIS) and additional AWS services to test the resilience of the system under the stress of an AZ going offline. + +## Preparation + +### Setting up a Synthetic Canary + +Before starting the experiment, set up a synthetic canary for heartbeat monitoring: + +1. 
First, create an S3 bucket for the canary artifacts: + +```bash +$ BUCKET_NAME="eks-workshop-canary-artifacts-$(date +%s)" +$ aws s3 mb s3://$BUCKET_NAME --region us-west-2 +``` + +2. Create the canary: + +Set up the blueprint: + +```bash +$ INGRESS_URL=$(kubectl get ingress -n ui -o jsonpath='{.items[0].status.loadBalancer.ingress[0].hostname}') +$ cat << EOF > canary_script.js +var synthetics = require('Synthetics'); +var log = require('SyntheticsLogger'); + +const pageLoadBlueprint = async function () { + const PAGE_LOAD_TIMEOUT = 30; + const URL = 'http://${INGRESS_URL}'; + let page = await synthetics.getPage(); + await synthetics.executeStep('Navigate to ' + URL, async function () { + await page.goto(URL, {waitUntil: 'domcontentloaded', timeout: PAGE_LOAD_TIMEOUT * 1000}); + }); + await synthetics.executeStep('Page loaded successfully', async function () { + log.info('Page loaded successfully'); + }); +}; + +exports.handler = async () => { + return await pageLoadBlueprint(); +}; +EOF +$ aws s3 cp canary_script.js s3://$BUCKET_NAME/canary-script/canary_script.js +``` + +Create a synthetic canary: + +```bash +$ INGRESS_URL=$(kubectl get ingress -n ui -o jsonpath='{.items[0].status.loadBalancer.ingress[0].hostname}') +$ aws synthetics create-canary \ + --name eks-workshop-canary \ + --artifact-s3-location "s3://$BUCKET_NAME/canary-artifacts/" \ + --execution-role-arn $FIS_ROLE_ARN \ + --runtime-version syn-nodejs-puppeteer-9.0 \ + --schedule Expression="rate(1 minute)" \ + --code S3Bucket=$BUCKET_NAME,S3Key=canary-script/canary_script.js,Handler="canary_script.handler" \ + --region us-west-2 +$ sleep 30 +$ aws synthetics start-canary --name eks-workshop-canary --region us-west-2 +``` + +3. Create a CloudWatch alarm for the canary: + +```bash +$ aws cloudwatch put-metric-alarm \ + --alarm-name "eks-workshop-canary-alarm" \ + --metric-name SuccessPercent \ + --namespace CloudWatchSynthetics \ + --statistic Average \ + --period 60 \ + --threshold 95 \ + --comparison-operator LessThanThreshold \ + --dimensions Name=CanaryName,Value=eks-workshop-canary \ + --evaluation-periods 1 \ + --alarm-description "Alarm when Canary success rate drops below 95%" \ + --unit Percent \ + --region us-west-2 +``` + +This sets up a canary that checks the health of your application every minute and a CloudWatch alarm that triggers if the success percentage falls below 95%. + +### Setting up the Experiment + +Retrieve the Auto Scaling Group (ASG) name associated with your EKS cluster: + +```bash +$ ASG_NAME_BOTH=$(aws autoscaling describe-auto-scaling-groups --query "AutoScalingGroups[? Tags[? 
(Key=='eks:cluster-name') && Value=='eks-workshop']]".AutoScalingGroupName --output text) +$ ASG_NAME=$(echo $ASG_NAME_BOTH | awk '{print $1}') +``` + +Create the FIS experiment template to simulate the AZ failure: + +```bash +$ ZONE_EXP_ID=$(aws fis create-experiment-template --cli-input-json '{"description":"publicdocument-azfailure","targets":{},"actions":{"azfailure":{"actionId":"aws:ssm:start-automation-execution","parameters":{"documentArn":"arn:aws:ssm:us-west-2::document/AWSResilienceHub-SimulateAzOutageInAsgTest_2020-07-23","documentParameters":"{\"AutoScalingGroupName\":\"'$ASG_NAME'\",\"CanaryAlarmName\":\"eks-workshop-canary-alarm\",\"AutomationAssumeRole\":\"arn:aws:iam::'$AWS_ACCOUNT_ID':role/WSParticipantRole\",\"IsRollback\":\"false\",\"TestDurationInMinutes\":\"2\"}","maxDuration":"PT6M"}}},"stopConditions":[{"source":"none"}],"roleArn":"'$FIS_ROLE_ARN'","tags":{"ExperimentSuffix":"'$RANDOM_SUFFIX'"}}' --output json | jq -r '.experimentTemplate.id') +``` + +## Running the Experiment + +Execute the FIS experiment to simulate the AZ failure: + +```bash +aws fis start-experiment --experiment-template-id $ZONE_EXP_ID --output json && \ +timeout 450 watch -n 1 --color $SCRIPT_DIR/get-pods-by-az.sh +``` + +This command starts the experiment and monitors the distribution and status of pods across different nodes and AZs to understand the immediate impact of the simulated AZ failure. + +## Post-Experiment Verification + +Ensure that your application remains operational despite the simulated AZ failure, confirming the effectiveness of Kubernetes high availability: + +```bash +wait-for-lb $(kubectl get ingress -n ui -o jsonpath='{.items[0].status.loadBalancer.ingress[0].hostname}') +``` + +## Conclusion + +This experiment demonstrates the resilience of your EKS cluster in the face of an Availability Zone failure. By monitoring the canary and observing the redistribution of pods, you can assess how well your application maintains availability during significant infrastructure disruptions. diff --git a/website/docs/resiliency/high-availability/index.md b/website/docs/resiliency/high-availability/index.md new file mode 100644 index 000000000..31556db21 --- /dev/null +++ b/website/docs/resiliency/high-availability/index.md @@ -0,0 +1,49 @@ +--- +title: "High Availability" +sidebar_position: 20 +sidebar_custom_props: { "module": true } +description: "Prepare your EKS environment to handle high availability scenarios effectively." +--- + +TODO: + +- have to delete deployment before? why? is that due to dev or what +- expected time for lab completion +- expected time for prepare-env (about 5 minutes without cleanup.sh and any previous applications) +- Lab overview +- Check info sections +- Are we able to chmod in backend? 
+- Check why the load balancer stopped working + +::required-time + +:::tip Before you start +Prepare your environment for this section: + +```bash timeout=300 wait=30 +$ chmod +x /manifests/modules/resiliency/.workshop/cleanup.sh +$ /manifests/modules/resiliency/.workshop/cleanup.sh +$ prepare-environment resiliency +``` + +This will make the following changes to your lab environment: + +- Create the ingress load balancer +- Create RBAC and Rolebindings +- Install AWS Load Balancer controller +- Install ChaosMesh +- Create an IAM role for AWS Fault Injection Simulator (FIS) + +You can view the Terraform that applies these changes [here](https://github.com/VAR::MANIFESTS_OWNER/VAR::MANIFESTS_REPOSITORY/tree/VAR::MANIFESTS_REF/manifests/modules/resiliency/.workshop/terraform). +::: + +In this lab, we'll look at... +information + +:::info +For more information on these changes checkout: + +- [Ingress Load Balancer](/docs/fundamentals/exposing/ingress/) +- [Integrating with Kubernetes RBAC](/docs/security/cluster-access-management/kubernetes-rbac) +- [Chaos Mesh](https://chaos-mesh.org/) + ::: diff --git a/website/docs/resiliency/index.md b/website/docs/resiliency/index.md new file mode 100644 index 000000000..1541ba19d --- /dev/null +++ b/website/docs/resiliency/index.md @@ -0,0 +1,54 @@ +--- +title: "Resiliency" +sidebar_position: 11 +weight: 10 +--- + +TODO: + +- Add intro information +- Find a lab to input + +Other TODO: + +- autotesting +- Containers on couch vod (link it here?) + +## What is Resiliency? + +Resiliency in cloud computing refers to a system's ability to maintain acceptable performance levels in the face of faults and challenges to normal operation. It encompasses: + +1. **Fault Tolerance**: The ability to continue operating properly in the event of the failure of some of its components. +2. **Self-Healing**: The capability to detect and recover from failures automatically. +3. **Scalability**: The ability to handle increased load by adding resources. +4. **Disaster Recovery**: The process of preparing for and recovering from potential disasters. + +## Why is Resiliency Important in EKS? + +Amazon EKS provides a managed Kubernetes platform, but it's still crucial to design and implement resilient architectures. Here's why: + +1. **High Availability**: Ensure your applications remain accessible even during partial system failures. +2. **Data Integrity**: Prevent data loss and maintain consistency during unexpected events. +3. **User Experience**: Minimize downtime and performance degradation to maintain user satisfaction. +4. **Cost Efficiency**: Avoid overprovisioning by building systems that can handle variable loads and partial failures. + +## Resiliency Scenarios Covered in this Chapter + +We'll explore several scenarios to show resiliency by performing: + +1. Pod Failures +2. Node Failures +3. 
Availability Zone Failures + +## What You'll Learn + +By the end of this chapter, you'll be able to: + +- Use AWS FIS to simulate and learn from controlled failure scenarios +- other info + +:::info + + + +::: diff --git a/website/docusaurus.config.js b/website/docusaurus.config.js index 9b2339322..d31d3f620 100644 --- a/website/docusaurus.config.js +++ b/website/docusaurus.config.js @@ -133,6 +133,12 @@ const config = { position: "left", label: "Observability", }, + { + type: "doc", + docId: "resiliency/index", + position: "left", + label: "Resiliency", + }, { type: "doc", docId: "security/index", diff --git a/website/sidebars.js b/website/sidebars.js index 7da64994c..adf89ee4a 100644 --- a/website/sidebars.js +++ b/website/sidebars.js @@ -20,6 +20,7 @@ const sidebars = { networking: [{ type: "autogenerated", dirName: "networking" }], autoscaling: [{ type: "autogenerated", dirName: "autoscaling" }], observability: [{ type: "autogenerated", dirName: "observability" }], + resiliency: [{ type: "autogenerated", dirName: "resiliency" }], automation: [{ type: "autogenerated", dirName: "automation" }], aiml: [{ type: "autogenerated", dirName: "aiml" }], }; From 324fcb46095a8eddadc734c37ae822a1dd4e6c24 Mon Sep 17 00:00:00 2001 From: cyturney Date: Fri, 16 Aug 2024 11:09:56 -0700 Subject: [PATCH 02/11] update --- .../modules/resiliency/.workshop/cleanup.sh | 180 +++++++++++------- .../resiliency/.workshop/terraform/main.tf | 163 ++++++++++++++-- .../resiliency/.workshop/terraform/outputs.tf | 14 +- .../multi_az/add_us_east_2_patch.yaml | 41 ++++ .../multi_az/kustomization.yaml | 8 + .../rbac/chaos-mesh-role.yaml | 12 -- .../rbac/chaos-mesh-rolebinding.yaml | 13 -- .../resiliency/scripts/create-second-az.sh | 52 +++++ .../scripts/eks_workshop_canary_script.js | 30 +++ .../resiliency/scripts/multi-az-get-pods.sh | 26 +++ .../resiliency/scripts/verify-cluster.sh | 15 ++ .../resiliency/high-availability/01-setup.md | 59 ++---- .../high-availability/02-pod-failure.md | 49 +++-- .../03-node-failure-no-fis.md | 86 +++++---- .../04-node-failure-partial-fis.md | 87 ++++++--- .../05-node-failure-complete-fis.md | 70 ++++--- .../high-availability/06-az-failure.md | 134 ------------- .../high-availability/06-az-setup.md | 123 ++++++++++++ .../high-availability/07-az-failure.md | 84 ++++++++ .../resiliency/high-availability/index.md | 38 ++-- .../high-availability/tests/hook-suite.sh | 11 ++ website/docs/resiliency/index.md | 39 ++-- 22 files changed, 901 insertions(+), 433 deletions(-) create mode 100644 manifests/modules/resiliency/high-availability/multi_az/add_us_east_2_patch.yaml create mode 100644 manifests/modules/resiliency/high-availability/multi_az/kustomization.yaml delete mode 100644 manifests/modules/resiliency/high-availability/rbac/chaos-mesh-role.yaml delete mode 100644 manifests/modules/resiliency/high-availability/rbac/chaos-mesh-rolebinding.yaml create mode 100755 manifests/modules/resiliency/scripts/create-second-az.sh create mode 100644 manifests/modules/resiliency/scripts/eks_workshop_canary_script.js create mode 100755 manifests/modules/resiliency/scripts/multi-az-get-pods.sh delete mode 100644 website/docs/resiliency/high-availability/06-az-failure.md create mode 100644 website/docs/resiliency/high-availability/06-az-setup.md create mode 100644 website/docs/resiliency/high-availability/07-az-failure.md create mode 100644 website/docs/resiliency/high-availability/tests/hook-suite.sh diff --git a/manifests/modules/resiliency/.workshop/cleanup.sh 
b/manifests/modules/resiliency/.workshop/cleanup.sh index d4040bbde..537a7d260 100755 --- a/manifests/modules/resiliency/.workshop/cleanup.sh +++ b/manifests/modules/resiliency/.workshop/cleanup.sh @@ -2,97 +2,131 @@ set -e -# Delete Ingress -kubectl delete ingress -n ui ui --ignore-not-found -kubectl delete ingress ui -n ui --ignore-not-found +echo "Starting cleanup process..." -# Delete Deployments -kubectl delete deployment -n ui ui --ignore-not-found -kubectl delete deployment ui -n ui --ignore-not-found +# Function to safely delete a resource +safe_delete() { + local cmd=$1 + local resource=$2 + echo "Attempting to delete $resource..." + if $cmd 2>/dev/null; then + echo "$resource deleted successfully." + else + echo "Failed to delete $resource or it doesn't exist. Continuing..." + fi +} -# Delete Services -kubectl delete service -n ui ui-nlb --ignore-not-found +# Function to wait for resource deletion +wait_for_deletion() { + local check_cmd=$1 + local resource=$2 + local max_attempts=30 + local attempt=0 + echo "Waiting for $resource to be deleted..." + while $check_cmd &>/dev/null && [ $attempt -lt $max_attempts ]; do + sleep 10 + ((attempt++)) + done + if [ $attempt -eq $max_attempts ]; then + echo "Timeout waiting for $resource to be deleted." + else + echo "$resource deleted successfully." + fi +} + +# Function to cleanup EKS resources in a region +cleanup_eks_region() { + local region=$1 + local cluster_name=$2 + local nodegroup_name=$3 + local delete_cluster=$4 + + echo "Cleaning up EKS resources in $region..." + + # Switch to the specified region + aws configure set default.region $region -# Delete Roles and RoleBindings -kubectl delete role chaos-mesh-role -n ui --ignore-not-found -kubectl delete rolebinding chaos-mesh-rolebinding -n ui --ignore-not-found + # Delete Kubernetes resources + echo "Cleaning up Kubernetes resources..." + kubectl delete ingress,deployment,service -n ui --all --ignore-not-found + kubectl delete role,rolebinding -n ui --all --ignore-not-found + kubectl delete namespace chaos-mesh --ignore-not-found -# Uninstall Helm chart -if command -v helm &> /dev/null; then - echo "Uninstalling aws-load-balancer-controller Helm chart" + # Delete EKS Cluster and Node Group if specified + if [ "$delete_cluster" = true ]; then + echo "Attempting to delete EKS cluster and node group..." + if aws eks describe-cluster --name $cluster_name --region $region &>/dev/null; then + aws eks delete-nodegroup --cluster-name $cluster_name --nodegroup-name $nodegroup_name --region $region || true + wait_for_deletion "aws eks describe-nodegroup --cluster-name $cluster_name --nodegroup-name $nodegroup_name --region $region" "node group" + + aws eks delete-cluster --name $cluster_name --region $region + wait_for_deletion "aws eks describe-cluster --name $cluster_name --region $region" "EKS cluster" + else + echo "EKS cluster $cluster_name not found in $region. Skipping deletion." + fi + else + echo "Skipping EKS cluster and node group deletion in $region as requested." + fi + + # Uninstall Helm charts + echo "Uninstalling Helm charts..." helm uninstall aws-load-balancer-controller -n kube-system || true - - echo "Uninstalling Chaos Mesh Helm chart" helm uninstall chaos-mesh -n chaos-mesh || true - - # Wait for resources to be cleaned up - echo "Waiting for resources to be cleaned up..." - sleep 30 -else - echo "Helm command not found. Skipping Helm chart uninstallations." 
-fi -kubectl delete namespace chaos-mesh --ignore-not-found + # Delete ALBs + echo "Cleaning up ALBs in $region..." + for alb_arn in $(aws elbv2 describe-load-balancers --query "LoadBalancers[?starts_with(LoadBalancerName, 'k8s-ui-ui-') || starts_with(LoadBalancerName, 'k8s-default-ui-')].LoadBalancerArn" --output text); do + safe_delete "aws elbv2 delete-load-balancer --load-balancer-arn $alb_arn" "ALB $alb_arn" + done +} + +# Cleanup in PRIMARY_REGION (preserve cluster and node groups) +cleanup_eks_region $PRIMARY_REGION "eks-workshop" "default" false + +# Cleanup in SECONDARY_REGION (full cleanup) +cleanup_eks_region $SECONDARY_REGION "eks-workshop-east" "us-east-2-node-group" true + +# Global cleanup (not region-specific) # Delete IAM Roles and Policies -ROLE_PREFIX="fis-execution-role-eks-workshop" -POLICY_PREFIX="eks-resiliency-fis-policy" - -# List and delete roles -for role in $(aws iam list-roles --query "Roles[?starts_with(RoleName, '${ROLE_PREFIX}')].RoleName" --output text); do - echo "Detaching policies and deleting role: $role" - # Detach managed policies - aws iam detach-role-policy --role-name $role --policy-arn arn:aws:iam::aws:policy/service-role/AWSFaultInjectionSimulatorEKSAccess || true - aws iam detach-role-policy --role-name $role --policy-arn arn:aws:iam::aws:policy/service-role/AWSFaultInjectionSimulatorNetworkAccess || true - - # Detach and delete inline policies - for policy in $(aws iam list-role-policies --role-name $role --query PolicyNames --output text); do - aws iam delete-role-policy --role-name $role --policy-name $policy || true +echo "Cleaning up IAM roles and policies..." +for role_prefix in "fis-execution-role-eks-workshop" "canary-execution-role-eks-workshop"; do + for role in $(aws iam list-roles --query "Roles[?starts_with(RoleName, '${role_prefix}')].RoleName" --output text); do + echo "Processing role: $role" + for policy in $(aws iam list-attached-role-policies --role-name $role --query "AttachedPolicies[*].PolicyArn" --output text); do + safe_delete "aws iam detach-role-policy --role-name $role --policy-arn $policy" "attached policy $policy from role $role" + done + for policy in $(aws iam list-role-policies --role-name $role --query "PolicyNames" --output text); do + safe_delete "aws iam delete-role-policy --role-name $role --policy-name $policy" "inline policy $policy from role $role" + done + safe_delete "aws iam delete-role --role-name $role" "IAM role $role" done - - # Delete the role - aws iam delete-role --role-name $role || true done -# List and delete policies -for policy_arn in $(aws iam list-policies --scope Local --query "Policies[?starts_with(PolicyName, '${POLICY_PREFIX}')].Arn" --output text); do - echo "Deleting policy: $policy_arn" - - # Detach policy from all attached roles - for role in $(aws iam list-entities-for-policy --policy-arn $policy_arn --entity-filter Role --query 'PolicyRoles[*].RoleName' --output text); do - aws iam detach-role-policy --role-name $role --policy-arn $policy_arn +for policy_prefix in "eks-resiliency-fis-policy" "eks-resiliency-canary-policy"; do + for policy_arn in $(aws iam list-policies --scope Local --query "Policies[?starts_with(PolicyName, '${policy_prefix}')].Arn" --output text); do + safe_delete "aws iam delete-policy --policy-arn $policy_arn" "IAM policy $policy_arn" done - - # Delete the policy - aws iam delete-policy --policy-arn $policy_arn done -# Delete any leftover ALBs -ALB_ARN=$(aws elbv2 describe-load-balancers --query "LoadBalancers[?starts_with(LoadBalancerName, 'k8s-ui-ui-') || 
starts_with(LoadBalancerName, 'k8s-default-ui-')].LoadBalancerArn" --output text) -if [ ! -z "$ALB_ARN" ]; then - echo "Deleting leftover ALB: $ALB_ARN" - aws elbv2 delete-load-balancer --load-balancer-arn $ALB_ARN -else - echo "No leftover ALB found." -fi - -# Delete S3 bucket -BUCKET_PREFIX="eks-workshop-canary-artifacts-" -for bucket in $(aws s3api list-buckets --query "Buckets[?starts_with(Name, '${BUCKET_PREFIX}')].Name" --output text); do - echo "Deleting S3 bucket: $bucket" - # First, remove all objects from the bucket +# Delete S3 buckets +echo "Cleaning up S3 buckets..." +for bucket in $(aws s3api list-buckets --query "Buckets[?starts_with(Name, 'eks-workshop-canary-artifacts-')].Name" --output text); do aws s3 rm s3://$bucket --recursive - # Then delete the bucket - aws s3api delete-bucket --bucket $bucket --region us-west-2 + safe_delete "aws s3api delete-bucket --bucket $bucket --region $PRIMARY_REGION" "S3 bucket $bucket" done -# Delete CloudWatch Synthetics canary +# Delete CloudWatch Synthetics canary and alarm CANARY_NAME="eks-workshop-canary" -if aws synthetics get-canary --name $CANARY_NAME --region us-west-2 &> /dev/null; then - echo "Deleting CloudWatch Synthetics canary: $CANARY_NAME" - aws synthetics delete-canary --name $CANARY_NAME --region us-west-2 -else - echo "CloudWatch Synthetics canary $CANARY_NAME not found." +ALARM_NAME="eks-workshop-canary-alarm" + +echo "Cleaning up CloudWatch Synthetics canary and alarm..." +if aws synthetics get-canary --name $CANARY_NAME --region $PRIMARY_REGION &>/dev/null; then + aws synthetics stop-canary --name $CANARY_NAME --region $PRIMARY_REGION || true + sleep 30 + safe_delete "aws synthetics delete-canary --name $CANARY_NAME --region $PRIMARY_REGION" "CloudWatch Synthetics canary $CANARY_NAME" fi -echo "Cleanup completed successfully." \ No newline at end of file +safe_delete "aws cloudwatch delete-alarms --alarm-names $ALARM_NAME --region $PRIMARY_REGION" "CloudWatch alarm $ALARM_NAME" + +echo "Cleanup process completed. Please check for any remaining resources manually." 
\ No newline at end of file
diff --git a/manifests/modules/resiliency/.workshop/terraform/main.tf b/manifests/modules/resiliency/.workshop/terraform/main.tf
index 7e039cbdf..ae6da7511 100644
--- a/manifests/modules/resiliency/.workshop/terraform/main.tf
+++ b/manifests/modules/resiliency/.workshop/terraform/main.tf
@@ -13,7 +13,7 @@ module "eks_blueprints_addons" {
}
-// ALB creation
+# ALB creation
resource "kubernetes_manifest" "ui_alb" {
  manifest = {
    "apiVersion" = "networking.k8s.io/v1"
@@ -49,7 +49,7 @@ resource "kubernetes_manifest" "ui_alb" {
  }
}
-// Create RBAC and Rolebinding
+# Create RBAC and Rolebinding
resource "kubernetes_role" "chaos_mesh_role" {
  metadata {
    name = "chaos-mesh-role"
@@ -90,10 +90,10 @@ resource "kubernetes_role_binding" "chaos_mesh_rolebinding" {
  }
}
-// Add AWS Load Balancer controller
+# Add AWS Load Balancer controller
resource "helm_release" "aws_load_balancer_controller" {
  name = "aws-load-balancer-controller"
-  repository = "https://aws.github.io/eks-charts"
+  repository = "https://aws.github.io/eks-charts"
  chart = "aws-load-balancer-controller"
  namespace = "kube-system"
  version = var.load_balancer_controller_chart_version
@@ -115,10 +115,10 @@ resource "helm_release" "aws_load_balancer_controller" {
}
-// Chaos Mesh Helm Release
+# Chaos Mesh Helm Release
resource "helm_release" "chaos_mesh" {
  name = "chaos-mesh"
-  repository = "https://charts.chaos-mesh.org"
+  repository = "https://charts.chaos-mesh.org"
  chart = "chaos-mesh"
  namespace = "chaos-mesh"
  version = "2.5.1"
@@ -126,7 +126,7 @@ resource "helm_release" "chaos_mesh" {
  create_namespace = true
}
-// FIS IAM role
+# FIS IAM role
resource "random_id" "suffix" {
  byte_length = 8
}
@@ -140,7 +140,12 @@ resource "aws_iam_role" "fis_role" {
      {
        Effect = "Allow"
        Principal = {
-          Service = "fis.amazonaws.com"
+          Service = [
+            "fis.amazonaws.com",
+            # for second region
+            "ec2.amazonaws.com",
+            "eks.amazonaws.com"
+          ]
        }
        Action = "sts:AssumeRole"
      },
@@ -175,7 +180,7 @@ resource "aws_iam_role" "fis_role" {
  depends_on = [kubernetes_role_binding.chaos_mesh_rolebinding]
}
-// Attach FIS Access Policy
+# Attach FIS Access Policy
resource "aws_iam_role_policy_attachment" "fis_eks_access" {
  policy_arn = "arn:aws:iam::aws:policy/service-role/AWSFaultInjectionSimulatorEKSAccess"
  role = aws_iam_role.fis_role.name
@@ -186,7 +191,23 @@ resource "aws_iam_role_policy_attachment" "fis_network_access" {
  role = aws_iam_role.fis_role.name
}
-// Policy for creating FIS experiment templates
+# Attach to FIS for EKS node group
+resource "aws_iam_role_policy_attachment" "fis_node_policy" {
+  policy_arn = "arn:aws:iam::aws:policy/AmazonEKSWorkerNodePolicy"
+  role = aws_iam_role.fis_role.name
+}
+
+resource "aws_iam_role_policy_attachment" "fis_ecr_policy" {
+  policy_arn = "arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryReadOnly"
+  role = aws_iam_role.fis_role.name
+}
+
+resource "aws_iam_role_policy_attachment" "fis_cni_policy" {
+  policy_arn = "arn:aws:iam::aws:policy/AmazonEKS_CNI_Policy"
+  role = aws_iam_role.fis_role.name
+}
+
+# Policy for creating FIS experiment templates
resource "aws_iam_policy" "eks_resiliency_fis_policy" {
  name = "eks-resiliency-fis-policy-${random_id.suffix.hex}"
  path = "/"
@@ -198,7 +219,7 @@ resource "aws_iam_policy" "eks_resiliency_fis_policy" {
      {
        Effect = "Allow"
        Action = [
-          // FIS
+          # FIS
          "fis:CreateExperimentTemplate",
          "fis:GetExperimentTemplate",
          "fis:ListExperimentTemplates",
@@ -212,6 +233,8 @@ resource "aws_iam_policy" "eks_resiliency_fis_policy" {
          "ec2:DescribeInstances",
"ec2:DescribeInstanceStatus", "ec2:TerminateInstances", + "ec2:StartInstances", + "ec2:StopInstances", "eks:DescribeCluster", "eks:ListNodegroups", "eks:DescribeNodegroup", @@ -223,7 +246,72 @@ resource "aws_iam_policy" "eks_resiliency_fis_policy" { "logs:UpdateLogDelivery", "logs:DeleteLogDelivery", "logs:ListLogDeliveries", - // Synthetic Canary + "ssm:StartAutomationExecution", + "ssm:GetAutomationExecution", + "cloudwatch:DescribeAlarms", + "cloudwatch:GetMetricData" + ] + Resource = "*" + }, + { + Effect = "Allow" + Action = "iam:PassRole" + Resource = aws_iam_role.fis_role.arn + } + ] + }) +} + +# Attach custom policy to the role +resource "aws_iam_role_policy_attachment" "eks_resiliency_fis_policy_attachment" { + policy_arn = aws_iam_policy.eks_resiliency_fis_policy.arn + role = aws_iam_role.fis_role.name +} + + +# Canary IAM role +resource "aws_iam_role" "canary_role" { + name = "canary-execution-role-${var.addon_context.eks_cluster_id}-${random_id.suffix.hex}" + + assume_role_policy = jsonencode({ + Version = "2012-10-17" + Statement = [ + { + Effect = "Allow" + Principal = { + Service = [ + "lambda.amazonaws.com", + "synthetics.amazonaws.com" + ] + } + Action = "sts:AssumeRole" + } + ] + }) + + lifecycle { + create_before_destroy = true + } +} + +# Attach Lambda Basic Execution Role to Canary role +resource "aws_iam_role_policy_attachment" "canary_lambda_basic_execution" { + policy_arn = "arn:aws:iam::aws:policy/service-role/AWSLambdaBasicExecutionRole" + role = aws_iam_role.canary_role.name +} + +# Policy for Canary +resource "aws_iam_policy" "eks_resiliency_canary_policy" { + name = "eks-resiliency-canary-policy-${random_id.suffix.hex}" + path = "/" + description = "Custom policy for EKS resiliency Canary" + + policy = jsonencode({ + Version = "2012-10-17" + Statement = [ + { + Effect = "Allow" + Action = [ "synthetics:CreateCanary", "synthetics:DeleteCanary", "synthetics:DescribeCanaries", @@ -233,24 +321,59 @@ resource "aws_iam_policy" "eks_resiliency_fis_policy" { "s3:PutObject", "s3:GetBucketLocation", "s3:ListAllMyBuckets", + "s3:GetObject", + "s3:ListBucket", "cloudwatch:PutMetricData", + "cloudwatch:GetMetricStatistics", + "cloudwatch:ListMetrics", "logs:CreateLogGroup", "logs:CreateLogStream", - "logs:PutLogEvents" + "logs:PutLogEvents", + "logs:DescribeLogGroups", + "logs:DescribeLogStreams", + "lambda:InvokeFunction" ] Resource = "*" - }, + } + ] + }) +} + +# Attach custom policy to the Canary role +resource "aws_iam_role_policy_attachment" "eks_resiliency_canary_policy_attachment" { + policy_arn = aws_iam_policy.eks_resiliency_canary_policy.arn + role = aws_iam_role.canary_role.name +} + +# EKS Cluster IAM Role +resource "aws_iam_role" "eks_cluster_role" { + name = "eks-cluster-role-${var.addon_context.eks_cluster_id}-${random_id.suffix.hex}" + + assume_role_policy = jsonencode({ + Version = "2012-10-17" + Statement = [ { Effect = "Allow" - Action = "iam:PassRole" - Resource = aws_iam_role.fis_role.arn + Principal = { + Service = "eks.amazonaws.com" + } + Action = "sts:AssumeRole" } ] }) + + lifecycle { + create_before_destroy = true + } } -// Attach custom policy to the role -resource "aws_iam_role_policy_attachment" "eks_resiliency_fis_policy_attachment" { - policy_arn = aws_iam_policy.eks_resiliency_fis_policy.arn - role = aws_iam_role.fis_role.name +# Attach required policies to EKS Cluster role +resource "aws_iam_role_policy_attachment" "eks_cluster_policy" { + policy_arn = "arn:aws:iam::aws:policy/AmazonEKSClusterPolicy" + role = 
aws_iam_role.eks_cluster_role.name +} + +resource "aws_iam_role_policy_attachment" "eks_vpc_resource_controller" { + policy_arn = "arn:aws:iam::aws:policy/AmazonEKSVPCResourceController" + role = aws_iam_role.eks_cluster_role.name } diff --git a/manifests/modules/resiliency/.workshop/terraform/outputs.tf b/manifests/modules/resiliency/.workshop/terraform/outputs.tf index 413de0df1..a584978a7 100644 --- a/manifests/modules/resiliency/.workshop/terraform/outputs.tf +++ b/manifests/modules/resiliency/.workshop/terraform/outputs.tf @@ -1,10 +1,14 @@ output "environment_variables" { description = "Environment variables to be added to the IDE shell" value = { - LBC_CHART_VERSION = var.load_balancer_controller_chart_version - LBC_ROLE_ARN = module.eks_blueprints_addons.aws_load_balancer_controller.iam_role_arn - FIS_ROLE_ARN = aws_iam_role.fis_role.arn - RANDOM_SUFFIX = random_id.suffix.hex - SCRIPT_DIR = "/manifests/modules/resiliency/scripts" + LBC_CHART_VERSION = var.load_balancer_controller_chart_version + LBC_ROLE_ARN = module.eks_blueprints_addons.aws_load_balancer_controller.iam_role_arn + FIS_ROLE_ARN = aws_iam_role.fis_role.arn + RANDOM_SUFFIX = random_id.suffix.hex + SCRIPT_DIR = "/manifests/modules/resiliency/scripts" + CANARY_ROLE_ARN = aws_iam_role.canary_role.arn + EKS_CLUSTER_ROLE_ARN = aws_iam_role.eks_cluster_role.arn + PRIMARY_REGION = "us-west-2" + SECONDARY_REGION = "us-east-2" } } diff --git a/manifests/modules/resiliency/high-availability/multi_az/add_us_east_2_patch.yaml b/manifests/modules/resiliency/high-availability/multi_az/add_us_east_2_patch.yaml new file mode 100644 index 000000000..b2a276fde --- /dev/null +++ b/manifests/modules/resiliency/high-availability/multi_az/add_us_east_2_patch.yaml @@ -0,0 +1,41 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: ui + namespace: ui +spec: + replicas: 9 # Total number of replicas + template: + spec: + affinity: + nodeAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 60 + preference: + matchExpressions: + - key: topology.kubernetes.io/zone + operator: In + values: + - us-west-2a + - us-west-2b + - us-west-2c + - weight: 40 + preference: + matchExpressions: + - key: topology.kubernetes.io/zone + operator: In + values: + - us-east-2a + - us-east-2b + - us-east-2c + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app + operator: In + values: + - ui + topologyKey: "kubernetes.io/hostname" diff --git a/manifests/modules/resiliency/high-availability/multi_az/kustomization.yaml b/manifests/modules/resiliency/high-availability/multi_az/kustomization.yaml new file mode 100644 index 000000000..32bf6179b --- /dev/null +++ b/manifests/modules/resiliency/high-availability/multi_az/kustomization.yaml @@ -0,0 +1,8 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +resources: + - ../../../../../manifests/base-application/ui + +patches: + - path: add_us_east_2_patch.yaml diff --git a/manifests/modules/resiliency/high-availability/rbac/chaos-mesh-role.yaml b/manifests/modules/resiliency/high-availability/rbac/chaos-mesh-role.yaml deleted file mode 100644 index 5e5981a82..000000000 --- a/manifests/modules/resiliency/high-availability/rbac/chaos-mesh-role.yaml +++ /dev/null @@ -1,12 +0,0 @@ -apiVersion: rbac.authorization.k8s.io/v1 -kind: Role -metadata: - namespace: ui - name: chaos-mesh-role -rules: - - apiGroups: ["chaos-mesh.org"] - resources: ["podchaos"] - verbs: ["create", "delete", 
"get", "list", "patch", "update", "watch"] - - apiGroups: [""] - resources: ["pods"] - verbs: ["get", "list", "watch"] diff --git a/manifests/modules/resiliency/high-availability/rbac/chaos-mesh-rolebinding.yaml b/manifests/modules/resiliency/high-availability/rbac/chaos-mesh-rolebinding.yaml deleted file mode 100644 index 338d88c3b..000000000 --- a/manifests/modules/resiliency/high-availability/rbac/chaos-mesh-rolebinding.yaml +++ /dev/null @@ -1,13 +0,0 @@ -apiVersion: rbac.authorization.k8s.io/v1 -kind: RoleBinding -metadata: - name: chaos-mesh-rolebinding - namespace: ui -subjects: - - kind: User - name: PLACEHOLDER - namespace: ui -roleRef: - kind: Role - name: chaos-mesh-role - apiGroup: rbac.authorization.k8s.io diff --git a/manifests/modules/resiliency/scripts/create-second-az.sh b/manifests/modules/resiliency/scripts/create-second-az.sh new file mode 100755 index 000000000..09d9c28bb --- /dev/null +++ b/manifests/modules/resiliency/scripts/create-second-az.sh @@ -0,0 +1,52 @@ +#!/bin/bash + +# Ensure SCRIPT_DIR is set +if [ -z "$SCRIPT_DIR" ]; then + echo "Error: SCRIPT_DIR environment variable is not set." + exit 1 +fi + +# Ensure PRIMARY_REGION and SECONDARY_REGION are set +if [ -z "$PRIMARY_REGION" ] || [ -z "$SECONDARY_REGION" ]; then + echo "Error: PRIMARY_REGION and SECONDARY_REGION must be set." + exit 1 +fi + +# Function to run multi-az-get-pods.sh and display region +run_multi_az_script() { + local region=$1 + echo "Current region: $region" + echo "Running multi-az-get-pods.sh..." + $SCRIPT_DIR/multi-az-get-pods.sh + echo "----------------------------------------" +} + +# Run multi-az-get-pods.sh in PRIMARY_REGION +aws configure set default.region $PRIMARY_REGION +run_multi_az_script $PRIMARY_REGION + +# Switch to SECONDARY_REGION +echo "Switching to SECONDARY_REGION: $SECONDARY_REGION" +aws configure set default.region $SECONDARY_REGION + +# Prepare environment for resiliency module +echo "Preparing environment for resiliency module..." +prepare-environment resiliency + +# Verify the EKS cluster in SECONDARY_REGION +echo "Verifying EKS cluster in SECONDARY_REGION..." +aws eks list-clusters + +# Check node groups in SECONDARY_REGION +CLUSTER_NAME=$(aws eks list-clusters --query 'clusters[0]' --output text) +echo "Checking node groups for cluster: $CLUSTER_NAME" +aws eks list-nodegroups --cluster-name $CLUSTER_NAME + +# Switch back to PRIMARY_REGION +echo "Switching back to PRIMARY_REGION: $PRIMARY_REGION" +aws configure set default.region $PRIMARY_REGION + +# Run multi-az-get-pods.sh one last time in PRIMARY_REGION +run_multi_az_script $PRIMARY_REGION + +echo "Setup complete. 
\ No newline at end of file diff --git a/manifests/modules/resiliency/scripts/eks_workshop_canary_script.js b/manifests/modules/resiliency/scripts/eks_workshop_canary_script.js new file mode 100644 index 000000000..74deb4591 --- /dev/null +++ b/manifests/modules/resiliency/scripts/eks_workshop_canary_script.js @@ -0,0 +1,30 @@ +const synthetics = require("Synthetics"); +const log = require("SyntheticsLogger"); + +const pageLoadBlueprint = async function () { + const PAGE_LOAD_TIMEOUT = 30; + const URL = process.env.INGRESS_URL || "http://localhost"; // Use environment variable or fallback + + let page = await synthetics.getPage(); + + await synthetics.executeStep("Navigate to " + URL, async function () { + const response = await page.goto(URL, { + waitUntil: "domcontentloaded", + timeout: PAGE_LOAD_TIMEOUT * 1000, + }); + + // Verify the page loaded successfully + if (response.status() !== 200) { + throw new Error(`Failed to load page. Status code: ${response.status()}`); + } + }); + + await synthetics.executeStep("Verify page content", async function () { + const pageTitle = await page.title(); + log.info("Page title: " + pageTitle); + }); +}; + +exports.handler = async () => { + return await pageLoadBlueprint(); +}; diff --git a/manifests/modules/resiliency/scripts/multi-az-get-pods.sh b/manifests/modules/resiliency/scripts/multi-az-get-pods.sh new file mode 100755 index 000000000..f47649eb8 --- /dev/null +++ b/manifests/modules/resiliency/scripts/multi-az-get-pods.sh @@ -0,0 +1,26 @@ +#!/bin/bash + +GREEN='\033[0;32m' +RED='\033[0;31m' +NC='\033[0m' # No Color + +REGIONS=("us-west-2" "us-east-2") + +for REGION in "${REGIONS[@]}" +do + echo "Region: $REGION" + for az in a b c + do + AZ=$REGION$az + echo -n "------" + echo -n -e "${GREEN}$AZ${NC}" + echo "------" + for node in $(kubectl get nodes -l topology.kubernetes.io/zone=$AZ --no-headers 2>/dev/null | grep -v NotReady | cut -d " " -f1) + do + echo -e " ${RED}$node:${NC}" + kubectl get pods -n ui --no-headers --field-selector spec.nodeName=${node} 2>/dev/null | while read line; do echo " ${line}"; done + done + echo "" + done + echo "" +done \ No newline at end of file diff --git a/manifests/modules/resiliency/scripts/verify-cluster.sh b/manifests/modules/resiliency/scripts/verify-cluster.sh index 56e2844df..2e6329b90 100755 --- a/manifests/modules/resiliency/scripts/verify-cluster.sh +++ b/manifests/modules/resiliency/scripts/verify-cluster.sh @@ -5,6 +5,7 @@ DESIRED_REPLICAS=5 MAX_WAIT_TIME=300 # 5 minutes POLL_INTERVAL=10 # 10 seconds NAMESPACE="ui" +EXPECTED_READY_NODES=3 print_header() { echo -e "\n==== $1 ====\n" @@ -26,6 +27,20 @@ wait_for_condition() { print_header "Checking Current Pod Distribution" $SCRIPT_DIR/get-pods-by-az.sh +print_header "Waiting for nodes to be Ready" +total_nodes=$(kubectl get nodes --no-headers | wc -l) +echo "Total nodes in the cluster: $total_nodes" +echo "Waiting for $EXPECTED_READY_NODES nodes to be in Ready state" +if wait_for_condition "[ \$(kubectl get nodes --no-headers | grep ' Ready ' | wc -l) -eq $EXPECTED_READY_NODES ]"; then + echo -e "\n✅ $EXPECTED_READY_NODES nodes are in Ready state." +else + echo -e "\n⚠️ Warning: $EXPECTED_READY_NODES nodes did not reach Ready state within the timeout period." 
+ exit 1 +fi + +print_header "Checking Current Pod Distribution" +$SCRIPT_DIR/get-pods-by-az.sh + print_header "Node Information" kubectl get nodes -o wide diff --git a/website/docs/resiliency/high-availability/01-setup.md b/website/docs/resiliency/high-availability/01-setup.md index 03b327af8..31821d93a 100644 --- a/website/docs/resiliency/high-availability/01-setup.md +++ b/website/docs/resiliency/high-availability/01-setup.md @@ -4,11 +4,6 @@ sidebar_position: 1 description: "Learn how to scale your pods, add Pod Anti-Affinity configurations, and use a helper script to visualize pod distribution." --- -TODO: - -- Update Name -- Update/Remove Verification - This guide outlines steps to enhance the resilience of a UI service by implementing high availability practices. We'll cover scaling the UI service, implementing pod anti-affinity, and using a helper script to visualize pod distribution across availability zones. ## Scaling and Pod Anti-Affinity @@ -21,67 +16,43 @@ Here's the content of our patch file: manifests/modules/resiliency/high-availability/config/scale_and_affinity_patch.yaml ``` -Apply the changes using Kustomize patch and - - +Apply the changes using Kustomize patch and +[Kustomization file](https://github.com/VAR::MANIFESTS_OWNER/VAR::MANIFESTS_REPOSITORY/tree/VAR::MANIFESTS_REF/manifests/modules/resiliency/high-availability/config/kustomization.yaml): ```bash $ kubectl delete deployment ui -n ui $ kubectl apply -k /manifests/modules/resiliency/high-availability/config/ ``` -## Create Helper Script: Get Pods by AZ - -The `get-pods-by-az.sh` script helps visualize the distribution of Kubernetes pods across different availability zones in the terminal. You can view the script file +## Verify Retail Store Accessibility - - -To make this script executable: +After applying these changes, it's important to verify that your retail store is accessible: ```bash -$ chmod +x $SCRIPT_DIR/get-pods-by-az.sh +$ wait-for-lb $(kubectl get ingress -n ui -o jsonpath='{.items[0].status.loadBalancer.ingress[0].hostname}') ``` -### Script Execution - -To run the script and see the distribution of pods across availability zones, execute: - -```bash -$ $SCRIPT_DIR/get-pods-by-az.sh -``` +Once this command completes, it will output a URL. Open this URL in a new browser tab to verify that your retail store is accessible and functioning correctly. :::tip -Use this to quickly assess the distribution of your pods across multiple zones. +If the retail store doesn't load immediately, wait a few moments and refresh the page. It may take a short time for all components to become fully operational. ::: -## Verification +## Helper Script: Get Pods by AZ -After applying these changes, verify the setup: +The `get-pods-by-az.sh` script helps visualize the distribution of Kubernetes pods across different availability zones in the terminal. You can view the script file on github [here](https://github.com/VAR::MANIFESTS_OWNER/VAR::MANIFESTS_REPOSITORY/tree/VAR::MANIFESTS_REF/manifests/modules/resiliency/scripts/get-pods-by-az.sh). -1. Check for 5 running UI pods: - -```bash -$ kubectl get pods -n ui -``` - -2. Verify pod distribution across nodes: - -```bash -$ kubectl get pods -n ui -o=jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.spec.nodeName}{"\n"}' -``` +### Script Execution -3. 
Check that AWS Load Balancer Controller is installed and working: +To run the script and see the distribution of pods across availability zones, execute: ```bash -$ kubectl get pods -n kube-system | grep aws-load-balancer-controller -$ kubectl get ingress --all-namespaces +$ $SCRIPT_DIR/get-pods-by-az.sh ``` -4. Ensure the Load Balancer is working and access to the Retail URL: - -```bash -$ wait-for-lb $(kubectl get ingress -n ui -o jsonpath='{.items[0].status.loadBalancer.ingress[0].hostname}') -``` +:::tip +Use this to quickly assess the distribution of your pods across multiple zones. +::: :::info For more information on these changes, check out these sections: diff --git a/website/docs/resiliency/high-availability/02-pod-failure.md b/website/docs/resiliency/high-availability/02-pod-failure.md index cbde69d2c..b1bcc55c8 100644 --- a/website/docs/resiliency/high-availability/02-pod-failure.md +++ b/website/docs/resiliency/high-availability/02-pod-failure.md @@ -6,45 +6,62 @@ description: "Simulate pod failure in your environment using ChaosMesh to test t ## Overview -TODO: +In this lab, you'll simulate a pod failure within your Kubernetes environment to observe how the system responds and recovers. This experiment is designed to test the resiliency of your application under adverse conditions, specifically when a pod unexpectedly fails. -- fix file visual? -- add more information about this lab and a conclusion -- Note that this experiment is repeatable -- Note that retail store should still work even when the pod fails +The `pod-failure.sh` script utilizes Chaos Mesh, a powerful chaos engineering platform for Kubernetes, to simulate a pod failure. This controlled experiment allows you to: -In this experiment, you'll simulate a pod failure within your Kubernetes environment to observe how the system responds. The `pod-failure.sh` script will simulate a pod failure using Chaos Mesh. This is the script we will be using: +1. Observe the system's immediate response to pod failure +2. Monitor the automatic recovery process +3. Verify that your application remains available despite the simulated failure + +This experiment is repeatable, allowing you to run it multiple times to ensure consistent behavior and to test various scenarios or configurations. This is the script we will be using: ```file manifests/modules/resiliency/scripts/pod-failure.sh ``` -To make this script executable: - -```bash -$ chmod +x $SCRIPT_DIR/pod-failure.sh -``` - ## Running the Experiment -Run the experiment and monitor the effects on pod distribution: +To simulate the pod failure and monitor its effects, run the following command: ```bash $ $SCRIPT_DIR/pod-failure.sh && SECONDS=0; while [ $SECONDS -lt 30 ]; do clear; $SCRIPT_DIR/get-pods-by-az.sh; sleep 1; done ``` -This command initiates the pod failure and monitors the pod distribution for 30 seconds to observe how the system handles the failure. You should see one pod dissapear and then reappear. +This command does the following: + +1. Initiates the pod failure simulation using the `pod-failure.sh` script +2. Monitors the pod distribution across Availability Zones (AZs) for 30 seconds +3. Updates the display every second to show real-time changes + +During the experiment, you should observe one pod disappearing and then reappearing, demonstrating the system's ability to detect and recover from failures. 
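If you would also like to see the failure and recovery reflected in the Kubernetes event stream, the commands below are one way to do it. This is a sketch using standard kubectl commands; the exact events you see depend on how Chaos Mesh injects the failure:

```bash
$ kubectl get events -n ui --sort-by='.lastTimestamp' | tail -n 10
$ kubectl get pods -n ui -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.status.containerStatuses[0].restartCount}{"\n"}{end}'
```

Recent events against the `ui` pods and an increased restart count on one of them indicate that the injected failure was detected and recovered from.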
-Check the status of pods in the `ui` namespace: +To get a more detailed view of the pods in the `ui` namespace, use the following command: ```bash $ kubectl get pods -n ui -o wide ``` +This will show you the status, IP addresses, and nodes for each pod in the `ui` namespace. + ## Verify Retail Store Availability -To ensure that the retail store is operational, check its availability with the url fetched with this command: +An essential aspect of this experiment is to ensure that your retail store application remains operational throughout the pod failure and recovery process. To verify the availability of the retail store, use the following command to fetch and access the store's URL: ```bash $ wait-for-lb $(kubectl get ingress -n ui -o jsonpath='{.items[0].status.loadBalancer.ingress[0].hostname}') ``` + +This command retrieves the load balancer hostname for the ingress and waits for it to become available. Once ready, you can access the retail store through this URL to confirm that it's still functioning correctly despite the simulated pod failure. + +## Conclusion + +This pod failure simulation demonstrates the resilience of your Kubernetes-based application. By intentionally causing a pod to fail, you can observe: + +1. The system's ability to detect failures quickly +2. Kubernetes' automatic rescheduling and recovery of failed pods +3. The application's continued availability during pod failures + +Remember that the retail store should remain operational even when a pod fails, showcasing the high availability and fault tolerance of your Kubernetes setup. This experiment helps validate your application's resilience and can be repeated as needed to ensure consistent behavior across different scenarios or after making changes to your infrastructure. + +By regularly performing such chaos engineering experiments, you can build confidence in your system's ability to withstand and recover from various types of failures, ultimately leading to a more robust and reliable application. diff --git a/website/docs/resiliency/high-availability/03-node-failure-no-fis.md b/website/docs/resiliency/high-availability/03-node-failure-no-fis.md index 7e154f2b0..494bddbd1 100644 --- a/website/docs/resiliency/high-availability/03-node-failure-no-fis.md +++ b/website/docs/resiliency/high-availability/03-node-failure-no-fis.md @@ -6,33 +6,21 @@ description: "Manually simulate a node failure in your Kubernetes environment to # Simulating Node Failure without FIS -TODO: - -- add information and concluding thoughts -- note that this is repeatable -- should see node failure after about a minute, pods come return shortly after to current working nodes, node comes back online after about 2 minutes -- should I make more things following the verify-cluster.sh visual? -- Load balancer does not appear to work although it should -- Rather than the seeing whole script, show expected output? -- Update script to wait for 3 nodes online - ## Overview -This experiment simulate a node failure manually in your Kubernetes cluster to understand the impact on your deployed applications, particularly focusing on the retail store application's availability. The `node-failure.sh` script will manually stop an EC2 instance to simulate node failure. Here is the script we will use: +This experiment simulates a node failure manually in your Kubernetes cluster to understand the impact on your deployed applications, particularly focusing on the retail store application's availability. 
By deliberately causing a node to fail, we can observe how Kubernetes handles the failure and maintains the overall health of the cluster. + +The `node-failure.sh` script will manually stop an EC2 instance to simulate node failure. Here is the script we will use: ```file manifests/modules/resiliency/scripts/node-failure.sh ``` -To make this script executable: - -```bash -$ chmod +x $SCRIPT_DIR/node-failure.sh -``` +It's important to note that this experiment is repeatable, allowing you to run it multiple times to ensure consistent behavior and to test various scenarios or configurations. ## Running the Experiment -Run the node failure experiment and monitor the effects on pod distribution: +To simulate the node failure and monitor its effects, run the following command: ```bash $ $SCRIPT_DIR/node-failure.sh && SECONDS=0; while [ $SECONDS -lt 120 ]; do clear; $SCRIPT_DIR/get-pods-by-az.sh; sleep 1; done @@ -40,43 +28,67 @@ $ $SCRIPT_DIR/node-failure.sh && SECONDS=0; while [ $SECONDS -lt 120 ]; do clear This command will stop the selected EC2 instance and monitor the pod distribution for 2 minutes, observing how the system redistributes workloads. -During the experiment, you should observe the following: +During the experiment, you should observe the following sequence of events: -- One node disappearing from the list -- Kubernetes will detect the node failure and reschedule the pods that were running on the failed node -- These pods being redistributed to the remaining healthy nodes -- The failed node will come back online +1. After about 1 minute, you'll see one node disappear from the list. This represents the simulated node failure. +2. Shortly after the node failure, you'll notice pods being redistributed to the remaining healthy nodes. Kubernetes detects the node failure and automatically reschedules the affected pods. +3. Approximately 2 minutes after the initial failure, the failed node will come back online. -The total number of running pods should remain constant, ensuring application availability. +Throughout this process, the total number of running pods should remain constant, ensuring application availability. -## Verify Retail Store Availability +## Verifying Cluster Recovery + +While waiting for the node to finish coming back online, we will verify the cluster's self-healing capabilities and potentially rebalance the pod distribution if necessary. Since the cluster often recovers on its own, we'll focus on checking the current state and ensuring an optimal distribution of pods. -After simulating the node failure, verify if the retail store application remains accessible: +Use the following [script](https://github.com/VAR::MANIFESTS_OWNER/VAR::MANIFESTS_REPOSITORY/tree/VAR::MANIFESTS_REF/manifests/modules/resiliency/scripts/verify-cluster.sh) to verify the cluster state and rebalance pods: ```bash -$ wait-for-lb $(kubectl get ingress -n ui -o jsonpath='{.items[0].status.loadBalancer.ingress[0].hostname}') +$ $SCRIPT_DIR/verify-cluster.sh ``` -## Verifying Cluster Recovery +This script will: + +- Wait for nodes to come back online +- Count the number of nodes and ui pods +- Check if the pods are evenly distributed across the nodes -After simulating the node failure, we'll verify the cluster's self-healing and potentially rebalance the pod distribution if necessary. Since the cluster often recovers on its own, we'll focus on checking the current state and ensuring an optimal distribution of pods. 
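If you prefer to check the recovery by hand before running the helper script, a minimal sketch using standard kubectl commands looks like this:

```bash
$ kubectl wait --for=condition=Ready nodes --all --timeout=300s
$ kubectl get pods -n ui -o wide
```

Once every node reports Ready, the pod listing should show the `ui` replicas spread across the healthy and replacement nodes.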
+## Verify Retail Store Availability -Use the following +After simulating the node failure, we can verify that the retail store application remains accessible. Use the following command to check its availability: - +```bash +$ wait-for-lb $(kubectl get ingress -n ui -o jsonpath='{.items[0].status.loadBalancer.ingress[0].hostname}') +``` -to verify the cluster state and rebalance pods: +This command retrieves the load balancer hostname for the ingress and waits for it to become available. Once ready, you can access the retail store through this URL to confirm that it's still functioning correctly despite the simulated node failure. + +:::caution +The retail url may take 10 minutes to become operational. You can optionally continue on with the lab by pressing `ctrl` + `z` to move operation to the background. To access it again input: ```bash -$ chmod +x $SCRIPT_DIR/verify-cluster.sh -$ $SCRIPT_DIR/verify-cluster.sh +$ fg ``` -This script will: +The url may not become operational by the time `wait-for-lb` times out. In that case, it should become operational after running the command again: -- Counts the number of nodes and ui pods -- Checks if the pods are evenly distributed across the nodes +```bash +$ wait-for-lb $(kubectl get ingress -n ui -o jsonpath='{.items[0].status.loadBalancer.ingress[0].hostname}') +``` + +::: ## Conclusion -add concluding thoughts +This node failure simulation demonstrates the robustness and self-healing capabilities of your Kubernetes cluster. Key observations and lessons from this experiment include: + +1. Kubernetes' ability to quickly detect node failures and respond accordingly. +2. The automatic rescheduling of pods from the failed node to healthy nodes, ensuring continuity of service. +3. The cluster's self-healing process, bringing the failed node back online after a short period. +4. The importance of proper resource allocation and pod distribution to maintain application availability during node failures. + +By regularly performing such experiments, you can: + +- Validate your cluster's resilience to node failures. +- Identify potential weaknesses in your application's architecture or deployment strategy. +- Gain confidence in your system's ability to handle unexpected infrastructure issues. +- Refine your incident response procedures and automation. diff --git a/website/docs/resiliency/high-availability/04-node-failure-partial-fis.md b/website/docs/resiliency/high-availability/04-node-failure-partial-fis.md index 4b9091fd5..0d5a738db 100644 --- a/website/docs/resiliency/high-availability/04-node-failure-partial-fis.md +++ b/website/docs/resiliency/high-availability/04-node-failure-partial-fis.md @@ -6,20 +6,9 @@ description: "Simulate a partial node failures in your Kubernetes environment us # Simulating Partial Node Failure with FIS -TODO: - -- More FIS info? -- More information about the experiment -- Explain what FIS is doing different, what the experiment is doing -- should see a 1 node failing after about a minute, pods to come back up after 2 and a half minutes, and the node come back up after -- check to make sure retail app stays up -- retail app apears to not work -> need to fix load balancer configs -- A conclusion / learning from experiment -- Note that FIS can allow automatic testing for failure and whatever else is cool - ## AWS Fault Injection Simulator (FIS) Overview -AWS Fault Injection Simulator is a fully managed service that helps you perform fault injection experiments on your AWS workloads. 
In the context of EKS, FIS allows us to simulate various failure scenarios, which is crucial for: +AWS Fault Injection Simulator (FIS) is a fully managed service that enables you to perform controlled fault injection experiments on your AWS workloads. FIS allows you to simulate various failure scenarios, which is crucial for: 1. Validating high availability configurations 2. Testing auto-scaling and self-healing capabilities @@ -31,18 +20,31 @@ By using FIS, you can: - Discover hidden bugs and performance bottlenecks - Observe how your systems behave under stress - Implement and validate automated recovery procedures +- Conduct repeatable experiments to ensure consistent behavior In our FIS experiment, we'll simulate a partial node failure in our EKS cluster and observe how our application responds, providing practical insights into building resilient systems. :::info -For more information on AWS FIS check out: +For more information on AWS FIS, check out: - [What is AWS Fault Injection Service?](https://docs.aws.amazon.com/fis/latest/userguide/what-is.html) +- [AWS Fault Injection Simulator Console](https://console.aws.amazon.com/fis/home) ::: +## Experiment Details + +This experiment differs from the previous manual node failure simulation in several ways: + +1. Automated execution: FIS manages the experiment, allowing for more controlled and repeatable tests. +2. Partial failure: Instead of simulating a complete node failure, we're testing a scenario where a portion of the nodes fail. +3. Scale: FIS allows us to target multiple nodes simultaneously, providing a more realistic large-scale failure scenario. +4. Precision: We can specify exact percentages of instances to terminate, giving us fine-grained control over the experiment. + +In this experiment, FIS will terminate 66% of the instances in two node groups, simulating a significant partial failure of our cluster. + ## Creating the Node Failure Experiment -Create a new AWS FIS experiment template to simulate the node failure: +Create a new AWS FIS experiment template to simulate the partial node failure: ```bash $ NODE_EXP_ID=$(aws fis create-experiment-template --cli-input-json '{"description":"NodeDeletion","targets":{"Nodegroups-Target-1":{"resourceType":"aws:eks:nodegroup","resourceTags":{"eksctl.cluster.k8s.io/v1alpha1/cluster-name":"eks-workshop"},"selectionMode":"COUNT(2)"}},"actions":{"nodedeletion":{"actionId":"aws:eks:terminate-nodegroup-instances","parameters":{"instanceTerminationPercentage":"66"},"targets":{"Nodegroups":"Nodegroups-Target-1"}}},"stopConditions":[{"source":"none"}],"roleArn":"'$FIS_ROLE_ARN'","tags":{"ExperimentSuffix": "'$RANDOM_SUFFIX'"}}' --output json | jq -r '.experimentTemplate.id') @@ -53,30 +55,67 @@ $ NODE_EXP_ID=$(aws fis create-experiment-template --cli-input-json '{"descripti Execute the FIS experiment to simulate the node failure and monitor the response: ```bash -$ aws fis start-experiment --experiment-template-id $NODE_EXP_ID --output json && SECONDS=0; while [ $SECONDS -lt 300 ]; do clear; $SCRIPT_DIR/get-pods-by-az.sh; sleep 1; done +$ aws fis start-experiment --experiment-template-id $NODE_EXP_ID --output json && SECONDS=0; while [ $SECONDS -lt 180 ]; do clear; $SCRIPT_DIR/get-pods-by-az.sh; sleep 1; done ``` -This will trigger the node failure and begin monitoring the pods for 5 minutes, observing how the cluster responds to losing part of its capacity. 
+This command triggers the node failure and monitors the pods for 3 minutes, allowing you to observe how the cluster responds to losing a significant portion of its capacity. + +During the experiment, you should observe the following: + +1. After about 1 minute, you'll see one or more nodes disappear from the list, representing the simulated partial node failure. +2. Over the next 2 minutes, you'll notice pods being rescheduled and redistributed to the remaining healthy nodes. +3. Shortly after you'll see the new node coming online to replace the terminated one. + +Your retail url should stay operational unlike the node failure without FIS. + +:::note +To verify clusters and rebalance pods, you can run: + +```bash +$ $SCRIPT_DIR/verify-cluster.sh +``` + +::: ## Verifying Retail Store Availability -After simulating the node failure, check if the retail store application remains operational: +Ensure that your retail store application remains operational throughout the partial node failure. Use the following command to check its availability: ```bash $ wait-for-lb $(kubectl get ingress -n ui -o jsonpath='{.items[0].status.loadBalancer.ingress[0].hostname}') ``` -Despite a partial node failure, the retail store continues to serve traffic, demonstrating the resilience of your deployment setup. +:::tip +The retail url may take 10 minutes to become operational. +::: + +Despite the partial node failure, the retail store should continue to serve traffic, demonstrating the resilience of your deployment setup. :::caution Partial node failures test the limits of your application's failover capabilities. Monitor and determine how well your applications and services recover from such events. ::: -:::note -To verify clusters and rebalance pods you can run: +## Conclusion -```bash -$ $SCRIPT_DIR/verify-cluster.sh -``` +This partial node failure simulation using AWS FIS demonstrates several key aspects of your Kubernetes cluster's resilience: -::: +1. Automatic detection of node failures by Kubernetes +2. Swift rescheduling of pods from failed nodes to healthy ones +3. The cluster's ability to maintain service availability during significant infrastructure disruptions +4. Auto-scaling capabilities to replace failed nodes + +Key takeaways from this experiment: + +- The importance of distributing your workload across multiple nodes and availability zones +- The value of having appropriate resource requests and limits set for your pods +- The effectiveness of Kubernetes' self-healing mechanisms +- The need for robust monitoring and alerting systems to detect and respond to node failures + +By leveraging AWS FIS for such experiments, you gain several advantages: + +1. Repeatability: You can run this experiment multiple times to ensure consistent behavior. +2. Automation: FIS allows you to schedule regular resilience tests, ensuring your system maintains its fault-tolerant capabilities over time. +3. Comprehensive testing: You can create more complex scenarios involving multiple AWS services to test your entire application stack. +4. Controlled chaos: FIS provides a safe, managed environment for conducting chaos engineering experiments without risking unintended damage to your production systems. + +Regular execution of such experiments helps build confidence in your system's resilience and provides valuable insights for continuous improvement of your architecture and operational procedures. 
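If you want to confirm what FIS actually did during the run, you can inspect the experiment from the CLI. This is a sketch using standard AWS FIS commands; substitute the experiment ID returned by the `start-experiment` call above for the placeholder:

```bash
$ aws fis list-experiments --query 'experiments[*].{id:id,status:state.status}' --output table
$ aws fis get-experiment --id <experiment-id> --query 'experiment.state'
```

An experiment that finishes in the `completed` state confirms that the fault was injected as configured.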
diff --git a/website/docs/resiliency/high-availability/05-node-failure-complete-fis.md b/website/docs/resiliency/high-availability/05-node-failure-complete-fis.md index ab5cbdd95..a7f142b71 100644 --- a/website/docs/resiliency/high-availability/05-node-failure-complete-fis.md +++ b/website/docs/resiliency/high-availability/05-node-failure-complete-fis.md @@ -6,26 +6,25 @@ description: "Demonstrates the impact of a complete node failure on a Kubernetes # Simulating Complete Node Failure with FIS -TODO: - -- Fix script to mimic last experiment again -- Why is this different than last experiment -- Explain what is happening in more detail -- Note timings -- Concluding Statement -- You should see all nodes and pods dissapear rather quickly then after about 2 minutes will start to see 1 node and pods coming online, after 4 minutes a second node will come online and 3 more pods. - ## Overview -This experiment is an extensive test that isn't necessary but demonstrates the robust capabilities of AWS Fault Injection Simulator by simulating a complete node failure in a Kubernetes cluster. +This experiment extends our previous partial node failure test to simulate a complete failure of all nodes in our EKS cluster. It demonstrates how AWS Fault Injection Simulator (FIS) can be used to test extreme scenarios and validate your system's resilience under catastrophic conditions. :::info Important -This test showcases how FIS can be used to simulate worst-case scenarios to help validate the resilience and recovery strategies of your applications. +This test simulates a worst-case scenario. It's designed for controlled environments with thoroughly tested recovery mechanisms. ::: +## Experiment Details + +Unlike the partial node failure simulation, this experiment: + +1. Terminates 100% of the instances in all node groups. +2. Tests your cluster's ability to recover from a state of complete failure. +3. Allows observation of the full recovery process, from total outage to full restoration. + ## Creating the Node Failure Experiment -Create a new AWS FIS experiment template to simulate the complete failure of all nodes in a specific node group: +Create a new AWS FIS experiment template to simulate the complete node failure: ```bash $ FULL_NODE_EXP_ID=$(aws fis create-experiment-template --cli-input-json '{"description":"NodeDeletion","targets":{"Nodegroups-Target-1":{"resourceType":"aws:eks:nodegroup","resourceTags":{"eksctl.cluster.k8s.io/v1alpha1/cluster-name":"eks-workshop"},"selectionMode":"ALL"}},"actions":{"nodedeletion":{"actionId":"aws:eks:terminate-nodegroup-instances","parameters":{"instanceTerminationPercentage":"100"},"targets":{"Nodegroups":"Nodegroups-Target-1"}}},"stopConditions":[{"source":"none"}],"roleArn":"'$FIS_ROLE_ARN'","tags":{"ExperimentSuffix": "'$RANDOM_SUFFIX'"}}' --output json | jq -r '.experimentTemplate.id') @@ -33,33 +32,56 @@ $ FULL_NODE_EXP_ID=$(aws fis create-experiment-template --cli-input-json '{"desc ## Running the Experiment -Execute the FIS experiment to simulate the complete node failure: +Execute the FIS experiment and monitor the cluster's response: ```bash $ aws fis start-experiment --experiment-template-id $FULL_NODE_EXP_ID --output json && SECONDS=0; while [ $SECONDS -lt 300 ]; do clear; $SCRIPT_DIR/get-pods-by-az.sh; sleep 1; done ``` -Monitor the cluster as it loses all node resources temporarily, observing how the Kubernetes system and your application respond. +This command will show the pods distribution over 5 minutes while we observe the experiment. 
We should see:
-## Verifying Retail Store Availability
+1. Shortly after the experiment is initiated, all nodes and pods disappear.
+2. After about 2 minutes, the first node and some pods will come back online.
+3. Around 4 minutes, a second node appears and more pods start up.
+4. At 5 minutes, continued recovery as the last node comes online.
-After simulating the node failure, check if the retail store application is still operational:
+Due to the severity of the experiment, the retail store URL will not stay operational during testing. The URL should come back up after the final node is operational.
+
+:::note
+To verify clusters and rebalance pods, you can run:
```bash
-$ wait-for-lb $(kubectl get ingress -n ui -o jsonpath='{.items[0].status.loadBalancer.ingress[0].hostname}')
+$ $SCRIPT_DIR/verify-cluster.sh
```
-This command helps confirm that despite complete node failure, the application begins to recover as the Kubernetes cluster auto-scales back up.
-
-:::caution
-This test can cause significant disruption, so it's recommended for use only in controlled environments where recovery mechanisms are thoroughly tested.
:::
-:::note
-To verify clusters and rebalance pods you can run:
```bash
-$ $SCRIPT_DIR/verify-cluster.sh
```
:::
+
+## Verifying Retail Store Availability
+
Check the retail store application's recovery:
```bash
+$ wait-for-lb $(kubectl get ingress -n ui -o jsonpath='{.items[0].status.loadBalancer.ingress[0].hostname}')
```
+:::tip
+The retail URL may take 10 minutes to become operational.
:::
+
+## Conclusion
+
This experiment demonstrates:
+
+1. Your cluster's response to catastrophic failure.
+2. Effectiveness of auto-scaling in replacing all failed nodes.
+3. Kubernetes' ability to reschedule all pods onto new nodes.
+4. Total system recovery time from complete failure.
+
+Key learnings:
+
+- Importance of robust auto-scaling configurations.
+- Value of effective pod priority and preemption settings.
+- Need for architectures that can withstand complete cluster failure.
+- Significance of regular testing of extreme scenarios.
+
+By using FIS for such tests, you can safely simulate catastrophic failures, validate recovery procedures, identify critical dependencies, and measure recovery times. This helps in refining your disaster recovery plans and improving overall system resilience.
diff --git a/website/docs/resiliency/high-availability/06-az-failure.md b/website/docs/resiliency/high-availability/06-az-failure.md
deleted file mode 100644
index 1091b41e7..000000000
--- a/website/docs/resiliency/high-availability/06-az-failure.md
+++ /dev/null
@@ -1,134 +0,0 @@
----
-title: "Simulating AZ Failure"
-sidebar_position: 6
-description: "This experiment simulates an Availability Zone failure to test the resilience of your Kubernetes environment hosted on AWS EKS."
----
-
-# Simulating AZ Failure
-
-TODO:
-
-- Fix canary
-- Check AZ failure still works
-- add specific cloudwatch iam role
-- add conclustion
-
-## Overview
-
-This experiment simulates an Availability Zone (AZ) failure, demonstrating how robust your application is when faced with significant disruptions. It leverages AWS Fault Injection Simulator (FIS) and additional AWS services to test the resilience of the system under the stress of an AZ going offline.
-
-## Preparation
-
-### Setting up a Synthetic Canary
-
-Before starting the experiment, set up a synthetic canary for heartbeat monitoring:
-
-1.
First, create an S3 bucket for the canary artifacts: - -```bash -$ BUCKET_NAME="eks-workshop-canary-artifacts-$(date +%s)" -$ aws s3 mb s3://$BUCKET_NAME --region us-west-2 -``` - -2. Create the canary: - -Set up the blueprint: - -```bash -$ INGRESS_URL=$(kubectl get ingress -n ui -o jsonpath='{.items[0].status.loadBalancer.ingress[0].hostname}') -$ cat < canary_script.js -var synthetics = require('Synthetics'); -var log = require('SyntheticsLogger'); - -const pageLoadBlueprint = async function () { - const PAGE_LOAD_TIMEOUT = 30; - const URL = 'http://${INGRESS_URL}'; - let page = await synthetics.getPage(); - await synthetics.executeStep('Navigate to ' + URL, async function () { - await page.goto(URL, {waitUntil: 'domcontentloaded', timeout: PAGE_LOAD_TIMEOUT * 1000}); - }); - await synthetics.executeStep('Page loaded successfully', async function () { - log.info('Page loaded successfully'); - }); -}; - -exports.handler = async () => { - return await pageLoadBlueprint(); -}; -EOF -$ aws s3 cp canary_script.js s3://$BUCKET_NAME/canary-script/canary_script.js -``` - -Create a synthetic canary: - -```bash -$ INGRESS_URL=$(kubectl get ingress -n ui -o jsonpath='{.items[0].status.loadBalancer.ingress[0].hostname}') -$ aws synthetics create-canary \ - --name eks-workshop-canary \ - --artifact-s3-location "s3://$BUCKET_NAME/canary-artifacts/" \ - --execution-role-arn $FIS_ROLE_ARN \ - --runtime-version syn-nodejs-puppeteer-9.0 \ - --schedule Expression="rate(1 minute)" \ - --code S3Bucket=$BUCKET_NAME,S3Key=canary-script/canary_script.js,Handler="canary_script.handler" \ - --region us-west-2 -$ sleep 30 -$ aws synthetics start-canary --name eks-workshop-canary --region us-west-2 -``` - -3. Create a CloudWatch alarm for the canary: - -```bash -$ aws cloudwatch put-metric-alarm \ - --alarm-name "eks-workshop-canary-alarm" \ - --metric-name SuccessPercent \ - --namespace CloudWatchSynthetics \ - --statistic Average \ - --period 60 \ - --threshold 95 \ - --comparison-operator LessThanThreshold \ - --dimensions Name=CanaryName,Value=eks-workshop-canary \ - --evaluation-periods 1 \ - --alarm-description "Alarm when Canary success rate drops below 95%" \ - --unit Percent \ - --region us-west-2 -``` - -This sets up a canary that checks the health of your application every minute and a CloudWatch alarm that triggers if the success percentage falls below 95%. - -### Setting up the Experiment - -Retrieve the Auto Scaling Group (ASG) name associated with your EKS cluster: - -```bash -$ ASG_NAME_BOTH=$(aws autoscaling describe-auto-scaling-groups --query "AutoScalingGroups[? Tags[? 
(Key=='eks:cluster-name') && Value=='eks-workshop']]".AutoScalingGroupName --output text) -$ ASG_NAME=$(echo $ASG_NAME_BOTH | awk '{print $1}') -``` - -Create the FIS experiment template to simulate the AZ failure: - -```bash -$ ZONE_EXP_ID=$(aws fis create-experiment-template --cli-input-json '{"description":"publicdocument-azfailure","targets":{},"actions":{"azfailure":{"actionId":"aws:ssm:start-automation-execution","parameters":{"documentArn":"arn:aws:ssm:us-west-2::document/AWSResilienceHub-SimulateAzOutageInAsgTest_2020-07-23","documentParameters":"{\"AutoScalingGroupName\":\"'$ASG_NAME'\",\"CanaryAlarmName\":\"eks-workshop-canary-alarm\",\"AutomationAssumeRole\":\"arn:aws:iam::'$AWS_ACCOUNT_ID':role/WSParticipantRole\",\"IsRollback\":\"false\",\"TestDurationInMinutes\":\"2\"}","maxDuration":"PT6M"}}},"stopConditions":[{"source":"none"}],"roleArn":"'$FIS_ROLE_ARN'","tags":{"ExperimentSuffix":"'$RANDOM_SUFFIX'"}}' --output json | jq -r '.experimentTemplate.id') -``` - -## Running the Experiment - -Execute the FIS experiment to simulate the AZ failure: - -```bash -aws fis start-experiment --experiment-template-id $ZONE_EXP_ID --output json && \ -timeout 450 watch -n 1 --color $SCRIPT_DIR/get-pods-by-az.sh -``` - -This command starts the experiment and monitors the distribution and status of pods across different nodes and AZs to understand the immediate impact of the simulated AZ failure. - -## Post-Experiment Verification - -Ensure that your application remains operational despite the simulated AZ failure, confirming the effectiveness of Kubernetes high availability: - -```bash -wait-for-lb $(kubectl get ingress -n ui -o jsonpath='{.items[0].status.loadBalancer.ingress[0].hostname}') -``` - -## Conclusion - -This experiment demonstrates the resilience of your EKS cluster in the face of an Availability Zone failure. By monitoring the canary and observing the redistribution of pods, you can assess how well your application maintains availability during significant infrastructure disruptions. diff --git a/website/docs/resiliency/high-availability/06-az-setup.md b/website/docs/resiliency/high-availability/06-az-setup.md new file mode 100644 index 000000000..4c7d2eeb9 --- /dev/null +++ b/website/docs/resiliency/high-availability/06-az-setup.md @@ -0,0 +1,123 @@ +--- +title: "AZ Failure Experiment Setup" +sidebar_position: 6 +description: "Scale your application to two Availability Zones and prepare for an AZ failure simulation experiment." +--- + +This guide outlines steps to enhance the resilience of your UI service by scaling it across two Availability Zones (AZs) and preparing for an AZ failure simulation experiment. + +## Scaling to Two AZs + +We'll use a Kustomize patch to modify the UI deployment, adding a second AZ and adjusting the number of replicas. We'll scale to 4 replicas in the new AZ while maintaining 5 replicas in the first AZ. + +First we need to make ann EKS Cluster in `us-east-2`. 
Run this to create a second AZ: + +```bash timeout=300 wait=30 +$ $SCRIPT_DIR/multi-az-get-pods.sh +$ aws configure set default.region $SECONDARY_REGION +$ prepare-environment resiliency +$ aws configure set default.region $PRIMARY_REGION +$ $SCRIPT_DIR/multi-az-get-pods.sh +``` + +Now we need to Kustomize our content with a patch file: + +```file +manifests/modules/resiliency/high-availability/multi_az/add_us_east_2_patch.yaml +``` + +Apply the changes using Kustomize patch and +[Kustomization file](https://github.com/VAR::MANIFESTS_OWNER/VAR::MANIFESTS_REPOSITORY/tree/VAR::MANIFESTS_REF/manifests/modules/resiliency/high-availability/multi_az/kustomization.yaml): + +```bash +$ kubectl delete deployment ui -n ui +$ kubectl apply -k /manifests/modules/resiliency/high-availability/multi_az/ +``` + +## Verify Retail Store Accessibility + +After applying these changes, it's important to verify that your retail store is accessible: + +```bash +$ wait-for-lb $(kubectl get ingress -n ui -o jsonpath='{.items[0].status.loadBalancer.ingress[0].hostname}') +``` + +:::tip +The retail url may take 10 minutes to become operational. +::: + +## Check Pod Distribution + +To run the script and see the distribution of pods across availability zones, execute: + +```bash +$ $SCRIPT_DIR/multi-az-get-pods.sh +``` + +## AZ Failure Experiment Preparation + +### Overview + +This experiment will simulate an Availability Zone (AZ) failure, demonstrating how resilient your application is when faced with significant infrastructure disruptions. We'll use AWS Fault Injection Simulator (FIS) and additional AWS services to test how well your system maintains functionality when an entire AZ becomes unavailable. + +### Setting up a Synthetic Canary + +Before starting the experiment, set up a synthetic canary for heartbeat monitoring: + +1. First, create an S3 bucket for the canary artifacts: + +```bash +$ BUCKET_NAME="eks-workshop-canary-artifacts-$(date +%s)" +$ aws s3 mb s3://$BUCKET_NAME --region us-west-2 +``` + +2. Create the blueprint: + +```file +manifests/modules/resiliency/scripts/eks_workshop_canary_script.js +``` + +Place this canary script into the bucket: + +```bash +$ aws s3 cp /manifests/modules/resiliency/scripts/eks_workshop_canary_script.zip s3://$BUCKET_NAME/canary-scripts/eks_workshop_canary_script.zip +``` + +3. Create a synthetic canary: + +```bash +$ INGRESS_URL=$(kubectl get ingress -n ui -o jsonpath='{.items[0].status.loadBalancer.ingress[0].hostname}') +$ aws synthetics create-canary \ + --name eks-workshop-canary \ + --artifact-s3-location "s3://$BUCKET_NAME/canary-artifacts/" \ + --execution-role-arn $CANARY_ROLE_ARN \ + --runtime-version syn-nodejs-puppeteer-6.2 \ + --schedule Expression="rate(1 minute)" \ + --code S3Bucket=$BUCKET_NAME,S3Key=canary-scripts/eks_workshop_canary_script.zip,Handler="exports.handler" \ + --run-config "EnvironmentVariables={INGRESS_URL=http://$INGRESS_URL}" \ + --region us-west-2 +$ sleep 30 +$ aws synthetics start-canary --name eks-workshop-canary --region us-west-2 +``` + +4. 
Create a CloudWatch alarm for the canary: + +```bash +$ aws cloudwatch put-metric-alarm \ + --alarm-name "eks-workshop-canary-alarm" \ + --metric-name SuccessPercent \ + --namespace CloudWatchSynthetics \ + --statistic Average \ + --period 60 \ + --threshold 95 \ + --comparison-operator LessThanThreshold \ + --dimensions Name=CanaryName,Value=eks-workshop-canary \ + --evaluation-periods 1 \ + --alarm-description "Alarm when Canary success rate drops below 95%" \ + --unit Percent \ + --region us-west-2 +``` + +This sets up a canary that checks the health of your application every minute and a CloudWatch alarm that triggers if the success percentage falls below 95%. + +With these steps completed, your application is now scaled across two AZs and you've set up the necessary monitoring for the upcoming AZ failure simulation experiment. diff --git a/website/docs/resiliency/high-availability/07-az-failure.md b/website/docs/resiliency/high-availability/07-az-failure.md new file mode 100644 index 000000000..c164d3c85 --- /dev/null +++ b/website/docs/resiliency/high-availability/07-az-failure.md @@ -0,0 +1,84 @@ +--- +title: "Simulating AZ Failure" +sidebar_position: 7 +description: "This experiment simulates an Availability Zone failure to test the resilience of your Kubernetes environment hosted on AWS EKS." +--- + +# Simulating AZ Failure + +## Overview + +This experiment simulates an Availability Zone (AZ) failure, demonstrating the resilience of your application when faced with significant infrastructure disruptions. By leveraging AWS Fault Injection Simulator (FIS) and additional AWS services, we'll test how well your system maintains functionality when an entire AZ becomes unavailable. + +### Setting up the Experiment + +Retrieve the Auto Scaling Group (ASG) name associated with your EKS cluster: + +```bash +$ ASG_NAME_BOTH=$(aws autoscaling describe-auto-scaling-groups --query "AutoScalingGroups[? Tags[? (Key=='eks:cluster-name') && Value=='eks-workshop']]".AutoScalingGroupName --output text) +$ ASG_NAME=$(echo $ASG_NAME_BOTH | awk '{print $1}') +``` + +Create the FIS experiment template to simulate the AZ failure: + +```bash +$ ZONE_EXP_ID=$(aws fis create-experiment-template --cli-input-json '{"description":"publicdocument-azfailure","targets":{},"actions":{"azfailure":{"actionId":"aws:ssm:start-automation-execution","parameters":{"documentArn":"arn:aws:ssm:us-west-2::document/AWSResilienceHub-SimulateAzOutageInAsgTest_2020-07-23","documentParameters":"{\"AutoScalingGroupName\":\"'$ASG_NAME'\",\"CanaryAlarmName\":\"eks-workshop-canary-alarm\",\"AutomationAssumeRole\":\"'$FIS_ROLE_ARN'\",\"IsRollback\":\"false\",\"TestDurationInMinutes\":\"2\"}","maxDuration":"PT6M"}}},"stopConditions":[{"source":"none"}],"roleArn":"'$FIS_ROLE_ARN'","tags":{"ExperimentSuffix":"'$RANDOM_SUFFIX'"}}' --output json | jq -r '.experimentTemplate.id') +``` + +## Running the Experiment + +Execute the FIS experiment to simulate the AZ failure: + +```bash +$ aws fis start-experiment --experiment-template-id $ZONE_EXP_ID --output json && SECONDS=0; while [ $SECONDS -lt 450 ]; do clear; $SCRIPT_DIR/get-pods-by-az.sh; sleep 1; done +``` + +This command starts the experiment and monitors the distribution and status of pods across different nodes and AZs for 7.5 minutes to understand the immediate impact of the simulated AZ failure. 
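If you would also like to track the FIS experiment itself while the pod watch runs, one optional approach (shown here as a sketch, not part of the lab scripts) is to poll it with the AWS CLI. The experiment ID below is a placeholder; copy the `id` field from the JSON that `aws fis start-experiment` printed:

```bash
$ EXPERIMENT_ID="<id-from-start-experiment-output>" # placeholder, not a real ID
$ aws fis get-experiment --id $EXPERIMENT_ID --output json | jq -r '.experiment.state.status'
running
```

The reported status should move from `running` to `completed` once the SSM automation document that simulates the AZ outage finishes.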
+ +During the experiment, you should observe the following sequence of events: + +- input here + +:::note +To verify clusters and rebalance pods, you can run: + +```bash +$ $SCRIPT_DIR/verify-cluster.sh +``` + +::: + +## Post-Experiment Verification + +After the experiment, verify that your application remains operational despite the simulated AZ failure: + +```bash +$ wait-for-lb $(kubectl get ingress -n ui -o jsonpath='{.items[0].status.loadBalancer.ingress[0].hostname}') +``` + +This step confirms the effectiveness of your Kubernetes cluster's high availability configuration and its ability to maintain service continuity during significant infrastructure disruptions. + +## Conclusion + +The AZ failure simulation represents a critical test of your EKS cluster's resilience and your application's high availability design. Through this experiment, you've gained valuable insights into: + +1. The effectiveness of your multi-AZ deployment strategy +2. Kubernetes' ability to reschedule pods across remaining healthy AZs +3. The impact of an AZ failure on your application's performance and availability +4. The efficiency of your monitoring and alerting systems in detecting and responding to major infrastructure issues + +Key takeaways from this experiment include: + +- The importance of distributing your workload across multiple AZs +- The value of proper resource allocation and pod anti-affinity rules +- The need for robust monitoring and alerting systems that can quickly detect AZ-level issues +- The effectiveness of your disaster recovery and business continuity plans + +By regularly conducting such experiments, you can: + +- Identify potential weaknesses in your infrastructure and application architecture +- Refine your incident response procedures +- Build confidence in your system's ability to withstand major failures +- Continuously improve your application's resilience and reliability + +Remember, true resilience comes not just from surviving such failures, but from maintaining performance and user experience even in the face of significant infrastructure disruptions. Use the insights gained from this experiment to further enhance your application's fault tolerance and ensure seamless operations across all scenarios. diff --git a/website/docs/resiliency/high-availability/index.md b/website/docs/resiliency/high-availability/index.md index 31556db21..6dec235d9 100644 --- a/website/docs/resiliency/high-availability/index.md +++ b/website/docs/resiliency/high-availability/index.md @@ -5,23 +5,10 @@ sidebar_custom_props: { "module": true } description: "Prepare your EKS environment to handle high availability scenarios effectively." --- -TODO: - -- have to delete deployment before? why? is that due to dev or what -- expected time for lab completion -- expected time for prepare-env (about 5 minutes without cleanup.sh and any previous applications) -- Lab overview -- Check info sections -- Are we able to chmod in backend? 
-- Check why the load balancer stopped working - -::required-time - :::tip Before you start Prepare your environment for this section: ```bash timeout=300 wait=30 -$ chmod +x /manifests/modules/resiliency/.workshop/cleanup.sh $ /manifests/modules/resiliency/.workshop/cleanup.sh $ prepare-environment resiliency ``` @@ -37,13 +24,32 @@ This will make the following changes to your lab environment: You can view the Terraform that applies these changes [here](https://github.com/VAR::MANIFESTS_OWNER/VAR::MANIFESTS_REPOSITORY/tree/VAR::MANIFESTS_REF/manifests/modules/resiliency/.workshop/terraform). ::: -In this lab, we'll look at... -information +## Lab Overview + +In this lab, we'll explore various high availability scenarios and test the resilience of your EKS environment. Through a series of experiments, you'll gain hands-on experience in handling different types of failures and understanding how your Kubernetes cluster responds to these challenges. + +The experiments we'll conduct include: + +1. Pod Failure Simulation: Using ChaosMesh to test your application's resilience to individual pod failures. +2. Node Failure without FIS: Manually simulating a node failure to observe Kubernetes' self-healing capabilities. +3. Partial Node Failure with FIS: Leveraging AWS Fault Injection Simulator to create a more controlled node failure scenario. +4. Complete Node Failure with FIS: Testing your cluster's response to a catastrophic failure of all nodes. +5. Availability Zone Failure: Simulating the loss of an entire AZ to validate your multi-AZ deployment strategy. + +These experiments will help you understand: + +- How Kubernetes handles different types of failures +- The importance of proper resource allocation and pod distribution +- The effectiveness of your monitoring and alerting systems +- How to improve your application's fault tolerance and recovery strategies + +By the end of this lab, you'll have a comprehensive understanding of your EKS environment's high availability capabilities and areas for potential improvement. :::info -For more information on these changes checkout: +For more information on the components used in this lab, check out: - [Ingress Load Balancer](/docs/fundamentals/exposing/ingress/) - [Integrating with Kubernetes RBAC](/docs/security/cluster-access-management/kubernetes-rbac) - [Chaos Mesh](https://chaos-mesh.org/) +- [AWS Fault Injection Simulator](https://aws.amazon.com/fis/) ::: diff --git a/website/docs/resiliency/high-availability/tests/hook-suite.sh b/website/docs/resiliency/high-availability/tests/hook-suite.sh new file mode 100644 index 000000000..8b5a4baea --- /dev/null +++ b/website/docs/resiliency/high-availability/tests/hook-suite.sh @@ -0,0 +1,11 @@ +set -e + +before() { + echo "noop" +} + +after() { + prepare-environment +} + +"$@" diff --git a/website/docs/resiliency/index.md b/website/docs/resiliency/index.md index 1541ba19d..0252fee19 100644 --- a/website/docs/resiliency/index.md +++ b/website/docs/resiliency/index.md @@ -4,16 +4,6 @@ sidebar_position: 11 weight: 10 --- -TODO: - -- Add intro information -- Find a lab to input - -Other TODO: - -- autotesting -- Containers on couch vod (link it here?) - ## What is Resiliency? Resiliency in cloud computing refers to a system's ability to maintain acceptable performance levels in the face of faults and challenges to normal operation. It encompasses: @@ -31,10 +21,11 @@ Amazon EKS provides a managed Kubernetes platform, but it's still crucial to des 2. 
**Data Integrity**: Prevent data loss and maintain consistency during unexpected events.
3. **User Experience**: Minimize downtime and performance degradation to maintain user satisfaction.
4. **Cost Efficiency**: Avoid overprovisioning by building systems that can handle variable loads and partial failures.
+5. **Compliance**: Meet regulatory requirements for uptime and data protection in various industries.

## Resiliency Scenarios Covered in this Chapter

-We'll explore several scenarios to show resiliency by performing:
+We'll explore several scenarios to show resiliency by simulating and responding to:

1. Pod Failures
2. Node Failures
@@ -44,11 +35,29 @@ By the end of this chapter, you'll be able to:

-- Use AWS FIS to simulate and learn from controlled failure scenarios
-- other info
+- Use AWS Fault Injection Simulator (FIS) to simulate and learn from controlled failure scenarios
+- Understand how Kubernetes handles different types of failures (pod, node, and availability zone)
+- Observe the self-healing capabilities of Kubernetes in action
+- Gain practical experience in chaos engineering for EKS environments

-:::info
+## Tools and Technologies

-

+Throughout this chapter, we'll be using:
+
+- AWS Fault Injection Simulator (FIS) for controlled chaos engineering
+- Chaos Mesh for Kubernetes-native chaos testing
+- AWS CloudWatch Synthetics for creating and monitoring a canary
+- Kubernetes native features for observing pod and node behavior during failures
+
+## Importance of Chaos Engineering
+
+Chaos engineering is the practice of intentionally introducing controlled failures to identify weaknesses in your system. By proactively testing your system's resilience, you can:
+
+1. Uncover hidden issues before they affect users
+2. Build confidence in your system's ability to withstand turbulent conditions
+3. Improve your incident response procedures
+4.
Foster a culture of resilience within your organization + +:::info +For more information on AWS Resiliency features in greater depth, we recommend checking out [Operating resilient workloads on Amazon EKS](https://aws.amazon.com/blogs/containers/operating-resilient-workloads-on-amazon-eks/) ::: From d34c0b852d3ee452cc4d9951c0e8774fcbd67f15 Mon Sep 17 00:00:00 2001 From: cyturney Date: Tue, 20 Aug 2024 10:39:12 -0700 Subject: [PATCH 03/11] fixed az failure, added autotesting, added outputs to bash --- cluster/eksctl/cluster.yaml | 2 +- .../resiliency/.workshop/cleanup.sh | 77 ++++++++++ .../resiliency/.workshop/terraform/main.tf | 54 ++++--- .../resiliency/.workshop/terraform/outputs.tf | 6 +- .../resiliency/.workshop/terraform/vars.tf | 6 + .../config/kustomization.yaml | 2 +- .../config/scale_and_affinity_patch.yaml | 0 .../resiliency/scripts/AZ-verify-clusters.sh | 110 +++++++++++++++ .../resiliency/scripts/create-blueprint.sh | 114 +++++++++++++++ .../resiliency/scripts/get-pods-by-az.sh | 0 .../resiliency/scripts/node-failure.sh | 0 .../resiliency/scripts/pod-failure.sh | 0 .../resiliency/scripts/testing.sh | 31 ++++ .../resiliency/scripts/verify-cluster.sh | 0 .../modules/resiliency/.workshop/cleanup.sh | 132 ------------------ .../multi_az/add_us_east_2_patch.yaml | 41 ------ .../multi_az/kustomization.yaml | 8 -- .../resiliency/scripts/create-second-az.sh | 52 ------- .../scripts/eks_workshop_canary_script.js | 30 ---- .../resiliency/scripts/multi-az-get-pods.sh | 26 ---- .../high-availability/00-setup.md} | 22 ++- .../high-availability/01-scale.md | 96 +++++++++++++ .../high-availability/02-pod-failure.md | 33 ++++- .../03-node-failure-no-fis.md | 52 +++++-- .../04-node-failure-partial-fis.md | 63 ++++++--- .../05-node-failure-complete-fis.md | 57 ++++++-- .../high-availability/06-az-setup.md | 100 +++++++++++++ .../high-availability/07-az-failure.md | 78 ++++++++--- .../high-availability}/index.md | 3 +- .../high-availability/tests/hook-suite.sh | 0 .../resiliency/high-availability/01-setup.md | 61 -------- .../high-availability/06-az-setup.md | 123 ---------------- website/docusaurus.config.js | 6 - website/sidebars.js | 1 - 34 files changed, 806 insertions(+), 580 deletions(-) create mode 100755 manifests/modules/observability/resiliency/.workshop/cleanup.sh rename manifests/modules/{ => observability}/resiliency/.workshop/terraform/main.tf (90%) rename manifests/modules/{ => observability}/resiliency/.workshop/terraform/outputs.tf (77%) rename manifests/modules/{ => observability}/resiliency/.workshop/terraform/vars.tf (85%) rename manifests/modules/{ => observability}/resiliency/high-availability/config/kustomization.yaml (70%) rename manifests/modules/{ => observability}/resiliency/high-availability/config/scale_and_affinity_patch.yaml (100%) create mode 100755 manifests/modules/observability/resiliency/scripts/AZ-verify-clusters.sh create mode 100755 manifests/modules/observability/resiliency/scripts/create-blueprint.sh rename manifests/modules/{ => observability}/resiliency/scripts/get-pods-by-az.sh (100%) rename manifests/modules/{ => observability}/resiliency/scripts/node-failure.sh (100%) rename manifests/modules/{ => observability}/resiliency/scripts/pod-failure.sh (100%) create mode 100644 manifests/modules/observability/resiliency/scripts/testing.sh rename manifests/modules/{ => observability}/resiliency/scripts/verify-cluster.sh (100%) delete mode 100755 manifests/modules/resiliency/.workshop/cleanup.sh delete mode 100644 
manifests/modules/resiliency/high-availability/multi_az/add_us_east_2_patch.yaml delete mode 100644 manifests/modules/resiliency/high-availability/multi_az/kustomization.yaml delete mode 100755 manifests/modules/resiliency/scripts/create-second-az.sh delete mode 100644 manifests/modules/resiliency/scripts/eks_workshop_canary_script.js delete mode 100755 manifests/modules/resiliency/scripts/multi-az-get-pods.sh rename website/docs/{resiliency/high-availability/index.md => observability/high-availability/00-setup.md} (65%) create mode 100644 website/docs/observability/high-availability/01-scale.md rename website/docs/{resiliency => observability}/high-availability/02-pod-failure.md (63%) rename website/docs/{resiliency => observability}/high-availability/03-node-failure-no-fis.md (74%) rename website/docs/{resiliency => observability}/high-availability/04-node-failure-partial-fis.md (58%) rename website/docs/{resiliency => observability}/high-availability/05-node-failure-complete-fis.md (58%) create mode 100644 website/docs/observability/high-availability/06-az-setup.md rename website/docs/{resiliency => observability}/high-availability/07-az-failure.md (50%) rename website/docs/{resiliency => observability/high-availability}/index.md (97%) rename website/docs/{resiliency => observability}/high-availability/tests/hook-suite.sh (100%) delete mode 100644 website/docs/resiliency/high-availability/01-setup.md delete mode 100644 website/docs/resiliency/high-availability/06-az-setup.md diff --git a/cluster/eksctl/cluster.yaml b/cluster/eksctl/cluster.yaml index a22a4a127..d0f2cae4e 100644 --- a/cluster/eksctl/cluster.yaml +++ b/cluster/eksctl/cluster.yaml @@ -32,7 +32,7 @@ managedNodeGroups: instanceType: m5.large privateNetworking: true # had to remove use make create - #releaseVersion: "1.30.0-20240625" + releaseVersion: "1.30.0-20240625" updateConfig: maxUnavailablePercentage: 50 labels: diff --git a/manifests/modules/observability/resiliency/.workshop/cleanup.sh b/manifests/modules/observability/resiliency/.workshop/cleanup.sh new file mode 100755 index 000000000..1bb63ce1e --- /dev/null +++ b/manifests/modules/observability/resiliency/.workshop/cleanup.sh @@ -0,0 +1,77 @@ +#!/bin/bash + +set -e + +echo "Starting cleanup process..." + +# Function to safely delete a resource +safe_delete() { + local cmd=$1 + local resource=$2 + echo "Attempting to delete $resource..." + if $cmd 2>/dev/null; then + echo "$resource deleted successfully." + else + echo "Failed to delete $resource or it doesn't exist. Continuing..." + fi +} + +# Delete Kubernetes resources +echo "Cleaning up Kubernetes resources..." +kubectl delete ingress,deployment,service -n ui --all --ignore-not-found +kubectl delete role,rolebinding -n ui --all --ignore-not-found +kubectl delete namespace chaos-mesh --ignore-not-found + +# Uninstall Helm charts +echo "Uninstalling Helm charts..." +helm uninstall aws-load-balancer-controller -n kube-system || true +helm uninstall chaos-mesh -n chaos-mesh || true + +# Delete ALBs +echo "Cleaning up ALBs..." +for alb_arn in $(aws elbv2 describe-load-balancers --query "LoadBalancers[?starts_with(LoadBalancerName, 'k8s-ui-ui-') || starts_with(LoadBalancerName, 'k8s-default-ui-')].LoadBalancerArn" --output text); do + safe_delete "aws elbv2 delete-load-balancer --load-balancer-arn $alb_arn" "ALB $alb_arn" +done + +# Delete IAM Roles and Policies +echo "Cleaning up IAM roles and policies..." 
+for role_prefix in "fis-execution-role-eks-workshop" "canary-execution-role-eks-workshop"; do + for role in $(aws iam list-roles --query "Roles[?starts_with(RoleName, '${role_prefix}')].RoleName" --output text); do + echo "Processing role: $role" + for policy in $(aws iam list-attached-role-policies --role-name $role --query "AttachedPolicies[*].PolicyArn" --output text); do + safe_delete "aws iam detach-role-policy --role-name $role --policy-arn $policy" "attached policy $policy from role $role" + done + for policy in $(aws iam list-role-policies --role-name $role --query "PolicyNames" --output text); do + safe_delete "aws iam delete-role-policy --role-name $role --policy-name $policy" "inline policy $policy from role $role" + done + safe_delete "aws iam delete-role --role-name $role" "IAM role $role" + done +done + +for policy_prefix in "eks-resiliency-fis-policy" "eks-resiliency-canary-policy"; do + for policy_arn in $(aws iam list-policies --scope Local --query "Policies[?starts_with(PolicyName, '${policy_prefix}')].Arn" --output text); do + safe_delete "aws iam delete-policy --policy-arn $policy_arn" "IAM policy $policy_arn" + done +done + +# Delete S3 buckets +echo "Cleaning up S3 buckets..." +for bucket in $(aws s3api list-buckets --query "Buckets[?starts_with(Name, 'eks-workshop-canary-artifacts-')].Name" --output text); do + aws s3 rm s3://$bucket --recursive + safe_delete "aws s3api delete-bucket --bucket $bucket" "S3 bucket $bucket" +done + +# Delete CloudWatch Synthetics canary and alarm +CANARY_NAME="eks-workshop-canary" +ALARM_NAME="eks-workshop-canary-alarm" + +echo "Cleaning up CloudWatch Synthetics canary and alarm..." +if aws synthetics get-canary --name $CANARY_NAME &>/dev/null; then + aws synthetics stop-canary --name $CANARY_NAME || true + sleep 30 + safe_delete "aws synthetics delete-canary --name $CANARY_NAME" "CloudWatch Synthetics canary $CANARY_NAME" +fi + +safe_delete "aws cloudwatch delete-alarms --alarm-names $ALARM_NAME" "CloudWatch alarm $ALARM_NAME" + +echo "Cleanup process completed. Please check for any remaining resources manually." 
\ No newline at end of file diff --git a/manifests/modules/resiliency/.workshop/terraform/main.tf b/manifests/modules/observability/resiliency/.workshop/terraform/main.tf similarity index 90% rename from manifests/modules/resiliency/.workshop/terraform/main.tf rename to manifests/modules/observability/resiliency/.workshop/terraform/main.tf index ae6da7511..4da3d5fde 100644 --- a/manifests/modules/resiliency/.workshop/terraform/main.tf +++ b/manifests/modules/observability/resiliency/.workshop/terraform/main.tf @@ -93,7 +93,7 @@ resource "kubernetes_role_binding" "chaos_mesh_rolebinding" { # Add AWS Load Balancer controller resource "helm_release" "aws_load_balancer_controller" { name = "aws-load-balancer-controller" - repository = "https:#aws.github.io/eks-charts" + repository = "https://aws.github.io/eks-charts" chart = "aws-load-balancer-controller" namespace = "kube-system" version = var.load_balancer_controller_chart_version @@ -116,15 +116,15 @@ resource "helm_release" "aws_load_balancer_controller" { # Chaos Mesh Helm Release -resource "helm_release" "chaos_mesh" { - name = "chaos-mesh" - repository = "https:#charts.chaos-mesh.org" - chart = "chaos-mesh" - namespace = "chaos-mesh" - version = "2.5.1" - - create_namespace = true -} +#resource "helm_release" "chaos_mesh" { +# name = "chaos-mesh" +# repository = "https://charts.chaos-mesh.org" +# chart = "chaos-mesh" +# namespace = "chaos-mesh" +# version = "2.5.1" +# +# create_namespace = true +#} # FIS IAM role resource "random_id" "suffix" { @@ -141,10 +141,7 @@ resource "aws_iam_role" "fis_role" { Effect = "Allow" Principal = { Service = [ - "fis.amazonaws.com", - # for second region - "ec2.amazonaws.com", - "eks.amazonaws.com" + "fis.amazonaws.com" ] } Action = "sts:AssumeRole" @@ -241,6 +238,8 @@ resource "aws_iam_policy" "eks_resiliency_fis_policy" { "autoscaling:DescribeAutoScalingGroups", "autoscaling:DescribeAutoScalingInstances", "autoscaling:SetDesiredCapacity", + "autoscaling:SuspendProcesses", + "autoscaling:ResumeProcesses", "logs:CreateLogDelivery", "logs:GetLogDelivery", "logs:UpdateLogDelivery", @@ -249,7 +248,8 @@ resource "aws_iam_policy" "eks_resiliency_fis_policy" { "ssm:StartAutomationExecution", "ssm:GetAutomationExecution", "cloudwatch:DescribeAlarms", - "cloudwatch:GetMetricData" + "cloudwatch:GetMetricData", + "iam:PassRole" ] Resource = "*" }, @@ -331,7 +331,15 @@ resource "aws_iam_policy" "eks_resiliency_canary_policy" { "logs:PutLogEvents", "logs:DescribeLogGroups", "logs:DescribeLogStreams", - "lambda:InvokeFunction" + "lambda:CreateFunction", + "lambda:UpdateFunctionCode", + "lambda:UpdateFunctionConfiguration", + "lambda:GetFunction", + "lambda:DeleteFunction", + "lambda:InvokeFunction", + "lambda:AddPermission", + "lambda:RemovePermission", + "iam:PassRole" ] Resource = "*" } @@ -377,3 +385,17 @@ resource "aws_iam_role_policy_attachment" "eks_vpc_resource_controller" { policy_arn = "arn:aws:iam::aws:policy/AmazonEKSVPCResourceController" role = aws_iam_role.eks_cluster_role.name } + +# Executable Scripts +resource "null_resource" "chmod_all_scripts_bash" { + provisioner "local-exec" { + command = "find ${var.script_dir} -type f -exec chmod +x {} + || true" + } +} + +# Add Region terraform +data "aws_region" "current" {} + + + + diff --git a/manifests/modules/resiliency/.workshop/terraform/outputs.tf b/manifests/modules/observability/resiliency/.workshop/terraform/outputs.tf similarity index 77% rename from manifests/modules/resiliency/.workshop/terraform/outputs.tf rename to 
manifests/modules/observability/resiliency/.workshop/terraform/outputs.tf index a584978a7..8538519e6 100644 --- a/manifests/modules/resiliency/.workshop/terraform/outputs.tf +++ b/manifests/modules/observability/resiliency/.workshop/terraform/outputs.tf @@ -5,10 +5,10 @@ output "environment_variables" { LBC_ROLE_ARN = module.eks_blueprints_addons.aws_load_balancer_controller.iam_role_arn FIS_ROLE_ARN = aws_iam_role.fis_role.arn RANDOM_SUFFIX = random_id.suffix.hex - SCRIPT_DIR = "/manifests/modules/resiliency/scripts" + SCRIPT_DIR = var.script_dir CANARY_ROLE_ARN = aws_iam_role.canary_role.arn EKS_CLUSTER_ROLE_ARN = aws_iam_role.eks_cluster_role.arn - PRIMARY_REGION = "us-west-2" - SECONDARY_REGION = "us-east-2" + AWS_REGION = data.aws_region.current.name } } + diff --git a/manifests/modules/resiliency/.workshop/terraform/vars.tf b/manifests/modules/observability/resiliency/.workshop/terraform/vars.tf similarity index 85% rename from manifests/modules/resiliency/.workshop/terraform/vars.tf rename to manifests/modules/observability/resiliency/.workshop/terraform/vars.tf index 42bd4d060..f0b4e480c 100644 --- a/manifests/modules/resiliency/.workshop/terraform/vars.tf +++ b/manifests/modules/observability/resiliency/.workshop/terraform/vars.tf @@ -41,3 +41,9 @@ variable "load_balancer_controller_chart_version" { default = "1.8.1" } +# Executable Scripts +variable "script_dir" { + description = "Directory where scripts are located" + type = string + default = "/manifests/modules/observability/resiliency/scripts" +} \ No newline at end of file diff --git a/manifests/modules/resiliency/high-availability/config/kustomization.yaml b/manifests/modules/observability/resiliency/high-availability/config/kustomization.yaml similarity index 70% rename from manifests/modules/resiliency/high-availability/config/kustomization.yaml rename to manifests/modules/observability/resiliency/high-availability/config/kustomization.yaml index b71687089..deae0ee7f 100644 --- a/manifests/modules/resiliency/high-availability/config/kustomization.yaml +++ b/manifests/modules/observability/resiliency/high-availability/config/kustomization.yaml @@ -2,7 +2,7 @@ apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization resources: - - ../../../../../manifests/base-application/ui + - ../../../../../../manifests/base-application/ui patches: - path: scale_and_affinity_patch.yaml diff --git a/manifests/modules/resiliency/high-availability/config/scale_and_affinity_patch.yaml b/manifests/modules/observability/resiliency/high-availability/config/scale_and_affinity_patch.yaml similarity index 100% rename from manifests/modules/resiliency/high-availability/config/scale_and_affinity_patch.yaml rename to manifests/modules/observability/resiliency/high-availability/config/scale_and_affinity_patch.yaml diff --git a/manifests/modules/observability/resiliency/scripts/AZ-verify-clusters.sh b/manifests/modules/observability/resiliency/scripts/AZ-verify-clusters.sh new file mode 100755 index 000000000..a136332b2 --- /dev/null +++ b/manifests/modules/observability/resiliency/scripts/AZ-verify-clusters.sh @@ -0,0 +1,110 @@ +#!/bin/bash +# verify-cluster.sh - Verifies cluster state and corrects replica count + +DESIRED_REPLICAS=9 +MAX_WAIT_TIME=300 # 5 minutes +POLL_INTERVAL=10 # 10 seconds +NAMESPACE="ui" +EXPECTED_READY_NODES=6 + +print_header() { + echo -e "\n==== $1 ====\n" +} + +wait_for_condition() { + local end_time=$((SECONDS + MAX_WAIT_TIME)) + while [ $SECONDS -lt $end_time ]; do + if eval "$1"; then + return 0 + fi + echo -n "." 
+ sleep $POLL_INTERVAL + done + echo " Timeout!" + return 1 +} + +print_header "Checking Current Pod Distribution" +$SCRIPT_DIR/get-pods-by-az.sh + +print_header "Waiting for nodes to be Ready" +total_nodes=$(kubectl get nodes --no-headers | wc -l) +echo "Total nodes in the cluster: $total_nodes" +echo "Waiting for $EXPECTED_READY_NODES nodes to be in Ready state" +if wait_for_condition "[ \$(kubectl get nodes --no-headers | grep ' Ready ' | wc -l) -eq $EXPECTED_READY_NODES ]"; then + echo -e "\n✅ $EXPECTED_READY_NODES nodes are in Ready state." +else + echo -e "\n⚠️ Warning: $EXPECTED_READY_NODES nodes did not reach Ready state within the timeout period." + exit 1 +fi + +print_header "Checking Current Pod Distribution" +$SCRIPT_DIR/get-pods-by-az.sh + +print_header "Node Information" +kubectl get nodes -o wide + +print_header "Verifying Cluster State" +node_count=$(kubectl get nodes --no-headers | grep " Ready " | grep -vc "SchedulingDisabled") +current_pod_count=$(kubectl get pods -n $NAMESPACE -l app=ui --no-headers | grep -v Terminating | wc -l) + +echo "Ready and schedulable nodes: $node_count" +echo "Current active ui pods: $current_pod_count" +echo "Desired ui pods: $DESIRED_REPLICAS" + +if [ $current_pod_count -ne $DESIRED_REPLICAS ]; then + print_header "Adjusting Replica Count" + echo "Scaling deployment to $DESIRED_REPLICAS replicas..." + kubectl scale deployment ui -n $NAMESPACE --replicas=$DESIRED_REPLICAS + + echo -n "Waiting for pod count to stabilize" + if wait_for_condition "[ \$(kubectl get pods -n $NAMESPACE -l app=ui --no-headers | grep -v Terminating | wc -l) -eq $DESIRED_REPLICAS ]"; then + echo -e "\n✅ Pod count has reached the desired number." + else + echo -e "\n⚠️ Warning: Failed to reach desired pod count within the timeout period." + fi +else + echo "✅ Number of replicas is correct." +fi + +print_header "Checking Pod Distribution" +if [ $node_count -gt 0 ]; then + max_pods_per_node=$((DESIRED_REPLICAS / node_count + 1)) + uneven_distribution=false + + for node in $(kubectl get nodes -o name | grep -v "SchedulingDisabled"); do + pods_on_node=$(kubectl get pods -n $NAMESPACE -l app=ui --field-selector spec.nodeName=${node#node/} --no-headers | grep -v Terminating | wc -l) + if [ $pods_on_node -gt $max_pods_per_node ]; then + uneven_distribution=true + break + fi + done + + if $uneven_distribution; then + echo "⚠️ Pod distribution is uneven. Rebalancing..." + kubectl scale deployment ui -n $NAMESPACE --replicas=0 + sleep $POLL_INTERVAL + kubectl scale deployment ui -n $NAMESPACE --replicas=$DESIRED_REPLICAS + + echo -n "Waiting for pods to be ready" + if wait_for_condition "[ \$(kubectl get pods -n $NAMESPACE -l app=ui --no-headers | grep Running | wc -l) -eq $DESIRED_REPLICAS ]"; then + echo -e "\n✅ Pods are ready and balanced." + else + echo -e "\n⚠️ Warning: Pods did not reach ready state within the timeout period." + fi + else + echo "✅ Pod distribution is balanced." + fi +else + echo "⚠️ Warning: No Ready and schedulable nodes found. Cannot check pod distribution." +fi + +print_header "Final Pod Distribution" +$SCRIPT_DIR/get-pods-by-az.sh + +echo +if [ $node_count -gt 0 ] && [ $current_pod_count -eq $DESIRED_REPLICAS ]; then + echo "✅ Cluster verification and correction complete." +else + echo "⚠️ Cluster verification complete, but some issues may require attention." 
+fi \ No newline at end of file diff --git a/manifests/modules/observability/resiliency/scripts/create-blueprint.sh b/manifests/modules/observability/resiliency/scripts/create-blueprint.sh new file mode 100755 index 000000000..4f8ab5112 --- /dev/null +++ b/manifests/modules/observability/resiliency/scripts/create-blueprint.sh @@ -0,0 +1,114 @@ +#!/bin/bash + +# Get Ingress URL +INGRESS_URL=$(kubectl get ingress -n ui -o jsonpath='{.items[0].status.loadBalancer.ingress[0].hostname}') + +# Create the required directory structure +mkdir -p nodejs/node_modules + +# Create the Node.js canary script with heartbeat blueprint +cat << EOF > nodejs/node_modules/canary.js +const { URL } = require('url'); +const synthetics = require('Synthetics'); +const log = require('SyntheticsLogger'); +const syntheticsConfiguration = synthetics.getConfiguration(); +const syntheticsLogHelper = require('SyntheticsLogHelper'); + +const loadBlueprint = async function () { + const urls = ['http://${INGRESS_URL}']; + + // Set screenshot option + const takeScreenshot = true; + + // Configure synthetics settings + syntheticsConfiguration.disableStepScreenshots(); + syntheticsConfiguration.setConfig({ + continueOnStepFailure: true, + includeRequestHeaders: true, + includeResponseHeaders: true, + restrictedHeaders: [], + restrictedUrlParameters: [] + }); + + let page = await synthetics.getPage(); + + for (const url of urls) { + await loadUrl(page, url, takeScreenshot); + } +}; + +// Reset the page in-between +const resetPage = async function(page) { + try { + await page.goto('about:blank', {waitUntil: ['load', 'networkidle0'], timeout: 30000}); + } catch (e) { + synthetics.addExecutionError('Unable to open a blank page. ', e); + } +}; + +const loadUrl = async function (page, url, takeScreenshot) { + let stepName = null; + let domcontentloaded = false; + + try { + stepName = new URL(url).hostname; + } catch (e) { + const errorString = \`Error parsing url: \${url}. \${e}\`; + log.error(errorString); + throw e; + } + + await synthetics.executeStep(stepName, async function () { + const sanitizedUrl = syntheticsLogHelper.getSanitizedUrl(url); + + const response = await page.goto(url, { waitUntil: ['domcontentloaded'], timeout: 30000}); + if (response) { + domcontentloaded = true; + const status = response.status(); + const statusText = response.statusText(); + + logResponseString = \`Response from url: \${sanitizedUrl} Status: \${status} Status Text: \${statusText}\`; + + if (response.status() < 200 || response.status() > 299) { + throw new Error(\`Failed to load url: \${sanitizedUrl} \${response.status()} \${response.statusText()}\`); + } + } else { + const logNoResponseString = \`No response returned for url: \${sanitizedUrl}\`; + log.error(logNoResponseString); + throw new Error(logNoResponseString); + } + }); + + // Wait for 15 seconds to let page load fully before taking screenshot. + if (domcontentloaded && takeScreenshot) { + await new Promise(r => setTimeout(r, 15000)); + await synthetics.takeScreenshot(stepName, 'loaded'); + } + + // Reset page + await resetPage(page); +}; + +exports.handler = async () => { + return await loadBlueprint(); +}; +EOF + +# Zip the Node.js script +python3 - << EOL +import zipfile +with zipfile.ZipFile('canary.zip', 'w') as zipf: + zipf.write('nodejs/node_modules/canary.js', arcname='nodejs/node_modules/canary.js') +EOL + +# Ensure BUCKET_NAME is set +if [ -z "$BUCKET_NAME" ]; then + echo "Error: BUCKET_NAME environment variable is not set." 
+ exit 1 +fi + +# Upload the zipped canary script to S3 +aws s3 cp canary.zip "s3://${BUCKET_NAME}/canary-scripts/canary.zip" + +echo "Canary script has been zipped and uploaded to s3://${BUCKET_NAME}/canary-scripts/canary.zip" +echo "The script is configured to check the URL: http://${INGRESS_URL}" diff --git a/manifests/modules/resiliency/scripts/get-pods-by-az.sh b/manifests/modules/observability/resiliency/scripts/get-pods-by-az.sh similarity index 100% rename from manifests/modules/resiliency/scripts/get-pods-by-az.sh rename to manifests/modules/observability/resiliency/scripts/get-pods-by-az.sh diff --git a/manifests/modules/resiliency/scripts/node-failure.sh b/manifests/modules/observability/resiliency/scripts/node-failure.sh similarity index 100% rename from manifests/modules/resiliency/scripts/node-failure.sh rename to manifests/modules/observability/resiliency/scripts/node-failure.sh diff --git a/manifests/modules/resiliency/scripts/pod-failure.sh b/manifests/modules/observability/resiliency/scripts/pod-failure.sh similarity index 100% rename from manifests/modules/resiliency/scripts/pod-failure.sh rename to manifests/modules/observability/resiliency/scripts/pod-failure.sh diff --git a/manifests/modules/observability/resiliency/scripts/testing.sh b/manifests/modules/observability/resiliency/scripts/testing.sh new file mode 100644 index 000000000..b42708e1f --- /dev/null +++ b/manifests/modules/observability/resiliency/scripts/testing.sh @@ -0,0 +1,31 @@ +ZONE_EXP_ID=$(aws fis create-experiment-template \ + --cli-input-json '{ + "description": "publicdocument-azfailure", + "targets": {}, + "actions": { + "azfailure": { + "actionId": "aws:ssm:start-automation-execution", + "parameters": { + "documentArn": "arn:aws:ssm:us-west-2::document/AWSResilienceHub-SimulateAzOutageInAsgTest_2020-07-23", + "documentParameters": "{ + \"AutoScalingGroupName\":\"'$ASG_NAME'\", + \"CanaryAlarmName\":\"eks-workshop-canary-alarm\", + \"AutomationAssumeRole\":\"'$FIS_ROLE_ARN'\", + \"IsRollback\":\"false\", + \"TestDurationInMinutes\":\"2\" + }", + "maxDuration": "PT6M" + } + } + }, + "stopConditions": [ + { + "source": "none" + } + ], + "roleArn": "'$FIS_ROLE_ARN'", + "tags": { + "ExperimentSuffix": "'$RANDOM_SUFFIX'" + } + }' \ + --output json | jq -r '.experimentTemplate.id') \ No newline at end of file diff --git a/manifests/modules/resiliency/scripts/verify-cluster.sh b/manifests/modules/observability/resiliency/scripts/verify-cluster.sh similarity index 100% rename from manifests/modules/resiliency/scripts/verify-cluster.sh rename to manifests/modules/observability/resiliency/scripts/verify-cluster.sh diff --git a/manifests/modules/resiliency/.workshop/cleanup.sh b/manifests/modules/resiliency/.workshop/cleanup.sh deleted file mode 100755 index 537a7d260..000000000 --- a/manifests/modules/resiliency/.workshop/cleanup.sh +++ /dev/null @@ -1,132 +0,0 @@ -#!/bin/bash - -set -e - -echo "Starting cleanup process..." - -# Function to safely delete a resource -safe_delete() { - local cmd=$1 - local resource=$2 - echo "Attempting to delete $resource..." - if $cmd 2>/dev/null; then - echo "$resource deleted successfully." - else - echo "Failed to delete $resource or it doesn't exist. Continuing..." - fi -} - -# Function to wait for resource deletion -wait_for_deletion() { - local check_cmd=$1 - local resource=$2 - local max_attempts=30 - local attempt=0 - echo "Waiting for $resource to be deleted..." 
- while $check_cmd &>/dev/null && [ $attempt -lt $max_attempts ]; do - sleep 10 - ((attempt++)) - done - if [ $attempt -eq $max_attempts ]; then - echo "Timeout waiting for $resource to be deleted." - else - echo "$resource deleted successfully." - fi -} - -# Function to cleanup EKS resources in a region -cleanup_eks_region() { - local region=$1 - local cluster_name=$2 - local nodegroup_name=$3 - local delete_cluster=$4 - - echo "Cleaning up EKS resources in $region..." - - # Switch to the specified region - aws configure set default.region $region - - # Delete Kubernetes resources - echo "Cleaning up Kubernetes resources..." - kubectl delete ingress,deployment,service -n ui --all --ignore-not-found - kubectl delete role,rolebinding -n ui --all --ignore-not-found - kubectl delete namespace chaos-mesh --ignore-not-found - - # Delete EKS Cluster and Node Group if specified - if [ "$delete_cluster" = true ]; then - echo "Attempting to delete EKS cluster and node group..." - if aws eks describe-cluster --name $cluster_name --region $region &>/dev/null; then - aws eks delete-nodegroup --cluster-name $cluster_name --nodegroup-name $nodegroup_name --region $region || true - wait_for_deletion "aws eks describe-nodegroup --cluster-name $cluster_name --nodegroup-name $nodegroup_name --region $region" "node group" - - aws eks delete-cluster --name $cluster_name --region $region - wait_for_deletion "aws eks describe-cluster --name $cluster_name --region $region" "EKS cluster" - else - echo "EKS cluster $cluster_name not found in $region. Skipping deletion." - fi - else - echo "Skipping EKS cluster and node group deletion in $region as requested." - fi - - # Uninstall Helm charts - echo "Uninstalling Helm charts..." - helm uninstall aws-load-balancer-controller -n kube-system || true - helm uninstall chaos-mesh -n chaos-mesh || true - - # Delete ALBs - echo "Cleaning up ALBs in $region..." - for alb_arn in $(aws elbv2 describe-load-balancers --query "LoadBalancers[?starts_with(LoadBalancerName, 'k8s-ui-ui-') || starts_with(LoadBalancerName, 'k8s-default-ui-')].LoadBalancerArn" --output text); do - safe_delete "aws elbv2 delete-load-balancer --load-balancer-arn $alb_arn" "ALB $alb_arn" - done -} - -# Cleanup in PRIMARY_REGION (preserve cluster and node groups) -cleanup_eks_region $PRIMARY_REGION "eks-workshop" "default" false - -# Cleanup in SECONDARY_REGION (full cleanup) -cleanup_eks_region $SECONDARY_REGION "eks-workshop-east" "us-east-2-node-group" true - -# Global cleanup (not region-specific) - -# Delete IAM Roles and Policies -echo "Cleaning up IAM roles and policies..." 
-for role_prefix in "fis-execution-role-eks-workshop" "canary-execution-role-eks-workshop"; do - for role in $(aws iam list-roles --query "Roles[?starts_with(RoleName, '${role_prefix}')].RoleName" --output text); do - echo "Processing role: $role" - for policy in $(aws iam list-attached-role-policies --role-name $role --query "AttachedPolicies[*].PolicyArn" --output text); do - safe_delete "aws iam detach-role-policy --role-name $role --policy-arn $policy" "attached policy $policy from role $role" - done - for policy in $(aws iam list-role-policies --role-name $role --query "PolicyNames" --output text); do - safe_delete "aws iam delete-role-policy --role-name $role --policy-name $policy" "inline policy $policy from role $role" - done - safe_delete "aws iam delete-role --role-name $role" "IAM role $role" - done -done - -for policy_prefix in "eks-resiliency-fis-policy" "eks-resiliency-canary-policy"; do - for policy_arn in $(aws iam list-policies --scope Local --query "Policies[?starts_with(PolicyName, '${policy_prefix}')].Arn" --output text); do - safe_delete "aws iam delete-policy --policy-arn $policy_arn" "IAM policy $policy_arn" - done -done - -# Delete S3 buckets -echo "Cleaning up S3 buckets..." -for bucket in $(aws s3api list-buckets --query "Buckets[?starts_with(Name, 'eks-workshop-canary-artifacts-')].Name" --output text); do - aws s3 rm s3://$bucket --recursive - safe_delete "aws s3api delete-bucket --bucket $bucket --region $PRIMARY_REGION" "S3 bucket $bucket" -done - -# Delete CloudWatch Synthetics canary and alarm -CANARY_NAME="eks-workshop-canary" -ALARM_NAME="eks-workshop-canary-alarm" - -echo "Cleaning up CloudWatch Synthetics canary and alarm..." -if aws synthetics get-canary --name $CANARY_NAME --region $PRIMARY_REGION &>/dev/null; then - aws synthetics stop-canary --name $CANARY_NAME --region $PRIMARY_REGION || true - sleep 30 - safe_delete "aws synthetics delete-canary --name $CANARY_NAME --region $PRIMARY_REGION" "CloudWatch Synthetics canary $CANARY_NAME" -fi - -safe_delete "aws cloudwatch delete-alarms --alarm-names $ALARM_NAME --region $PRIMARY_REGION" "CloudWatch alarm $ALARM_NAME" - -echo "Cleanup process completed. Please check for any remaining resources manually." 
\ No newline at end of file diff --git a/manifests/modules/resiliency/high-availability/multi_az/add_us_east_2_patch.yaml b/manifests/modules/resiliency/high-availability/multi_az/add_us_east_2_patch.yaml deleted file mode 100644 index b2a276fde..000000000 --- a/manifests/modules/resiliency/high-availability/multi_az/add_us_east_2_patch.yaml +++ /dev/null @@ -1,41 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: ui - namespace: ui -spec: - replicas: 9 # Total number of replicas - template: - spec: - affinity: - nodeAffinity: - preferredDuringSchedulingIgnoredDuringExecution: - - weight: 60 - preference: - matchExpressions: - - key: topology.kubernetes.io/zone - operator: In - values: - - us-west-2a - - us-west-2b - - us-west-2c - - weight: 40 - preference: - matchExpressions: - - key: topology.kubernetes.io/zone - operator: In - values: - - us-east-2a - - us-east-2b - - us-east-2c - podAntiAffinity: - preferredDuringSchedulingIgnoredDuringExecution: - - weight: 100 - podAffinityTerm: - labelSelector: - matchExpressions: - - key: app - operator: In - values: - - ui - topologyKey: "kubernetes.io/hostname" diff --git a/manifests/modules/resiliency/high-availability/multi_az/kustomization.yaml b/manifests/modules/resiliency/high-availability/multi_az/kustomization.yaml deleted file mode 100644 index 32bf6179b..000000000 --- a/manifests/modules/resiliency/high-availability/multi_az/kustomization.yaml +++ /dev/null @@ -1,8 +0,0 @@ -apiVersion: kustomize.config.k8s.io/v1beta1 -kind: Kustomization - -resources: - - ../../../../../manifests/base-application/ui - -patches: - - path: add_us_east_2_patch.yaml diff --git a/manifests/modules/resiliency/scripts/create-second-az.sh b/manifests/modules/resiliency/scripts/create-second-az.sh deleted file mode 100755 index 09d9c28bb..000000000 --- a/manifests/modules/resiliency/scripts/create-second-az.sh +++ /dev/null @@ -1,52 +0,0 @@ -#!/bin/bash - -# Ensure SCRIPT_DIR is set -if [ -z "$SCRIPT_DIR" ]; then - echo "Error: SCRIPT_DIR environment variable is not set." - exit 1 -fi - -# Ensure PRIMARY_REGION and SECONDARY_REGION are set -if [ -z "$PRIMARY_REGION" ] || [ -z "$SECONDARY_REGION" ]; then - echo "Error: PRIMARY_REGION and SECONDARY_REGION must be set." - exit 1 -fi - -# Function to run multi-az-get-pods.sh and display region -run_multi_az_script() { - local region=$1 - echo "Current region: $region" - echo "Running multi-az-get-pods.sh..." - $SCRIPT_DIR/multi-az-get-pods.sh - echo "----------------------------------------" -} - -# Run multi-az-get-pods.sh in PRIMARY_REGION -aws configure set default.region $PRIMARY_REGION -run_multi_az_script $PRIMARY_REGION - -# Switch to SECONDARY_REGION -echo "Switching to SECONDARY_REGION: $SECONDARY_REGION" -aws configure set default.region $SECONDARY_REGION - -# Prepare environment for resiliency module -echo "Preparing environment for resiliency module..." -prepare-environment resiliency - -# Verify the EKS cluster in SECONDARY_REGION -echo "Verifying EKS cluster in SECONDARY_REGION..." 
-aws eks list-clusters - -# Check node groups in SECONDARY_REGION -CLUSTER_NAME=$(aws eks list-clusters --query 'clusters[0]' --output text) -echo "Checking node groups for cluster: $CLUSTER_NAME" -aws eks list-nodegroups --cluster-name $CLUSTER_NAME - -# Switch back to PRIMARY_REGION -echo "Switching back to PRIMARY_REGION: $PRIMARY_REGION" -aws configure set default.region $PRIMARY_REGION - -# Run multi-az-get-pods.sh one last time in PRIMARY_REGION -run_multi_az_script $PRIMARY_REGION - -echo "Setup complete. \ No newline at end of file diff --git a/manifests/modules/resiliency/scripts/eks_workshop_canary_script.js b/manifests/modules/resiliency/scripts/eks_workshop_canary_script.js deleted file mode 100644 index 74deb4591..000000000 --- a/manifests/modules/resiliency/scripts/eks_workshop_canary_script.js +++ /dev/null @@ -1,30 +0,0 @@ -const synthetics = require("Synthetics"); -const log = require("SyntheticsLogger"); - -const pageLoadBlueprint = async function () { - const PAGE_LOAD_TIMEOUT = 30; - const URL = process.env.INGRESS_URL || "http://localhost"; // Use environment variable or fallback - - let page = await synthetics.getPage(); - - await synthetics.executeStep("Navigate to " + URL, async function () { - const response = await page.goto(URL, { - waitUntil: "domcontentloaded", - timeout: PAGE_LOAD_TIMEOUT * 1000, - }); - - // Verify the page loaded successfully - if (response.status() !== 200) { - throw new Error(`Failed to load page. Status code: ${response.status()}`); - } - }); - - await synthetics.executeStep("Verify page content", async function () { - const pageTitle = await page.title(); - log.info("Page title: " + pageTitle); - }); -}; - -exports.handler = async () => { - return await pageLoadBlueprint(); -}; diff --git a/manifests/modules/resiliency/scripts/multi-az-get-pods.sh b/manifests/modules/resiliency/scripts/multi-az-get-pods.sh deleted file mode 100755 index f47649eb8..000000000 --- a/manifests/modules/resiliency/scripts/multi-az-get-pods.sh +++ /dev/null @@ -1,26 +0,0 @@ -#!/bin/bash - -GREEN='\033[0;32m' -RED='\033[0;31m' -NC='\033[0m' # No Color - -REGIONS=("us-west-2" "us-east-2") - -for REGION in "${REGIONS[@]}" -do - echo "Region: $REGION" - for az in a b c - do - AZ=$REGION$az - echo -n "------" - echo -n -e "${GREEN}$AZ${NC}" - echo "------" - for node in $(kubectl get nodes -l topology.kubernetes.io/zone=$AZ --no-headers 2>/dev/null | grep -v NotReady | cut -d " " -f1) - do - echo -e " ${RED}$node:${NC}" - kubectl get pods -n ui --no-headers --field-selector spec.nodeName=${node} 2>/dev/null | while read line; do echo " ${line}"; done - done - echo "" - done - echo "" -done \ No newline at end of file diff --git a/website/docs/resiliency/high-availability/index.md b/website/docs/observability/high-availability/00-setup.md similarity index 65% rename from website/docs/resiliency/high-availability/index.md rename to website/docs/observability/high-availability/00-setup.md index 6dec235d9..cf774351b 100644 --- a/website/docs/resiliency/high-availability/index.md +++ b/website/docs/observability/high-availability/00-setup.md @@ -1,16 +1,14 @@ --- title: "High Availability" -sidebar_position: 20 -sidebar_custom_props: { "module": true } +sidebar_position: 1 description: "Prepare your EKS environment to handle high availability scenarios effectively." 
--- :::tip Before you start Prepare your environment for this section: -```bash timeout=300 wait=30 -$ /manifests/modules/resiliency/.workshop/cleanup.sh -$ prepare-environment resiliency +```bash timeout=600 wait=30 +$ prepare-environment observability/resiliency ``` This will make the following changes to your lab environment: @@ -18,10 +16,9 @@ This will make the following changes to your lab environment: - Create the ingress load balancer - Create RBAC and Rolebindings - Install AWS Load Balancer controller -- Install ChaosMesh - Create an IAM role for AWS Fault Injection Simulator (FIS) -You can view the Terraform that applies these changes [here](https://github.com/VAR::MANIFESTS_OWNER/VAR::MANIFESTS_REPOSITORY/tree/VAR::MANIFESTS_REF/manifests/modules/resiliency/.workshop/terraform). +You can view the Terraform that applies these changes [here](https://github.com/VAR::MANIFESTS_OWNER/VAR::MANIFESTS_REPOSITORY/tree/VAR::MANIFESTS_REF/manifests/modules/observability/resiliency/.workshop/terraform). ::: ## Lab Overview @@ -30,11 +27,11 @@ In this lab, we'll explore various high availability scenarios and test the resi The experiments we'll conduct include: -1. Pod Failure Simulation: Using ChaosMesh to test your application's resilience to individual pod failures. -2. Node Failure without FIS: Manually simulating a node failure to observe Kubernetes' self-healing capabilities. -3. Partial Node Failure with FIS: Leveraging AWS Fault Injection Simulator to create a more controlled node failure scenario. -4. Complete Node Failure with FIS: Testing your cluster's response to a catastrophic failure of all nodes. -5. Availability Zone Failure: Simulating the loss of an entire AZ to validate your multi-AZ deployment strategy. +1. **Pod Failure Simulation**: Using ChaosMesh to test your application's resilience to individual pod failures. +2. **Node Failure without FIS**: Manually simulating a node failure to observe Kubernetes' self-healing capabilities. +3. **Partial Node Failure with FIS**: Leveraging AWS Fault Injection Simulator to create a more controlled node failure scenario. +4. **Complete Node Failure with FIS**: Testing your cluster's response to a catastrophic failure of all nodes. +5. **Availability Zone Failure**: Simulating the loss of an entire AZ to validate your multi-AZ deployment strategy. These experiments will help you understand: @@ -50,6 +47,5 @@ For more information on the components used in this lab, check out: - [Ingress Load Balancer](/docs/fundamentals/exposing/ingress/) - [Integrating with Kubernetes RBAC](/docs/security/cluster-access-management/kubernetes-rbac) -- [Chaos Mesh](https://chaos-mesh.org/) - [AWS Fault Injection Simulator](https://aws.amazon.com/fis/) ::: diff --git a/website/docs/observability/high-availability/01-scale.md b/website/docs/observability/high-availability/01-scale.md new file mode 100644 index 000000000..03c24ccaf --- /dev/null +++ b/website/docs/observability/high-availability/01-scale.md @@ -0,0 +1,96 @@ +--- +title: "Lab Setup: Chaos Mesh, Scaling, and Pod affinity" +sidebar_position: 2 +description: "Learn how to scale your pods, add Pod Anti-Affinity configurations, and use a helper script to visualize pod distribution." +--- + +This guide outlines steps to enhance the resilience of a UI service by implementing high availability practices. We'll cover installing helm, scaling the UI service, implementing pod anti-affinity, and using a helper script to visualize pod distribution across availability zones. 
+
+## Installing Chaos Mesh
+
+To enhance our cluster's resilience testing capabilities, we'll install Chaos Mesh. Chaos Mesh is a powerful chaos engineering tool for Kubernetes environments. It allows us to simulate various failure scenarios and test how our applications respond.
+
+Let's install Chaos Mesh in our cluster using Helm:
+
+```bash timeout=180 wait=30
+$ helm repo add chaos-mesh https://charts.chaos-mesh.org
+$ helm upgrade --install chaos-mesh chaos-mesh/chaos-mesh \
+  --namespace chaos-mesh \
+  --create-namespace \
+  --version 2.5.1 \
+  --set dashboard.create=true
+Release "chaos-mesh" does not exist. Installing it now.
+NAME: chaos-mesh
+LAST DEPLOYED: Tue Aug 20 04:44:31 2024
+NAMESPACE: chaos-mesh
+STATUS: deployed
+REVISION: 1
+TEST SUITE: None
+```
+
+## Scaling and Pod Anti-Affinity
+
+We use a Kustomize patch to modify the UI deployment, scaling it to 5 replicas and adding pod anti-affinity rules. This ensures UI pods are distributed across different nodes, reducing the impact of node failures.
+
+Here's the content of our patch file:
+
+```kustomization
+modules/observability/resiliency/high-availability/config/scale_and_affinity_patch.yaml
+Deployment/ui
+```
+
+Apply the changes using Kustomize patch and
+[Kustomization file](https://github.com/VAR::MANIFESTS_OWNER/VAR::MANIFESTS_REPOSITORY/tree/VAR::MANIFESTS_REF/manifests/modules/observability/resiliency/high-availability/config/kustomization.yaml):
+
+```bash wait=30
+$ kubectl delete deployment ui -n ui
+$ kubectl apply -k /manifests/modules/observability/resiliency/high-availability/config/
+```
+
+## Verify Retail Store Accessibility
+
+After applying these changes, it's important to verify that your retail store is accessible:
+
+```bash timeout=600 wait=30
+$ wait-for-lb $(kubectl get ingress -n ui -o jsonpath='{.items[0].status.loadBalancer.ingress[0].hostname}')
+Waiting for k8s-ui-ui-5ddc3ba496-721427594.us-west-2.elb.amazonaws.com...
+You can now access http://k8s-ui-ui-5ddc3ba496-721427594.us-west-2.elb.amazonaws.com
+```
+
+Once this command completes, it will output a URL. Open this URL in a new browser tab to verify that your retail store is accessible and functioning correctly.
+
+:::tip
+The retail URL may take up to 10 minutes to become operational.
+:::
+
+## Helper Script: Get Pods by AZ
+
+The `get-pods-by-az.sh` script helps visualize the distribution of Kubernetes pods across different availability zones in the terminal. You can view the script file on GitHub [here](https://github.com/VAR::MANIFESTS_OWNER/VAR::MANIFESTS_REPOSITORY/tree/VAR::MANIFESTS_REF/manifests/modules/observability/resiliency/scripts/get-pods-by-az.sh).
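The full script lives in the repository; conceptually it loops over the zone label on each node and lists the `ui` pods scheduled there. A stripped-down sketch of that idea, assuming the standard `topology.kubernetes.io/zone` node label and the `AWS_REGION` variable exported by this module, looks roughly like this:

```bash
# Sketch only: the repository version adds colors and nicer formatting
$ for az in a b c; do
    zone="${AWS_REGION:-us-west-2}${az}"
    echo "------${zone}------"
    for node in $(kubectl get nodes -l topology.kubernetes.io/zone=$zone -o name); do
      echo "  ${node#node/}:"
      kubectl get pods -n ui --no-headers --field-selector spec.nodeName=${node#node/}
    done
    echo ""
  done
```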
+ +### Script Execution + +To run the script and see the distribution of pods across availability zones, execute: + +```bash +$ $SCRIPT_DIR/get-pods-by-az.sh +------us-west-2a------ + ip-10-42-127-82.us-west-2.compute.internal: + ui-6dfb84cf67-6fzrk 1/1 Running 0 56s + ui-6dfb84cf67-dsp55 1/1 Running 0 56s + +------us-west-2b------ + ip-10-42-153-179.us-west-2.compute.internal: + ui-6dfb84cf67-2pxnp 1/1 Running 0 59s + +------us-west-2c------ + ip-10-42-186-246.us-west-2.compute.internal: + ui-6dfb84cf67-n8x4f 1/1 Running 0 61s + ui-6dfb84cf67-wljth 1/1 Running 0 61s +``` + +:::info +For more information on these changes, check out these sections: + +- [Chaos Mesh](https://chaos-mesh.org/) +- [Pod Affinity and Anti-Affinity](/docs/fundamentals/managed-node-groups/basics/affinity/) + ::: diff --git a/website/docs/resiliency/high-availability/02-pod-failure.md b/website/docs/observability/high-availability/02-pod-failure.md similarity index 63% rename from website/docs/resiliency/high-availability/02-pod-failure.md rename to website/docs/observability/high-availability/02-pod-failure.md index b1bcc55c8..5cbba76ef 100644 --- a/website/docs/resiliency/high-availability/02-pod-failure.md +++ b/website/docs/observability/high-availability/02-pod-failure.md @@ -1,6 +1,6 @@ --- title: "Simulating Pod Failure" -sidebar_position: 2 +sidebar_position: 3 description: "Simulate pod failure in your environment using ChaosMesh to test the resiliency of your application." --- @@ -17,15 +17,28 @@ The `pod-failure.sh` script utilizes Chaos Mesh, a powerful chaos engineering pl This experiment is repeatable, allowing you to run it multiple times to ensure consistent behavior and to test various scenarios or configurations. This is the script we will be using: ```file -manifests/modules/resiliency/scripts/pod-failure.sh +manifests/modules/observability/resiliency/scripts/pod-failure.sh ``` ## Running the Experiment To simulate the pod failure and monitor its effects, run the following command: -```bash +```bash timeout=90 wait=30 $ $SCRIPT_DIR/pod-failure.sh && SECONDS=0; while [ $SECONDS -lt 30 ]; do clear; $SCRIPT_DIR/get-pods-by-az.sh; sleep 1; done +------us-west-2a------ + ip-10-42-127-82.us-west-2.compute.internal: + ui-6dfb84cf67-dsp55 1/1 Running 0 2m10s + ui-6dfb84cf67-gzd9s 1/1 Running 0 8s + +------us-west-2b------ + ip-10-42-153-179.us-west-2.compute.internal: + ui-6dfb84cf67-2pxnp 1/1 Running 0 2m13s + +------us-west-2c------ + ip-10-42-186-246.us-west-2.compute.internal: + ui-6dfb84cf67-n8x4f 1/1 Running 0 2m17s + ui-6dfb84cf67-wljth 1/1 Running 0 2m17s ``` This command does the following: @@ -38,8 +51,14 @@ During the experiment, you should observe one pod disappearing and then reappear To get a more detailed view of the pods in the `ui` namespace, use the following command: -```bash +```bash wait=15 $ kubectl get pods -n ui -o wide +NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES +ui-6dfb84cf67-2pxnp 1/1 Running 0 2m56s 10.42.154.151 ip-10-42-153-179.us-west-2.compute.internal +ui-6dfb84cf67-dsp55 1/1 Running 0 2m56s 10.42.126.161 ip-10-42-127-82.us-west-2.compute.internal +ui-6dfb84cf67-gzd9s 1/1 Running 0 71s 10.42.126.246 ip-10-42-127-82.us-west-2.compute.internal +ui-6dfb84cf67-n8x4f 1/1 Running 0 2m56s 10.42.190.250 ip-10-42-186-246.us-west-2.compute.internal +ui-6dfb84cf67-wljth 1/1 Running 0 2m56s 10.42.190.249 ip-10-42-186-246.us-west-2.compute.internal ``` This will show you the status, IP addresses, and nodes for each pod in the `ui` namespace. 
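Behind the scenes, `pod-failure.sh` drives Chaos Mesh. The exact experiment definition is inside the script, but a minimal PodChaos resource of the same general shape can be applied by hand; the name and label selector below are assumptions for illustration, so adjust them to match your pods:

```bash
$ cat <<EOF | kubectl apply -f -
apiVersion: chaos-mesh.org/v1alpha1
kind: PodChaos
metadata:
  name: ui-pod-kill-demo            # hypothetical name, not the one the script creates
  namespace: ui
spec:
  action: pod-kill                  # terminate one matching pod; the Deployment replaces it
  mode: one
  selector:
    namespaces:
      - ui
    labelSelectors:
      app.kubernetes.io/name: ui    # assumed label; check your ui pod labels first
EOF
```

Deleting the object afterwards (`kubectl delete podchaos ui-pod-kill-demo -n ui`) cleans up the experiment record.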
@@ -48,11 +67,13 @@ This will show you the status, IP addresses, and nodes for each pod in the `ui` An essential aspect of this experiment is to ensure that your retail store application remains operational throughout the pod failure and recovery process. To verify the availability of the retail store, use the following command to fetch and access the store's URL: -```bash +```bash timeout=600 wait=30 $ wait-for-lb $(kubectl get ingress -n ui -o jsonpath='{.items[0].status.loadBalancer.ingress[0].hostname}') +Waiting for k8s-ui-ui-5ddc3ba496-721427594.us-west-2.elb.amazonaws.com... +You can now access http://k8s-ui-ui-5ddc3ba496-721427594.us-west-2.elb.amazonaws.com ``` -This command retrieves the load balancer hostname for the ingress and waits for it to become available. Once ready, you can access the retail store through this URL to confirm that it's still functioning correctly despite the simulated pod failure. +Once ready, you can access the retail store through this URL to confirm that it's still functioning correctly despite the simulated pod failure. ## Conclusion diff --git a/website/docs/resiliency/high-availability/03-node-failure-no-fis.md b/website/docs/observability/high-availability/03-node-failure-no-fis.md similarity index 74% rename from website/docs/resiliency/high-availability/03-node-failure-no-fis.md rename to website/docs/observability/high-availability/03-node-failure-no-fis.md index 494bddbd1..ac487042c 100644 --- a/website/docs/resiliency/high-availability/03-node-failure-no-fis.md +++ b/website/docs/observability/high-availability/03-node-failure-no-fis.md @@ -1,6 +1,6 @@ --- title: "Simulating Node Failure without FIS" -sidebar_position: 3 +sidebar_position: 4 description: "Manually simulate a node failure in your Kubernetes environment to test the resilience of your applications without using AWS FIS." --- @@ -13,7 +13,7 @@ This experiment simulates a node failure manually in your Kubernetes cluster to The `node-failure.sh` script will manually stop an EC2 instance to simulate node failure. Here is the script we will use: ```file -manifests/modules/resiliency/scripts/node-failure.sh +manifests/modules/observability/resiliency/scripts/node-failure.sh ``` It's important to note that this experiment is repeatable, allowing you to run it multiple times to ensure consistent behavior and to test various scenarios or configurations. @@ -22,8 +22,22 @@ It's important to note that this experiment is repeatable, allowing you to run i To simulate the node failure and monitor its effects, run the following command: -```bash +```bash timeout=180 wait=30 $ $SCRIPT_DIR/node-failure.sh && SECONDS=0; while [ $SECONDS -lt 120 ]; do clear; $SCRIPT_DIR/get-pods-by-az.sh; sleep 1; done +------us-west-2a------ + ip-10-42-127-82.us-west-2.compute.internal: + ui-6dfb84cf67-dsp55 1/1 Running 0 10m + ui-6dfb84cf67-gzd9s 1/1 Running 0 8m19s + +------us-west-2b------ + ip-10-42-133-195.us-west-2.compute.internal: + No resources found in ui namespace. + +------us-west-2c------ + ip-10-42-186-246.us-west-2.compute.internal: + ui-6dfb84cf67-4bmjm 1/1 Running 0 44s + ui-6dfb84cf67-n8x4f 1/1 Running 0 10m + ui-6dfb84cf67-wljth 1/1 Running 0 10m ``` This command will stop the selected EC2 instance and monitor the pod distribution for 2 minutes, observing how the system redistributes workloads. 
@@ -40,10 +54,28 @@ Throughout this process, the total number of running pods should remain constant While waiting for the node to finish coming back online, we will verify the cluster's self-healing capabilities and potentially rebalance the pod distribution if necessary. Since the cluster often recovers on its own, we'll focus on checking the current state and ensuring an optimal distribution of pods. -Use the following [script](https://github.com/VAR::MANIFESTS_OWNER/VAR::MANIFESTS_REPOSITORY/tree/VAR::MANIFESTS_REF/manifests/modules/resiliency/scripts/verify-cluster.sh) to verify the cluster state and rebalance pods: +Use the following [script](https://github.com/VAR::MANIFESTS_OWNER/VAR::MANIFESTS_REPOSITORY/tree/VAR::MANIFESTS_REF/manifests/modules/observability/resiliency/scripts/verify-cluster.sh) to verify the cluster state and rebalance pods: -```bash +```bash timeout=300 wait=30 $ $SCRIPT_DIR/verify-cluster.sh + +==== Final Pod Distribution ==== + +------us-west-2a------ + ip-10-42-127-82.us-west-2.compute.internal: + ui-6dfb84cf67-vwk4x 1/1 Running 0 25s + +------us-west-2b------ + ip-10-42-133-195.us-west-2.compute.internal: + ui-6dfb84cf67-2rb6s 1/1 Running 0 27s + ui-6dfb84cf67-dk495 1/1 Running 0 27s + +------us-west-2c------ + ip-10-42-186-246.us-west-2.compute.internal: + ui-6dfb84cf67-7bftc 1/1 Running 0 29s + ui-6dfb84cf67-nqgdn 1/1 Running 0 29s + + ``` This script will: @@ -56,8 +88,10 @@ This script will: After simulating the node failure, we can verify that the retail store application remains accessible. Use the following command to check its availability: -```bash +```bash timeout=600 wait=30 $ wait-for-lb $(kubectl get ingress -n ui -o jsonpath='{.items[0].status.loadBalancer.ingress[0].hostname}') +Waiting for k8s-ui-ui-5ddc3ba496-721427594.us-west-2.elb.amazonaws.com... +You can now access http://k8s-ui-ui-5ddc3ba496-721427594.us-west-2.elb.amazonaws.com ``` This command retrieves the load balancer hostname for the ingress and waits for it to become available. Once ready, you can access the retail store through this URL to confirm that it's still functioning correctly despite the simulated node failure. @@ -65,13 +99,13 @@ This command retrieves the load balancer hostname for the ingress and waits for :::caution The retail url may take 10 minutes to become operational. You can optionally continue on with the lab by pressing `ctrl` + `z` to move operation to the background. To access it again input: -```bash -$ fg +```bash test=false +$ fg %1 ``` The url may not become operational by the time `wait-for-lb` times out. 
In that case, it should become operational after running the command again: -```bash +```bash test=false $ wait-for-lb $(kubectl get ingress -n ui -o jsonpath='{.items[0].status.loadBalancer.ingress[0].hostname}') ``` diff --git a/website/docs/resiliency/high-availability/04-node-failure-partial-fis.md b/website/docs/observability/high-availability/04-node-failure-partial-fis.md similarity index 58% rename from website/docs/resiliency/high-availability/04-node-failure-partial-fis.md rename to website/docs/observability/high-availability/04-node-failure-partial-fis.md index 0d5a738db..4ca8d6c4b 100644 --- a/website/docs/resiliency/high-availability/04-node-failure-partial-fis.md +++ b/website/docs/observability/high-availability/04-node-failure-partial-fis.md @@ -1,6 +1,6 @@ --- title: "Simulating Partial Node Failure with FIS" -sidebar_position: 4 +sidebar_position: 5 description: "Simulate a partial node failures in your Kubernetes environment using AWS Fault Injection Simulator to test application resiliency." --- @@ -29,18 +29,20 @@ For more information on AWS FIS, check out: - [What is AWS Fault Injection Service?](https://docs.aws.amazon.com/fis/latest/userguide/what-is.html) - [AWS Fault Injection Simulator Console](https://console.aws.amazon.com/fis/home) +- [AWS Systems Manager, Automation](https://console.aws.amazon.com/systems-manager/automation/executions) ::: ## Experiment Details This experiment differs from the previous manual node failure simulation in several ways: -1. Automated execution: FIS manages the experiment, allowing for more controlled and repeatable tests. -2. Partial failure: Instead of simulating a complete node failure, we're testing a scenario where a portion of the nodes fail. -3. Scale: FIS allows us to target multiple nodes simultaneously, providing a more realistic large-scale failure scenario. -4. Precision: We can specify exact percentages of instances to terminate, giving us fine-grained control over the experiment. +1. **Automated execution**: FIS manages the experiment, allowing for more controlled and repeatable tests compared to the manual script execution in the previous experiment. +2. **Partial failure**: Instead of simulating a complete failure of a single node, FIS allows us to simulate a partial failure across multiple nodes. This provides a more nuanced and realistic failure scenario. +3. **Scale**: FIS allows us to target multiple nodes simultaneously. This allows us to test the resilience of our application at a larger scale compared to the single-node failure in the manual experiment. +4. **Precision**: We can specify exact percentages of instances to terminate, giving us fine-grained control over the experiment. This level of control wasn't possible in the manual experiment, where we were limited to terminating entire nodes. +5. **Minimal disruption**: The FIS experiment is designed to maintain service availability throughout the test, whereas the manual node failure might have caused temporary disruptions to the retail store's accessibility. -In this experiment, FIS will terminate 66% of the instances in two node groups, simulating a significant partial failure of our cluster. +These differences allows for a more comprehensive and realistic test of our application's resilience to failures, while maintaining better control over the experiment parameters. In this experiment, FIS will terminate 66% of the instances in two node groups, simulating a significant partial failure of our cluster. 
Similar to previous experiments, this experiment is also repeatable ## Creating the Node Failure Experiment @@ -54,8 +56,20 @@ $ NODE_EXP_ID=$(aws fis create-experiment-template --cli-input-json '{"descripti Execute the FIS experiment to simulate the node failure and monitor the response: -```bash +```bash timeout=240 wait=30 $ aws fis start-experiment --experiment-template-id $NODE_EXP_ID --output json && SECONDS=0; while [ $SECONDS -lt 180 ]; do clear; $SCRIPT_DIR/get-pods-by-az.sh; sleep 1; done +------us-west-2a------ + ip-10-42-127-82.us-west-2.compute.internal: + ui-6dfb84cf67-s6kw4 1/1 Running 0 2m16s + ui-6dfb84cf67-vwk4x 1/1 Running 0 4m54s + +------us-west-2b------ + +------us-west-2c------ + ip-10-42-180-16.us-west-2.compute.internal: + ui-6dfb84cf67-29xtf 1/1 Running 0 79s + ui-6dfb84cf67-68hbw 1/1 Running 0 79s + ui-6dfb84cf67-plv9f 1/1 Running 0 79s ``` This command triggers the node failure and monitors the pods for 3 minutes, allowing you to observe how the cluster responds to losing a significant portion of its capacity. @@ -69,10 +83,25 @@ During the experiment, you should observe the following: Your retail url should stay operational unlike the node failure without FIS. :::note -To verify clusters and rebalance pods, you can run: +To verify nodes and rebalance pods, you can run: -```bash +```bash timeout=240 wait=30 $ $SCRIPT_DIR/verify-cluster.sh +==== Final Pod Distribution ==== + +------us-west-2a------ + ip-10-42-127-82.us-west-2.compute.internal: + ui-6dfb84cf67-v2xj6 1/1 Running 0 14s + +------us-west-2b------ + ip-10-42-148-187.us-west-2.compute.internal: + ui-6dfb84cf67-4xq4n 1/1 Running 0 16s + ui-6dfb84cf67-56d6d 1/1 Running 0 16s + +------us-west-2c------ + ip-10-42-180-16.us-west-2.compute.internal: + ui-6dfb84cf67-86mpz 1/1 Running 0 18s + ui-6dfb84cf67-nhx4j 1/1 Running 0 18s ``` ::: @@ -81,8 +110,10 @@ $ $SCRIPT_DIR/verify-cluster.sh Ensure that your retail store application remains operational throughout the partial node failure. Use the following command to check its availability: -```bash +```bash timeout=600 wait=30 $ wait-for-lb $(kubectl get ingress -n ui -o jsonpath='{.items[0].status.loadBalancer.ingress[0].hostname}') +Waiting for k8s-ui-ui-5ddc3ba496-721427594.us-west-2.elb.amazonaws.com... +You can now access http://k8s-ui-ui-5ddc3ba496-721427594.us-west-2.elb.amazonaws.com ``` :::tip @@ -91,10 +122,6 @@ The retail url may take 10 minutes to become operational. Despite the partial node failure, the retail store should continue to serve traffic, demonstrating the resilience of your deployment setup. -:::caution -Partial node failures test the limits of your application's failover capabilities. Monitor and determine how well your applications and services recover from such events. -::: - ## Conclusion This partial node failure simulation using AWS FIS demonstrates several key aspects of your Kubernetes cluster's resilience: @@ -113,9 +140,9 @@ Key takeaways from this experiment: By leveraging AWS FIS for such experiments, you gain several advantages: -1. Repeatability: You can run this experiment multiple times to ensure consistent behavior. -2. Automation: FIS allows you to schedule regular resilience tests, ensuring your system maintains its fault-tolerant capabilities over time. -3. Comprehensive testing: You can create more complex scenarios involving multiple AWS services to test your entire application stack. -4. 
Controlled chaos: FIS provides a safe, managed environment for conducting chaos engineering experiments without risking unintended damage to your production systems. +1. **Repeatability**: You can run this experiment multiple times to ensure consistent behavior. +2. **Automation**: FIS allows you to schedule regular resilience tests, ensuring your system maintains its fault-tolerant capabilities over time. +3. **Comprehensive testing**: You can create more complex scenarios involving multiple AWS services to test your entire application stack. +4. **Controlled chaos**: FIS provides a safe, managed environment for conducting chaos engineering experiments without risking unintended damage to your production systems. Regular execution of such experiments helps build confidence in your system's resilience and provides valuable insights for continuous improvement of your architecture and operational procedures. diff --git a/website/docs/resiliency/high-availability/05-node-failure-complete-fis.md b/website/docs/observability/high-availability/05-node-failure-complete-fis.md similarity index 58% rename from website/docs/resiliency/high-availability/05-node-failure-complete-fis.md rename to website/docs/observability/high-availability/05-node-failure-complete-fis.md index a7f142b71..722341fd0 100644 --- a/website/docs/resiliency/high-availability/05-node-failure-complete-fis.md +++ b/website/docs/observability/high-availability/05-node-failure-complete-fis.md @@ -1,6 +1,6 @@ --- title: "Simulating Complete Node Failure with FIS" -sidebar_position: 5 +sidebar_position: 6 description: "Demonstrates the impact of a complete node failure on a Kubernetes environment using AWS Fault Injection Simulator." --- @@ -8,15 +8,11 @@ description: "Demonstrates the impact of a complete node failure on a Kubernetes ## Overview -This experiment extends our previous partial node failure test to simulate a complete failure of all nodes in our EKS cluster. It demonstrates how AWS Fault Injection Simulator (FIS) can be used to test extreme scenarios and validate your system's resilience under catastrophic conditions. - -:::info Important -This test simulates a worst-case scenario. It's designed for controlled environments with thoroughly tested recovery mechanisms. -::: +This experiment extends our previous partial node failure test to simulate a complete failure of all nodes in our EKS cluster. This is essentially a cluster failure. It demonstrates how AWS Fault Injection Simulator (FIS) can be used to test extreme scenarios and validate your system's resilience under catastrophic conditions. ## Experiment Details -Unlike the partial node failure simulation, this experiment: +This experiment is similar to the partial node failure as it is repeatable. Unlike the partial node failure simulation, this experiment: 1. Terminates 100% of the instances in all node groups. 2. Tests your cluster's ability to recover from a state of complete failure. 
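While this experiment (or any of the node failure experiments) is running, it can also help to watch node state directly alongside the pod view from `get-pods-by-az.sh`. A plain kubectl watch is enough and assumes nothing beyond cluster access:

```bash test=false
$ kubectl get nodes -L topology.kubernetes.io/zone -w
```

You should see nodes go `NotReady`, disappear from the list, and then rejoin under new names as the managed node groups replace the terminated instances.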
@@ -34,24 +30,53 @@ $ FULL_NODE_EXP_ID=$(aws fis create-experiment-template --cli-input-json '{"desc
 Execute the FIS experiment and monitor the cluster's response:
 
-```bash
-$ aws fis start-experiment --experiment-template-id $FULL_NODE_EXP_ID --output json && SECONDS=0; while [ $SECONDS -lt 300 ]; do clear; $SCRIPT_DIR/get-pods-by-az.sh; sleep 1; done
+```bash timeout=420 wait=30
+$ aws fis start-experiment --experiment-template-id $FULL_NODE_EXP_ID --output json && SECONDS=0; while [ $SECONDS -lt 360 ]; do clear; $SCRIPT_DIR/get-pods-by-az.sh; sleep 1; done
+------us-west-2a------
+ ip-10-42-106-250.us-west-2.compute.internal:
+ No resources found in ui namespace.
+
+------us-west-2b------
+ ip-10-42-141-133.us-west-2.compute.internal:
+ ui-6dfb84cf67-n9xns 1/1 Running 0 4m8s
+ ui-6dfb84cf67-slknv 1/1 Running 0 2m48s
+
+------us-west-2c------
+ ip-10-42-179-59.us-west-2.compute.internal:
+ ui-6dfb84cf67-5xht5 1/1 Running 0 4m52s
+ ui-6dfb84cf67-b6xbf 1/1 Running 0 4m10s
+ ui-6dfb84cf67-fpg8j 1/1 Running 0 4m52s
 ```
 
-This command will show the pods distribution over 5 minutes while we observe the experiment. We should see:
+This command will show the pod distribution over 6 minutes while we observe the experiment. We should see:
 
 1. Shortly after the experiment is initiated, all nodes and pods disappear.
 2. After about 2 minutes, the first node and some pods will come back online.
 3. Around 4 minutes, a second node appears and more pods start up.
-4. At 5 minutes, continued recovery as the last node come online.
+4. At 6 minutes, continued recovery as the last node comes online.
 
-Due to the severity of the experiment, the retail store url will not stay operational during testing. The url should come back up after the final node is operational.
+Due to the severity of the experiment, the retail store URL will not stay operational during testing. The URL should come back up after the final node is operational. If the final node is not operational after this test, run `$SCRIPT_DIR/verify-cluster.sh` to wait for it to come back online before proceeding.
 
 :::note
-To verify clusters and rebalance pods, you can run:
+To verify nodes and rebalance pods, you can run:
 
-```bash
+```bash timeout=240 wait=30
 $ $SCRIPT_DIR/verify-cluster.sh
+==== Final Pod Distribution ====
+
+------us-west-2a------
+ ip-10-42-106-250.us-west-2.compute.internal:
+ ui-6dfb84cf67-4fjhh 1/1 Running 0 15s
+ ui-6dfb84cf67-gkrtn 1/1 Running 0 14s
+
+------us-west-2b------
+ ip-10-42-141-133.us-west-2.compute.internal:
+ ui-6dfb84cf67-7qnkz 1/1 Running 0 16s
+ ui-6dfb84cf67-n58b9 1/1 Running 0 16s
+
+------us-west-2c------
+ ip-10-42-179-59.us-west-2.compute.internal:
+ ui-6dfb84cf67-lvdc2 1/1 Running 0 18s
 ```
 
 :::
 
@@ -60,8 +85,10 @@ $ $SCRIPT_DIR/verify-cluster.sh
 Check the retail store application's recovery:
 
-```bash
+```bash timeout=600 wait=30
 $ wait-for-lb $(kubectl get ingress -n ui -o jsonpath='{.items[0].status.loadBalancer.ingress[0].hostname}')
+Waiting for k8s-ui-ui-5ddc3ba496-721427594.us-west-2.elb.amazonaws.com...
+You can now access http://k8s-ui-ui-5ddc3ba496-721427594.us-west-2.elb.amazonaws.com ``` :::tip diff --git a/website/docs/observability/high-availability/06-az-setup.md b/website/docs/observability/high-availability/06-az-setup.md new file mode 100644 index 000000000..04a3bbb83 --- /dev/null +++ b/website/docs/observability/high-availability/06-az-setup.md @@ -0,0 +1,100 @@ +--- +title: "AZ Failure Experiment Setup" +sidebar_position: 7 +description: "Scale your application to two instances and prepare for an AZ failure simulation experiment." +--- + +### Scaling Instances + +To see the full impact of an Availability Zone (AZ) failure, let's first scale up to two instances per AZ as well as increase the number of pods up to 9: + +```bash timeout=120 wait=30 +$ ASG_NAME=$(aws autoscaling describe-auto-scaling-groups --query "AutoScalingGroups[? Tags[? (Key=='eks:cluster-name') && Value=='eks-workshop']].AutoScalingGroupName" --output text) +$ aws autoscaling update-auto-scaling-group \ + --auto-scaling-group-name $ASG_NAME \ + --desired-capacity 6 \ + --min-size 6 \ + --max-size 6 +$ sleep 60 +$ kubectl scale deployment ui --replicas=9 -n ui +$ $SCRIPT_DIR/get-pods-by-az.sh +------us-west-2a------ + ip-10-42-100-4.us-west-2.compute.internal: + ui-6dfb84cf67-xbbj4 0/1 ContainerCreating 0 1s + ip-10-42-106-250.us-west-2.compute.internal: + ui-6dfb84cf67-4fjhh 1/1 Running 0 5m20s + ui-6dfb84cf67-gkrtn 1/1 Running 0 5m19s + +------us-west-2b------ + ip-10-42-139-198.us-west-2.compute.internal: + ui-6dfb84cf67-7rfkf 0/1 ContainerCreating 0 4s + ip-10-42-141-133.us-west-2.compute.internal: + ui-6dfb84cf67-7qnkz 1/1 Running 0 5m23s + ui-6dfb84cf67-n58b9 1/1 Running 0 5m23s + +------us-west-2c------ + ip-10-42-175-140.us-west-2.compute.internal: + ui-6dfb84cf67-8xfk8 0/1 ContainerCreating 0 8s + ui-6dfb84cf67-s55nb 0/1 ContainerCreating 0 8s + ip-10-42-179-59.us-west-2.compute.internal: + ui-6dfb84cf67-lvdc2 1/1 Running 0 5m26s +``` + +### Setting up a Synthetic Canary + +Before starting the experiment, set up a synthetic canary for heartbeat monitoring: + +1. First, create an S3 bucket for the canary artifacts: + +```bash wait=15 +$ export BUCKET_NAME="eks-workshop-canary-artifacts-$(date +%s)" +$ aws s3 mb s3://$BUCKET_NAME --region $AWS_REGION +make_bucket: eks-workshop-canary-artifacts-1724131402 +``` + +2. Create the blueprint: + +```file +manifests/modules/observability/resiliency/scripts/create-blueprint.sh +``` + +Place this canary blueprint into the bucket: + +```bash wait=15 +$ $SCRIPT_DIR/create-blueprint.sh +upload: ./canary.zip to s3://eks-workshop-canary-artifacts-1724131402/canary-scripts/canary.zip +Canary script has been zipped and uploaded to s3://eks-workshop-canary-artifacts-1724131402/canary-scripts/canary.zip +The script is configured to check the URL: http://k8s-ui-ui-5ddc3ba496-721427594.us-west-2.elb.amazonaws.com +``` + +3. 
Create a synthetic canary with a Cloudwatch alarm: + +```bash timeout=120 wait=30 +$ aws synthetics create-canary \ + --name eks-workshop-canary \ + --artifact-s3-location "s3://$BUCKET_NAME/canary-artifacts/" \ + --execution-role-arn $CANARY_ROLE_ARN \ + --runtime-version syn-nodejs-puppeteer-9.0 \ + --schedule "Expression=rate(1 minute)" \ + --code "Handler=canary.handler,S3Bucket=$BUCKET_NAME,S3Key=canary-scripts/canary.zip" \ + --region $AWS_REGION +$ sleep 45 +$ aws synthetics start-canary --name eks-workshop-canary --region $AWS_REGION +$ aws cloudwatch put-metric-alarm \ + --alarm-name "eks-workshop-canary-alarm" \ + --metric-name SuccessPercent \ + --namespace CloudWatchSynthetics \ + --statistic Average \ + --period 60 \ + --threshold 95 \ + --comparison-operator LessThanThreshold \ + --dimensions Name=CanaryName,Value=eks-workshop-canary \ + --evaluation-periods 1 \ + --alarm-description "Alarm when Canary success rate drops below 95%" \ + --unit Percent \ + --region $AWS_REGION +``` + +This sets up a canary that checks the health of your application every minute and a CloudWatch alarm that triggers if the success percentage falls below 95%. + +With these steps completed, your application is now scaled across to two instances in AZs and you've set up the necessary monitoring for the upcoming AZ failure simulation experiment. diff --git a/website/docs/resiliency/high-availability/07-az-failure.md b/website/docs/observability/high-availability/07-az-failure.md similarity index 50% rename from website/docs/resiliency/high-availability/07-az-failure.md rename to website/docs/observability/high-availability/07-az-failure.md index c164d3c85..97d1043b3 100644 --- a/website/docs/resiliency/high-availability/07-az-failure.md +++ b/website/docs/observability/high-availability/07-az-failure.md @@ -1,6 +1,6 @@ --- title: "Simulating AZ Failure" -sidebar_position: 7 +sidebar_position: 8 description: "This experiment simulates an Availability Zone failure to test the resilience of your Kubernetes environment hosted on AWS EKS." --- @@ -8,18 +8,11 @@ description: "This experiment simulates an Availability Zone failure to test the ## Overview -This experiment simulates an Availability Zone (AZ) failure, demonstrating the resilience of your application when faced with significant infrastructure disruptions. By leveraging AWS Fault Injection Simulator (FIS) and additional AWS services, we'll test how well your system maintains functionality when an entire AZ becomes unavailable. +This repeatable experiment simulates an Availability Zone (AZ) failure, demonstrating the resilience of your application when faced with significant infrastructure disruptions. By leveraging AWS Fault Injection Simulator (FIS) and additional AWS services, we'll test how well your system maintains functionality when an entire AZ becomes unavailable. ### Setting up the Experiment -Retrieve the Auto Scaling Group (ASG) name associated with your EKS cluster: - -```bash -$ ASG_NAME_BOTH=$(aws autoscaling describe-auto-scaling-groups --query "AutoScalingGroups[? Tags[? 
(Key=='eks:cluster-name') && Value=='eks-workshop']]".AutoScalingGroupName --output text)
-$ ASG_NAME=$(echo $ASG_NAME_BOTH | awk '{print $1}')
-```
-
-Create the FIS experiment template to simulate the AZ failure:
+Retrieve the Auto Scaling Group (ASG) name associated with your EKS cluster and create the FIS experiment template to simulate the AZ failure:
 
 ```bash
 $ ZONE_EXP_ID=$(aws fis create-experiment-template --cli-input-json '{"description":"publicdocument-azfailure","targets":{},"actions":{"azfailure":{"actionId":"aws:ssm:start-automation-execution","parameters":{"documentArn":"arn:aws:ssm:us-west-2::document/AWSResilienceHub-SimulateAzOutageInAsgTest_2020-07-23","documentParameters":"{\"AutoScalingGroupName\":\"'$ASG_NAME'\",\"CanaryAlarmName\":\"eks-workshop-canary-alarm\",\"AutomationAssumeRole\":\"'$FIS_ROLE_ARN'\",\"IsRollback\":\"false\",\"TestDurationInMinutes\":\"2\"}","maxDuration":"PT6M"}}},"stopConditions":[{"source":"none"}],"roleArn":"'$FIS_ROLE_ARN'","tags":{"ExperimentSuffix":"'$RANDOM_SUFFIX'"}}' --output json | jq -r '.experimentTemplate.id')
@@ -29,21 +22,70 @@ $ ZONE_EXP_ID=$(aws fis create-experiment-template --cli-input-json '{"descripti
 Execute the FIS experiment to simulate the AZ failure:
 
-```bash
-$ aws fis start-experiment --experiment-template-id $ZONE_EXP_ID --output json && SECONDS=0; while [ $SECONDS -lt 450 ]; do clear; $SCRIPT_DIR/get-pods-by-az.sh; sleep 1; done
+```bash timeout=560 wait=30
+$ aws fis start-experiment --experiment-template-id $ZONE_EXP_ID --output json && SECONDS=0; while [ $SECONDS -lt 480 ]; do clear; $SCRIPT_DIR/get-pods-by-az.sh; sleep 1; done
+------us-west-2a------
+ ip-10-42-100-4.us-west-2.compute.internal:
+ ui-6dfb84cf67-h57sp 1/1 Running 0 12m
+ ui-6dfb84cf67-h87h8 1/1 Running 0 12m
+ ip-10-42-111-144.us-west-2.compute.internal:
+ ui-6dfb84cf67-4xvmc 1/1 Running 0 11m
+ ui-6dfb84cf67-crl2s 1/1 Running 0 6m23s
+
+------us-west-2b------
+ ip-10-42-141-243.us-west-2.compute.internal:
+ No resources found in ui namespace.
+ ip-10-42-150-255.us-west-2.compute.internal:
+ No resources found in ui namespace.
+
+------us-west-2c------
+ ip-10-42-164-250.us-west-2.compute.internal:
+ ui-6dfb84cf67-fl4hk 1/1 Running 0 11m
+ ui-6dfb84cf67-mptkw 1/1 Running 0 11m
+ ui-6dfb84cf67-zxnts 1/1 Running 0 6m27s
+ ip-10-42-178-108.us-west-2.compute.internal:
+ ui-6dfb84cf67-8vmcz 1/1 Running 0 6m28s
+ ui-6dfb84cf67-wknc5 1/1 Running 0 12m
 ```
 
-This command starts the experiment and monitors the distribution and status of pods across different nodes and AZs for 7.5 minutes to understand the immediate impact of the simulated AZ failure.
+This command starts the experiment and monitors the distribution and status of pods across different nodes and AZs for 8 minutes to understand the immediate impact of the simulated AZ failure.
 
 During the experiment, you should observe the following sequence of events:
 
-- input here
+1. After about 3 minutes, one Availability Zone will fail.
+2. Looking at the Synthetic Canary in the CloudWatch console, you will see it change state to `In Alarm`.
+3. Around 4 minutes after the experiment started, you will see pods reappearing in the other AZs.
+4. After the experiment completes, at about 7 minutes, the AZ is marked as healthy again and replacement EC2 instances are launched as a result of an EC2 Auto Scaling action, bringing the number of instances in each AZ back to 2.
+
+During this time, the retail URL will stay available, showing how resilient EKS is to AZ failures.
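If you prefer the CLI over the console for the heartbeat check, you can query the canary and its alarm directly. This sketch only assumes the `eks-workshop-canary` and `eks-workshop-canary-alarm` names created earlier in this module; expect the alarm to report `ALARM` while the zone is impaired and `OK` once the canary starts succeeding again:

```bash test=false
$ aws cloudwatch describe-alarms \
    --alarm-names "eks-workshop-canary-alarm" \
    --query 'MetricAlarms[0].StateValue' --output text
$ aws synthetics get-canary \
    --name eks-workshop-canary \
    --query 'Canary.Status.State' --output text
```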
:::note To verify clusters and rebalance pods, you can run: -```bash -$ $SCRIPT_DIR/verify-cluster.sh +```bash timeout=240 wait=30 +$ $SCRIPT_DIR/AZ-verify-clusters.sh +==== Final Pod Distribution ==== + +------us-west-2a------ + ip-10-42-100-4.us-west-2.compute.internal: + ui-6dfb84cf67-lwd86 1/1 Running 0 16s + ip-10-42-111-144.us-west-2.compute.internal: + ui-6dfb84cf67-hfrcf 1/1 Running 0 17s + ui-6dfb84cf67-qdr4s 1/1 Running 0 17s + +------us-west-2b------ + ip-10-42-141-243.us-west-2.compute.internal: + ui-6dfb84cf67-dxtg4 1/1 Running 0 19s + ip-10-42-150-255.us-west-2.compute.internal: + ui-6dfb84cf67-jvvg6 1/1 Running 0 20s + ui-6dfb84cf67-tmbzc 1/1 Running 0 20s + +------us-west-2c------ + ip-10-42-164-250.us-west-2.compute.internal: + ui-6dfb84cf67-k5mn8 1/1 Running 0 23s + ui-6dfb84cf67-zbm8j 1/1 Running 0 23s + ip-10-42-178-108.us-west-2.compute.internal: + ui-6dfb84cf67-svwqp 1/1 Running 0 24s ``` ::: @@ -52,8 +94,10 @@ $ $SCRIPT_DIR/verify-cluster.sh After the experiment, verify that your application remains operational despite the simulated AZ failure: -```bash +```bash timeout=600 wait=30 $ wait-for-lb $(kubectl get ingress -n ui -o jsonpath='{.items[0].status.loadBalancer.ingress[0].hostname}') +Waiting for k8s-ui-ui-5ddc3ba496-721427594.us-west-2.elb.amazonaws.com... +You can now access http://k8s-ui-ui-5ddc3ba496-721427594.us-west-2.elb.amazonaws.com ``` This step confirms the effectiveness of your Kubernetes cluster's high availability configuration and its ability to maintain service continuity during significant infrastructure disruptions. diff --git a/website/docs/resiliency/index.md b/website/docs/observability/high-availability/index.md similarity index 97% rename from website/docs/resiliency/index.md rename to website/docs/observability/high-availability/index.md index 0252fee19..3d9254faa 100644 --- a/website/docs/resiliency/index.md +++ b/website/docs/observability/high-availability/index.md @@ -1,6 +1,7 @@ --- title: "Resiliency" -sidebar_position: 11 +sidebar_position: 70 +sidebar_custom_props: { "module": true } weight: 10 --- diff --git a/website/docs/resiliency/high-availability/tests/hook-suite.sh b/website/docs/observability/high-availability/tests/hook-suite.sh similarity index 100% rename from website/docs/resiliency/high-availability/tests/hook-suite.sh rename to website/docs/observability/high-availability/tests/hook-suite.sh diff --git a/website/docs/resiliency/high-availability/01-setup.md b/website/docs/resiliency/high-availability/01-setup.md deleted file mode 100644 index 31821d93a..000000000 --- a/website/docs/resiliency/high-availability/01-setup.md +++ /dev/null @@ -1,61 +0,0 @@ ---- -title: "Scaling and Pod Anti-Affinity for UI Service" -sidebar_position: 1 -description: "Learn how to scale your pods, add Pod Anti-Affinity configurations, and use a helper script to visualize pod distribution." ---- - -This guide outlines steps to enhance the resilience of a UI service by implementing high availability practices. We'll cover scaling the UI service, implementing pod anti-affinity, and using a helper script to visualize pod distribution across availability zones. - -## Scaling and Pod Anti-Affinity - -We use a Kustomize patch to modify the UI deployment, scaling it to 5 replicas and adding pod anti-affinity rules. This ensures UI pods are distributed across different nodes, reducing the impact of node failures. 
- -Here's the content of our patch file: - -```file -manifests/modules/resiliency/high-availability/config/scale_and_affinity_patch.yaml -``` - -Apply the changes using Kustomize patch and -[Kustomization file](https://github.com/VAR::MANIFESTS_OWNER/VAR::MANIFESTS_REPOSITORY/tree/VAR::MANIFESTS_REF/manifests/modules/resiliency/high-availability/config/kustomization.yaml): - -```bash -$ kubectl delete deployment ui -n ui -$ kubectl apply -k /manifests/modules/resiliency/high-availability/config/ -``` - -## Verify Retail Store Accessibility - -After applying these changes, it's important to verify that your retail store is accessible: - -```bash -$ wait-for-lb $(kubectl get ingress -n ui -o jsonpath='{.items[0].status.loadBalancer.ingress[0].hostname}') -``` - -Once this command completes, it will output a URL. Open this URL in a new browser tab to verify that your retail store is accessible and functioning correctly. - -:::tip -If the retail store doesn't load immediately, wait a few moments and refresh the page. It may take a short time for all components to become fully operational. -::: - -## Helper Script: Get Pods by AZ - -The `get-pods-by-az.sh` script helps visualize the distribution of Kubernetes pods across different availability zones in the terminal. You can view the script file on github [here](https://github.com/VAR::MANIFESTS_OWNER/VAR::MANIFESTS_REPOSITORY/tree/VAR::MANIFESTS_REF/manifests/modules/resiliency/scripts/get-pods-by-az.sh). - -### Script Execution - -To run the script and see the distribution of pods across availability zones, execute: - -```bash -$ $SCRIPT_DIR/get-pods-by-az.sh -``` - -:::tip -Use this to quickly assess the distribution of your pods across multiple zones. -::: - -:::info -For more information on these changes, check out these sections: - -- [Pod Affinity and Anti-Affinity](/docs/fundamentals/managed-node-groups/basics/affinity/) - ::: diff --git a/website/docs/resiliency/high-availability/06-az-setup.md b/website/docs/resiliency/high-availability/06-az-setup.md deleted file mode 100644 index 4c7d2eeb9..000000000 --- a/website/docs/resiliency/high-availability/06-az-setup.md +++ /dev/null @@ -1,123 +0,0 @@ ---- -title: "AZ Failure Experiment Setup" -sidebar_position: 6 -description: "Scale your application to two Availability Zones and prepare for an AZ failure simulation experiment." ---- - -This guide outlines steps to enhance the resilience of your UI service by scaling it across two Availability Zones (AZs) and preparing for an AZ failure simulation experiment. - -## Scaling to Two AZs - -We'll use a Kustomize patch to modify the UI deployment, adding a second AZ and adjusting the number of replicas. We'll scale to 4 replicas in the new AZ while maintaining 5 replicas in the first AZ. - -First we need to make ann EKS Cluster in `us-east-2`. 
Run this to create a second AZ: - -```bash timeout=300 wait=30 -$ $SCRIPT_DIR/multi-az-get-pods.sh -$ aws configure set default.region $SECONDARY_REGION -$ prepare-environment resiliency -$ aws configure set default.region $PRIMARY_REGION -$ $SCRIPT_DIR/multi-az-get-pods.sh -``` - -Now we need to Kustomize our content with a patch file: - -```file -manifests/modules/resiliency/high-availability/multi_az/add_us_east_2_patch.yaml -``` - -Apply the changes using Kustomize patch and -[Kustomization file](https://github.com/VAR::MANIFESTS_OWNER/VAR::MANIFESTS_REPOSITORY/tree/VAR::MANIFESTS_REF/manifests/modules/resiliency/high-availability/multi_az/kustomization.yaml): - -```bash -$ kubectl delete deployment ui -n ui -$ kubectl apply -k /manifests/modules/resiliency/high-availability/multi_az/ -``` - -## Verify Retail Store Accessibility - -After applying these changes, it's important to verify that your retail store is accessible: - -```bash -$ wait-for-lb $(kubectl get ingress -n ui -o jsonpath='{.items[0].status.loadBalancer.ingress[0].hostname}') -``` - -:::tip -The retail url may take 10 minutes to become operational. -::: - -## Check Pod Distribution - -To run the script and see the distribution of pods across availability zones, execute: - -```bash -$ $SCRIPT_DIR/multi-az-get-pods.sh -``` - -## AZ Failure Experiment Preparation - -### Overview - -This experiment will simulate an Availability Zone (AZ) failure, demonstrating how resilient your application is when faced with significant infrastructure disruptions. We'll use AWS Fault Injection Simulator (FIS) and additional AWS services to test how well your system maintains functionality when an entire AZ becomes unavailable. - -### Setting up a Synthetic Canary - -Before starting the experiment, set up a synthetic canary for heartbeat monitoring: - -1. First, create an S3 bucket for the canary artifacts: - -```bash -$ BUCKET_NAME="eks-workshop-canary-artifacts-$(date +%s)" -$ aws s3 mb s3://$BUCKET_NAME --region us-west-2 -``` - -2. Create the blueprint: - -```file -manifests/modules/resiliency/scripts/eks_workshop_canary_script.js -``` - -Place this canary script into the bucket: - -```bash -$ aws s3 cp /manifests/modules/resiliency/scripts/eks_workshop_canary_script.zip s3://$BUCKET_NAME/canary-scripts/eks_workshop_canary_script.zip -``` - -3. Create a synthetic canary: - -```bash -$ INGRESS_URL=$(kubectl get ingress -n ui -o jsonpath='{.items[0].status.loadBalancer.ingress[0].hostname}') -$ aws synthetics create-canary \ - --name eks-workshop-canary \ - --artifact-s3-location "s3://$BUCKET_NAME/canary-artifacts/" \ - --execution-role-arn $CANARY_ROLE_ARN \ - --runtime-version syn-nodejs-puppeteer-6.2 \ - --schedule Expression="rate(1 minute)" \ - --code S3Bucket=$BUCKET_NAME,S3Key=canary-scripts/eks_workshop_canary_script.zip,Handler="exports.handler" \ - --run-config "EnvironmentVariables={INGRESS_URL=http://$INGRESS_URL}" \ - --region us-west-2 -$ sleep 30 -$ aws synthetics start-canary --name eks-workshop-canary --region us-west-2 -``` - -4. 
Create a CloudWatch alarm for the canary: - -```bash -$ aws cloudwatch put-metric-alarm \ - --alarm-name "eks-workshop-canary-alarm" \ - --metric-name SuccessPercent \ - --namespace CloudWatchSynthetics \ - --statistic Average \ - --period 60 \ - --threshold 95 \ - --comparison-operator LessThanThreshold \ - --dimensions Name=CanaryName,Value=eks-workshop-canary \ - --evaluation-periods 1 \ - --alarm-description "Alarm when Canary success rate drops below 95%" \ - --unit Percent \ - --region us-west-2 -``` - -This sets up a canary that checks the health of your application every minute and a CloudWatch alarm that triggers if the success percentage falls below 95%. - -With these steps completed, your application is now scaled across two AZs and you've set up the necessary monitoring for the upcoming AZ failure simulation experiment. diff --git a/website/docusaurus.config.js b/website/docusaurus.config.js index d31d3f620..9b2339322 100644 --- a/website/docusaurus.config.js +++ b/website/docusaurus.config.js @@ -133,12 +133,6 @@ const config = { position: "left", label: "Observability", }, - { - type: "doc", - docId: "resiliency/index", - position: "left", - label: "Resiliency", - }, { type: "doc", docId: "security/index", diff --git a/website/sidebars.js b/website/sidebars.js index adf89ee4a..7da64994c 100644 --- a/website/sidebars.js +++ b/website/sidebars.js @@ -20,7 +20,6 @@ const sidebars = { networking: [{ type: "autogenerated", dirName: "networking" }], autoscaling: [{ type: "autogenerated", dirName: "autoscaling" }], observability: [{ type: "autogenerated", dirName: "observability" }], - resiliency: [{ type: "autogenerated", dirName: "resiliency" }], automation: [{ type: "autogenerated", dirName: "automation" }], aiml: [{ type: "autogenerated", dirName: "aiml" }], }; From c7564606759c64a08a648358a57a3cf8b0346eb1 Mon Sep 17 00:00:00 2001 From: cyturney Date: Fri, 23 Aug 2024 14:42:12 -0700 Subject: [PATCH 04/11] some updates based on PR input --- cluster/eksctl/cluster.yaml | 2 +- .../config/scale_and_affinity_patch.yaml | 13 ++++ .../resiliency/scripts/get-pods-by-az.sh | 50 +++++++++---- .../resiliency/scripts/pod-failure.sh | 4 +- .../resiliency/scripts/testing.sh | 0 .../resiliency/scripts/verify-cluster.sh | 6 +- .../high-availability/00-setup.md | 51 ------------- .../high-availability/01-scale.md | 18 +++-- .../high-availability/02-pod-failure.md | 71 +++++++++++-------- .../03-node-failure-no-fis.md | 47 ++++++------ .../04-node-failure-partial-fis.md | 37 ++++------ .../05-node-failure-complete-fis.md | 33 +++------ .../high-availability/06-az-setup.md | 9 ++- .../high-availability/07-az-failure.md | 44 ++++-------- .../observability/high-availability/index.md | 52 +++++++++++--- 15 files changed, 219 insertions(+), 218 deletions(-) mode change 100644 => 100755 manifests/modules/observability/resiliency/scripts/testing.sh delete mode 100644 website/docs/observability/high-availability/00-setup.md diff --git a/cluster/eksctl/cluster.yaml b/cluster/eksctl/cluster.yaml index d0f2cae4e..a22a4a127 100644 --- a/cluster/eksctl/cluster.yaml +++ b/cluster/eksctl/cluster.yaml @@ -32,7 +32,7 @@ managedNodeGroups: instanceType: m5.large privateNetworking: true # had to remove use make create - releaseVersion: "1.30.0-20240625" + #releaseVersion: "1.30.0-20240625" updateConfig: maxUnavailablePercentage: 50 labels: diff --git a/manifests/modules/observability/resiliency/high-availability/config/scale_and_affinity_patch.yaml 
b/manifests/modules/observability/resiliency/high-availability/config/scale_and_affinity_patch.yaml index c84b9a056..3637434f5 100644 --- a/manifests/modules/observability/resiliency/high-availability/config/scale_and_affinity_patch.yaml +++ b/manifests/modules/observability/resiliency/high-availability/config/scale_and_affinity_patch.yaml @@ -25,3 +25,16 @@ spec: values: - ui topologyKey: "kubernetes.io/hostname" + topologySpreadConstraints: + - maxSkew: 1 + topologyKey: topology.kubernetes.io/zone + whenUnsatisfiable: ScheduleAnyway + labelSelector: + matchLabels: + app: ui + - maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + labelSelector: + matchLabels: + app: ui diff --git a/manifests/modules/observability/resiliency/scripts/get-pods-by-az.sh b/manifests/modules/observability/resiliency/scripts/get-pods-by-az.sh index 8063f1094..3306c9b0f 100755 --- a/manifests/modules/observability/resiliency/scripts/get-pods-by-az.sh +++ b/manifests/modules/observability/resiliency/scripts/get-pods-by-az.sh @@ -1,6 +1,6 @@ #!/bin/bash -# Credit to "Disaster recovery, high availability, and resiliency on Amazon EKS" +# Modified from "Disaster recovery, high availability, and resiliency on Amazon EKS" # https://catalog.us-east-1.prod.workshops.aws/workshops/6140457f-53b2-48b8-a007-2d4be06ba2fc GREEN='\033[0;32m' @@ -10,16 +10,40 @@ NC='\033[0m' # No Color CURRENT_CONTEXT=$(kubectl config current-context) REGION=$(kubectl config view -o jsonpath="{.contexts[?(@.name == \"$CURRENT_CONTEXT\")].context.cluster}" | cut -d : -f 4) -for az in a b c -do - AZ=$REGION$az - echo -n "------" - echo -n -e "${GREEN}$AZ${NC}" - echo "------" - for node in $(kubectl get nodes -l topology.kubernetes.io/zone=$AZ --no-headers | grep -v NotReady | cut -d " " -f1) - do - echo -e " ${RED}$node:${NC}" - kubectl get pods -n ui --no-headers --field-selector spec.nodeName=${node} 2>&1 | while read line; do echo " ${line}"; done - done - echo "" +# Function to clear the screen and move cursor to top-left +clear_screen() { + echo -e "\033[2J\033[H" +} + +# Function to generate the output +generate_output() { + for az in a b c + do + AZ=$REGION$az + echo -n "------" + echo -n -e "${GREEN}$AZ${NC}" + echo "------" + for node in $(kubectl get nodes -l topology.kubernetes.io/zone=$AZ --no-headers | grep -v NotReady | cut -d " " -f1) + do + echo -e " ${RED}$node:${NC}" + kubectl get pods -n ui --no-headers --field-selector spec.nodeName=${node} 2>&1 | while read line; do echo " ${line}"; done + done + echo "" + done +} + +# Initial clear screen +clear_screen + +# Main loop +while true; do + # Generate output to a temporary file + generate_output > temp_output.txt + + # Clear screen and display the new output + clear_screen + cat temp_output.txt + + # Wait before next update + sleep 1 done \ No newline at end of file diff --git a/manifests/modules/observability/resiliency/scripts/pod-failure.sh b/manifests/modules/observability/resiliency/scripts/pod-failure.sh index 3ed7df813..fd7ea7b49 100755 --- a/manifests/modules/observability/resiliency/scripts/pod-failure.sh +++ b/manifests/modules/observability/resiliency/scripts/pod-failure.sh @@ -5,7 +5,7 @@ unique_id=$(date +%s) # Create a YAML configuration for the PodChaos resource -cat < pod-failure.yaml +kubectl apply -f - < +ui-6dfb84cf67-6d5lq 1/1 Running 0 46s 10.42.121.36 ip-10-42-119-94.us-west-2.compute.internal +ui-6dfb84cf67-hqccq 1/1 Running 0 46s 10.42.154.216 ip-10-42-146-130.us-west-2.compute.internal +ui-6dfb84cf67-qqltz 1/1 Running 0 
46s 10.42.185.149 ip-10-42-176-213.us-west-2.compute.internal +ui-6dfb84cf67-rzbvl 1/1 Running 0 46s 10.42.188.96 ip-10-42-176-213.us-west-2.compute.internal ``` -This command does the following: +Note that all pods have similar start times (shown in the AGE column). + +### Step 2: Simulate Pod Failure + +Now, let's simulate a pod failure: + +```bash +$ $SCRIPT_DIR/pod-failure.sh +``` -1. Initiates the pod failure simulation using the `pod-failure.sh` script -2. Monitors the pod distribution across Availability Zones (AZs) for 30 seconds -3. Updates the display every second to show real-time changes +This script will use Chaos Mesh to terminate one of the pods. -During the experiment, you should observe one pod disappearing and then reappearing, demonstrating the system's ability to detect and recover from failures. +### Step 3: Observe Recovery -To get a more detailed view of the pods in the `ui` namespace, use the following command: +Wait for a couple of seconds to allow Kubernetes to detect the failure and initiate recovery. Then, check the pod status again: -```bash wait=15 +```bash timeout=5 $ kubectl get pods -n ui -o wide +``` + +You should now see output similar to this: + +``` NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES -ui-6dfb84cf67-2pxnp 1/1 Running 0 2m56s 10.42.154.151 ip-10-42-153-179.us-west-2.compute.internal -ui-6dfb84cf67-dsp55 1/1 Running 0 2m56s 10.42.126.161 ip-10-42-127-82.us-west-2.compute.internal -ui-6dfb84cf67-gzd9s 1/1 Running 0 71s 10.42.126.246 ip-10-42-127-82.us-west-2.compute.internal -ui-6dfb84cf67-n8x4f 1/1 Running 0 2m56s 10.42.190.250 ip-10-42-186-246.us-west-2.compute.internal -ui-6dfb84cf67-wljth 1/1 Running 0 2m56s 10.42.190.249 ip-10-42-186-246.us-west-2.compute.internal +ui-6dfb84cf67-44hc9 1/1 Running 0 2m57s 10.42.121.37 ip-10-42-119-94.us-west-2.compute.internal +ui-6dfb84cf67-6d5lq 1/1 Running 0 2m57s 10.42.121.36 ip-10-42-119-94.us-west-2.compute.internal +ui-6dfb84cf67-ghp5z 1/1 Running 0 6s 10.42.185.150 ip-10-42-176-213.us-west-2.compute.internal +ui-6dfb84cf67-hqccq 1/1 Running 0 2m57s 10.42.154.216 ip-10-42-146-130.us-west-2.compute.internal +ui-6dfb84cf67-rzbvl 1/1 Running 0 2m57s 10.42.188.96 ip-10-42-176-213.us-west-2.compute.internal +[ec2-user@bc44085aafa9 environment]$ ``` +Notice that one of the pods (in this example, `ui-6dfb84cf67-ghp5z`) has a much lower AGE value. This is the pod that Kubernetes automatically created to replace the one that was terminated by our simulation. + This will show you the status, IP addresses, and nodes for each pod in the `ui` namespace. ## Verify Retail Store Availability An essential aspect of this experiment is to ensure that your retail store application remains operational throughout the pod failure and recovery process. To verify the availability of the retail store, use the following command to fetch and access the store's URL: -```bash timeout=600 wait=30 +```bash timeout=900 wait=30 $ wait-for-lb $(kubectl get ingress -n ui -o jsonpath='{.items[0].status.loadBalancer.ingress[0].hostname}') + Waiting for k8s-ui-ui-5ddc3ba496-721427594.us-west-2.elb.amazonaws.com... 
You can now access http://k8s-ui-ui-5ddc3ba496-721427594.us-west-2.elb.amazonaws.com ``` diff --git a/website/docs/observability/high-availability/03-node-failure-no-fis.md b/website/docs/observability/high-availability/03-node-failure-no-fis.md index ac487042c..817c8f75c 100644 --- a/website/docs/observability/high-availability/03-node-failure-no-fis.md +++ b/website/docs/observability/high-availability/03-node-failure-no-fis.md @@ -1,11 +1,9 @@ --- title: "Simulating Node Failure without FIS" -sidebar_position: 4 +sidebar_position: 3 description: "Manually simulate a node failure in your Kubernetes environment to test the resilience of your applications without using AWS FIS." --- -# Simulating Node Failure without FIS - ## Overview This experiment simulates a node failure manually in your Kubernetes cluster to understand the impact on your deployed applications, particularly focusing on the retail store application's availability. By deliberately causing a node to fail, we can observe how Kubernetes handles the failure and maintains the overall health of the cluster. @@ -22,8 +20,9 @@ It's important to note that this experiment is repeatable, allowing you to run i To simulate the node failure and monitor its effects, run the following command: -```bash timeout=180 wait=30 -$ $SCRIPT_DIR/node-failure.sh && SECONDS=0; while [ $SECONDS -lt 120 ]; do clear; $SCRIPT_DIR/get-pods-by-az.sh; sleep 1; done +```bash timeout=240 wait=30 +$ $SCRIPT_DIR/node-failure.sh && timeout 180s $SCRIPT_DIR/get-pods-by-az.sh + ------us-west-2a------ ip-10-42-127-82.us-west-2.compute.internal: ui-6dfb84cf67-dsp55 1/1 Running 0 10m @@ -54,42 +53,36 @@ Throughout this process, the total number of running pods should remain constant While waiting for the node to finish coming back online, we will verify the cluster's self-healing capabilities and potentially rebalance the pod distribution if necessary. Since the cluster often recovers on its own, we'll focus on checking the current state and ensuring an optimal distribution of pods. -Use the following [script](https://github.com/VAR::MANIFESTS_OWNER/VAR::MANIFESTS_REPOSITORY/tree/VAR::MANIFESTS_REF/manifests/modules/observability/resiliency/scripts/verify-cluster.sh) to verify the cluster state and rebalance pods: +First let's ensure all nodes are in the `Ready` state: ```bash timeout=300 wait=30 -$ $SCRIPT_DIR/verify-cluster.sh - -==== Final Pod Distribution ==== - -------us-west-2a------ - ip-10-42-127-82.us-west-2.compute.internal: - ui-6dfb84cf67-vwk4x 1/1 Running 0 25s - -------us-west-2b------ - ip-10-42-133-195.us-west-2.compute.internal: - ui-6dfb84cf67-2rb6s 1/1 Running 0 27s - ui-6dfb84cf67-dk495 1/1 Running 0 27s +$ EXPECTED_NODES=3 && while true; do ready_nodes=$(kubectl get nodes --no-headers | grep " Ready" | wc -l); if [ "$ready_nodes" -eq "$EXPECTED_NODES" ]; then echo "All $EXPECTED_NODES expected nodes are ready."; echo "Listing the ready nodes:"; kubectl get nodes | grep " Ready"; break; else echo "Waiting for all $EXPECTED_NODES nodes to be ready... (Currently $ready_nodes are ready)"; sleep 10; fi; done +``` -------us-west-2c------ - ip-10-42-186-246.us-west-2.compute.internal: - ui-6dfb84cf67-7bftc 1/1 Running 0 29s - ui-6dfb84cf67-nqgdn 1/1 Running 0 29s +This command counts the total number of nodes in the `Ready` state and continuously checks until all 3 active nodes are ready. 
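For readers who find the one-liner hard to follow, the same wait loop expanded over multiple lines looks like this (functionally equivalent; nothing new is assumed beyond the `EXPECTED_NODES` count used above):

```bash test=false
$ EXPECTED_NODES=3
$ while true; do
    # Count nodes currently reporting Ready
    ready_nodes=$(kubectl get nodes --no-headers | grep " Ready" | wc -l)
    if [ "$ready_nodes" -eq "$EXPECTED_NODES" ]; then
      echo "All $EXPECTED_NODES expected nodes are ready."
      kubectl get nodes | grep " Ready"
      break
    fi
    echo "Waiting for all $EXPECTED_NODES nodes to be ready... (Currently $ready_nodes are ready)"
    sleep 10
  done
```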
+Once all nodes are ready, we'll redeploy the pods to ensure they are balanced across the nodes: +```bash timeout=60 wait=30 +$ kubectl delete deployment ui -n ui +$ kubectl apply -k /manifests/modules/observability/resiliency/high-availability/config/ +$ sleep 30 +$ timeout 5s $SCRIPT_DIR/get-pods-by-az.sh | head -n 30 ``` -This script will: +These commands perform the following actions: -- Wait for nodes to come back online -- Count the number of nodes and ui pods -- Check if the pods are evenly distributed across the nodes +1. Delete the existing ui deployment. +2. Reapply the configuration to create a new deployment. +3. Use the `get-pods-by-az.sh` script to check the distribution of pods across availability zones. ## Verify Retail Store Availability After simulating the node failure, we can verify that the retail store application remains accessible. Use the following command to check its availability: -```bash timeout=600 wait=30 +```bash timeout=900 wait=30 $ wait-for-lb $(kubectl get ingress -n ui -o jsonpath='{.items[0].status.loadBalancer.ingress[0].hostname}') + Waiting for k8s-ui-ui-5ddc3ba496-721427594.us-west-2.elb.amazonaws.com... You can now access http://k8s-ui-ui-5ddc3ba496-721427594.us-west-2.elb.amazonaws.com ``` diff --git a/website/docs/observability/high-availability/04-node-failure-partial-fis.md b/website/docs/observability/high-availability/04-node-failure-partial-fis.md index 4ca8d6c4b..7ca211192 100644 --- a/website/docs/observability/high-availability/04-node-failure-partial-fis.md +++ b/website/docs/observability/high-availability/04-node-failure-partial-fis.md @@ -1,11 +1,9 @@ --- title: "Simulating Partial Node Failure with FIS" -sidebar_position: 5 +sidebar_position: 4 description: "Simulate a partial node failures in your Kubernetes environment using AWS Fault Injection Simulator to test application resiliency." --- -# Simulating Partial Node Failure with FIS - ## AWS Fault Injection Simulator (FIS) Overview AWS Fault Injection Simulator (FIS) is a fully managed service that enables you to perform controlled fault injection experiments on your AWS workloads. 
FIS allows you to simulate various failure scenarios, which is crucial for: @@ -30,7 +28,8 @@ For more information on AWS FIS, check out: - [What is AWS Fault Injection Service?](https://docs.aws.amazon.com/fis/latest/userguide/what-is.html) - [AWS Fault Injection Simulator Console](https://console.aws.amazon.com/fis/home) - [AWS Systems Manager, Automation](https://console.aws.amazon.com/systems-manager/automation/executions) - ::: + +::: ## Experiment Details @@ -57,7 +56,8 @@ $ NODE_EXP_ID=$(aws fis create-experiment-template --cli-input-json '{"descripti Execute the FIS experiment to simulate the node failure and monitor the response: ```bash timeout=240 wait=30 -$ aws fis start-experiment --experiment-template-id $NODE_EXP_ID --output json && SECONDS=0; while [ $SECONDS -lt 180 ]; do clear; $SCRIPT_DIR/get-pods-by-az.sh; sleep 1; done +$ aws fis start-experiment --experiment-template-id $NODE_EXP_ID --output json && $SCRIPT_DIR/node-failure.sh && timeout 180s $SCRIPT_DIR/get-pods-by-az.sh + ------us-west-2a------ ip-10-42-127-82.us-west-2.compute.internal: ui-6dfb84cf67-s6kw4 1/1 Running 0 2m16s @@ -70,6 +70,7 @@ $ aws fis start-experiment --experiment-template-id $NODE_EXP_ID --output json & ui-6dfb84cf67-29xtf 1/1 Running 0 79s ui-6dfb84cf67-68hbw 1/1 Running 0 79s ui-6dfb84cf67-plv9f 1/1 Running 0 79s + ``` This command triggers the node failure and monitors the pods for 3 minutes, allowing you to observe how the cluster responds to losing a significant portion of its capacity. @@ -85,23 +86,12 @@ Your retail url should stay operational unlike the node failure without FIS. :::note To verify nodes and rebalance pods, you can run: -```bash timeout=240 wait=30 -$ $SCRIPT_DIR/verify-cluster.sh -==== Final Pod Distribution ==== - -------us-west-2a------ - ip-10-42-127-82.us-west-2.compute.internal: - ui-6dfb84cf67-v2xj6 1/1 Running 0 14s - -------us-west-2b------ - ip-10-42-148-187.us-west-2.compute.internal: - ui-6dfb84cf67-4xq4n 1/1 Running 0 16s - ui-6dfb84cf67-56d6d 1/1 Running 0 16s - -------us-west-2c------ - ip-10-42-180-16.us-west-2.compute.internal: - ui-6dfb84cf67-86mpz 1/1 Running 0 18s - ui-6dfb84cf67-nhx4j 1/1 Running 0 18s +```bash timeout=300 wait=30 +$ EXPECTED_NODES=3 && while true; do ready_nodes=$(kubectl get nodes --no-headers | grep " Ready" | wc -l); if [ "$ready_nodes" -eq "$EXPECTED_NODES" ]; then echo "All $EXPECTED_NODES expected nodes are ready."; echo "Listing the ready nodes:"; kubectl get nodes | grep " Ready"; break; else echo "Waiting for all $EXPECTED_NODES nodes to be ready... (Currently $ready_nodes are ready)"; sleep 10; fi; done +$ kubectl delete deployment ui -n ui +$ kubectl apply -k /manifests/modules/observability/resiliency/high-availability/config/ +$ sleep 30 +$ timeout 5s $SCRIPT_DIR/get-pods-by-az.sh | head -n 30 ``` ::: @@ -110,8 +100,9 @@ $ $SCRIPT_DIR/verify-cluster.sh Ensure that your retail store application remains operational throughout the partial node failure. Use the following command to check its availability: -```bash timeout=600 wait=30 +```bash timeout=900 wait=30 $ wait-for-lb $(kubectl get ingress -n ui -o jsonpath='{.items[0].status.loadBalancer.ingress[0].hostname}') + Waiting for k8s-ui-ui-5ddc3ba496-721427594.us-west-2.elb.amazonaws.com... 
You can now access http://k8s-ui-ui-5ddc3ba496-721427594.us-west-2.elb.amazonaws.com ``` diff --git a/website/docs/observability/high-availability/05-node-failure-complete-fis.md b/website/docs/observability/high-availability/05-node-failure-complete-fis.md index 722341fd0..4bc755886 100644 --- a/website/docs/observability/high-availability/05-node-failure-complete-fis.md +++ b/website/docs/observability/high-availability/05-node-failure-complete-fis.md @@ -1,11 +1,9 @@ --- title: "Simulating Complete Node Failure with FIS" -sidebar_position: 6 +sidebar_position: 5 description: "Demonstrates the impact of a complete node failure on a Kubernetes environment using AWS Fault Injection Simulator." --- -# Simulating Complete Node Failure with FIS - ## Overview This experiment extends our previous partial node failure test to simulate a complete failure of all nodes in our EKS cluster. This is essentially a cluster failure. It demonstrates how AWS Fault Injection Simulator (FIS) can be used to test extreme scenarios and validate your system's resilience under catastrophic conditions. @@ -31,7 +29,8 @@ $ FULL_NODE_EXP_ID=$(aws fis create-experiment-template --cli-input-json '{"desc Execute the FIS experiment and monitor the cluster's response: ```bash timeout=420 wait=30 -$ aws fis start-experiment --experiment-template-id $FULL_NODE_EXP_ID --output json && SECONDS=0; while [ $SECONDS -lt 360 ]; do clear; $SCRIPT_DIR/get-pods-by-az.sh; sleep 1; done +$ aws fis start-experiment --experiment-template-id $FULL_NODE_EXP_ID --output json && $SCRIPT_DIR/node-failure.sh && timeout 360s $SCRIPT_DIR/get-pods-by-az.sh + ------us-west-2a------ ip-10-42-106-250.us-west-2.compute.internal: No resources found in ui namespace. @@ -60,23 +59,12 @@ Due to the severity of the experiment, the retail store url will not stay operat :::note To verify nodes and rebalance pods, you can run: -```bash timeout=240 wait=30 -$ $SCRIPT_DIR/verify-cluster.sh -==== Final Pod Distribution ==== - -------us-west-2a------ - ip-10-42-106-250.us-west-2.compute.internal: - ui-6dfb84cf67-4fjhh 1/1 Running 0 15s - ui-6dfb84cf67-gkrtn 1/1 Running 0 14s - -------us-west-2b------ - ip-10-42-141-133.us-west-2.compute.internal: - ui-6dfb84cf67-7qnkz 1/1 Running 0 16s - ui-6dfb84cf67-n58b9 1/1 Running 0 16s - -------us-west-2c------ - ip-10-42-179-59.us-west-2.compute.internal: - ui-6dfb84cf67-lvdc2 1/1 Running 0 18s +```bash timeout=300 wait=30 +$ EXPECTED_NODES=3 && while true; do ready_nodes=$(kubectl get nodes --no-headers | grep " Ready" | wc -l); if [ "$ready_nodes" -eq "$EXPECTED_NODES" ]; then echo "All $EXPECTED_NODES expected nodes are ready."; echo "Listing the ready nodes:"; kubectl get nodes | grep " Ready"; break; else echo "Waiting for all $EXPECTED_NODES nodes to be ready... (Currently $ready_nodes are ready)"; sleep 10; fi; done +$ kubectl delete deployment ui -n ui +$ kubectl apply -k /manifests/modules/observability/resiliency/high-availability/config/ +$ sleep 30 +$ timeout 5s $SCRIPT_DIR/get-pods-by-az.sh | head -n 30 ``` ::: @@ -85,8 +73,9 @@ $ $SCRIPT_DIR/verify-cluster.sh Check the retail store application's recovery: -```bash timeout=600 wait=30 +```bash timeout=900 wait=30 $ wait-for-lb $(kubectl get ingress -n ui -o jsonpath='{.items[0].status.loadBalancer.ingress[0].hostname}') + Waiting for k8s-ui-ui-5ddc3ba496-721427594.us-west-2.elb.amazonaws.com... 
You can now access http://k8s-ui-ui-5ddc3ba496-721427594.us-west-2.elb.amazonaws.com ``` diff --git a/website/docs/observability/high-availability/06-az-setup.md b/website/docs/observability/high-availability/06-az-setup.md index 04a3bbb83..21b8c83a7 100644 --- a/website/docs/observability/high-availability/06-az-setup.md +++ b/website/docs/observability/high-availability/06-az-setup.md @@ -1,6 +1,6 @@ --- title: "AZ Failure Experiment Setup" -sidebar_position: 7 +sidebar_position: 6 description: "Scale your application to two instances and prepare for an AZ failure simulation experiment." --- @@ -17,7 +17,8 @@ $ aws autoscaling update-auto-scaling-group \ --max-size 6 $ sleep 60 $ kubectl scale deployment ui --replicas=9 -n ui -$ $SCRIPT_DIR/get-pods-by-az.sh +$ timeout 5s $SCRIPT_DIR/get-pods-by-az.sh | head -n 30 + ------us-west-2a------ ip-10-42-100-4.us-west-2.compute.internal: ui-6dfb84cf67-xbbj4 0/1 ContainerCreating 0 1s @@ -49,6 +50,7 @@ Before starting the experiment, set up a synthetic canary for heartbeat monitori ```bash wait=15 $ export BUCKET_NAME="eks-workshop-canary-artifacts-$(date +%s)" $ aws s3 mb s3://$BUCKET_NAME --region $AWS_REGION + make_bucket: eks-workshop-canary-artifacts-1724131402 ``` @@ -62,6 +64,7 @@ Place this canary blueprint into the bucket: ```bash wait=15 $ $SCRIPT_DIR/create-blueprint.sh + upload: ./canary.zip to s3://eks-workshop-canary-artifacts-1724131402/canary-scripts/canary.zip Canary script has been zipped and uploaded to s3://eks-workshop-canary-artifacts-1724131402/canary-scripts/canary.zip The script is configured to check the URL: http://k8s-ui-ui-5ddc3ba496-721427594.us-west-2.elb.amazonaws.com @@ -78,7 +81,7 @@ $ aws synthetics create-canary \ --schedule "Expression=rate(1 minute)" \ --code "Handler=canary.handler,S3Bucket=$BUCKET_NAME,S3Key=canary-scripts/canary.zip" \ --region $AWS_REGION -$ sleep 45 +$ aws synthetics wait canary-ready --name eks-workshop-canary --region $AWS_REGION $ aws synthetics start-canary --name eks-workshop-canary --region $AWS_REGION $ aws cloudwatch put-metric-alarm \ --alarm-name "eks-workshop-canary-alarm" \ diff --git a/website/docs/observability/high-availability/07-az-failure.md b/website/docs/observability/high-availability/07-az-failure.md index 97d1043b3..94c6274c1 100644 --- a/website/docs/observability/high-availability/07-az-failure.md +++ b/website/docs/observability/high-availability/07-az-failure.md @@ -1,11 +1,9 @@ --- title: "Simulating AZ Failure" -sidebar_position: 8 +sidebar_position: 7 description: "This experiment simulates an Availability Zone failure to test the resilience of your Kubernetes environment hosted on AWS EKS." --- -# Simulating AZ Failure - ## Overview This repeatable experiment simulates an Availability Zone (AZ) failure, demonstrating the resilience of your application when faced with significant infrastructure disruptions. By leveraging AWS Fault Injection Simulator (FIS) and additional AWS services, we'll test how well your system maintains functionality when an entire AZ becomes unavailable. 
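Before starting the experiment, it can help to capture a baseline of how nodes and `ui` pods are currently spread across Availability Zones, so you have something to compare the experiment output against. A minimal sketch (assuming the `ui` workload from the earlier steps is still deployed; `topology.kubernetes.io/zone` is the standard zone label on EKS nodes):

```bash
# Show each worker node together with its Availability Zone label
$ kubectl get nodes -L topology.kubernetes.io/zone

# Show which node (and therefore which AZ) each ui pod is scheduled on
$ kubectl get pods -n ui -o wide
```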
@@ -23,7 +21,8 @@ $ ZONE_EXP_ID=$(aws fis create-experiment-template --cli-input-json '{"descripti Execute the FIS experiment to simulate the AZ failure: ```bash timeout=560 wait=30 -$ aws fis start-experiment --experiment-template-id $ZONE_EXP_ID --output json && SECONDS=0; while [ $SECONDS -lt 480 ]; do clear; $SCRIPT_DIR/get-pods-by-az.sh; sleep 1; done +$ aws fis start-experiment --experiment-template-id $ZONE_EXP_ID --output json && $SCRIPT_DIR/node-failure.sh && timeout 480s $SCRIPT_DIR/get-pods-by-az.sh + ------us-west-2a------ ip-10-42-100-4.us-west-2.compute.internal: ui-6dfb84cf67-h57sp 1/1 Running 0 12m @@ -60,32 +59,14 @@ During the experiment, you should observe the following sequence of events: During this time, the retail url will stay available showimg how resilient EKS is to AZ failures. :::note -To verify clusters and rebalance pods, you can run: - -```bash timeout=240 wait=30 -$ $SCRIPT_DIR/AZ-verify-clusters.sh -==== Final Pod Distribution ==== - -------us-west-2a------ - ip-10-42-100-4.us-west-2.compute.internal: - ui-6dfb84cf67-lwd86 1/1 Running 0 16s - ip-10-42-111-144.us-west-2.compute.internal: - ui-6dfb84cf67-hfrcf 1/1 Running 0 17s - ui-6dfb84cf67-qdr4s 1/1 Running 0 17s - -------us-west-2b------ - ip-10-42-141-243.us-west-2.compute.internal: - ui-6dfb84cf67-dxtg4 1/1 Running 0 19s - ip-10-42-150-255.us-west-2.compute.internal: - ui-6dfb84cf67-jvvg6 1/1 Running 0 20s - ui-6dfb84cf67-tmbzc 1/1 Running 0 20s - -------us-west-2c------ - ip-10-42-164-250.us-west-2.compute.internal: - ui-6dfb84cf67-k5mn8 1/1 Running 0 23s - ui-6dfb84cf67-zbm8j 1/1 Running 0 23s - ip-10-42-178-108.us-west-2.compute.internal: - ui-6dfb84cf67-svwqp 1/1 Running 0 24s +To verify nodes and rebalance pods, you can run: + +```bash timeout=300 wait=30 +$ EXPECTED_NODES=6 && while true; do ready_nodes=$(kubectl get nodes --no-headers | grep " Ready" | wc -l); if [ "$ready_nodes" -eq "$EXPECTED_NODES" ]; then echo "All $EXPECTED_NODES expected nodes are ready."; echo "Listing the ready nodes:"; kubectl get nodes | grep " Ready"; break; else echo "Waiting for all $EXPECTED_NODES nodes to be ready... (Currently $ready_nodes are ready)"; sleep 10; fi; done +$ kubectl delete deployment ui -n ui +$ kubectl apply -k /manifests/modules/observability/resiliency/high-availability/config/ +$ sleep 30 +$ timeout 5s $SCRIPT_DIR/get-pods-by-az.sh | head -n 30 ``` ::: @@ -94,8 +75,9 @@ $ $SCRIPT_DIR/AZ-verify-clusters.sh After the experiment, verify that your application remains operational despite the simulated AZ failure: -```bash timeout=600 wait=30 +```bash timeout=900 wait=30 $ wait-for-lb $(kubectl get ingress -n ui -o jsonpath='{.items[0].status.loadBalancer.ingress[0].hostname}') + Waiting for k8s-ui-ui-5ddc3ba496-721427594.us-west-2.elb.amazonaws.com... 
You can now access http://k8s-ui-ui-5ddc3ba496-721427594.us-west-2.elb.amazonaws.com
```

diff --git a/website/docs/observability/high-availability/index.md b/website/docs/observability/high-availability/index.md
index 3d9254faa..9873c4990 100644
--- a/website/docs/observability/high-availability/index.md
+++ b/website/docs/observability/high-availability/index.md
@@ -1,10 +1,28 @@
 ---
-title: "Resiliency"
+title: "Chaos Engineering with EKS"
 sidebar_position: 70
 sidebar_custom_props: { "module": true }
 weight: 10
 ---
+:::tip Before you start
+Prepare your environment for this section:
+
+```bash timeout=900 wait=30
+$ kubectl delete deployment ui -n ui
+$ prepare-environment observability/resiliency
+```
+
+This will make the following changes to your lab environment:
+
+- Create the ingress load balancer
+- Create RBAC and Rolebindings
+- Install AWS Load Balancer controller
+- Create an IAM role for AWS Fault Injection Simulator (FIS)
+
+You can view the Terraform that applies these changes [here](https://github.com/VAR::MANIFESTS_OWNER/VAR::MANIFESTS_REPOSITORY/tree/VAR::MANIFESTS_REF/manifests/modules/observability/resiliency/.workshop/terraform).
+:::
+
 ## What is Resiliency?
 Resiliency in cloud computing refers to a system's ability to maintain acceptable performance levels in the face of faults and challenges to normal operation. It encompasses:
@@ -24,13 +42,17 @@ Amazon EKS provides a managed Kubernetes platform, but it's still crucial to des
 4. **Cost Efficiency**: Avoid overprovisioning by building systems that can handle variable loads and partial failures.
 5. **Compliance**: Meet regulatory requirements for uptime and data protection in various industries.
-## Resiliency Scenarios Covered in this Chapter
+## Lab Overview and Resiliency Scenarios
-We'll explore several scenarios to show resiliency by by simulating and responding to:
+In this lab, we'll explore various high availability scenarios and test the resilience of your EKS environment. Through a series of experiments, you'll gain hands-on experience in handling different types of failures and understanding how your Kubernetes cluster responds to these challenges.
-1. Pod Failures
-2. Node Failures
-3. Availability Zone Failures
+We'll simulate and respond to:
+
+1. **Pod Failures**: Using Chaos Mesh to test your application's resilience to individual pod failures.
+2. **Node Failures**: Observing Kubernetes' self-healing capabilities when worker nodes are lost.
+   - Without AWS Fault Injection Simulator: Manually simulating a node failure.
+   - With AWS Fault Injection Simulator: Leveraging AWS Fault Injection Simulator for partial and complete node failure scenarios.
+3. **Availability Zone Failure**: Simulating the loss of an entire AZ to validate your multi-AZ deployment strategy.
## What You'll Learn @@ -41,6 +63,13 @@ By the end of this chapter, you'll be able to: - Observe the self-healing capabilities of Kubernetes in action - Gain practical experience in chaos engineering for EKS environments +These experiments will help you understand: + +- How Kubernetes handles different types of failures +- The importance of proper resource allocation and pod distribution +- The effectiveness of your monitoring and alerting systems +- How to improve your application's fault tolerance and recovery strategies + ## Tools and Technologies Throughout this chapter, we'll be using: @@ -59,6 +88,13 @@ Chaos engineering is the practice of intentionally introducing controlled failur 3. Improve your incident response procedures 4. Foster a culture of resilience within your organization +By the end of this lab, you'll have a comprehensive understanding of your EKS environment's high availability capabilities and areas for potential improvement. + :::info -For more information on AWS Resiliency features in greater depth, we recommend checking out [Operating resilient workloads on Amazon EKS](https://aws.amazon.com/blogs/containers/operating-resilient-workloads-on-amazon-eks/) -::: +For more information on AWS Resiliency features in greater depth, we recommend checking out: + +- [Ingress Load Balancer](/docs/fundamentals/exposing/ingress/) +- [Integrating with Kubernetes RBAC](/docs/security/cluster-access-management/kubernetes-rbac) +- [AWS Fault Injection Simulator](https://aws.amazon.com/fis/) +- [Operating resilient workloads on Amazon EKS](https://aws.amazon.com/blogs/containers/operating-resilient-workloads-on-amazon-eks/) + ::: From 0da600cb5e0deffcdaec419d89d303bc8cfce55f Mon Sep 17 00:00:00 2001 From: Sai Vennam Date: Fri, 27 Sep 2024 12:47:25 -0500 Subject: [PATCH 05/11] Update cluster.yaml --- cluster/eksctl/cluster.yaml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cluster/eksctl/cluster.yaml b/cluster/eksctl/cluster.yaml index a22a4a127..4c78c034d 100644 --- a/cluster/eksctl/cluster.yaml +++ b/cluster/eksctl/cluster.yaml @@ -31,8 +31,7 @@ managedNodeGroups: maxSize: 6 instanceType: m5.large privateNetworking: true - # had to remove use make create - #releaseVersion: "1.30.0-20240625" + releaseVersion: "1.30.0-20240625" updateConfig: maxUnavailablePercentage: 50 labels: From b2e15ac1e51606012196d13405334c1a2fe6b3cb Mon Sep 17 00:00:00 2001 From: cyturney Date: Wed, 14 Aug 2024 11:15:29 -0700 Subject: [PATCH 06/11] Resiliency PR initial commit --- cluster/eksctl/cluster.yaml | 3 +- .../modules/resiliency/.workshop/cleanup.sh | 98 +++++++ .../resiliency/.workshop/terraform/main.tf | 256 ++++++++++++++++++ .../resiliency/.workshop/terraform/outputs.tf | 10 + .../resiliency/.workshop/terraform/vars.tf | 43 +++ .../config/kustomization.yaml | 8 + .../config/scale_and_affinity_patch.yaml | 27 ++ .../rbac/chaos-mesh-role.yaml | 12 + .../rbac/chaos-mesh-rolebinding.yaml | 13 + .../resiliency/scripts/get-pods-by-az.sh | 25 ++ .../resiliency/scripts/node-failure.sh | 25 ++ .../modules/resiliency/scripts/pod-failure.sh | 26 ++ .../resiliency/scripts/verify-cluster.sh | 95 +++++++ .../resiliency/high-availability/01-setup.md | 90 ++++++ .../high-availability/02-pod-failure.md | 50 ++++ .../03-node-failure-no-fis.md | 82 ++++++ .../04-node-failure-partial-fis.md | 82 ++++++ .../05-node-failure-complete-fis.md | 65 +++++ .../high-availability/06-az-failure.md | 134 +++++++++ .../resiliency/high-availability/index.md | 49 ++++ website/docs/resiliency/index.md | 54 ++++ 
website/docusaurus.config.js | 6 + website/sidebars.js | 1 + 23 files changed, 1253 insertions(+), 1 deletion(-) create mode 100755 manifests/modules/resiliency/.workshop/cleanup.sh create mode 100644 manifests/modules/resiliency/.workshop/terraform/main.tf create mode 100644 manifests/modules/resiliency/.workshop/terraform/outputs.tf create mode 100644 manifests/modules/resiliency/.workshop/terraform/vars.tf create mode 100644 manifests/modules/resiliency/high-availability/config/kustomization.yaml create mode 100644 manifests/modules/resiliency/high-availability/config/scale_and_affinity_patch.yaml create mode 100644 manifests/modules/resiliency/high-availability/rbac/chaos-mesh-role.yaml create mode 100644 manifests/modules/resiliency/high-availability/rbac/chaos-mesh-rolebinding.yaml create mode 100755 manifests/modules/resiliency/scripts/get-pods-by-az.sh create mode 100755 manifests/modules/resiliency/scripts/node-failure.sh create mode 100755 manifests/modules/resiliency/scripts/pod-failure.sh create mode 100755 manifests/modules/resiliency/scripts/verify-cluster.sh create mode 100644 website/docs/resiliency/high-availability/01-setup.md create mode 100644 website/docs/resiliency/high-availability/02-pod-failure.md create mode 100644 website/docs/resiliency/high-availability/03-node-failure-no-fis.md create mode 100644 website/docs/resiliency/high-availability/04-node-failure-partial-fis.md create mode 100644 website/docs/resiliency/high-availability/05-node-failure-complete-fis.md create mode 100644 website/docs/resiliency/high-availability/06-az-failure.md create mode 100644 website/docs/resiliency/high-availability/index.md create mode 100644 website/docs/resiliency/index.md diff --git a/cluster/eksctl/cluster.yaml b/cluster/eksctl/cluster.yaml index dbbaf5cc1..b038c2441 100644 --- a/cluster/eksctl/cluster.yaml +++ b/cluster/eksctl/cluster.yaml @@ -31,7 +31,8 @@ managedNodeGroups: maxSize: 6 instanceType: m5.large privateNetworking: true - releaseVersion: "1.30.0-20240625" + # had to remove use make create + #releaseVersion: "1.30.0-20240625" updateConfig: maxUnavailablePercentage: 50 labels: diff --git a/manifests/modules/resiliency/.workshop/cleanup.sh b/manifests/modules/resiliency/.workshop/cleanup.sh new file mode 100755 index 000000000..d4040bbde --- /dev/null +++ b/manifests/modules/resiliency/.workshop/cleanup.sh @@ -0,0 +1,98 @@ +#!/bin/bash + +set -e + +# Delete Ingress +kubectl delete ingress -n ui ui --ignore-not-found +kubectl delete ingress ui -n ui --ignore-not-found + +# Delete Deployments +kubectl delete deployment -n ui ui --ignore-not-found +kubectl delete deployment ui -n ui --ignore-not-found + +# Delete Services +kubectl delete service -n ui ui-nlb --ignore-not-found + +# Delete Roles and RoleBindings +kubectl delete role chaos-mesh-role -n ui --ignore-not-found +kubectl delete rolebinding chaos-mesh-rolebinding -n ui --ignore-not-found + +# Uninstall Helm chart +if command -v helm &> /dev/null; then + echo "Uninstalling aws-load-balancer-controller Helm chart" + helm uninstall aws-load-balancer-controller -n kube-system || true + + echo "Uninstalling Chaos Mesh Helm chart" + helm uninstall chaos-mesh -n chaos-mesh || true + + # Wait for resources to be cleaned up + echo "Waiting for resources to be cleaned up..." + sleep 30 +else + echo "Helm command not found. Skipping Helm chart uninstallations." 
+fi + +kubectl delete namespace chaos-mesh --ignore-not-found + +# Delete IAM Roles and Policies +ROLE_PREFIX="fis-execution-role-eks-workshop" +POLICY_PREFIX="eks-resiliency-fis-policy" + +# List and delete roles +for role in $(aws iam list-roles --query "Roles[?starts_with(RoleName, '${ROLE_PREFIX}')].RoleName" --output text); do + echo "Detaching policies and deleting role: $role" + # Detach managed policies + aws iam detach-role-policy --role-name $role --policy-arn arn:aws:iam::aws:policy/service-role/AWSFaultInjectionSimulatorEKSAccess || true + aws iam detach-role-policy --role-name $role --policy-arn arn:aws:iam::aws:policy/service-role/AWSFaultInjectionSimulatorNetworkAccess || true + + # Detach and delete inline policies + for policy in $(aws iam list-role-policies --role-name $role --query PolicyNames --output text); do + aws iam delete-role-policy --role-name $role --policy-name $policy || true + done + + # Delete the role + aws iam delete-role --role-name $role || true +done + +# List and delete policies +for policy_arn in $(aws iam list-policies --scope Local --query "Policies[?starts_with(PolicyName, '${POLICY_PREFIX}')].Arn" --output text); do + echo "Deleting policy: $policy_arn" + + # Detach policy from all attached roles + for role in $(aws iam list-entities-for-policy --policy-arn $policy_arn --entity-filter Role --query 'PolicyRoles[*].RoleName' --output text); do + aws iam detach-role-policy --role-name $role --policy-arn $policy_arn + done + + # Delete the policy + aws iam delete-policy --policy-arn $policy_arn +done + +# Delete any leftover ALBs +ALB_ARN=$(aws elbv2 describe-load-balancers --query "LoadBalancers[?starts_with(LoadBalancerName, 'k8s-ui-ui-') || starts_with(LoadBalancerName, 'k8s-default-ui-')].LoadBalancerArn" --output text) +if [ ! -z "$ALB_ARN" ]; then + echo "Deleting leftover ALB: $ALB_ARN" + aws elbv2 delete-load-balancer --load-balancer-arn $ALB_ARN +else + echo "No leftover ALB found." +fi + +# Delete S3 bucket +BUCKET_PREFIX="eks-workshop-canary-artifacts-" +for bucket in $(aws s3api list-buckets --query "Buckets[?starts_with(Name, '${BUCKET_PREFIX}')].Name" --output text); do + echo "Deleting S3 bucket: $bucket" + # First, remove all objects from the bucket + aws s3 rm s3://$bucket --recursive + # Then delete the bucket + aws s3api delete-bucket --bucket $bucket --region us-west-2 +done + +# Delete CloudWatch Synthetics canary +CANARY_NAME="eks-workshop-canary" +if aws synthetics get-canary --name $CANARY_NAME --region us-west-2 &> /dev/null; then + echo "Deleting CloudWatch Synthetics canary: $CANARY_NAME" + aws synthetics delete-canary --name $CANARY_NAME --region us-west-2 +else + echo "CloudWatch Synthetics canary $CANARY_NAME not found." +fi + +echo "Cleanup completed successfully." 
\ No newline at end of file diff --git a/manifests/modules/resiliency/.workshop/terraform/main.tf b/manifests/modules/resiliency/.workshop/terraform/main.tf new file mode 100644 index 000000000..7e039cbdf --- /dev/null +++ b/manifests/modules/resiliency/.workshop/terraform/main.tf @@ -0,0 +1,256 @@ +module "eks_blueprints_addons" { + source = "aws-ia/eks-blueprints-addons/aws" + version = "1.16.3" + + cluster_name = var.addon_context.eks_cluster_id + cluster_endpoint = var.addon_context.aws_eks_cluster_endpoint + cluster_version = var.eks_cluster_version + oidc_provider_arn = var.addon_context.eks_oidc_provider_arn + + enable_aws_load_balancer_controller = true + create_kubernetes_resources = false + +} + + +// ALB creation +resource "kubernetes_manifest" "ui_alb" { + manifest = { + "apiVersion" = "networking.k8s.io/v1" + "kind" = "Ingress" + "metadata" = { + "name" = "ui" + "namespace" = "ui" + "annotations" = { + "alb.ingress.kubernetes.io/scheme" = "internet-facing" + "alb.ingress.kubernetes.io/target-type" = "ip" + "alb.ingress.kubernetes.io/healthcheck-path" = "/actuator/health/liveness" + } + } + "spec" = { + ingressClassName = "alb", + "rules" = [{ + "http" = { + paths = [{ + path = "/" + pathType = "Prefix" + "backend" = { + service = { + name = "ui" + port = { + number = 80 + } + } + } + }] + } + }] + } + } +} + +// Create RBAC and Rolebinding +resource "kubernetes_role" "chaos_mesh_role" { + metadata { + name = "chaos-mesh-role" + namespace = "ui" + } + + rule { + api_groups = ["chaos-mesh.org"] + resources = ["podchaos"] + verbs = ["create", "delete", "get", "list", "patch", "update", "watch"] + } + + rule { + api_groups = [""] + resources = ["pods"] + verbs = ["get", "list", "watch"] + } +} + +data "aws_caller_identity" "current" {} + +resource "kubernetes_role_binding" "chaos_mesh_rolebinding" { + metadata { + name = "chaos-mesh-rolebinding" + namespace = "ui" + } + + role_ref { + api_group = "rbac.authorization.k8s.io" + kind = "Role" + name = kubernetes_role.chaos_mesh_role.metadata[0].name + } + + subject { + kind = "User" + name = data.aws_caller_identity.current.arn + namespace = "ui" + } +} + +// Add AWS Load Balancer controller +resource "helm_release" "aws_load_balancer_controller" { + name = "aws-load-balancer-controller" + repository = "https://aws.github.io/eks-charts" + chart = "aws-load-balancer-controller" + namespace = "kube-system" + version = var.load_balancer_controller_chart_version + + set { + name = "clusterName" + value = var.addon_context.eks_cluster_id + } + + set { + name = "serviceAccount.name" + value = "aws-load-balancer-controller-sa" + } + + set { + name = "serviceAccount.annotations.eks\\.amazonaws\\.com/role-arn" + value = module.eks_blueprints_addons.aws_load_balancer_controller.iam_role_arn + } +} + + +// Chaos Mesh Helm Release +resource "helm_release" "chaos_mesh" { + name = "chaos-mesh" + repository = "https://charts.chaos-mesh.org" + chart = "chaos-mesh" + namespace = "chaos-mesh" + version = "2.5.1" + + create_namespace = true +} + +// FIS IAM role +resource "random_id" "suffix" { + byte_length = 8 +} + +resource "aws_iam_role" "fis_role" { + name = "fis-execution-role-${var.addon_context.eks_cluster_id}-${random_id.suffix.hex}" + + assume_role_policy = jsonencode({ + Version = "2012-10-17" + Statement = [ + { + Effect = "Allow" + Principal = { + Service = "fis.amazonaws.com" + } + Action = "sts:AssumeRole" + }, + { + Effect = "Allow" + Principal = { + Federated = var.addon_context.eks_oidc_provider_arn + } + Action = 
"sts:AssumeRoleWithWebIdentity" + Condition = { + StringEquals = { + "${trimprefix(var.addon_context.eks_oidc_provider_arn, "arn:aws:iam::${data.aws_caller_identity.current.account_id}:oidc-provider/")}:sub" = [ + "system:serviceaccount:ui:chaos-mesh-sa" + ] + } + } + }, + { + Effect = "Allow" + Principal = { + Service = "ssm.amazonaws.com" + } + Action = "sts:AssumeRole" + } + ] + }) + + lifecycle { + create_before_destroy = true + } + + depends_on = [kubernetes_role_binding.chaos_mesh_rolebinding] +} + +// Attach FIS Access Policy +resource "aws_iam_role_policy_attachment" "fis_eks_access" { + policy_arn = "arn:aws:iam::aws:policy/service-role/AWSFaultInjectionSimulatorEKSAccess" + role = aws_iam_role.fis_role.name +} + +resource "aws_iam_role_policy_attachment" "fis_network_access" { + policy_arn = "arn:aws:iam::aws:policy/service-role/AWSFaultInjectionSimulatorNetworkAccess" + role = aws_iam_role.fis_role.name +} + +// Policy for creating FIS experiment templates +resource "aws_iam_policy" "eks_resiliency_fis_policy" { + name = "eks-resiliency-fis-policy-${random_id.suffix.hex}" + path = "/" + description = "Custom policy for EKS resiliency FIS experiments" + + policy = jsonencode({ + Version = "2012-10-17" + Statement = [ + { + Effect = "Allow" + Action = [ + // FIS + "fis:CreateExperimentTemplate", + "fis:GetExperimentTemplate", + "fis:ListExperimentTemplates", + "fis:DeleteExperimentTemplate", + "fis:UpdateExperimentTemplate", + "fis:TagResource", + "fis:UntagResource", + "fis:StartExperiment", + "fis:GetExperiment", + "fis:ListExperiments", + "ec2:DescribeInstances", + "ec2:DescribeInstanceStatus", + "ec2:TerminateInstances", + "eks:DescribeCluster", + "eks:ListNodegroups", + "eks:DescribeNodegroup", + "autoscaling:DescribeAutoScalingGroups", + "autoscaling:DescribeAutoScalingInstances", + "autoscaling:SetDesiredCapacity", + "logs:CreateLogDelivery", + "logs:GetLogDelivery", + "logs:UpdateLogDelivery", + "logs:DeleteLogDelivery", + "logs:ListLogDeliveries", + // Synthetic Canary + "synthetics:CreateCanary", + "synthetics:DeleteCanary", + "synthetics:DescribeCanaries", + "synthetics:StartCanary", + "synthetics:StopCanary", + "synthetics:UpdateCanary", + "s3:PutObject", + "s3:GetBucketLocation", + "s3:ListAllMyBuckets", + "cloudwatch:PutMetricData", + "logs:CreateLogGroup", + "logs:CreateLogStream", + "logs:PutLogEvents" + ] + Resource = "*" + }, + { + Effect = "Allow" + Action = "iam:PassRole" + Resource = aws_iam_role.fis_role.arn + } + ] + }) +} + +// Attach custom policy to the role +resource "aws_iam_role_policy_attachment" "eks_resiliency_fis_policy_attachment" { + policy_arn = aws_iam_policy.eks_resiliency_fis_policy.arn + role = aws_iam_role.fis_role.name +} diff --git a/manifests/modules/resiliency/.workshop/terraform/outputs.tf b/manifests/modules/resiliency/.workshop/terraform/outputs.tf new file mode 100644 index 000000000..413de0df1 --- /dev/null +++ b/manifests/modules/resiliency/.workshop/terraform/outputs.tf @@ -0,0 +1,10 @@ +output "environment_variables" { + description = "Environment variables to be added to the IDE shell" + value = { + LBC_CHART_VERSION = var.load_balancer_controller_chart_version + LBC_ROLE_ARN = module.eks_blueprints_addons.aws_load_balancer_controller.iam_role_arn + FIS_ROLE_ARN = aws_iam_role.fis_role.arn + RANDOM_SUFFIX = random_id.suffix.hex + SCRIPT_DIR = "/manifests/modules/resiliency/scripts" + } +} diff --git a/manifests/modules/resiliency/.workshop/terraform/vars.tf b/manifests/modules/resiliency/.workshop/terraform/vars.tf new file 
mode 100644 index 000000000..42bd4d060 --- /dev/null +++ b/manifests/modules/resiliency/.workshop/terraform/vars.tf @@ -0,0 +1,43 @@ +# tflint-ignore: terraform_unused_declarations +variable "eks_cluster_id" { + description = "EKS cluster name" + type = string +} + +# tflint-ignore: terraform_unused_declarations +variable "eks_cluster_version" { + description = "EKS cluster version" + type = string +} + +# tflint-ignore: terraform_unused_declarations +variable "cluster_security_group_id" { + description = "EKS cluster security group ID" + type = any +} + +# tflint-ignore: terraform_unused_declarations +variable "addon_context" { + description = "Addon context that can be passed directly to blueprints addon modules" + type = any +} + +# tflint-ignore: terraform_unused_declarations +variable "tags" { + description = "Tags to apply to AWS resources" + type = any +} + +# tflint-ignore: terraform_unused_declarations +variable "resources_precreated" { + description = "Have expensive resources been created already" + type = bool +} + +variable "load_balancer_controller_chart_version" { + description = "The chart version of aws-load-balancer-controller to use" + type = string + # renovate-helm: depName=aws-load-balancer-controller + default = "1.8.1" +} + diff --git a/manifests/modules/resiliency/high-availability/config/kustomization.yaml b/manifests/modules/resiliency/high-availability/config/kustomization.yaml new file mode 100644 index 000000000..b71687089 --- /dev/null +++ b/manifests/modules/resiliency/high-availability/config/kustomization.yaml @@ -0,0 +1,8 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +resources: + - ../../../../../manifests/base-application/ui + +patches: + - path: scale_and_affinity_patch.yaml diff --git a/manifests/modules/resiliency/high-availability/config/scale_and_affinity_patch.yaml b/manifests/modules/resiliency/high-availability/config/scale_and_affinity_patch.yaml new file mode 100644 index 000000000..c84b9a056 --- /dev/null +++ b/manifests/modules/resiliency/high-availability/config/scale_and_affinity_patch.yaml @@ -0,0 +1,27 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: ui + namespace: ui +spec: + replicas: 5 + selector: + matchLabels: + app: ui + template: + metadata: + labels: + app: ui + spec: + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app + operator: In + values: + - ui + topologyKey: "kubernetes.io/hostname" diff --git a/manifests/modules/resiliency/high-availability/rbac/chaos-mesh-role.yaml b/manifests/modules/resiliency/high-availability/rbac/chaos-mesh-role.yaml new file mode 100644 index 000000000..5e5981a82 --- /dev/null +++ b/manifests/modules/resiliency/high-availability/rbac/chaos-mesh-role.yaml @@ -0,0 +1,12 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + namespace: ui + name: chaos-mesh-role +rules: + - apiGroups: ["chaos-mesh.org"] + resources: ["podchaos"] + verbs: ["create", "delete", "get", "list", "patch", "update", "watch"] + - apiGroups: [""] + resources: ["pods"] + verbs: ["get", "list", "watch"] diff --git a/manifests/modules/resiliency/high-availability/rbac/chaos-mesh-rolebinding.yaml b/manifests/modules/resiliency/high-availability/rbac/chaos-mesh-rolebinding.yaml new file mode 100644 index 000000000..338d88c3b --- /dev/null +++ b/manifests/modules/resiliency/high-availability/rbac/chaos-mesh-rolebinding.yaml @@ -0,0 +1,13 @@ +apiVersion: 
rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: chaos-mesh-rolebinding + namespace: ui +subjects: + - kind: User + name: PLACEHOLDER + namespace: ui +roleRef: + kind: Role + name: chaos-mesh-role + apiGroup: rbac.authorization.k8s.io diff --git a/manifests/modules/resiliency/scripts/get-pods-by-az.sh b/manifests/modules/resiliency/scripts/get-pods-by-az.sh new file mode 100755 index 000000000..8063f1094 --- /dev/null +++ b/manifests/modules/resiliency/scripts/get-pods-by-az.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +# Credit to "Disaster recovery, high availability, and resiliency on Amazon EKS" +# https://catalog.us-east-1.prod.workshops.aws/workshops/6140457f-53b2-48b8-a007-2d4be06ba2fc + +GREEN='\033[0;32m' +RED='\033[0;31m' +NC='\033[0m' # No Color + +CURRENT_CONTEXT=$(kubectl config current-context) +REGION=$(kubectl config view -o jsonpath="{.contexts[?(@.name == \"$CURRENT_CONTEXT\")].context.cluster}" | cut -d : -f 4) + +for az in a b c +do + AZ=$REGION$az + echo -n "------" + echo -n -e "${GREEN}$AZ${NC}" + echo "------" + for node in $(kubectl get nodes -l topology.kubernetes.io/zone=$AZ --no-headers | grep -v NotReady | cut -d " " -f1) + do + echo -e " ${RED}$node:${NC}" + kubectl get pods -n ui --no-headers --field-selector spec.nodeName=${node} 2>&1 | while read line; do echo " ${line}"; done + done + echo "" +done \ No newline at end of file diff --git a/manifests/modules/resiliency/scripts/node-failure.sh b/manifests/modules/resiliency/scripts/node-failure.sh new file mode 100755 index 000000000..80d3fc3b9 --- /dev/null +++ b/manifests/modules/resiliency/scripts/node-failure.sh @@ -0,0 +1,25 @@ +#!/bin/bash +# node-failure.sh - Simulates node failure by stopping an EC2 instance with running pods + +# Get a list of nodes with running pods +node_with_pods=$(kubectl get pods --all-namespaces -o wide | awk 'NR>1 {print $8}' | sort | uniq) + +if [ -z "$node_with_pods" ]; then + echo "No nodes with running pods found. Please run this script: $SCRIPT_DIR/verify-cluster.sh" + exit 1 +fi + +# Select a random node from the list +selected_node=$(echo "$node_with_pods" | shuf -n 1) + +# Get the EC2 instance ID for the selected node +instance_id=$(aws ec2 describe-instances \ + --filters "Name=private-dns-name,Values=$selected_node" \ + --query "Reservations[*].Instances[*].InstanceId" \ + --output text) + +# Stop the instance to simulate a node failure +echo "Stopping instance: $instance_id (Node: $selected_node)" +aws ec2 stop-instances --instance-ids $instance_id + +echo "Instance $instance_id is being stopped. Monitoring pod distribution..." 
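As a rough follow-up once this script has stopped an instance (a sketch, not part of the workshop assets), you could confirm recovery by watching the node list until the affected node rejoins, or by polling the instance state directly; the instance ID below is a placeholder for the one printed by the script:

```bash
# Watch node status until the restarted/replaced node reports Ready again
$ kubectl get nodes --watch

# Poll the stopped instance's state (placeholder ID; use the one from the script output)
$ aws ec2 describe-instances --instance-ids i-0123456789abcdef0 \
    --query 'Reservations[0].Instances[0].State.Name' --output text
```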
diff --git a/manifests/modules/resiliency/scripts/pod-failure.sh b/manifests/modules/resiliency/scripts/pod-failure.sh new file mode 100755 index 000000000..3ed7df813 --- /dev/null +++ b/manifests/modules/resiliency/scripts/pod-failure.sh @@ -0,0 +1,26 @@ +#!/bin/bash +# pod-failure.sh - Simulates pod failure using Chaos Mesh + +# Generates a unique identifier for the pod failure experiment +unique_id=$(date +%s) + +# Create a YAML configuration for the PodChaos resource +cat < pod-failure.yaml +apiVersion: chaos-mesh.org/v1alpha1 +kind: PodChaos +metadata: + name: pod-failure-$unique_id + namespace: ui +spec: + action: pod-kill + mode: one + selector: + namespaces: + - ui + labelSelectors: + "app.kubernetes.io/name": "ui" + duration: "60s" +EOF + +# Apply the PodChaos configuration to simulate the failure +kubectl apply -f pod-failure.yaml diff --git a/manifests/modules/resiliency/scripts/verify-cluster.sh b/manifests/modules/resiliency/scripts/verify-cluster.sh new file mode 100755 index 000000000..56e2844df --- /dev/null +++ b/manifests/modules/resiliency/scripts/verify-cluster.sh @@ -0,0 +1,95 @@ +#!/bin/bash +# verify-cluster.sh - Verifies cluster state and corrects replica count + +DESIRED_REPLICAS=5 +MAX_WAIT_TIME=300 # 5 minutes +POLL_INTERVAL=10 # 10 seconds +NAMESPACE="ui" + +print_header() { + echo -e "\n==== $1 ====\n" +} + +wait_for_condition() { + local end_time=$((SECONDS + MAX_WAIT_TIME)) + while [ $SECONDS -lt $end_time ]; do + if eval "$1"; then + return 0 + fi + echo -n "." + sleep $POLL_INTERVAL + done + echo " Timeout!" + return 1 +} + +print_header "Checking Current Pod Distribution" +$SCRIPT_DIR/get-pods-by-az.sh + +print_header "Node Information" +kubectl get nodes -o wide + +print_header "Verifying Cluster State" +node_count=$(kubectl get nodes --no-headers | grep " Ready " | grep -vc "SchedulingDisabled") +current_pod_count=$(kubectl get pods -n $NAMESPACE -l app=ui --no-headers | grep -v Terminating | wc -l) + +echo "Ready and schedulable nodes: $node_count" +echo "Current active ui pods: $current_pod_count" +echo "Desired ui pods: $DESIRED_REPLICAS" + +if [ $current_pod_count -ne $DESIRED_REPLICAS ]; then + print_header "Adjusting Replica Count" + echo "Scaling deployment to $DESIRED_REPLICAS replicas..." + kubectl scale deployment ui -n $NAMESPACE --replicas=$DESIRED_REPLICAS + + echo -n "Waiting for pod count to stabilize" + if wait_for_condition "[ \$(kubectl get pods -n $NAMESPACE -l app=ui --no-headers | grep -v Terminating | wc -l) -eq $DESIRED_REPLICAS ]"; then + echo -e "\n✅ Pod count has reached the desired number." + else + echo -e "\n⚠️ Warning: Failed to reach desired pod count within the timeout period." + fi +else + echo "✅ Number of replicas is correct." +fi + +print_header "Checking Pod Distribution" +if [ $node_count -gt 0 ]; then + max_pods_per_node=$((DESIRED_REPLICAS / node_count + 1)) + uneven_distribution=false + + for node in $(kubectl get nodes -o name | grep -v "SchedulingDisabled"); do + pods_on_node=$(kubectl get pods -n $NAMESPACE -l app=ui --field-selector spec.nodeName=${node#node/} --no-headers | grep -v Terminating | wc -l) + if [ $pods_on_node -gt $max_pods_per_node ]; then + uneven_distribution=true + break + fi + done + + if $uneven_distribution; then + echo "⚠️ Pod distribution is uneven. Rebalancing..." 
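+    # Scale the deployment to zero and back up so the scheduler re-places every
+    # replica; the preferred pod anti-affinity applied earlier in this module then
+    # spreads the ui pods across the available nodes again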
+ kubectl scale deployment ui -n $NAMESPACE --replicas=0 + sleep $POLL_INTERVAL + kubectl scale deployment ui -n $NAMESPACE --replicas=$DESIRED_REPLICAS + + echo -n "Waiting for pods to be ready" + if wait_for_condition "[ \$(kubectl get pods -n $NAMESPACE -l app=ui --no-headers | grep Running | wc -l) -eq $DESIRED_REPLICAS ]"; then + echo -e "\n✅ Pods are ready and balanced." + else + echo -e "\n⚠️ Warning: Pods did not reach ready state within the timeout period." + fi + else + echo "✅ Pod distribution is balanced." + fi +else + echo "⚠️ Warning: No Ready and schedulable nodes found. Cannot check pod distribution." +fi + +print_header "Final Pod Distribution" +$SCRIPT_DIR/get-pods-by-az.sh + +echo +if [ $node_count -gt 0 ] && [ $current_pod_count -eq $DESIRED_REPLICAS ]; then + echo "✅ Cluster verification and correction complete." +else + echo "⚠️ Cluster verification complete, but some issues may require attention." +fi \ No newline at end of file diff --git a/website/docs/resiliency/high-availability/01-setup.md b/website/docs/resiliency/high-availability/01-setup.md new file mode 100644 index 000000000..03b327af8 --- /dev/null +++ b/website/docs/resiliency/high-availability/01-setup.md @@ -0,0 +1,90 @@ +--- +title: "Scaling and Pod Anti-Affinity for UI Service" +sidebar_position: 1 +description: "Learn how to scale your pods, add Pod Anti-Affinity configurations, and use a helper script to visualize pod distribution." +--- + +TODO: + +- Update Name +- Update/Remove Verification + +This guide outlines steps to enhance the resilience of a UI service by implementing high availability practices. We'll cover scaling the UI service, implementing pod anti-affinity, and using a helper script to visualize pod distribution across availability zones. + +## Scaling and Pod Anti-Affinity + +We use a Kustomize patch to modify the UI deployment, scaling it to 5 replicas and adding pod anti-affinity rules. This ensures UI pods are distributed across different nodes, reducing the impact of node failures. + +Here's the content of our patch file: + +```file +manifests/modules/resiliency/high-availability/config/scale_and_affinity_patch.yaml +``` + +Apply the changes using Kustomize patch and + + + +```bash +$ kubectl delete deployment ui -n ui +$ kubectl apply -k /manifests/modules/resiliency/high-availability/config/ +``` + +## Create Helper Script: Get Pods by AZ + +The `get-pods-by-az.sh` script helps visualize the distribution of Kubernetes pods across different availability zones in the terminal. You can view the script file + + + +To make this script executable: + +```bash +$ chmod +x $SCRIPT_DIR/get-pods-by-az.sh +``` + +### Script Execution + +To run the script and see the distribution of pods across availability zones, execute: + +```bash +$ $SCRIPT_DIR/get-pods-by-az.sh +``` + +:::tip +Use this to quickly assess the distribution of your pods across multiple zones. +::: + +## Verification + +After applying these changes, verify the setup: + +1. Check for 5 running UI pods: + +```bash +$ kubectl get pods -n ui +``` + +2. Verify pod distribution across nodes: + +```bash +$ kubectl get pods -n ui -o=jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.spec.nodeName}{"\n"}' +``` + +3. Check that AWS Load Balancer Controller is installed and working: + +```bash +$ kubectl get pods -n kube-system | grep aws-load-balancer-controller +$ kubectl get ingress --all-namespaces +``` + +4. 
Ensure the Load Balancer is working and that you can access the retail store URL:
+
+```bash
+$ wait-for-lb $(kubectl get ingress -n ui -o jsonpath='{.items[0].status.loadBalancer.ingress[0].hostname}')
+```
+
+:::info
+For more information on these changes, check out these sections:
+
+- [Pod Affinity and Anti-Affinity](/docs/fundamentals/managed-node-groups/basics/affinity/)
+  :::
diff --git a/website/docs/resiliency/high-availability/02-pod-failure.md b/website/docs/resiliency/high-availability/02-pod-failure.md
new file mode 100644
index 000000000..cbde69d2c
--- /dev/null
+++ b/website/docs/resiliency/high-availability/02-pod-failure.md
@@ -0,0 +1,50 @@
+---
+title: "Simulating Pod Failure"
+sidebar_position: 2
+description: "Simulate pod failure in your environment using Chaos Mesh to test the resiliency of your application."
+---
+
+## Overview
+
+TODO:
+
+- fix file visual?
+- add more information about this lab and a conclusion
+- Note that this experiment is repeatable
+- Note that retail store should still work even when the pod fails
+
+In this experiment, you'll simulate a pod failure within your Kubernetes environment to observe how the system responds. The `pod-failure.sh` script will simulate a pod failure using Chaos Mesh. This is the script we will be using:
+
+```file
+manifests/modules/resiliency/scripts/pod-failure.sh
+```
+
+To make this script executable:
+
+```bash
+$ chmod +x $SCRIPT_DIR/pod-failure.sh
+```
+
+## Running the Experiment
+
+Run the experiment and monitor the effects on pod distribution:
+
+```bash
+$ $SCRIPT_DIR/pod-failure.sh && SECONDS=0; while [ $SECONDS -lt 30 ]; do clear; $SCRIPT_DIR/get-pods-by-az.sh; sleep 1; done
+```
+
+This command initiates the pod failure and monitors the pod distribution for 30 seconds to observe how the system handles the failure. You should see one pod disappear and then reappear.
+
+Check the status of pods in the `ui` namespace:
+
+```bash
+$ kubectl get pods -n ui -o wide
+```
+
+## Verify Retail Store Availability
+
+To ensure that the retail store is operational, check its availability using the URL fetched by this command:
+
+```bash
+$ wait-for-lb $(kubectl get ingress -n ui -o jsonpath='{.items[0].status.loadBalancer.ingress[0].hostname}')
+```
diff --git a/website/docs/resiliency/high-availability/03-node-failure-no-fis.md b/website/docs/resiliency/high-availability/03-node-failure-no-fis.md
new file mode 100644
index 000000000..7e154f2b0
--- /dev/null
+++ b/website/docs/resiliency/high-availability/03-node-failure-no-fis.md
@@ -0,0 +1,82 @@
+---
+title: "Simulating Node Failure without FIS"
+sidebar_position: 3
+description: "Manually simulate a node failure in your Kubernetes environment to test the resilience of your applications without using AWS FIS."
+---
+
+# Simulating Node Failure without FIS
+
+TODO:
+
+- add information and concluding thoughts
+- note that this is repeatable
+- should see node failure after about a minute, pods return shortly after to the remaining working nodes, node comes back online after about 2 minutes
+- should I make more things following the verify-cluster.sh visual?
+- Load balancer does not appear to work although it should
+- Rather than showing the whole script, show expected output?
+- Update script to wait for 3 nodes online
+
+## Overview
+
+This experiment simulates a node failure manually in your Kubernetes cluster to understand the impact on your deployed applications, particularly focusing on the retail store application's availability.
The `node-failure.sh` script will manually stop an EC2 instance to simulate node failure. Here is the script we will use: + +```file +manifests/modules/resiliency/scripts/node-failure.sh +``` + +To make this script executable: + +```bash +$ chmod +x $SCRIPT_DIR/node-failure.sh +``` + +## Running the Experiment + +Run the node failure experiment and monitor the effects on pod distribution: + +```bash +$ $SCRIPT_DIR/node-failure.sh && SECONDS=0; while [ $SECONDS -lt 120 ]; do clear; $SCRIPT_DIR/get-pods-by-az.sh; sleep 1; done +``` + +This command will stop the selected EC2 instance and monitor the pod distribution for 2 minutes, observing how the system redistributes workloads. + +During the experiment, you should observe the following: + +- One node disappearing from the list +- Kubernetes will detect the node failure and reschedule the pods that were running on the failed node +- These pods being redistributed to the remaining healthy nodes +- The failed node will come back online + +The total number of running pods should remain constant, ensuring application availability. + +## Verify Retail Store Availability + +After simulating the node failure, verify if the retail store application remains accessible: + +```bash +$ wait-for-lb $(kubectl get ingress -n ui -o jsonpath='{.items[0].status.loadBalancer.ingress[0].hostname}') +``` + +## Verifying Cluster Recovery + +After simulating the node failure, we'll verify the cluster's self-healing and potentially rebalance the pod distribution if necessary. Since the cluster often recovers on its own, we'll focus on checking the current state and ensuring an optimal distribution of pods. + +Use the following + + + +to verify the cluster state and rebalance pods: + +```bash +$ chmod +x $SCRIPT_DIR/verify-cluster.sh +$ $SCRIPT_DIR/verify-cluster.sh +``` + +This script will: + +- Counts the number of nodes and ui pods +- Checks if the pods are evenly distributed across the nodes + +## Conclusion + +add concluding thoughts diff --git a/website/docs/resiliency/high-availability/04-node-failure-partial-fis.md b/website/docs/resiliency/high-availability/04-node-failure-partial-fis.md new file mode 100644 index 000000000..4b9091fd5 --- /dev/null +++ b/website/docs/resiliency/high-availability/04-node-failure-partial-fis.md @@ -0,0 +1,82 @@ +--- +title: "Simulating Partial Node Failure with FIS" +sidebar_position: 4 +description: "Simulate a partial node failures in your Kubernetes environment using AWS Fault Injection Simulator to test application resiliency." +--- + +# Simulating Partial Node Failure with FIS + +TODO: + +- More FIS info? +- More information about the experiment +- Explain what FIS is doing different, what the experiment is doing +- should see a 1 node failing after about a minute, pods to come back up after 2 and a half minutes, and the node come back up after +- check to make sure retail app stays up +- retail app apears to not work -> need to fix load balancer configs +- A conclusion / learning from experiment +- Note that FIS can allow automatic testing for failure and whatever else is cool + +## AWS Fault Injection Simulator (FIS) Overview + +AWS Fault Injection Simulator is a fully managed service that helps you perform fault injection experiments on your AWS workloads. In the context of EKS, FIS allows us to simulate various failure scenarios, which is crucial for: + +1. Validating high availability configurations +2. Testing auto-scaling and self-healing capabilities +3. Identifying potential single points of failure +4. 
Improving incident response procedures + +By using FIS, you can: + +- Discover hidden bugs and performance bottlenecks +- Observe how your systems behave under stress +- Implement and validate automated recovery procedures + +In our FIS experiment, we'll simulate a partial node failure in our EKS cluster and observe how our application responds, providing practical insights into building resilient systems. + +:::info +For more information on AWS FIS check out: + +- [What is AWS Fault Injection Service?](https://docs.aws.amazon.com/fis/latest/userguide/what-is.html) + ::: + +## Creating the Node Failure Experiment + +Create a new AWS FIS experiment template to simulate the node failure: + +```bash +$ NODE_EXP_ID=$(aws fis create-experiment-template --cli-input-json '{"description":"NodeDeletion","targets":{"Nodegroups-Target-1":{"resourceType":"aws:eks:nodegroup","resourceTags":{"eksctl.cluster.k8s.io/v1alpha1/cluster-name":"eks-workshop"},"selectionMode":"COUNT(2)"}},"actions":{"nodedeletion":{"actionId":"aws:eks:terminate-nodegroup-instances","parameters":{"instanceTerminationPercentage":"66"},"targets":{"Nodegroups":"Nodegroups-Target-1"}}},"stopConditions":[{"source":"none"}],"roleArn":"'$FIS_ROLE_ARN'","tags":{"ExperimentSuffix": "'$RANDOM_SUFFIX'"}}' --output json | jq -r '.experimentTemplate.id') +``` + +## Running the Experiment + +Execute the FIS experiment to simulate the node failure and monitor the response: + +```bash +$ aws fis start-experiment --experiment-template-id $NODE_EXP_ID --output json && SECONDS=0; while [ $SECONDS -lt 300 ]; do clear; $SCRIPT_DIR/get-pods-by-az.sh; sleep 1; done +``` + +This will trigger the node failure and begin monitoring the pods for 5 minutes, observing how the cluster responds to losing part of its capacity. + +## Verifying Retail Store Availability + +After simulating the node failure, check if the retail store application remains operational: + +```bash +$ wait-for-lb $(kubectl get ingress -n ui -o jsonpath='{.items[0].status.loadBalancer.ingress[0].hostname}') +``` + +Despite a partial node failure, the retail store continues to serve traffic, demonstrating the resilience of your deployment setup. + +:::caution +Partial node failures test the limits of your application's failover capabilities. Monitor and determine how well your applications and services recover from such events. +::: + +:::note +To verify clusters and rebalance pods you can run: + +```bash +$ $SCRIPT_DIR/verify-cluster.sh +``` + +::: diff --git a/website/docs/resiliency/high-availability/05-node-failure-complete-fis.md b/website/docs/resiliency/high-availability/05-node-failure-complete-fis.md new file mode 100644 index 000000000..ab5cbdd95 --- /dev/null +++ b/website/docs/resiliency/high-availability/05-node-failure-complete-fis.md @@ -0,0 +1,65 @@ +--- +title: "Simulating Complete Node Failure with FIS" +sidebar_position: 5 +description: "Demonstrates the impact of a complete node failure on a Kubernetes environment using AWS Fault Injection Simulator." +--- + +# Simulating Complete Node Failure with FIS + +TODO: + +- Fix script to mimic last experiment again +- Why is this different than last experiment +- Explain what is happening in more detail +- Note timings +- Concluding Statement +- You should see all nodes and pods dissapear rather quickly then after about 2 minutes will start to see 1 node and pods coming online, after 4 minutes a second node will come online and 3 more pods. 
+ +## Overview + +This experiment is an extensive test that isn't necessary but demonstrates the robust capabilities of AWS Fault Injection Simulator by simulating a complete node failure in a Kubernetes cluster. + +:::info Important +This test showcases how FIS can be used to simulate worst-case scenarios to help validate the resilience and recovery strategies of your applications. +::: + +## Creating the Node Failure Experiment + +Create a new AWS FIS experiment template to simulate the complete failure of all nodes in a specific node group: + +```bash +$ FULL_NODE_EXP_ID=$(aws fis create-experiment-template --cli-input-json '{"description":"NodeDeletion","targets":{"Nodegroups-Target-1":{"resourceType":"aws:eks:nodegroup","resourceTags":{"eksctl.cluster.k8s.io/v1alpha1/cluster-name":"eks-workshop"},"selectionMode":"ALL"}},"actions":{"nodedeletion":{"actionId":"aws:eks:terminate-nodegroup-instances","parameters":{"instanceTerminationPercentage":"100"},"targets":{"Nodegroups":"Nodegroups-Target-1"}}},"stopConditions":[{"source":"none"}],"roleArn":"'$FIS_ROLE_ARN'","tags":{"ExperimentSuffix": "'$RANDOM_SUFFIX'"}}' --output json | jq -r '.experimentTemplate.id') +``` + +## Running the Experiment + +Execute the FIS experiment to simulate the complete node failure: + +```bash +$ aws fis start-experiment --experiment-template-id $FULL_NODE_EXP_ID --output json && SECONDS=0; while [ $SECONDS -lt 300 ]; do clear; $SCRIPT_DIR/get-pods-by-az.sh; sleep 1; done +``` + +Monitor the cluster as it loses all node resources temporarily, observing how the Kubernetes system and your application respond. + +## Verifying Retail Store Availability + +After simulating the node failure, check if the retail store application is still operational: + +```bash +$ wait-for-lb $(kubectl get ingress -n ui -o jsonpath='{.items[0].status.loadBalancer.ingress[0].hostname}') +``` + +This command helps confirm that despite complete node failure, the application begins to recover as the Kubernetes cluster auto-scales back up. + +:::caution +This test can cause significant disruption, so it's recommended for use only in controlled environments where recovery mechanisms are thoroughly tested. +::: + +:::note +To verify clusters and rebalance pods you can run: + +```bash +$ $SCRIPT_DIR/verify-cluster.sh +``` + +::: diff --git a/website/docs/resiliency/high-availability/06-az-failure.md b/website/docs/resiliency/high-availability/06-az-failure.md new file mode 100644 index 000000000..1091b41e7 --- /dev/null +++ b/website/docs/resiliency/high-availability/06-az-failure.md @@ -0,0 +1,134 @@ +--- +title: "Simulating AZ Failure" +sidebar_position: 6 +description: "This experiment simulates an Availability Zone failure to test the resilience of your Kubernetes environment hosted on AWS EKS." +--- + +# Simulating AZ Failure + +TODO: + +- Fix canary +- Check AZ failure still works +- add specific cloudwatch iam role +- add conclustion + +## Overview + +This experiment simulates an Availability Zone (AZ) failure, demonstrating how robust your application is when faced with significant disruptions. It leverages AWS Fault Injection Simulator (FIS) and additional AWS services to test the resilience of the system under the stress of an AZ going offline. + +## Preparation + +### Setting up a Synthetic Canary + +Before starting the experiment, set up a synthetic canary for heartbeat monitoring: + +1. 
First, create an S3 bucket for the canary artifacts: + +```bash +$ BUCKET_NAME="eks-workshop-canary-artifacts-$(date +%s)" +$ aws s3 mb s3://$BUCKET_NAME --region us-west-2 +``` + +2. Create the canary: + +Set up the blueprint: + +```bash +$ INGRESS_URL=$(kubectl get ingress -n ui -o jsonpath='{.items[0].status.loadBalancer.ingress[0].hostname}') +$ cat < canary_script.js +var synthetics = require('Synthetics'); +var log = require('SyntheticsLogger'); + +const pageLoadBlueprint = async function () { + const PAGE_LOAD_TIMEOUT = 30; + const URL = 'http://${INGRESS_URL}'; + let page = await synthetics.getPage(); + await synthetics.executeStep('Navigate to ' + URL, async function () { + await page.goto(URL, {waitUntil: 'domcontentloaded', timeout: PAGE_LOAD_TIMEOUT * 1000}); + }); + await synthetics.executeStep('Page loaded successfully', async function () { + log.info('Page loaded successfully'); + }); +}; + +exports.handler = async () => { + return await pageLoadBlueprint(); +}; +EOF +$ aws s3 cp canary_script.js s3://$BUCKET_NAME/canary-script/canary_script.js +``` + +Create a synthetic canary: + +```bash +$ INGRESS_URL=$(kubectl get ingress -n ui -o jsonpath='{.items[0].status.loadBalancer.ingress[0].hostname}') +$ aws synthetics create-canary \ + --name eks-workshop-canary \ + --artifact-s3-location "s3://$BUCKET_NAME/canary-artifacts/" \ + --execution-role-arn $FIS_ROLE_ARN \ + --runtime-version syn-nodejs-puppeteer-9.0 \ + --schedule Expression="rate(1 minute)" \ + --code S3Bucket=$BUCKET_NAME,S3Key=canary-script/canary_script.js,Handler="canary_script.handler" \ + --region us-west-2 +$ sleep 30 +$ aws synthetics start-canary --name eks-workshop-canary --region us-west-2 +``` + +3. Create a CloudWatch alarm for the canary: + +```bash +$ aws cloudwatch put-metric-alarm \ + --alarm-name "eks-workshop-canary-alarm" \ + --metric-name SuccessPercent \ + --namespace CloudWatchSynthetics \ + --statistic Average \ + --period 60 \ + --threshold 95 \ + --comparison-operator LessThanThreshold \ + --dimensions Name=CanaryName,Value=eks-workshop-canary \ + --evaluation-periods 1 \ + --alarm-description "Alarm when Canary success rate drops below 95%" \ + --unit Percent \ + --region us-west-2 +``` + +This sets up a canary that checks the health of your application every minute and a CloudWatch alarm that triggers if the success percentage falls below 95%. + +### Setting up the Experiment + +Retrieve the Auto Scaling Group (ASG) name associated with your EKS cluster: + +```bash +$ ASG_NAME_BOTH=$(aws autoscaling describe-auto-scaling-groups --query "AutoScalingGroups[? Tags[? 
(Key=='eks:cluster-name') && Value=='eks-workshop']]".AutoScalingGroupName --output text) +$ ASG_NAME=$(echo $ASG_NAME_BOTH | awk '{print $1}') +``` + +Create the FIS experiment template to simulate the AZ failure: + +```bash +$ ZONE_EXP_ID=$(aws fis create-experiment-template --cli-input-json '{"description":"publicdocument-azfailure","targets":{},"actions":{"azfailure":{"actionId":"aws:ssm:start-automation-execution","parameters":{"documentArn":"arn:aws:ssm:us-west-2::document/AWSResilienceHub-SimulateAzOutageInAsgTest_2020-07-23","documentParameters":"{\"AutoScalingGroupName\":\"'$ASG_NAME'\",\"CanaryAlarmName\":\"eks-workshop-canary-alarm\",\"AutomationAssumeRole\":\"arn:aws:iam::'$AWS_ACCOUNT_ID':role/WSParticipantRole\",\"IsRollback\":\"false\",\"TestDurationInMinutes\":\"2\"}","maxDuration":"PT6M"}}},"stopConditions":[{"source":"none"}],"roleArn":"'$FIS_ROLE_ARN'","tags":{"ExperimentSuffix":"'$RANDOM_SUFFIX'"}}' --output json | jq -r '.experimentTemplate.id') +``` + +## Running the Experiment + +Execute the FIS experiment to simulate the AZ failure: + +```bash +aws fis start-experiment --experiment-template-id $ZONE_EXP_ID --output json && \ +timeout 450 watch -n 1 --color $SCRIPT_DIR/get-pods-by-az.sh +``` + +This command starts the experiment and monitors the distribution and status of pods across different nodes and AZs to understand the immediate impact of the simulated AZ failure. + +## Post-Experiment Verification + +Ensure that your application remains operational despite the simulated AZ failure, confirming the effectiveness of Kubernetes high availability: + +```bash +wait-for-lb $(kubectl get ingress -n ui -o jsonpath='{.items[0].status.loadBalancer.ingress[0].hostname}') +``` + +## Conclusion + +This experiment demonstrates the resilience of your EKS cluster in the face of an Availability Zone failure. By monitoring the canary and observing the redistribution of pods, you can assess how well your application maintains availability during significant infrastructure disruptions. diff --git a/website/docs/resiliency/high-availability/index.md b/website/docs/resiliency/high-availability/index.md new file mode 100644 index 000000000..31556db21 --- /dev/null +++ b/website/docs/resiliency/high-availability/index.md @@ -0,0 +1,49 @@ +--- +title: "High Availability" +sidebar_position: 20 +sidebar_custom_props: { "module": true } +description: "Prepare your EKS environment to handle high availability scenarios effectively." +--- + +TODO: + +- have to delete deployment before? why? is that due to dev or what +- expected time for lab completion +- expected time for prepare-env (about 5 minutes without cleanup.sh and any previous applications) +- Lab overview +- Check info sections +- Are we able to chmod in backend? 
+- Check why the load balancer stopped working + +::required-time + +:::tip Before you start +Prepare your environment for this section: + +```bash timeout=300 wait=30 +$ chmod +x /manifests/modules/resiliency/.workshop/cleanup.sh +$ /manifests/modules/resiliency/.workshop/cleanup.sh +$ prepare-environment resiliency +``` + +This will make the following changes to your lab environment: + +- Create the ingress load balancer +- Create RBAC and Rolebindings +- Install AWS Load Balancer controller +- Install ChaosMesh +- Create an IAM role for AWS Fault Injection Simulator (FIS) + +You can view the Terraform that applies these changes [here](https://github.com/VAR::MANIFESTS_OWNER/VAR::MANIFESTS_REPOSITORY/tree/VAR::MANIFESTS_REF/manifests/modules/resiliency/.workshop/terraform). +::: + +In this lab, we'll look at... +information + +:::info +For more information on these changes checkout: + +- [Ingress Load Balancer](/docs/fundamentals/exposing/ingress/) +- [Integrating with Kubernetes RBAC](/docs/security/cluster-access-management/kubernetes-rbac) +- [Chaos Mesh](https://chaos-mesh.org/) + ::: diff --git a/website/docs/resiliency/index.md b/website/docs/resiliency/index.md new file mode 100644 index 000000000..1541ba19d --- /dev/null +++ b/website/docs/resiliency/index.md @@ -0,0 +1,54 @@ +--- +title: "Resiliency" +sidebar_position: 11 +weight: 10 +--- + +TODO: + +- Add intro information +- Find a lab to input + +Other TODO: + +- autotesting +- Containers on couch vod (link it here?) + +## What is Resiliency? + +Resiliency in cloud computing refers to a system's ability to maintain acceptable performance levels in the face of faults and challenges to normal operation. It encompasses: + +1. **Fault Tolerance**: The ability to continue operating properly in the event of the failure of some of its components. +2. **Self-Healing**: The capability to detect and recover from failures automatically. +3. **Scalability**: The ability to handle increased load by adding resources. +4. **Disaster Recovery**: The process of preparing for and recovering from potential disasters. + +## Why is Resiliency Important in EKS? + +Amazon EKS provides a managed Kubernetes platform, but it's still crucial to design and implement resilient architectures. Here's why: + +1. **High Availability**: Ensure your applications remain accessible even during partial system failures. +2. **Data Integrity**: Prevent data loss and maintain consistency during unexpected events. +3. **User Experience**: Minimize downtime and performance degradation to maintain user satisfaction. +4. **Cost Efficiency**: Avoid overprovisioning by building systems that can handle variable loads and partial failures. + +## Resiliency Scenarios Covered in this Chapter + +We'll explore several scenarios to show resiliency by performing: + +1. Pod Failures +2. Node Failures +3. 
Availability Zone Failures + +## What You'll Learn + +By the end of this chapter, you'll be able to: + +- Use AWS FIS to simulate and learn from controlled failure scenarios +- other info + +:::info + + + +::: diff --git a/website/docusaurus.config.js b/website/docusaurus.config.js index f712159a3..145d2f91d 100644 --- a/website/docusaurus.config.js +++ b/website/docusaurus.config.js @@ -141,6 +141,12 @@ const config = { position: "left", label: "Observability", }, + { + type: "doc", + docId: "resiliency/index", + position: "left", + label: "Resiliency", + }, { type: "doc", docId: "security/index", diff --git a/website/sidebars.js b/website/sidebars.js index 7da64994c..adf89ee4a 100644 --- a/website/sidebars.js +++ b/website/sidebars.js @@ -20,6 +20,7 @@ const sidebars = { networking: [{ type: "autogenerated", dirName: "networking" }], autoscaling: [{ type: "autogenerated", dirName: "autoscaling" }], observability: [{ type: "autogenerated", dirName: "observability" }], + resiliency: [{ type: "autogenerated", dirName: "resiliency" }], automation: [{ type: "autogenerated", dirName: "automation" }], aiml: [{ type: "autogenerated", dirName: "aiml" }], }; From a846a48e375534ab39228048195b17992dacccba Mon Sep 17 00:00:00 2001 From: cyturney Date: Fri, 16 Aug 2024 11:09:56 -0700 Subject: [PATCH 07/11] update --- .../modules/resiliency/.workshop/cleanup.sh | 180 +++++++++++------- .../resiliency/.workshop/terraform/main.tf | 163 ++++++++++++++-- .../resiliency/.workshop/terraform/outputs.tf | 14 +- .../multi_az/add_us_east_2_patch.yaml | 41 ++++ .../multi_az/kustomization.yaml | 8 + .../rbac/chaos-mesh-role.yaml | 12 -- .../rbac/chaos-mesh-rolebinding.yaml | 13 -- .../resiliency/scripts/create-second-az.sh | 52 +++++ .../scripts/eks_workshop_canary_script.js | 30 +++ .../resiliency/scripts/multi-az-get-pods.sh | 26 +++ .../resiliency/scripts/verify-cluster.sh | 15 ++ .../resiliency/high-availability/01-setup.md | 59 ++---- .../high-availability/02-pod-failure.md | 49 +++-- .../03-node-failure-no-fis.md | 86 +++++---- .../04-node-failure-partial-fis.md | 87 ++++++--- .../05-node-failure-complete-fis.md | 70 ++++--- .../high-availability/06-az-failure.md | 134 ------------- .../high-availability/06-az-setup.md | 123 ++++++++++++ .../high-availability/07-az-failure.md | 84 ++++++++ .../resiliency/high-availability/index.md | 38 ++-- .../high-availability/tests/hook-suite.sh | 11 ++ website/docs/resiliency/index.md | 39 ++-- 22 files changed, 901 insertions(+), 433 deletions(-) create mode 100644 manifests/modules/resiliency/high-availability/multi_az/add_us_east_2_patch.yaml create mode 100644 manifests/modules/resiliency/high-availability/multi_az/kustomization.yaml delete mode 100644 manifests/modules/resiliency/high-availability/rbac/chaos-mesh-role.yaml delete mode 100644 manifests/modules/resiliency/high-availability/rbac/chaos-mesh-rolebinding.yaml create mode 100755 manifests/modules/resiliency/scripts/create-second-az.sh create mode 100644 manifests/modules/resiliency/scripts/eks_workshop_canary_script.js create mode 100755 manifests/modules/resiliency/scripts/multi-az-get-pods.sh delete mode 100644 website/docs/resiliency/high-availability/06-az-failure.md create mode 100644 website/docs/resiliency/high-availability/06-az-setup.md create mode 100644 website/docs/resiliency/high-availability/07-az-failure.md create mode 100644 website/docs/resiliency/high-availability/tests/hook-suite.sh diff --git a/manifests/modules/resiliency/.workshop/cleanup.sh 
b/manifests/modules/resiliency/.workshop/cleanup.sh index d4040bbde..537a7d260 100755 --- a/manifests/modules/resiliency/.workshop/cleanup.sh +++ b/manifests/modules/resiliency/.workshop/cleanup.sh @@ -2,97 +2,131 @@ set -e -# Delete Ingress -kubectl delete ingress -n ui ui --ignore-not-found -kubectl delete ingress ui -n ui --ignore-not-found +echo "Starting cleanup process..." -# Delete Deployments -kubectl delete deployment -n ui ui --ignore-not-found -kubectl delete deployment ui -n ui --ignore-not-found +# Function to safely delete a resource +safe_delete() { + local cmd=$1 + local resource=$2 + echo "Attempting to delete $resource..." + if $cmd 2>/dev/null; then + echo "$resource deleted successfully." + else + echo "Failed to delete $resource or it doesn't exist. Continuing..." + fi +} -# Delete Services -kubectl delete service -n ui ui-nlb --ignore-not-found +# Function to wait for resource deletion +wait_for_deletion() { + local check_cmd=$1 + local resource=$2 + local max_attempts=30 + local attempt=0 + echo "Waiting for $resource to be deleted..." + while $check_cmd &>/dev/null && [ $attempt -lt $max_attempts ]; do + sleep 10 + ((attempt++)) + done + if [ $attempt -eq $max_attempts ]; then + echo "Timeout waiting for $resource to be deleted." + else + echo "$resource deleted successfully." + fi +} + +# Function to cleanup EKS resources in a region +cleanup_eks_region() { + local region=$1 + local cluster_name=$2 + local nodegroup_name=$3 + local delete_cluster=$4 + + echo "Cleaning up EKS resources in $region..." + + # Switch to the specified region + aws configure set default.region $region -# Delete Roles and RoleBindings -kubectl delete role chaos-mesh-role -n ui --ignore-not-found -kubectl delete rolebinding chaos-mesh-rolebinding -n ui --ignore-not-found + # Delete Kubernetes resources + echo "Cleaning up Kubernetes resources..." + kubectl delete ingress,deployment,service -n ui --all --ignore-not-found + kubectl delete role,rolebinding -n ui --all --ignore-not-found + kubectl delete namespace chaos-mesh --ignore-not-found -# Uninstall Helm chart -if command -v helm &> /dev/null; then - echo "Uninstalling aws-load-balancer-controller Helm chart" + # Delete EKS Cluster and Node Group if specified + if [ "$delete_cluster" = true ]; then + echo "Attempting to delete EKS cluster and node group..." + if aws eks describe-cluster --name $cluster_name --region $region &>/dev/null; then + aws eks delete-nodegroup --cluster-name $cluster_name --nodegroup-name $nodegroup_name --region $region || true + wait_for_deletion "aws eks describe-nodegroup --cluster-name $cluster_name --nodegroup-name $nodegroup_name --region $region" "node group" + + aws eks delete-cluster --name $cluster_name --region $region + wait_for_deletion "aws eks describe-cluster --name $cluster_name --region $region" "EKS cluster" + else + echo "EKS cluster $cluster_name not found in $region. Skipping deletion." + fi + else + echo "Skipping EKS cluster and node group deletion in $region as requested." + fi + + # Uninstall Helm charts + echo "Uninstalling Helm charts..." helm uninstall aws-load-balancer-controller -n kube-system || true - - echo "Uninstalling Chaos Mesh Helm chart" helm uninstall chaos-mesh -n chaos-mesh || true - - # Wait for resources to be cleaned up - echo "Waiting for resources to be cleaned up..." - sleep 30 -else - echo "Helm command not found. Skipping Helm chart uninstallations." 
-fi -kubectl delete namespace chaos-mesh --ignore-not-found + # Delete ALBs + echo "Cleaning up ALBs in $region..." + for alb_arn in $(aws elbv2 describe-load-balancers --query "LoadBalancers[?starts_with(LoadBalancerName, 'k8s-ui-ui-') || starts_with(LoadBalancerName, 'k8s-default-ui-')].LoadBalancerArn" --output text); do + safe_delete "aws elbv2 delete-load-balancer --load-balancer-arn $alb_arn" "ALB $alb_arn" + done +} + +# Cleanup in PRIMARY_REGION (preserve cluster and node groups) +cleanup_eks_region $PRIMARY_REGION "eks-workshop" "default" false + +# Cleanup in SECONDARY_REGION (full cleanup) +cleanup_eks_region $SECONDARY_REGION "eks-workshop-east" "us-east-2-node-group" true + +# Global cleanup (not region-specific) # Delete IAM Roles and Policies -ROLE_PREFIX="fis-execution-role-eks-workshop" -POLICY_PREFIX="eks-resiliency-fis-policy" - -# List and delete roles -for role in $(aws iam list-roles --query "Roles[?starts_with(RoleName, '${ROLE_PREFIX}')].RoleName" --output text); do - echo "Detaching policies and deleting role: $role" - # Detach managed policies - aws iam detach-role-policy --role-name $role --policy-arn arn:aws:iam::aws:policy/service-role/AWSFaultInjectionSimulatorEKSAccess || true - aws iam detach-role-policy --role-name $role --policy-arn arn:aws:iam::aws:policy/service-role/AWSFaultInjectionSimulatorNetworkAccess || true - - # Detach and delete inline policies - for policy in $(aws iam list-role-policies --role-name $role --query PolicyNames --output text); do - aws iam delete-role-policy --role-name $role --policy-name $policy || true +echo "Cleaning up IAM roles and policies..." +for role_prefix in "fis-execution-role-eks-workshop" "canary-execution-role-eks-workshop"; do + for role in $(aws iam list-roles --query "Roles[?starts_with(RoleName, '${role_prefix}')].RoleName" --output text); do + echo "Processing role: $role" + for policy in $(aws iam list-attached-role-policies --role-name $role --query "AttachedPolicies[*].PolicyArn" --output text); do + safe_delete "aws iam detach-role-policy --role-name $role --policy-arn $policy" "attached policy $policy from role $role" + done + for policy in $(aws iam list-role-policies --role-name $role --query "PolicyNames" --output text); do + safe_delete "aws iam delete-role-policy --role-name $role --policy-name $policy" "inline policy $policy from role $role" + done + safe_delete "aws iam delete-role --role-name $role" "IAM role $role" done - - # Delete the role - aws iam delete-role --role-name $role || true done -# List and delete policies -for policy_arn in $(aws iam list-policies --scope Local --query "Policies[?starts_with(PolicyName, '${POLICY_PREFIX}')].Arn" --output text); do - echo "Deleting policy: $policy_arn" - - # Detach policy from all attached roles - for role in $(aws iam list-entities-for-policy --policy-arn $policy_arn --entity-filter Role --query 'PolicyRoles[*].RoleName' --output text); do - aws iam detach-role-policy --role-name $role --policy-arn $policy_arn +for policy_prefix in "eks-resiliency-fis-policy" "eks-resiliency-canary-policy"; do + for policy_arn in $(aws iam list-policies --scope Local --query "Policies[?starts_with(PolicyName, '${policy_prefix}')].Arn" --output text); do + safe_delete "aws iam delete-policy --policy-arn $policy_arn" "IAM policy $policy_arn" done - - # Delete the policy - aws iam delete-policy --policy-arn $policy_arn done -# Delete any leftover ALBs -ALB_ARN=$(aws elbv2 describe-load-balancers --query "LoadBalancers[?starts_with(LoadBalancerName, 'k8s-ui-ui-') || 
starts_with(LoadBalancerName, 'k8s-default-ui-')].LoadBalancerArn" --output text) -if [ ! -z "$ALB_ARN" ]; then - echo "Deleting leftover ALB: $ALB_ARN" - aws elbv2 delete-load-balancer --load-balancer-arn $ALB_ARN -else - echo "No leftover ALB found." -fi - -# Delete S3 bucket -BUCKET_PREFIX="eks-workshop-canary-artifacts-" -for bucket in $(aws s3api list-buckets --query "Buckets[?starts_with(Name, '${BUCKET_PREFIX}')].Name" --output text); do - echo "Deleting S3 bucket: $bucket" - # First, remove all objects from the bucket +# Delete S3 buckets +echo "Cleaning up S3 buckets..." +for bucket in $(aws s3api list-buckets --query "Buckets[?starts_with(Name, 'eks-workshop-canary-artifacts-')].Name" --output text); do aws s3 rm s3://$bucket --recursive - # Then delete the bucket - aws s3api delete-bucket --bucket $bucket --region us-west-2 + safe_delete "aws s3api delete-bucket --bucket $bucket --region $PRIMARY_REGION" "S3 bucket $bucket" done -# Delete CloudWatch Synthetics canary +# Delete CloudWatch Synthetics canary and alarm CANARY_NAME="eks-workshop-canary" -if aws synthetics get-canary --name $CANARY_NAME --region us-west-2 &> /dev/null; then - echo "Deleting CloudWatch Synthetics canary: $CANARY_NAME" - aws synthetics delete-canary --name $CANARY_NAME --region us-west-2 -else - echo "CloudWatch Synthetics canary $CANARY_NAME not found." +ALARM_NAME="eks-workshop-canary-alarm" + +echo "Cleaning up CloudWatch Synthetics canary and alarm..." +if aws synthetics get-canary --name $CANARY_NAME --region $PRIMARY_REGION &>/dev/null; then + aws synthetics stop-canary --name $CANARY_NAME --region $PRIMARY_REGION || true + sleep 30 + safe_delete "aws synthetics delete-canary --name $CANARY_NAME --region $PRIMARY_REGION" "CloudWatch Synthetics canary $CANARY_NAME" fi -echo "Cleanup completed successfully." \ No newline at end of file +safe_delete "aws cloudwatch delete-alarms --alarm-names $ALARM_NAME --region $PRIMARY_REGION" "CloudWatch alarm $ALARM_NAME" + +echo "Cleanup process completed. Please check for any remaining resources manually." 
\ No newline at end of file diff --git a/manifests/modules/resiliency/.workshop/terraform/main.tf b/manifests/modules/resiliency/.workshop/terraform/main.tf index 7e039cbdf..ae6da7511 100644 --- a/manifests/modules/resiliency/.workshop/terraform/main.tf +++ b/manifests/modules/resiliency/.workshop/terraform/main.tf @@ -13,7 +13,7 @@ module "eks_blueprints_addons" { } -// ALB creation +# ALB creation resource "kubernetes_manifest" "ui_alb" { manifest = { "apiVersion" = "networking.k8s.io/v1" @@ -49,7 +49,7 @@ resource "kubernetes_manifest" "ui_alb" { } } -// Create RBAC and Rolebinding +# Create RBAC and Rolebinding resource "kubernetes_role" "chaos_mesh_role" { metadata { name = "chaos-mesh-role" @@ -90,10 +90,10 @@ resource "kubernetes_role_binding" "chaos_mesh_rolebinding" { } } -// Add AWS Load Balancer controller +# Add AWS Load Balancer controller resource "helm_release" "aws_load_balancer_controller" { name = "aws-load-balancer-controller" - repository = "https://aws.github.io/eks-charts" + repository = "https://aws.github.io/eks-charts" chart = "aws-load-balancer-controller" namespace = "kube-system" version = var.load_balancer_controller_chart_version @@ -115,10 +115,10 @@ resource "helm_release" "aws_load_balancer_controller" { } -// Chaos Mesh Helm Release +# Chaos Mesh Helm Release resource "helm_release" "chaos_mesh" { name = "chaos-mesh" - repository = "https://charts.chaos-mesh.org" + repository = "https://charts.chaos-mesh.org" chart = "chaos-mesh" namespace = "chaos-mesh" version = "2.5.1" @@ -126,7 +126,7 @@ resource "helm_release" "chaos_mesh" { create_namespace = true } -// FIS IAM role +# FIS IAM role resource "random_id" "suffix" { byte_length = 8 } @@ -140,7 +140,12 @@ resource "aws_iam_role" "fis_role" { { Effect = "Allow" Principal = { - Service = "fis.amazonaws.com" + Service = [ + "fis.amazonaws.com", + # for second region + "ec2.amazonaws.com", + "eks.amazonaws.com" + ] } Action = "sts:AssumeRole" }, @@ -175,7 +180,7 @@ resource "aws_iam_role" "fis_role" { depends_on = [kubernetes_role_binding.chaos_mesh_rolebinding] } -// Attach FIS Access Policy +# Attach FIS Access Policy resource "aws_iam_role_policy_attachment" "fis_eks_access" { policy_arn = "arn:aws:iam::aws:policy/service-role/AWSFaultInjectionSimulatorEKSAccess" role = aws_iam_role.fis_role.name @@ -186,7 +191,23 @@ resource "aws_iam_role_policy_attachment" "fis_network_access" { role = aws_iam_role.fis_role.name } -// Policy for creating FIS experiment templates +# Attach to FIS for EKS node group +resource "aws_iam_role_policy_attachment" "fis_node_policy" { + policy_arn = "arn:aws:iam::aws:policy/AmazonEKSWorkerNodePolicy" + role = aws_iam_role.fis_role.name +} + +resource "aws_iam_role_policy_attachment" "fis_ecr_policy" { + policy_arn = "arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryReadOnly" + role = aws_iam_role.fis_role.name +} + +resource "aws_iam_role_policy_attachment" "fis_cni_policy" { + policy_arn = "arn:aws:iam::aws:policy/AmazonEKS_CNI_Policy" + role = aws_iam_role.fis_role.name +} + +# Policy for creating FIS experiment templates resource "aws_iam_policy" "eks_resiliency_fis_policy" { name = "eks-resiliency-fis-policy-${random_id.suffix.hex}" path = "/" @@ -198,7 +219,7 @@ resource "aws_iam_policy" "eks_resiliency_fis_policy" { { Effect = "Allow" Action = [ - // FIS + # FIS "fis:CreateExperimentTemplate", "fis:GetExperimentTemplate", "fis:ListExperimentTemplates", @@ -212,6 +233,8 @@ resource "aws_iam_policy" "eks_resiliency_fis_policy" { "ec2:DescribeInstances",
"ec2:DescribeInstanceStatus", "ec2:TerminateInstances", + "ec2:StartInstances", + "ec2:StopInstances", "eks:DescribeCluster", "eks:ListNodegroups", "eks:DescribeNodegroup", @@ -223,7 +246,72 @@ resource "aws_iam_policy" "eks_resiliency_fis_policy" { "logs:UpdateLogDelivery", "logs:DeleteLogDelivery", "logs:ListLogDeliveries", - // Synthetic Canary + "ssm:StartAutomationExecution", + "ssm:GetAutomationExecution", + "cloudwatch:DescribeAlarms", + "cloudwatch:GetMetricData" + ] + Resource = "*" + }, + { + Effect = "Allow" + Action = "iam:PassRole" + Resource = aws_iam_role.fis_role.arn + } + ] + }) +} + +# Attach custom policy to the role +resource "aws_iam_role_policy_attachment" "eks_resiliency_fis_policy_attachment" { + policy_arn = aws_iam_policy.eks_resiliency_fis_policy.arn + role = aws_iam_role.fis_role.name +} + + +# Canary IAM role +resource "aws_iam_role" "canary_role" { + name = "canary-execution-role-${var.addon_context.eks_cluster_id}-${random_id.suffix.hex}" + + assume_role_policy = jsonencode({ + Version = "2012-10-17" + Statement = [ + { + Effect = "Allow" + Principal = { + Service = [ + "lambda.amazonaws.com", + "synthetics.amazonaws.com" + ] + } + Action = "sts:AssumeRole" + } + ] + }) + + lifecycle { + create_before_destroy = true + } +} + +# Attach Lambda Basic Execution Role to Canary role +resource "aws_iam_role_policy_attachment" "canary_lambda_basic_execution" { + policy_arn = "arn:aws:iam::aws:policy/service-role/AWSLambdaBasicExecutionRole" + role = aws_iam_role.canary_role.name +} + +# Policy for Canary +resource "aws_iam_policy" "eks_resiliency_canary_policy" { + name = "eks-resiliency-canary-policy-${random_id.suffix.hex}" + path = "/" + description = "Custom policy for EKS resiliency Canary" + + policy = jsonencode({ + Version = "2012-10-17" + Statement = [ + { + Effect = "Allow" + Action = [ "synthetics:CreateCanary", "synthetics:DeleteCanary", "synthetics:DescribeCanaries", @@ -233,24 +321,59 @@ resource "aws_iam_policy" "eks_resiliency_fis_policy" { "s3:PutObject", "s3:GetBucketLocation", "s3:ListAllMyBuckets", + "s3:GetObject", + "s3:ListBucket", "cloudwatch:PutMetricData", + "cloudwatch:GetMetricStatistics", + "cloudwatch:ListMetrics", "logs:CreateLogGroup", "logs:CreateLogStream", - "logs:PutLogEvents" + "logs:PutLogEvents", + "logs:DescribeLogGroups", + "logs:DescribeLogStreams", + "lambda:InvokeFunction" ] Resource = "*" - }, + } + ] + }) +} + +# Attach custom policy to the Canary role +resource "aws_iam_role_policy_attachment" "eks_resiliency_canary_policy_attachment" { + policy_arn = aws_iam_policy.eks_resiliency_canary_policy.arn + role = aws_iam_role.canary_role.name +} + +# EKS Cluster IAM Role +resource "aws_iam_role" "eks_cluster_role" { + name = "eks-cluster-role-${var.addon_context.eks_cluster_id}-${random_id.suffix.hex}" + + assume_role_policy = jsonencode({ + Version = "2012-10-17" + Statement = [ { Effect = "Allow" - Action = "iam:PassRole" - Resource = aws_iam_role.fis_role.arn + Principal = { + Service = "eks.amazonaws.com" + } + Action = "sts:AssumeRole" } ] }) + + lifecycle { + create_before_destroy = true + } } -// Attach custom policy to the role -resource "aws_iam_role_policy_attachment" "eks_resiliency_fis_policy_attachment" { - policy_arn = aws_iam_policy.eks_resiliency_fis_policy.arn - role = aws_iam_role.fis_role.name +# Attach required policies to EKS Cluster role +resource "aws_iam_role_policy_attachment" "eks_cluster_policy" { + policy_arn = "arn:aws:iam::aws:policy/AmazonEKSClusterPolicy" + role = 
aws_iam_role.eks_cluster_role.name +} + +resource "aws_iam_role_policy_attachment" "eks_vpc_resource_controller" { + policy_arn = "arn:aws:iam::aws:policy/AmazonEKSVPCResourceController" + role = aws_iam_role.eks_cluster_role.name } diff --git a/manifests/modules/resiliency/.workshop/terraform/outputs.tf b/manifests/modules/resiliency/.workshop/terraform/outputs.tf index 413de0df1..a584978a7 100644 --- a/manifests/modules/resiliency/.workshop/terraform/outputs.tf +++ b/manifests/modules/resiliency/.workshop/terraform/outputs.tf @@ -1,10 +1,14 @@ output "environment_variables" { description = "Environment variables to be added to the IDE shell" value = { - LBC_CHART_VERSION = var.load_balancer_controller_chart_version - LBC_ROLE_ARN = module.eks_blueprints_addons.aws_load_balancer_controller.iam_role_arn - FIS_ROLE_ARN = aws_iam_role.fis_role.arn - RANDOM_SUFFIX = random_id.suffix.hex - SCRIPT_DIR = "/manifests/modules/resiliency/scripts" + LBC_CHART_VERSION = var.load_balancer_controller_chart_version + LBC_ROLE_ARN = module.eks_blueprints_addons.aws_load_balancer_controller.iam_role_arn + FIS_ROLE_ARN = aws_iam_role.fis_role.arn + RANDOM_SUFFIX = random_id.suffix.hex + SCRIPT_DIR = "/manifests/modules/resiliency/scripts" + CANARY_ROLE_ARN = aws_iam_role.canary_role.arn + EKS_CLUSTER_ROLE_ARN = aws_iam_role.eks_cluster_role.arn + PRIMARY_REGION = "us-west-2" + SECONDARY_REGION = "us-east-2" } } diff --git a/manifests/modules/resiliency/high-availability/multi_az/add_us_east_2_patch.yaml b/manifests/modules/resiliency/high-availability/multi_az/add_us_east_2_patch.yaml new file mode 100644 index 000000000..b2a276fde --- /dev/null +++ b/manifests/modules/resiliency/high-availability/multi_az/add_us_east_2_patch.yaml @@ -0,0 +1,41 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: ui + namespace: ui +spec: + replicas: 9 # Total number of replicas + template: + spec: + affinity: + nodeAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 60 + preference: + matchExpressions: + - key: topology.kubernetes.io/zone + operator: In + values: + - us-west-2a + - us-west-2b + - us-west-2c + - weight: 40 + preference: + matchExpressions: + - key: topology.kubernetes.io/zone + operator: In + values: + - us-east-2a + - us-east-2b + - us-east-2c + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app + operator: In + values: + - ui + topologyKey: "kubernetes.io/hostname" diff --git a/manifests/modules/resiliency/high-availability/multi_az/kustomization.yaml b/manifests/modules/resiliency/high-availability/multi_az/kustomization.yaml new file mode 100644 index 000000000..32bf6179b --- /dev/null +++ b/manifests/modules/resiliency/high-availability/multi_az/kustomization.yaml @@ -0,0 +1,8 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +resources: + - ../../../../../manifests/base-application/ui + +patches: + - path: add_us_east_2_patch.yaml diff --git a/manifests/modules/resiliency/high-availability/rbac/chaos-mesh-role.yaml b/manifests/modules/resiliency/high-availability/rbac/chaos-mesh-role.yaml deleted file mode 100644 index 5e5981a82..000000000 --- a/manifests/modules/resiliency/high-availability/rbac/chaos-mesh-role.yaml +++ /dev/null @@ -1,12 +0,0 @@ -apiVersion: rbac.authorization.k8s.io/v1 -kind: Role -metadata: - namespace: ui - name: chaos-mesh-role -rules: - - apiGroups: ["chaos-mesh.org"] - resources: ["podchaos"] - verbs: ["create", "delete", 
"get", "list", "patch", "update", "watch"] - - apiGroups: [""] - resources: ["pods"] - verbs: ["get", "list", "watch"] diff --git a/manifests/modules/resiliency/high-availability/rbac/chaos-mesh-rolebinding.yaml b/manifests/modules/resiliency/high-availability/rbac/chaos-mesh-rolebinding.yaml deleted file mode 100644 index 338d88c3b..000000000 --- a/manifests/modules/resiliency/high-availability/rbac/chaos-mesh-rolebinding.yaml +++ /dev/null @@ -1,13 +0,0 @@ -apiVersion: rbac.authorization.k8s.io/v1 -kind: RoleBinding -metadata: - name: chaos-mesh-rolebinding - namespace: ui -subjects: - - kind: User - name: PLACEHOLDER - namespace: ui -roleRef: - kind: Role - name: chaos-mesh-role - apiGroup: rbac.authorization.k8s.io diff --git a/manifests/modules/resiliency/scripts/create-second-az.sh b/manifests/modules/resiliency/scripts/create-second-az.sh new file mode 100755 index 000000000..09d9c28bb --- /dev/null +++ b/manifests/modules/resiliency/scripts/create-second-az.sh @@ -0,0 +1,52 @@ +#!/bin/bash + +# Ensure SCRIPT_DIR is set +if [ -z "$SCRIPT_DIR" ]; then + echo "Error: SCRIPT_DIR environment variable is not set." + exit 1 +fi + +# Ensure PRIMARY_REGION and SECONDARY_REGION are set +if [ -z "$PRIMARY_REGION" ] || [ -z "$SECONDARY_REGION" ]; then + echo "Error: PRIMARY_REGION and SECONDARY_REGION must be set." + exit 1 +fi + +# Function to run multi-az-get-pods.sh and display region +run_multi_az_script() { + local region=$1 + echo "Current region: $region" + echo "Running multi-az-get-pods.sh..." + $SCRIPT_DIR/multi-az-get-pods.sh + echo "----------------------------------------" +} + +# Run multi-az-get-pods.sh in PRIMARY_REGION +aws configure set default.region $PRIMARY_REGION +run_multi_az_script $PRIMARY_REGION + +# Switch to SECONDARY_REGION +echo "Switching to SECONDARY_REGION: $SECONDARY_REGION" +aws configure set default.region $SECONDARY_REGION + +# Prepare environment for resiliency module +echo "Preparing environment for resiliency module..." +prepare-environment resiliency + +# Verify the EKS cluster in SECONDARY_REGION +echo "Verifying EKS cluster in SECONDARY_REGION..." +aws eks list-clusters + +# Check node groups in SECONDARY_REGION +CLUSTER_NAME=$(aws eks list-clusters --query 'clusters[0]' --output text) +echo "Checking node groups for cluster: $CLUSTER_NAME" +aws eks list-nodegroups --cluster-name $CLUSTER_NAME + +# Switch back to PRIMARY_REGION +echo "Switching back to PRIMARY_REGION: $PRIMARY_REGION" +aws configure set default.region $PRIMARY_REGION + +# Run multi-az-get-pods.sh one last time in PRIMARY_REGION +run_multi_az_script $PRIMARY_REGION + +echo "Setup complete. 
\ No newline at end of file diff --git a/manifests/modules/resiliency/scripts/eks_workshop_canary_script.js b/manifests/modules/resiliency/scripts/eks_workshop_canary_script.js new file mode 100644 index 000000000..74deb4591 --- /dev/null +++ b/manifests/modules/resiliency/scripts/eks_workshop_canary_script.js @@ -0,0 +1,30 @@ +const synthetics = require("Synthetics"); +const log = require("SyntheticsLogger"); + +const pageLoadBlueprint = async function () { + const PAGE_LOAD_TIMEOUT = 30; + const URL = process.env.INGRESS_URL || "http://localhost"; // Use environment variable or fallback + + let page = await synthetics.getPage(); + + await synthetics.executeStep("Navigate to " + URL, async function () { + const response = await page.goto(URL, { + waitUntil: "domcontentloaded", + timeout: PAGE_LOAD_TIMEOUT * 1000, + }); + + // Verify the page loaded successfully + if (response.status() !== 200) { + throw new Error(`Failed to load page. Status code: ${response.status()}`); + } + }); + + await synthetics.executeStep("Verify page content", async function () { + const pageTitle = await page.title(); + log.info("Page title: " + pageTitle); + }); +}; + +exports.handler = async () => { + return await pageLoadBlueprint(); +}; diff --git a/manifests/modules/resiliency/scripts/multi-az-get-pods.sh b/manifests/modules/resiliency/scripts/multi-az-get-pods.sh new file mode 100755 index 000000000..f47649eb8 --- /dev/null +++ b/manifests/modules/resiliency/scripts/multi-az-get-pods.sh @@ -0,0 +1,26 @@ +#!/bin/bash + +GREEN='\033[0;32m' +RED='\033[0;31m' +NC='\033[0m' # No Color + +REGIONS=("us-west-2" "us-east-2") + +for REGION in "${REGIONS[@]}" +do + echo "Region: $REGION" + for az in a b c + do + AZ=$REGION$az + echo -n "------" + echo -n -e "${GREEN}$AZ${NC}" + echo "------" + for node in $(kubectl get nodes -l topology.kubernetes.io/zone=$AZ --no-headers 2>/dev/null | grep -v NotReady | cut -d " " -f1) + do + echo -e " ${RED}$node:${NC}" + kubectl get pods -n ui --no-headers --field-selector spec.nodeName=${node} 2>/dev/null | while read line; do echo " ${line}"; done + done + echo "" + done + echo "" +done \ No newline at end of file diff --git a/manifests/modules/resiliency/scripts/verify-cluster.sh b/manifests/modules/resiliency/scripts/verify-cluster.sh index 56e2844df..2e6329b90 100755 --- a/manifests/modules/resiliency/scripts/verify-cluster.sh +++ b/manifests/modules/resiliency/scripts/verify-cluster.sh @@ -5,6 +5,7 @@ DESIRED_REPLICAS=5 MAX_WAIT_TIME=300 # 5 minutes POLL_INTERVAL=10 # 10 seconds NAMESPACE="ui" +EXPECTED_READY_NODES=3 print_header() { echo -e "\n==== $1 ====\n" @@ -26,6 +27,20 @@ wait_for_condition() { print_header "Checking Current Pod Distribution" $SCRIPT_DIR/get-pods-by-az.sh +print_header "Waiting for nodes to be Ready" +total_nodes=$(kubectl get nodes --no-headers | wc -l) +echo "Total nodes in the cluster: $total_nodes" +echo "Waiting for $EXPECTED_READY_NODES nodes to be in Ready state" +if wait_for_condition "[ \$(kubectl get nodes --no-headers | grep ' Ready ' | wc -l) -eq $EXPECTED_READY_NODES ]"; then + echo -e "\n✅ $EXPECTED_READY_NODES nodes are in Ready state." +else + echo -e "\n⚠️ Warning: $EXPECTED_READY_NODES nodes did not reach Ready state within the timeout period." 
+ exit 1 +fi + +print_header "Checking Current Pod Distribution" +$SCRIPT_DIR/get-pods-by-az.sh + print_header "Node Information" kubectl get nodes -o wide diff --git a/website/docs/resiliency/high-availability/01-setup.md b/website/docs/resiliency/high-availability/01-setup.md index 03b327af8..31821d93a 100644 --- a/website/docs/resiliency/high-availability/01-setup.md +++ b/website/docs/resiliency/high-availability/01-setup.md @@ -4,11 +4,6 @@ sidebar_position: 1 description: "Learn how to scale your pods, add Pod Anti-Affinity configurations, and use a helper script to visualize pod distribution." --- -TODO: - -- Update Name -- Update/Remove Verification - This guide outlines steps to enhance the resilience of a UI service by implementing high availability practices. We'll cover scaling the UI service, implementing pod anti-affinity, and using a helper script to visualize pod distribution across availability zones. ## Scaling and Pod Anti-Affinity @@ -21,67 +16,43 @@ Here's the content of our patch file: manifests/modules/resiliency/high-availability/config/scale_and_affinity_patch.yaml ``` -Apply the changes using Kustomize patch and - - +Apply the changes using Kustomize patch and +[Kustomization file](https://github.com/VAR::MANIFESTS_OWNER/VAR::MANIFESTS_REPOSITORY/tree/VAR::MANIFESTS_REF/manifests/modules/resiliency/high-availability/config/kustomization.yaml): ```bash $ kubectl delete deployment ui -n ui $ kubectl apply -k /manifests/modules/resiliency/high-availability/config/ ``` -## Create Helper Script: Get Pods by AZ - -The `get-pods-by-az.sh` script helps visualize the distribution of Kubernetes pods across different availability zones in the terminal. You can view the script file +## Verify Retail Store Accessibility - - -To make this script executable: +After applying these changes, it's important to verify that your retail store is accessible: ```bash -$ chmod +x $SCRIPT_DIR/get-pods-by-az.sh +$ wait-for-lb $(kubectl get ingress -n ui -o jsonpath='{.items[0].status.loadBalancer.ingress[0].hostname}') ``` -### Script Execution - -To run the script and see the distribution of pods across availability zones, execute: - -```bash -$ $SCRIPT_DIR/get-pods-by-az.sh -``` +Once this command completes, it will output a URL. Open this URL in a new browser tab to verify that your retail store is accessible and functioning correctly. :::tip -Use this to quickly assess the distribution of your pods across multiple zones. +If the retail store doesn't load immediately, wait a few moments and refresh the page. It may take a short time for all components to become fully operational. ::: -## Verification +## Helper Script: Get Pods by AZ -After applying these changes, verify the setup: +The `get-pods-by-az.sh` script helps visualize the distribution of Kubernetes pods across different availability zones in the terminal. You can view the script file on github [here](https://github.com/VAR::MANIFESTS_OWNER/VAR::MANIFESTS_REPOSITORY/tree/VAR::MANIFESTS_REF/manifests/modules/resiliency/scripts/get-pods-by-az.sh). -1. Check for 5 running UI pods: - -```bash -$ kubectl get pods -n ui -``` - -2. Verify pod distribution across nodes: - -```bash -$ kubectl get pods -n ui -o=jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.spec.nodeName}{"\n"}' -``` +### Script Execution -3. 
Check that AWS Load Balancer Controller is installed and working: +To run the script and see the distribution of pods across availability zones, execute: ```bash -$ kubectl get pods -n kube-system | grep aws-load-balancer-controller -$ kubectl get ingress --all-namespaces +$ $SCRIPT_DIR/get-pods-by-az.sh ``` -4. Ensure the Load Balancer is working and access to the Retail URL: - -```bash -$ wait-for-lb $(kubectl get ingress -n ui -o jsonpath='{.items[0].status.loadBalancer.ingress[0].hostname}') -``` +:::tip +Use this to quickly assess the distribution of your pods across multiple zones. +::: :::info For more information on these changes, check out these sections: diff --git a/website/docs/resiliency/high-availability/02-pod-failure.md b/website/docs/resiliency/high-availability/02-pod-failure.md index cbde69d2c..b1bcc55c8 100644 --- a/website/docs/resiliency/high-availability/02-pod-failure.md +++ b/website/docs/resiliency/high-availability/02-pod-failure.md @@ -6,45 +6,62 @@ description: "Simulate pod failure in your environment using ChaosMesh to test t ## Overview -TODO: +In this lab, you'll simulate a pod failure within your Kubernetes environment to observe how the system responds and recovers. This experiment is designed to test the resiliency of your application under adverse conditions, specifically when a pod unexpectedly fails. -- fix file visual? -- add more information about this lab and a conclusion -- Note that this experiment is repeatable -- Note that retail store should still work even when the pod fails +The `pod-failure.sh` script utilizes Chaos Mesh, a powerful chaos engineering platform for Kubernetes, to simulate a pod failure. This controlled experiment allows you to: -In this experiment, you'll simulate a pod failure within your Kubernetes environment to observe how the system responds. The `pod-failure.sh` script will simulate a pod failure using Chaos Mesh. This is the script we will be using: +1. Observe the system's immediate response to pod failure +2. Monitor the automatic recovery process +3. Verify that your application remains available despite the simulated failure + +This experiment is repeatable, allowing you to run it multiple times to ensure consistent behavior and to test various scenarios or configurations. This is the script we will be using: ```file manifests/modules/resiliency/scripts/pod-failure.sh ``` -To make this script executable: - -```bash -$ chmod +x $SCRIPT_DIR/pod-failure.sh -``` - ## Running the Experiment -Run the experiment and monitor the effects on pod distribution: +To simulate the pod failure and monitor its effects, run the following command: ```bash $ $SCRIPT_DIR/pod-failure.sh && SECONDS=0; while [ $SECONDS -lt 30 ]; do clear; $SCRIPT_DIR/get-pods-by-az.sh; sleep 1; done ``` -This command initiates the pod failure and monitors the pod distribution for 30 seconds to observe how the system handles the failure. You should see one pod dissapear and then reappear. +This command does the following: + +1. Initiates the pod failure simulation using the `pod-failure.sh` script +2. Monitors the pod distribution across Availability Zones (AZs) for 30 seconds +3. Updates the display every second to show real-time changes + +During the experiment, you should observe one pod disappearing and then reappearing, demonstrating the system's ability to detect and recover from failures. 
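If you would like to inspect the chaos experiment object itself while the test runs, you can query the Chaos Mesh custom resources directly. This is a quick sketch that assumes `pod-failure.sh` creates a `PodChaos` resource (the resource kind Chaos Mesh uses for pod-failure actions) in the `ui` namespace; adjust the namespace if your experiment is created elsewhere:

```bash
# List PodChaos experiments created by Chaos Mesh across all namespaces
$ kubectl get podchaos -A
# Review recent events in the ui namespace to see the pod being killed and recreated
$ kubectl get events -n ui --sort-by='.lastTimestamp' | tail -n 10
```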
-Check the status of pods in the `ui` namespace: +To get a more detailed view of the pods in the `ui` namespace, use the following command: ```bash $ kubectl get pods -n ui -o wide ``` +This will show you the status, IP addresses, and nodes for each pod in the `ui` namespace. + ## Verify Retail Store Availability -To ensure that the retail store is operational, check its availability with the url fetched with this command: +An essential aspect of this experiment is to ensure that your retail store application remains operational throughout the pod failure and recovery process. To verify the availability of the retail store, use the following command to fetch and access the store's URL: ```bash $ wait-for-lb $(kubectl get ingress -n ui -o jsonpath='{.items[0].status.loadBalancer.ingress[0].hostname}') ``` + +This command retrieves the load balancer hostname for the ingress and waits for it to become available. Once ready, you can access the retail store through this URL to confirm that it's still functioning correctly despite the simulated pod failure. + +## Conclusion + +This pod failure simulation demonstrates the resilience of your Kubernetes-based application. By intentionally causing a pod to fail, you can observe: + +1. The system's ability to detect failures quickly +2. Kubernetes' automatic rescheduling and recovery of failed pods +3. The application's continued availability during pod failures + +Remember that the retail store should remain operational even when a pod fails, showcasing the high availability and fault tolerance of your Kubernetes setup. This experiment helps validate your application's resilience and can be repeated as needed to ensure consistent behavior across different scenarios or after making changes to your infrastructure. + +By regularly performing such chaos engineering experiments, you can build confidence in your system's ability to withstand and recover from various types of failures, ultimately leading to a more robust and reliable application. diff --git a/website/docs/resiliency/high-availability/03-node-failure-no-fis.md b/website/docs/resiliency/high-availability/03-node-failure-no-fis.md index 7e154f2b0..494bddbd1 100644 --- a/website/docs/resiliency/high-availability/03-node-failure-no-fis.md +++ b/website/docs/resiliency/high-availability/03-node-failure-no-fis.md @@ -6,33 +6,21 @@ description: "Manually simulate a node failure in your Kubernetes environment to # Simulating Node Failure without FIS -TODO: - -- add information and concluding thoughts -- note that this is repeatable -- should see node failure after about a minute, pods come return shortly after to current working nodes, node comes back online after about 2 minutes -- should I make more things following the verify-cluster.sh visual? -- Load balancer does not appear to work although it should -- Rather than the seeing whole script, show expected output? -- Update script to wait for 3 nodes online - ## Overview -This experiment simulate a node failure manually in your Kubernetes cluster to understand the impact on your deployed applications, particularly focusing on the retail store application's availability. The `node-failure.sh` script will manually stop an EC2 instance to simulate node failure. Here is the script we will use: +This experiment simulates a node failure manually in your Kubernetes cluster to understand the impact on your deployed applications, particularly focusing on the retail store application's availability. 
By deliberately causing a node to fail, we can observe how Kubernetes handles the failure and maintains the overall health of the cluster. + +The `node-failure.sh` script will manually stop an EC2 instance to simulate node failure. Here is the script we will use: ```file manifests/modules/resiliency/scripts/node-failure.sh ``` -To make this script executable: - -```bash -$ chmod +x $SCRIPT_DIR/node-failure.sh -``` +It's important to note that this experiment is repeatable, allowing you to run it multiple times to ensure consistent behavior and to test various scenarios or configurations. ## Running the Experiment -Run the node failure experiment and monitor the effects on pod distribution: +To simulate the node failure and monitor its effects, run the following command: ```bash $ $SCRIPT_DIR/node-failure.sh && SECONDS=0; while [ $SECONDS -lt 120 ]; do clear; $SCRIPT_DIR/get-pods-by-az.sh; sleep 1; done @@ -40,43 +28,67 @@ $ $SCRIPT_DIR/node-failure.sh && SECONDS=0; while [ $SECONDS -lt 120 ]; do clear This command will stop the selected EC2 instance and monitor the pod distribution for 2 minutes, observing how the system redistributes workloads. -During the experiment, you should observe the following: +During the experiment, you should observe the following sequence of events: -- One node disappearing from the list -- Kubernetes will detect the node failure and reschedule the pods that were running on the failed node -- These pods being redistributed to the remaining healthy nodes -- The failed node will come back online +1. After about 1 minute, you'll see one node disappear from the list. This represents the simulated node failure. +2. Shortly after the node failure, you'll notice pods being redistributed to the remaining healthy nodes. Kubernetes detects the node failure and automatically reschedules the affected pods. +3. Approximately 2 minutes after the initial failure, the failed node will come back online. -The total number of running pods should remain constant, ensuring application availability. +Throughout this process, the total number of running pods should remain constant, ensuring application availability. -## Verify Retail Store Availability +## Verifying Cluster Recovery + +While waiting for the node to finish coming back online, we will verify the cluster's self-healing capabilities and potentially rebalance the pod distribution if necessary. Since the cluster often recovers on its own, we'll focus on checking the current state and ensuring an optimal distribution of pods. -After simulating the node failure, verify if the retail store application remains accessible: +Use the following [script](https://github.com/VAR::MANIFESTS_OWNER/VAR::MANIFESTS_REPOSITORY/tree/VAR::MANIFESTS_REF/manifests/modules/resiliency/scripts/verify-cluster.sh) to verify the cluster state and rebalance pods: ```bash -$ wait-for-lb $(kubectl get ingress -n ui -o jsonpath='{.items[0].status.loadBalancer.ingress[0].hostname}') +$ $SCRIPT_DIR/verify-cluster.sh ``` -## Verifying Cluster Recovery +This script will: + +- Wait for nodes to come back online +- Count the number of nodes and ui pods +- Check if the pods are evenly distributed across the nodes -After simulating the node failure, we'll verify the cluster's self-healing and potentially rebalance the pod distribution if necessary. Since the cluster often recovers on its own, we'll focus on checking the current state and ensuring an optimal distribution of pods. 
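Before reaching for the helper script referenced below, a quick manual spot-check with kubectl can confirm that all worker nodes have rejoined the cluster and show where the `ui` pods were rescheduled. This is only a convenience check, not a required lab step:

```bash
# Confirm every node reports a Ready status
$ kubectl get nodes
# Show which node each ui pod landed on after rescheduling
$ kubectl get pods -n ui -o wide --sort-by='.spec.nodeName'
```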
+## Verify Retail Store Availability -Use the following +After simulating the node failure, we can verify that the retail store application remains accessible. Use the following command to check its availability: - +```bash +$ wait-for-lb $(kubectl get ingress -n ui -o jsonpath='{.items[0].status.loadBalancer.ingress[0].hostname}') +``` -to verify the cluster state and rebalance pods: +This command retrieves the load balancer hostname for the ingress and waits for it to become available. Once ready, you can access the retail store through this URL to confirm that it's still functioning correctly despite the simulated node failure. + +:::caution +The retail url may take 10 minutes to become operational. You can optionally continue on with the lab by pressing `ctrl` + `z` to move operation to the background. To access it again input: ```bash -$ chmod +x $SCRIPT_DIR/verify-cluster.sh -$ $SCRIPT_DIR/verify-cluster.sh +$ fg ``` -This script will: +The url may not become operational by the time `wait-for-lb` times out. In that case, it should become operational after running the command again: -- Counts the number of nodes and ui pods -- Checks if the pods are evenly distributed across the nodes +```bash +$ wait-for-lb $(kubectl get ingress -n ui -o jsonpath='{.items[0].status.loadBalancer.ingress[0].hostname}') +``` + +::: ## Conclusion -add concluding thoughts +This node failure simulation demonstrates the robustness and self-healing capabilities of your Kubernetes cluster. Key observations and lessons from this experiment include: + +1. Kubernetes' ability to quickly detect node failures and respond accordingly. +2. The automatic rescheduling of pods from the failed node to healthy nodes, ensuring continuity of service. +3. The cluster's self-healing process, bringing the failed node back online after a short period. +4. The importance of proper resource allocation and pod distribution to maintain application availability during node failures. + +By regularly performing such experiments, you can: + +- Validate your cluster's resilience to node failures. +- Identify potential weaknesses in your application's architecture or deployment strategy. +- Gain confidence in your system's ability to handle unexpected infrastructure issues. +- Refine your incident response procedures and automation. diff --git a/website/docs/resiliency/high-availability/04-node-failure-partial-fis.md b/website/docs/resiliency/high-availability/04-node-failure-partial-fis.md index 4b9091fd5..0d5a738db 100644 --- a/website/docs/resiliency/high-availability/04-node-failure-partial-fis.md +++ b/website/docs/resiliency/high-availability/04-node-failure-partial-fis.md @@ -6,20 +6,9 @@ description: "Simulate a partial node failures in your Kubernetes environment us # Simulating Partial Node Failure with FIS -TODO: - -- More FIS info? -- More information about the experiment -- Explain what FIS is doing different, what the experiment is doing -- should see a 1 node failing after about a minute, pods to come back up after 2 and a half minutes, and the node come back up after -- check to make sure retail app stays up -- retail app apears to not work -> need to fix load balancer configs -- A conclusion / learning from experiment -- Note that FIS can allow automatic testing for failure and whatever else is cool - ## AWS Fault Injection Simulator (FIS) Overview -AWS Fault Injection Simulator is a fully managed service that helps you perform fault injection experiments on your AWS workloads. 
In the context of EKS, FIS allows us to simulate various failure scenarios, which is crucial for: +AWS Fault Injection Simulator (FIS) is a fully managed service that enables you to perform controlled fault injection experiments on your AWS workloads. FIS allows you to simulate various failure scenarios, which is crucial for: 1. Validating high availability configurations 2. Testing auto-scaling and self-healing capabilities @@ -31,18 +20,31 @@ By using FIS, you can: - Discover hidden bugs and performance bottlenecks - Observe how your systems behave under stress - Implement and validate automated recovery procedures +- Conduct repeatable experiments to ensure consistent behavior In our FIS experiment, we'll simulate a partial node failure in our EKS cluster and observe how our application responds, providing practical insights into building resilient systems. :::info -For more information on AWS FIS check out: +For more information on AWS FIS, check out: - [What is AWS Fault Injection Service?](https://docs.aws.amazon.com/fis/latest/userguide/what-is.html) +- [AWS Fault Injection Simulator Console](https://console.aws.amazon.com/fis/home) ::: +## Experiment Details + +This experiment differs from the previous manual node failure simulation in several ways: + +1. Automated execution: FIS manages the experiment, allowing for more controlled and repeatable tests. +2. Partial failure: Instead of simulating a complete node failure, we're testing a scenario where a portion of the nodes fail. +3. Scale: FIS allows us to target multiple nodes simultaneously, providing a more realistic large-scale failure scenario. +4. Precision: We can specify exact percentages of instances to terminate, giving us fine-grained control over the experiment. + +In this experiment, FIS will terminate 66% of the instances in two node groups, simulating a significant partial failure of our cluster. + ## Creating the Node Failure Experiment -Create a new AWS FIS experiment template to simulate the node failure: +Create a new AWS FIS experiment template to simulate the partial node failure: ```bash $ NODE_EXP_ID=$(aws fis create-experiment-template --cli-input-json '{"description":"NodeDeletion","targets":{"Nodegroups-Target-1":{"resourceType":"aws:eks:nodegroup","resourceTags":{"eksctl.cluster.k8s.io/v1alpha1/cluster-name":"eks-workshop"},"selectionMode":"COUNT(2)"}},"actions":{"nodedeletion":{"actionId":"aws:eks:terminate-nodegroup-instances","parameters":{"instanceTerminationPercentage":"66"},"targets":{"Nodegroups":"Nodegroups-Target-1"}}},"stopConditions":[{"source":"none"}],"roleArn":"'$FIS_ROLE_ARN'","tags":{"ExperimentSuffix": "'$RANDOM_SUFFIX'"}}' --output json | jq -r '.experimentTemplate.id') @@ -53,30 +55,67 @@ $ NODE_EXP_ID=$(aws fis create-experiment-template --cli-input-json '{"descripti Execute the FIS experiment to simulate the node failure and monitor the response: ```bash -$ aws fis start-experiment --experiment-template-id $NODE_EXP_ID --output json && SECONDS=0; while [ $SECONDS -lt 300 ]; do clear; $SCRIPT_DIR/get-pods-by-az.sh; sleep 1; done +$ aws fis start-experiment --experiment-template-id $NODE_EXP_ID --output json && SECONDS=0; while [ $SECONDS -lt 180 ]; do clear; $SCRIPT_DIR/get-pods-by-az.sh; sleep 1; done ``` -This will trigger the node failure and begin monitoring the pods for 5 minutes, observing how the cluster responds to losing part of its capacity. 
+This command triggers the node failure and monitors the pods for 3 minutes, allowing you to observe how the cluster responds to losing a significant portion of its capacity. + +During the experiment, you should observe the following: + +1. After about 1 minute, you'll see one or more nodes disappear from the list, representing the simulated partial node failure. +2. Over the next 2 minutes, you'll notice pods being rescheduled and redistributed to the remaining healthy nodes. +3. Shortly after you'll see the new node coming online to replace the terminated one. + +Your retail url should stay operational unlike the node failure without FIS. + +:::note +To verify clusters and rebalance pods, you can run: + +```bash +$ $SCRIPT_DIR/verify-cluster.sh +``` + +::: ## Verifying Retail Store Availability -After simulating the node failure, check if the retail store application remains operational: +Ensure that your retail store application remains operational throughout the partial node failure. Use the following command to check its availability: ```bash $ wait-for-lb $(kubectl get ingress -n ui -o jsonpath='{.items[0].status.loadBalancer.ingress[0].hostname}') ``` -Despite a partial node failure, the retail store continues to serve traffic, demonstrating the resilience of your deployment setup. +:::tip +The retail url may take 10 minutes to become operational. +::: + +Despite the partial node failure, the retail store should continue to serve traffic, demonstrating the resilience of your deployment setup. :::caution Partial node failures test the limits of your application's failover capabilities. Monitor and determine how well your applications and services recover from such events. ::: -:::note -To verify clusters and rebalance pods you can run: +## Conclusion -```bash -$ $SCRIPT_DIR/verify-cluster.sh -``` +This partial node failure simulation using AWS FIS demonstrates several key aspects of your Kubernetes cluster's resilience: -::: +1. Automatic detection of node failures by Kubernetes +2. Swift rescheduling of pods from failed nodes to healthy ones +3. The cluster's ability to maintain service availability during significant infrastructure disruptions +4. Auto-scaling capabilities to replace failed nodes + +Key takeaways from this experiment: + +- The importance of distributing your workload across multiple nodes and availability zones +- The value of having appropriate resource requests and limits set for your pods +- The effectiveness of Kubernetes' self-healing mechanisms +- The need for robust monitoring and alerting systems to detect and respond to node failures + +By leveraging AWS FIS for such experiments, you gain several advantages: + +1. Repeatability: You can run this experiment multiple times to ensure consistent behavior. +2. Automation: FIS allows you to schedule regular resilience tests, ensuring your system maintains its fault-tolerant capabilities over time. +3. Comprehensive testing: You can create more complex scenarios involving multiple AWS services to test your entire application stack. +4. Controlled chaos: FIS provides a safe, managed environment for conducting chaos engineering experiments without risking unintended damage to your production systems. + +Regular execution of such experiments helps build confidence in your system's resilience and provides valuable insights for continuous improvement of your architecture and operational procedures. 
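As a follow-up, it can be useful to confirm how the FIS experiment itself finished, for example that it reached the `completed` state rather than `stopped` or `failed`. One way to check from the CLI is sketched below; the JMESPath query is only an example and can be adjusted to taste:

```bash
# Summarize recent FIS experiments with their template IDs and final status
$ aws fis list-experiments --query 'experiments[*].{id:id,template:experimentTemplateId,status:state.status}' --output table
```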
diff --git a/website/docs/resiliency/high-availability/05-node-failure-complete-fis.md b/website/docs/resiliency/high-availability/05-node-failure-complete-fis.md index ab5cbdd95..a7f142b71 100644 --- a/website/docs/resiliency/high-availability/05-node-failure-complete-fis.md +++ b/website/docs/resiliency/high-availability/05-node-failure-complete-fis.md @@ -6,26 +6,25 @@ description: "Demonstrates the impact of a complete node failure on a Kubernetes # Simulating Complete Node Failure with FIS -TODO: - -- Fix script to mimic last experiment again -- Why is this different than last experiment -- Explain what is happening in more detail -- Note timings -- Concluding Statement -- You should see all nodes and pods dissapear rather quickly then after about 2 minutes will start to see 1 node and pods coming online, after 4 minutes a second node will come online and 3 more pods. - ## Overview -This experiment is an extensive test that isn't necessary but demonstrates the robust capabilities of AWS Fault Injection Simulator by simulating a complete node failure in a Kubernetes cluster. +This experiment extends our previous partial node failure test to simulate a complete failure of all nodes in our EKS cluster. It demonstrates how AWS Fault Injection Simulator (FIS) can be used to test extreme scenarios and validate your system's resilience under catastrophic conditions. :::info Important -This test showcases how FIS can be used to simulate worst-case scenarios to help validate the resilience and recovery strategies of your applications. +This test simulates a worst-case scenario. It's designed for controlled environments with thoroughly tested recovery mechanisms. ::: +## Experiment Details + +Unlike the partial node failure simulation, this experiment: + +1. Terminates 100% of the instances in all node groups. +2. Tests your cluster's ability to recover from a state of complete failure. +3. Allows observation of the full recovery process, from total outage to full restoration. + ## Creating the Node Failure Experiment -Create a new AWS FIS experiment template to simulate the complete failure of all nodes in a specific node group: +Create a new AWS FIS experiment template to simulate the complete node failure: ```bash $ FULL_NODE_EXP_ID=$(aws fis create-experiment-template --cli-input-json '{"description":"NodeDeletion","targets":{"Nodegroups-Target-1":{"resourceType":"aws:eks:nodegroup","resourceTags":{"eksctl.cluster.k8s.io/v1alpha1/cluster-name":"eks-workshop"},"selectionMode":"ALL"}},"actions":{"nodedeletion":{"actionId":"aws:eks:terminate-nodegroup-instances","parameters":{"instanceTerminationPercentage":"100"},"targets":{"Nodegroups":"Nodegroups-Target-1"}}},"stopConditions":[{"source":"none"}],"roleArn":"'$FIS_ROLE_ARN'","tags":{"ExperimentSuffix": "'$RANDOM_SUFFIX'"}}' --output json | jq -r '.experimentTemplate.id') @@ -33,33 +32,56 @@ $ FULL_NODE_EXP_ID=$(aws fis create-experiment-template --cli-input-json '{"desc ## Running the Experiment -Execute the FIS experiment to simulate the complete node failure: +Execute the FIS experiment and monitor the cluster's response: ```bash $ aws fis start-experiment --experiment-template-id $FULL_NODE_EXP_ID --output json && SECONDS=0; while [ $SECONDS -lt 300 ]; do clear; $SCRIPT_DIR/get-pods-by-az.sh; sleep 1; done ``` -Monitor the cluster as it loses all node resources temporarily, observing how the Kubernetes system and your application respond. +This command will show the pods distribution over 5 minutes while we observe the experiment. 
We should see: -## Verifying Retail Store Availability +1. Shortly after the experiment is initiated, all nodes and pods disappear. +2. After about 2 minutes, the first node and some pods will come back online. +3. Around 4 minutes, a second node appears and more pods start up. +4. At 5 minutes, recovery continues as the last node comes online. -After simulating the node failure, check if the retail store application is still operational: +Due to the severity of this experiment, the retail store URL will not remain operational during testing. The URL should come back up once the final node is operational. + +:::note +To verify clusters and rebalance pods, you can run: ```bash -$ wait-for-lb $(kubectl get ingress -n ui -o jsonpath='{.items[0].status.loadBalancer.ingress[0].hostname}') +$ $SCRIPT_DIR/verify-cluster.sh ``` -This command helps confirm that despite complete node failure, the application begins to recover as the Kubernetes cluster auto-scales back up. - -:::caution -This test can cause significant disruption, so it's recommended for use only in controlled environments where recovery mechanisms are thoroughly tested. ::: -:::note -To verify clusters and rebalance pods you can run: +## Verifying Retail Store Availability + +Check the retail store application's recovery: ```bash -$ $SCRIPT_DIR/verify-cluster.sh +$ wait-for-lb $(kubectl get ingress -n ui -o jsonpath='{.items[0].status.loadBalancer.ingress[0].hostname}') ``` +:::tip +The retail store URL may take up to 10 minutes to become operational again. ::: + +## Conclusion + +This experiment demonstrates: + +1. Your cluster's response to catastrophic failure. +2. Effectiveness of auto-scaling in replacing all failed nodes. +3. Kubernetes' ability to reschedule all pods onto new nodes. +4. Total system recovery time from complete failure. + +Key learnings: + +- Importance of robust auto-scaling configurations. +- Value of effective pod priority and preemption settings. +- Need for architectures that can withstand complete cluster failure. +- Significance of regular testing of extreme scenarios. + +By using FIS for such tests, you can safely simulate catastrophic failures, validate recovery procedures, identify critical dependencies, and measure recovery times. This helps in refining your disaster recovery plans and improving overall system resilience. diff --git a/website/docs/resiliency/high-availability/06-az-failure.md b/website/docs/resiliency/high-availability/06-az-failure.md deleted file mode 100644 index 1091b41e7..000000000 --- a/website/docs/resiliency/high-availability/06-az-failure.md +++ /dev/null @@ -1,134 +0,0 @@ ---- -title: "Simulating AZ Failure" -sidebar_position: 6 -description: "This experiment simulates an Availability Zone failure to test the resilience of your Kubernetes environment hosted on AWS EKS." ---- - -# Simulating AZ Failure - -TODO: - -- Fix canary -- Check AZ failure still works -- add specific cloudwatch iam role -- add conclustion - -## Overview - -This experiment simulates an Availability Zone (AZ) failure, demonstrating how robust your application is when faced with significant disruptions. It leverages AWS Fault Injection Simulator (FIS) and additional AWS services to test the resilience of the system under the stress of an AZ going offline. - -## Preparation - -### Setting up a Synthetic Canary - -Before starting the experiment, set up a synthetic canary for heartbeat monitoring: - -1.
First, create an S3 bucket for the canary artifacts: - -```bash -$ BUCKET_NAME="eks-workshop-canary-artifacts-$(date +%s)" -$ aws s3 mb s3://$BUCKET_NAME --region us-west-2 -``` - -2. Create the canary: - -Set up the blueprint: - -```bash -$ INGRESS_URL=$(kubectl get ingress -n ui -o jsonpath='{.items[0].status.loadBalancer.ingress[0].hostname}') -$ cat < canary_script.js -var synthetics = require('Synthetics'); -var log = require('SyntheticsLogger'); - -const pageLoadBlueprint = async function () { - const PAGE_LOAD_TIMEOUT = 30; - const URL = 'http://${INGRESS_URL}'; - let page = await synthetics.getPage(); - await synthetics.executeStep('Navigate to ' + URL, async function () { - await page.goto(URL, {waitUntil: 'domcontentloaded', timeout: PAGE_LOAD_TIMEOUT * 1000}); - }); - await synthetics.executeStep('Page loaded successfully', async function () { - log.info('Page loaded successfully'); - }); -}; - -exports.handler = async () => { - return await pageLoadBlueprint(); -}; -EOF -$ aws s3 cp canary_script.js s3://$BUCKET_NAME/canary-script/canary_script.js -``` - -Create a synthetic canary: - -```bash -$ INGRESS_URL=$(kubectl get ingress -n ui -o jsonpath='{.items[0].status.loadBalancer.ingress[0].hostname}') -$ aws synthetics create-canary \ - --name eks-workshop-canary \ - --artifact-s3-location "s3://$BUCKET_NAME/canary-artifacts/" \ - --execution-role-arn $FIS_ROLE_ARN \ - --runtime-version syn-nodejs-puppeteer-9.0 \ - --schedule Expression="rate(1 minute)" \ - --code S3Bucket=$BUCKET_NAME,S3Key=canary-script/canary_script.js,Handler="canary_script.handler" \ - --region us-west-2 -$ sleep 30 -$ aws synthetics start-canary --name eks-workshop-canary --region us-west-2 -``` - -3. Create a CloudWatch alarm for the canary: - -```bash -$ aws cloudwatch put-metric-alarm \ - --alarm-name "eks-workshop-canary-alarm" \ - --metric-name SuccessPercent \ - --namespace CloudWatchSynthetics \ - --statistic Average \ - --period 60 \ - --threshold 95 \ - --comparison-operator LessThanThreshold \ - --dimensions Name=CanaryName,Value=eks-workshop-canary \ - --evaluation-periods 1 \ - --alarm-description "Alarm when Canary success rate drops below 95%" \ - --unit Percent \ - --region us-west-2 -``` - -This sets up a canary that checks the health of your application every minute and a CloudWatch alarm that triggers if the success percentage falls below 95%. - -### Setting up the Experiment - -Retrieve the Auto Scaling Group (ASG) name associated with your EKS cluster: - -```bash -$ ASG_NAME_BOTH=$(aws autoscaling describe-auto-scaling-groups --query "AutoScalingGroups[? Tags[? 
(Key=='eks:cluster-name') && Value=='eks-workshop']]".AutoScalingGroupName --output text) -$ ASG_NAME=$(echo $ASG_NAME_BOTH | awk '{print $1}') -``` - -Create the FIS experiment template to simulate the AZ failure: - -```bash -$ ZONE_EXP_ID=$(aws fis create-experiment-template --cli-input-json '{"description":"publicdocument-azfailure","targets":{},"actions":{"azfailure":{"actionId":"aws:ssm:start-automation-execution","parameters":{"documentArn":"arn:aws:ssm:us-west-2::document/AWSResilienceHub-SimulateAzOutageInAsgTest_2020-07-23","documentParameters":"{\"AutoScalingGroupName\":\"'$ASG_NAME'\",\"CanaryAlarmName\":\"eks-workshop-canary-alarm\",\"AutomationAssumeRole\":\"arn:aws:iam::'$AWS_ACCOUNT_ID':role/WSParticipantRole\",\"IsRollback\":\"false\",\"TestDurationInMinutes\":\"2\"}","maxDuration":"PT6M"}}},"stopConditions":[{"source":"none"}],"roleArn":"'$FIS_ROLE_ARN'","tags":{"ExperimentSuffix":"'$RANDOM_SUFFIX'"}}' --output json | jq -r '.experimentTemplate.id') -``` - -## Running the Experiment - -Execute the FIS experiment to simulate the AZ failure: - -```bash -aws fis start-experiment --experiment-template-id $ZONE_EXP_ID --output json && \ -timeout 450 watch -n 1 --color $SCRIPT_DIR/get-pods-by-az.sh -``` - -This command starts the experiment and monitors the distribution and status of pods across different nodes and AZs to understand the immediate impact of the simulated AZ failure. - -## Post-Experiment Verification - -Ensure that your application remains operational despite the simulated AZ failure, confirming the effectiveness of Kubernetes high availability: - -```bash -wait-for-lb $(kubectl get ingress -n ui -o jsonpath='{.items[0].status.loadBalancer.ingress[0].hostname}') -``` - -## Conclusion - -This experiment demonstrates the resilience of your EKS cluster in the face of an Availability Zone failure. By monitoring the canary and observing the redistribution of pods, you can assess how well your application maintains availability during significant infrastructure disruptions. diff --git a/website/docs/resiliency/high-availability/06-az-setup.md b/website/docs/resiliency/high-availability/06-az-setup.md new file mode 100644 index 000000000..4c7d2eeb9 --- /dev/null +++ b/website/docs/resiliency/high-availability/06-az-setup.md @@ -0,0 +1,123 @@ +--- +title: "AZ Failure Experiment Setup" +sidebar_position: 6 +description: "Scale your application to two Availability Zones and prepare for an AZ failure simulation experiment." +--- + +This guide outlines steps to enhance the resilience of your UI service by scaling it across two Availability Zones (AZs) and preparing for an AZ failure simulation experiment. + +## Scaling to Two AZs + +We'll use a Kustomize patch to modify the UI deployment, adding a second AZ and adjusting the number of replicas. We'll scale to 4 replicas in the new AZ while maintaining 5 replicas in the first AZ. + +First we need to make ann EKS Cluster in `us-east-2`. 
Run this to create a second AZ: + +```bash timeout=300 wait=30 +$ $SCRIPT_DIR/multi-az-get-pods.sh +$ aws configure set default.region $SECONDARY_REGION +$ prepare-environment resiliency +$ aws configure set default.region $PRIMARY_REGION +$ $SCRIPT_DIR/multi-az-get-pods.sh +``` + +Now we need to Kustomize our content with a patch file: + +```file +manifests/modules/resiliency/high-availability/multi_az/add_us_east_2_patch.yaml +``` + +Apply the changes using Kustomize patch and +[Kustomization file](https://github.com/VAR::MANIFESTS_OWNER/VAR::MANIFESTS_REPOSITORY/tree/VAR::MANIFESTS_REF/manifests/modules/resiliency/high-availability/multi_az/kustomization.yaml): + +```bash +$ kubectl delete deployment ui -n ui +$ kubectl apply -k /manifests/modules/resiliency/high-availability/multi_az/ +``` + +## Verify Retail Store Accessibility + +After applying these changes, it's important to verify that your retail store is accessible: + +```bash +$ wait-for-lb $(kubectl get ingress -n ui -o jsonpath='{.items[0].status.loadBalancer.ingress[0].hostname}') +``` + +:::tip +The retail url may take 10 minutes to become operational. +::: + +## Check Pod Distribution + +To run the script and see the distribution of pods across availability zones, execute: + +```bash +$ $SCRIPT_DIR/multi-az-get-pods.sh +``` + +## AZ Failure Experiment Preparation + +### Overview + +This experiment will simulate an Availability Zone (AZ) failure, demonstrating how resilient your application is when faced with significant infrastructure disruptions. We'll use AWS Fault Injection Simulator (FIS) and additional AWS services to test how well your system maintains functionality when an entire AZ becomes unavailable. + +### Setting up a Synthetic Canary + +Before starting the experiment, set up a synthetic canary for heartbeat monitoring: + +1. First, create an S3 bucket for the canary artifacts: + +```bash +$ BUCKET_NAME="eks-workshop-canary-artifacts-$(date +%s)" +$ aws s3 mb s3://$BUCKET_NAME --region us-west-2 +``` + +2. Create the blueprint: + +```file +manifests/modules/resiliency/scripts/eks_workshop_canary_script.js +``` + +Place this canary script into the bucket: + +```bash +$ aws s3 cp /manifests/modules/resiliency/scripts/eks_workshop_canary_script.zip s3://$BUCKET_NAME/canary-scripts/eks_workshop_canary_script.zip +``` + +3. Create a synthetic canary: + +```bash +$ INGRESS_URL=$(kubectl get ingress -n ui -o jsonpath='{.items[0].status.loadBalancer.ingress[0].hostname}') +$ aws synthetics create-canary \ + --name eks-workshop-canary \ + --artifact-s3-location "s3://$BUCKET_NAME/canary-artifacts/" \ + --execution-role-arn $CANARY_ROLE_ARN \ + --runtime-version syn-nodejs-puppeteer-6.2 \ + --schedule Expression="rate(1 minute)" \ + --code S3Bucket=$BUCKET_NAME,S3Key=canary-scripts/eks_workshop_canary_script.zip,Handler="exports.handler" \ + --run-config "EnvironmentVariables={INGRESS_URL=http://$INGRESS_URL}" \ + --region us-west-2 +$ sleep 30 +$ aws synthetics start-canary --name eks-workshop-canary --region us-west-2 +``` + +4. 
Create a CloudWatch alarm for the canary: + +```bash +$ aws cloudwatch put-metric-alarm \ + --alarm-name "eks-workshop-canary-alarm" \ + --metric-name SuccessPercent \ + --namespace CloudWatchSynthetics \ + --statistic Average \ + --period 60 \ + --threshold 95 \ + --comparison-operator LessThanThreshold \ + --dimensions Name=CanaryName,Value=eks-workshop-canary \ + --evaluation-periods 1 \ + --alarm-description "Alarm when Canary success rate drops below 95%" \ + --unit Percent \ + --region us-west-2 +``` + +This sets up a canary that checks the health of your application every minute and a CloudWatch alarm that triggers if the success percentage falls below 95%. + +With these steps completed, your application is now scaled across two AZs and you've set up the necessary monitoring for the upcoming AZ failure simulation experiment. diff --git a/website/docs/resiliency/high-availability/07-az-failure.md b/website/docs/resiliency/high-availability/07-az-failure.md new file mode 100644 index 000000000..c164d3c85 --- /dev/null +++ b/website/docs/resiliency/high-availability/07-az-failure.md @@ -0,0 +1,84 @@ +--- +title: "Simulating AZ Failure" +sidebar_position: 7 +description: "This experiment simulates an Availability Zone failure to test the resilience of your Kubernetes environment hosted on AWS EKS." +--- + +# Simulating AZ Failure + +## Overview + +This experiment simulates an Availability Zone (AZ) failure, demonstrating the resilience of your application when faced with significant infrastructure disruptions. By leveraging AWS Fault Injection Simulator (FIS) and additional AWS services, we'll test how well your system maintains functionality when an entire AZ becomes unavailable. + +### Setting up the Experiment + +Retrieve the Auto Scaling Group (ASG) name associated with your EKS cluster: + +```bash +$ ASG_NAME_BOTH=$(aws autoscaling describe-auto-scaling-groups --query "AutoScalingGroups[? Tags[? (Key=='eks:cluster-name') && Value=='eks-workshop']]".AutoScalingGroupName --output text) +$ ASG_NAME=$(echo $ASG_NAME_BOTH | awk '{print $1}') +``` + +Create the FIS experiment template to simulate the AZ failure: + +```bash +$ ZONE_EXP_ID=$(aws fis create-experiment-template --cli-input-json '{"description":"publicdocument-azfailure","targets":{},"actions":{"azfailure":{"actionId":"aws:ssm:start-automation-execution","parameters":{"documentArn":"arn:aws:ssm:us-west-2::document/AWSResilienceHub-SimulateAzOutageInAsgTest_2020-07-23","documentParameters":"{\"AutoScalingGroupName\":\"'$ASG_NAME'\",\"CanaryAlarmName\":\"eks-workshop-canary-alarm\",\"AutomationAssumeRole\":\"'$FIS_ROLE_ARN'\",\"IsRollback\":\"false\",\"TestDurationInMinutes\":\"2\"}","maxDuration":"PT6M"}}},"stopConditions":[{"source":"none"}],"roleArn":"'$FIS_ROLE_ARN'","tags":{"ExperimentSuffix":"'$RANDOM_SUFFIX'"}}' --output json | jq -r '.experimentTemplate.id') +``` + +## Running the Experiment + +Execute the FIS experiment to simulate the AZ failure: + +```bash +$ aws fis start-experiment --experiment-template-id $ZONE_EXP_ID --output json && SECONDS=0; while [ $SECONDS -lt 450 ]; do clear; $SCRIPT_DIR/get-pods-by-az.sh; sleep 1; done +``` + +This command starts the experiment and monitors the distribution and status of pods across different nodes and AZs for 7.5 minutes to understand the immediate impact of the simulated AZ failure. 
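While the experiment runs, you can also keep an eye on the heartbeat monitoring you created earlier. The following is a minimal sketch that assumes the canary and alarm names from the setup section; add `--region` if they were created outside your default region:

```bash
$ aws synthetics get-canary --name eks-workshop-canary --query 'Canary.Status.State'
$ aws cloudwatch describe-alarms --alarm-names "eks-workshop-canary-alarm" --query 'MetricAlarms[0].StateValue'
```

If the simulated AZ outage affects user traffic, the canary's success rate drops and the alarm transitions from `OK` to `ALARM`, which the AWSResilienceHub automation document also consumes via its `CanaryAlarmName` parameter.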
+ +During the experiment, you should observe the following sequence of events: + +- input here + +:::note +To verify clusters and rebalance pods, you can run: + +```bash +$ $SCRIPT_DIR/verify-cluster.sh +``` + +::: + +## Post-Experiment Verification + +After the experiment, verify that your application remains operational despite the simulated AZ failure: + +```bash +$ wait-for-lb $(kubectl get ingress -n ui -o jsonpath='{.items[0].status.loadBalancer.ingress[0].hostname}') +``` + +This step confirms the effectiveness of your Kubernetes cluster's high availability configuration and its ability to maintain service continuity during significant infrastructure disruptions. + +## Conclusion + +The AZ failure simulation represents a critical test of your EKS cluster's resilience and your application's high availability design. Through this experiment, you've gained valuable insights into: + +1. The effectiveness of your multi-AZ deployment strategy +2. Kubernetes' ability to reschedule pods across remaining healthy AZs +3. The impact of an AZ failure on your application's performance and availability +4. The efficiency of your monitoring and alerting systems in detecting and responding to major infrastructure issues + +Key takeaways from this experiment include: + +- The importance of distributing your workload across multiple AZs +- The value of proper resource allocation and pod anti-affinity rules +- The need for robust monitoring and alerting systems that can quickly detect AZ-level issues +- The effectiveness of your disaster recovery and business continuity plans + +By regularly conducting such experiments, you can: + +- Identify potential weaknesses in your infrastructure and application architecture +- Refine your incident response procedures +- Build confidence in your system's ability to withstand major failures +- Continuously improve your application's resilience and reliability + +Remember, true resilience comes not just from surviving such failures, but from maintaining performance and user experience even in the face of significant infrastructure disruptions. Use the insights gained from this experiment to further enhance your application's fault tolerance and ensure seamless operations across all scenarios. diff --git a/website/docs/resiliency/high-availability/index.md b/website/docs/resiliency/high-availability/index.md index 31556db21..6dec235d9 100644 --- a/website/docs/resiliency/high-availability/index.md +++ b/website/docs/resiliency/high-availability/index.md @@ -5,23 +5,10 @@ sidebar_custom_props: { "module": true } description: "Prepare your EKS environment to handle high availability scenarios effectively." --- -TODO: - -- have to delete deployment before? why? is that due to dev or what -- expected time for lab completion -- expected time for prepare-env (about 5 minutes without cleanup.sh and any previous applications) -- Lab overview -- Check info sections -- Are we able to chmod in backend? 
-- Check why the load balancer stopped working - -::required-time - :::tip Before you start Prepare your environment for this section: ```bash timeout=300 wait=30 -$ chmod +x /manifests/modules/resiliency/.workshop/cleanup.sh $ /manifests/modules/resiliency/.workshop/cleanup.sh $ prepare-environment resiliency ``` @@ -37,13 +24,32 @@ This will make the following changes to your lab environment: You can view the Terraform that applies these changes [here](https://github.com/VAR::MANIFESTS_OWNER/VAR::MANIFESTS_REPOSITORY/tree/VAR::MANIFESTS_REF/manifests/modules/resiliency/.workshop/terraform). ::: -In this lab, we'll look at... -information +## Lab Overview + +In this lab, we'll explore various high availability scenarios and test the resilience of your EKS environment. Through a series of experiments, you'll gain hands-on experience in handling different types of failures and understanding how your Kubernetes cluster responds to these challenges. + +The experiments we'll conduct include: + +1. Pod Failure Simulation: Using ChaosMesh to test your application's resilience to individual pod failures. +2. Node Failure without FIS: Manually simulating a node failure to observe Kubernetes' self-healing capabilities. +3. Partial Node Failure with FIS: Leveraging AWS Fault Injection Simulator to create a more controlled node failure scenario. +4. Complete Node Failure with FIS: Testing your cluster's response to a catastrophic failure of all nodes. +5. Availability Zone Failure: Simulating the loss of an entire AZ to validate your multi-AZ deployment strategy. + +These experiments will help you understand: + +- How Kubernetes handles different types of failures +- The importance of proper resource allocation and pod distribution +- The effectiveness of your monitoring and alerting systems +- How to improve your application's fault tolerance and recovery strategies + +By the end of this lab, you'll have a comprehensive understanding of your EKS environment's high availability capabilities and areas for potential improvement. :::info -For more information on these changes checkout: +For more information on the components used in this lab, check out: - [Ingress Load Balancer](/docs/fundamentals/exposing/ingress/) - [Integrating with Kubernetes RBAC](/docs/security/cluster-access-management/kubernetes-rbac) - [Chaos Mesh](https://chaos-mesh.org/) +- [AWS Fault Injection Simulator](https://aws.amazon.com/fis/) ::: diff --git a/website/docs/resiliency/high-availability/tests/hook-suite.sh b/website/docs/resiliency/high-availability/tests/hook-suite.sh new file mode 100644 index 000000000..8b5a4baea --- /dev/null +++ b/website/docs/resiliency/high-availability/tests/hook-suite.sh @@ -0,0 +1,11 @@ +set -e + +before() { + echo "noop" +} + +after() { + prepare-environment +} + +"$@" diff --git a/website/docs/resiliency/index.md b/website/docs/resiliency/index.md index 1541ba19d..0252fee19 100644 --- a/website/docs/resiliency/index.md +++ b/website/docs/resiliency/index.md @@ -4,16 +4,6 @@ sidebar_position: 11 weight: 10 --- -TODO: - -- Add intro information -- Find a lab to input - -Other TODO: - -- autotesting -- Containers on couch vod (link it here?) - ## What is Resiliency? Resiliency in cloud computing refers to a system's ability to maintain acceptable performance levels in the face of faults and challenges to normal operation. It encompasses: @@ -31,10 +21,11 @@ Amazon EKS provides a managed Kubernetes platform, but it's still crucial to des 2. 
**Data Integrity**: Prevent data loss and maintain consistency during unexpected events. 3. **User Experience**: Minimize downtime and performance degradation to maintain user satisfaction. 4. **Cost Efficiency**: Avoid overprovisioning by building systems that can handle variable loads and partial failures. +5. **Compliance**: Meet regulatory requirements for uptime and data protection in various industries. ## Resiliency Scenarios Covered in this Chapter -We'll explore several scenarios to show resiliency by performing: +We'll explore several scenarios to show resiliency by by simulating and responding to: 1. Pod Failures 2. Node Failures @@ -44,11 +35,29 @@ We'll explore several scenarios to show resiliency by performing: By the end of this chapter, you'll be able to: -- Use AWS FIS to simulate and learn from controlled failure scenarios -- other info +- Use AWS Fault Injection Simulator (FIS) to simulate and learn from controlled failure scenarios +- Understand how Kubernetes handles different types of failures (pod, node, and availability zone) +- Observe the self-healing capabilities of Kubernetes in action +- Gain practical experience in chaos engineering for EKS environments -:::info +## Tools and Technologies - +Throughout this chapter, we'll be using: +- AWS Fault Injection Simulator (FIS) for controlled chaos engineering +- Chaos Mesh for Kubernetes-native chaos testing +- AWS CloudWatch Synthetics for creating and monitoring a canary +- Kubernetes native features for observing pod and node behavior during failures + +## Importance of Chaos Engineering + +Chaos engineering is the practice of intentionally introducing controlled failures to identify weaknesses in your system. By proactively testing your system's resilience, you can: + +1. Uncover hidden issues before they affect users +2. Build confidence in your system's ability to withstand turbulent conditions +3. Improve your incident response procedures +4. 
Foster a culture of resilience within your organization + +:::info +For more information on AWS Resiliency features in greater depth, we recommend checking out [Operating resilient workloads on Amazon EKS](https://aws.amazon.com/blogs/containers/operating-resilient-workloads-on-amazon-eks/) ::: From 4c3ce2d5405fdc11b0586d9bdb2884a33bd68996 Mon Sep 17 00:00:00 2001 From: cyturney Date: Tue, 20 Aug 2024 10:39:12 -0700 Subject: [PATCH 08/11] fixed az failure, added autotesting, added outputs to bash --- cluster/eksctl/cluster.yaml | 2 +- .../resiliency/.workshop/cleanup.sh | 77 ++++++++++ .../resiliency/.workshop/terraform/main.tf | 54 ++++--- .../resiliency/.workshop/terraform/outputs.tf | 6 +- .../resiliency/.workshop/terraform/vars.tf | 6 + .../config/kustomization.yaml | 2 +- .../config/scale_and_affinity_patch.yaml | 0 .../resiliency/scripts/AZ-verify-clusters.sh | 110 +++++++++++++++ .../resiliency/scripts/create-blueprint.sh | 114 +++++++++++++++ .../resiliency/scripts/get-pods-by-az.sh | 0 .../resiliency/scripts/node-failure.sh | 0 .../resiliency/scripts/pod-failure.sh | 0 .../resiliency/scripts/testing.sh | 31 ++++ .../resiliency/scripts/verify-cluster.sh | 0 .../modules/resiliency/.workshop/cleanup.sh | 132 ------------------ .../multi_az/add_us_east_2_patch.yaml | 41 ------ .../multi_az/kustomization.yaml | 8 -- .../resiliency/scripts/create-second-az.sh | 52 ------- .../scripts/eks_workshop_canary_script.js | 30 ---- .../resiliency/scripts/multi-az-get-pods.sh | 26 ---- .../high-availability/00-setup.md} | 22 ++- .../high-availability/01-scale.md | 96 +++++++++++++ .../high-availability/02-pod-failure.md | 33 ++++- .../03-node-failure-no-fis.md | 52 +++++-- .../04-node-failure-partial-fis.md | 63 ++++++--- .../05-node-failure-complete-fis.md | 57 ++++++-- .../high-availability/06-az-setup.md | 100 +++++++++++++ .../high-availability/07-az-failure.md | 78 ++++++++--- .../high-availability}/index.md | 3 +- .../high-availability/tests/hook-suite.sh | 0 .../resiliency/high-availability/01-setup.md | 61 -------- .../high-availability/06-az-setup.md | 123 ---------------- website/docusaurus.config.js | 6 - website/sidebars.js | 1 - 34 files changed, 806 insertions(+), 580 deletions(-) create mode 100755 manifests/modules/observability/resiliency/.workshop/cleanup.sh rename manifests/modules/{ => observability}/resiliency/.workshop/terraform/main.tf (90%) rename manifests/modules/{ => observability}/resiliency/.workshop/terraform/outputs.tf (77%) rename manifests/modules/{ => observability}/resiliency/.workshop/terraform/vars.tf (85%) rename manifests/modules/{ => observability}/resiliency/high-availability/config/kustomization.yaml (70%) rename manifests/modules/{ => observability}/resiliency/high-availability/config/scale_and_affinity_patch.yaml (100%) create mode 100755 manifests/modules/observability/resiliency/scripts/AZ-verify-clusters.sh create mode 100755 manifests/modules/observability/resiliency/scripts/create-blueprint.sh rename manifests/modules/{ => observability}/resiliency/scripts/get-pods-by-az.sh (100%) rename manifests/modules/{ => observability}/resiliency/scripts/node-failure.sh (100%) rename manifests/modules/{ => observability}/resiliency/scripts/pod-failure.sh (100%) create mode 100644 manifests/modules/observability/resiliency/scripts/testing.sh rename manifests/modules/{ => observability}/resiliency/scripts/verify-cluster.sh (100%) delete mode 100755 manifests/modules/resiliency/.workshop/cleanup.sh delete mode 100644 
manifests/modules/resiliency/high-availability/multi_az/add_us_east_2_patch.yaml delete mode 100644 manifests/modules/resiliency/high-availability/multi_az/kustomization.yaml delete mode 100755 manifests/modules/resiliency/scripts/create-second-az.sh delete mode 100644 manifests/modules/resiliency/scripts/eks_workshop_canary_script.js delete mode 100755 manifests/modules/resiliency/scripts/multi-az-get-pods.sh rename website/docs/{resiliency/high-availability/index.md => observability/high-availability/00-setup.md} (65%) create mode 100644 website/docs/observability/high-availability/01-scale.md rename website/docs/{resiliency => observability}/high-availability/02-pod-failure.md (63%) rename website/docs/{resiliency => observability}/high-availability/03-node-failure-no-fis.md (74%) rename website/docs/{resiliency => observability}/high-availability/04-node-failure-partial-fis.md (58%) rename website/docs/{resiliency => observability}/high-availability/05-node-failure-complete-fis.md (58%) create mode 100644 website/docs/observability/high-availability/06-az-setup.md rename website/docs/{resiliency => observability}/high-availability/07-az-failure.md (50%) rename website/docs/{resiliency => observability/high-availability}/index.md (97%) rename website/docs/{resiliency => observability}/high-availability/tests/hook-suite.sh (100%) delete mode 100644 website/docs/resiliency/high-availability/01-setup.md delete mode 100644 website/docs/resiliency/high-availability/06-az-setup.md diff --git a/cluster/eksctl/cluster.yaml b/cluster/eksctl/cluster.yaml index b038c2441..8306530c2 100644 --- a/cluster/eksctl/cluster.yaml +++ b/cluster/eksctl/cluster.yaml @@ -32,7 +32,7 @@ managedNodeGroups: instanceType: m5.large privateNetworking: true # had to remove use make create - #releaseVersion: "1.30.0-20240625" + releaseVersion: "1.30.0-20240625" updateConfig: maxUnavailablePercentage: 50 labels: diff --git a/manifests/modules/observability/resiliency/.workshop/cleanup.sh b/manifests/modules/observability/resiliency/.workshop/cleanup.sh new file mode 100755 index 000000000..1bb63ce1e --- /dev/null +++ b/manifests/modules/observability/resiliency/.workshop/cleanup.sh @@ -0,0 +1,77 @@ +#!/bin/bash + +set -e + +echo "Starting cleanup process..." + +# Function to safely delete a resource +safe_delete() { + local cmd=$1 + local resource=$2 + echo "Attempting to delete $resource..." + if $cmd 2>/dev/null; then + echo "$resource deleted successfully." + else + echo "Failed to delete $resource or it doesn't exist. Continuing..." + fi +} + +# Delete Kubernetes resources +echo "Cleaning up Kubernetes resources..." +kubectl delete ingress,deployment,service -n ui --all --ignore-not-found +kubectl delete role,rolebinding -n ui --all --ignore-not-found +kubectl delete namespace chaos-mesh --ignore-not-found + +# Uninstall Helm charts +echo "Uninstalling Helm charts..." +helm uninstall aws-load-balancer-controller -n kube-system || true +helm uninstall chaos-mesh -n chaos-mesh || true + +# Delete ALBs +echo "Cleaning up ALBs..." +for alb_arn in $(aws elbv2 describe-load-balancers --query "LoadBalancers[?starts_with(LoadBalancerName, 'k8s-ui-ui-') || starts_with(LoadBalancerName, 'k8s-default-ui-')].LoadBalancerArn" --output text); do + safe_delete "aws elbv2 delete-load-balancer --load-balancer-arn $alb_arn" "ALB $alb_arn" +done + +# Delete IAM Roles and Policies +echo "Cleaning up IAM roles and policies..." 
+for role_prefix in "fis-execution-role-eks-workshop" "canary-execution-role-eks-workshop"; do + for role in $(aws iam list-roles --query "Roles[?starts_with(RoleName, '${role_prefix}')].RoleName" --output text); do + echo "Processing role: $role" + for policy in $(aws iam list-attached-role-policies --role-name $role --query "AttachedPolicies[*].PolicyArn" --output text); do + safe_delete "aws iam detach-role-policy --role-name $role --policy-arn $policy" "attached policy $policy from role $role" + done + for policy in $(aws iam list-role-policies --role-name $role --query "PolicyNames" --output text); do + safe_delete "aws iam delete-role-policy --role-name $role --policy-name $policy" "inline policy $policy from role $role" + done + safe_delete "aws iam delete-role --role-name $role" "IAM role $role" + done +done + +for policy_prefix in "eks-resiliency-fis-policy" "eks-resiliency-canary-policy"; do + for policy_arn in $(aws iam list-policies --scope Local --query "Policies[?starts_with(PolicyName, '${policy_prefix}')].Arn" --output text); do + safe_delete "aws iam delete-policy --policy-arn $policy_arn" "IAM policy $policy_arn" + done +done + +# Delete S3 buckets +echo "Cleaning up S3 buckets..." +for bucket in $(aws s3api list-buckets --query "Buckets[?starts_with(Name, 'eks-workshop-canary-artifacts-')].Name" --output text); do + aws s3 rm s3://$bucket --recursive + safe_delete "aws s3api delete-bucket --bucket $bucket" "S3 bucket $bucket" +done + +# Delete CloudWatch Synthetics canary and alarm +CANARY_NAME="eks-workshop-canary" +ALARM_NAME="eks-workshop-canary-alarm" + +echo "Cleaning up CloudWatch Synthetics canary and alarm..." +if aws synthetics get-canary --name $CANARY_NAME &>/dev/null; then + aws synthetics stop-canary --name $CANARY_NAME || true + sleep 30 + safe_delete "aws synthetics delete-canary --name $CANARY_NAME" "CloudWatch Synthetics canary $CANARY_NAME" +fi + +safe_delete "aws cloudwatch delete-alarms --alarm-names $ALARM_NAME" "CloudWatch alarm $ALARM_NAME" + +echo "Cleanup process completed. Please check for any remaining resources manually." 
\ No newline at end of file diff --git a/manifests/modules/resiliency/.workshop/terraform/main.tf b/manifests/modules/observability/resiliency/.workshop/terraform/main.tf similarity index 90% rename from manifests/modules/resiliency/.workshop/terraform/main.tf rename to manifests/modules/observability/resiliency/.workshop/terraform/main.tf index ae6da7511..4da3d5fde 100644 --- a/manifests/modules/resiliency/.workshop/terraform/main.tf +++ b/manifests/modules/observability/resiliency/.workshop/terraform/main.tf @@ -93,7 +93,7 @@ resource "kubernetes_role_binding" "chaos_mesh_rolebinding" { # Add AWS Load Balancer controller resource "helm_release" "aws_load_balancer_controller" { name = "aws-load-balancer-controller" - repository = "https:#aws.github.io/eks-charts" + repository = "https://aws.github.io/eks-charts" chart = "aws-load-balancer-controller" namespace = "kube-system" version = var.load_balancer_controller_chart_version @@ -116,15 +116,15 @@ resource "helm_release" "aws_load_balancer_controller" { # Chaos Mesh Helm Release -resource "helm_release" "chaos_mesh" { - name = "chaos-mesh" - repository = "https:#charts.chaos-mesh.org" - chart = "chaos-mesh" - namespace = "chaos-mesh" - version = "2.5.1" - - create_namespace = true -} +#resource "helm_release" "chaos_mesh" { +# name = "chaos-mesh" +# repository = "https://charts.chaos-mesh.org" +# chart = "chaos-mesh" +# namespace = "chaos-mesh" +# version = "2.5.1" +# +# create_namespace = true +#} # FIS IAM role resource "random_id" "suffix" { @@ -141,10 +141,7 @@ resource "aws_iam_role" "fis_role" { Effect = "Allow" Principal = { Service = [ - "fis.amazonaws.com", - # for second region - "ec2.amazonaws.com", - "eks.amazonaws.com" + "fis.amazonaws.com" ] } Action = "sts:AssumeRole" @@ -241,6 +238,8 @@ resource "aws_iam_policy" "eks_resiliency_fis_policy" { "autoscaling:DescribeAutoScalingGroups", "autoscaling:DescribeAutoScalingInstances", "autoscaling:SetDesiredCapacity", + "autoscaling:SuspendProcesses", + "autoscaling:ResumeProcesses", "logs:CreateLogDelivery", "logs:GetLogDelivery", "logs:UpdateLogDelivery", @@ -249,7 +248,8 @@ resource "aws_iam_policy" "eks_resiliency_fis_policy" { "ssm:StartAutomationExecution", "ssm:GetAutomationExecution", "cloudwatch:DescribeAlarms", - "cloudwatch:GetMetricData" + "cloudwatch:GetMetricData", + "iam:PassRole" ] Resource = "*" }, @@ -331,7 +331,15 @@ resource "aws_iam_policy" "eks_resiliency_canary_policy" { "logs:PutLogEvents", "logs:DescribeLogGroups", "logs:DescribeLogStreams", - "lambda:InvokeFunction" + "lambda:CreateFunction", + "lambda:UpdateFunctionCode", + "lambda:UpdateFunctionConfiguration", + "lambda:GetFunction", + "lambda:DeleteFunction", + "lambda:InvokeFunction", + "lambda:AddPermission", + "lambda:RemovePermission", + "iam:PassRole" ] Resource = "*" } @@ -377,3 +385,17 @@ resource "aws_iam_role_policy_attachment" "eks_vpc_resource_controller" { policy_arn = "arn:aws:iam::aws:policy/AmazonEKSVPCResourceController" role = aws_iam_role.eks_cluster_role.name } + +# Executable Scripts +resource "null_resource" "chmod_all_scripts_bash" { + provisioner "local-exec" { + command = "find ${var.script_dir} -type f -exec chmod +x {} + || true" + } +} + +# Add Region terraform +data "aws_region" "current" {} + + + + diff --git a/manifests/modules/resiliency/.workshop/terraform/outputs.tf b/manifests/modules/observability/resiliency/.workshop/terraform/outputs.tf similarity index 77% rename from manifests/modules/resiliency/.workshop/terraform/outputs.tf rename to 
manifests/modules/observability/resiliency/.workshop/terraform/outputs.tf index a584978a7..8538519e6 100644 --- a/manifests/modules/resiliency/.workshop/terraform/outputs.tf +++ b/manifests/modules/observability/resiliency/.workshop/terraform/outputs.tf @@ -5,10 +5,10 @@ output "environment_variables" { LBC_ROLE_ARN = module.eks_blueprints_addons.aws_load_balancer_controller.iam_role_arn FIS_ROLE_ARN = aws_iam_role.fis_role.arn RANDOM_SUFFIX = random_id.suffix.hex - SCRIPT_DIR = "/manifests/modules/resiliency/scripts" + SCRIPT_DIR = var.script_dir CANARY_ROLE_ARN = aws_iam_role.canary_role.arn EKS_CLUSTER_ROLE_ARN = aws_iam_role.eks_cluster_role.arn - PRIMARY_REGION = "us-west-2" - SECONDARY_REGION = "us-east-2" + AWS_REGION = data.aws_region.current.name } } + diff --git a/manifests/modules/resiliency/.workshop/terraform/vars.tf b/manifests/modules/observability/resiliency/.workshop/terraform/vars.tf similarity index 85% rename from manifests/modules/resiliency/.workshop/terraform/vars.tf rename to manifests/modules/observability/resiliency/.workshop/terraform/vars.tf index 42bd4d060..f0b4e480c 100644 --- a/manifests/modules/resiliency/.workshop/terraform/vars.tf +++ b/manifests/modules/observability/resiliency/.workshop/terraform/vars.tf @@ -41,3 +41,9 @@ variable "load_balancer_controller_chart_version" { default = "1.8.1" } +# Executable Scripts +variable "script_dir" { + description = "Directory where scripts are located" + type = string + default = "/manifests/modules/observability/resiliency/scripts" +} \ No newline at end of file diff --git a/manifests/modules/resiliency/high-availability/config/kustomization.yaml b/manifests/modules/observability/resiliency/high-availability/config/kustomization.yaml similarity index 70% rename from manifests/modules/resiliency/high-availability/config/kustomization.yaml rename to manifests/modules/observability/resiliency/high-availability/config/kustomization.yaml index b71687089..deae0ee7f 100644 --- a/manifests/modules/resiliency/high-availability/config/kustomization.yaml +++ b/manifests/modules/observability/resiliency/high-availability/config/kustomization.yaml @@ -2,7 +2,7 @@ apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization resources: - - ../../../../../manifests/base-application/ui + - ../../../../../../manifests/base-application/ui patches: - path: scale_and_affinity_patch.yaml diff --git a/manifests/modules/resiliency/high-availability/config/scale_and_affinity_patch.yaml b/manifests/modules/observability/resiliency/high-availability/config/scale_and_affinity_patch.yaml similarity index 100% rename from manifests/modules/resiliency/high-availability/config/scale_and_affinity_patch.yaml rename to manifests/modules/observability/resiliency/high-availability/config/scale_and_affinity_patch.yaml diff --git a/manifests/modules/observability/resiliency/scripts/AZ-verify-clusters.sh b/manifests/modules/observability/resiliency/scripts/AZ-verify-clusters.sh new file mode 100755 index 000000000..a136332b2 --- /dev/null +++ b/manifests/modules/observability/resiliency/scripts/AZ-verify-clusters.sh @@ -0,0 +1,110 @@ +#!/bin/bash +# verify-cluster.sh - Verifies cluster state and corrects replica count + +DESIRED_REPLICAS=9 +MAX_WAIT_TIME=300 # 5 minutes +POLL_INTERVAL=10 # 10 seconds +NAMESPACE="ui" +EXPECTED_READY_NODES=6 + +print_header() { + echo -e "\n==== $1 ====\n" +} + +wait_for_condition() { + local end_time=$((SECONDS + MAX_WAIT_TIME)) + while [ $SECONDS -lt $end_time ]; do + if eval "$1"; then + return 0 + fi + echo -n "." 
+ sleep $POLL_INTERVAL + done + echo " Timeout!" + return 1 +} + +print_header "Checking Current Pod Distribution" +$SCRIPT_DIR/get-pods-by-az.sh + +print_header "Waiting for nodes to be Ready" +total_nodes=$(kubectl get nodes --no-headers | wc -l) +echo "Total nodes in the cluster: $total_nodes" +echo "Waiting for $EXPECTED_READY_NODES nodes to be in Ready state" +if wait_for_condition "[ \$(kubectl get nodes --no-headers | grep ' Ready ' | wc -l) -eq $EXPECTED_READY_NODES ]"; then + echo -e "\n✅ $EXPECTED_READY_NODES nodes are in Ready state." +else + echo -e "\n⚠️ Warning: $EXPECTED_READY_NODES nodes did not reach Ready state within the timeout period." + exit 1 +fi + +print_header "Checking Current Pod Distribution" +$SCRIPT_DIR/get-pods-by-az.sh + +print_header "Node Information" +kubectl get nodes -o wide + +print_header "Verifying Cluster State" +node_count=$(kubectl get nodes --no-headers | grep " Ready " | grep -vc "SchedulingDisabled") +current_pod_count=$(kubectl get pods -n $NAMESPACE -l app=ui --no-headers | grep -v Terminating | wc -l) + +echo "Ready and schedulable nodes: $node_count" +echo "Current active ui pods: $current_pod_count" +echo "Desired ui pods: $DESIRED_REPLICAS" + +if [ $current_pod_count -ne $DESIRED_REPLICAS ]; then + print_header "Adjusting Replica Count" + echo "Scaling deployment to $DESIRED_REPLICAS replicas..." + kubectl scale deployment ui -n $NAMESPACE --replicas=$DESIRED_REPLICAS + + echo -n "Waiting for pod count to stabilize" + if wait_for_condition "[ \$(kubectl get pods -n $NAMESPACE -l app=ui --no-headers | grep -v Terminating | wc -l) -eq $DESIRED_REPLICAS ]"; then + echo -e "\n✅ Pod count has reached the desired number." + else + echo -e "\n⚠️ Warning: Failed to reach desired pod count within the timeout period." + fi +else + echo "✅ Number of replicas is correct." +fi + +print_header "Checking Pod Distribution" +if [ $node_count -gt 0 ]; then + max_pods_per_node=$((DESIRED_REPLICAS / node_count + 1)) + uneven_distribution=false + + for node in $(kubectl get nodes -o name | grep -v "SchedulingDisabled"); do + pods_on_node=$(kubectl get pods -n $NAMESPACE -l app=ui --field-selector spec.nodeName=${node#node/} --no-headers | grep -v Terminating | wc -l) + if [ $pods_on_node -gt $max_pods_per_node ]; then + uneven_distribution=true + break + fi + done + + if $uneven_distribution; then + echo "⚠️ Pod distribution is uneven. Rebalancing..." + kubectl scale deployment ui -n $NAMESPACE --replicas=0 + sleep $POLL_INTERVAL + kubectl scale deployment ui -n $NAMESPACE --replicas=$DESIRED_REPLICAS + + echo -n "Waiting for pods to be ready" + if wait_for_condition "[ \$(kubectl get pods -n $NAMESPACE -l app=ui --no-headers | grep Running | wc -l) -eq $DESIRED_REPLICAS ]"; then + echo -e "\n✅ Pods are ready and balanced." + else + echo -e "\n⚠️ Warning: Pods did not reach ready state within the timeout period." + fi + else + echo "✅ Pod distribution is balanced." + fi +else + echo "⚠️ Warning: No Ready and schedulable nodes found. Cannot check pod distribution." +fi + +print_header "Final Pod Distribution" +$SCRIPT_DIR/get-pods-by-az.sh + +echo +if [ $node_count -gt 0 ] && [ $current_pod_count -eq $DESIRED_REPLICAS ]; then + echo "✅ Cluster verification and correction complete." +else + echo "⚠️ Cluster verification complete, but some issues may require attention." 
+fi \ No newline at end of file diff --git a/manifests/modules/observability/resiliency/scripts/create-blueprint.sh b/manifests/modules/observability/resiliency/scripts/create-blueprint.sh new file mode 100755 index 000000000..4f8ab5112 --- /dev/null +++ b/manifests/modules/observability/resiliency/scripts/create-blueprint.sh @@ -0,0 +1,114 @@ +#!/bin/bash + +# Get Ingress URL +INGRESS_URL=$(kubectl get ingress -n ui -o jsonpath='{.items[0].status.loadBalancer.ingress[0].hostname}') + +# Create the required directory structure +mkdir -p nodejs/node_modules + +# Create the Node.js canary script with heartbeat blueprint +cat << EOF > nodejs/node_modules/canary.js +const { URL } = require('url'); +const synthetics = require('Synthetics'); +const log = require('SyntheticsLogger'); +const syntheticsConfiguration = synthetics.getConfiguration(); +const syntheticsLogHelper = require('SyntheticsLogHelper'); + +const loadBlueprint = async function () { + const urls = ['http://${INGRESS_URL}']; + + // Set screenshot option + const takeScreenshot = true; + + // Configure synthetics settings + syntheticsConfiguration.disableStepScreenshots(); + syntheticsConfiguration.setConfig({ + continueOnStepFailure: true, + includeRequestHeaders: true, + includeResponseHeaders: true, + restrictedHeaders: [], + restrictedUrlParameters: [] + }); + + let page = await synthetics.getPage(); + + for (const url of urls) { + await loadUrl(page, url, takeScreenshot); + } +}; + +// Reset the page in-between +const resetPage = async function(page) { + try { + await page.goto('about:blank', {waitUntil: ['load', 'networkidle0'], timeout: 30000}); + } catch (e) { + synthetics.addExecutionError('Unable to open a blank page. ', e); + } +}; + +const loadUrl = async function (page, url, takeScreenshot) { + let stepName = null; + let domcontentloaded = false; + + try { + stepName = new URL(url).hostname; + } catch (e) { + const errorString = \`Error parsing url: \${url}. \${e}\`; + log.error(errorString); + throw e; + } + + await synthetics.executeStep(stepName, async function () { + const sanitizedUrl = syntheticsLogHelper.getSanitizedUrl(url); + + const response = await page.goto(url, { waitUntil: ['domcontentloaded'], timeout: 30000}); + if (response) { + domcontentloaded = true; + const status = response.status(); + const statusText = response.statusText(); + + logResponseString = \`Response from url: \${sanitizedUrl} Status: \${status} Status Text: \${statusText}\`; + + if (response.status() < 200 || response.status() > 299) { + throw new Error(\`Failed to load url: \${sanitizedUrl} \${response.status()} \${response.statusText()}\`); + } + } else { + const logNoResponseString = \`No response returned for url: \${sanitizedUrl}\`; + log.error(logNoResponseString); + throw new Error(logNoResponseString); + } + }); + + // Wait for 15 seconds to let page load fully before taking screenshot. + if (domcontentloaded && takeScreenshot) { + await new Promise(r => setTimeout(r, 15000)); + await synthetics.takeScreenshot(stepName, 'loaded'); + } + + // Reset page + await resetPage(page); +}; + +exports.handler = async () => { + return await loadBlueprint(); +}; +EOF + +# Zip the Node.js script +python3 - << EOL +import zipfile +with zipfile.ZipFile('canary.zip', 'w') as zipf: + zipf.write('nodejs/node_modules/canary.js', arcname='nodejs/node_modules/canary.js') +EOL + +# Ensure BUCKET_NAME is set +if [ -z "$BUCKET_NAME" ]; then + echo "Error: BUCKET_NAME environment variable is not set." 
+ exit 1 +fi + +# Upload the zipped canary script to S3 +aws s3 cp canary.zip "s3://${BUCKET_NAME}/canary-scripts/canary.zip" + +echo "Canary script has been zipped and uploaded to s3://${BUCKET_NAME}/canary-scripts/canary.zip" +echo "The script is configured to check the URL: http://${INGRESS_URL}" diff --git a/manifests/modules/resiliency/scripts/get-pods-by-az.sh b/manifests/modules/observability/resiliency/scripts/get-pods-by-az.sh similarity index 100% rename from manifests/modules/resiliency/scripts/get-pods-by-az.sh rename to manifests/modules/observability/resiliency/scripts/get-pods-by-az.sh diff --git a/manifests/modules/resiliency/scripts/node-failure.sh b/manifests/modules/observability/resiliency/scripts/node-failure.sh similarity index 100% rename from manifests/modules/resiliency/scripts/node-failure.sh rename to manifests/modules/observability/resiliency/scripts/node-failure.sh diff --git a/manifests/modules/resiliency/scripts/pod-failure.sh b/manifests/modules/observability/resiliency/scripts/pod-failure.sh similarity index 100% rename from manifests/modules/resiliency/scripts/pod-failure.sh rename to manifests/modules/observability/resiliency/scripts/pod-failure.sh diff --git a/manifests/modules/observability/resiliency/scripts/testing.sh b/manifests/modules/observability/resiliency/scripts/testing.sh new file mode 100644 index 000000000..b42708e1f --- /dev/null +++ b/manifests/modules/observability/resiliency/scripts/testing.sh @@ -0,0 +1,31 @@ +ZONE_EXP_ID=$(aws fis create-experiment-template \ + --cli-input-json '{ + "description": "publicdocument-azfailure", + "targets": {}, + "actions": { + "azfailure": { + "actionId": "aws:ssm:start-automation-execution", + "parameters": { + "documentArn": "arn:aws:ssm:us-west-2::document/AWSResilienceHub-SimulateAzOutageInAsgTest_2020-07-23", + "documentParameters": "{ + \"AutoScalingGroupName\":\"'$ASG_NAME'\", + \"CanaryAlarmName\":\"eks-workshop-canary-alarm\", + \"AutomationAssumeRole\":\"'$FIS_ROLE_ARN'\", + \"IsRollback\":\"false\", + \"TestDurationInMinutes\":\"2\" + }", + "maxDuration": "PT6M" + } + } + }, + "stopConditions": [ + { + "source": "none" + } + ], + "roleArn": "'$FIS_ROLE_ARN'", + "tags": { + "ExperimentSuffix": "'$RANDOM_SUFFIX'" + } + }' \ + --output json | jq -r '.experimentTemplate.id') \ No newline at end of file diff --git a/manifests/modules/resiliency/scripts/verify-cluster.sh b/manifests/modules/observability/resiliency/scripts/verify-cluster.sh similarity index 100% rename from manifests/modules/resiliency/scripts/verify-cluster.sh rename to manifests/modules/observability/resiliency/scripts/verify-cluster.sh diff --git a/manifests/modules/resiliency/.workshop/cleanup.sh b/manifests/modules/resiliency/.workshop/cleanup.sh deleted file mode 100755 index 537a7d260..000000000 --- a/manifests/modules/resiliency/.workshop/cleanup.sh +++ /dev/null @@ -1,132 +0,0 @@ -#!/bin/bash - -set -e - -echo "Starting cleanup process..." - -# Function to safely delete a resource -safe_delete() { - local cmd=$1 - local resource=$2 - echo "Attempting to delete $resource..." - if $cmd 2>/dev/null; then - echo "$resource deleted successfully." - else - echo "Failed to delete $resource or it doesn't exist. Continuing..." - fi -} - -# Function to wait for resource deletion -wait_for_deletion() { - local check_cmd=$1 - local resource=$2 - local max_attempts=30 - local attempt=0 - echo "Waiting for $resource to be deleted..." 
- while $check_cmd &>/dev/null && [ $attempt -lt $max_attempts ]; do - sleep 10 - ((attempt++)) - done - if [ $attempt -eq $max_attempts ]; then - echo "Timeout waiting for $resource to be deleted." - else - echo "$resource deleted successfully." - fi -} - -# Function to cleanup EKS resources in a region -cleanup_eks_region() { - local region=$1 - local cluster_name=$2 - local nodegroup_name=$3 - local delete_cluster=$4 - - echo "Cleaning up EKS resources in $region..." - - # Switch to the specified region - aws configure set default.region $region - - # Delete Kubernetes resources - echo "Cleaning up Kubernetes resources..." - kubectl delete ingress,deployment,service -n ui --all --ignore-not-found - kubectl delete role,rolebinding -n ui --all --ignore-not-found - kubectl delete namespace chaos-mesh --ignore-not-found - - # Delete EKS Cluster and Node Group if specified - if [ "$delete_cluster" = true ]; then - echo "Attempting to delete EKS cluster and node group..." - if aws eks describe-cluster --name $cluster_name --region $region &>/dev/null; then - aws eks delete-nodegroup --cluster-name $cluster_name --nodegroup-name $nodegroup_name --region $region || true - wait_for_deletion "aws eks describe-nodegroup --cluster-name $cluster_name --nodegroup-name $nodegroup_name --region $region" "node group" - - aws eks delete-cluster --name $cluster_name --region $region - wait_for_deletion "aws eks describe-cluster --name $cluster_name --region $region" "EKS cluster" - else - echo "EKS cluster $cluster_name not found in $region. Skipping deletion." - fi - else - echo "Skipping EKS cluster and node group deletion in $region as requested." - fi - - # Uninstall Helm charts - echo "Uninstalling Helm charts..." - helm uninstall aws-load-balancer-controller -n kube-system || true - helm uninstall chaos-mesh -n chaos-mesh || true - - # Delete ALBs - echo "Cleaning up ALBs in $region..." - for alb_arn in $(aws elbv2 describe-load-balancers --query "LoadBalancers[?starts_with(LoadBalancerName, 'k8s-ui-ui-') || starts_with(LoadBalancerName, 'k8s-default-ui-')].LoadBalancerArn" --output text); do - safe_delete "aws elbv2 delete-load-balancer --load-balancer-arn $alb_arn" "ALB $alb_arn" - done -} - -# Cleanup in PRIMARY_REGION (preserve cluster and node groups) -cleanup_eks_region $PRIMARY_REGION "eks-workshop" "default" false - -# Cleanup in SECONDARY_REGION (full cleanup) -cleanup_eks_region $SECONDARY_REGION "eks-workshop-east" "us-east-2-node-group" true - -# Global cleanup (not region-specific) - -# Delete IAM Roles and Policies -echo "Cleaning up IAM roles and policies..." 
-for role_prefix in "fis-execution-role-eks-workshop" "canary-execution-role-eks-workshop"; do - for role in $(aws iam list-roles --query "Roles[?starts_with(RoleName, '${role_prefix}')].RoleName" --output text); do - echo "Processing role: $role" - for policy in $(aws iam list-attached-role-policies --role-name $role --query "AttachedPolicies[*].PolicyArn" --output text); do - safe_delete "aws iam detach-role-policy --role-name $role --policy-arn $policy" "attached policy $policy from role $role" - done - for policy in $(aws iam list-role-policies --role-name $role --query "PolicyNames" --output text); do - safe_delete "aws iam delete-role-policy --role-name $role --policy-name $policy" "inline policy $policy from role $role" - done - safe_delete "aws iam delete-role --role-name $role" "IAM role $role" - done -done - -for policy_prefix in "eks-resiliency-fis-policy" "eks-resiliency-canary-policy"; do - for policy_arn in $(aws iam list-policies --scope Local --query "Policies[?starts_with(PolicyName, '${policy_prefix}')].Arn" --output text); do - safe_delete "aws iam delete-policy --policy-arn $policy_arn" "IAM policy $policy_arn" - done -done - -# Delete S3 buckets -echo "Cleaning up S3 buckets..." -for bucket in $(aws s3api list-buckets --query "Buckets[?starts_with(Name, 'eks-workshop-canary-artifacts-')].Name" --output text); do - aws s3 rm s3://$bucket --recursive - safe_delete "aws s3api delete-bucket --bucket $bucket --region $PRIMARY_REGION" "S3 bucket $bucket" -done - -# Delete CloudWatch Synthetics canary and alarm -CANARY_NAME="eks-workshop-canary" -ALARM_NAME="eks-workshop-canary-alarm" - -echo "Cleaning up CloudWatch Synthetics canary and alarm..." -if aws synthetics get-canary --name $CANARY_NAME --region $PRIMARY_REGION &>/dev/null; then - aws synthetics stop-canary --name $CANARY_NAME --region $PRIMARY_REGION || true - sleep 30 - safe_delete "aws synthetics delete-canary --name $CANARY_NAME --region $PRIMARY_REGION" "CloudWatch Synthetics canary $CANARY_NAME" -fi - -safe_delete "aws cloudwatch delete-alarms --alarm-names $ALARM_NAME --region $PRIMARY_REGION" "CloudWatch alarm $ALARM_NAME" - -echo "Cleanup process completed. Please check for any remaining resources manually." 
\ No newline at end of file diff --git a/manifests/modules/resiliency/high-availability/multi_az/add_us_east_2_patch.yaml b/manifests/modules/resiliency/high-availability/multi_az/add_us_east_2_patch.yaml deleted file mode 100644 index b2a276fde..000000000 --- a/manifests/modules/resiliency/high-availability/multi_az/add_us_east_2_patch.yaml +++ /dev/null @@ -1,41 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: ui - namespace: ui -spec: - replicas: 9 # Total number of replicas - template: - spec: - affinity: - nodeAffinity: - preferredDuringSchedulingIgnoredDuringExecution: - - weight: 60 - preference: - matchExpressions: - - key: topology.kubernetes.io/zone - operator: In - values: - - us-west-2a - - us-west-2b - - us-west-2c - - weight: 40 - preference: - matchExpressions: - - key: topology.kubernetes.io/zone - operator: In - values: - - us-east-2a - - us-east-2b - - us-east-2c - podAntiAffinity: - preferredDuringSchedulingIgnoredDuringExecution: - - weight: 100 - podAffinityTerm: - labelSelector: - matchExpressions: - - key: app - operator: In - values: - - ui - topologyKey: "kubernetes.io/hostname" diff --git a/manifests/modules/resiliency/high-availability/multi_az/kustomization.yaml b/manifests/modules/resiliency/high-availability/multi_az/kustomization.yaml deleted file mode 100644 index 32bf6179b..000000000 --- a/manifests/modules/resiliency/high-availability/multi_az/kustomization.yaml +++ /dev/null @@ -1,8 +0,0 @@ -apiVersion: kustomize.config.k8s.io/v1beta1 -kind: Kustomization - -resources: - - ../../../../../manifests/base-application/ui - -patches: - - path: add_us_east_2_patch.yaml diff --git a/manifests/modules/resiliency/scripts/create-second-az.sh b/manifests/modules/resiliency/scripts/create-second-az.sh deleted file mode 100755 index 09d9c28bb..000000000 --- a/manifests/modules/resiliency/scripts/create-second-az.sh +++ /dev/null @@ -1,52 +0,0 @@ -#!/bin/bash - -# Ensure SCRIPT_DIR is set -if [ -z "$SCRIPT_DIR" ]; then - echo "Error: SCRIPT_DIR environment variable is not set." - exit 1 -fi - -# Ensure PRIMARY_REGION and SECONDARY_REGION are set -if [ -z "$PRIMARY_REGION" ] || [ -z "$SECONDARY_REGION" ]; then - echo "Error: PRIMARY_REGION and SECONDARY_REGION must be set." - exit 1 -fi - -# Function to run multi-az-get-pods.sh and display region -run_multi_az_script() { - local region=$1 - echo "Current region: $region" - echo "Running multi-az-get-pods.sh..." - $SCRIPT_DIR/multi-az-get-pods.sh - echo "----------------------------------------" -} - -# Run multi-az-get-pods.sh in PRIMARY_REGION -aws configure set default.region $PRIMARY_REGION -run_multi_az_script $PRIMARY_REGION - -# Switch to SECONDARY_REGION -echo "Switching to SECONDARY_REGION: $SECONDARY_REGION" -aws configure set default.region $SECONDARY_REGION - -# Prepare environment for resiliency module -echo "Preparing environment for resiliency module..." -prepare-environment resiliency - -# Verify the EKS cluster in SECONDARY_REGION -echo "Verifying EKS cluster in SECONDARY_REGION..." 
-aws eks list-clusters - -# Check node groups in SECONDARY_REGION -CLUSTER_NAME=$(aws eks list-clusters --query 'clusters[0]' --output text) -echo "Checking node groups for cluster: $CLUSTER_NAME" -aws eks list-nodegroups --cluster-name $CLUSTER_NAME - -# Switch back to PRIMARY_REGION -echo "Switching back to PRIMARY_REGION: $PRIMARY_REGION" -aws configure set default.region $PRIMARY_REGION - -# Run multi-az-get-pods.sh one last time in PRIMARY_REGION -run_multi_az_script $PRIMARY_REGION - -echo "Setup complete. \ No newline at end of file diff --git a/manifests/modules/resiliency/scripts/eks_workshop_canary_script.js b/manifests/modules/resiliency/scripts/eks_workshop_canary_script.js deleted file mode 100644 index 74deb4591..000000000 --- a/manifests/modules/resiliency/scripts/eks_workshop_canary_script.js +++ /dev/null @@ -1,30 +0,0 @@ -const synthetics = require("Synthetics"); -const log = require("SyntheticsLogger"); - -const pageLoadBlueprint = async function () { - const PAGE_LOAD_TIMEOUT = 30; - const URL = process.env.INGRESS_URL || "http://localhost"; // Use environment variable or fallback - - let page = await synthetics.getPage(); - - await synthetics.executeStep("Navigate to " + URL, async function () { - const response = await page.goto(URL, { - waitUntil: "domcontentloaded", - timeout: PAGE_LOAD_TIMEOUT * 1000, - }); - - // Verify the page loaded successfully - if (response.status() !== 200) { - throw new Error(`Failed to load page. Status code: ${response.status()}`); - } - }); - - await synthetics.executeStep("Verify page content", async function () { - const pageTitle = await page.title(); - log.info("Page title: " + pageTitle); - }); -}; - -exports.handler = async () => { - return await pageLoadBlueprint(); -}; diff --git a/manifests/modules/resiliency/scripts/multi-az-get-pods.sh b/manifests/modules/resiliency/scripts/multi-az-get-pods.sh deleted file mode 100755 index f47649eb8..000000000 --- a/manifests/modules/resiliency/scripts/multi-az-get-pods.sh +++ /dev/null @@ -1,26 +0,0 @@ -#!/bin/bash - -GREEN='\033[0;32m' -RED='\033[0;31m' -NC='\033[0m' # No Color - -REGIONS=("us-west-2" "us-east-2") - -for REGION in "${REGIONS[@]}" -do - echo "Region: $REGION" - for az in a b c - do - AZ=$REGION$az - echo -n "------" - echo -n -e "${GREEN}$AZ${NC}" - echo "------" - for node in $(kubectl get nodes -l topology.kubernetes.io/zone=$AZ --no-headers 2>/dev/null | grep -v NotReady | cut -d " " -f1) - do - echo -e " ${RED}$node:${NC}" - kubectl get pods -n ui --no-headers --field-selector spec.nodeName=${node} 2>/dev/null | while read line; do echo " ${line}"; done - done - echo "" - done - echo "" -done \ No newline at end of file diff --git a/website/docs/resiliency/high-availability/index.md b/website/docs/observability/high-availability/00-setup.md similarity index 65% rename from website/docs/resiliency/high-availability/index.md rename to website/docs/observability/high-availability/00-setup.md index 6dec235d9..cf774351b 100644 --- a/website/docs/resiliency/high-availability/index.md +++ b/website/docs/observability/high-availability/00-setup.md @@ -1,16 +1,14 @@ --- title: "High Availability" -sidebar_position: 20 -sidebar_custom_props: { "module": true } +sidebar_position: 1 description: "Prepare your EKS environment to handle high availability scenarios effectively." 
--- :::tip Before you start Prepare your environment for this section: -```bash timeout=300 wait=30 -$ /manifests/modules/resiliency/.workshop/cleanup.sh -$ prepare-environment resiliency +```bash timeout=600 wait=30 +$ prepare-environment observability/resiliency ``` This will make the following changes to your lab environment: @@ -18,10 +16,9 @@ This will make the following changes to your lab environment: - Create the ingress load balancer - Create RBAC and Rolebindings - Install AWS Load Balancer controller -- Install ChaosMesh - Create an IAM role for AWS Fault Injection Simulator (FIS) -You can view the Terraform that applies these changes [here](https://github.com/VAR::MANIFESTS_OWNER/VAR::MANIFESTS_REPOSITORY/tree/VAR::MANIFESTS_REF/manifests/modules/resiliency/.workshop/terraform). +You can view the Terraform that applies these changes [here](https://github.com/VAR::MANIFESTS_OWNER/VAR::MANIFESTS_REPOSITORY/tree/VAR::MANIFESTS_REF/manifests/modules/observability/resiliency/.workshop/terraform). ::: ## Lab Overview @@ -30,11 +27,11 @@ In this lab, we'll explore various high availability scenarios and test the resi The experiments we'll conduct include: -1. Pod Failure Simulation: Using ChaosMesh to test your application's resilience to individual pod failures. -2. Node Failure without FIS: Manually simulating a node failure to observe Kubernetes' self-healing capabilities. -3. Partial Node Failure with FIS: Leveraging AWS Fault Injection Simulator to create a more controlled node failure scenario. -4. Complete Node Failure with FIS: Testing your cluster's response to a catastrophic failure of all nodes. -5. Availability Zone Failure: Simulating the loss of an entire AZ to validate your multi-AZ deployment strategy. +1. **Pod Failure Simulation**: Using ChaosMesh to test your application's resilience to individual pod failures. +2. **Node Failure without FIS**: Manually simulating a node failure to observe Kubernetes' self-healing capabilities. +3. **Partial Node Failure with FIS**: Leveraging AWS Fault Injection Simulator to create a more controlled node failure scenario. +4. **Complete Node Failure with FIS**: Testing your cluster's response to a catastrophic failure of all nodes. +5. **Availability Zone Failure**: Simulating the loss of an entire AZ to validate your multi-AZ deployment strategy. These experiments will help you understand: @@ -50,6 +47,5 @@ For more information on the components used in this lab, check out: - [Ingress Load Balancer](/docs/fundamentals/exposing/ingress/) - [Integrating with Kubernetes RBAC](/docs/security/cluster-access-management/kubernetes-rbac) -- [Chaos Mesh](https://chaos-mesh.org/) - [AWS Fault Injection Simulator](https://aws.amazon.com/fis/) ::: diff --git a/website/docs/observability/high-availability/01-scale.md b/website/docs/observability/high-availability/01-scale.md new file mode 100644 index 000000000..03c24ccaf --- /dev/null +++ b/website/docs/observability/high-availability/01-scale.md @@ -0,0 +1,96 @@ +--- +title: "Lab Setup: Chaos Mesh, Scaling, and Pod affinity" +sidebar_position: 2 +description: "Learn how to scale your pods, add Pod Anti-Affinity configurations, and use a helper script to visualize pod distribution." +--- + +This guide outlines steps to enhance the resilience of a UI service by implementing high availability practices. We'll cover installing helm, scaling the UI service, implementing pod anti-affinity, and using a helper script to visualize pod distribution across availability zones. 
+
+## Installing Chaos Mesh
+
+To enhance our cluster's resilience testing capabilities, we'll install Chaos Mesh. Chaos Mesh is a powerful chaos engineering tool for Kubernetes environments. It allows us to simulate various failure scenarios and test how our applications respond.
+
+Let's install Chaos Mesh in our cluster using Helm:
+
+```bash timeout=180 wait=30
+$ helm repo add chaos-mesh https://charts.chaos-mesh.org
+$ helm upgrade --install chaos-mesh chaos-mesh/chaos-mesh \
+  --namespace chaos-mesh \
+  --create-namespace \
+  --version 2.5.1 \
+  --set dashboard.create=true
+Release "chaos-mesh" does not exist. Installing it now.
+NAME: chaos-mesh
+LAST DEPLOYED: Tue Aug 20 04:44:31 2024
+NAMESPACE: chaos-mesh
+STATUS: deployed
+REVISION: 1
+TEST SUITE: None
+```
+
+## Scaling and Pod Anti-Affinity
+
+We use a Kustomize patch to modify the UI deployment, scaling it to 5 replicas and adding pod anti-affinity rules. This ensures UI pods are distributed across different nodes, reducing the impact of node failures.
+
+Here's the content of our patch file:
+
+```kustomization
+modules/observability/resiliency/high-availability/config/scale_and_affinity_patch.yaml
+Deployment/ui
+```
+
+Apply the changes using the Kustomize patch and
+[Kustomization file](https://github.com/VAR::MANIFESTS_OWNER/VAR::MANIFESTS_REPOSITORY/tree/VAR::MANIFESTS_REF/manifests/modules/observability/resiliency/high-availability/config/kustomization.yaml):
+
+```bash wait=30
+$ kubectl delete deployment ui -n ui
+$ kubectl apply -k /manifests/modules/observability/resiliency/high-availability/config/
+```
+
+## Verify Retail Store Accessibility
+
+After applying these changes, it's important to verify that your retail store is accessible:
+
+```bash timeout=600 wait=30
+$ wait-for-lb $(kubectl get ingress -n ui -o jsonpath='{.items[0].status.loadBalancer.ingress[0].hostname}')
+Waiting for k8s-ui-ui-5ddc3ba496-721427594.us-west-2.elb.amazonaws.com...
+You can now access http://k8s-ui-ui-5ddc3ba496-721427594.us-west-2.elb.amazonaws.com
+```
+
+Once this command completes, it will output a URL. Open this URL in a new browser tab to verify that your retail store is accessible and functioning correctly.
+
+:::tip
+The retail store URL may take up to 10 minutes to become operational.
+:::
+
+## Helper Script: Get Pods by AZ
+
+The `get-pods-by-az.sh` script helps visualize the distribution of Kubernetes pods across different availability zones in the terminal. You can view the script file on GitHub [here](https://github.com/VAR::MANIFESTS_OWNER/VAR::MANIFESTS_REPOSITORY/tree/VAR::MANIFESTS_REF/manifests/modules/observability/resiliency/scripts/get-pods-by-az.sh).
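The heart of the script is a nested loop over the zone suffixes and the nodes in each zone. The sketch below is a simplified, uncolored version of that loop; it assumes the `AWS_REGION` environment variable is set, whereas the real script derives the region from the current kubectl context:

```bash
# Simplified core of get-pods-by-az.sh: list UI pods grouped by AZ and node.
for az in a b c; do
  AZ="${AWS_REGION}${az}"
  echo "------${AZ}------"
  for node in $(kubectl get nodes -l topology.kubernetes.io/zone=${AZ} --no-headers | grep -v NotReady | cut -d " " -f1); do
    echo "  ${node}:"
    kubectl get pods -n ui --no-headers --field-selector spec.nodeName=${node}
  done
  echo ""
done
```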
+ +### Script Execution + +To run the script and see the distribution of pods across availability zones, execute: + +```bash +$ $SCRIPT_DIR/get-pods-by-az.sh +------us-west-2a------ + ip-10-42-127-82.us-west-2.compute.internal: + ui-6dfb84cf67-6fzrk 1/1 Running 0 56s + ui-6dfb84cf67-dsp55 1/1 Running 0 56s + +------us-west-2b------ + ip-10-42-153-179.us-west-2.compute.internal: + ui-6dfb84cf67-2pxnp 1/1 Running 0 59s + +------us-west-2c------ + ip-10-42-186-246.us-west-2.compute.internal: + ui-6dfb84cf67-n8x4f 1/1 Running 0 61s + ui-6dfb84cf67-wljth 1/1 Running 0 61s +``` + +:::info +For more information on these changes, check out these sections: + +- [Chaos Mesh](https://chaos-mesh.org/) +- [Pod Affinity and Anti-Affinity](/docs/fundamentals/managed-node-groups/basics/affinity/) + ::: diff --git a/website/docs/resiliency/high-availability/02-pod-failure.md b/website/docs/observability/high-availability/02-pod-failure.md similarity index 63% rename from website/docs/resiliency/high-availability/02-pod-failure.md rename to website/docs/observability/high-availability/02-pod-failure.md index b1bcc55c8..5cbba76ef 100644 --- a/website/docs/resiliency/high-availability/02-pod-failure.md +++ b/website/docs/observability/high-availability/02-pod-failure.md @@ -1,6 +1,6 @@ --- title: "Simulating Pod Failure" -sidebar_position: 2 +sidebar_position: 3 description: "Simulate pod failure in your environment using ChaosMesh to test the resiliency of your application." --- @@ -17,15 +17,28 @@ The `pod-failure.sh` script utilizes Chaos Mesh, a powerful chaos engineering pl This experiment is repeatable, allowing you to run it multiple times to ensure consistent behavior and to test various scenarios or configurations. This is the script we will be using: ```file -manifests/modules/resiliency/scripts/pod-failure.sh +manifests/modules/observability/resiliency/scripts/pod-failure.sh ``` ## Running the Experiment To simulate the pod failure and monitor its effects, run the following command: -```bash +```bash timeout=90 wait=30 $ $SCRIPT_DIR/pod-failure.sh && SECONDS=0; while [ $SECONDS -lt 30 ]; do clear; $SCRIPT_DIR/get-pods-by-az.sh; sleep 1; done +------us-west-2a------ + ip-10-42-127-82.us-west-2.compute.internal: + ui-6dfb84cf67-dsp55 1/1 Running 0 2m10s + ui-6dfb84cf67-gzd9s 1/1 Running 0 8s + +------us-west-2b------ + ip-10-42-153-179.us-west-2.compute.internal: + ui-6dfb84cf67-2pxnp 1/1 Running 0 2m13s + +------us-west-2c------ + ip-10-42-186-246.us-west-2.compute.internal: + ui-6dfb84cf67-n8x4f 1/1 Running 0 2m17s + ui-6dfb84cf67-wljth 1/1 Running 0 2m17s ``` This command does the following: @@ -38,8 +51,14 @@ During the experiment, you should observe one pod disappearing and then reappear To get a more detailed view of the pods in the `ui` namespace, use the following command: -```bash +```bash wait=15 $ kubectl get pods -n ui -o wide +NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES +ui-6dfb84cf67-2pxnp 1/1 Running 0 2m56s 10.42.154.151 ip-10-42-153-179.us-west-2.compute.internal +ui-6dfb84cf67-dsp55 1/1 Running 0 2m56s 10.42.126.161 ip-10-42-127-82.us-west-2.compute.internal +ui-6dfb84cf67-gzd9s 1/1 Running 0 71s 10.42.126.246 ip-10-42-127-82.us-west-2.compute.internal +ui-6dfb84cf67-n8x4f 1/1 Running 0 2m56s 10.42.190.250 ip-10-42-186-246.us-west-2.compute.internal +ui-6dfb84cf67-wljth 1/1 Running 0 2m56s 10.42.190.249 ip-10-42-186-246.us-west-2.compute.internal ``` This will show you the status, IP addresses, and nodes for each pod in the `ui` namespace. 
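If you want to confirm that Kubernetes detected the killed pod and created a replacement, the recent events in the namespace tell the story. This quick check is optional and not part of the scripted lab:

```bash test=false
$ kubectl get events -n ui --sort-by=.lastTimestamp | tail -n 10
```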
@@ -48,11 +67,13 @@ This will show you the status, IP addresses, and nodes for each pod in the `ui` An essential aspect of this experiment is to ensure that your retail store application remains operational throughout the pod failure and recovery process. To verify the availability of the retail store, use the following command to fetch and access the store's URL: -```bash +```bash timeout=600 wait=30 $ wait-for-lb $(kubectl get ingress -n ui -o jsonpath='{.items[0].status.loadBalancer.ingress[0].hostname}') +Waiting for k8s-ui-ui-5ddc3ba496-721427594.us-west-2.elb.amazonaws.com... +You can now access http://k8s-ui-ui-5ddc3ba496-721427594.us-west-2.elb.amazonaws.com ``` -This command retrieves the load balancer hostname for the ingress and waits for it to become available. Once ready, you can access the retail store through this URL to confirm that it's still functioning correctly despite the simulated pod failure. +Once ready, you can access the retail store through this URL to confirm that it's still functioning correctly despite the simulated pod failure. ## Conclusion diff --git a/website/docs/resiliency/high-availability/03-node-failure-no-fis.md b/website/docs/observability/high-availability/03-node-failure-no-fis.md similarity index 74% rename from website/docs/resiliency/high-availability/03-node-failure-no-fis.md rename to website/docs/observability/high-availability/03-node-failure-no-fis.md index 494bddbd1..ac487042c 100644 --- a/website/docs/resiliency/high-availability/03-node-failure-no-fis.md +++ b/website/docs/observability/high-availability/03-node-failure-no-fis.md @@ -1,6 +1,6 @@ --- title: "Simulating Node Failure without FIS" -sidebar_position: 3 +sidebar_position: 4 description: "Manually simulate a node failure in your Kubernetes environment to test the resilience of your applications without using AWS FIS." --- @@ -13,7 +13,7 @@ This experiment simulates a node failure manually in your Kubernetes cluster to The `node-failure.sh` script will manually stop an EC2 instance to simulate node failure. Here is the script we will use: ```file -manifests/modules/resiliency/scripts/node-failure.sh +manifests/modules/observability/resiliency/scripts/node-failure.sh ``` It's important to note that this experiment is repeatable, allowing you to run it multiple times to ensure consistent behavior and to test various scenarios or configurations. @@ -22,8 +22,22 @@ It's important to note that this experiment is repeatable, allowing you to run i To simulate the node failure and monitor its effects, run the following command: -```bash +```bash timeout=180 wait=30 $ $SCRIPT_DIR/node-failure.sh && SECONDS=0; while [ $SECONDS -lt 120 ]; do clear; $SCRIPT_DIR/get-pods-by-az.sh; sleep 1; done +------us-west-2a------ + ip-10-42-127-82.us-west-2.compute.internal: + ui-6dfb84cf67-dsp55 1/1 Running 0 10m + ui-6dfb84cf67-gzd9s 1/1 Running 0 8m19s + +------us-west-2b------ + ip-10-42-133-195.us-west-2.compute.internal: + No resources found in ui namespace. + +------us-west-2c------ + ip-10-42-186-246.us-west-2.compute.internal: + ui-6dfb84cf67-4bmjm 1/1 Running 0 44s + ui-6dfb84cf67-n8x4f 1/1 Running 0 10m + ui-6dfb84cf67-wljth 1/1 Running 0 10m ``` This command will stop the selected EC2 instance and monitor the pod distribution for 2 minutes, observing how the system redistributes workloads. 
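Under the hood, simulating a node failure this way amounts to stopping the EC2 instance backing one of the worker nodes. The sketch below illustrates the idea for a single, arbitrarily chosen node; it assumes the node name matches the instance's private DNS name (as it does for the workshop's managed nodes) and is shown for understanding only, since the lab's `node-failure.sh` script already handles the selection:

```bash test=false
# Illustrative only: stop the EC2 instance behind the first node in the cluster.
$ NODE_NAME=$(kubectl get nodes -o jsonpath='{.items[0].metadata.name}')
$ INSTANCE_ID=$(aws ec2 describe-instances \
    --filters "Name=private-dns-name,Values=$NODE_NAME" \
    --query 'Reservations[0].Instances[0].InstanceId' --output text)
$ aws ec2 stop-instances --instance-ids $INSTANCE_ID
```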
@@ -40,10 +54,28 @@ Throughout this process, the total number of running pods should remain constant While waiting for the node to finish coming back online, we will verify the cluster's self-healing capabilities and potentially rebalance the pod distribution if necessary. Since the cluster often recovers on its own, we'll focus on checking the current state and ensuring an optimal distribution of pods. -Use the following [script](https://github.com/VAR::MANIFESTS_OWNER/VAR::MANIFESTS_REPOSITORY/tree/VAR::MANIFESTS_REF/manifests/modules/resiliency/scripts/verify-cluster.sh) to verify the cluster state and rebalance pods: +Use the following [script](https://github.com/VAR::MANIFESTS_OWNER/VAR::MANIFESTS_REPOSITORY/tree/VAR::MANIFESTS_REF/manifests/modules/observability/resiliency/scripts/verify-cluster.sh) to verify the cluster state and rebalance pods: -```bash +```bash timeout=300 wait=30 $ $SCRIPT_DIR/verify-cluster.sh + +==== Final Pod Distribution ==== + +------us-west-2a------ + ip-10-42-127-82.us-west-2.compute.internal: + ui-6dfb84cf67-vwk4x 1/1 Running 0 25s + +------us-west-2b------ + ip-10-42-133-195.us-west-2.compute.internal: + ui-6dfb84cf67-2rb6s 1/1 Running 0 27s + ui-6dfb84cf67-dk495 1/1 Running 0 27s + +------us-west-2c------ + ip-10-42-186-246.us-west-2.compute.internal: + ui-6dfb84cf67-7bftc 1/1 Running 0 29s + ui-6dfb84cf67-nqgdn 1/1 Running 0 29s + + ``` This script will: @@ -56,8 +88,10 @@ This script will: After simulating the node failure, we can verify that the retail store application remains accessible. Use the following command to check its availability: -```bash +```bash timeout=600 wait=30 $ wait-for-lb $(kubectl get ingress -n ui -o jsonpath='{.items[0].status.loadBalancer.ingress[0].hostname}') +Waiting for k8s-ui-ui-5ddc3ba496-721427594.us-west-2.elb.amazonaws.com... +You can now access http://k8s-ui-ui-5ddc3ba496-721427594.us-west-2.elb.amazonaws.com ``` This command retrieves the load balancer hostname for the ingress and waits for it to become available. Once ready, you can access the retail store through this URL to confirm that it's still functioning correctly despite the simulated node failure. @@ -65,13 +99,13 @@ This command retrieves the load balancer hostname for the ingress and waits for :::caution The retail url may take 10 minutes to become operational. You can optionally continue on with the lab by pressing `ctrl` + `z` to move operation to the background. To access it again input: -```bash -$ fg +```bash test=false +$ fg %1 ``` The url may not become operational by the time `wait-for-lb` times out. 
In that case, it should become operational after running the command again: -```bash +```bash test=false $ wait-for-lb $(kubectl get ingress -n ui -o jsonpath='{.items[0].status.loadBalancer.ingress[0].hostname}') ``` diff --git a/website/docs/resiliency/high-availability/04-node-failure-partial-fis.md b/website/docs/observability/high-availability/04-node-failure-partial-fis.md similarity index 58% rename from website/docs/resiliency/high-availability/04-node-failure-partial-fis.md rename to website/docs/observability/high-availability/04-node-failure-partial-fis.md index 0d5a738db..4ca8d6c4b 100644 --- a/website/docs/resiliency/high-availability/04-node-failure-partial-fis.md +++ b/website/docs/observability/high-availability/04-node-failure-partial-fis.md @@ -1,6 +1,6 @@ --- title: "Simulating Partial Node Failure with FIS" -sidebar_position: 4 +sidebar_position: 5 description: "Simulate a partial node failures in your Kubernetes environment using AWS Fault Injection Simulator to test application resiliency." --- @@ -29,18 +29,20 @@ For more information on AWS FIS, check out: - [What is AWS Fault Injection Service?](https://docs.aws.amazon.com/fis/latest/userguide/what-is.html) - [AWS Fault Injection Simulator Console](https://console.aws.amazon.com/fis/home) +- [AWS Systems Manager, Automation](https://console.aws.amazon.com/systems-manager/automation/executions) ::: ## Experiment Details This experiment differs from the previous manual node failure simulation in several ways: -1. Automated execution: FIS manages the experiment, allowing for more controlled and repeatable tests. -2. Partial failure: Instead of simulating a complete node failure, we're testing a scenario where a portion of the nodes fail. -3. Scale: FIS allows us to target multiple nodes simultaneously, providing a more realistic large-scale failure scenario. -4. Precision: We can specify exact percentages of instances to terminate, giving us fine-grained control over the experiment. +1. **Automated execution**: FIS manages the experiment, allowing for more controlled and repeatable tests compared to the manual script execution in the previous experiment. +2. **Partial failure**: Instead of simulating a complete failure of a single node, FIS allows us to simulate a partial failure across multiple nodes. This provides a more nuanced and realistic failure scenario. +3. **Scale**: FIS allows us to target multiple nodes simultaneously. This allows us to test the resilience of our application at a larger scale compared to the single-node failure in the manual experiment. +4. **Precision**: We can specify exact percentages of instances to terminate, giving us fine-grained control over the experiment. This level of control wasn't possible in the manual experiment, where we were limited to terminating entire nodes. +5. **Minimal disruption**: The FIS experiment is designed to maintain service availability throughout the test, whereas the manual node failure might have caused temporary disruptions to the retail store's accessibility. -In this experiment, FIS will terminate 66% of the instances in two node groups, simulating a significant partial failure of our cluster. +These differences allows for a more comprehensive and realistic test of our application's resilience to failures, while maintaining better control over the experiment parameters. In this experiment, FIS will terminate 66% of the instances in two node groups, simulating a significant partial failure of our cluster. 
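To see what that 66% translates to in your own cluster, you can inspect the managed node groups and their desired sizes before starting the experiment. The cluster name `eks-workshop` matches the workshop environment; the node group name returned will vary:

```bash test=false
$ aws eks list-nodegroups --cluster-name eks-workshop --query 'nodegroups' --output text
$ aws eks describe-nodegroup --cluster-name eks-workshop \
    --nodegroup-name $(aws eks list-nodegroups --cluster-name eks-workshop --query 'nodegroups[0]' --output text) \
    --query 'nodegroup.scalingConfig'
```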
Similar to previous experiments, this experiment is also repeatable ## Creating the Node Failure Experiment @@ -54,8 +56,20 @@ $ NODE_EXP_ID=$(aws fis create-experiment-template --cli-input-json '{"descripti Execute the FIS experiment to simulate the node failure and monitor the response: -```bash +```bash timeout=240 wait=30 $ aws fis start-experiment --experiment-template-id $NODE_EXP_ID --output json && SECONDS=0; while [ $SECONDS -lt 180 ]; do clear; $SCRIPT_DIR/get-pods-by-az.sh; sleep 1; done +------us-west-2a------ + ip-10-42-127-82.us-west-2.compute.internal: + ui-6dfb84cf67-s6kw4 1/1 Running 0 2m16s + ui-6dfb84cf67-vwk4x 1/1 Running 0 4m54s + +------us-west-2b------ + +------us-west-2c------ + ip-10-42-180-16.us-west-2.compute.internal: + ui-6dfb84cf67-29xtf 1/1 Running 0 79s + ui-6dfb84cf67-68hbw 1/1 Running 0 79s + ui-6dfb84cf67-plv9f 1/1 Running 0 79s ``` This command triggers the node failure and monitors the pods for 3 minutes, allowing you to observe how the cluster responds to losing a significant portion of its capacity. @@ -69,10 +83,25 @@ During the experiment, you should observe the following: Your retail url should stay operational unlike the node failure without FIS. :::note -To verify clusters and rebalance pods, you can run: +To verify nodes and rebalance pods, you can run: -```bash +```bash timeout=240 wait=30 $ $SCRIPT_DIR/verify-cluster.sh +==== Final Pod Distribution ==== + +------us-west-2a------ + ip-10-42-127-82.us-west-2.compute.internal: + ui-6dfb84cf67-v2xj6 1/1 Running 0 14s + +------us-west-2b------ + ip-10-42-148-187.us-west-2.compute.internal: + ui-6dfb84cf67-4xq4n 1/1 Running 0 16s + ui-6dfb84cf67-56d6d 1/1 Running 0 16s + +------us-west-2c------ + ip-10-42-180-16.us-west-2.compute.internal: + ui-6dfb84cf67-86mpz 1/1 Running 0 18s + ui-6dfb84cf67-nhx4j 1/1 Running 0 18s ``` ::: @@ -81,8 +110,10 @@ $ $SCRIPT_DIR/verify-cluster.sh Ensure that your retail store application remains operational throughout the partial node failure. Use the following command to check its availability: -```bash +```bash timeout=600 wait=30 $ wait-for-lb $(kubectl get ingress -n ui -o jsonpath='{.items[0].status.loadBalancer.ingress[0].hostname}') +Waiting for k8s-ui-ui-5ddc3ba496-721427594.us-west-2.elb.amazonaws.com... +You can now access http://k8s-ui-ui-5ddc3ba496-721427594.us-west-2.elb.amazonaws.com ``` :::tip @@ -91,10 +122,6 @@ The retail url may take 10 minutes to become operational. Despite the partial node failure, the retail store should continue to serve traffic, demonstrating the resilience of your deployment setup. -:::caution -Partial node failures test the limits of your application's failover capabilities. Monitor and determine how well your applications and services recover from such events. -::: - ## Conclusion This partial node failure simulation using AWS FIS demonstrates several key aspects of your Kubernetes cluster's resilience: @@ -113,9 +140,9 @@ Key takeaways from this experiment: By leveraging AWS FIS for such experiments, you gain several advantages: -1. Repeatability: You can run this experiment multiple times to ensure consistent behavior. -2. Automation: FIS allows you to schedule regular resilience tests, ensuring your system maintains its fault-tolerant capabilities over time. -3. Comprehensive testing: You can create more complex scenarios involving multiple AWS services to test your entire application stack. -4. 
Controlled chaos: FIS provides a safe, managed environment for conducting chaos engineering experiments without risking unintended damage to your production systems. +1. **Repeatability**: You can run this experiment multiple times to ensure consistent behavior. +2. **Automation**: FIS allows you to schedule regular resilience tests, ensuring your system maintains its fault-tolerant capabilities over time. +3. **Comprehensive testing**: You can create more complex scenarios involving multiple AWS services to test your entire application stack. +4. **Controlled chaos**: FIS provides a safe, managed environment for conducting chaos engineering experiments without risking unintended damage to your production systems. Regular execution of such experiments helps build confidence in your system's resilience and provides valuable insights for continuous improvement of your architecture and operational procedures. diff --git a/website/docs/resiliency/high-availability/05-node-failure-complete-fis.md b/website/docs/observability/high-availability/05-node-failure-complete-fis.md similarity index 58% rename from website/docs/resiliency/high-availability/05-node-failure-complete-fis.md rename to website/docs/observability/high-availability/05-node-failure-complete-fis.md index a7f142b71..722341fd0 100644 --- a/website/docs/resiliency/high-availability/05-node-failure-complete-fis.md +++ b/website/docs/observability/high-availability/05-node-failure-complete-fis.md @@ -1,6 +1,6 @@ --- title: "Simulating Complete Node Failure with FIS" -sidebar_position: 5 +sidebar_position: 6 description: "Demonstrates the impact of a complete node failure on a Kubernetes environment using AWS Fault Injection Simulator." --- @@ -8,15 +8,11 @@ description: "Demonstrates the impact of a complete node failure on a Kubernetes ## Overview -This experiment extends our previous partial node failure test to simulate a complete failure of all nodes in our EKS cluster. It demonstrates how AWS Fault Injection Simulator (FIS) can be used to test extreme scenarios and validate your system's resilience under catastrophic conditions. - -:::info Important -This test simulates a worst-case scenario. It's designed for controlled environments with thoroughly tested recovery mechanisms. -::: +This experiment extends our previous partial node failure test to simulate a complete failure of all nodes in our EKS cluster. This is essentially a cluster failure. It demonstrates how AWS Fault Injection Simulator (FIS) can be used to test extreme scenarios and validate your system's resilience under catastrophic conditions. ## Experiment Details -Unlike the partial node failure simulation, this experiment: +This experiment is similar to the partial node failure as it is repeatable. Unlike the partial node failure simulation, this experiment: 1. Terminates 100% of the instances in all node groups. 2. Tests your cluster's ability to recover from a state of complete failure. 
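If you would like a node-level view alongside the pod view while the experiment below runs, a small polling loop that counts nodes by status works well; press `Ctrl+C` to stop it. This is optional and not part of the scripted lab:

```bash test=false
$ while true; do kubectl get nodes --no-headers | awk '{print $2}' | sort | uniq -c; echo "---"; sleep 15; done
```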
@@ -34,24 +30,53 @@ $ FULL_NODE_EXP_ID=$(aws fis create-experiment-template --cli-input-json '{"desc
 
 Execute the FIS experiment and monitor the cluster's response:
 
-```bash
-$ aws fis start-experiment --experiment-template-id $FULL_NODE_EXP_ID --output json && SECONDS=0; while [ $SECONDS -lt 300 ]; do clear; $SCRIPT_DIR/get-pods-by-az.sh; sleep 1; done
+```bash timeout=420 wait=30
+$ aws fis start-experiment --experiment-template-id $FULL_NODE_EXP_ID --output json && SECONDS=0; while [ $SECONDS -lt 360 ]; do clear; $SCRIPT_DIR/get-pods-by-az.sh; sleep 1; done
+------us-west-2a------
+  ip-10-42-106-250.us-west-2.compute.internal:
+       No resources found in ui namespace.
+
+------us-west-2b------
+  ip-10-42-141-133.us-west-2.compute.internal:
+       ui-6dfb84cf67-n9xns   1/1   Running   0     4m8s
+       ui-6dfb84cf67-slknv   1/1   Running   0     2m48s
+
+------us-west-2c------
+  ip-10-42-179-59.us-west-2.compute.internal:
+       ui-6dfb84cf67-5xht5   1/1   Running   0     4m52s
+       ui-6dfb84cf67-b6xbf   1/1   Running   0     4m10s
+       ui-6dfb84cf67-fpg8j   1/1   Running   0     4m52s
 ```
 
-This command will show the pods distribution over 5 minutes while we observe the experiment. We should see:
+This command will show the pods distribution over 6 minutes while we observe the experiment. We should see:
 
 1. Shortly after the experiment is initiated, all nodes and pods disappear.
 2. After about 2 minutes, the first node and some pods come back online.
 3. Around 4 minutes, a second node appears and more pods start up.
-4. At 5 minutes, continued recovery as the last node come online.
+4. At 6 minutes, continued recovery as the last node comes online.
 
-Due to the severity of the experiment, the retail store url will not stay operational during testing. The url should come back up after the final node is operational.
+Due to the severity of the experiment, the retail store URL will not stay operational during testing. The URL should come back up after the final node is operational. If the final node is not operational after this test, run `$SCRIPT_DIR/verify-cluster.sh` to wait for it to become ready before proceeding.
 
 :::note
-To verify clusters and rebalance pods, you can run:
+To verify nodes and rebalance pods, you can run:
 
-```bash
+```bash timeout=240 wait=30
 $ $SCRIPT_DIR/verify-cluster.sh
+
+==== Final Pod Distribution ====
+
+------us-west-2a------
+  ip-10-42-106-250.us-west-2.compute.internal:
+       ui-6dfb84cf67-4fjhh   1/1   Running   0     15s
+       ui-6dfb84cf67-gkrtn   1/1   Running   0     14s
+
+------us-west-2b------
+  ip-10-42-141-133.us-west-2.compute.internal:
+       ui-6dfb84cf67-7qnkz   1/1   Running   0     16s
+       ui-6dfb84cf67-n58b9   1/1   Running   0     16s
+
+------us-west-2c------
+  ip-10-42-179-59.us-west-2.compute.internal:
+       ui-6dfb84cf67-lvdc2   1/1   Running   0     18s
 ```
 
 :::
 
@@ -60,8 +85,10 @@ $ $SCRIPT_DIR/verify-cluster.sh
 
 ## Verify Retail Store Availability
 
 Check the retail store application's recovery:
 
-```bash
+```bash timeout=600 wait=30
 $ wait-for-lb $(kubectl get ingress -n ui -o jsonpath='{.items[0].status.loadBalancer.ingress[0].hostname}')
+Waiting for k8s-ui-ui-5ddc3ba496-721427594.us-west-2.elb.amazonaws.com...
+You can now access http://k8s-ui-ui-5ddc3ba496-721427594.us-west-2.elb.amazonaws.com ``` :::tip diff --git a/website/docs/observability/high-availability/06-az-setup.md b/website/docs/observability/high-availability/06-az-setup.md new file mode 100644 index 000000000..04a3bbb83 --- /dev/null +++ b/website/docs/observability/high-availability/06-az-setup.md @@ -0,0 +1,100 @@ +--- +title: "AZ Failure Experiment Setup" +sidebar_position: 7 +description: "Scale your application to two instances and prepare for an AZ failure simulation experiment." +--- + +### Scaling Instances + +To see the full impact of an Availability Zone (AZ) failure, let's first scale up to two instances per AZ as well as increase the number of pods up to 9: + +```bash timeout=120 wait=30 +$ ASG_NAME=$(aws autoscaling describe-auto-scaling-groups --query "AutoScalingGroups[? Tags[? (Key=='eks:cluster-name') && Value=='eks-workshop']].AutoScalingGroupName" --output text) +$ aws autoscaling update-auto-scaling-group \ + --auto-scaling-group-name $ASG_NAME \ + --desired-capacity 6 \ + --min-size 6 \ + --max-size 6 +$ sleep 60 +$ kubectl scale deployment ui --replicas=9 -n ui +$ $SCRIPT_DIR/get-pods-by-az.sh +------us-west-2a------ + ip-10-42-100-4.us-west-2.compute.internal: + ui-6dfb84cf67-xbbj4 0/1 ContainerCreating 0 1s + ip-10-42-106-250.us-west-2.compute.internal: + ui-6dfb84cf67-4fjhh 1/1 Running 0 5m20s + ui-6dfb84cf67-gkrtn 1/1 Running 0 5m19s + +------us-west-2b------ + ip-10-42-139-198.us-west-2.compute.internal: + ui-6dfb84cf67-7rfkf 0/1 ContainerCreating 0 4s + ip-10-42-141-133.us-west-2.compute.internal: + ui-6dfb84cf67-7qnkz 1/1 Running 0 5m23s + ui-6dfb84cf67-n58b9 1/1 Running 0 5m23s + +------us-west-2c------ + ip-10-42-175-140.us-west-2.compute.internal: + ui-6dfb84cf67-8xfk8 0/1 ContainerCreating 0 8s + ui-6dfb84cf67-s55nb 0/1 ContainerCreating 0 8s + ip-10-42-179-59.us-west-2.compute.internal: + ui-6dfb84cf67-lvdc2 1/1 Running 0 5m26s +``` + +### Setting up a Synthetic Canary + +Before starting the experiment, set up a synthetic canary for heartbeat monitoring: + +1. First, create an S3 bucket for the canary artifacts: + +```bash wait=15 +$ export BUCKET_NAME="eks-workshop-canary-artifacts-$(date +%s)" +$ aws s3 mb s3://$BUCKET_NAME --region $AWS_REGION +make_bucket: eks-workshop-canary-artifacts-1724131402 +``` + +2. Create the blueprint: + +```file +manifests/modules/observability/resiliency/scripts/create-blueprint.sh +``` + +Place this canary blueprint into the bucket: + +```bash wait=15 +$ $SCRIPT_DIR/create-blueprint.sh +upload: ./canary.zip to s3://eks-workshop-canary-artifacts-1724131402/canary-scripts/canary.zip +Canary script has been zipped and uploaded to s3://eks-workshop-canary-artifacts-1724131402/canary-scripts/canary.zip +The script is configured to check the URL: http://k8s-ui-ui-5ddc3ba496-721427594.us-west-2.elb.amazonaws.com +``` + +3. 
Create a synthetic canary with a Cloudwatch alarm: + +```bash timeout=120 wait=30 +$ aws synthetics create-canary \ + --name eks-workshop-canary \ + --artifact-s3-location "s3://$BUCKET_NAME/canary-artifacts/" \ + --execution-role-arn $CANARY_ROLE_ARN \ + --runtime-version syn-nodejs-puppeteer-9.0 \ + --schedule "Expression=rate(1 minute)" \ + --code "Handler=canary.handler,S3Bucket=$BUCKET_NAME,S3Key=canary-scripts/canary.zip" \ + --region $AWS_REGION +$ sleep 45 +$ aws synthetics start-canary --name eks-workshop-canary --region $AWS_REGION +$ aws cloudwatch put-metric-alarm \ + --alarm-name "eks-workshop-canary-alarm" \ + --metric-name SuccessPercent \ + --namespace CloudWatchSynthetics \ + --statistic Average \ + --period 60 \ + --threshold 95 \ + --comparison-operator LessThanThreshold \ + --dimensions Name=CanaryName,Value=eks-workshop-canary \ + --evaluation-periods 1 \ + --alarm-description "Alarm when Canary success rate drops below 95%" \ + --unit Percent \ + --region $AWS_REGION +``` + +This sets up a canary that checks the health of your application every minute and a CloudWatch alarm that triggers if the success percentage falls below 95%. + +With these steps completed, your application is now scaled across to two instances in AZs and you've set up the necessary monitoring for the upcoming AZ failure simulation experiment. diff --git a/website/docs/resiliency/high-availability/07-az-failure.md b/website/docs/observability/high-availability/07-az-failure.md similarity index 50% rename from website/docs/resiliency/high-availability/07-az-failure.md rename to website/docs/observability/high-availability/07-az-failure.md index c164d3c85..97d1043b3 100644 --- a/website/docs/resiliency/high-availability/07-az-failure.md +++ b/website/docs/observability/high-availability/07-az-failure.md @@ -1,6 +1,6 @@ --- title: "Simulating AZ Failure" -sidebar_position: 7 +sidebar_position: 8 description: "This experiment simulates an Availability Zone failure to test the resilience of your Kubernetes environment hosted on AWS EKS." --- @@ -8,18 +8,11 @@ description: "This experiment simulates an Availability Zone failure to test the ## Overview -This experiment simulates an Availability Zone (AZ) failure, demonstrating the resilience of your application when faced with significant infrastructure disruptions. By leveraging AWS Fault Injection Simulator (FIS) and additional AWS services, we'll test how well your system maintains functionality when an entire AZ becomes unavailable. +This repeatable experiment simulates an Availability Zone (AZ) failure, demonstrating the resilience of your application when faced with significant infrastructure disruptions. By leveraging AWS Fault Injection Simulator (FIS) and additional AWS services, we'll test how well your system maintains functionality when an entire AZ becomes unavailable. ### Setting up the Experiment -Retrieve the Auto Scaling Group (ASG) name associated with your EKS cluster: - -```bash -$ ASG_NAME_BOTH=$(aws autoscaling describe-auto-scaling-groups --query "AutoScalingGroups[? Tags[? 
(Key=='eks:cluster-name') && Value=='eks-workshop']]".AutoScalingGroupName --output text)
-$ ASG_NAME=$(echo $ASG_NAME_BOTH | awk '{print $1}')
-```
-
-Create the FIS experiment template to simulate the AZ failure:
+Retrieve the Auto Scaling Group (ASG) name associated with your EKS cluster and create the FIS experiment template to simulate the AZ failure:
 
 ```bash
 $ ZONE_EXP_ID=$(aws fis create-experiment-template --cli-input-json '{"description":"publicdocument-azfailure","targets":{},"actions":{"azfailure":{"actionId":"aws:ssm:start-automation-execution","parameters":{"documentArn":"arn:aws:ssm:us-west-2::document/AWSResilienceHub-SimulateAzOutageInAsgTest_2020-07-23","documentParameters":"{\"AutoScalingGroupName\":\"'$ASG_NAME'\",\"CanaryAlarmName\":\"eks-workshop-canary-alarm\",\"AutomationAssumeRole\":\"'$FIS_ROLE_ARN'\",\"IsRollback\":\"false\",\"TestDurationInMinutes\":\"2\"}","maxDuration":"PT6M"}}},"stopConditions":[{"source":"none"}],"roleArn":"'$FIS_ROLE_ARN'","tags":{"ExperimentSuffix":"'$RANDOM_SUFFIX'"}}' --output json | jq -r '.experimentTemplate.id')
@@ -29,21 +22,70 @@ $ ZONE_EXP_ID=$(aws fis create-experiment-template --cli-input-json '{"descripti
 
 ## Running the Experiment
 
 Execute the FIS experiment to simulate the AZ failure:
 
-```bash
-$ aws fis start-experiment --experiment-template-id $ZONE_EXP_ID --output json && SECONDS=0; while [ $SECONDS -lt 450 ]; do clear; $SCRIPT_DIR/get-pods-by-az.sh; sleep 1; done
+```bash timeout=560 wait=30
+$ aws fis start-experiment --experiment-template-id $ZONE_EXP_ID --output json && SECONDS=0; while [ $SECONDS -lt 480 ]; do clear; $SCRIPT_DIR/get-pods-by-az.sh; sleep 1; done
+------us-west-2a------
+  ip-10-42-100-4.us-west-2.compute.internal:
+       ui-6dfb84cf67-h57sp   1/1   Running   0     12m
+       ui-6dfb84cf67-h87h8   1/1   Running   0     12m
+  ip-10-42-111-144.us-west-2.compute.internal:
+       ui-6dfb84cf67-4xvmc   1/1   Running   0     11m
+       ui-6dfb84cf67-crl2s   1/1   Running   0     6m23s
+
+------us-west-2b------
+  ip-10-42-141-243.us-west-2.compute.internal:
+       No resources found in ui namespace.
+  ip-10-42-150-255.us-west-2.compute.internal:
+       No resources found in ui namespace.
+
+------us-west-2c------
+  ip-10-42-164-250.us-west-2.compute.internal:
+       ui-6dfb84cf67-fl4hk   1/1   Running   0     11m
+       ui-6dfb84cf67-mptkw   1/1   Running   0     11m
+       ui-6dfb84cf67-zxnts   1/1   Running   0     6m27s
+  ip-10-42-178-108.us-west-2.compute.internal:
+       ui-6dfb84cf67-8vmcz   1/1   Running   0     6m28s
+       ui-6dfb84cf67-wknc5   1/1   Running   0     12m
 ```
 
-This command starts the experiment and monitors the distribution and status of pods across different nodes and AZs for 7.5 minutes to understand the immediate impact of the simulated AZ failure.
+This command starts the experiment and monitors the distribution and status of pods across different nodes and AZs for 8 minutes to understand the immediate impact of the simulated AZ failure.
 
 During the experiment, you should observe the following sequence of events:
 
-- input here
+1. After about 3 minutes, one Availability Zone will begin to fail.
+2. The Synthetic Canary created earlier will change state to `In Alarm`.
+3. Around 4 minutes after the experiment started, you will see pods reappearing in the other AZs.
+4. About 7 minutes in, the experiment completes, the AZ is marked healthy again, and replacement EC2 instances are launched as a result of an EC2 Auto Scaling action, bringing the number of instances in each AZ back to 2.
+
+During this time, the retail store URL will stay available, showing how resilient EKS is to AZ failures.
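You can also confirm what the canary observes during the outage directly from the CLI. The alarm name below is the one created in the setup step; the state should read `ALARM` while the zone is impaired and return to `OK` once traffic recovers (sample output shown, actual timing will vary):

```bash test=false
$ aws cloudwatch describe-alarms --alarm-names "eks-workshop-canary-alarm" \
    --query 'MetricAlarms[0].StateValue' --output text
ALARM
```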
:::note To verify clusters and rebalance pods, you can run: -```bash -$ $SCRIPT_DIR/verify-cluster.sh +```bash timeout=240 wait=30 +$ $SCRIPT_DIR/AZ-verify-clusters.sh +==== Final Pod Distribution ==== + +------us-west-2a------ + ip-10-42-100-4.us-west-2.compute.internal: + ui-6dfb84cf67-lwd86 1/1 Running 0 16s + ip-10-42-111-144.us-west-2.compute.internal: + ui-6dfb84cf67-hfrcf 1/1 Running 0 17s + ui-6dfb84cf67-qdr4s 1/1 Running 0 17s + +------us-west-2b------ + ip-10-42-141-243.us-west-2.compute.internal: + ui-6dfb84cf67-dxtg4 1/1 Running 0 19s + ip-10-42-150-255.us-west-2.compute.internal: + ui-6dfb84cf67-jvvg6 1/1 Running 0 20s + ui-6dfb84cf67-tmbzc 1/1 Running 0 20s + +------us-west-2c------ + ip-10-42-164-250.us-west-2.compute.internal: + ui-6dfb84cf67-k5mn8 1/1 Running 0 23s + ui-6dfb84cf67-zbm8j 1/1 Running 0 23s + ip-10-42-178-108.us-west-2.compute.internal: + ui-6dfb84cf67-svwqp 1/1 Running 0 24s ``` ::: @@ -52,8 +94,10 @@ $ $SCRIPT_DIR/verify-cluster.sh After the experiment, verify that your application remains operational despite the simulated AZ failure: -```bash +```bash timeout=600 wait=30 $ wait-for-lb $(kubectl get ingress -n ui -o jsonpath='{.items[0].status.loadBalancer.ingress[0].hostname}') +Waiting for k8s-ui-ui-5ddc3ba496-721427594.us-west-2.elb.amazonaws.com... +You can now access http://k8s-ui-ui-5ddc3ba496-721427594.us-west-2.elb.amazonaws.com ``` This step confirms the effectiveness of your Kubernetes cluster's high availability configuration and its ability to maintain service continuity during significant infrastructure disruptions. diff --git a/website/docs/resiliency/index.md b/website/docs/observability/high-availability/index.md similarity index 97% rename from website/docs/resiliency/index.md rename to website/docs/observability/high-availability/index.md index 0252fee19..3d9254faa 100644 --- a/website/docs/resiliency/index.md +++ b/website/docs/observability/high-availability/index.md @@ -1,6 +1,7 @@ --- title: "Resiliency" -sidebar_position: 11 +sidebar_position: 70 +sidebar_custom_props: { "module": true } weight: 10 --- diff --git a/website/docs/resiliency/high-availability/tests/hook-suite.sh b/website/docs/observability/high-availability/tests/hook-suite.sh similarity index 100% rename from website/docs/resiliency/high-availability/tests/hook-suite.sh rename to website/docs/observability/high-availability/tests/hook-suite.sh diff --git a/website/docs/resiliency/high-availability/01-setup.md b/website/docs/resiliency/high-availability/01-setup.md deleted file mode 100644 index 31821d93a..000000000 --- a/website/docs/resiliency/high-availability/01-setup.md +++ /dev/null @@ -1,61 +0,0 @@ ---- -title: "Scaling and Pod Anti-Affinity for UI Service" -sidebar_position: 1 -description: "Learn how to scale your pods, add Pod Anti-Affinity configurations, and use a helper script to visualize pod distribution." ---- - -This guide outlines steps to enhance the resilience of a UI service by implementing high availability practices. We'll cover scaling the UI service, implementing pod anti-affinity, and using a helper script to visualize pod distribution across availability zones. - -## Scaling and Pod Anti-Affinity - -We use a Kustomize patch to modify the UI deployment, scaling it to 5 replicas and adding pod anti-affinity rules. This ensures UI pods are distributed across different nodes, reducing the impact of node failures. 
- -Here's the content of our patch file: - -```file -manifests/modules/resiliency/high-availability/config/scale_and_affinity_patch.yaml -``` - -Apply the changes using Kustomize patch and -[Kustomization file](https://github.com/VAR::MANIFESTS_OWNER/VAR::MANIFESTS_REPOSITORY/tree/VAR::MANIFESTS_REF/manifests/modules/resiliency/high-availability/config/kustomization.yaml): - -```bash -$ kubectl delete deployment ui -n ui -$ kubectl apply -k /manifests/modules/resiliency/high-availability/config/ -``` - -## Verify Retail Store Accessibility - -After applying these changes, it's important to verify that your retail store is accessible: - -```bash -$ wait-for-lb $(kubectl get ingress -n ui -o jsonpath='{.items[0].status.loadBalancer.ingress[0].hostname}') -``` - -Once this command completes, it will output a URL. Open this URL in a new browser tab to verify that your retail store is accessible and functioning correctly. - -:::tip -If the retail store doesn't load immediately, wait a few moments and refresh the page. It may take a short time for all components to become fully operational. -::: - -## Helper Script: Get Pods by AZ - -The `get-pods-by-az.sh` script helps visualize the distribution of Kubernetes pods across different availability zones in the terminal. You can view the script file on github [here](https://github.com/VAR::MANIFESTS_OWNER/VAR::MANIFESTS_REPOSITORY/tree/VAR::MANIFESTS_REF/manifests/modules/resiliency/scripts/get-pods-by-az.sh). - -### Script Execution - -To run the script and see the distribution of pods across availability zones, execute: - -```bash -$ $SCRIPT_DIR/get-pods-by-az.sh -``` - -:::tip -Use this to quickly assess the distribution of your pods across multiple zones. -::: - -:::info -For more information on these changes, check out these sections: - -- [Pod Affinity and Anti-Affinity](/docs/fundamentals/managed-node-groups/basics/affinity/) - ::: diff --git a/website/docs/resiliency/high-availability/06-az-setup.md b/website/docs/resiliency/high-availability/06-az-setup.md deleted file mode 100644 index 4c7d2eeb9..000000000 --- a/website/docs/resiliency/high-availability/06-az-setup.md +++ /dev/null @@ -1,123 +0,0 @@ ---- -title: "AZ Failure Experiment Setup" -sidebar_position: 6 -description: "Scale your application to two Availability Zones and prepare for an AZ failure simulation experiment." ---- - -This guide outlines steps to enhance the resilience of your UI service by scaling it across two Availability Zones (AZs) and preparing for an AZ failure simulation experiment. - -## Scaling to Two AZs - -We'll use a Kustomize patch to modify the UI deployment, adding a second AZ and adjusting the number of replicas. We'll scale to 4 replicas in the new AZ while maintaining 5 replicas in the first AZ. - -First we need to make ann EKS Cluster in `us-east-2`. 
Run this to create a second AZ: - -```bash timeout=300 wait=30 -$ $SCRIPT_DIR/multi-az-get-pods.sh -$ aws configure set default.region $SECONDARY_REGION -$ prepare-environment resiliency -$ aws configure set default.region $PRIMARY_REGION -$ $SCRIPT_DIR/multi-az-get-pods.sh -``` - -Now we need to Kustomize our content with a patch file: - -```file -manifests/modules/resiliency/high-availability/multi_az/add_us_east_2_patch.yaml -``` - -Apply the changes using Kustomize patch and -[Kustomization file](https://github.com/VAR::MANIFESTS_OWNER/VAR::MANIFESTS_REPOSITORY/tree/VAR::MANIFESTS_REF/manifests/modules/resiliency/high-availability/multi_az/kustomization.yaml): - -```bash -$ kubectl delete deployment ui -n ui -$ kubectl apply -k /manifests/modules/resiliency/high-availability/multi_az/ -``` - -## Verify Retail Store Accessibility - -After applying these changes, it's important to verify that your retail store is accessible: - -```bash -$ wait-for-lb $(kubectl get ingress -n ui -o jsonpath='{.items[0].status.loadBalancer.ingress[0].hostname}') -``` - -:::tip -The retail url may take 10 minutes to become operational. -::: - -## Check Pod Distribution - -To run the script and see the distribution of pods across availability zones, execute: - -```bash -$ $SCRIPT_DIR/multi-az-get-pods.sh -``` - -## AZ Failure Experiment Preparation - -### Overview - -This experiment will simulate an Availability Zone (AZ) failure, demonstrating how resilient your application is when faced with significant infrastructure disruptions. We'll use AWS Fault Injection Simulator (FIS) and additional AWS services to test how well your system maintains functionality when an entire AZ becomes unavailable. - -### Setting up a Synthetic Canary - -Before starting the experiment, set up a synthetic canary for heartbeat monitoring: - -1. First, create an S3 bucket for the canary artifacts: - -```bash -$ BUCKET_NAME="eks-workshop-canary-artifacts-$(date +%s)" -$ aws s3 mb s3://$BUCKET_NAME --region us-west-2 -``` - -2. Create the blueprint: - -```file -manifests/modules/resiliency/scripts/eks_workshop_canary_script.js -``` - -Place this canary script into the bucket: - -```bash -$ aws s3 cp /manifests/modules/resiliency/scripts/eks_workshop_canary_script.zip s3://$BUCKET_NAME/canary-scripts/eks_workshop_canary_script.zip -``` - -3. Create a synthetic canary: - -```bash -$ INGRESS_URL=$(kubectl get ingress -n ui -o jsonpath='{.items[0].status.loadBalancer.ingress[0].hostname}') -$ aws synthetics create-canary \ - --name eks-workshop-canary \ - --artifact-s3-location "s3://$BUCKET_NAME/canary-artifacts/" \ - --execution-role-arn $CANARY_ROLE_ARN \ - --runtime-version syn-nodejs-puppeteer-6.2 \ - --schedule Expression="rate(1 minute)" \ - --code S3Bucket=$BUCKET_NAME,S3Key=canary-scripts/eks_workshop_canary_script.zip,Handler="exports.handler" \ - --run-config "EnvironmentVariables={INGRESS_URL=http://$INGRESS_URL}" \ - --region us-west-2 -$ sleep 30 -$ aws synthetics start-canary --name eks-workshop-canary --region us-west-2 -``` - -4. 
Create a CloudWatch alarm for the canary: - -```bash -$ aws cloudwatch put-metric-alarm \ - --alarm-name "eks-workshop-canary-alarm" \ - --metric-name SuccessPercent \ - --namespace CloudWatchSynthetics \ - --statistic Average \ - --period 60 \ - --threshold 95 \ - --comparison-operator LessThanThreshold \ - --dimensions Name=CanaryName,Value=eks-workshop-canary \ - --evaluation-periods 1 \ - --alarm-description "Alarm when Canary success rate drops below 95%" \ - --unit Percent \ - --region us-west-2 -``` - -This sets up a canary that checks the health of your application every minute and a CloudWatch alarm that triggers if the success percentage falls below 95%. - -With these steps completed, your application is now scaled across two AZs and you've set up the necessary monitoring for the upcoming AZ failure simulation experiment. diff --git a/website/docusaurus.config.js b/website/docusaurus.config.js index 145d2f91d..f712159a3 100644 --- a/website/docusaurus.config.js +++ b/website/docusaurus.config.js @@ -141,12 +141,6 @@ const config = { position: "left", label: "Observability", }, - { - type: "doc", - docId: "resiliency/index", - position: "left", - label: "Resiliency", - }, { type: "doc", docId: "security/index", diff --git a/website/sidebars.js b/website/sidebars.js index adf89ee4a..7da64994c 100644 --- a/website/sidebars.js +++ b/website/sidebars.js @@ -20,7 +20,6 @@ const sidebars = { networking: [{ type: "autogenerated", dirName: "networking" }], autoscaling: [{ type: "autogenerated", dirName: "autoscaling" }], observability: [{ type: "autogenerated", dirName: "observability" }], - resiliency: [{ type: "autogenerated", dirName: "resiliency" }], automation: [{ type: "autogenerated", dirName: "automation" }], aiml: [{ type: "autogenerated", dirName: "aiml" }], }; From b04124fc348155b3a8d0ffe599af00079863c82b Mon Sep 17 00:00:00 2001 From: cyturney Date: Fri, 23 Aug 2024 14:42:12 -0700 Subject: [PATCH 09/11] some updates based on PR input --- cluster/eksctl/cluster.yaml | 2 +- .../config/scale_and_affinity_patch.yaml | 13 ++++ .../resiliency/scripts/get-pods-by-az.sh | 50 +++++++++---- .../resiliency/scripts/pod-failure.sh | 4 +- .../resiliency/scripts/testing.sh | 0 .../resiliency/scripts/verify-cluster.sh | 6 +- .../high-availability/00-setup.md | 51 ------------- .../high-availability/01-scale.md | 18 +++-- .../high-availability/02-pod-failure.md | 71 +++++++++++-------- .../03-node-failure-no-fis.md | 47 ++++++------ .../04-node-failure-partial-fis.md | 37 ++++------ .../05-node-failure-complete-fis.md | 33 +++------ .../high-availability/06-az-setup.md | 9 ++- .../high-availability/07-az-failure.md | 44 ++++-------- .../observability/high-availability/index.md | 52 +++++++++++--- 15 files changed, 219 insertions(+), 218 deletions(-) mode change 100644 => 100755 manifests/modules/observability/resiliency/scripts/testing.sh delete mode 100644 website/docs/observability/high-availability/00-setup.md diff --git a/cluster/eksctl/cluster.yaml b/cluster/eksctl/cluster.yaml index 8306530c2..b038c2441 100644 --- a/cluster/eksctl/cluster.yaml +++ b/cluster/eksctl/cluster.yaml @@ -32,7 +32,7 @@ managedNodeGroups: instanceType: m5.large privateNetworking: true # had to remove use make create - releaseVersion: "1.30.0-20240625" + #releaseVersion: "1.30.0-20240625" updateConfig: maxUnavailablePercentage: 50 labels: diff --git a/manifests/modules/observability/resiliency/high-availability/config/scale_and_affinity_patch.yaml 
b/manifests/modules/observability/resiliency/high-availability/config/scale_and_affinity_patch.yaml index c84b9a056..3637434f5 100644 --- a/manifests/modules/observability/resiliency/high-availability/config/scale_and_affinity_patch.yaml +++ b/manifests/modules/observability/resiliency/high-availability/config/scale_and_affinity_patch.yaml @@ -25,3 +25,16 @@ spec: values: - ui topologyKey: "kubernetes.io/hostname" + topologySpreadConstraints: + - maxSkew: 1 + topologyKey: topology.kubernetes.io/zone + whenUnsatisfiable: ScheduleAnyway + labelSelector: + matchLabels: + app: ui + - maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + labelSelector: + matchLabels: + app: ui diff --git a/manifests/modules/observability/resiliency/scripts/get-pods-by-az.sh b/manifests/modules/observability/resiliency/scripts/get-pods-by-az.sh index 8063f1094..3306c9b0f 100755 --- a/manifests/modules/observability/resiliency/scripts/get-pods-by-az.sh +++ b/manifests/modules/observability/resiliency/scripts/get-pods-by-az.sh @@ -1,6 +1,6 @@ #!/bin/bash -# Credit to "Disaster recovery, high availability, and resiliency on Amazon EKS" +# Modified from "Disaster recovery, high availability, and resiliency on Amazon EKS" # https://catalog.us-east-1.prod.workshops.aws/workshops/6140457f-53b2-48b8-a007-2d4be06ba2fc GREEN='\033[0;32m' @@ -10,16 +10,40 @@ NC='\033[0m' # No Color CURRENT_CONTEXT=$(kubectl config current-context) REGION=$(kubectl config view -o jsonpath="{.contexts[?(@.name == \"$CURRENT_CONTEXT\")].context.cluster}" | cut -d : -f 4) -for az in a b c -do - AZ=$REGION$az - echo -n "------" - echo -n -e "${GREEN}$AZ${NC}" - echo "------" - for node in $(kubectl get nodes -l topology.kubernetes.io/zone=$AZ --no-headers | grep -v NotReady | cut -d " " -f1) - do - echo -e " ${RED}$node:${NC}" - kubectl get pods -n ui --no-headers --field-selector spec.nodeName=${node} 2>&1 | while read line; do echo " ${line}"; done - done - echo "" +# Function to clear the screen and move cursor to top-left +clear_screen() { + echo -e "\033[2J\033[H" +} + +# Function to generate the output +generate_output() { + for az in a b c + do + AZ=$REGION$az + echo -n "------" + echo -n -e "${GREEN}$AZ${NC}" + echo "------" + for node in $(kubectl get nodes -l topology.kubernetes.io/zone=$AZ --no-headers | grep -v NotReady | cut -d " " -f1) + do + echo -e " ${RED}$node:${NC}" + kubectl get pods -n ui --no-headers --field-selector spec.nodeName=${node} 2>&1 | while read line; do echo " ${line}"; done + done + echo "" + done +} + +# Initial clear screen +clear_screen + +# Main loop +while true; do + # Generate output to a temporary file + generate_output > temp_output.txt + + # Clear screen and display the new output + clear_screen + cat temp_output.txt + + # Wait before next update + sleep 1 done \ No newline at end of file diff --git a/manifests/modules/observability/resiliency/scripts/pod-failure.sh b/manifests/modules/observability/resiliency/scripts/pod-failure.sh index 3ed7df813..fd7ea7b49 100755 --- a/manifests/modules/observability/resiliency/scripts/pod-failure.sh +++ b/manifests/modules/observability/resiliency/scripts/pod-failure.sh @@ -5,7 +5,7 @@ unique_id=$(date +%s) # Create a YAML configuration for the PodChaos resource -cat < pod-failure.yaml +kubectl apply -f - < +ui-6dfb84cf67-6d5lq 1/1 Running 0 46s 10.42.121.36 ip-10-42-119-94.us-west-2.compute.internal +ui-6dfb84cf67-hqccq 1/1 Running 0 46s 10.42.154.216 ip-10-42-146-130.us-west-2.compute.internal +ui-6dfb84cf67-qqltz 1/1 Running 0 
46s 10.42.185.149 ip-10-42-176-213.us-west-2.compute.internal +ui-6dfb84cf67-rzbvl 1/1 Running 0 46s 10.42.188.96 ip-10-42-176-213.us-west-2.compute.internal ``` -This command does the following: +Note that all pods have similar start times (shown in the AGE column). + +### Step 2: Simulate Pod Failure + +Now, let's simulate a pod failure: + +```bash +$ $SCRIPT_DIR/pod-failure.sh +``` -1. Initiates the pod failure simulation using the `pod-failure.sh` script -2. Monitors the pod distribution across Availability Zones (AZs) for 30 seconds -3. Updates the display every second to show real-time changes +This script will use Chaos Mesh to terminate one of the pods. -During the experiment, you should observe one pod disappearing and then reappearing, demonstrating the system's ability to detect and recover from failures. +### Step 3: Observe Recovery -To get a more detailed view of the pods in the `ui` namespace, use the following command: +Wait for a couple of seconds to allow Kubernetes to detect the failure and initiate recovery. Then, check the pod status again: -```bash wait=15 +```bash timeout=5 $ kubectl get pods -n ui -o wide +``` + +You should now see output similar to this: + +``` NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES -ui-6dfb84cf67-2pxnp 1/1 Running 0 2m56s 10.42.154.151 ip-10-42-153-179.us-west-2.compute.internal -ui-6dfb84cf67-dsp55 1/1 Running 0 2m56s 10.42.126.161 ip-10-42-127-82.us-west-2.compute.internal -ui-6dfb84cf67-gzd9s 1/1 Running 0 71s 10.42.126.246 ip-10-42-127-82.us-west-2.compute.internal -ui-6dfb84cf67-n8x4f 1/1 Running 0 2m56s 10.42.190.250 ip-10-42-186-246.us-west-2.compute.internal -ui-6dfb84cf67-wljth 1/1 Running 0 2m56s 10.42.190.249 ip-10-42-186-246.us-west-2.compute.internal +ui-6dfb84cf67-44hc9 1/1 Running 0 2m57s 10.42.121.37 ip-10-42-119-94.us-west-2.compute.internal +ui-6dfb84cf67-6d5lq 1/1 Running 0 2m57s 10.42.121.36 ip-10-42-119-94.us-west-2.compute.internal +ui-6dfb84cf67-ghp5z 1/1 Running 0 6s 10.42.185.150 ip-10-42-176-213.us-west-2.compute.internal +ui-6dfb84cf67-hqccq 1/1 Running 0 2m57s 10.42.154.216 ip-10-42-146-130.us-west-2.compute.internal +ui-6dfb84cf67-rzbvl 1/1 Running 0 2m57s 10.42.188.96 ip-10-42-176-213.us-west-2.compute.internal +[ec2-user@bc44085aafa9 environment]$ ``` +Notice that one of the pods (in this example, `ui-6dfb84cf67-ghp5z`) has a much lower AGE value. This is the pod that Kubernetes automatically created to replace the one that was terminated by our simulation. + This will show you the status, IP addresses, and nodes for each pod in the `ui` namespace. ## Verify Retail Store Availability An essential aspect of this experiment is to ensure that your retail store application remains operational throughout the pod failure and recovery process. To verify the availability of the retail store, use the following command to fetch and access the store's URL: -```bash timeout=600 wait=30 +```bash timeout=900 wait=30 $ wait-for-lb $(kubectl get ingress -n ui -o jsonpath='{.items[0].status.loadBalancer.ingress[0].hostname}') + Waiting for k8s-ui-ui-5ddc3ba496-721427594.us-west-2.elb.amazonaws.com... 
You can now access http://k8s-ui-ui-5ddc3ba496-721427594.us-west-2.elb.amazonaws.com ``` diff --git a/website/docs/observability/high-availability/03-node-failure-no-fis.md b/website/docs/observability/high-availability/03-node-failure-no-fis.md index ac487042c..817c8f75c 100644 --- a/website/docs/observability/high-availability/03-node-failure-no-fis.md +++ b/website/docs/observability/high-availability/03-node-failure-no-fis.md @@ -1,11 +1,9 @@ --- title: "Simulating Node Failure without FIS" -sidebar_position: 4 +sidebar_position: 3 description: "Manually simulate a node failure in your Kubernetes environment to test the resilience of your applications without using AWS FIS." --- -# Simulating Node Failure without FIS - ## Overview This experiment simulates a node failure manually in your Kubernetes cluster to understand the impact on your deployed applications, particularly focusing on the retail store application's availability. By deliberately causing a node to fail, we can observe how Kubernetes handles the failure and maintains the overall health of the cluster. @@ -22,8 +20,9 @@ It's important to note that this experiment is repeatable, allowing you to run i To simulate the node failure and monitor its effects, run the following command: -```bash timeout=180 wait=30 -$ $SCRIPT_DIR/node-failure.sh && SECONDS=0; while [ $SECONDS -lt 120 ]; do clear; $SCRIPT_DIR/get-pods-by-az.sh; sleep 1; done +```bash timeout=240 wait=30 +$ $SCRIPT_DIR/node-failure.sh && timeout 180s $SCRIPT_DIR/get-pods-by-az.sh + ------us-west-2a------ ip-10-42-127-82.us-west-2.compute.internal: ui-6dfb84cf67-dsp55 1/1 Running 0 10m @@ -54,42 +53,36 @@ Throughout this process, the total number of running pods should remain constant While waiting for the node to finish coming back online, we will verify the cluster's self-healing capabilities and potentially rebalance the pod distribution if necessary. Since the cluster often recovers on its own, we'll focus on checking the current state and ensuring an optimal distribution of pods. -Use the following [script](https://github.com/VAR::MANIFESTS_OWNER/VAR::MANIFESTS_REPOSITORY/tree/VAR::MANIFESTS_REF/manifests/modules/observability/resiliency/scripts/verify-cluster.sh) to verify the cluster state and rebalance pods: +First let's ensure all nodes are in the `Ready` state: ```bash timeout=300 wait=30 -$ $SCRIPT_DIR/verify-cluster.sh - -==== Final Pod Distribution ==== - -------us-west-2a------ - ip-10-42-127-82.us-west-2.compute.internal: - ui-6dfb84cf67-vwk4x 1/1 Running 0 25s - -------us-west-2b------ - ip-10-42-133-195.us-west-2.compute.internal: - ui-6dfb84cf67-2rb6s 1/1 Running 0 27s - ui-6dfb84cf67-dk495 1/1 Running 0 27s +$ EXPECTED_NODES=3 && while true; do ready_nodes=$(kubectl get nodes --no-headers | grep " Ready" | wc -l); if [ "$ready_nodes" -eq "$EXPECTED_NODES" ]; then echo "All $EXPECTED_NODES expected nodes are ready."; echo "Listing the ready nodes:"; kubectl get nodes | grep " Ready"; break; else echo "Waiting for all $EXPECTED_NODES nodes to be ready... (Currently $ready_nodes are ready)"; sleep 10; fi; done +``` -------us-west-2c------ - ip-10-42-186-246.us-west-2.compute.internal: - ui-6dfb84cf67-7bftc 1/1 Running 0 29s - ui-6dfb84cf67-nqgdn 1/1 Running 0 29s +This command counts the total number of nodes in the `Ready` state and continuously checks until all 3 active nodes are ready. 
+Once all nodes are ready, we'll redeploy the pods to ensure they are balanced across the nodes: +```bash timeout=60 wait=30 +$ kubectl delete deployment ui -n ui +$ kubectl apply -k /manifests/modules/observability/resiliency/high-availability/config/ +$ sleep 30 +$ timeout 5s $SCRIPT_DIR/get-pods-by-az.sh | head -n 30 ``` -This script will: +These commands perform the following actions: -- Wait for nodes to come back online -- Count the number of nodes and ui pods -- Check if the pods are evenly distributed across the nodes +1. Delete the existing ui deployment. +2. Reapply the configuration to create a new deployment. +3. Use the `get-pods-by-az.sh` script to check the distribution of pods across availability zones. ## Verify Retail Store Availability After simulating the node failure, we can verify that the retail store application remains accessible. Use the following command to check its availability: -```bash timeout=600 wait=30 +```bash timeout=900 wait=30 $ wait-for-lb $(kubectl get ingress -n ui -o jsonpath='{.items[0].status.loadBalancer.ingress[0].hostname}') + Waiting for k8s-ui-ui-5ddc3ba496-721427594.us-west-2.elb.amazonaws.com... You can now access http://k8s-ui-ui-5ddc3ba496-721427594.us-west-2.elb.amazonaws.com ``` diff --git a/website/docs/observability/high-availability/04-node-failure-partial-fis.md b/website/docs/observability/high-availability/04-node-failure-partial-fis.md index 4ca8d6c4b..7ca211192 100644 --- a/website/docs/observability/high-availability/04-node-failure-partial-fis.md +++ b/website/docs/observability/high-availability/04-node-failure-partial-fis.md @@ -1,11 +1,9 @@ --- title: "Simulating Partial Node Failure with FIS" -sidebar_position: 5 +sidebar_position: 4 description: "Simulate a partial node failures in your Kubernetes environment using AWS Fault Injection Simulator to test application resiliency." --- -# Simulating Partial Node Failure with FIS - ## AWS Fault Injection Simulator (FIS) Overview AWS Fault Injection Simulator (FIS) is a fully managed service that enables you to perform controlled fault injection experiments on your AWS workloads. 
FIS allows you to simulate various failure scenarios, which is crucial for: @@ -30,7 +28,8 @@ For more information on AWS FIS, check out: - [What is AWS Fault Injection Service?](https://docs.aws.amazon.com/fis/latest/userguide/what-is.html) - [AWS Fault Injection Simulator Console](https://console.aws.amazon.com/fis/home) - [AWS Systems Manager, Automation](https://console.aws.amazon.com/systems-manager/automation/executions) - ::: + +::: ## Experiment Details @@ -57,7 +56,8 @@ $ NODE_EXP_ID=$(aws fis create-experiment-template --cli-input-json '{"descripti Execute the FIS experiment to simulate the node failure and monitor the response: ```bash timeout=240 wait=30 -$ aws fis start-experiment --experiment-template-id $NODE_EXP_ID --output json && SECONDS=0; while [ $SECONDS -lt 180 ]; do clear; $SCRIPT_DIR/get-pods-by-az.sh; sleep 1; done +$ aws fis start-experiment --experiment-template-id $NODE_EXP_ID --output json && $SCRIPT_DIR/node-failure.sh && timeout 180s $SCRIPT_DIR/get-pods-by-az.sh + ------us-west-2a------ ip-10-42-127-82.us-west-2.compute.internal: ui-6dfb84cf67-s6kw4 1/1 Running 0 2m16s @@ -70,6 +70,7 @@ $ aws fis start-experiment --experiment-template-id $NODE_EXP_ID --output json & ui-6dfb84cf67-29xtf 1/1 Running 0 79s ui-6dfb84cf67-68hbw 1/1 Running 0 79s ui-6dfb84cf67-plv9f 1/1 Running 0 79s + ``` This command triggers the node failure and monitors the pods for 3 minutes, allowing you to observe how the cluster responds to losing a significant portion of its capacity. @@ -85,23 +86,12 @@ Your retail url should stay operational unlike the node failure without FIS. :::note To verify nodes and rebalance pods, you can run: -```bash timeout=240 wait=30 -$ $SCRIPT_DIR/verify-cluster.sh -==== Final Pod Distribution ==== - -------us-west-2a------ - ip-10-42-127-82.us-west-2.compute.internal: - ui-6dfb84cf67-v2xj6 1/1 Running 0 14s - -------us-west-2b------ - ip-10-42-148-187.us-west-2.compute.internal: - ui-6dfb84cf67-4xq4n 1/1 Running 0 16s - ui-6dfb84cf67-56d6d 1/1 Running 0 16s - -------us-west-2c------ - ip-10-42-180-16.us-west-2.compute.internal: - ui-6dfb84cf67-86mpz 1/1 Running 0 18s - ui-6dfb84cf67-nhx4j 1/1 Running 0 18s +```bash timeout=300 wait=30 +$ EXPECTED_NODES=3 && while true; do ready_nodes=$(kubectl get nodes --no-headers | grep " Ready" | wc -l); if [ "$ready_nodes" -eq "$EXPECTED_NODES" ]; then echo "All $EXPECTED_NODES expected nodes are ready."; echo "Listing the ready nodes:"; kubectl get nodes | grep " Ready"; break; else echo "Waiting for all $EXPECTED_NODES nodes to be ready... (Currently $ready_nodes are ready)"; sleep 10; fi; done +$ kubectl delete deployment ui -n ui +$ kubectl apply -k /manifests/modules/observability/resiliency/high-availability/config/ +$ sleep 30 +$ timeout 5s $SCRIPT_DIR/get-pods-by-az.sh | head -n 30 ``` ::: @@ -110,8 +100,9 @@ $ $SCRIPT_DIR/verify-cluster.sh Ensure that your retail store application remains operational throughout the partial node failure. Use the following command to check its availability: -```bash timeout=600 wait=30 +```bash timeout=900 wait=30 $ wait-for-lb $(kubectl get ingress -n ui -o jsonpath='{.items[0].status.loadBalancer.ingress[0].hostname}') + Waiting for k8s-ui-ui-5ddc3ba496-721427594.us-west-2.elb.amazonaws.com... 
You can now access http://k8s-ui-ui-5ddc3ba496-721427594.us-west-2.elb.amazonaws.com ``` diff --git a/website/docs/observability/high-availability/05-node-failure-complete-fis.md b/website/docs/observability/high-availability/05-node-failure-complete-fis.md index 722341fd0..4bc755886 100644 --- a/website/docs/observability/high-availability/05-node-failure-complete-fis.md +++ b/website/docs/observability/high-availability/05-node-failure-complete-fis.md @@ -1,11 +1,9 @@ --- title: "Simulating Complete Node Failure with FIS" -sidebar_position: 6 +sidebar_position: 5 description: "Demonstrates the impact of a complete node failure on a Kubernetes environment using AWS Fault Injection Simulator." --- -# Simulating Complete Node Failure with FIS - ## Overview This experiment extends our previous partial node failure test to simulate a complete failure of all nodes in our EKS cluster. This is essentially a cluster failure. It demonstrates how AWS Fault Injection Simulator (FIS) can be used to test extreme scenarios and validate your system's resilience under catastrophic conditions. @@ -31,7 +29,8 @@ $ FULL_NODE_EXP_ID=$(aws fis create-experiment-template --cli-input-json '{"desc Execute the FIS experiment and monitor the cluster's response: ```bash timeout=420 wait=30 -$ aws fis start-experiment --experiment-template-id $FULL_NODE_EXP_ID --output json && SECONDS=0; while [ $SECONDS -lt 360 ]; do clear; $SCRIPT_DIR/get-pods-by-az.sh; sleep 1; done +$ aws fis start-experiment --experiment-template-id $FULL_NODE_EXP_ID --output json && $SCRIPT_DIR/node-failure.sh && timeout 360s $SCRIPT_DIR/get-pods-by-az.sh + ------us-west-2a------ ip-10-42-106-250.us-west-2.compute.internal: No resources found in ui namespace. @@ -60,23 +59,12 @@ Due to the severity of the experiment, the retail store url will not stay operat :::note To verify nodes and rebalance pods, you can run: -```bash timeout=240 wait=30 -$ $SCRIPT_DIR/verify-cluster.sh -==== Final Pod Distribution ==== - -------us-west-2a------ - ip-10-42-106-250.us-west-2.compute.internal: - ui-6dfb84cf67-4fjhh 1/1 Running 0 15s - ui-6dfb84cf67-gkrtn 1/1 Running 0 14s - -------us-west-2b------ - ip-10-42-141-133.us-west-2.compute.internal: - ui-6dfb84cf67-7qnkz 1/1 Running 0 16s - ui-6dfb84cf67-n58b9 1/1 Running 0 16s - -------us-west-2c------ - ip-10-42-179-59.us-west-2.compute.internal: - ui-6dfb84cf67-lvdc2 1/1 Running 0 18s +```bash timeout=300 wait=30 +$ EXPECTED_NODES=3 && while true; do ready_nodes=$(kubectl get nodes --no-headers | grep " Ready" | wc -l); if [ "$ready_nodes" -eq "$EXPECTED_NODES" ]; then echo "All $EXPECTED_NODES expected nodes are ready."; echo "Listing the ready nodes:"; kubectl get nodes | grep " Ready"; break; else echo "Waiting for all $EXPECTED_NODES nodes to be ready... (Currently $ready_nodes are ready)"; sleep 10; fi; done +$ kubectl delete deployment ui -n ui +$ kubectl apply -k /manifests/modules/observability/resiliency/high-availability/config/ +$ sleep 30 +$ timeout 5s $SCRIPT_DIR/get-pods-by-az.sh | head -n 30 ``` ::: @@ -85,8 +73,9 @@ $ $SCRIPT_DIR/verify-cluster.sh Check the retail store application's recovery: -```bash timeout=600 wait=30 +```bash timeout=900 wait=30 $ wait-for-lb $(kubectl get ingress -n ui -o jsonpath='{.items[0].status.loadBalancer.ingress[0].hostname}') + Waiting for k8s-ui-ui-5ddc3ba496-721427594.us-west-2.elb.amazonaws.com... 
You can now access http://k8s-ui-ui-5ddc3ba496-721427594.us-west-2.elb.amazonaws.com ``` diff --git a/website/docs/observability/high-availability/06-az-setup.md b/website/docs/observability/high-availability/06-az-setup.md index 04a3bbb83..21b8c83a7 100644 --- a/website/docs/observability/high-availability/06-az-setup.md +++ b/website/docs/observability/high-availability/06-az-setup.md @@ -1,6 +1,6 @@ --- title: "AZ Failure Experiment Setup" -sidebar_position: 7 +sidebar_position: 6 description: "Scale your application to two instances and prepare for an AZ failure simulation experiment." --- @@ -17,7 +17,8 @@ $ aws autoscaling update-auto-scaling-group \ --max-size 6 $ sleep 60 $ kubectl scale deployment ui --replicas=9 -n ui -$ $SCRIPT_DIR/get-pods-by-az.sh +$ timeout 5s $SCRIPT_DIR/get-pods-by-az.sh | head -n 30 + ------us-west-2a------ ip-10-42-100-4.us-west-2.compute.internal: ui-6dfb84cf67-xbbj4 0/1 ContainerCreating 0 1s @@ -49,6 +50,7 @@ Before starting the experiment, set up a synthetic canary for heartbeat monitori ```bash wait=15 $ export BUCKET_NAME="eks-workshop-canary-artifacts-$(date +%s)" $ aws s3 mb s3://$BUCKET_NAME --region $AWS_REGION + make_bucket: eks-workshop-canary-artifacts-1724131402 ``` @@ -62,6 +64,7 @@ Place this canary blueprint into the bucket: ```bash wait=15 $ $SCRIPT_DIR/create-blueprint.sh + upload: ./canary.zip to s3://eks-workshop-canary-artifacts-1724131402/canary-scripts/canary.zip Canary script has been zipped and uploaded to s3://eks-workshop-canary-artifacts-1724131402/canary-scripts/canary.zip The script is configured to check the URL: http://k8s-ui-ui-5ddc3ba496-721427594.us-west-2.elb.amazonaws.com @@ -78,7 +81,7 @@ $ aws synthetics create-canary \ --schedule "Expression=rate(1 minute)" \ --code "Handler=canary.handler,S3Bucket=$BUCKET_NAME,S3Key=canary-scripts/canary.zip" \ --region $AWS_REGION -$ sleep 45 +$ aws synthetics wait canary-ready --name eks-workshop-canary --region $AWS_REGION $ aws synthetics start-canary --name eks-workshop-canary --region $AWS_REGION $ aws cloudwatch put-metric-alarm \ --alarm-name "eks-workshop-canary-alarm" \ diff --git a/website/docs/observability/high-availability/07-az-failure.md b/website/docs/observability/high-availability/07-az-failure.md index 97d1043b3..94c6274c1 100644 --- a/website/docs/observability/high-availability/07-az-failure.md +++ b/website/docs/observability/high-availability/07-az-failure.md @@ -1,11 +1,9 @@ --- title: "Simulating AZ Failure" -sidebar_position: 8 +sidebar_position: 7 description: "This experiment simulates an Availability Zone failure to test the resilience of your Kubernetes environment hosted on AWS EKS." --- -# Simulating AZ Failure - ## Overview This repeatable experiment simulates an Availability Zone (AZ) failure, demonstrating the resilience of your application when faced with significant infrastructure disruptions. By leveraging AWS Fault Injection Simulator (FIS) and additional AWS services, we'll test how well your system maintains functionality when an entire AZ becomes unavailable. 
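Before starting the experiment, it can help to capture a baseline of where nodes and pods currently sit across the Availability Zones, so you have something to compare against once the disruption begins. The commands below are an illustrative pre-check rather than part of the module's scripts; they assume the `ui` namespace and the standard `topology.kubernetes.io/zone` node label used elsewhere in this lab.

```bash
# Illustrative baseline check before the AZ failure experiment
$ kubectl get nodes -L topology.kubernetes.io/zone   # which AZ each node is in
$ kubectl get pods -n ui -o wide                     # which node each ui pod runs on
```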
@@ -23,7 +21,8 @@ $ ZONE_EXP_ID=$(aws fis create-experiment-template --cli-input-json '{"descripti Execute the FIS experiment to simulate the AZ failure: ```bash timeout=560 wait=30 -$ aws fis start-experiment --experiment-template-id $ZONE_EXP_ID --output json && SECONDS=0; while [ $SECONDS -lt 480 ]; do clear; $SCRIPT_DIR/get-pods-by-az.sh; sleep 1; done +$ aws fis start-experiment --experiment-template-id $ZONE_EXP_ID --output json && $SCRIPT_DIR/node-failure.sh && timeout 480s $SCRIPT_DIR/get-pods-by-az.sh + ------us-west-2a------ ip-10-42-100-4.us-west-2.compute.internal: ui-6dfb84cf67-h57sp 1/1 Running 0 12m @@ -60,32 +59,14 @@ During the experiment, you should observe the following sequence of events: During this time, the retail URL will stay available, showing how resilient EKS is to AZ failures. :::note -To verify clusters and rebalance pods, you can run: - -```bash timeout=240 wait=30 -$ $SCRIPT_DIR/AZ-verify-clusters.sh -==== Final Pod Distribution ==== - -------us-west-2a------ - ip-10-42-100-4.us-west-2.compute.internal: - ui-6dfb84cf67-lwd86 1/1 Running 0 16s - ip-10-42-111-144.us-west-2.compute.internal: - ui-6dfb84cf67-hfrcf 1/1 Running 0 17s - ui-6dfb84cf67-qdr4s 1/1 Running 0 17s - -------us-west-2b------ - ip-10-42-141-243.us-west-2.compute.internal: - ui-6dfb84cf67-dxtg4 1/1 Running 0 19s - ip-10-42-150-255.us-west-2.compute.internal: - ui-6dfb84cf67-jvvg6 1/1 Running 0 20s - ui-6dfb84cf67-tmbzc 1/1 Running 0 20s - -------us-west-2c------ - ip-10-42-164-250.us-west-2.compute.internal: - ui-6dfb84cf67-k5mn8 1/1 Running 0 23s - ui-6dfb84cf67-zbm8j 1/1 Running 0 23s - ip-10-42-178-108.us-west-2.compute.internal: - ui-6dfb84cf67-svwqp 1/1 Running 0 24s +To verify nodes and rebalance pods, you can run: + +```bash timeout=300 wait=30 +$ EXPECTED_NODES=6 && while true; do ready_nodes=$(kubectl get nodes --no-headers | grep " Ready" | wc -l); if [ "$ready_nodes" -eq "$EXPECTED_NODES" ]; then echo "All $EXPECTED_NODES expected nodes are ready."; echo "Listing the ready nodes:"; kubectl get nodes | grep " Ready"; break; else echo "Waiting for all $EXPECTED_NODES nodes to be ready... (Currently $ready_nodes are ready)"; sleep 10; fi; done +$ kubectl delete deployment ui -n ui +$ kubectl apply -k /manifests/modules/observability/resiliency/high-availability/config/ +$ sleep 30 +$ timeout 5s $SCRIPT_DIR/get-pods-by-az.sh | head -n 30 ``` ::: @@ -94,8 +75,9 @@ $ $SCRIPT_DIR/AZ-verify-clusters.sh After the experiment, verify that your application remains operational despite the simulated AZ failure: -```bash timeout=600 wait=30 +```bash timeout=900 wait=30 $ wait-for-lb $(kubectl get ingress -n ui -o jsonpath='{.items[0].status.loadBalancer.ingress[0].hostname}') + Waiting for k8s-ui-ui-5ddc3ba496-721427594.us-west-2.elb.amazonaws.com...
You can now access http://k8s-ui-ui-5ddc3ba496-721427594.us-west-2.elb.amazonaws.com ``` diff --git a/website/docs/observability/high-availability/index.md b/website/docs/observability/high-availability/index.md index 3d9254faa..9873c4990 100644 --- a/website/docs/observability/high-availability/index.md +++ b/website/docs/observability/high-availability/index.md @@ -1,10 +1,28 @@ --- -title: "Resiliency" +title: "Chaos Engineering with EKS" sidebar_position: 70 sidebar_custom_props: { "module": true } weight: 10 --- +:::tip Before you start +Prepare your environment for this section: + +```bash timeout=900 wait=30 +$ kubectl delete deployment ui -n ui +$ prepare-environment observability/resiliency +``` + +This will make the following changes to your lab environment: + +- Create the ingress load balancer +- Create RBAC roles and RoleBindings +- Install the AWS Load Balancer Controller +- Create an IAM role for AWS Fault Injection Simulator (FIS) + +You can view the Terraform that applies these changes [here](https://github.com/VAR::MANIFESTS_OWNER/VAR::MANIFESTS_REPOSITORY/tree/VAR::MANIFESTS_REF/manifests/modules/observability/resiliency/.workshop/terraform). +::: + ## What is Resiliency? Resiliency in cloud computing refers to a system's ability to maintain acceptable performance levels in the face of faults and challenges to normal operation. It encompasses: @@ -24,13 +42,17 @@ Amazon EKS provides a managed Kubernetes platform, but it's still crucial to des 4. **Cost Efficiency**: Avoid overprovisioning by building systems that can handle variable loads and partial failures. 5. **Compliance**: Meet regulatory requirements for uptime and data protection in various industries. -## Resiliency Scenarios Covered in this Chapter +## Lab Overview and Resiliency Scenarios -We'll explore several scenarios to show resiliency by by simulating and responding to: +In this lab, we'll explore various high availability scenarios and test the resilience of your EKS environment. Through a series of experiments, you'll gain hands-on experience in handling different types of failures and understanding how your Kubernetes cluster responds to these challenges. -1. Pod Failures -2. Node Failures -3. Availability Zone Failures +We'll simulate and respond to the following scenarios (a quick way to keep an eye on pod placement while running them is sketched after this list): + +1. **Pod Failures**: Using ChaosMesh to test your application's resilience to individual pod failures. +2. **Node Failures**: Testing how the cluster recovers when worker nodes are lost. + - Without AWS Fault Injection Simulator: Manually simulating a node failure to observe Kubernetes' self-healing capabilities. + - With AWS Fault Injection Simulator: Leveraging AWS Fault Injection Simulator for partial and complete node failure scenarios. +3. **Availability Zone Failure**: Simulating the loss of an entire AZ to validate your multi-AZ deployment strategy.
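All of these scenarios lean on the module's `get-pods-by-az.sh` helper to show where pods are scheduled. As a sketch of how you might follow along, you can keep a snapshot of pod placement handy in a second terminal while running each experiment (this assumes `$SCRIPT_DIR` points at the module's scripts directory, as it does in the steps that follow):

```bash
# Snapshot of pod placement per AZ; re-run between experiments to watch recovery
$ timeout 10s $SCRIPT_DIR/get-pods-by-az.sh | head -n 30
```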
## What You'll Learn @@ -41,6 +63,13 @@ By the end of this chapter, you'll be able to: - Observe the self-healing capabilities of Kubernetes in action - Gain practical experience in chaos engineering for EKS environments +These experiments will help you understand: + +- How Kubernetes handles different types of failures +- The importance of proper resource allocation and pod distribution +- The effectiveness of your monitoring and alerting systems +- How to improve your application's fault tolerance and recovery strategies + ## Tools and Technologies Throughout this chapter, we'll be using: @@ -59,6 +88,13 @@ Chaos engineering is the practice of intentionally introducing controlled failur 3. Improve your incident response procedures 4. Foster a culture of resilience within your organization +By the end of this lab, you'll have a comprehensive understanding of your EKS environment's high availability capabilities and areas for potential improvement. + :::info -For more information on AWS Resiliency features in greater depth, we recommend checking out [Operating resilient workloads on Amazon EKS](https://aws.amazon.com/blogs/containers/operating-resilient-workloads-on-amazon-eks/) -::: +For more information on AWS Resiliency features in greater depth, we recommend checking out: + +- [Ingress Load Balancer](/docs/fundamentals/exposing/ingress/) +- [Integrating with Kubernetes RBAC](/docs/security/cluster-access-management/kubernetes-rbac) +- [AWS Fault Injection Simulator](https://aws.amazon.com/fis/) +- [Operating resilient workloads on Amazon EKS](https://aws.amazon.com/blogs/containers/operating-resilient-workloads-on-amazon-eks/) + ::: From fa4198cb617dc41ccbc379632af19973b3c236c1 Mon Sep 17 00:00:00 2001 From: Sai Vennam Date: Fri, 27 Sep 2024 12:47:25 -0500 Subject: [PATCH 10/11] Update cluster.yaml --- cluster/eksctl/cluster.yaml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cluster/eksctl/cluster.yaml b/cluster/eksctl/cluster.yaml index b038c2441..dbbaf5cc1 100644 --- a/cluster/eksctl/cluster.yaml +++ b/cluster/eksctl/cluster.yaml @@ -31,8 +31,7 @@ managedNodeGroups: maxSize: 6 instanceType: m5.large privateNetworking: true - # had to remove use make create - #releaseVersion: "1.30.0-20240625" + releaseVersion: "1.30.0-20240625" updateConfig: maxUnavailablePercentage: 50 labels: From c530e9a79c20717591d2b7f169fa6666e54c5f4c Mon Sep 17 00:00:00 2001 From: Divya Gupta Date: Fri, 27 Sep 2024 14:11:34 -0400 Subject: [PATCH 11/11] AZ reblancing changes --- .../config/scale_and_affinity_patch.yaml | 2 +- .../resiliency/scripts/get-pods-by-az.sh | 9 ++++-- .../high-availability/01-scale.md | 11 +++---- .../high-availability/02-pod-failure.md | 4 +-- .../03-node-failure-no-fis.md | 29 ++++++++++++------- .../04-node-failure-partial-fis.md | 28 +++++++++++------- .../05-node-failure-complete-fis.md | 25 ++++++++++------ .../high-availability/06-az-setup.md | 15 +++++----- .../high-availability/07-az-failure.md | 25 ++++++++++------ .../observability/high-availability/index.md | 8 +++-- 10 files changed, 96 insertions(+), 60 deletions(-) diff --git a/manifests/modules/observability/resiliency/high-availability/config/scale_and_affinity_patch.yaml b/manifests/modules/observability/resiliency/high-availability/config/scale_and_affinity_patch.yaml index 3637434f5..29c93283f 100644 --- a/manifests/modules/observability/resiliency/high-availability/config/scale_and_affinity_patch.yaml +++ 
b/manifests/modules/observability/resiliency/high-availability/config/scale_and_affinity_patch.yaml @@ -24,7 +24,7 @@ spec: operator: In values: - ui - topologyKey: "kubernetes.io/hostname" + topologyKey: "topology.kubernetes.io/zone" topologySpreadConstraints: - maxSkew: 1 topologyKey: topology.kubernetes.io/zone diff --git a/manifests/modules/observability/resiliency/scripts/get-pods-by-az.sh b/manifests/modules/observability/resiliency/scripts/get-pods-by-az.sh index 3306c9b0f..105a593ae 100755 --- a/manifests/modules/observability/resiliency/scripts/get-pods-by-az.sh +++ b/manifests/modules/observability/resiliency/scripts/get-pods-by-az.sh @@ -17,6 +17,7 @@ clear_screen() { # Function to generate the output generate_output() { + echo -e '\n\n\n' for az in a b c do AZ=$REGION$az @@ -30,10 +31,11 @@ generate_output() { done echo "" done + echo -e '\n\n\n' } # Initial clear screen -clear_screen +# clear_screen # Main loop while true; do @@ -41,9 +43,10 @@ while true; do generate_output > temp_output.txt # Clear screen and display the new output - clear_screen + # clear_screen cat temp_output.txt + # clear_screen # Wait before next update sleep 1 -done \ No newline at end of file +done diff --git a/website/docs/observability/high-availability/01-scale.md b/website/docs/observability/high-availability/01-scale.md index f1d5af630..ed9046dce 100644 --- a/website/docs/observability/high-availability/01-scale.md +++ b/website/docs/observability/high-availability/01-scale.md @@ -1,6 +1,6 @@ --- title: "Lab Setup: Chaos Mesh, Scaling, and Pod affinity" -sidebar_position: 1 +sidebar_position: 90 description: "Learn how to scale your pods, add Pod Anti-Affinity configurations, and use a helper script to visualize pod distribution." --- @@ -12,13 +12,14 @@ To enhance our cluster's resilience testing capabilities, we'll install Chaos Me Let's install Chaos Mesh in our cluster using Helm: -```bash timeout= 240 wait=30 +```bash timeout=240 $ helm repo add chaos-mesh https://charts.chaos-mesh.org $ helm upgrade --install chaos-mesh chaos-mesh/chaos-mesh \ --namespace chaos-mesh \ --create-namespace \ --version 2.5.1 \ --set dashboard.create=true \ + --wait Release "chaos-mesh" does not exist. Installing it now. NAME: chaos-mesh @@ -44,7 +45,7 @@ Deployment/ui Apply the changes using Kustomize patch and [Kustomization file](https://github.com/VAR::MANIFESTS_OWNER/VAR::MANIFESTS_REPOSITORY/tree/VAR::MANIFESTS_REF/manifests/modules/observability/resiliency/high-availability/config/kustomization.yaml): -```bash timeout=120 wait=30 +```bash timeout=120 $ kubectl delete deployment ui -n ui $ kubectl apply -k /manifests/modules/observability/resiliency/high-availability/config/ ``` @@ -53,7 +54,7 @@ $ kubectl apply -k /manifests/modules/observability/resiliency/high-availability After applying these changes, it's important to verify that your retail store is accessible: -```bash timeout=900 wait=30 +```bash timeout=900 $ wait-for-lb $(kubectl get ingress -n ui -o jsonpath='{.items[0].status.loadBalancer.ingress[0].hostname}') Waiting for k8s-ui-ui-5ddc3ba496-721427594.us-west-2.elb.amazonaws.com... 
@@ -75,7 +76,7 @@ The `get-pods-by-az.sh` script helps visualize the distribution of Kubernetes po To run the script and see the distribution of pods across availability zones, execute: ```bash -$ timeout 5s $SCRIPT_DIR/get-pods-by-az.sh | head -n 30 +$ timeout 10s $SCRIPT_DIR/get-pods-by-az.sh | head -n 30 ------us-west-2a------ ip-10-42-127-82.us-west-2.compute.internal: diff --git a/website/docs/observability/high-availability/02-pod-failure.md b/website/docs/observability/high-availability/02-pod-failure.md index 174e1fd80..afa1d3d53 100644 --- a/website/docs/observability/high-availability/02-pod-failure.md +++ b/website/docs/observability/high-availability/02-pod-failure.md @@ -1,6 +1,6 @@ --- title: "Simulating Pod Failure" -sidebar_position: 2 +sidebar_position: 110 description: "Simulate pod failure in your environment using ChaosMesh to test the resiliency of your application." --- @@ -81,7 +81,7 @@ This will show you the status, IP addresses, and nodes for each pod in the `ui` An essential aspect of this experiment is to ensure that your retail store application remains operational throughout the pod failure and recovery process. To verify the availability of the retail store, use the following command to fetch and access the store's URL: -```bash timeout=900 wait=30 +```bash timeout=900 $ wait-for-lb $(kubectl get ingress -n ui -o jsonpath='{.items[0].status.loadBalancer.ingress[0].hostname}') Waiting for k8s-ui-ui-5ddc3ba496-721427594.us-west-2.elb.amazonaws.com... diff --git a/website/docs/observability/high-availability/03-node-failure-no-fis.md b/website/docs/observability/high-availability/03-node-failure-no-fis.md index 817c8f75c..bc8d36ce0 100644 --- a/website/docs/observability/high-availability/03-node-failure-no-fis.md +++ b/website/docs/observability/high-availability/03-node-failure-no-fis.md @@ -1,6 +1,6 @@ --- title: "Simulating Node Failure without FIS" -sidebar_position: 3 +sidebar_position: 130 description: "Manually simulate a node failure in your Kubernetes environment to test the resilience of your applications without using AWS FIS." --- @@ -20,7 +20,7 @@ It's important to note that this experiment is repeatable, allowing you to run i To simulate the node failure and monitor its effects, run the following command: -```bash timeout=240 wait=30 +```bash timeout=240 $ $SCRIPT_DIR/node-failure.sh && timeout 180s $SCRIPT_DIR/get-pods-by-az.sh ------us-west-2a------ @@ -55,7 +55,7 @@ While waiting for the node to finish coming back online, we will verify the clus First let's ensure all nodes are in the `Ready` state: -```bash timeout=300 wait=30 +```bash timeout=300 $ EXPECTED_NODES=3 && while true; do ready_nodes=$(kubectl get nodes --no-headers | grep " Ready" | wc -l); if [ "$ready_nodes" -eq "$EXPECTED_NODES" ]; then echo "All $EXPECTED_NODES expected nodes are ready."; echo "Listing the ready nodes:"; kubectl get nodes | grep " Ready"; break; else echo "Waiting for all $EXPECTED_NODES nodes to be ready... 
(Currently $ready_nodes are ready)"; sleep 10; fi; done ``` @@ -63,24 +63,31 @@ This command counts the total number of nodes in the `Ready` state and continuou Once all nodes are ready, we'll redeploy the pods to ensure they are balanced across the nodes: -```bash timeout=60 wait=30 -$ kubectl delete deployment ui -n ui -$ kubectl apply -k /manifests/modules/observability/resiliency/high-availability/config/ -$ sleep 30 -$ timeout 5s $SCRIPT_DIR/get-pods-by-az.sh | head -n 30 +```bash timeout=900 +$ kubectl delete pod --grace-period=0 --force -n ui -l app.kubernetes.io/component=service +$ kubectl delete pod --grace-period=0 --force -n orders -l app.kubernetes.io/component=service +$ kubectl delete pod --grace-period=0 --force -n carts -l app.kubernetes.io/component=service +$ kubectl delete pod --grace-period=0 --force -n checkout -l app.kubernetes.io/component=service +$ kubectl delete pod --grace-period=0 --force -n catalog -l app.kubernetes.io/component=service +$ kubectl rollout status -n ui deployment/ui --timeout 30s +$ kubectl rollout status -n orders deployment/orders --timeout 60s +$ kubectl rollout status -n catalog deployment/catalog --timeout 30s +$ kubectl rollout status -n checkout deployment/checkout --timeout 30s +$ kubectl rollout status -n carts deployment/carts --timeout 30s +$ timeout 10s $SCRIPT_DIR/get-pods-by-az.sh | head -n 30 ``` These commands perform the following actions: -1. Delete the existing ui deployment. -2. Reapply the configuration to create a new deployment. +1. Forcefully delete the service pods in each application namespace (ui, orders, carts, checkout, and catalog). +2. Wait for each deployment to roll out replacement pods. 3. Use the `get-pods-by-az.sh` script to check the distribution of pods across availability zones. ## Verify Retail Store Availability After simulating the node failure, we can verify that the retail store application remains accessible. Use the following command to check its availability: -```bash timeout=900 wait=30 +```bash timeout=900 $ wait-for-lb $(kubectl get ingress -n ui -o jsonpath='{.items[0].status.loadBalancer.ingress[0].hostname}') Waiting for k8s-ui-ui-5ddc3ba496-721427594.us-west-2.elb.amazonaws.com... diff --git a/website/docs/observability/high-availability/04-node-failure-partial-fis.md b/website/docs/observability/high-availability/04-node-failure-partial-fis.md index 7ca211192..58866a0fa 100644 --- a/website/docs/observability/high-availability/04-node-failure-partial-fis.md +++ b/website/docs/observability/high-availability/04-node-failure-partial-fis.md @@ -1,6 +1,6 @@ --- title: "Simulating Partial Node Failure with FIS" -sidebar_position: 4 +sidebar_position: 150 description: "Simulate partial node failures in your Kubernetes environment using AWS Fault Injection Simulator to test application resiliency."
--- @@ -49,14 +49,15 @@ Create a new AWS FIS experiment template to simulate the partial node failure: ```bash $ NODE_EXP_ID=$(aws fis create-experiment-template --cli-input-json '{"description":"NodeDeletion","targets":{"Nodegroups-Target-1":{"resourceType":"aws:eks:nodegroup","resourceTags":{"eksctl.cluster.k8s.io/v1alpha1/cluster-name":"eks-workshop"},"selectionMode":"COUNT(2)"}},"actions":{"nodedeletion":{"actionId":"aws:eks:terminate-nodegroup-instances","parameters":{"instanceTerminationPercentage":"66"},"targets":{"Nodegroups":"Nodegroups-Target-1"}}},"stopConditions":[{"source":"none"}],"roleArn":"'$FIS_ROLE_ARN'","tags":{"ExperimentSuffix": "'$RANDOM_SUFFIX'"}}' --output json | jq -r '.experimentTemplate.id') + ``` ## Running the Experiment Execute the FIS experiment to simulate the node failure and monitor the response: -```bash timeout=240 wait=30 -$ aws fis start-experiment --experiment-template-id $NODE_EXP_ID --output json && $SCRIPT_DIR/node-failure.sh && timeout 180s $SCRIPT_DIR/get-pods-by-az.sh +```bash timeout=240 +$ aws fis start-experiment --experiment-template-id $NODE_EXP_ID --output json && timeout 240s $SCRIPT_DIR/get-pods-by-az.sh ------us-west-2a------ ip-10-42-127-82.us-west-2.compute.internal: @@ -73,7 +74,7 @@ $ aws fis start-experiment --experiment-template-id $NODE_EXP_ID --output json & ``` -This command triggers the node failure and monitors the pods for 3 minutes, allowing you to observe how the cluster responds to losing a significant portion of its capacity. +This command triggers the node failure and monitors the pods for 4 minutes, allowing you to observe how the cluster responds to losing a significant portion of its capacity. During the experiment, you should observe the following: @@ -86,12 +87,19 @@ Your retail url should stay operational unlike the node failure without FIS. :::note To verify nodes and rebalance pods, you can run: -```bash timeout=300 wait=30 +```bash timeout=900 $ EXPECTED_NODES=3 && while true; do ready_nodes=$(kubectl get nodes --no-headers | grep " Ready" | wc -l); if [ "$ready_nodes" -eq "$EXPECTED_NODES" ]; then echo "All $EXPECTED_NODES expected nodes are ready."; echo "Listing the ready nodes:"; kubectl get nodes | grep " Ready"; break; else echo "Waiting for all $EXPECTED_NODES nodes to be ready... 
(Currently $ready_nodes are ready)"; sleep 10; fi; done -$ kubectl delete deployment ui -n ui -$ kubectl apply -k /manifests/modules/observability/resiliency/high-availability/config/ -$ sleep 30 -$ timeout 5s $SCRIPT_DIR/get-pods-by-az.sh | head -n 30 +$ kubectl delete pod --grace-period=0 -n ui -l app.kubernetes.io/component=service +$ kubectl delete pod --grace-period=0 -n orders -l app.kubernetes.io/component=service +$ kubectl delete pod --grace-period=0 -n carts -l app.kubernetes.io/component=service +$ kubectl delete pod --grace-period=0 -n checkout -l app.kubernetes.io/component=service +$ kubectl delete pod --grace-period=0 -n catalog -l app.kubernetes.io/component=service +$ kubectl rollout status -n ui deployment/ui --timeout 30s +$ kubectl rollout status -n orders deployment/orders --timeout 60s +$ kubectl rollout status -n catalog deployment/catalog --timeout 30s +$ kubectl rollout status -n checkout deployment/checkout --timeout 30s +$ kubectl rollout status -n carts deployment/carts --timeout 30s +$ timeout 10s $SCRIPT_DIR/get-pods-by-az.sh | head -n 30 ``` ::: @@ -100,7 +108,7 @@ $ timeout 5s $SCRIPT_DIR/get-pods-by-az.sh | head -n 30 Ensure that your retail store application remains operational throughout the partial node failure. Use the following command to check its availability: -```bash timeout=900 wait=30 +```bash timeout=900 $ wait-for-lb $(kubectl get ingress -n ui -o jsonpath='{.items[0].status.loadBalancer.ingress[0].hostname}') Waiting for k8s-ui-ui-5ddc3ba496-721427594.us-west-2.elb.amazonaws.com... diff --git a/website/docs/observability/high-availability/05-node-failure-complete-fis.md b/website/docs/observability/high-availability/05-node-failure-complete-fis.md index 4bc755886..6ad4dc844 100644 --- a/website/docs/observability/high-availability/05-node-failure-complete-fis.md +++ b/website/docs/observability/high-availability/05-node-failure-complete-fis.md @@ -1,6 +1,6 @@ --- title: "Simulating Complete Node Failure with FIS" -sidebar_position: 5 +sidebar_position: 170 description: "Demonstrates the impact of a complete node failure on a Kubernetes environment using AWS Fault Injection Simulator." --- @@ -28,8 +28,8 @@ $ FULL_NODE_EXP_ID=$(aws fis create-experiment-template --cli-input-json '{"desc Execute the FIS experiment and monitor the cluster's response: -```bash timeout=420 wait=30 -$ aws fis start-experiment --experiment-template-id $FULL_NODE_EXP_ID --output json && $SCRIPT_DIR/node-failure.sh && timeout 360s $SCRIPT_DIR/get-pods-by-az.sh +```bash timeout=420 +$ aws fis start-experiment --experiment-template-id $FULL_NODE_EXP_ID --output json && timeout 360s $SCRIPT_DIR/get-pods-by-az.sh ------us-west-2a------ ip-10-42-106-250.us-west-2.compute.internal: @@ -59,12 +59,19 @@ Due to the severity of the experiment, the retail store url will not stay operat :::note To verify nodes and rebalance pods, you can run: -```bash timeout=300 wait=30 +```bash timeout=900 $ EXPECTED_NODES=3 && while true; do ready_nodes=$(kubectl get nodes --no-headers | grep " Ready" | wc -l); if [ "$ready_nodes" -eq "$EXPECTED_NODES" ]; then echo "All $EXPECTED_NODES expected nodes are ready."; echo "Listing the ready nodes:"; kubectl get nodes | grep " Ready"; break; else echo "Waiting for all $EXPECTED_NODES nodes to be ready... 
(Currently $ready_nodes are ready)"; sleep 10; fi; done -$ kubectl delete deployment ui -n ui -$ kubectl apply -k /manifests/modules/observability/resiliency/high-availability/config/ -$ sleep 30 -$ timeout 5s $SCRIPT_DIR/get-pods-by-az.sh | head -n 30 +$ kubectl delete pod --grace-period=0 -n ui -l app.kubernetes.io/component=service +$ kubectl delete pod --grace-period=0 -n orders -l app.kubernetes.io/component=service +$ kubectl delete pod --grace-period=0 -n carts -l app.kubernetes.io/component=service +$ kubectl delete pod --grace-period=0 -n checkout -l app.kubernetes.io/component=service +$ kubectl delete pod --grace-period=0 -n catalog -l app.kubernetes.io/component=service +$ kubectl rollout status -n ui deployment/ui --timeout 30s +$ kubectl rollout status -n orders deployment/orders --timeout 60s +$ kubectl rollout status -n catalog deployment/catalog --timeout 30s +$ kubectl rollout status -n checkout deployment/checkout --timeout 30s +$ kubectl rollout status -n carts deployment/carts --timeout 30s +$ timeout 10s $SCRIPT_DIR/get-pods-by-az.sh | head -n 30 ``` ::: @@ -73,7 +80,7 @@ $ timeout 5s $SCRIPT_DIR/get-pods-by-az.sh | head -n 30 Check the retail store application's recovery: -```bash timeout=900 wait=30 +```bash timeout=900 $ wait-for-lb $(kubectl get ingress -n ui -o jsonpath='{.items[0].status.loadBalancer.ingress[0].hostname}') Waiting for k8s-ui-ui-5ddc3ba496-721427594.us-west-2.elb.amazonaws.com... diff --git a/website/docs/observability/high-availability/06-az-setup.md b/website/docs/observability/high-availability/06-az-setup.md index 21b8c83a7..d8f43f25b 100644 --- a/website/docs/observability/high-availability/06-az-setup.md +++ b/website/docs/observability/high-availability/06-az-setup.md @@ -1,6 +1,6 @@ --- title: "AZ Failure Experiment Setup" -sidebar_position: 6 +sidebar_position: 190 description: "Scale your application to two instances and prepare for an AZ failure simulation experiment." --- @@ -8,7 +8,7 @@ description: "Scale your application to two instances and prepare for an AZ fail To see the full impact of an Availability Zone (AZ) failure, let's first scale up to two instances per AZ as well as increase the number of pods up to 9: -```bash timeout=120 wait=30 +```bash timeout=120 $ ASG_NAME=$(aws autoscaling describe-auto-scaling-groups --query "AutoScalingGroups[? Tags[? (Key=='eks:cluster-name') && Value=='eks-workshop']].AutoScalingGroupName" --output text) $ aws autoscaling update-auto-scaling-group \ --auto-scaling-group-name $ASG_NAME \ @@ -17,7 +17,7 @@ $ aws autoscaling update-auto-scaling-group \ --max-size 6 $ sleep 60 $ kubectl scale deployment ui --replicas=9 -n ui -$ timeout 5s $SCRIPT_DIR/get-pods-by-az.sh | head -n 30 +$ timeout 10s $SCRIPT_DIR/get-pods-by-az.sh | head -n 30 ------us-west-2a------ ip-10-42-100-4.us-west-2.compute.internal: @@ -47,7 +47,7 @@ Before starting the experiment, set up a synthetic canary for heartbeat monitori 1. First, create an S3 bucket for the canary artifacts: -```bash wait=15 +```bash wait=30 $ export BUCKET_NAME="eks-workshop-canary-artifacts-$(date +%s)" $ aws s3 mb s3://$BUCKET_NAME --region $AWS_REGION @@ -62,7 +62,7 @@ manifests/modules/observability/resiliency/scripts/create-blueprint.sh Place this canary blueprint into the bucket: -```bash wait=15 +```bash $ $SCRIPT_DIR/create-blueprint.sh upload: ./canary.zip to s3://eks-workshop-canary-artifacts-1724131402/canary-scripts/canary.zip @@ -72,7 +72,7 @@ The script is configured to check the URL: http://k8s-ui-ui-5ddc3ba496-721427594 3. 
Create a synthetic canary with a Cloudwatch alarm: -```bash timeout=120 wait=30 +```bash $ aws synthetics create-canary \ --name eks-workshop-canary \ --artifact-s3-location "s3://$BUCKET_NAME/canary-artifacts/" \ @@ -81,7 +81,8 @@ $ aws synthetics create-canary \ --schedule "Expression=rate(1 minute)" \ --code "Handler=canary.handler,S3Bucket=$BUCKET_NAME,S3Key=canary-scripts/canary.zip" \ --region $AWS_REGION -$ aws synthetics wait canary-ready --name eks-workshop-canary --region $AWS_REGION +$ sleep 40 +$ aws synthetics describe-canaries --name eks-workshop-canary --region $AWS_REGION $ aws synthetics start-canary --name eks-workshop-canary --region $AWS_REGION $ aws cloudwatch put-metric-alarm \ --alarm-name "eks-workshop-canary-alarm" \ diff --git a/website/docs/observability/high-availability/07-az-failure.md b/website/docs/observability/high-availability/07-az-failure.md index 94c6274c1..44c0c416e 100644 --- a/website/docs/observability/high-availability/07-az-failure.md +++ b/website/docs/observability/high-availability/07-az-failure.md @@ -1,6 +1,6 @@ --- title: "Simulating AZ Failure" -sidebar_position: 7 +sidebar_position: 210 description: "This experiment simulates an Availability Zone failure to test the resilience of your Kubernetes environment hosted on AWS EKS." --- @@ -20,8 +20,8 @@ $ ZONE_EXP_ID=$(aws fis create-experiment-template --cli-input-json '{"descripti Execute the FIS experiment to simulate the AZ failure: -```bash timeout=560 wait=30 -$ aws fis start-experiment --experiment-template-id $ZONE_EXP_ID --output json && $SCRIPT_DIR/node-failure.sh && timeout 480s $SCRIPT_DIR/get-pods-by-az.sh +```bash timeout=560 +$ aws fis start-experiment --experiment-template-id $ZONE_EXP_ID --output json && timeout 480s $SCRIPT_DIR/get-pods-by-az.sh ------us-west-2a------ ip-10-42-100-4.us-west-2.compute.internal: @@ -61,12 +61,19 @@ During this time, the retail url will stay available showimg how resilient EKS i :::note To verify nodes and rebalance pods, you can run: -```bash timeout=300 wait=30 +```bash timeout=900 $ EXPECTED_NODES=6 && while true; do ready_nodes=$(kubectl get nodes --no-headers | grep " Ready" | wc -l); if [ "$ready_nodes" -eq "$EXPECTED_NODES" ]; then echo "All $EXPECTED_NODES expected nodes are ready."; echo "Listing the ready nodes:"; kubectl get nodes | grep " Ready"; break; else echo "Waiting for all $EXPECTED_NODES nodes to be ready... 
(Currently $ready_nodes are ready)"; sleep 10; fi; done -$ kubectl delete deployment ui -n ui -$ kubectl apply -k /manifests/modules/observability/resiliency/high-availability/config/ -$ sleep 30 -$ timeout 5s $SCRIPT_DIR/get-pods-by-az.sh | head -n 30 +$ kubectl delete pod --grace-period=0 -n ui -l app.kubernetes.io/component=service +$ kubectl delete pod --grace-period=0 -n orders -l app.kubernetes.io/component=service +$ kubectl delete pod --grace-period=0 -n carts -l app.kubernetes.io/component=service +$ kubectl delete pod --grace-period=0 -n checkout -l app.kubernetes.io/component=service +$ kubectl delete pod --grace-period=0 -n catalog -l app.kubernetes.io/component=service +$ kubectl rollout status -n ui deployment/ui --timeout 30s +$ kubectl rollout status -n orders deployment/orders --timeout 60s +$ kubectl rollout status -n catalog deployment/catalog --timeout 30s +$ kubectl rollout status -n checkout deployment/checkout --timeout 30s +$ kubectl rollout status -n carts deployment/carts --timeout 30s +$ timeout 10s $SCRIPT_DIR/get-pods-by-az.sh | head -n 30 ``` ::: @@ -75,7 +82,7 @@ $ timeout 5s $SCRIPT_DIR/get-pods-by-az.sh | head -n 30 After the experiment, verify that your application remains operational despite the simulated AZ failure: -```bash timeout=900 wait=30 +```bash timeout=900 $ wait-for-lb $(kubectl get ingress -n ui -o jsonpath='{.items[0].status.loadBalancer.ingress[0].hostname}') Waiting for k8s-ui-ui-5ddc3ba496-721427594.us-west-2.elb.amazonaws.com... diff --git a/website/docs/observability/high-availability/index.md b/website/docs/observability/high-availability/index.md index 9873c4990..2fe01d149 100644 --- a/website/docs/observability/high-availability/index.md +++ b/website/docs/observability/high-availability/index.md @@ -2,14 +2,16 @@ title: "Chaos Engineering with EKS" sidebar_position: 70 sidebar_custom_props: { "module": true } -weight: 10 +description: "Simulating various failure scenarios to check Amazon EKS cluster resiliency." --- +::required-time + :::tip Before you start Prepare your environment for this section: -```bash timeout=900 wait=30 -$ kubectl delete deployment ui -n ui +```bash timeout=900 +$ kubectl delete deployment ui -n ui --ignore-not-found $ prepare-environment observability/resiliency ```
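Once `prepare-environment` finishes, a quick sanity check can confirm the pieces listed above are in place. The commands below are a sketch rather than lab output: the deployment name assumes the AWS Load Balancer Controller's default Helm release name, and the IAM query simply looks for a role whose name contains `fis`.

```bash
# Illustrative post-setup checks (resource names are assumptions, not lab output)
$ kubectl get deployment -n kube-system aws-load-balancer-controller
$ kubectl get ingress -n ui
$ aws iam list-roles --query "Roles[?contains(RoleName, 'fis')].RoleName" --output text
```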