diff --git a/.gitignore b/.gitignore index 401c21679ad..f0a633f5112 100644 --- a/.gitignore +++ b/.gitignore @@ -25,6 +25,7 @@ test_eventsV2.log test_eventsV2_xdc test_eventsV2_xdc.log matching-simulator-output/ +replication-simulator-output/ # Executables produced by cadence repo /cadence diff --git a/config/dynamicconfig/replication_simulation_default.yml b/config/dynamicconfig/replication_simulation_default.yml new file mode 100644 index 00000000000..6f22844faa1 --- /dev/null +++ b/config/dynamicconfig/replication_simulation_default.yml @@ -0,0 +1,5 @@ +# This file is used as dynamicconfig override for "default" replication simulation scenario configured via host/testdata/replication_simulation_default.yaml + +history.replicatorTaskBatchSize: +- value: 25 + constraints: {} diff --git a/docker/buildkite/docker-compose-local-replication-simulation.yml b/docker/buildkite/docker-compose-local-replication-simulation.yml new file mode 100644 index 00000000000..31b08435e57 --- /dev/null +++ b/docker/buildkite/docker-compose-local-replication-simulation.yml @@ -0,0 +1,205 @@ +version: "3.5" + +services: + cassandra: + image: cassandra:4.1.1 + environment: + - "MAX_HEAP_SIZE=256M" + - "HEAP_NEWSIZE=128M" + expose: + - "9042" + networks: + services-network: + aliases: + - cassandra + healthcheck: + test: ["CMD", "cqlsh", "-u cassandra", "-p cassandra" ,"-e describe keyspaces"] + interval: 15s + timeout: 30s + retries: 10 + + prometheus: + image: prom/prometheus:v3.0.1 + volumes: + - ./prometheus:/etc/prometheus + command: + - '--config.file=/etc/prometheus/replication_simulation_prometheus.yml' + ports: + - '9090:9090' + networks: + services-network: + aliases: + - prometheus + + grafana: + image: grafana/grafana:11.4.0 + volumes: + - ./grafana:/etc/grafana + user: "1000" + depends_on: + - prometheus + ports: + - '3000:3000' + networks: + services-network: + aliases: + - grafana + + cadence-cluster0: + build: + context: ../../ + dockerfile: ./Dockerfile + args: + TARGET: 
auto-setup + command: + - /start.sh + ports: + - "7933:7933" # frontend thrift + - "7833:7833" # frontend grpc + - "7934:7934" # history thrift + - "7834:7834" # history grpc + - "7935:7935" # matching thrift + - "7835:7835" # matching grpc + - "7939:7939" # worker thrift + - "7000:7000" # frontend prometheus + - "7001:7001" # matching prometheus + - "7002:7002" # history prometheus + - "7003:7003" # worker prometheus + environment: + - "BIND_ON_IP=0.0.0.0" + - "PRIMARY_FRONTEND_SERVICE=cadence-cluster0" + - "SECONDARY_FRONTEND_SERVICE=cadence-cluster1" + - "CASSANDRA_SEEDS=cassandra" + - "DYNAMIC_CONFIG_FILE_PATH=config/dynamicconfig/replication_simulation_${SCENARIO}.yml" + - "ENABLE_GLOBAL_DOMAIN=true" + - "KEYSPACE=cadence_primary" + - "VISIBILITY_KEYSPACE=cadence_visibility_primary" + - "PROMETHEUS_ENDPOINT_0=0.0.0.0:7000" # frontend scrape endpoint + - "PROMETHEUS_ENDPOINT_1=0.0.0.0:7001" # matching scrape endpoint + - "PROMETHEUS_ENDPOINT_2=0.0.0.0:7002" # history scrape endpoint + - "PROMETHEUS_ENDPOINT_3=0.0.0.0:7003" # worker scrape endpoint + depends_on: + cassandra: + condition: service_healthy + prometheus: + condition: service_started + networks: + services-network: + aliases: + - cadence-cluster0 + + cadence-cluster1: + build: + context: ../../ + dockerfile: ./Dockerfile + args: + TARGET: auto-setup + command: + - /start.sh + ports: # cluster1 uses 8xxx host ports to avoid conflicts with cluster0 + - "8933:7933" # frontend thrift + - "8833:7833" # frontend grpc + - "8934:7934" # history thrift + - "8834:7834" # history grpc + - "8935:7935" # matching thrift + - "8835:7835" # matching grpc + - "8939:7939" # worker thrift + - "8000:8000" # frontend prometheus + - "8001:8001" # matching prometheus + - "8002:8002" # history prometheus + - "8003:8003" # worker prometheus + environment: + - "BIND_ON_IP=0.0.0.0" + - "PRIMARY_FRONTEND_SERVICE=cadence-cluster0" + - "SECONDARY_FRONTEND_SERVICE=cadence-cluster1" + - "CASSANDRA_SEEDS=cassandra" + - 
"DYNAMIC_CONFIG_FILE_PATH=config/dynamicconfig/replication_simulation_${SCENARIO}.yml" + - "IS_NOT_PRIMARY=true" + - "ENABLE_GLOBAL_DOMAIN=true" + - "KEYSPACE=cadence_secondary" + - "VISIBILITY_KEYSPACE=cadence_visibility_secondary" + - "PROMETHEUS_ENDPOINT_0=0.0.0.0:8000" # frontend scrape endpoint + - "PROMETHEUS_ENDPOINT_1=0.0.0.0:8001" # matching scrape endpoint + - "PROMETHEUS_ENDPOINT_2=0.0.0.0:8002" # history scrape endpoint + - "PROMETHEUS_ENDPOINT_3=0.0.0.0:8003" # worker scrape endpoint + depends_on: + cassandra: + condition: service_healthy + prometheus: + condition: service_started + networks: + services-network: + aliases: + - cadence-cluster1 + + cadence-web-cluster0: + image: ubercadence/web:latest + environment: + - "CADENCE_TCHANNEL_PEERS=cadence-cluster0:7933" + ports: + - "8088:8088" + depends_on: + - cadence-cluster0 + networks: + services-network: + aliases: + - cadence-web-cluster0 + + cadence-web-cluster1: + image: ubercadence/web:latest + environment: + - "CADENCE_TCHANNEL_PEERS=cadence-cluster1:7933" + ports: + - "8089:8088" + depends_on: + - cadence-cluster1 + networks: + services-network: + aliases: + - cadence-web-cluster0 + + replication-simulator: + build: + context: ../../ + dockerfile: ./docker/buildkite/Dockerfile + command: + - /bin/sh + - -e + - -c + - > + go test -timeout 180s + -run ^TestReplicationSimulation.*$ + -count 1 + -v + -tags replicationsim + github.com/uber/cadence/host + | tee test.log + depends_on: + cadence-cluster0: + condition: service_started + cadence-cluster1: + condition: service_started + cadence-web-cluster0: + condition: service_started + cadence-web-cluster1: + condition: service_started + grafana: + condition: service_started + ports: # expose prometheus ports so they can be scraped + - '8306:8306' + - '8307:8307' + - '8308:8308' + - '8309:8309' + volumes: + - ../../:/cadence + - /cadence/.build/ # ensure we don't mount the build directory + - /cadence/.bin/ # ensure we don't mount the bin directory + 
networks: + services-network: + aliases: + - replication-simulator + +networks: + services-network: + name: services-network + driver: bridge diff --git a/docker/buildkite/prometheus/replication_simulation_prometheus.yml b/docker/buildkite/prometheus/replication_simulation_prometheus.yml new file mode 100644 index 00000000000..42f6f04456f --- /dev/null +++ b/docker/buildkite/prometheus/replication_simulation_prometheus.yml @@ -0,0 +1,23 @@ +global: + scrape_interval: 5s + external_labels: + monitor: 'cadence-monitor' + query_log_file: /etc/prometheus/query.log + scrape_failure_log_file: /etc/prometheus/scrape.log +scrape_configs: + - job_name: 'prometheus' + static_configs: + - targets: # addresses to scrape from cluster0 + - 'cadence-cluster0:7000' # frontend + - 'cadence-cluster0:7001' # matching + - 'cadence-cluster0:7002' # history + - 'cadence-cluster0:7003' # worker + labels: + cluster: 'cluster0' + - targets: # addresses to scrape from cluster1 + - 'cadence-cluster1:8000' # frontend + - 'cadence-cluster1:8001' # matching + - 'cadence-cluster1:8002' # history + - 'cadence-cluster1:8003' # worker + labels: + cluster: 'cluster1' diff --git a/host/replication_simulation_test.go b/host/replication_simulation_test.go new file mode 100644 index 00000000000..7871fb4a3ea --- /dev/null +++ b/host/replication_simulation_test.go @@ -0,0 +1,199 @@ +// Copyright (c) 2018 Uber Technologies, Inc. 
+// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +//go:build !race && replicationsim +// +build !race,replicationsim + +/* +To run locally: + +1. Pick a scenario from the existing config files host/testdata/replication_simulation_${scenario}.yaml or add a new one + +2. Run the scenario +`./scripts/run_replication_simulator.sh default` + +Full test logs can be found at test.log file. Event json logs can be found at replication-simulator-output folder. +See the run_replication_simulator.sh script for more details about how to parse events. 
+*/ +package host + +import ( + "context" + "flag" + "fmt" + "os" + "strings" + "testing" + "time" + + "github.com/stretchr/testify/require" + adminv1 "github.com/uber/cadence-idl/go/proto/admin/v1" + apiv1 "github.com/uber/cadence-idl/go/proto/api/v1" + "go.uber.org/yarpc" + "go.uber.org/yarpc/api/transport" + "go.uber.org/yarpc/transport/grpc" + "gopkg.in/yaml.v2" + + "github.com/uber/cadence/client/admin" + "github.com/uber/cadence/client/frontend" + grpcClient "github.com/uber/cadence/client/wrappers/grpc" + "github.com/uber/cadence/common/types" +) + +const ( + defaultTestCase = "testdata/replication_simulation_default.yaml" + domainName = "test-domain" +) + +type ReplicationSimulationConfig struct { + Clusters map[string]*Cluster `yaml:"clusters"` + + PrimaryCluster string `yaml:"primaryCluster"` +} + +type Cluster struct { + GRPCEndpoint string `yaml:"grpcEndpoint"` + + AdminClient admin.Client `yaml:"-"` + FrontendClient frontend.Client `yaml:"-"` +} + +func TestReplicationSimulation(t *testing.T) { + flag.Parse() + + logf(t, "Starting Replication Simulation") + + logf(t, "Sleeping for 30 seconds to allow services to start/warmup") + time.Sleep(30 * time.Second) + + // load config + simCfg := mustLoadReplSimConf(t) + + // initialize cadence clients + for _, c := range simCfg.Clusters { + mustInitClients(t, c) + } + + mustRegisterDomain(t, simCfg) + + startTime := time.Now() + // TODO: implement rest of the simulation + // - override dynamicconfigs in config/dynamicconfig/replication_simulation.yaml as needed + // - run workflows based on given config + // - do failover at specified time based on config + // - validate workflows finished successfully + + // Print the test summary. 
+ // Don't change the start/end line format as it is used by scripts to parse the summary info + executionTime := time.Since(startTime) + testSummary := []string{} + testSummary = append(testSummary, "Simulation Summary:") + testSummary = append(testSummary, fmt.Sprintf("Simulation Duration: %v", executionTime)) + testSummary = append(testSummary, "End of Simulation Summary") + fmt.Println(strings.Join(testSummary, "\n")) +} + +func mustLoadReplSimConf(t *testing.T) *ReplicationSimulationConfig { + t.Helper() + + path := os.Getenv("REPLICATION_SIMULATION_CONFIG") + if path == "" { + path = defaultTestCase + } + confContent, err := os.ReadFile(path) + require.NoError(t, err, "failed to read config file") + + var cfg ReplicationSimulationConfig + err = yaml.Unmarshal(confContent, &cfg) + require.NoError(t, err, "failed to unmarshal config") + + logf(t, "Loaded config from path: %s", path) + return &cfg +} + +func logf(t *testing.T, msg string, args ...interface{}) { + t.Helper() + msg = time.Now().Format(time.RFC3339Nano) + "\t" + msg + t.Logf(msg, args...) 
+} + +func mustInitClients(t *testing.T, c *Cluster) { + t.Helper() + outbounds := transport.Outbounds{Unary: grpc.NewTransport().NewSingleOutbound(c.GRPCEndpoint)} + dispatcher := yarpc.NewDispatcher(yarpc.Config{ + Name: "cadence-client", + Outbounds: yarpc.Outbounds{ + "cadence-frontend": outbounds, + }, + }) + + if err := dispatcher.Start(); err != nil { + dispatcher.Stop() + require.NoError(t, err, "failed to create outbound transport channel") + } + + clientConfig := dispatcher.ClientConfig("cadence-frontend") + c.FrontendClient = grpcClient.NewFrontendClient( + apiv1.NewDomainAPIYARPCClient(clientConfig), + apiv1.NewWorkflowAPIYARPCClient(clientConfig), + apiv1.NewWorkerAPIYARPCClient(clientConfig), + apiv1.NewVisibilityAPIYARPCClient(clientConfig), + ) + + c.AdminClient = grpcClient.NewAdminClient(adminv1.NewAdminAPIYARPCClient(clientConfig)) + logf(t, "Initialized clients for cluster with grpc endpoint: %s", c.GRPCEndpoint) +} + +func mustRegisterDomain(t *testing.T, simCfg *ReplicationSimulationConfig) { + logf(t, "Registering domain: %s", domainName) + var clusters []*types.ClusterReplicationConfiguration + for name := range simCfg.Clusters { + clusters = append(clusters, &types.ClusterReplicationConfiguration{ + ClusterName: name, + }) + } + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + err := simCfg.mustGetPrimaryFrontendClient(t).RegisterDomain(ctx, &types.RegisterDomainRequest{ + Name: domainName, + Clusters: clusters, + WorkflowExecutionRetentionPeriodInDays: 1, + ActiveClusterName: simCfg.PrimaryCluster, + IsGlobalDomain: true, + }) + require.NoError(t, err, "failed to register domain") + logf(t, "Registered domain: %s", domainName) +} + +func (s *ReplicationSimulationConfig) mustGetPrimaryFrontendClient(t *testing.T) frontend.Client { + t.Helper() + primaryCluster, ok := s.Clusters[s.PrimaryCluster] + require.True(t, ok, "Primary cluster not found in the config") + require.NotNil(t, 
primaryCluster.FrontendClient, "Primary cluster frontend client not initialized") + return primaryCluster.FrontendClient +} + +func (s *ReplicationSimulationConfig) mustGetPrimaryAdminClient(t *testing.T) admin.Client { + t.Helper() + primaryCluster, ok := s.Clusters[s.PrimaryCluster] + require.True(t, ok, "Primary cluster not found in the config") + require.NotNil(t, primaryCluster.AdminClient, "Primary cluster admin client not initialized") + return primaryCluster.AdminClient +} diff --git a/host/testdata/replication_simulation_default.yaml b/host/testdata/replication_simulation_default.yaml new file mode 100644 index 00000000000..a00bc16a4ef --- /dev/null +++ b/host/testdata/replication_simulation_default.yaml @@ -0,0 +1,10 @@ +# This file is a replication simulation scenario spec. +# It is parsed into ReplicationSimulationConfig struct. +# Replication simulations can be run via scripts/run_replication_simulator.sh +clusters: + cluster0: + grpcEndpoint: "cadence-cluster0:7833" + cluster1: + grpcEndpoint: "cadence-cluster1:7833" + +primaryCluster: "cluster0" diff --git a/scripts/run_matching_simulator.sh b/scripts/run_matching_simulator.sh index fb8fab006c2..100d256ca72 100755 --- a/scripts/run_matching_simulator.sh +++ b/scripts/run_matching_simulator.sh @@ -19,6 +19,19 @@ echo "Building test image" docker-compose -f docker/buildkite/docker-compose-local-matching-simulation.yml \ build matching-simulator +function check_test_failure() +{ + faillog="$(cat test.log | grep 'FAIL: TestMatchingSimulationSuite' -B 10)" + if [[ -n $faillog ]]; then + echo "Test failed!!!" + echo "$faillog" + echo "Check test.log file for more details" + exit 1 + fi +} + +trap check_test_failure EXIT + echo "Running the test $testCase" docker-compose \ -f docker/buildkite/docker-compose-local-matching-simulation.yml \ @@ -28,11 +41,6 @@ docker-compose \ | sed "s/Matching New Event: //" \ | jq . 
> "$eventLogsFile" -if cat test.log | grep -a "FAIL: TestMatchingSimulationSuite"; then - echo "Test failed" - exit 1 -fi - echo "---- Simulation Summary ----" cat test.log \ | sed -n '/Simulation Summary/,/End of Simulation Summary/p' \ diff --git a/scripts/run_replication_simulator.sh b/scripts/run_replication_simulator.sh new file mode 100755 index 00000000000..84bcec3addb --- /dev/null +++ b/scripts/run_replication_simulator.sh @@ -0,0 +1,69 @@ +#!/bin/bash + +# This script can be used to run replication simulator and check the critical flow via logs +# Scenario specs are located at host/testdata/replication_simulation_${scenario}.yaml +# Dynamic configs are located at config/dynamicconfig/replication_simulation_${scenario}.yml +# The output of the simulation is saved in replication-simulator-output/ folder + +# Usage: ./scripts/run_replication_simulator.sh + +set -eo pipefail + +testCase="${1:-default}" +testCfg="testdata/replication_simulation_$testCase.yaml" +now="$(date '+%Y-%m-%d-%H-%M-%S')" +timestamp="${2:-$now}" +testName="test-$testCase-$timestamp" +resultFolder="replication-simulator-output" +mkdir -p "$resultFolder" +eventLogsFile="$resultFolder/$testName-events.json" +testSummaryFile="$resultFolder/$testName-summary.txt" + +echo "Removing some of the previous containers (if exists) to start fresh" +docker-compose -f docker/buildkite/docker-compose-local-replication-simulation.yml \ + down cassandra cadence-cluster0 cadence-cluster1 replication-simulator + +echo "Each simulation run creates multiple new giant container images. 
Running docker system prune to avoid disk space issues"
+docker system prune -f
+
+echo "Building test images"
+docker-compose -f docker/buildkite/docker-compose-local-replication-simulation.yml \
+  build cadence-cluster0 cadence-cluster1 replication-simulator
+
+# Runs on EXIT (via trap below) and fails the script if the Go test reported a failure.
+function check_test_failure {
+  echo "Checking test failure"
+  # Grep for the actual test name; "TestReplicationSimulation111" would never match,
+  # so failures were silently ignored. Mirrors scripts/run_matching_simulator.sh.
+  faillog="$(grep 'FAIL: TestReplicationSimulation' -B 10 test.log)";
+  if [ -n "$faillog" ]; then
+    echo 'Test Failed!!!';
+    echo "$faillog";
+    echo "Check test.log file for more details";
+    exit 1;
+  fi
+}
+
+trap check_test_failure EXIT
+
+echo "Running the scenario $testCase"
+docker-compose \
+  -f docker/buildkite/docker-compose-local-replication-simulation.yml \
+  run \
+  -e REPLICATION_SIMULATION_CONFIG="$testCfg" \
+  -e SCENARIO="$testCase" \
+  --rm --remove-orphans --service-ports --use-aliases \
+  replication-simulator \
+  | grep -a --line-buffered "Replication New Event" \
+  | sed "s/Replication New Event: //" \
+  | jq . > "$eventLogsFile"
+
+
+echo "---- Simulation Summary ----"
+sed -n '/Simulation Summary/,/End of Simulation Summary/p' test.log \
+  | grep -v "Simulation Summary" \
+  | tee -a "$testSummaryFile"
+
+echo "End of summary" | tee -a "$testSummaryFile"
+
+printf "\nResults are saved in %s\n" "$testSummaryFile"
+printf "For further ad-hoc analysis, please check %s via jq queries\n" "$eventLogsFile"
+printf "Visit http://localhost:3000/ to view Cadence replication grafana dashboard\n"