Skip to content

Commit

Permalink
Replication/failover simulation skeleton (#6627)
Browse files Browse the repository at this point in the history
  • Loading branch information
taylanisikdemir authored Jan 16, 2025
1 parent 8b5e2a5 commit 6f0a746
Show file tree
Hide file tree
Showing 8 changed files with 525 additions and 5 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ test_eventsV2.log
test_eventsV2_xdc
test_eventsV2_xdc.log
matching-simulator-output/
replication-simulator-output/

# Executables produced by cadence repo
/cadence
Expand Down
5 changes: 5 additions & 0 deletions config/dynamicconfig/replication_simulation_default.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Dynamic config overrides for the "default" replication simulation scenario,
# which is configured via host/testdata/replication_simulation_default.yaml.

# Single override entry; the empty constraints map attaches no filters to it.
history.replicatorTaskBatchSize:
  - value: 25
    constraints: {}
205 changes: 205 additions & 0 deletions docker/buildkite/docker-compose-local-replication-simulation.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,205 @@
# Compose file format version declared for this stack.
version: "3.5"

# Containers that make up the local replication simulation environment.
services:
# Cassandra node that both Cadence clusters point at via CASSANDRA_SEEDS;
# each cluster uses its own keyspaces on it.
cassandra:
  image: cassandra:4.1.1
  environment:
    # Heap sizing passed to the Cassandra image.
    - "MAX_HEAP_SIZE=256M"
    - "HEAP_NEWSIZE=128M"
  # Made available to other containers on the network, not published to the host.
  expose:
    - "9042"
  networks:
    services-network:
      aliases:
        - cassandra
  healthcheck:
    # Exec ("CMD") form hands every list element to cqlsh as exactly one argv
    # entry, so each flag and its value must be separate elements. Fusing them
    # (e.g. "-u cassandra") makes cqlsh see a value with a leading space.
    test:
      - "CMD"
      - "cqlsh"
      - "-u"
      - "cassandra"
      - "-p"
      - "cassandra"
      - "-e"
      - "describe keyspaces"
    interval: 15s
    timeout: 30s
    retries: 10

# Prometheus server. Its configuration is read from the local ./prometheus
# directory, which is mounted at /etc/prometheus inside the container.
prometheus:
  image: prom/prometheus:v3.0.1
  command:
    - "--config.file=/etc/prometheus/replication_simulation_prometheus.yml"
  volumes:
    - "./prometheus:/etc/prometheus"
  ports:
    - "9090:9090"
  networks:
    services-network:
      aliases:
        - prometheus

# Grafana UI, started after Prometheus. Its configuration comes from the local
# ./grafana directory mounted at /etc/grafana.
grafana:
  image: grafana/grafana:11.4.0
  # Run the container process as uid 1000.
  user: "1000"
  depends_on:
    - prometheus
  volumes:
    - "./grafana:/etc/grafana"
  ports:
    - "3000:3000"
  networks:
    services-network:
      aliases:
        - grafana

# First Cadence cluster (cluster0). Built from the repository Dockerfile with
# the auto-setup target; uses the *_primary keyspaces.
cadence-cluster0:
  build:
    context: ../../
    dockerfile: ./Dockerfile
    args:
      TARGET: auto-setup
  command:
    - /start.sh
  # Wait for a healthy Cassandra; Prometheus only needs to have been started.
  depends_on:
    cassandra:
      condition: service_healthy
    prometheus:
      condition: service_started
  networks:
    services-network:
      aliases:
        - cadence-cluster0
  # Host and container ports are identical for cluster0.
  ports:
    - "7933:7933"  # frontend thrift
    - "7833:7833"  # frontend grpc
    - "7934:7934"  # history thrift
    - "7834:7834"  # history grpc
    - "7935:7935"  # matching thrift
    - "7835:7835"  # matching grpc
    - "7939:7939"  # worker thrift
    - "7000:7000"  # frontend prometheus
    - "7001:7001"  # matching prometheus
    - "7002:7002"  # history prometheus
    - "7003:7003"  # worker prometheus
  environment:
    - "BIND_ON_IP=0.0.0.0"
    - "PRIMARY_FRONTEND_SERVICE=cadence-cluster0"
    - "SECONDARY_FRONTEND_SERVICE=cadence-cluster1"
    - "CASSANDRA_SEEDS=cassandra"
    # SCENARIO is substituted by Compose and selects the dynamic config file.
    - "DYNAMIC_CONFIG_FILE_PATH=config/dynamicconfig/replication_simulation_${SCENARIO}.yml"
    - "ENABLE_GLOBAL_DOMAIN=true"
    - "KEYSPACE=cadence_primary"
    - "VISIBILITY_KEYSPACE=cadence_visibility_primary"
    # Per-service metrics endpoints; these match the 7000-7003 ports above.
    - "PROMETHEUS_ENDPOINT_0=0.0.0.0:7000"  # frontend scrape endpoint
    - "PROMETHEUS_ENDPOINT_1=0.0.0.0:7001"  # matching scrape endpoint
    - "PROMETHEUS_ENDPOINT_2=0.0.0.0:7002"  # history scrape endpoint
    - "PROMETHEUS_ENDPOINT_3=0.0.0.0:7003"  # worker scrape endpoint

# Second Cadence cluster (cluster1). Same image as cluster0, but flagged with
# IS_NOT_PRIMARY and pointed at the *_secondary keyspaces.
cadence-cluster1:
  build:
    context: ../../
    dockerfile: ./Dockerfile
    args:
      TARGET: auto-setup
  command:
    - /start.sh
  # Wait for a healthy Cassandra; Prometheus only needs to have been started.
  depends_on:
    cassandra:
      condition: service_healthy
    prometheus:
      condition: service_started
  networks:
    services-network:
      aliases:
        - cadence-cluster1
  # cluster1 uses 8xxx host ports to avoid conflicts with cluster0.
  ports:
    - "8933:7933"  # frontend thrift
    - "8833:7833"  # frontend grpc
    - "8934:7934"  # history thrift
    - "8834:7834"  # history grpc
    - "8935:7935"  # matching thrift
    - "8835:7835"  # matching grpc
    - "8939:7939"  # worker thrift
    - "8000:8000"  # frontend prometheus
    - "8001:8001"  # matching prometheus
    - "8002:8002"  # history prometheus
    - "8003:8003"  # worker prometheus
  environment:
    - "BIND_ON_IP=0.0.0.0"
    - "PRIMARY_FRONTEND_SERVICE=cadence-cluster0"
    - "SECONDARY_FRONTEND_SERVICE=cadence-cluster1"
    - "CASSANDRA_SEEDS=cassandra"
    # SCENARIO is substituted by Compose and selects the dynamic config file.
    - "DYNAMIC_CONFIG_FILE_PATH=config/dynamicconfig/replication_simulation_${SCENARIO}.yml"
    - "IS_NOT_PRIMARY=true"
    - "ENABLE_GLOBAL_DOMAIN=true"
    - "KEYSPACE=cadence_secondary"
    - "VISIBILITY_KEYSPACE=cadence_visibility_secondary"
    # Per-service metrics endpoints; these match the 8000-8003 ports above.
    - "PROMETHEUS_ENDPOINT_0=0.0.0.0:8000"  # frontend scrape endpoint
    - "PROMETHEUS_ENDPOINT_1=0.0.0.0:8001"  # matching scrape endpoint
    - "PROMETHEUS_ENDPOINT_2=0.0.0.0:8002"  # history scrape endpoint
    - "PROMETHEUS_ENDPOINT_3=0.0.0.0:8003"  # worker scrape endpoint

# Cadence web UI for cluster0, connected to cadence-cluster0:7933 and
# published on host port 8088.
cadence-web-cluster0:
  image: ubercadence/web:latest
  depends_on:
    - cadence-cluster0
  environment:
    - "CADENCE_TCHANNEL_PEERS=cadence-cluster0:7933"
  ports:
    - "8088:8088"
  networks:
    services-network:
      aliases:
        - cadence-web-cluster0

# Cadence web UI for cluster1, connected to cadence-cluster1:7933. Published on
# host port 8089 because host port 8088 is taken by the cluster0 web UI.
cadence-web-cluster1:
  image: ubercadence/web:latest
  environment:
    - "CADENCE_TCHANNEL_PEERS=cadence-cluster1:7933"
  ports:
    - "8089:8088"
  depends_on:
    - cadence-cluster1
  networks:
    services-network:
      aliases:
        # Must be unique on the network: this alias previously duplicated
        # cadence-web-cluster0, so that name could resolve to either web UI.
        - cadence-web-cluster1

# Test runner: executes the TestReplicationSimulation* Go tests from the host
# package against both clusters and copies the output into test.log.
replication-simulator:
  build:
    context: ../../
    dockerfile: ./docker/buildkite/Dockerfile
  # Start only after both clusters, both web UIs and grafana have been started.
  depends_on:
    cadence-cluster0:
      condition: service_started
    cadence-cluster1:
      condition: service_started
    cadence-web-cluster0:
      condition: service_started
    cadence-web-cluster1:
      condition: service_started
    grafana:
      condition: service_started
  networks:
    services-network:
      aliases:
        - replication-simulator
  volumes:
    - "../../:/cadence"
    - "/cadence/.build/"  # ensure we don't mount the build directory
    - "/cadence/.bin/"  # ensure we don't mount the bin directory
  ports:  # expose prometheus ports so they can be scraped
    - "8306:8306"
    - "8307:8307"
    - "8308:8308"
    - "8309:8309"
  # The folded scalar below collapses into a single shell command line.
  # NOTE(review): the pipeline's exit status is that of `tee`, so a failing
  # `go test` may not fail this container - confirm callers inspect test.log.
  command:
    - /bin/sh
    - "-e"
    - "-c"
    - >
      go test -timeout 180s -run ^TestReplicationSimulation.*$
      -count 1 -v -tags replicationsim
      github.com/uber/cadence/host
      | tee test.log

# Bridge network shared by every service above; the explicit name keeps it
# from being prefixed with the Compose project name.
networks:
  services-network:
    driver: bridge
    name: services-network
23 changes: 23 additions & 0 deletions docker/buildkite/prometheus/replication_simulation_prometheus.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# Defaults applied to every scrape job in this file.
global:
  scrape_interval: 5s
  # Label attached to data leaving this Prometheus instance.
  external_labels:
    monitor: "cadence-monitor"
  # Both logs are written under /etc/prometheus.
  query_log_file: /etc/prometheus/query.log
  scrape_failure_log_file: /etc/prometheus/scrape.log
scrape_configs:
  # One job covers both Cadence clusters; the `cluster` label tells them apart.
  - job_name: "prometheus"
    static_configs:
      # addresses to scrape from cluster0
      - labels:
          cluster: "cluster0"
        targets:
          - "cadence-cluster0:7000"  # frontend
          - "cadence-cluster0:7001"  # matching
          - "cadence-cluster0:7002"  # history
          - "cadence-cluster0:7003"  # worker
      # addresses to scrape from cluster1
      - labels:
          cluster: "cluster1"
        targets:
          - "cadence-cluster1:8000"  # frontend
          - "cadence-cluster1:8001"  # matching
          - "cadence-cluster1:8002"  # history
          - "cadence-cluster1:8003"  # worker
Loading

0 comments on commit 6f0a746

Please sign in to comment.