diff --git a/.github/workflows/test-quickstart.yml b/.github/workflows/test-quickstart.yml index 5b7363c7c..397014260 100644 --- a/.github/workflows/test-quickstart.yml +++ b/.github/workflows/test-quickstart.yml @@ -88,6 +88,8 @@ jobs: haQuickstartTest: name: Test HA Quickstart runs-on: ubuntu-latest + env: + BUILD_DIR: /tmp/build # dir is created by build step and env var used by test script to find ziti executable steps: - name: Shallow checkout uses: actions/checkout@v4 @@ -97,12 +99,22 @@ jobs: with: go-version-file: ./go.mod + - name: Go Caching + uses: actions/cache@v4 + with: + path: | + ~/.cache/go-build + ~/go/pkg/mod + key: ${{ runner.os }}-go-${{ hashFiles('**/go.sum') }} + restore-keys: | + ${{ runner.os }}-go- + - name: Build ziti executable shell: bash run: | - mkdir -pv /tmp/build - go build -o /tmp/build ${GITHUB_WORKSPACE}/... + mkdir -pv ${BUILD_DIR} + go build -o ${BUILD_DIR} ${GITHUB_WORKSPACE}/... - name: Build and run a three quickstart in HA mode shell: bash - run: ./quickstart/test/ha-test.sh \ No newline at end of file + run: ./quickstart/test/ha-test.sh diff --git a/doc/ha/quickstart.md b/doc/ha/quickstart.md new file mode 100644 index 000000000..8eb1eb718 --- /dev/null +++ b/doc/ha/quickstart.md @@ -0,0 +1,250 @@ + +# HA Quickstart + +You can explore Ziti HA by running three local processes on unique TCP ports. This is an interactive quickstart based on [the HA test script](/quickstart/test/ha-test.sh). + +1. Create an empty working directory. All commands are run here. + + ```bash + cd $(mktemp -d) + ``` + +1. Run the first member in the background to create the cluster. + + ```bash + nohup ziti edge quickstart ha \ + --instance-id="ctrl1" \ + --ctrl-port="1281" \ + --router-port="3021" \ + --home="${PWD}" \ + --ctrl-address="127.0.0.1" \ + --router-address="127.0.0.1" \ + --trust-domain="ha-quickstart" \ + &> ctrl1.log & + ``` + + Confirm the first job is running and check the log for startup errors. 
+ + ```bash + tail ctrl1.log; echo; jobs + ``` + + Expected output: + + ```text + . + . + . + ... logs ... + . + . + . + + [1] + running nohup ziti edge quickstart ha --instance-id="ctrl1" --ctrl-port="1281" & + ``` + +1. Run the second member and join the cluster. + + ```bash + nohup ziti edge quickstart join \ + --instance-id="ctrl2" \ + --ctrl-port="1282" \ + --router-port="3022" \ + --home="${PWD}" \ + --ctrl-address="127.0.0.1" \ + --router-address="127.0.0.1" \ + --trust-domain="ha-quickstart" \ + --cluster-member="tls:127.0.0.1:1281" \ + &> ctrl2.log & + ``` + + Confirm the second job is running and check the log for startup errors. + + ```bash + tail ctrl2.log; echo; jobs + ``` + + Expected output: + + ```text + . + . + . + ... logs ... + . + . + . + + [1] - running nohup ziti edge quickstart ha --instance-id="ctrl1" --ctrl-port="1281" & + [2] + running nohup ziti edge quickstart join --instance-id="ctrl2" --ctrl-port="1282" + ``` + +1. Run the third member and join the cluster. + + ```bash + nohup ziti edge quickstart join \ + --instance-id="ctrl3" \ + --ctrl-port="1283" \ + --router-port="3023" \ + --home="${PWD}" \ + --ctrl-address="127.0.0.1" \ + --router-address="127.0.0.1" \ + --trust-domain="ha-quickstart" \ + --cluster-member="tls:127.0.0.1:1281" \ + &> ctrl3.log & + ``` + + Confirm the third job is running and check the log for startup errors. + + ```bash + tail ctrl3.log; echo; jobs + ``` + + Expected output: + + ```text + . + . + . + ... logs ... + . + . + . + + [1] running nohup ziti edge quickstart ha --instance-id="ctrl1" --ctrl-port="1281" & + [2] - running nohup ziti edge quickstart join --instance-id="ctrl2" --ctrl-port="1282" + [3] + running nohup ziti edge quickstart join --instance-id="ctrl3" --ctrl-port="1283" + ``` + +1. Optionally, follow interleaved logs in another window. + + ```bash + tail -F -n +1 *.log + ``` + +1. List agent applications. 
+ + ```bash + ziti agent list + ``` + + Expected output: + + ```text + ╭────────┬────────────┬────────┬─────────────────────────────┬────────────┬─────────────┬───────────╮ + │ PID │ EXECUTABLE │ APP ID │ UNIX SOCKET │ APP TYPE │ APP VERSION │ APP ALIAS │ + ├────────┼────────────┼────────┼─────────────────────────────┼────────────┼─────────────┼───────────┤ + │ 276912 │ ziti │ ctrl1 │ /tmp/gops-agent.276912.sock │ controller │ v0.0.0 │ │ + │ 277714 │ ziti │ ctrl2 │ /tmp/gops-agent.277714.sock │ controller │ v0.0.0 │ │ + │ 281490 │ ziti │ ctrl3 │ /tmp/gops-agent.281490.sock │ controller │ v0.0.0 │ │ + ╰────────┴────────────┴────────┴─────────────────────────────┴────────────┴─────────────┴───────────╯ + ``` + +1. Identify the cluster leader. + + ```bash + ziti agent cluster list --app-id ctrl1 + ``` + + Expected output: + + ```text + ╭───────┬────────────────────┬───────┬────────┬─────────┬───────────╮ + │ ID │ ADDRESS │ VOTER │ LEADER │ VERSION │ CONNECTED │ + ├───────┼────────────────────┼───────┼────────┼─────────┼───────────┤ + │ ctrl1 │ tls:127.0.0.1:1281 │ true │ true │ v0.0.0 │ true │ + │ ctrl2 │ tls:127.0.0.1:1282 │ true │ false │ v0.0.0 │ true │ + │ ctrl3 │ tls:127.0.0.1:1283 │ true │ false │ v0.0.0 │ true │ + ╰───────┴────────────────────┴───────┴────────┴─────────┴───────────╯ + ``` + +1. Simulate a member availability incident. + + Identify the job number for the cluster leader. It may not be `%1`. + + ```bash + jobs + ``` + + Job 1 belongs to ctrl1, the current leader. + + ```text + [1] running nohup ziti edge quickstart ha --instance-id="ctrl1" --ctrl-port="1281" + [2] - running nohup ziti edge quickstart join --instance-id="ctrl2" --ctrl-port="1282" + [3] + running nohup ziti edge quickstart join --instance-id="ctrl3" --ctrl-port="1283" + ``` + + ```bash + kill %1 + ``` + +1. Inspect the cluster via another member ID. 
+ + ```bash + ziti agent cluster list --app-id ctrl2 + ``` + + Expected output: + + ```text + ╭───────┬────────────────────┬───────┬────────┬─────────────────┬───────────╮ + │ ID │ ADDRESS │ VOTER │ LEADER │ VERSION │ CONNECTED │ + ├───────┼────────────────────┼───────┼────────┼─────────────────┼───────────┤ + │ ctrl1 │ tls:127.0.0.1:1281 │ true │ false │ │ false │ + │ ctrl2 │ tls:127.0.0.1:1282 │ true │ true │ v0.0.0 │ true │ + │ ctrl3 │ tls:127.0.0.1:1283 │ true │ false │ v0.0.0 │ true │ + ╰───────┴────────────────────┴───────┴────────┴─────────────────┴───────────╯ + ``` + +1. Restart any disconnected member. + + Any member can be restarted with this `ha` subcommand. + + ```bash + nohup ziti edge quickstart ha \ + --instance-id="ctrl1" \ + --home="${PWD}" \ + &>> ctrl1.log & + ``` + +1. Inspect the cluster. + + Once restarted, ctrl1 does not necessarily resume being the leader. + + ```bash + ziti agent cluster list --app-id ctrl2 + ``` + + Expected output: + + ```text + ╭───────┬────────────────────┬───────┬────────┬─────────────────┬───────────╮ + │ ID │ ADDRESS │ VOTER │ LEADER │ VERSION │ CONNECTED │ + ├───────┼────────────────────┼───────┼────────┼─────────────────┼───────────┤ + │ ctrl1 │ tls:127.0.0.1:1281 │ true │ false │ v0.0.0 │ true │ + │ ctrl2 │ tls:127.0.0.1:1282 │ true │ false │ v0.0.0 │ true │ + │ ctrl3 │ tls:127.0.0.1:1283 │ true │ true │ v0.0.0 │ true │ + ╰───────┴────────────────────┴───────┴────────┴─────────────────┴───────────╯ + ``` + +1. Stop all background jobs. 
+ + BASH + + ```bash + kill $(jobs -p) + ``` + + ZSH + + ```bash + kill ${${(v)jobstates##*:*:}%=*} + ``` + + Expected output: + + ```text + [1] + done nohup ziti edge quickstart ha --instance-id="ctrl1" --ctrl-port="1281" + [2] done nohup ziti edge quickstart join --instance-id="ctrl2" --ctrl-port="1282" + [3] + done nohup ziti edge quickstart join --instance-id="ctrl3" --ctrl-port="1283" + ``` diff --git a/quickstart/test/ha-test.sh b/quickstart/test/ha-test.sh index f8334608f..de725f716 100755 --- a/quickstart/test/ha-test.sh +++ b/quickstart/test/ha-test.sh @@ -1,16 +1,28 @@ -BUILD_DIR=/tmp/build - -ctrl_port=2001 -router_port=3001 -rm -rf "/tmp/quickstart-ha-test" -ziti_home="/tmp/quickstart-ha-test" +#!/usr/bin/env bash + +# raise exceptions +set -o errexit +set -o nounset +set -o pipefail + +# set defaults +: "${BUILD_DIR:=./build}" +: "${PFXLOG_NO_JSON:=true}"; export PFXLOG_NO_JSON # disable JSON log format +: "${VERBOSE:=1}" # 0: no instance logs printed, 1: print instance logs to stdout +declare -a ctrl_ports=(2001 2002 2003) +declare -a router_ports=(3001 3002 3003) +: "${ziti_home:=$(mktemp -d)}" +: "${trust_domain:="quickstart-ha-test"}" function _wait_for_controller { local advertised_host_port="127.0.0.1:${1}" local timeout=60 local elapsed=0 - while [[ "$(curl -w "%{http_code}" -m 1 -s -k -o /dev/null https://${advertised_host_port}/edge/client/v1/version)" != "200" ]]; do + while [[ + "$(curl -w "%{http_code}" -m 1 -sSf -k -o /dev/null \ + https://${advertised_host_port}/edge/client/v1/version 2>/dev/null + )" != "200" ]]; do if (( elapsed >= timeout )); then echo "Timeout waiting for https://${advertised_host_port}" >&2 exit 1 @@ -22,69 +34,104 @@ function _wait_for_controller { echo "CONTROLLER ONLINE AT: https://${advertised_host_port}" } -function _stop_instances { - echo "killing...." - kill "$@" 2>/dev/null - - for pid in "$@"; do - while kill -0 "$pid" 2>/dev/null; do - echo "Waiting for process $pid to stop..." 
- sleep 1 - done - echo "Process $pid has stopped." - done +function _term_background_pids { + echo -n "terminating background pids: " + for name in "${!PIDS[@]}"; do + echo -n "${name}=${PIDS[$name]} " + done + echo -e "\n" + kill "${PIDS[@]}" 2>/dev/null || true # a pid may already have exited; do not trip errexit during cleanup + for instance in "${!PIDS[@]}"; do + while kill -0 "${PIDS[${instance}]}" 2>/dev/null; do + echo "Waiting for ${instance} process ${PIDS[${instance}]} to stop..." + sleep 1 + done + echo "Process ${PIDS[${instance}]} has stopped." + done } -trap 'kill $inst001pid $inst002pid $inst003pid 2>/dev/null' EXIT +function _check_command() { + if ! command -v "$1" &>/dev/null; then + echo "ERROR: this script requires ${BINS[*]}, but '$1' is missing." >&2 + exit 1 # abort explicitly rather than by executing the missing command + fi +} -"${BUILD_DIR}/ziti" edge quickstart ha \ - --home "${ziti_home}" \ - --trust-domain="quickstart-ha-test" \ - --instance-id inst001 \ - --ctrl-port "${ctrl_port}" \ - --router-port "${router_port}" \ - & -inst001pid=$! +declare -a BINS=(awk grep jq "${BUILD_DIR}/ziti") +for BIN in "${BINS[@]}"; do + _check_command "$BIN" +done -_wait_for_controller "${ctrl_port}" +trap '_term_background_pids' EXIT + +# initialize an array of instance names +declare -a INSTANCE_NAMES=(inst001 inst002 inst003) +# initialize a map of name=pid +declare -A PIDS + +nohup "${BUILD_DIR}/ziti" edge quickstart ha \ + --ctrl-address="127.0.0.1" \ + --router-address="127.0.0.1" \ + --home="${ziti_home}" \ + --trust-domain="${trust_domain}" \ + --instance-id="${INSTANCE_NAMES[0]}" \ + --ctrl-port="${ctrl_ports[0]}" \ + --router-port="${router_ports[0]}" \ + &> "${ziti_home}/${INSTANCE_NAMES[0]}.log" & +PIDS["${INSTANCE_NAMES[0]}"]=$! + +_wait_for_controller "${ctrl_ports[0]}" sleep 5 echo "controller online" -"${BUILD_DIR}/ziti" edge quickstart join \ - --home "${ziti_home}" \ - --trust-domain="quickstart-ha-test" \ - --ctrl-port 2002 \ - --router-port 3002 \ - --instance-id "inst002" \ - --member-pid "${inst001pid}" & -inst002pid=$! 
- -"${BUILD_DIR}/ziti" edge quickstart join \ - --home "${ziti_home}" \ - --trust-domain="quickstart-ha-test" \ - --ctrl-port 2003 \ - --router-port 3003 \ - --instance-id "inst003" \ - --member-pid "${inst001pid}" & -inst003pid=$! +nohup "${BUILD_DIR}/ziti" edge quickstart join \ + --ctrl-address="127.0.0.1" \ + --router-address="127.0.0.1" \ + --home="${ziti_home}" \ + --trust-domain="${trust_domain}" \ + --ctrl-port="${ctrl_ports[1]}" \ + --router-port="${router_ports[1]}" \ + --instance-id="${INSTANCE_NAMES[1]}" \ + --cluster-member="tls:127.0.0.1:${ctrl_ports[0]}" \ + &> "${ziti_home}/${INSTANCE_NAMES[1]}.log" & +PIDS["${INSTANCE_NAMES[1]}"]=$! + +nohup "${BUILD_DIR}/ziti" edge quickstart join \ + --ctrl-address="127.0.0.1" \ + --router-address="127.0.0.1" \ + --home="${ziti_home}" \ + --trust-domain="${trust_domain}" \ + --ctrl-port="${ctrl_ports[2]}" \ + --router-port="${router_ports[2]}" \ + --instance-id="${INSTANCE_NAMES[2]}" \ + --cluster-member="tls:127.0.0.1:${ctrl_ports[0]}" \ + &> "${ziti_home}/${INSTANCE_NAMES[2]}.log" & +PIDS["${INSTANCE_NAMES[2]}"]=$! + +if (( VERBOSE )); then + # print from the top and follow instance logs with filename separators + sleep 1; tail -F -n +1 "${ziti_home}/"*.log & + # add the tail PID to background pids to clean up + PIDS["logtail"]=$! +fi count=0 -timeout=60 # Timeout in seconds +: "${timeout:=60}" # Timeout in seconds elapsed=0 -while [[ $count -lt 3 ]]; do +while [[ ${count} -lt 3 ]]; do results=$("${BUILD_DIR}/ziti" fabric list links -j | jq -r '.data[].state') - connected_count=$(echo "$results" | grep -c "Connected") + connected_count=$(echo "${results}" | grep -c "Connected" || true) - if [[ $connected_count -eq 3 ]]; then + if [[ ${connected_count} -eq 3 ]]; then echo "All three are connected." break else echo "Waiting for three router links before continuing..." 
- sleep 3 - ((elapsed+=3)) + sleep 6 + ((elapsed+=6)) - if [[ $elapsed -ge $timeout ]]; then + if [[ ${elapsed} -ge ${timeout} ]]; then echo "Timeout reached; not all connections are 'Connected'." exit 1 fi @@ -92,32 +139,32 @@ while [[ $count -lt 3 ]]; do done # three links == things are ready -- tests start below -output=$("${BUILD_DIR}/ziti" agent cluster list --pid $inst001pid) +output=$("${BUILD_DIR}/ziti" agent cluster list --pid "${PIDS["${INSTANCE_NAMES[0]}"]}") echo "" -echo "$output" +echo "${output}" echo "" # Extract the columns for LEADER and CONNECTED -leaders=$(echo "$output" | grep inst | awk -F '│' '{print $5}') -connected=$(echo "$output" | grep inst | awk -F '/│' '{print $6}') +leaders=$(echo "${output}" | grep inst | awk -F '│' '{print $5}') +connected=$(echo "${output}" | grep inst | awk -F '│' '{print $(NF-1)}') # CONNECTED is the last column; rows end with a separator so it is field NF-1 # Check there is only one leader -leader_count=$(echo "$leaders" | grep -c "true") -if [[ $leader_count -ne 1 ]]; then - echo "Test failed: Expected 1 leader, found $leader_count" - _stop_instances $inst001pid $inst002pid $inst003pid +leader_count=$(echo "${leaders}" | grep -c "true" || true) # grep -c exits 1 on zero matches; keep going so the failure message below is printed +if [[ ${leader_count} -ne 1 ]]; then + echo "Test failed: Expected 1 leader, found ${leader_count}" + _term_background_pids exit 1 fi # Check all are connected -disconnected_count=$(echo "$connected" | grep -c "false") -if [[ $disconnected_count -ne 0 ]]; then +disconnected_count=$(echo "${connected}" | grep -c "false" || true) +if [[ ${disconnected_count} -ne 0 ]]; then echo "Test failed: Some instances are not connected" - _stop_instances $inst001pid $inst002pid $inst003pid + _term_background_pids exit 1 fi echo "Test passed: One leader found and all instances are connected" -_stop_instances $inst001pid $inst002pid $inst003pid - +trap - EXIT +_term_background_pids \ No newline at end of file diff --git a/ziti/cmd/edge/quickstart.go b/ziti/cmd/edge/quickstart.go index ef611c973..35346025f 100644 --- a/ziti/cmd/edge/quickstart.go +++ 
b/ziti/cmd/edge/quickstart.go @@ -67,7 +67,7 @@ type QuickstartOpts struct { TrustDomain string isHA bool InstanceID string - MemberPID int + ClusterMember string joinCommand bool verbose bool nonVoter bool @@ -159,7 +159,7 @@ func NewQuickStartJoinClusterCmd(out io.Writer, errOut io.Writer, context contex } addCommonQuickstartFlags(cmd, options) addQuickstartHaFlags(cmd, options) - cmd.Flags().IntVarP(&options.MemberPID, "member-pid", "m", 0, "the pid of a cluster member. required") + cmd.Flags().StringVarP(&options.ClusterMember, "cluster-member", "m", "", "address of a cluster member. required. example tls:localhost:1280") cmd.Flags().BoolVar(&options.nonVoter, "non-voting", false, "used with ha mode. specifies the member is a non-voting member") cmd.Hidden = true return cmd @@ -182,9 +182,10 @@ func (o *QuickstartOpts) join(ctx context.Context) { logrus.Fatalf("the home directory must be specified when joining an existing cluster. the root-ca is used to create the server's pki") } - if o.MemberPID == 0 { - logrus.Fatalf("--member-pid is required") + if o.ClusterMember == "" { + logrus.Fatalf("--cluster-member is required") } + o.isHA = true o.joinCommand = true o.run(ctx) @@ -353,8 +354,8 @@ func (o *QuickstartOpts) run(ctx context.Context) { agentJoinCmd := agentcli.NewAgentClusterAdd(p) args := []string{ - fmt.Sprintf("tls:%s:%s", helpers.GetCtrlAdvertisedAddress(), helpers.GetCtrlAdvertisedPort()), - fmt.Sprintf("--pid=%d", o.MemberPID), + o.ClusterMember, + fmt.Sprintf("--pid=%d", os.Getpid()), fmt.Sprintf("--voter=%t", !o.nonVoter), } agentJoinCmd.SetArgs(args) @@ -419,7 +420,7 @@ func (o *QuickstartOpts) run(ctx context.Context) { fmt.Printf(" --router-port %d \\\n", o.RouterPort+1) fmt.Printf(" --home \"%s\" \\\n", o.Home) fmt.Printf(" --trust-domain=\"%s\" \\\n", o.TrustDomain) - fmt.Printf(" --member-pid %d\\ \n", os.Getpid()) + fmt.Printf(" --cluster-member tls:%s:%s \\\n", ctrlAddy, ctrlPort) fmt.Printf(" --instance-id \"%s\"\n", nextInstId) 
fmt.Println("=======================================================================================") fmt.Println() diff --git a/zititest/models/smoke/smoketest.go b/zititest/models/smoke/smoketest.go index 8a86c25f0..acb28e40f 100644 --- a/zititest/models/smoke/smoketest.go +++ b/zititest/models/smoke/smoketest.go @@ -38,7 +38,7 @@ import ( "time" ) -const ZitiEdgeTunnelVersion = "v2.0.0-alpha1" +const ZitiEdgeTunnelVersion = "v1.2.9" //go:embed configs var configResource embed.FS