timeouts: Redesign and unify timeouts
Add common function to run a code block with a timeout.
Add several global timeouts for different tasks, which can be
configured via environment variables.
Several small fixes and refactoring.

ref: cnti-testcatalog#335
Signed-off-by: Konstantin Yarovoy <[email protected]>
kosstennbl committed May 16, 2024
1 parent e5c795c commit 75c4060
Showing 16 changed files with 148 additions and 203 deletions.
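
The core addition is a shared `repeat_with_timeout` helper. Its definition lives in one of the changed files not rendered in this excerpt, so the following is only a minimal sketch inferred from the call sites below, assuming a Bool return value, a fixed one-second retry delay, and an error log carrying `errormsg` when the deadline expires:

```crystal
require "log"

# Minimal sketch only; the actual helper added by this commit may differ.
# The block reports completion by returning true; false (or nil) means "keep polling".
def repeat_with_timeout(timeout : Int32, errormsg : String, delay : Int32 = 1) : Bool
  deadline = Time.monotonic + timeout.seconds
  loop do
    return true if yield
    if Time.monotonic > deadline
      Log.error { errormsg }   # task-specific timeout message supplied by the caller
      return false
    end
    sleep delay.seconds
  end
end
```

Call sites pass one of the new timeout constants and a block that returns true once the awaited condition holds; several of them also branch on the helper's Bool result.
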
10 changes: 10 additions & 0 deletions USAGE.md
@@ -144,6 +144,16 @@ Also setting the verbose option for many tasks will add extra output to help wit
```
./cnf-testsuite test_name verbose
```
#### Environment Variables for Timeouts

Timeouts are controlled by these environment variables; set them if the default values are not suitable:
```
CNF_TESTSUITE_GENERIC_OPERATION_TIMEOUT=60
CNF_TESTSUITE_RESOURCE_CREATION_TIMEOUT=120
CNF_TESTSUITE_NODE_READINESS_TIMEOUT=240
CNF_TESTSUITE_POD_READINESS_TIMEOUT=180
CNF_TESTSUITE_LITMUS_CHAOS_TEST_TIMEOUT=1800
```
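
How the suite maps these variables to the constants referenced in the diffs below (GENERIC_OPERATION_TIMEOUT, POD_READINESS_TIMEOUT, and so on) is not visible in this excerpt; a plausible sketch, assuming the values above are used as fallbacks:

```crystal
# Hypothetical mapping from environment variables to the timeout constants
# used throughout the codebase; the commit's actual definition may differ.
GENERIC_OPERATION_TIMEOUT = ENV.fetch("CNF_TESTSUITE_GENERIC_OPERATION_TIMEOUT", "60").to_i
RESOURCE_CREATION_TIMEOUT = ENV.fetch("CNF_TESTSUITE_RESOURCE_CREATION_TIMEOUT", "120").to_i
NODE_READINESS_TIMEOUT    = ENV.fetch("CNF_TESTSUITE_NODE_READINESS_TIMEOUT", "240").to_i
POD_READINESS_TIMEOUT     = ENV.fetch("CNF_TESTSUITE_POD_READINESS_TIMEOUT", "180").to_i
LITMUS_CHAOS_TEST_TIMEOUT = ENV.fetch("CNF_TESTSUITE_LITMUS_CHAOS_TEST_TIMEOUT", "1800").to_i
```
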

#### Running The Linter in Developer Mode

9 changes: 3 additions & 6 deletions spec/platform/observability_spec.cr
@@ -32,13 +32,10 @@ describe "Platform Observability" do
Helm.helm_repo_add("prometheus-community","https://prometheus-community.github.io/helm-charts")
result = ShellCmd.run("#{helm} install node-exporter prometheus-community/prometheus-node-exporter", force_output: true)

pod_ready = ""
pod_ready_timeout = 45
until (pod_ready == "true" || pod_ready_timeout == 0)
pod_ready = KubectlClient::Get.pod_status("node-exporter-prometheus").split(",")[2]
repeat_with_timeout(timeout: POD_READINESS_TIMEOUT, errormsg: "Pod readiness has timed-out") do
pod_ready = KubectlClient::Get.pod_status("node-exporter-prometheus").split(",")[2] == "true"
Log.info { "Pod Ready Status: #{pod_ready}" }
sleep 1
pod_ready_timeout = pod_ready_timeout - 1
pod_ready
end
result = ShellCmd.run_testsuite("platform:node_exporter poc")
if check_containerd
22 changes: 5 additions & 17 deletions spec/workload/operator_spec.cr
@@ -42,40 +42,28 @@ describe "Operator" do
KubectlClient::Get.resource_wait_for_uninstall("Pod", "#{pod_name}", 180, "operator-lifecycle-manager")
end

second_count = 0
wait_count = 20
delete=false
until delete || second_count > wait_count.to_i
repeat_with_timeout(timeout: GENERIC_OPERATION_TIMEOUT, errormsg: "Namespace uninstallation has timed-out") do
File.write("operator.json", "#{KubectlClient::Get.namespaces("operators").to_json}")
json = File.open("operator.json") do |file|
JSON.parse(file)
end
json.as_h.delete("spec")
File.write("operator.json", "#{json.to_json}")
Log.info { "Uninstall Namespace Finalizer" }
if KubectlClient::Replace.command("--raw '/api/v1/namespaces/operators/finalize' -f ./operator.json")[:status].success?
delete=true
end
sleep 3
KubectlClient::Replace.command("--raw '/api/v1/namespaces/operators/finalize' -f ./operator.json")[:status].success?
end

second_count = 0
wait_count = 20
delete=false
until delete || second_count > wait_count.to_i
repeat_with_timeout(timeout: GENERIC_OPERATION_TIMEOUT, errormsg: "Namespace uninstallation has timed-out") do
File.write("manager.json", "#{KubectlClient::Get.namespaces("operator-lifecycle-manager").to_json}")
json = File.open("manager.json") do |file|
JSON.parse(file)
end
json.as_h.delete("spec")
File.write("manager.json", "#{json.to_json}")
Log.info { "Uninstall Namespace Finalizer" }
if KubectlClient::Replace.command("--raw '/api/v1/namespaces/operator-lifecycle-manager/finalize' -f ./manager.json")[:status].success?
delete=true
end
sleep 3
KubectlClient::Replace.command("--raw '/api/v1/namespaces/operator-lifecycle-manager/finalize' -f ./manager.json")[:status].success?
end
end
end
end

it "'operator_test' operator should not be found", tags: ["operator_test"] do
28 changes: 8 additions & 20 deletions src/tasks/chaos_mesh_setup.cr
@@ -53,18 +53,13 @@ end
module ChaosMeshSetup

def self.wait_for_test(test_type, test_name)
second_count = 0
wait_count = 60
status = ""
until (status.empty? != true && status == "Finished") || second_count > wait_count.to_i
Log.debug { "second_count = #{second_count}" }
sleep 1
execution_complete = repeat_with_timeout(timeout: GENERIC_OPERATION_TIMEOUT, errormsg: "Chaos Mesh test timed-out") do
cmd = "kubectl get #{test_type} #{test_name} -o json "
Log.info { cmd }
status = Process.run(cmd,
shell: true,
output: output = IO::Memory.new,
error: stderr = IO::Memory.new)
output: output = IO::Memory.new,
error: stderr = IO::Memory.new)
Log.info { "KubectlClient.exec output: #{output.to_s}" }
Log.info { "KubectlClient.exec stderr: #{stderr.to_s}" }
get_status = output.to_s
@@ -75,23 +70,15 @@ module ChaosMeshSetup
end
Log.info { "Status: #{get_status}" }
status = status_data.dig?("status", "experiment", "phase").to_s
second_count = second_count + 1
Log.info { "#{get_status}" }
Log.info { "#{second_count}" }
!status.empty? && status == "Finished"
end
# Did chaos mesh finish the test successfully
# (status.empty? !=true && status == "Finished")
true
execution_complete
end

# TODO make generate without delete?
def self.wait_for_resource(resource_file)
second_count = 0
wait_count = 60
is_resource_created = nil
until (is_resource_created.nil? != true && is_resource_created == true) || second_count > wait_count.to_i
Log.info { "second_count = #{second_count}" }
sleep 3
execution_complete = repeat_with_timeout(timeout: RESOURCE_CREATION_TIMEOUT, errormsg: "Resource creation timed-out") do
cmd = "kubectl create -f #{resource_file} 2>&1 >/dev/null"
status = Process.run(
cmd,
@@ -103,8 +90,9 @@ module ChaosMeshSetup
Log.info { "Waiting for CRD" }
Log.info { "Status: #{is_resource_created}" }
Log.debug { "resource file: #{resource_file}" }
second_count = second_count + 1
is_resource_created == true
end
KubectlClient::Delete.file(resource_file)
execution_complete
end
end
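
Both Chaos Mesh helpers now return the Bool produced by `repeat_with_timeout` instead of counting seconds in hand-rolled loops. A hypothetical caller (the resource file, resource type, and experiment name below are placeholders):

```crystal
# Hypothetical caller; resource file, resource type, and experiment name are placeholders.
if ChaosMeshSetup.wait_for_resource("pod-failure.yml")
  finished = ChaosMeshSetup.wait_for_test("PodChaos", "pod-failure-example")
  Log.info { "Chaos Mesh experiment finished in time: #{finished}" }
else
  Log.error { "Chaos Mesh resource was not created before the timeout" }
end
```
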
36 changes: 13 additions & 23 deletions src/tasks/kind_setup.cr
@@ -127,11 +127,9 @@ class KindManager
def initialize(@name : String, @kubeconfig : String)
end

def wait_until_nodes_ready(wait_count : Int32 = 180)
def wait_until_nodes_ready
Log.info { "wait_until_nodes_ready" }
ready = false
timeout = wait_count
until (ready == true || timeout <= 0)
execution_complete = repeat_with_timeout(timeout: NODE_READINESS_TIMEOUT, errormsg: "Node readiness timed-out") do
cmd = "kubectl get nodes --kubeconfig #{kubeconfig}"
result = ShellCmd.run(cmd, "wait_until_nodes_ready:all_nodes")
all_nodes = result[:output]
@@ -143,32 +141,27 @@ class KindManager
node_count = all_nodes.size
Log.info { "node_count: #{node_count}" }

ready_count = all_nodes.reduce(0) do |acc, node|
ready_count = all_nodes.reduce(0) do |acc, node|
if /\s(Ready)/.match(node)
acc = acc + 1
else
acc
end
end

if node_count == ready_count
Log.info { "Nodes are ready for the #{name} cluster" }
ready = true
true
else
sleep 1
timeout = timeout - 1
Log.info { "Waiting for nodes on #{name} cluster to be ready: #{ready}" }
break if timeout <= 0
Log.info { "Waiting for nodes on #{name} cluster to be ready..." }
false
end
end
ready
execution_complete
end

def wait_until_pods_ready(wait_count : Int32 = 180)
def wait_until_pods_ready
Log.info { "wait_until_pods_ready" }
ready = false
timeout = wait_count
until (ready == true || timeout <= 0)
execution_complete = repeat_with_timeout(timeout: POD_READINESS_TIMEOUT, errormsg: "Pod readiness timed-out") do
all_pods_cmd = <<-STRING
kubectl get pods -A -o go-template='{{range $index, $element := .items}}{{range .status.containerStatuses}}{{$element.metadata.name}}{{"\\n"}}{{end}}{{end}}' --kubeconfig #{kubeconfig}
STRING
@@ -193,16 +186,13 @@ class KindManager

if pod_count.to_i == ready_count.to_i
Log.info { "Pods on #{name} cluster are ready" }
ready = true
true
else
sleep 1
timeout = timeout - 1
Log.info { "Waiting for pods on #{name} cluster to be ready: #{ready}" }
break if timeout <= 0
Log.info { "Waiting for pods on #{name} cluster to be ready..." }
false
end
end
ready
execution_complete
end

end
end
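
The two readiness waits now take their deadlines from NODE_READINESS_TIMEOUT and POD_READINESS_TIMEOUT instead of a caller-supplied wait_count, and both return the helper's Bool. A hypothetical check against a cluster object returned by `create_cluster`:

```crystal
# Hypothetical usage; `cluster` is the object returned by KindManager#create_cluster.
unless cluster.wait_until_nodes_ready && cluster.wait_until_pods_ready
  raise "KIND cluster did not become ready in time"
end
```
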
44 changes: 17 additions & 27 deletions src/tasks/litmus_setup.cr
@@ -86,50 +86,40 @@ module LitmusManager
end

## wait_for_test will wait for the completion of litmus test
def self.wait_for_test(test_name, chaos_experiment_name,total_chaos_duration,args, namespace : String = "default")
## Maximum wait time is TCD (total chaos duration) + 60s (additional wait time)
delay=2
timeout="#{total_chaos_duration}".to_i + 60
retry=timeout/delay
def self.wait_for_test(test_name, chaos_experiment_name, args, namespace : String = "default")
chaos_result_name = "#{test_name}-#{chaos_experiment_name}"
wait_count = 0
status_code = -1
experimentStatus = ""

experimentStatus_cmd = "kubectl get chaosengine.litmuschaos.io #{test_name} -n #{namespace} -o jsonpath='{.status.engineStatus}'"
Log.for("wait_for_test").info { "Checking experiment status #{experimentStatus_cmd}" } if check_verbose(args)

## Wait for completion of chaosengine which indicates the completion of chaos
until (status_code == 0 && experimentStatus == "Completed") || wait_count >= 1800
sleep delay
experimentStatus_cmd = "kubectl get chaosengine.litmuschaos.io #{test_name} -n #{namespace} -o jsonpath='{.status.experiments[0].status}'"
Log.for("wait_for_test").info { "Checking experiment status #{experimentStatus_cmd}" } if check_verbose(args)
status_code = Process.run("#{experimentStatus_cmd}", shell: true, output: experimentStatus_response = IO::Memory.new, error: stderr = IO::Memory.new).exit_status
Log.for("wait_for_test").info { "status_code: #{status_code}" } if check_verbose(args)
Log.for("wait_for_test").info { "Checking experiment status #{experimentStatus_cmd}" } if check_verbose(args)
repeat_with_timeout(timeout: LITMUS_CHAOS_TEST_TIMEOUT, errormsg: "Litmus test has timed-out") do
status_code = Process.run("#{experimentStatus_cmd}",
shell: true,
output: experimentStatus_response = IO::Memory.new,
error: stderr = IO::Memory.new).exit_status
Log.for("wait_for_test").info { "#{chaos_experiment_name} status_code: #{status_code}" } if check_verbose(args)
experimentStatus = experimentStatus_response.to_s
Log.info {"#{chaos_experiment_name} experiment status: "+experimentStatus}

emoji_test_failed= "🗡️💀♻️"
Log.info { "experimentStatus #{experimentStatus}"}
Log.for("wait_for_test").info {"#{chaos_experiment_name} experiment status: " + experimentStatus}
if (experimentStatus != "Waiting for Job Creation" && experimentStatus != "Running" && experimentStatus != "Completed")
Log.info {"#{test_name}: wait_for_test failed."}
true
else
status_code == 0 && experimentStatus == "Completed"
end
wait_count = wait_count + 1
end

verdict = ""
wait_count = 0
verdict_cmd = "kubectl get chaosresults.litmuschaos.io #{chaos_result_name} -n #{namespace} -o jsonpath='{.status.experimentStatus.verdict}'"
Log.for("wait_for_test").info { "Checking experiment verdict #{verdict_cmd}" } if check_verbose(args)
## Check the chaosresult verdict
until (status_code == 0 && verdict != "Awaited") || wait_count >= 30
sleep delay
status_code = Process.run("#{verdict_cmd}", shell: true, output: verdict_response = IO::Memory.new, error: stderr = IO::Memory.new).exit_status
repeat_with_timeout(timeout: GENERIC_OPERATION_TIMEOUT, errormsg: "Litmus verdict acquisition has timed-out") do
status_code = Process.run("#{verdict_cmd}",
shell: true,
output: verdict_response = IO::Memory.new,
error: stderr = IO::Memory.new).exit_status
Log.for("wait_for_test").info { "status_code: #{status_code}" } if check_verbose(args)
Log.for("wait_for_test").info { "verdict: #{verdict_response.to_s}" } if check_verbose(args)
verdict = verdict_response.to_s
wait_count = wait_count + 1
status_code == 0 && verdict != "Awaited"
end
end

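
With the deadline now coming from LITMUS_CHAOS_TEST_TIMEOUT, `wait_for_test` no longer takes a total_chaos_duration argument, so existing callers have to drop it. A hypothetical call with placeholder engine and experiment names:

```crystal
# Hypothetical call site; the engine and experiment names are placeholders,
# and `args` is the task's argument object used for verbosity checks.
LitmusManager.wait_for_test("engine-pod-delete", "pod-delete", args, namespace: "litmus")
```
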
64 changes: 28 additions & 36 deletions src/tasks/platform/resilience.cr
@@ -34,59 +34,51 @@ namespace "platform" do
KubectlClient::Apply.file("reboot_daemon_pod.yml")
KubectlClient::Get.wait_for_install("node-failure-coredns")

pod_ready = ""
pod_ready_timeout = 45
begin
until (pod_ready == "true" || pod_ready_timeout == 0)
pod_ready = KubectlClient::Get.pod_status("reboot", "--field-selector spec.nodeName=#{worker_node}").split(",")[2]
pod_ready_timeout = pod_ready_timeout - 1
if pod_ready_timeout == 0
next CNFManager::TestcaseResult.new(CNFManager::ResultStatus::Failed, "Failed to install reboot daemon")
end
sleep 1
puts "Waiting for reboot daemon to be ready"
puts "Reboot Daemon Ready Status: #{pod_ready}"

execution_complete = repeat_with_timeout(timeout: POD_READINESS_TIMEOUT, errormsg: "Pod daemon installation has timed-out") do
pod_ready = KubectlClient::Get.pod_status("reboot", "--field-selector spec.nodeName=#{worker_node}").split(",")[2] == "true"
Log.info { "Waiting for reboot daemon to be ready. Current status: #{pod_ready}" }
pod_ready
end

if !execution_complete
next CNFManager::TestcaseResult.new(CNFManager::ResultStatus::Failed, "Failed to install reboot daemon")
end

# Find Reboot Daemon name
reboot_daemon_pod = KubectlClient::Get.pod_status("reboot", "--field-selector spec.nodeName=#{worker_node}").split(",")[0]
start_reboot = KubectlClient.exec("#{reboot_daemon_pod} touch /tmp/reboot")

#Watch for Node Failure.
pod_ready = ""
node_ready = ""
node_failure_timeout = 30
until (pod_ready == "false" || node_ready == "False" || node_ready == "Unknown" || node_failure_timeout == 0)
pod_ready = KubectlClient::Get.pod_status("node-failure").split(",")[2]
node_ready = KubectlClient::Get.node_status("#{worker_node}")
Log.info { "Waiting for Node to go offline" }
execution_complete = repeat_with_timeout(timeout: GENERIC_OPERATION_TIMEOUT, errormsg: "Node shut-off has timed-out") do
pod_ready = KubectlClient::Get.pod_status("node-failure").split(",")[2] == "true"
node_ready = KubectlClient::Get.node_status("#{worker_node}") == "True"
Log.info { "Waiting for Node to go offline..." }
Log.info { "Pod Ready Status: #{pod_ready}" }
Log.info { "Node Ready Status: #{node_ready}" }
node_failure_timeout = node_failure_timeout - 1
if node_failure_timeout == 0
next CNFManager::TestcaseResult.new(CNFManager::ResultStatus::Failed, "Node failed to go offline")
end
sleep 1
!pod_ready || !node_ready
end

if !execution_complete
next CNFManager::TestcaseResult.new(CNFManager::ResultStatus::Failed, "Node failed to go offline")
end

#Watch for Node to come back online
pod_ready = ""
node_ready = ""
node_online_timeout = 300
until (pod_ready == "true" && node_ready == "True" || node_online_timeout == 0)
pod_ready = KubectlClient::Get.pod_status("node-failure", "").split(",")[2]
node_ready = KubectlClient::Get.node_status("#{worker_node}")
Log.info { "Waiting for Node to come back online" }
execution_complete = repeat_with_timeout(timeout: NODE_READINESS_TIMEOUT, errormsg: "Node startup has timed-out") do
pod_ready = KubectlClient::Get.pod_status("node-failure", "").split(",")[2] == "true"
node_ready = KubectlClient::Get.node_status("#{worker_node}") == "True"
Log.info { "Waiting for Node to come back online..." }
Log.info { "Pod Ready Status: #{pod_ready}" }
Log.info { "Node Ready Status: #{node_ready}" }
node_online_timeout = node_online_timeout - 1
if node_online_timeout == 0
next CNFManager::TestcaseResult.new(CNFManager::ResultStatus::Failed, "Node failed to come back online")
end
sleep 1
pod_ready && node_ready
end
CNFManager::TestcaseResult.new(CNFManager::ResultStatus::Passed, "Node came back online")

if !execution_complete
next CNFManager::TestcaseResult.new(CNFManager::ResultStatus::Failed, "Node failed to come back online")
end

CNFManager::TestcaseResult.new(CNFManager::ResultStatus::Passed, "Node came back online")
ensure
Log.info { "node_failure cleanup" }
delete_reboot_daemon = KubectlClient::Delete.file("reboot_daemon_pod.yml")
2 changes: 1 addition & 1 deletion src/tasks/utils/apisnoop.cr
@@ -57,7 +57,7 @@ class ApiSnoop
ShellCmd.run("pwd", "apisnoop_setup_kind_dir", true)
kind_config = "kind+apisnoop.yaml"
cluster = kind_manager.create_cluster(name, kind_config, false, k8s_version)
cluster.wait_until_nodes_ready(240)
cluster.wait_until_nodes_ready()
cluster.wait_until_pods_ready()
return cluster
end