From 5d44e48f0c978b5d36340a0cfdb6a5eeda5db48f Mon Sep 17 00:00:00 2001 From: Debjit Mondal Date: Tue, 20 Dec 2022 02:16:11 +0530 Subject: [PATCH 1/8] Add scripts to detect HNS crashes in AKS clusters --- Kubernetes/windows/debug/detectHNSCrash.ps1 | 42 +++++++++++ Kubernetes/windows/debug/faultTolerance.ps1 | 78 +++++++++++++++++++++ 2 files changed, 120 insertions(+) create mode 100644 Kubernetes/windows/debug/detectHNSCrash.ps1 create mode 100644 Kubernetes/windows/debug/faultTolerance.ps1 diff --git a/Kubernetes/windows/debug/detectHNSCrash.ps1 b/Kubernetes/windows/debug/detectHNSCrash.ps1 new file mode 100644 index 00000000..b88dff30 --- /dev/null +++ b/Kubernetes/windows/debug/detectHNSCrash.ps1 @@ -0,0 +1,42 @@ +$crashDetected=$false +$hnsCrash=(Get-WinEvent -FilterHashtable @{logname = 'System'; ProviderName = 'Service Control Manager' } | Select-Object -Property TimeCreated, Id, LevelDisplayName, Message | Where-Object Message -like \"*The Host Network Service terminated unexpectedly*\").TimeCreated; +if($hnsCrash.Count -gt 0) { + $crashDetected=$true + # Log HNS Crashes + $errStr += "HNS crash detected at "; + foreach ($ts in $hnsCrash) { + $errStr += "( "+$ts+" ) "; + } + $errStr += "`nReason:`n"; + $fixedCrashes = @( + [pscustomobject]@{ + faultStr='*ElbDsrPolicy-Update-Failure*'; + bugId='41071049'; + }, + [pscustomobject]@{ + faultStr='*Network-Not-Found*'; + bugId='42521831'; + } + ) + $isKnownCrash=$false; + foreach($fault in $fixedCrashes.GetEnumerator()) { + $faultEvent=(Get-WinEvent -FilterHashtable @{logname = 'Microsoft-Windows-Host-Network-Service-Admin' } | Select-Object -Property TimeCreated, Id, LevelDisplayName, Message | Where-Object Message -like $fault.faultStr).TimeCreated + if ($faultEvent.Count -gt 0) { + $isKnownCrash=$true; + $errStr += "Bug #" + $fault.bugId + " hit at "; + foreach ($ts in $faultEvent) { + $errStr += "("+$ts+") "; + } + $errStr += "`n"; + } + } + # If it is not a known crash log here, also collect logs? 
+ if ($isKnownCrash -eq $false) { + $errStr += "Unknown"; + } + Write-Host $errStr; +} + +if ($crashDetected -eq $false) { + Write-Host "$(date) HNS crash not detected" +} diff --git a/Kubernetes/windows/debug/faultTolerance.ps1 b/Kubernetes/windows/debug/faultTolerance.ps1 new file mode 100644 index 00000000..d6effa4b --- /dev/null +++ b/Kubernetes/windows/debug/faultTolerance.ps1 @@ -0,0 +1,78 @@ +$faultToleranceYaml = @' +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: faulttolerance + labels: + app: faulttolerance +spec: + selector: + matchLabels: + name: faulttolerance + template: + metadata: + labels: + name: faulttolerance + spec: + securityContext: + windowsOptions: + hostProcess: true + runAsUserName: "NT AUTHORITY\\SYSTEM" + hostNetwork: true + containers: + - name: faulttolerance + image: mcr.microsoft.com/windows/servercore:1809 + args: + - powershell.exe + - -Command + - "$BaseDir = \"c:\\k\\debug\";while(1){Invoke-WebRequest -UseBasicParsing \"https://raw.githubusercontent.com/microsoft/SDN/master/Kubernetes/windows/debug/detectHNSCrash.ps1\" -OutFile $BaseDir\\detectHNSCrash.ps1;c:\\k\\debug\\detectHNSCrash.ps1; start-sleep 60;}" + imagePullPolicy: IfNotPresent + volumeMounts: + - name: kube-path + mountPath: C:\k + volumes: + - name: kube-path + hostPath: + path: C:\k + nodeSelector: + kubernetes.azure.com/os-sku: Windows2019 +'@ + +$faultToleranceYaml | kubectl delete -f - + +$faultToleranceYaml | kubectl apply -f - +Write-Output "Sleep for a minute for fault tolerance pods to be up" +Start-Sleep 60 + +[System.Collections.ArrayList] $ws2019Nodes = @() +$nodes = (kubectl get nodes -o jsonpath="{.items[*].metadata.name}").Split() +foreach ($node in $nodes) { + $nodeImage = kubectl get node $node -o jsonpath="{.status.nodeInfo.osImage}" + + if ($nodeImage.ToString().trim() -eq "Windows Server 2019 Datacenter") { + $ws2019Nodes += $node.trim(); + } +} + +$pods = (kubectl get pods -o jsonpath="{.items[*].metadata.name}").Split() +foreach ($pod 
in $pods) { + if ($pod.StartsWith('faulttolerance')) { + # if hns crashed - get the reason + $nodeName = kubectl get pod $pod -o jsonpath="{.spec.nodeName}" + $podLog = kubectl log $pod + if ($podLog.Contains("HNS crash not detected")) { + $ws2019Nodes.Remove($nodeName.ToLower()) + } else { + # Generate Crash Report + $errStr = "HNS Crash detected in "+$nodeName+", Report: `n"+$podLog+"`n" + } +} +Write-Host $errStr + +if ($ws2019Nodes.Count -eq 0) { + Write-Host "No HNS crashes detected in the cluster" +} + +# Sleep for 60 minutes, and delete the daemonset +Start-Sleep 3600 +$faultToleranceYaml | kubectl delete -f - \ No newline at end of file From 0dbf27e2dbc0e542cab69dbd82ee92a45679c79a Mon Sep 17 00:00:00 2001 From: Debjit Mondal Date: Tue, 20 Dec 2022 13:57:41 +0530 Subject: [PATCH 2/8] Minor changes --- Kubernetes/windows/debug/detectHNSCrash.ps1 | 56 +++++++++++---------- Kubernetes/windows/debug/faultTolerance.ps1 | 7 ++- 2 files changed, 33 insertions(+), 30 deletions(-) diff --git a/Kubernetes/windows/debug/detectHNSCrash.ps1 b/Kubernetes/windows/debug/detectHNSCrash.ps1 index b88dff30..4aeb935f 100644 --- a/Kubernetes/windows/debug/detectHNSCrash.ps1 +++ b/Kubernetes/windows/debug/detectHNSCrash.ps1 @@ -1,3 +1,15 @@ +# Enlist the fixed crashes to detect codepath execution +$fixedCrashes = @( + [pscustomobject]@{ + faultStr='*ElbDsrPolicy-Update-Failure*'; + bugId='41071049'; + }, + [pscustomobject]@{ + faultStr='*Network-Not-Found*'; + bugId='42521831'; + } +) + $crashDetected=$false $hnsCrash=(Get-WinEvent -FilterHashtable @{logname = 'System'; ProviderName = 'Service Control Manager' } | Select-Object -Property TimeCreated, Id, LevelDisplayName, Message | Where-Object Message -like \"*The Host Network Service terminated unexpectedly*\").TimeCreated; if($hnsCrash.Count -gt 0) { @@ -7,36 +19,28 @@ if($hnsCrash.Count -gt 0) { foreach ($ts in $hnsCrash) { $errStr += "( "+$ts+" ) "; } - $errStr += "`nReason:`n"; - $fixedCrashes = @( - [pscustomobject]@{ - 
faultStr='*ElbDsrPolicy-Update-Failure*'; - bugId='41071049'; - }, - [pscustomobject]@{ - faultStr='*Network-Not-Found*'; - bugId='42521831'; - } - ) - $isKnownCrash=$false; - foreach($fault in $fixedCrashes.GetEnumerator()) { - $faultEvent=(Get-WinEvent -FilterHashtable @{logname = 'Microsoft-Windows-Host-Network-Service-Admin' } | Select-Object -Property TimeCreated, Id, LevelDisplayName, Message | Where-Object Message -like $fault.faultStr).TimeCreated - if ($faultEvent.Count -gt 0) { - $isKnownCrash=$true; - $errStr += "Bug #" + $fault.bugId + " hit at "; - foreach ($ts in $faultEvent) { - $errStr += "("+$ts+") "; - } - $errStr += "`n"; +} + +$errStr += "`nChecking for known issues that were handled... `n"; +$isKnownCrash=$false; +foreach($fault in $fixedCrashes.GetEnumerator()) { + $faultEvent=(Get-WinEvent -FilterHashtable @{logname = 'Microsoft-Windows-Host-Network-Service-Admin' } | Select-Object -Property TimeCreated, Id, LevelDisplayName, Message | Where-Object Message -like $fault.faultStr).TimeCreated + if ($faultEvent.Count -gt 0) { + $isKnownCrash=$true; + $errStr += "Bug #" + $fault.bugId + " gracefully handled at "; + foreach ($ts in $faultEvent) { + $errStr += "("+$ts+") "; } + $errStr += "`n"; } - # If it is not a known crash log here, also collect logs? 
- if ($isKnownCrash -eq $false) { - $errStr += "Unknown"; - } - Write-Host $errStr; +} + +if($isKnownCrash -eq false) { + $errStr += "No known issues were hit`n" } if ($crashDetected -eq $false) { Write-Host "$(date) HNS crash not detected" +} else { + Write-Host $errStr; } diff --git a/Kubernetes/windows/debug/faultTolerance.ps1 b/Kubernetes/windows/debug/faultTolerance.ps1 index d6effa4b..4dcda236 100644 --- a/Kubernetes/windows/debug/faultTolerance.ps1 +++ b/Kubernetes/windows/debug/faultTolerance.ps1 @@ -25,7 +25,7 @@ spec: args: - powershell.exe - -Command - - "$BaseDir = \"c:\\k\\debug\";while(1){Invoke-WebRequest -UseBasicParsing \"https://raw.githubusercontent.com/microsoft/SDN/master/Kubernetes/windows/debug/detectHNSCrash.ps1\" -OutFile $BaseDir\\detectHNSCrash.ps1;c:\\k\\debug\\detectHNSCrash.ps1; start-sleep 60;}" + - "$BaseDir = \"c:\\k\\debug\";while(1){Invoke-WebRequest -UseBasicParsing \"https://raw.githubusercontent.com/microsoft/SDN/master/Kubernetes/windows/debug/detectHNSCrash.ps1\" -OutFile $BaseDir\\detectHNSCrash.ps1;c:\\k\\debug\\detectHNSCrash.ps1; start-sleep 3600;}" imagePullPolicy: IfNotPresent volumeMounts: - name: kube-path @@ -59,13 +59,14 @@ foreach ($pod in $pods) { if ($pod.StartsWith('faulttolerance')) { # if hns crashed - get the reason $nodeName = kubectl get pod $pod -o jsonpath="{.spec.nodeName}" - $podLog = kubectl log $pod + $podLog = kubectl logs $pod if ($podLog.Contains("HNS crash not detected")) { $ws2019Nodes.Remove($nodeName.ToLower()) } else { # Generate Crash Report $errStr = "HNS Crash detected in "+$nodeName+", Report: `n"+$podLog+"`n" } + } } Write-Host $errStr @@ -73,6 +74,4 @@ if ($ws2019Nodes.Count -eq 0) { Write-Host "No HNS crashes detected in the cluster" } -# Sleep for 60 minutes, and delete the daemonset -Start-Sleep 3600 $faultToleranceYaml | kubectl delete -f - \ No newline at end of file From 66556495d3c8da3cb5519dd503b37f6d65201a87 Mon Sep 17 00:00:00 2001 From: Debjit Mondal Date: Tue, 20 Dec 2022 
14:21:01 +0530 Subject: [PATCH 3/8] Minor changes --- .../debug/{detectHNSCrash.ps1 => faultAnalysis.ps1} | 2 +- Kubernetes/windows/debug/faultTolerance.ps1 | 9 +++++---- 2 files changed, 6 insertions(+), 5 deletions(-) rename Kubernetes/windows/debug/{detectHNSCrash.ps1 => faultAnalysis.ps1} (97%) diff --git a/Kubernetes/windows/debug/detectHNSCrash.ps1 b/Kubernetes/windows/debug/faultAnalysis.ps1 similarity index 97% rename from Kubernetes/windows/debug/detectHNSCrash.ps1 rename to Kubernetes/windows/debug/faultAnalysis.ps1 index 4aeb935f..0f513538 100644 --- a/Kubernetes/windows/debug/detectHNSCrash.ps1 +++ b/Kubernetes/windows/debug/faultAnalysis.ps1 @@ -35,7 +35,7 @@ foreach($fault in $fixedCrashes.GetEnumerator()) { } } -if($isKnownCrash -eq false) { +if ($isKnownCrash -eq $false) { $errStr += "No known issues were hit`n" } diff --git a/Kubernetes/windows/debug/faultTolerance.ps1 b/Kubernetes/windows/debug/faultTolerance.ps1 index 4dcda236..fa1e95db 100644 --- a/Kubernetes/windows/debug/faultTolerance.ps1 +++ b/Kubernetes/windows/debug/faultTolerance.ps1 @@ -25,7 +25,7 @@ spec: args: - powershell.exe - -Command - - "$BaseDir = \"c:\\k\\debug\";while(1){Invoke-WebRequest -UseBasicParsing \"https://raw.githubusercontent.com/microsoft/SDN/master/Kubernetes/windows/debug/detectHNSCrash.ps1\" -OutFile $BaseDir\\detectHNSCrash.ps1;c:\\k\\debug\\detectHNSCrash.ps1; start-sleep 3600;}" + - "$BaseDir = \"c:\\k\\debug\";while(1){Invoke-WebRequest -UseBasicParsing \"https://raw.githubusercontent.com/microsoft/SDN/master/Kubernetes/windows/debug/faultAnalysis.ps1\" -OutFile $BaseDir\\faultAnalysis.ps1;c:\\k\\debug\\faultAnalysis.ps1; start-sleep 3600;}" imagePullPolicy: IfNotPresent volumeMounts: - name: kube-path @@ -54,6 +54,7 @@ foreach ($node in $nodes) { } } +$report="" $pods = (kubectl get pods -o jsonpath="{.items[*].metadata.name}").Split() foreach ($pod in $pods) { if ($pod.StartsWith('faulttolerance')) { @@ -63,12 +64,12 @@ foreach ($pod in $pods) { if 
($podLog.Contains("HNS crash not detected")) { $ws2019Nodes.Remove($nodeName.ToLower()) } else { - # Generate Crash Report - $errStr = "HNS Crash detected in "+$nodeName+", Report: `n"+$podLog+"`n" + # Generate Analysis Report + $report = $nodeName+" - Fault Analysis Report: `n"+$podLog+"`n" } } } -Write-Host $errStr +Write-Host $report if ($ws2019Nodes.Count -eq 0) { Write-Host "No HNS crashes detected in the cluster" From 197653bd582aef59fe1d2d1875ebf4aee373f614 Mon Sep 17 00:00:00 2001 From: Debjit Mondal Date: Tue, 20 Dec 2022 16:52:19 +0530 Subject: [PATCH 4/8] Minor changes --- Kubernetes/windows/debug/faultAnalysis.ps1 | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/Kubernetes/windows/debug/faultAnalysis.ps1 b/Kubernetes/windows/debug/faultAnalysis.ps1 index 0f513538..31e02e7b 100644 --- a/Kubernetes/windows/debug/faultAnalysis.ps1 +++ b/Kubernetes/windows/debug/faultAnalysis.ps1 @@ -10,6 +10,7 @@ $fixedCrashes = @( } ) +$errStr="" $crashDetected=$false $hnsCrash=(Get-WinEvent -FilterHashtable @{logname = 'System'; ProviderName = 'Service Control Manager' } | Select-Object -Property TimeCreated, Id, LevelDisplayName, Message | Where-Object Message -like \"*The Host Network Service terminated unexpectedly*\").TimeCreated; if($hnsCrash.Count -gt 0) { @@ -17,16 +18,16 @@ if($hnsCrash.Count -gt 0) { # Log HNS Crashes $errStr += "HNS crash detected at "; foreach ($ts in $hnsCrash) { - $errStr += "( "+$ts+" ) "; + $errStr += "("+$ts+") "; } } $errStr += "`nChecking for known issues that were handled... 
`n"; -$isKnownCrash=$false; +$isHandled=$false; foreach($fault in $fixedCrashes.GetEnumerator()) { $faultEvent=(Get-WinEvent -FilterHashtable @{logname = 'Microsoft-Windows-Host-Network-Service-Admin' } | Select-Object -Property TimeCreated, Id, LevelDisplayName, Message | Where-Object Message -like $fault.faultStr).TimeCreated if ($faultEvent.Count -gt 0) { - $isKnownCrash=$true; + $isHandled=$true; $errStr += "Bug #" + $fault.bugId + " gracefully handled at "; foreach ($ts in $faultEvent) { $errStr += "("+$ts+") "; @@ -35,12 +36,12 @@ foreach($fault in $fixedCrashes.GetEnumerator()) { } } -if ($isKnownCrash -eq $false) { - $errStr += "No known issues were hit`n" +if ($isHandled -eq $false) { + $errStr += "None of the already handled issues were hit`n" } if ($crashDetected -eq $false) { Write-Host "$(date) HNS crash not detected" -} else { - Write-Host $errStr; } + +Write-Host $errStr; From 263358d98b10b97677b6031b7bbcbf59d68eef9c Mon Sep 17 00:00:00 2001 From: Debjit Mondal Date: Wed, 21 Dec 2022 16:22:34 +0530 Subject: [PATCH 5/8] Minor changes --- Kubernetes/windows/debug/faultAnalysis.ps1 | 12 ++---- Kubernetes/windows/debug/faultTolerance.ps1 | 41 +++++++++++++-------- 2 files changed, 28 insertions(+), 25 deletions(-) diff --git a/Kubernetes/windows/debug/faultAnalysis.ps1 b/Kubernetes/windows/debug/faultAnalysis.ps1 index 31e02e7b..387319d0 100644 --- a/Kubernetes/windows/debug/faultAnalysis.ps1 +++ b/Kubernetes/windows/debug/faultAnalysis.ps1 @@ -12,7 +12,7 @@ $fixedCrashes = @( $errStr="" $crashDetected=$false -$hnsCrash=(Get-WinEvent -FilterHashtable @{logname = 'System'; ProviderName = 'Service Control Manager' } | Select-Object -Property TimeCreated, Id, LevelDisplayName, Message | Where-Object Message -like \"*The Host Network Service terminated unexpectedly*\").TimeCreated; +$hnsCrash=(Get-WinEvent -FilterHashtable @{logname = 'System'; ProviderName = 'Service Control Manager' } | Select-Object -Property TimeCreated, Message | Where-Object Message 
-like "*The Host Network Service terminated unexpectedly*").TimeCreated; if($hnsCrash.Count -gt 0) { $crashDetected=$true # Log HNS Crashes @@ -20,14 +20,12 @@ if($hnsCrash.Count -gt 0) { foreach ($ts in $hnsCrash) { $errStr += "("+$ts+") "; } + $errStr += "`n"; } -$errStr += "`nChecking for known issues that were handled... `n"; -$isHandled=$false; foreach($fault in $fixedCrashes.GetEnumerator()) { - $faultEvent=(Get-WinEvent -FilterHashtable @{logname = 'Microsoft-Windows-Host-Network-Service-Admin' } | Select-Object -Property TimeCreated, Id, LevelDisplayName, Message | Where-Object Message -like $fault.faultStr).TimeCreated + $faultEvent=(Get-WinEvent -FilterHashtable @{logname = 'Microsoft-Windows-Host-Network-Service-Admin' } | Select-Object -Property TimeCreated, Message | Where-Object Message -like $fault.faultStr).TimeCreated if ($faultEvent.Count -gt 0) { - $isHandled=$true; $errStr += "Bug #" + $fault.bugId + " gracefully handled at "; foreach ($ts in $faultEvent) { $errStr += "("+$ts+") "; @@ -36,10 +34,6 @@ foreach($fault in $fixedCrashes.GetEnumerator()) { } } -if ($isHandled -eq $false) { - $errStr += "None of the already handled issues were hit`n" -} - if ($crashDetected -eq $false) { Write-Host "$(date) HNS crash not detected" } diff --git a/Kubernetes/windows/debug/faultTolerance.ps1 b/Kubernetes/windows/debug/faultTolerance.ps1 index fa1e95db..f0a43dc8 100644 --- a/Kubernetes/windows/debug/faultTolerance.ps1 +++ b/Kubernetes/windows/debug/faultTolerance.ps1 @@ -25,7 +25,7 @@ spec: args: - powershell.exe - -Command - - "$BaseDir = \"c:\\k\\debug\";while(1){Invoke-WebRequest -UseBasicParsing \"https://raw.githubusercontent.com/microsoft/SDN/master/Kubernetes/windows/debug/faultAnalysis.ps1\" -OutFile $BaseDir\\faultAnalysis.ps1;c:\\k\\debug\\faultAnalysis.ps1; start-sleep 3600;}" + - "$BaseDir = \"c:\\k\\debug\"; Invoke-WebRequest -UseBasicParsing \"https://raw.githubusercontent.com/microsoft/SDN/master/Kubernetes/windows/debug/faultAnalysis.ps1\" 
-OutFile $BaseDir\\faultAnalysis.ps1; c:\\k\\debug\\faultAnalysis.ps1; start-sleep 3600;" imagePullPolicy: IfNotPresent volumeMounts: - name: kube-path @@ -38,10 +38,10 @@ spec: kubernetes.azure.com/os-sku: Windows2019 '@ -$faultToleranceYaml | kubectl delete -f - +$faultToleranceYaml | kubectl delete --ignore-not-found=true -f - $faultToleranceYaml | kubectl apply -f - -Write-Output "Sleep for a minute for fault tolerance pods to be up" +Write-Output "Sleep for a minute for fault tolerance pods to be up..." Start-Sleep 60 [System.Collections.ArrayList] $ws2019Nodes = @() @@ -57,22 +57,31 @@ foreach ($node in $nodes) { $report="" $pods = (kubectl get pods -o jsonpath="{.items[*].metadata.name}").Split() foreach ($pod in $pods) { - if ($pod.StartsWith('faulttolerance')) { - # if hns crashed - get the reason - $nodeName = kubectl get pod $pod -o jsonpath="{.spec.nodeName}" - $podLog = kubectl logs $pod - if ($podLog.Contains("HNS crash not detected")) { - $ws2019Nodes.Remove($nodeName.ToLower()) - } else { - # Generate Analysis Report - $report = $nodeName+" - Fault Analysis Report: `n"+$podLog+"`n" - } + if ($pod.StartsWith('faulttolerance')) { + # if hns crashed - get the reason + $nodeName = kubectl get pod $pod -o jsonpath="{.spec.nodeName}" + $podLog = kubectl logs $pod + if ($podLog -like "*HNS crash not detected*") { + $ws2019Nodes.Remove($nodeName) } + if (($podLog -like "*gracefully handled*") -or ($podLog -like "*HNS crash detected*")) { + # Generate Analysis Report + $report += $nodeName+" - Fault Analysis Report: `n"+$podLog+"`n" + } + } +} + +if ($report -ne "") { + Write-Host $report -ForegroundColor black -BackgroundColor white } -Write-Host $report if ($ws2019Nodes.Count -eq 0) { - Write-Host "No HNS crashes detected in the cluster" + Write-Host "No HNS crashes detected in the cluster" -ForegroundColor darkgreen -BackgroundColor white +} else { + Write-Host "HNS crashed on nodes: $ws2019Nodes" -ForegroundColor darkred -BackgroundColor white } 
-$faultToleranceYaml | kubectl delete -f - \ No newline at end of file + +Write-Output "Sleep for an hour before deleting the fault tolerance pods automatically..." +Start-Sleep 3600 +$faultToleranceYaml | kubectl delete --ignore-not-found=true -f - \ No newline at end of file From b46dffad13ede06c7ebc1a48d53e1a518ef3d974 Mon Sep 17 00:00:00 2001 From: Debjit Mondal Date: Mon, 26 Dec 2022 17:26:37 +0530 Subject: [PATCH 6/8] Add doc and move scripts to new dir --- .../{ => faulttolerance}/faultAnalysis.ps1 | 14 +++---- .../debug/faulttolerance/faultTolerance.md | 36 ++++++++++++++++++ .../{ => faulttolerance}/faultTolerance.ps1 | 0 .../debug/faulttolerance/faulttolerance.yaml | 37 +++++++++++++++++++ 4 files changed, 80 insertions(+), 7 deletions(-) rename Kubernetes/windows/debug/{ => faulttolerance}/faultAnalysis.ps1 (60%) create mode 100644 Kubernetes/windows/debug/faulttolerance/faultTolerance.md rename Kubernetes/windows/debug/{ => faulttolerance}/faultTolerance.ps1 (100%) create mode 100644 Kubernetes/windows/debug/faulttolerance/faulttolerance.yaml diff --git a/Kubernetes/windows/debug/faultAnalysis.ps1 b/Kubernetes/windows/debug/faulttolerance/faultAnalysis.ps1 similarity index 60% rename from Kubernetes/windows/debug/faultAnalysis.ps1 rename to Kubernetes/windows/debug/faulttolerance/faultAnalysis.ps1 index 387319d0..62bf9d77 100644 --- a/Kubernetes/windows/debug/faultAnalysis.ps1 +++ b/Kubernetes/windows/debug/faulttolerance/faultAnalysis.ps1 @@ -1,5 +1,5 @@ # Enlist the fixed crashes to detect codepath execution -$fixedCrashes = @( +$fixLogs = @( [pscustomobject]@{ faultStr='*ElbDsrPolicy-Update-Failure*'; bugId='41071049'; @@ -12,21 +12,21 @@ $fixedCrashes = @( $errStr="" $crashDetected=$false -$hnsCrash=(Get-WinEvent -FilterHashtable @{logname = 'System'; ProviderName = 'Service Control Manager' } | Select-Object -Property TimeCreated, Message | Where-Object Message -like "*The Host Network Service terminated unexpectedly*").TimeCreated; 
-if($hnsCrash.Count -gt 0) { +$hnsCrashEvts=(Get-WinEvent -FilterHashtable @{logname = 'System'; ProviderName = 'Service Control Manager' } | Select-Object -Property TimeCreated, Message | Where-Object Message -like "*The Host Network Service terminated unexpectedly*").TimeCreated; +if($hnsCrashEvts.Count -gt 0) { $crashDetected=$true # Log HNS Crashes $errStr += "HNS crash detected at "; - foreach ($ts in $hnsCrash) { + foreach ($ts in $hnsCrashEvts) { $errStr += "("+$ts+") "; } $errStr += "`n"; } -foreach($fault in $fixedCrashes.GetEnumerator()) { - $faultEvent=(Get-WinEvent -FilterHashtable @{logname = 'Microsoft-Windows-Host-Network-Service-Admin' } | Select-Object -Property TimeCreated, Message | Where-Object Message -like $fault.faultStr).TimeCreated +foreach($fixLog in $fixLogs.GetEnumerator()) { + $faultEvent=(Get-WinEvent -FilterHashtable @{logname = 'Microsoft-Windows-Host-Network-Service-Admin' } | Select-Object -Property TimeCreated, Message | Where-Object Message -like $fixLog.faultStr).TimeCreated if ($faultEvent.Count -gt 0) { - $errStr += "Bug #" + $fault.bugId + " gracefully handled at "; + $errStr += "Bug #" + $fixLog.bugId + " gracefully handled at "; foreach ($ts in $faultEvent) { $errStr += "("+$ts+") "; } diff --git a/Kubernetes/windows/debug/faulttolerance/faultTolerance.md b/Kubernetes/windows/debug/faulttolerance/faultTolerance.md new file mode 100644 index 00000000..3bbe8942 --- /dev/null +++ b/Kubernetes/windows/debug/faulttolerance/faultTolerance.md @@ -0,0 +1,36 @@ +# faultTolerance.ps1 + +This script analyzes Host Network Service (HNS) faults and provides a concise summary, mitigation steps, or automatic mitigation of the issues. + +## Instructions for AKS cluster + +### With PowerShell access to the cluster (kubectl) + +1. Run the **faultTolerance.ps1** script from a PowerShell session that has access to the AKS cluster, using this command: +``` + .\faultTolerance.ps1 + daemonset.apps/faulttolerance created + Sleep for a minute for fault tolerance pods to be up... 
+ **No HNS crashes detected in the cluster** + Sleep for an hour before deleting the fault tolerance pods automatically... + daemonset.apps "faulttolerance" deleted +``` + +### Without PowerShell access to the cluster (kubectl) + +1. Apply the YAML manifest **faulttolerance.yaml** to the AKS cluster using these commands: +``` + Clean up the previous instance of the DaemonSet and re-apply it. + + kubectl delete -f faulttolerance.yaml + kubectl apply -f faulttolerance.yaml +``` + +2. Wait for 5 minutes, then redirect the output of the following command to a text file and provide that file to the support engineer. +``` + kubectl logs -l name=faulttolerance --all-containers=true + + Example: + kubectl logs -l name=faulttolerance --all-containers=true >> faulttolerance.txt + Provide the generated faulttolerance.txt +``` \ No newline at end of file diff --git a/Kubernetes/windows/debug/faultTolerance.ps1 b/Kubernetes/windows/debug/faulttolerance/faultTolerance.ps1 similarity index 100% rename from Kubernetes/windows/debug/faultTolerance.ps1 rename to Kubernetes/windows/debug/faulttolerance/faultTolerance.ps1 diff --git a/Kubernetes/windows/debug/faulttolerance/faulttolerance.yaml b/Kubernetes/windows/debug/faulttolerance/faulttolerance.yaml new file mode 100644 index 00000000..63566818 --- /dev/null +++ b/Kubernetes/windows/debug/faulttolerance/faulttolerance.yaml @@ -0,0 +1,37 @@ +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: faulttolerance + labels: + app: faulttolerance +spec: + selector: + matchLabels: + name: faulttolerance + template: + metadata: + labels: + name: faulttolerance + spec: + securityContext: + windowsOptions: + hostProcess: true + runAsUserName: "NT AUTHORITY\\SYSTEM" + hostNetwork: true + containers: + - name: faulttolerance + image: mcr.microsoft.com/windows/servercore:1809 + args: + - powershell.exe + - -Command + - "$BaseDir = \"c:\\k\\debug\"; Invoke-WebRequest -UseBasicParsing 
\"https://raw.githubusercontent.com/microsoft/SDN/master/Kubernetes/windows/debug/faultAnalysis.ps1\" -OutFile $BaseDir\\faultAnalysis.ps1; c:\\k\\debug\\faultAnalysis.ps1; start-sleep 3600;" + imagePullPolicy: IfNotPresent + volumeMounts: + - name: kube-path + mountPath: C:\k + volumes: + - name: kube-path + hostPath: + path: C:\k + nodeSelector: + kubernetes.azure.com/os-sku: Windows2019 \ No newline at end of file From a717549a6c71851404107e22a4672d197e790614 Mon Sep 17 00:00:00 2001 From: Debjit Date: Thu, 22 Jun 2023 14:08:45 +0530 Subject: [PATCH 7/8] Update faulttolerance.yaml --- Kubernetes/windows/debug/faulttolerance/faulttolerance.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Kubernetes/windows/debug/faulttolerance/faulttolerance.yaml b/Kubernetes/windows/debug/faulttolerance/faulttolerance.yaml index 63566818..0b604432 100644 --- a/Kubernetes/windows/debug/faulttolerance/faulttolerance.yaml +++ b/Kubernetes/windows/debug/faulttolerance/faulttolerance.yaml @@ -24,7 +24,7 @@ spec: args: - powershell.exe - -Command - - "$BaseDir = \"c:\\k\\debug\"; Invoke-WebRequest -UseBasicParsing \"https://raw.githubusercontent.com/microsoft/SDN/master/Kubernetes/windows/debug/faultAnalysis.ps1\" -OutFile $BaseDir\\faultAnalysis.ps1; c:\\k\\debug\\faultAnalysis.ps1; start-sleep 3600;" + - "$BaseDir = \"c:\\k\\debug\"; Invoke-WebRequest -UseBasicParsing \"https://raw.githubusercontent.com/microsoft/SDN/master/Kubernetes/windows/debug/faulttolerance/faultAnalysis.ps1\" -OutFile $BaseDir\\faultAnalysis.ps1; c:\\k\\debug\\faultAnalysis.ps1; start-sleep 3600;" imagePullPolicy: IfNotPresent volumeMounts: - name: kube-path @@ -34,4 +34,4 @@ spec: hostPath: path: C:\k nodeSelector: - kubernetes.azure.com/os-sku: Windows2019 \ No newline at end of file + kubernetes.azure.com/os-sku: Windows2019 From 10113a8ddb04577805ec0e5ec521e0924d9a9f38 Mon Sep 17 00:00:00 2001 From: Debjit Date: Thu, 22 Jun 2023 14:09:06 +0530 Subject: [PATCH 8/8] Update 
faultTolerance.ps1 --- Kubernetes/windows/debug/faulttolerance/faultTolerance.ps1 | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Kubernetes/windows/debug/faulttolerance/faultTolerance.ps1 b/Kubernetes/windows/debug/faulttolerance/faultTolerance.ps1 index f0a43dc8..a0d865dc 100644 --- a/Kubernetes/windows/debug/faulttolerance/faultTolerance.ps1 +++ b/Kubernetes/windows/debug/faulttolerance/faultTolerance.ps1 @@ -25,7 +25,7 @@ spec: args: - powershell.exe - -Command - - "$BaseDir = \"c:\\k\\debug\"; Invoke-WebRequest -UseBasicParsing \"https://raw.githubusercontent.com/microsoft/SDN/master/Kubernetes/windows/debug/faultAnalysis.ps1\" -OutFile $BaseDir\\faultAnalysis.ps1; c:\\k\\debug\\faultAnalysis.ps1; start-sleep 3600;" + - "$BaseDir = \"c:\\k\\debug\"; Invoke-WebRequest -UseBasicParsing \"https://raw.githubusercontent.com/microsoft/SDN/master/Kubernetes/windows/debug/faulttolerance/faultAnalysis.ps1\" -OutFile $BaseDir\\faultAnalysis.ps1; c:\\k\\debug\\faultAnalysis.ps1; start-sleep 3600;" imagePullPolicy: IfNotPresent volumeMounts: - name: kube-path @@ -84,4 +84,4 @@ if ($ws2019Nodes.Count -eq 0) { Write-Output "Sleep for an hour before deleting the fault tolerance pods automatically..." Start-Sleep 3600 -$faultToleranceYaml | kubectl delete --ignore-not-found=true -f - \ No newline at end of file +$faultToleranceYaml | kubectl delete --ignore-not-found=true -f -