diff --git a/Kubernetes/windows/debug/monitoring/MonitorWindowsNode.ps1 b/Kubernetes/windows/debug/monitoring/MonitorWindowsNode.ps1
new file mode 100644
index 00000000..a1d3f5cd
--- /dev/null
+++ b/Kubernetes/windows/debug/monitoring/MonitorWindowsNode.ps1
@@ -0,0 +1,162 @@
+[CmdletBinding()]
+param
+(
+    # Path to the module defining the strategy to use for monitoring the node
+    [string]
+    $StrategyModulePath = "C:\k\debug\StrategyModulePath.psm1"
+)
+
+<#
+.SYNOPSIS
+Runs collectlogs.ps1 and then starts an HNS trace.
+
+.DESCRIPTION
+Both scripts are invoked from the current directory; the trace is written to
+HNSTrace.etl in that same directory.
+#>
+function Start-HNSTrace
+{
+    .\collectlogs.ps1
+    Write-Host "Starting HNS tracing"
+
+    # The trace file is placed in the current directory
+    $etlPath = [io.Path]::Combine((Get-Location).Path, "HNSTrace.etl")
+    .\starthnstrace.ps1 -NoPrompt -MaxFileSize 1024 -EtlFile $etlPath
+}
+
+<#
+.SYNOPSIS
+Stops the HnsCapture trace session, collects logs and dumps the HNS process.
+#>
+function Stop-HNSTrace
+{
+    # Stop the tracing
+    $sessionName = 'HnsCapture'
+    Write-Host "Stopping $sessionName."
+    Stop-NetEventSession $sessionName
+
+    # Collect logs
+    .\collectlogs.ps1
+    .\collect-windows-logs.ps1
+
+    # Take a HNS Process dump. Without a process id (service missing or not
+    # running) there is nothing for Procdump to attach to, so skip the dump.
+    $hnsProcessId = Get-WmiObject -Class Win32_Service -Filter "Name LIKE 'Hns'" | Select-Object -ExpandProperty ProcessId
+    if ($hnsProcessId)
+    {
+        .\Procdump\Procdump.exe -ma $hnsProcessId /accepteula
+    }
+    else
+    {
+        Write-Host "HNS process id not found, skipping the process dump."
+    }
+}
+
+<#
+.SYNOPSIS
+Monitors Windows node for an error condition by polling every 15 seconds (default).
+Gathers all the necessary logs if Windows node goes into an error/faulted state.
+
+.DESCRIPTION
+Creates a randomly named directory under the current location, downloads the
+log collection scripts and the strategy module into it, starts an HNS trace and
+polls IsNodeFaulted from the strategy module. Once FailureThreshold consecutive
+polls report a fault, the trace is stopped, logs are collected and
+TerminateHandler is called with the directory holding the logs.
+
+.PARAMETER StrategyModulePath
+URL or local path (with filename) of the strategy module.
+
+.PARAMETER PollingInterval
+Interval to poll for failure in seconds.
+
+.PARAMETER FailureThreshold
+Number of consecutive failures to declare the node is faulty.
+#>
+function Start-Monitoring
+{
+    param
+    (
+        [string]
+        $StrategyModulePath = "C:\k\debug\StrategyModule.psm1",
+
+        [int]
+        $PollingInterval = 15,
+
+        [int]
+        $FailureThreshold = 3
+    )
+
+    # Generate a random directory to capture all the logs
+    $outDir = [io.Path]::Combine((Get-Location).Path, [io.Path]::GetRandomFileName())
+    New-Item -ItemType Directory -Path $outDir | Out-Null
+    Push-Location $outDir
+
+    # Download necessary files (target file name -> source URL)
+    $downloads = [ordered]@{
+        'collectlogs.ps1'          = 'https://raw.githubusercontent.com/microsoft/SDN/master/Kubernetes/windows/debug/collectlogs.ps1'
+        'VFP.psm1'                 = 'https://raw.githubusercontent.com/microsoft/SDN/master/Kubernetes/windows/debug/VFP.psm1'
+        'HNS.psm1'                 = 'https://raw.githubusercontent.com/microsoft/SDN/master/Kubernetes/windows/hns.psm1'
+        'collect-windows-logs.ps1' = 'https://raw.githubusercontent.com/Azure/aks-engine/master/scripts/collect-windows-logs.ps1'
+        'starthnstrace.ps1'        = 'https://raw.githubusercontent.com/microsoft/SDN/master/Kubernetes/windows/debug/starthnstrace.ps1'
+        'Procdump.zip'             = 'https://download.sysinternals.com/files/Procdump.zip'
+    }
+    foreach ($fileName in $downloads.Keys)
+    {
+        Invoke-WebRequest -Uri $downloads[$fileName] -OutFile $fileName -UseBasicParsing
+    }
+    Expand-Archive .\Procdump.zip
+
+    # A local path is copied; anything with a URI scheme is downloaded
+    if ($StrategyModulePath -match '^[a-z][a-z0-9+.-]*://')
+    {
+        Invoke-WebRequest -Uri $StrategyModulePath -OutFile StrategyModule.psm1 -UseBasicParsing
+    }
+    else
+    {
+        Copy-Item -LiteralPath $StrategyModulePath -Destination .\StrategyModule.psm1
+    }
+
+    Import-Module .\VFP.psm1
+    Import-Module .\HNS.psm1
+    Import-Module .\StrategyModule.psm1
+
+    Start-HNSTrace
+    $consecutiveFailures = 0
+
+    StartHandler
+
+    LogMessage "Started Monitoring"
+
+    while ($true)
+    {
+        if (IsNodeFaulted)
+        {
+            $consecutiveFailures++
+            # Number of consecutive failures to confirm that the Windows node is faulted for real
+            # and this is not an intermittent failure
+            if ($consecutiveFailures -ge $FailureThreshold)
+            {
+                Stop-HNSTrace
+
+                Pop-Location
+
+                TerminateHandler $outDir
+
+                LogMessage "Diagnostic logs are available at $outDir"
+                return
+            }
+        }
+        else
+        {
+            $consecutiveFailures = 0
+        }
+
+        # Adjust the sleep time to lower the polling frequency
+        Start-Sleep -Seconds $PollingInterval
+    }
+}
+
+##### Start execution #########
+
+Start-Monitoring -StrategyModulePath $StrategyModulePath
\ No newline at end of file
diff --git a/Kubernetes/windows/debug/monitoring/strategies/CompareHnsAndVfpEndpoints.psm1 b/Kubernetes/windows/debug/monitoring/strategies/CompareHnsAndVfpEndpoints.psm1
new file mode 100644
index 00000000..d8a962af
--- /dev/null
+++ b/Kubernetes/windows/debug/monitoring/strategies/CompareHnsAndVfpEndpoints.psm1
@@ -0,0 +1,74 @@
+#Implement these 4 methods:
+# 1. LogMessage - Implements logic to log messages. Defaults to logging to a file.
+# 2. StartHandler - Handler invoked after the monitoring starts (before the node is in repro state)
+# 3. TerminateHandler - Handler invoked before the monitoring stops (after the node is in repro state)
+# 4. IsNodeFaulted - Returns a $true when the node is in repro state, $false otherwise
+
+# Appends the current date followed by the message to the monitoring log file.
+function LogMessage
+{
+    param
+    (
+        [string] $Message = ""
+    )
+
+    #re-implement if needed
+    $FilePath = "C:\k\debug\MonitorWindowsNode.txt"
+    Get-Date | Out-File -FilePath $FilePath -Append
+    $Message | Out-File -FilePath $FilePath -Append
+}
+
+# Downloads the HNS v2 helper module into the current directory and imports it.
+function StartHandler
+{
+    #download file
+    Invoke-WebRequest -Uri https://raw.githubusercontent.com/microsoft/SDN/master/Kubernetes/windows/hns.v2.psm1 -OutFile HNS.V2.psm1 -UseBasicParsing
+
+    Import-Module .\HNS.V2.psm1
+}
+
+# Logs where the diagnostics were captured once the node is declared faulted.
+function TerminateHandler
+{
+    param
+    (
+        [string] $LogPath = ""
+    )
+    LogMessage "Capturing information after node failure"
+    LogMessage "Information has been logged: $LogPath"
+
+    #TODO: add azure blob
+    #TODO: add way to notify user of issue
+}
+
+# Returns $true when some endpoint port has at least 10% fewer VFP entries
+# (lines matching "Friendly Name" in the "vfpctrl /list-tag" output) than there
+# are IPSET policies on the HNS network; $false otherwise.
+function IsNodeFaulted
+{
+    #More specific lookup by azure name. Needs more testing before is used.
+    #((get-hnsnetwork | ? name -like azure)[0].Policies | Where-Object PolicyType -eq IPSET).count
+    # NOTE(review): index 1 assumes the network of interest is the second one returned - confirm
+    $expectedNumPolicies = (((Get-HnsNetwork | Select-Object Policies)[1].Policies) | Where-Object PolicyType -eq IPSET).Count
+    if ($expectedNumPolicies -eq 0)
+    {
+        return $false
+    }
+
+    $EndpointPorts = Get-HnsEndpoint | ForEach-Object { $_.Resources.Allocators } | Where-Object Tag -eq "Endpoint Port" | Select-Object -ExpandProperty EndpointPortGuid
+    foreach ($endPort in $EndpointPorts)
+    {
+        $currNumPolicies = (vfpctrl /port $endPort /list-tag | Select-String "Friendly Name").Count
+        #if difference is greater than or equal to 10%
+        if ($currNumPolicies -le ($expectedNumPolicies - $expectedNumPolicies * .1))
+        {
+            #get the virtualNetwork
+            $netId = Get-HnsEndpoint | Where-Object { $_.Resources.Allocators.EndPointPortGuid -eq $endPort } | Select-Object -ExpandProperty VirtualNetwork
+            #send test policy to simplify log lookup
+            New-HNSSetPolicy -NetworkId $netId -setType 0 -setValues "10.22.0.44" -setName "spTestName" -setId "spTestId"
+
+            return $true
+        }
+    }
+    return $false
+}
\ No newline at end of file
diff --git a/Kubernetes/windows/debug/monitoring/strategies/CopyLogsToBlobStorage.psm1 b/Kubernetes/windows/debug/monitoring/strategies/CopyLogsToBlobStorage.psm1
new file mode 100644
index 00000000..b08a87e4
--- /dev/null
+++ b/Kubernetes/windows/debug/monitoring/strategies/CopyLogsToBlobStorage.psm1
@@ -0,0 +1,59 @@
+#Implement these 4 methods:
+# 1. LogMessage - Implements logic to log messages. Defaults to logging to a file.
+# 2. StartHandler - Handler invoked after the monitoring starts (before the node is in repro state)
+# 3. TerminateHandler - Handler invoked before the monitoring stops (after the node is in repro state)
+# 4. IsNodeFaulted - Returns a $true when the node is in repro state, $false otherwise
+
+# Destination container URL, including a SAS token that allows writing to it.
+# Read from the environment so that no credential is stored in this module.
+$BlobContainerSasUrl = $env:MONITOR_LOGS_BLOB_SAS_URL
+
+# Appends the current date followed by the message to the monitoring log file.
+function LogMessage
+{
+    param
+    (
+        [string] $Message = ""
+    )
+
+    #re-implement if needed
+    $FilePath = "C:\k\debug\MonitorWindowsNode.txt"
+    Get-Date | Out-File -FilePath $FilePath -Append
+    $Message | Out-File -FilePath $FilePath -Append
+}
+
+function StartHandler
+{
+    #logic here
+}
+
+# Zips the collected logs and uploads the archive to Azure blob storage with azcopy.
+function TerminateHandler
+{
+    param
+    (
+        [string] $LogPath = ""
+    )
+
+    if ([string]::IsNullOrEmpty($BlobContainerSasUrl))
+    {
+        LogMessage "MONITOR_LOGS_BLOB_SAS_URL is not set; logs were left at $LogPath"
+        return
+    }
+
+    # copy the logs to Azure blob
+    Invoke-WebRequest -Uri https://azcopyvnext.azureedge.net/release20211027/azcopy_windows_amd64_10.13.0.zip -OutFile azcopyv10.zip -UseBasicParsing
+    Expand-Archive .\azcopyv10.zip -Force
+
+    # HH is the 24-hour clock; hh would give the same name at 3 AM and 3 PM
+    $timeStamp = Get-Date -Format 'yyyyMMdd-HHmmss'
+    $zipFileName = "$env:computername-$($timeStamp)_logs.zip"
+    Compress-Archive -LiteralPath $LogPath -DestinationPath $zipFileName
+    .\azcopyv10\azcopy_windows_amd64_10.13.0\azcopy.exe copy $zipFileName $BlobContainerSasUrl
+}
+
+function IsNodeFaulted
+{
+    #logic here
+    return $true
+}
\ No newline at end of file
diff --git a/Kubernetes/windows/debug/monitoring/strategies/LoadBalancerPolicyStrategy.psm1 b/Kubernetes/windows/debug/monitoring/strategies/LoadBalancerPolicyStrategy.psm1
new file mode 100644
index 00000000..518198ca
--- /dev/null
+++ b/Kubernetes/windows/debug/monitoring/strategies/LoadBalancerPolicyStrategy.psm1
@@ -0,0 +1,58 @@
+$ServiceIp = "192.168.0.10"
+$ServicePort = 53
+
+#Implement these 4 methods:
+# 1. LogMessage - Implements logic to log messages. Defaults to logging to a file.
+# 2. StartHandler - Handler invoked after the monitoring starts (before the node is in repro state)
+# 3. TerminateHandler - Handler invoked before the monitoring stops (after the node is in repro state)
+# 4. IsNodeFaulted - Returns a $true when the node is in repro state, $false otherwise
+
+# Appends the current date followed by the message to the monitoring log file.
+function LogMessage
+{
+    param
+    (
+        [string] $Message = ""
+    )
+
+    #re-implement if needed
+    $FilePath = "C:\k\debug\MonitorWindowsNode.txt"
+    Get-Date | Out-File -FilePath $FilePath -Append
+    $Message | Out-File -FilePath $FilePath -Append
+}
+
+# Logs the Win32_Service entries of the hns and Kubeproxy services.
+function LogServiceInfo
+{
+    foreach ($serviceName in 'hns', 'Kubeproxy')
+    {
+        $serviceInfo = Get-WmiObject -Class Win32_Service -Filter "Name LIKE '$serviceName'"
+        # Render the object as formatted text before it is bound to the [string] parameter
+        LogMessage ($serviceInfo | Out-String)
+    }
+}
+
+function StartHandler
+{
+    LogMessage "Capturing some information before the repro."
+    LogServiceInfo
+}
+
+function TerminateHandler
+{
+    param
+    (
+        [string] $LogPath = ""
+    )
+    LogMessage "Capturing some information after the repro."
+    LogServiceInfo
+    LogMessage "HNS Policy for K8's Service with IP $ServiceIp and Port $ServicePort is missing"
+}
+
+# Returns $true when no HNS policy list has the service VIP and external port.
+function IsNodeFaulted
+{
+    $matchingPolicies = Get-HnsPolicyList | Where-Object {($_.Policies.VIPs.Count -ge 1) -and $_.Policies.VIPs.Contains($ServiceIp) -and $_.Policies.ExternalPort -eq $ServicePort}
+    # $null on the left: with a collection on the left -eq filters instead of comparing
+    return ($null -eq $matchingPolicies)
+}
diff --git a/Kubernetes/windows/debug/monitoring/strategies/StrategyModuleTemplate.psm1 b/Kubernetes/windows/debug/monitoring/strategies/StrategyModuleTemplate.psm1
new file mode 100644
index 00000000..10b967e3
--- /dev/null
+++ b/Kubernetes/windows/debug/monitoring/strategies/StrategyModuleTemplate.psm1
@@ -0,0 +1,39 @@
+#Implement these 4 methods:
+# 1. LogMessage - Implements logic to log messages. Defaults to logging to a file.
+# 2. StartHandler - Handler invoked after the monitoring starts (before the node is in repro state)
+# 3. TerminateHandler - Handler invoked before the monitoring stops (after the node is in repro state)
+# 4. IsNodeFaulted - Returns a $true when the node is in repro state, $false otherwise
+
+# Appends the current date followed by the message to the monitoring log file.
+function LogMessage
+{
+    param
+    (
+        [string] $Message = ""
+    )
+
+    #re-implement if needed
+    $FilePath = "C:\k\debug\MonitorWindowsNode.txt"
+    Get-Date | Out-File -FilePath $FilePath -Append
+    $Message | Out-File -FilePath $FilePath -Append
+}
+
+function StartHandler
+{
+    #logic here
+}
+
+function TerminateHandler
+{
+    param
+    (
+        [string] $LogPath = ""
+    )
+    #logic here
+}
+
+function IsNodeFaulted
+{
+    #logic here
+    return $true
+}
\ No newline at end of file