diff --git a/pkg/schema/v1/pod.go b/pkg/schema/v1/pod.go
index 3c1e6ea2..fbbc4095 100644
--- a/pkg/schema/v1/pod.go
+++ b/pkg/schema/v1/pod.go
@@ -2,6 +2,7 @@ package v1
 
 import (
 	"database/sql"
+	"fmt"
 	"github.com/icinga/icinga-kubernetes/pkg/database"
 	"github.com/icinga/icinga-kubernetes/pkg/strcase"
 	"github.com/icinga/icinga-kubernetes/pkg/types"
@@ -10,8 +11,25 @@ import (
 	ktypes "k8s.io/apimachinery/pkg/types"
 	"k8s.io/client-go/kubernetes"
 	"strings"
+	"time"
 )
 
+// States that getIcingaState assigns to a pod. The values equal the members of the icinga_state enum
+// of the pod table.
+const (
+	// Ok indicates a pod that is being deleted or whose containers are running or terminated normally.
+	Ok = "ok"
+	// Warning indicates a pod with an init container running longer than prolongedInitializationThreshold.
+	Warning = "warning"
+	// Critical indicates a pod with a failed, waiting or not ready container.
+	Critical = "critical"
+	// Unknown indicates a pod for which none of the other states could be determined.
+	Unknown = "unknown"
+)
+
+// prolongedInitializationThreshold is the time an init container may run before its pod becomes Warning.
+const prolongedInitializationThreshold = 10 * time.Minute
+
 type PodFactory struct {
 	clientset *kubernetes.Clientset
 }
@@ -23,6 +41,8 @@ type Pod struct {
 	NominatedNodeName string
 	Ip                string
 	Phase             string
+	IcingaState       string
+	IcingaStateReason string
 	CpuLimits         int64
 	CpuRequests       int64
 	MemoryLimits      int64
@@ -99,6 +119,7 @@ func (p *Pod) Obtain(k8s kmetav1.Object) {
 	p.NominatedNodeName = pod.Status.NominatedNodeName
 	p.Ip = pod.Status.PodIP
 	p.Phase = strcase.Snake(string(pod.Status.Phase))
+	p.IcingaState, p.IcingaStateReason = getIcingaState(pod)
 	p.Reason = pod.Status.Reason
 	p.Message = pod.Status.Message
 	p.Qos = strcase.Snake(string(pod.Status.QOSClass))
@@ -269,6 +290,108 @@ func (p *Pod) Obtain(k8s kmetav1.Object) {
 	}
 }
 
+// getIcingaState derives the Icinga state of the given pod together with a human-readable reason.
+//
+// The rules are applied in the following order:
+//   - A pod that is being deleted is Ok.
+//   - The first init container that has not terminated with exit code 0 decides the state: Critical if it
+//     terminated or waits for a non-empty reason other than PodInitializing, Warning if it has been running
+//     for longer than prolongedInitializationThreshold and Unknown otherwise.
+//   - If as many containers are ready and running as the pod spec defines, the pod is Ok.
+//   - Otherwise the container statuses decide, where a Critical container is never masked by a healthy
+//     container that is listed after it.
+//
+// Unless a rule yields a more specific reason, the reason is the reason of the pod status or,
+// if that is empty, the pod phase.
+func getIcingaState(pod *kcorev1.Pod) (string, string) {
+	if pod.DeletionTimestamp != nil {
+		return Ok, fmt.Sprintf("Pod %s is being deleted", pod.Name)
+	}
+
+	state := Unknown
+	reason := string(pod.Status.Phase)
+	if pod.Status.Reason != "" {
+		reason = pod.Status.Reason
+	}
+
+	for i, container := range pod.Status.InitContainerStatuses {
+		terminated := container.State.Terminated
+		if terminated != nil && terminated.ExitCode == 0 {
+			// This init container completed successfully, inspect the next one.
+			continue
+		}
+
+		// Only the first init container that has not completed successfully is evaluated.
+		// NOTE(review): an init container that keeps running by design (restartable/sidecar init container)
+		// would be reported as still initializing here - confirm whether such pods have to be supported.
+		waiting, running := container.State.Waiting, container.State.Running
+		total := len(pod.Spec.InitContainers)
+
+		switch {
+		case terminated != nil:
+			return Critical, fmt.Sprintf(
+				"Init container %s terminated with non-zero exit code %d: %s",
+				container.Name, terminated.ExitCode, terminated.Reason)
+		case waiting != nil && waiting.Reason != "" && waiting.Reason != "PodInitializing":
+			return Critical, fmt.Sprintf("Init container %s is waiting: %s", container.Name, waiting.Reason)
+		case running == nil:
+			return state, reason
+		}
+
+		duration := time.Since(running.StartedAt.Time)
+		if duration > prolongedInitializationThreshold {
+			return Warning, fmt.Sprintf(
+				"Init container %s has been initializing for too long (%d/%d, %s elapsed)",
+				container.Name, i+1, total, duration)
+		}
+
+		return state, fmt.Sprintf("Init container %s is currently initializing (%d/%d)", container.Name, i+1, total)
+	}
+
+	readyContainers := 0
+	for _, container := range pod.Status.ContainerStatuses {
+		waiting, terminated := container.State.Waiting, container.State.Terminated
+
+		var containerState, containerReason string
+		switch {
+		case waiting != nil && waiting.Reason != "" && container.RestartCount >= 3:
+			containerState = Critical
+			containerReason = fmt.Sprintf(
+				"Container %s is waiting and has restarted %d times: %s",
+				container.Name, container.RestartCount, waiting.Reason)
+		case terminated != nil && terminated.Reason != "" && terminated.ExitCode == 0:
+			containerState = Ok
+			containerReason = fmt.Sprintf("Container %s terminated normally", container.Name)
+		case terminated != nil && terminated.Reason == "":
+			containerState = Critical
+			containerReason = fmt.Sprintf("Container %s terminated abnormally", container.Name)
+		case container.Ready && container.State.Running != nil:
+			readyContainers++
+			containerState = Ok
+			containerReason = fmt.Sprintf("Container %s is running", container.Name)
+		case !container.Ready:
+			containerState = Critical
+			containerReason = fmt.Sprintf("Container %s is not ready", container.Name)
+		default:
+			// Nothing conclusive can be said about this container.
+			continue
+		}
+
+		// Keep the worst state seen so far: a Critical container must not be hidden by a healthy
+		// container that follows it in the status list.
+		if containerState == Critical || state != Critical {
+			state, reason = containerState, containerReason
+		}
+	}
+
+	// The length check prevents a pod without any containers in its spec from being reported as Ok.
+	if total := len(pod.Spec.Containers); total > 0 && readyContainers == total {
+		return Ok, "All containers are ready"
+	}
+
+	return state, reason
+}
+
 func (p *Pod) Relations() []database.Relation {
 	fk := database.WithForeignKey("pod_id")
 
diff --git a/schema/mysql/schema.sql b/schema/mysql/schema.sql
index 9f8c0229..315ba524 100644
--- a/schema/mysql/schema.sql
+++ b/schema/mysql/schema.sql
@@ -72,6 +72,8 @@ CREATE TABLE pod (
   memory_limits bigint unsigned NOT NULL,
   memory_requests bigint unsigned NOT NULL,
   phase enum('pending', 'running', 'succeeded', 'failed') COLLATE utf8mb4_unicode_ci NOT NULL,
+  icinga_state enum('ok', 'warning', 'critical', 'unknown') COLLATE utf8mb4_unicode_ci NOT NULL,
+  icinga_state_reason text NULL DEFAULT NULL,
   reason varchar(255) NULL DEFAULT NULL,
   message varchar(255) NULL DEFAULT NULL,
   qos enum('guaranteed', 'burstable', 'best_effort') COLLATE utf8mb4_unicode_ci NOT NULL,