Skip to content

Commit

Permalink
Add endpoint for machine issues.
Browse files Browse the repository at this point in the history
  • Loading branch information
Gerrit91 committed Oct 2, 2023
1 parent 6f952f6 commit 1497133
Show file tree
Hide file tree
Showing 20 changed files with 1,752 additions and 0 deletions.
122 changes: 122 additions & 0 deletions cmd/metal-api/internal/issues/asn-uniqueness.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
package issues

import (
"fmt"
"sort"
"strings"

"github.com/metal-stack/metal-api/cmd/metal-api/internal/metal"
)

// IssueTypeASNUniqueness is the type identifier of the issue reported when a
// firewall's ASN is also used by another firewall.
const IssueTypeASNUniqueness IssueType = "asn-not-unique"

// IssueASNUniqueness is the issue for ASNs that are shared between firewalls.
type IssueASNUniqueness struct {
	// details holds the text produced by the last Evaluate call that found overlaps.
	details string
}

// Spec returns the static specification of this issue: its type, severity,
// description and the URL of the troubleshooting documentation.
func (i *IssueASNUniqueness) Spec() *issueSpec {
	spec := issueSpec{
		Type:        IssueTypeASNUniqueness,
		Severity:    IssueSeverityMinor,
		Description: "The ASN is not unique (only impact on firewalls)",
		RefURL:      "https://docs.metal-stack.io/stable/installation/troubleshoot/#asn-not-unique",
	}

	return &spec
}

// Evaluate reports whether the firewall m shares one of its non-zero ASNs with
// another firewall contained in c.Machines.
//
// Machines that are not allocated as a firewall never have this issue, and
// only other machines allocated as firewalls are considered as conflict
// partners. Networks with an ASN of 0 are ignored on both sides.
//
// When at least one overlap exists, the issue details are set to one line per
// affected ASN ("- ASN (<asn>) not unique, shared with [<ids>]"), the lines
// being sorted lexicographically, and true is returned. Each conflicting
// machine is listed at most once per ASN, even if several of its networks
// carry that ASN.
//
// The ec parameter is not used by this issue.
func (i *IssueASNUniqueness) Evaluate(m metal.Machine, ec metal.ProvisioningEventContainer, c *IssueConfig) bool {
	isFirewall := func(candidate metal.Machine) bool {
		return candidate.Allocation != nil && candidate.Allocation.Role == metal.RoleFirewall
	}

	if !isFirewall(m) {
		return false
	}

	// sharedBy maps every non-zero ASN of m to the IDs of the other firewalls
	// that use the same ASN. A key with an empty slice means "no conflict".
	sharedBy := map[uint32][]string{}
	for _, nw := range m.Allocation.MachineNetworks {
		if nw.ASN != 0 {
			sharedBy[nw.ASN] = nil
		}
	}

	for _, other := range c.Machines {
		if other.ID == m.ID || !isFirewall(other) {
			continue
		}

		// A machine may carry the same ASN on more than one of its networks;
		// remember which ASNs were already recorded for it so that its ID
		// does not show up repeatedly in the details.
		recorded := map[uint32]bool{}

		for _, nw := range other.Allocation.MachineNetworks {
			if nw.ASN == 0 || recorded[nw.ASN] {
				continue
			}

			if _, relevant := sharedBy[nw.ASN]; !relevant {
				continue
			}

			recorded[nw.ASN] = true
			sharedBy[nw.ASN] = append(sharedBy[nw.ASN], other.ID)
		}
	}

	var overlaps []string
	for asn, ids := range sharedBy {
		if len(ids) == 0 {
			continue
		}

		overlaps = append(overlaps, fmt.Sprintf("- ASN (%d) not unique, shared with %s", asn, ids))
	}

	if len(overlaps) == 0 {
		return false
	}

	// Map iteration order is random; sorting the lines makes the details deterministic.
	sort.Strings(overlaps)

	i.details = strings.Join(overlaps, "\n")

	return true
}

// Details returns the details text stored on the issue. Evaluate sets it when
// it finds overlapping ASNs; otherwise it is the empty string of the zero value.
func (i *IssueASNUniqueness) Details() string {
return i.details
}
47 changes: 47 additions & 0 deletions cmd/metal-api/internal/issues/bmc-info-outdated.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
package issues

import (
"fmt"
"time"

"github.com/metal-stack/metal-api/cmd/metal-api/internal/metal"
)

// IssueTypeBMCInfoOutdated is the type identifier of the issue reported when
// the BMC information of a machine is outdated.
const IssueTypeBMCInfoOutdated IssueType = "bmc-info-outdated"

// IssueBMCInfoOutdated is the issue for machines with outdated BMC information.
type IssueBMCInfoOutdated struct {
	// details holds the text produced by the last Evaluate call that found the issue.
	details string
}

// Details returns the details text stored on the issue. Evaluate sets it when
// the issue is present; otherwise it is the empty string of the zero value.
func (i *IssueBMCInfoOutdated) Details() string {
return i.details
}

// Evaluate reports whether the IPMI information of m is outdated.
//
// The issue is present when the IPMI data has never been updated (zero
// timestamp) or when the last update is more than 20 minutes ago. In both
// cases the issue details are set accordingly. The ec and c parameters are
// not used by this issue.
func (i *IssueBMCInfoOutdated) Evaluate(m metal.Machine, ec metal.ProvisioningEventContainer, c *IssueConfig) bool {
	// maxAge is the age after which the IPMI data counts as outdated.
	const maxAge = 20 * time.Minute

	if m.IPMI.LastUpdated.IsZero() {
		i.details = "machine ipmi has never been set"
		return true
	}

	age := time.Since(m.IPMI.LastUpdated)
	if age <= maxAge {
		return false
	}

	i.details = fmt.Sprintf("last updated %s ago", age.String())

	return true
}

// Spec returns the static specification of this issue: its type, severity,
// description and the URL of the troubleshooting documentation.
func (*IssueBMCInfoOutdated) Spec() *issueSpec {
	spec := issueSpec{
		Type:        IssueTypeBMCInfoOutdated,
		Severity:    IssueSeverityMajor,
		Description: "BMC has not been updated from either metal-hammer or metal-bmc",
		RefURL:      "https://docs.metal-stack.io/stable/installation/troubleshoot/#bmc-info-outdated",
	}

	return &spec
}
28 changes: 28 additions & 0 deletions cmd/metal-api/internal/issues/bmc-without-ip.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
package issues

import "github.com/metal-stack/metal-api/cmd/metal-api/internal/metal"

// IssueTypeBMCWithoutIP is the type identifier of the issue reported when the
// BMC of a machine has no IP address.
const IssueTypeBMCWithoutIP IssueType = "bmc-without-ip"

// IssueBMCWithoutIP is the issue for machines whose BMC has no IP address.
// It carries no state.
type IssueBMCWithoutIP struct{}

// Spec returns the static specification of this issue: its type, severity,
// description and the URL of the troubleshooting documentation.
func (i *IssueBMCWithoutIP) Spec() *issueSpec {
	spec := issueSpec{
		Type:        IssueTypeBMCWithoutIP,
		Severity:    IssueSeverityMajor,
		Description: "BMC has no ip address",
		RefURL:      "https://docs.metal-stack.io/stable/installation/troubleshoot/#bmc-without-ip",
	}

	return &spec
}

// Evaluate reports whether the IPMI address of m is empty. The ec and c
// parameters are not used by this issue.
func (i *IssueBMCWithoutIP) Evaluate(m metal.Machine, ec metal.ProvisioningEventContainer, c *IssueConfig) bool {
	hasAddress := m.IPMI.Address != ""

	return !hasAddress
}

// Details always returns the empty string; this issue has no further details.
func (*IssueBMCWithoutIP) Details() string {
	return ""
}
28 changes: 28 additions & 0 deletions cmd/metal-api/internal/issues/bmc-without-mac.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
package issues

import "github.com/metal-stack/metal-api/cmd/metal-api/internal/metal"

// IssueTypeBMCWithoutMAC is the type identifier of the issue reported when the
// BMC of a machine has no MAC address.
const IssueTypeBMCWithoutMAC IssueType = "bmc-without-mac"

// IssueBMCWithoutMAC is the issue for machines whose BMC has no MAC address.
// It carries no state.
type IssueBMCWithoutMAC struct{}

// Spec returns the static specification of this issue: its type, severity,
// description and the URL of the troubleshooting documentation.
func (i *IssueBMCWithoutMAC) Spec() *issueSpec {
	spec := issueSpec{
		Type:        IssueTypeBMCWithoutMAC,
		Severity:    IssueSeverityMajor,
		Description: "BMC has no mac address",
		RefURL:      "https://docs.metal-stack.io/stable/installation/troubleshoot/#bmc-without-mac",
	}

	return &spec
}

// Evaluate reports whether the IPMI MAC address of m is empty. The ec and c
// parameters are not used by this issue.
func (i *IssueBMCWithoutMAC) Evaluate(m metal.Machine, ec metal.ProvisioningEventContainer, c *IssueConfig) bool {
	hasMAC := m.IPMI.MacAddress != ""

	return !hasMAC
}

// Details always returns the empty string; this issue has no further details.
func (*IssueBMCWithoutMAC) Details() string {
	return ""
}
38 changes: 38 additions & 0 deletions cmd/metal-api/internal/issues/crash-loop.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
package issues

import (
"github.com/metal-stack/metal-api/cmd/metal-api/internal/metal"
"github.com/metal-stack/metal-lib/pkg/pointer"
)

// IssueTypeCrashLoop is the type identifier of the issue reported when a
// machine is in a provisioning crash loop.
const IssueTypeCrashLoop IssueType = "crashloop"

// IssueCrashLoop is the issue for machines in a provisioning crash loop.
// It carries no state.
type IssueCrashLoop struct{}

// Spec returns the static specification of this issue: its type, severity,
// description and the URL of the troubleshooting documentation.
func (i *IssueCrashLoop) Spec() *issueSpec {
	spec := issueSpec{
		Type:        IssueTypeCrashLoop,
		Severity:    IssueSeverityMajor,
		Description: "machine is in a provisioning crash loop (⭕)",
		RefURL:      "https://docs.metal-stack.io/stable/installation/troubleshoot/#crashloop",
	}

	return &spec
}

// Evaluate reports whether the machine is in a provisioning crash loop.
//
// The issue is present when the event container is flagged as crash looping
// and its first event is not the waiting event. The m and c parameters are
// not used by this issue.
func (i *IssueCrashLoop) Evaluate(m metal.Machine, ec metal.ProvisioningEventContainer, c *IssueConfig) bool {
	if !ec.CrashLoop {
		return false
	}

	// Machines that are waiting are not considered to have issues.
	return pointer.FirstOrZero(ec.Events).Event != metal.ProvisioningEventWaiting
}

// Details always returns the empty string; this issue has no further details.
func (*IssueCrashLoop) Details() string {
	return ""
}
41 changes: 41 additions & 0 deletions cmd/metal-api/internal/issues/failed-machine-reclaim.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
package issues

import (
"github.com/metal-stack/metal-api/cmd/metal-api/internal/metal"
"github.com/metal-stack/metal-lib/pkg/pointer"
)

// IssueTypeFailedMachineReclaim is the type identifier of the issue reported
// when a machine phones home although it is not allocated.
const IssueTypeFailedMachineReclaim IssueType = "failed-machine-reclaim"

// IssueFailedMachineReclaim is the issue for machines whose reclaim failed.
// It carries no state.
type IssueFailedMachineReclaim struct{}

// Spec returns the static specification of this issue: its type, severity,
// description and the URL of the troubleshooting documentation.
func (i *IssueFailedMachineReclaim) Spec() *issueSpec {
	spec := issueSpec{
		Type:        IssueTypeFailedMachineReclaim,
		Severity:    IssueSeverityCritical,
		Description: "machine phones home but not allocated",
		RefURL:      "https://docs.metal-stack.io/stable/installation/troubleshoot/#failed-machine-reclaim",
	}

	return &spec
}

// Evaluate reports whether the machine reclaim failed.
//
// The issue is present when the event container is flagged with a failed
// machine reclaim, or when m has no allocation while the first event of the
// container is the phoned-home event. The c parameter is not used by this
// issue.
func (i *IssueFailedMachineReclaim) Evaluate(m metal.Machine, ec metal.ProvisioningEventContainer, c *IssueConfig) bool {
	if ec.FailedMachineReclaim {
		return true
	}

	// Compatibility: before the provisioning FSM was renewed, this state could
	// only be detected in the following way, so this condition is kept.
	return m.Allocation == nil && pointer.FirstOrZero(ec.Events).Event == metal.ProvisioningEventPhonedHome
}

// Details always returns the empty string; this issue has no further details.
func (*IssueFailedMachineReclaim) Details() string {
	return ""
}
Loading

0 comments on commit 1497133

Please sign in to comment.