From 72b0fc0800fd38b46bed27096cd660110ae28079 Mon Sep 17 00:00:00 2001 From: Gerrit Date: Wed, 11 Oct 2023 10:09:27 +0200 Subject: [PATCH] Add endpoint for machine issues. (#471) --- .../internal/issues/asn-uniqueness.go | 122 ++++ .../internal/issues/bmc-info-outdated.go | 47 ++ .../internal/issues/bmc-without-ip.go | 28 + .../internal/issues/bmc-without-mac.go | 28 + cmd/metal-api/internal/issues/crash-loop.go | 38 ++ .../internal/issues/failed-machine-reclaim.go | 41 ++ cmd/metal-api/internal/issues/issues.go | 207 +++++++ cmd/metal-api/internal/issues/issues_test.go | 530 ++++++++++++++++++ .../internal/issues/last-event-error.go | 51 ++ .../internal/issues/liveliness-dead.go | 28 + .../issues/liveliness-not-available.go | 33 ++ .../internal/issues/liveliness-unknown.go | 28 + .../internal/issues/no-event-container.go | 30 + cmd/metal-api/internal/issues/no-partition.go | 28 + .../internal/issues/non-distinct-bmc-ip.go | 65 +++ cmd/metal-api/internal/issues/severeties.go | 48 ++ cmd/metal-api/internal/issues/types.go | 70 +++ cmd/metal-api/internal/metal/machine.go | 5 - cmd/metal-api/internal/metal/provisioning.go | 5 - .../internal/metal/provisioning_test.go | 36 -- .../internal/service/machine-service.go | 170 +++++- .../internal/service/partition-service.go | 179 +++--- .../service/partition-service_test.go | 24 +- cmd/metal-api/internal/service/v1/machine.go | 25 + .../internal/service/v1/partition.go | 15 +- go.mod | 4 +- go.sum | 8 +- spec/metal-api.json | 366 +++++++++++- 28 files changed, 2056 insertions(+), 203 deletions(-) create mode 100644 cmd/metal-api/internal/issues/asn-uniqueness.go create mode 100644 cmd/metal-api/internal/issues/bmc-info-outdated.go create mode 100644 cmd/metal-api/internal/issues/bmc-without-ip.go create mode 100644 cmd/metal-api/internal/issues/bmc-without-mac.go create mode 100644 cmd/metal-api/internal/issues/crash-loop.go create mode 100644 cmd/metal-api/internal/issues/failed-machine-reclaim.go create mode 100644 
cmd/metal-api/internal/issues/issues.go create mode 100644 cmd/metal-api/internal/issues/issues_test.go create mode 100644 cmd/metal-api/internal/issues/last-event-error.go create mode 100644 cmd/metal-api/internal/issues/liveliness-dead.go create mode 100644 cmd/metal-api/internal/issues/liveliness-not-available.go create mode 100644 cmd/metal-api/internal/issues/liveliness-unknown.go create mode 100644 cmd/metal-api/internal/issues/no-event-container.go create mode 100644 cmd/metal-api/internal/issues/no-partition.go create mode 100644 cmd/metal-api/internal/issues/non-distinct-bmc-ip.go create mode 100644 cmd/metal-api/internal/issues/severeties.go create mode 100644 cmd/metal-api/internal/issues/types.go diff --git a/cmd/metal-api/internal/issues/asn-uniqueness.go b/cmd/metal-api/internal/issues/asn-uniqueness.go new file mode 100644 index 000000000..9c8befe62 --- /dev/null +++ b/cmd/metal-api/internal/issues/asn-uniqueness.go @@ -0,0 +1,122 @@ +package issues + +import ( + "fmt" + "sort" + "strings" + + "github.com/metal-stack/metal-api/cmd/metal-api/internal/metal" +) + +const ( + TypeASNUniqueness Type = "asn-not-unique" +) + +type ( + issueASNUniqueness struct { + details string + } +) + +func (i *issueASNUniqueness) Spec() *spec { + return &spec{ + Type: TypeASNUniqueness, + Severity: SeverityMinor, + Description: "The ASN is not unique (only impact on firewalls)", + RefURL: "https://docs.metal-stack.io/stable/installation/troubleshoot/#asn-not-unique", + } +} + +func (i *issueASNUniqueness) Evaluate(m metal.Machine, ec metal.ProvisioningEventContainer, c *Config) bool { + var ( + machineASNs = map[uint32]metal.Machines{} + overlaps []string + isNoFirewall = func(m metal.Machine) bool { + return m.Allocation == nil || m.Allocation.Role != metal.RoleFirewall + } + ) + + if isNoFirewall(m) { + return false + } + + for _, n := range m.Allocation.MachineNetworks { + n := n + + if n.ASN == 0 { + continue + } + + machineASNs[n.ASN] = nil + } + + for _, 
machineFromAll := range c.Machines { + machineFromAll := machineFromAll + + if machineFromAll.ID == m.ID { + continue + } + otherMachine := machineFromAll + + if isNoFirewall(otherMachine) { + continue + } + + for _, n := range otherMachine.Allocation.MachineNetworks { + n := n + + if n.ASN == 0 { + continue + } + + _, ok := machineASNs[n.ASN] + if !ok { + continue + } + + machineASNs[n.ASN] = append(machineASNs[n.ASN], otherMachine) + } + } + + var asnList []uint32 + for asn := range machineASNs { + asn := asn + asnList = append(asnList, asn) + } + sort.Slice(asnList, func(i, j int) bool { + return asnList[i] < asnList[j] + }) + + for _, asn := range asnList { + asn := asn + + overlappingMachines, ok := machineASNs[asn] + if !ok || len(overlappingMachines) == 0 { + continue + } + + var sharedIDs []string + for _, m := range overlappingMachines { + m := m + sharedIDs = append(sharedIDs, m.ID) + } + + overlaps = append(overlaps, fmt.Sprintf("- ASN (%d) not unique, shared with %s", asn, sharedIDs)) + } + + if len(overlaps) == 0 { + return false + } + + sort.Slice(overlaps, func(i, j int) bool { + return overlaps[i] < overlaps[j] + }) + + i.details = strings.Join(overlaps, "\n") + + return true +} + +func (i *issueASNUniqueness) Details() string { + return i.details +} diff --git a/cmd/metal-api/internal/issues/bmc-info-outdated.go b/cmd/metal-api/internal/issues/bmc-info-outdated.go new file mode 100644 index 000000000..1a03ced9f --- /dev/null +++ b/cmd/metal-api/internal/issues/bmc-info-outdated.go @@ -0,0 +1,47 @@ +package issues + +import ( + "fmt" + "time" + + "github.com/metal-stack/metal-api/cmd/metal-api/internal/metal" +) + +const ( + TypeBMCInfoOutdated Type = "bmc-info-outdated" +) + +type ( + issueBMCInfoOutdated struct { + details string + } +) + +func (i *issueBMCInfoOutdated) Details() string { + return i.details +} + +func (i *issueBMCInfoOutdated) Evaluate(m metal.Machine, ec metal.ProvisioningEventContainer, c *Config) bool { + if 
m.IPMI.LastUpdated.IsZero() { + i.details = "machine ipmi has never been set" + return true + } + + lastUpdated := time.Since(m.IPMI.LastUpdated) + + if lastUpdated > 20*time.Minute { + i.details = fmt.Sprintf("last updated %s ago", lastUpdated.String()) + return true + } + + return false +} + +func (*issueBMCInfoOutdated) Spec() *spec { + return &spec{ + Type: TypeBMCInfoOutdated, + Severity: SeverityMajor, + Description: "BMC has not been updated from either metal-hammer or metal-bmc", + RefURL: "https://docs.metal-stack.io/stable/installation/troubleshoot/#bmc-info-outdated", + } +} diff --git a/cmd/metal-api/internal/issues/bmc-without-ip.go b/cmd/metal-api/internal/issues/bmc-without-ip.go new file mode 100644 index 000000000..5b1766307 --- /dev/null +++ b/cmd/metal-api/internal/issues/bmc-without-ip.go @@ -0,0 +1,28 @@ +package issues + +import "github.com/metal-stack/metal-api/cmd/metal-api/internal/metal" + +const ( + TypeBMCWithoutIP Type = "bmc-without-ip" +) + +type ( + issueBMCWithoutIP struct{} +) + +func (i *issueBMCWithoutIP) Spec() *spec { + return &spec{ + Type: TypeBMCWithoutIP, + Severity: SeverityMajor, + Description: "BMC has no ip address", + RefURL: "https://docs.metal-stack.io/stable/installation/troubleshoot/#bmc-without-ip", + } +} + +func (i *issueBMCWithoutIP) Evaluate(m metal.Machine, ec metal.ProvisioningEventContainer, c *Config) bool { + return m.IPMI.Address == "" +} + +func (i *issueBMCWithoutIP) Details() string { + return "" +} diff --git a/cmd/metal-api/internal/issues/bmc-without-mac.go b/cmd/metal-api/internal/issues/bmc-without-mac.go new file mode 100644 index 000000000..7d92209c2 --- /dev/null +++ b/cmd/metal-api/internal/issues/bmc-without-mac.go @@ -0,0 +1,28 @@ +package issues + +import "github.com/metal-stack/metal-api/cmd/metal-api/internal/metal" + +const ( + TypeBMCWithoutMAC Type = "bmc-without-mac" +) + +type ( + issueBMCWithoutMAC struct{} +) + +func (i *issueBMCWithoutMAC) Spec() *spec { + return &spec{ + Type: 
TypeBMCWithoutMAC, + Severity: SeverityMajor, + Description: "BMC has no mac address", + RefURL: "https://docs.metal-stack.io/stable/installation/troubleshoot/#bmc-without-mac", + } +} + +func (i *issueBMCWithoutMAC) Evaluate(m metal.Machine, ec metal.ProvisioningEventContainer, c *Config) bool { + return m.IPMI.MacAddress == "" +} + +func (i *issueBMCWithoutMAC) Details() string { + return "" +} diff --git a/cmd/metal-api/internal/issues/crash-loop.go b/cmd/metal-api/internal/issues/crash-loop.go new file mode 100644 index 000000000..cf3a6c866 --- /dev/null +++ b/cmd/metal-api/internal/issues/crash-loop.go @@ -0,0 +1,38 @@ +package issues + +import ( + "github.com/metal-stack/metal-api/cmd/metal-api/internal/metal" + "github.com/metal-stack/metal-lib/pkg/pointer" +) + +const ( + TypeCrashLoop Type = "crashloop" +) + +type ( + issueCrashLoop struct{} +) + +func (i *issueCrashLoop) Spec() *spec { + return &spec{ + Type: TypeCrashLoop, + Severity: SeverityMajor, + Description: "machine is in a provisioning crash loop (⭕)", + RefURL: "https://docs.metal-stack.io/stable/installation/troubleshoot/#crashloop", + } +} + +// Evaluate reports true when the crash loop flag of the event container is set, +// unless the first provisioning event of the container is a waiting event. +func (i *issueCrashLoop) Evaluate(m metal.Machine, ec metal.ProvisioningEventContainer, c *Config) bool { + if !ec.CrashLoop { + return false + } + + // Machines which are waiting are not considered to have issues + return pointer.FirstOrZero(ec.Events).Event != metal.ProvisioningEventWaiting +} + +func (i *issueCrashLoop) Details() string { + return "" +} diff --git a/cmd/metal-api/internal/issues/failed-machine-reclaim.go b/cmd/metal-api/internal/issues/failed-machine-reclaim.go new file mode 100644 index 000000000..5f3e84861 --- /dev/null +++ b/cmd/metal-api/internal/issues/failed-machine-reclaim.go @@ -0,0 +1,41 @@ +package issues + +import ( + "github.com/metal-stack/metal-api/cmd/metal-api/internal/metal" + "github.com/metal-stack/metal-lib/pkg/pointer" +) + +const ( + TypeFailedMachineReclaim Type = "failed-machine-reclaim" 
+) + +type ( + issueFailedMachineReclaim struct{} +) + +func (i *issueFailedMachineReclaim) Spec() *spec { + return &spec{ + Type: TypeFailedMachineReclaim, + Severity: SeverityCritical, + Description: "machine phones home but not allocated", + RefURL: "https://docs.metal-stack.io/stable/installation/troubleshoot/#failed-machine-reclaim", + } +} + +func (i *issueFailedMachineReclaim) Evaluate(m metal.Machine, ec metal.ProvisioningEventContainer, c *Config) bool { + if ec.FailedMachineReclaim { + return true + } + + // compatibility: before the provisioning FSM was renewed, this state could be detected the following way + // we should keep this condition + if m.Allocation == nil && pointer.FirstOrZero(ec.Events).Event == metal.ProvisioningEventPhonedHome { + return true + } + + return false +} + +func (i *issueFailedMachineReclaim) Details() string { + return "" +} diff --git a/cmd/metal-api/internal/issues/issues.go b/cmd/metal-api/internal/issues/issues.go new file mode 100644 index 000000000..0d72e27b6 --- /dev/null +++ b/cmd/metal-api/internal/issues/issues.go @@ -0,0 +1,207 @@ +package issues + +import ( + "sort" + "time" + + "github.com/metal-stack/metal-api/cmd/metal-api/internal/metal" +) + +type ( + // Config contains configuration parameters for finding machine issues + Config struct { + // Machines are the machines to evaluate issues for + Machines metal.Machines + // EventContainers are the event containers of the machines to evaluate issues for + // if not provided the machines will have a no-event-container issue + EventContainers metal.ProvisioningEventContainers + // Severity filters issues for the given severity + Severity Severity + // Only includes only the given issue types + Only []Type + // Omit omits the given issue types, this has precedence over only + Omit []Type + // LastErrorThreshold specifies for how long in the past the last event error is counted as an error + LastErrorThreshold time.Duration + } + + // Issue formulates an issue of a 
machine + Issue struct { + // Type specifies the issue type (id) + Type Type + // Severity specifies the severity of an issue + Severity Severity + // Description provides an issue description + Description string + // RefURL provides a link to a more detailed issue description in the metal-stack documentation + RefURL string + // Details may contain additional details on an evaluated issue + Details string + } + + // Issues is a list of issues + Issues []Issue + + // MachineWithIssues summarizes a machine with issues + MachineWithIssues struct { + Machine *metal.Machine + Issues Issues + } + // MachineIssues is map of a machine response to a list of machine issues + MachineIssues []*MachineWithIssues + + // MachineIssuesMap is a map of machine issues with the machine id as a map key + MachineIssuesMap map[string]*MachineWithIssues + + issue interface { + // Evaluate decides whether a given machine has the machine issue. + // the third argument contains additional information that may be required for the issue evaluation + Evaluate(m metal.Machine, ec metal.ProvisioningEventContainer, c *Config) bool + // Spec returns the issue spec of this issue. + Spec() *spec + // Details returns additional information on the issue after the evaluation. + Details() string + } + + // spec defines the specification of an issue. 
+ spec struct { + Type Type + Severity Severity + Description string + RefURL string + } +) + +// All returns one entry per known issue type, built from freshly created (unevaluated) issues. +// Types that cannot be resolved by NewIssueFromType are skipped. +func All() Issues { + var res Issues + + for _, t := range AllIssueTypes() { + i, err := NewIssueFromType(t) + if err != nil { + continue + } + + res = append(res, toIssue(i)) + } + + return res +} + +// toIssue converts an issue implementation into its exported representation +// by combining the issue spec with the details of the last evaluation. +func toIssue(i issue) Issue { + s := i.Spec() + + return Issue{ + Type: s.Type, + Severity: s.Severity, + Description: s.Description, + RefURL: s.RefURL, + Details: i.Details(), + } +} + +// Find evaluates the machines of the given config and returns the issues found, keyed by machine id. +// A machine without an event container only gets the no-event-container issue, which is subject to +// the severity, omit and only filters like every other issue type. +// A zero LastErrorThreshold is replaced with the default on the passed config. +func Find(c *Config) (MachineIssuesMap, error) { + if c.LastErrorThreshold == 0 { + c.LastErrorThreshold = DefaultLastErrorThreshold() + } + + res := MachineIssuesMap{} + + ecs := c.EventContainers.ByID() + + // the filter decision does not depend on the machine, so it is taken once up front + noEventContainer := toIssue(&issueNoEventContainer{}) + includeNoEventContainer := c.passesFilters(noEventContainer.Type, noEventContainer.Severity) + + for _, m := range c.Machines { + m := m + + ec, ok := ecs[m.ID] + if !ok { + if includeNoEventContainer { + res.add(m, noEventContainer) + } + continue + } + + for _, t := range AllIssueTypes() { + if !c.includeIssue(t) { + continue + } + + i, err := NewIssueFromType(t) + if err != nil { + return nil, err + } + + if i.Evaluate(m, ec, c) { + res.add(m, toIssue(i)) + } + } + } + + return res, nil +} + +// Get returns the entry of the machine with the given id or nil if there is none. +// Entries without a machine are ignored. +func (mis MachineIssues) Get(id string) *MachineWithIssues { + for _, m := range mis { + m := m + + if m.Machine == nil { + continue + } + + if m.Machine.ID == id { + return m + } + } + + return nil +} + +// includeIssue reports whether the given issue type passes the filters of the config. +// Types that cannot be resolved by NewIssueFromType are never included. +func (c *Config) includeIssue(t Type) bool { + issue, err := NewIssueFromType(t) + if err != nil { + return false + } + + return c.passesFilters(t, issue.Spec().Severity) +} + +// passesFilters applies the severity, omit and only filters of the config +// to an issue type of the given severity. Omit has precedence over only. +func (c *Config) passesFilters(t Type, severity Severity) bool { + if severity.LowerThan(c.Severity) { + return false + } + + for _, o := range c.Omit { + if t == o { + return false + } + } + + // without an only filter every type that was not omitted is included + if len(c.Only) == 0 { + return true + } + + for _, o := range c.Only { + if t == o { + return true + } + } + + return false +} + +// add appends the issue to the entry of the given machine and creates the entry if it does not exist yet. +func (mim MachineIssuesMap) add(m metal.Machine, issue Issue) { + machineWithIssues, ok := mim[m.ID] + if !ok { + machineWithIssues = &MachineWithIssues{ + Machine: &m, + } + } + machineWithIssues.Issues = append(machineWithIssues.Issues, issue) + mim[m.ID] = machineWithIssues +} + +func (mim MachineIssuesMap) 
ToList() MachineIssues { + var res MachineIssues + + for _, machineWithIssues := range mim { + res = append(res, &MachineWithIssues{ + Machine: machineWithIssues.Machine, + Issues: machineWithIssues.Issues, + }) + } + + sort.Slice(res, func(i, j int) bool { + return res[i].Machine.ID < res[j].Machine.ID + }) + + return res +} diff --git a/cmd/metal-api/internal/issues/issues_test.go b/cmd/metal-api/internal/issues/issues_test.go new file mode 100644 index 000000000..934f26add --- /dev/null +++ b/cmd/metal-api/internal/issues/issues_test.go @@ -0,0 +1,530 @@ +package issues + +import ( + "fmt" + "testing" + "time" + + "github.com/google/go-cmp/cmp" + "github.com/metal-stack/metal-api/cmd/metal-api/internal/metal" + "github.com/stretchr/testify/require" +) + +func TestFindIssues(t *testing.T) { + machineTemplate := func(id string) metal.Machine { + return metal.Machine{ + Base: metal.Base{ + ID: id, + }, + PartitionID: "a", + IPMI: metal.IPMI{ + Address: "1.2.3.4", + MacAddress: "aa:bb:00", + LastUpdated: time.Now().Add(-1 * time.Minute), + }, + } + } + eventContainerTemplate := func(id string) metal.ProvisioningEventContainer { + return metal.ProvisioningEventContainer{ + Base: metal.Base{ + ID: id, + }, + Liveliness: metal.MachineLivelinessAlive, + } + } + + tests := []struct { + name string + only []Type + + machines func() metal.Machines + eventContainers func() metal.ProvisioningEventContainers + + want func(machines metal.Machines) MachineIssues + }{ + { + name: "good machine has no issues", + machines: func() metal.Machines { + return metal.Machines{ + machineTemplate("good"), + } + }, + eventContainers: func() metal.ProvisioningEventContainers { + return metal.ProvisioningEventContainers{ + eventContainerTemplate("good"), + } + }, + want: nil, + }, + { + name: "no partition", + only: []Type{TypeNoPartition}, + machines: func() metal.Machines { + noPartitionMachine := machineTemplate("no-partition") + noPartitionMachine.PartitionID = "" + + return 
metal.Machines{ + noPartitionMachine, + machineTemplate("good"), + } + }, + eventContainers: func() metal.ProvisioningEventContainers { + return metal.ProvisioningEventContainers{ + eventContainerTemplate("no-partition"), + eventContainerTemplate("good"), + } + }, + want: func(machines metal.Machines) MachineIssues { + return MachineIssues{ + { + Machine: &machines[0], + Issues: Issues{ + toIssue(&issueNoPartition{}), + }, + }, + } + }, + }, + { + name: "liveliness dead", + only: []Type{TypeLivelinessDead}, + machines: func() metal.Machines { + return metal.Machines{ + machineTemplate("dead"), + machineTemplate("good"), + } + }, + eventContainers: func() metal.ProvisioningEventContainers { + dead := eventContainerTemplate("dead") + dead.Liveliness = metal.MachineLivelinessDead + + return metal.ProvisioningEventContainers{ + dead, + eventContainerTemplate("good"), + } + }, + want: func(machines metal.Machines) MachineIssues { + return MachineIssues{ + { + Machine: &machines[0], + Issues: Issues{ + toIssue(&issueLivelinessDead{}), + }, + }, + } + }, + }, + { + name: "liveliness unknown", + only: []Type{TypeLivelinessUnknown}, + machines: func() metal.Machines { + return metal.Machines{ + machineTemplate("unknown"), + machineTemplate("good"), + } + }, + eventContainers: func() metal.ProvisioningEventContainers { + unknown := eventContainerTemplate("unknown") + unknown.Liveliness = metal.MachineLivelinessUnknown + + return metal.ProvisioningEventContainers{ + unknown, + eventContainerTemplate("good"), + } + }, + want: func(machines metal.Machines) MachineIssues { + return MachineIssues{ + { + Machine: &machines[0], + Issues: Issues{ + toIssue(&issueLivelinessUnknown{}), + }, + }, + } + }, + }, + { + name: "liveliness not available", + only: []Type{TypeLivelinessNotAvailable}, + machines: func() metal.Machines { + return metal.Machines{ + machineTemplate("n/a"), + machineTemplate("good"), + } + }, + eventContainers: func() metal.ProvisioningEventContainers { + na := 
eventContainerTemplate("n/a") + na.Liveliness = metal.MachineLiveliness("") + + return metal.ProvisioningEventContainers{ + na, + eventContainerTemplate("good"), + } + }, + want: func(machines metal.Machines) MachineIssues { + return MachineIssues{ + { + Machine: &machines[0], + Issues: Issues{ + toIssue(&issueLivelinessNotAvailable{}), + }, + }, + } + }, + }, + { + name: "failed machine reclaim", + only: []Type{TypeFailedMachineReclaim}, + machines: func() metal.Machines { + failedOld := machineTemplate("failed-old") + + return metal.Machines{ + machineTemplate("good"), + machineTemplate("failed"), + failedOld, + } + }, + eventContainers: func() metal.ProvisioningEventContainers { + failed := eventContainerTemplate("failed") + failed.FailedMachineReclaim = true + + failedOld := eventContainerTemplate("failed-old") + failedOld.Events = metal.ProvisioningEvents{ + { + Event: metal.ProvisioningEventPhonedHome, + }, + } + + return metal.ProvisioningEventContainers{ + failed, + eventContainerTemplate("good"), + failedOld, + } + }, + want: func(machines metal.Machines) MachineIssues { + return MachineIssues{ + { + Machine: &machines[1], + Issues: Issues{ + toIssue(&issueFailedMachineReclaim{}), + }, + }, + { + Machine: &machines[2], + Issues: Issues{ + toIssue(&issueFailedMachineReclaim{}), + }, + }, + } + }, + }, + { + name: "crashloop", + only: []Type{TypeCrashLoop}, + machines: func() metal.Machines { + return metal.Machines{ + machineTemplate("good"), + machineTemplate("crash"), + } + }, + eventContainers: func() metal.ProvisioningEventContainers { + crash := eventContainerTemplate("crash") + crash.CrashLoop = true + + return metal.ProvisioningEventContainers{ + crash, + eventContainerTemplate("good"), + } + }, + want: func(machines metal.Machines) MachineIssues { + return MachineIssues{ + { + Machine: &machines[1], + Issues: Issues{ + toIssue(&issueCrashLoop{}), + }, + }, + } + }, + }, + // FIXME: + // { + // name: "last event error", + // only: 
[]IssueType{IssueTypeLastEventError}, + // machines: func() metal.Machines { + // lastEventErrorMachine := machineTemplate("last") + + // return metal.Machines{ + // machineTemplate("good"), + // lastEventErrorMachine, + // } + // }, + // eventContainers: func() metal.ProvisioningEventContainers { + // last := eventContainerTemplate("last") + // last.LastErrorEvent = &metal.ProvisioningEvent{ + // Time: time.Now().Add(-5 * time.Minute), + // } + // return metal.ProvisioningEventContainers{ + // last, + // eventContainerTemplate("good"), + // } + // }, + // want: func(machines metal.Machines) MachineIssues { + // return MachineIssues{ + // { + // Machine: &machines[1], + // Issues: Issues{ + // toIssue(&IssueLastEventError{details: "occurred 5m0s ago"}), + // }, + // }, + // } + // }, + // }, + { + name: "bmc without mac", + only: []Type{TypeBMCWithoutMAC}, + machines: func() metal.Machines { + noMac := machineTemplate("no-mac") + noMac.IPMI.MacAddress = "" + + return metal.Machines{ + machineTemplate("good"), + noMac, + } + }, + eventContainers: func() metal.ProvisioningEventContainers { + crash := eventContainerTemplate("crash") + crash.CrashLoop = true + + return metal.ProvisioningEventContainers{ + eventContainerTemplate("no-mac"), + eventContainerTemplate("good"), + } + }, + want: func(machines metal.Machines) MachineIssues { + return MachineIssues{ + { + Machine: &machines[1], + Issues: Issues{ + toIssue(&issueBMCWithoutMAC{}), + }, + }, + } + }, + }, + { + name: "bmc without ip", + only: []Type{TypeBMCWithoutIP}, + machines: func() metal.Machines { + noIP := machineTemplate("no-ip") + noIP.IPMI.Address = "" + + return metal.Machines{ + machineTemplate("good"), + noIP, + } + }, + eventContainers: func() metal.ProvisioningEventContainers { + crash := eventContainerTemplate("crash") + crash.CrashLoop = true + + return metal.ProvisioningEventContainers{ + eventContainerTemplate("no-ip"), + eventContainerTemplate("good"), + } + }, + want: func(machines 
metal.Machines) MachineIssues { + return MachineIssues{ + { + Machine: &machines[1], + Issues: Issues{ + toIssue(&issueBMCWithoutIP{}), + }, + }, + } + }, + }, + // FIXME: + // { + // name: "bmc info outdated", + // only: []IssueType{IssueTypeBMCInfoOutdated}, + // machines: func() metal.Machines { + // outdated := machineTemplate("outdated") + // outdated.IPMI.LastUpdated = time.Now().Add(-3 * 60 * time.Minute) + + // return metal.Machines{ + // machineTemplate("good"), + // outdated, + // } + // }, + // eventContainers: func() metal.ProvisioningEventContainers { + // return metal.ProvisioningEventContainers{ + // eventContainerTemplate("outdated"), + // eventContainerTemplate("good"), + // } + // }, + // want: func(machines metal.Machines) MachineIssues { + // return MachineIssues{ + // { + // Machine: &machines[1], + // Issues: Issues{ + // toIssue(&IssueBMCInfoOutdated{ + // details: "last updated 3h0m0s ago", + // }), + // }, + // }, + // } + // }, + // }, + { + name: "asn shared", + only: []Type{TypeASNUniqueness}, + machines: func() metal.Machines { + shared1 := machineTemplate("shared1") + shared1.Allocation = &metal.MachineAllocation{ + Role: metal.RoleFirewall, + MachineNetworks: []*metal.MachineNetwork{ + { + ASN: 0, + }, + { + ASN: 100, + }, + { + ASN: 200, + }, + }, + } + + shared2 := machineTemplate("shared2") + shared2.Allocation = &metal.MachineAllocation{ + Role: metal.RoleFirewall, + MachineNetworks: []*metal.MachineNetwork{ + { + ASN: 1, + }, + { + ASN: 100, + }, + { + ASN: 200, + }, + }, + } + + return metal.Machines{ + shared1, + shared2, + machineTemplate("good"), + } + }, + eventContainers: func() metal.ProvisioningEventContainers { + return metal.ProvisioningEventContainers{ + eventContainerTemplate("shared1"), + eventContainerTemplate("shared2"), + eventContainerTemplate("good"), + } + }, + want: func(machines metal.Machines) MachineIssues { + return MachineIssues{ + { + Machine: &machines[0], + Issues: Issues{ + 
toIssue(&issueASNUniqueness{ + details: fmt.Sprintf("- ASN (100) not unique, shared with [%[1]s]\n- ASN (200) not unique, shared with [%[1]s]", machines[1].ID), + }), + }, + }, + { + Machine: &machines[1], + Issues: Issues{ + toIssue(&issueASNUniqueness{ + details: fmt.Sprintf("- ASN (100) not unique, shared with [%[1]s]\n- ASN (200) not unique, shared with [%[1]s]", machines[0].ID), + }), + }, + }, + } + }, + }, + { + name: "non distinct bmc ip", + only: []Type{TypeNonDistinctBMCIP}, + machines: func() metal.Machines { + bmc1 := machineTemplate("bmc1") + bmc1.IPMI.Address = "127.0.0.1" + + bmc2 := machineTemplate("bmc2") + bmc2.IPMI.Address = "127.0.0.1" + + return metal.Machines{ + bmc1, + bmc2, + machineTemplate("good"), + } + }, + eventContainers: func() metal.ProvisioningEventContainers { + return metal.ProvisioningEventContainers{ + eventContainerTemplate("bmc1"), + eventContainerTemplate("bmc2"), + eventContainerTemplate("good"), + } + }, + want: func(machines metal.Machines) MachineIssues { + return MachineIssues{ + { + Machine: &machines[0], + Issues: Issues{ + toIssue(&issueNonDistinctBMCIP{ + details: fmt.Sprintf("BMC IP (127.0.0.1) not unique, shared with [%[1]s]", machines[1].ID), + }), + }, + }, + { + Machine: &machines[1], + Issues: Issues{ + toIssue(&issueNonDistinctBMCIP{ + details: fmt.Sprintf("BMC IP (127.0.0.1) not unique, shared with [%[1]s]", machines[0].ID), + }), + }, + }, + } + }, + }, + } + for _, tt := range tests { + tt := tt + t.Run(tt.name, func(t *testing.T) { + ms := tt.machines() + + got, err := Find(&Config{ + Machines: ms, + EventContainers: tt.eventContainers(), + Only: tt.only, + LastErrorThreshold: DefaultLastErrorThreshold(), + }) + require.NoError(t, err) + + var want MachineIssues + if tt.want != nil { + want = tt.want(ms) + } + + if diff := cmp.Diff(want, got.ToList(), cmp.AllowUnexported(issueLastEventError{}, issueASNUniqueness{}, issueNonDistinctBMCIP{})); diff != "" { + t.Errorf("diff (+got -want):\n %s", diff) + } + }) 
+ } +} + +func TestAllIssues(t *testing.T) { + issuesTypes := map[Type]bool{} + for _, i := range All() { + issuesTypes[i.Type] = true + } + + for _, ty := range AllIssueTypes() { + if _, ok := issuesTypes[ty]; !ok { + t.Errorf("issue of type %s not contained in all issues", ty) + } + } +} diff --git a/cmd/metal-api/internal/issues/last-event-error.go b/cmd/metal-api/internal/issues/last-event-error.go new file mode 100644 index 000000000..99952d561 --- /dev/null +++ b/cmd/metal-api/internal/issues/last-event-error.go @@ -0,0 +1,51 @@ +package issues + +import ( + "fmt" + "time" + + "github.com/metal-stack/metal-api/cmd/metal-api/internal/metal" +) + +const ( + TypeLastEventError Type = "last-event-error" +) + +type ( + issueLastEventError struct { + details string + } +) + +func DefaultLastErrorThreshold() time.Duration { + return 7 * 24 * time.Hour +} + +func (i *issueLastEventError) Spec() *spec { + return &spec{ + Type: TypeLastEventError, + Severity: SeverityMinor, + Description: "the machine had an error during the provisioning lifecycle", + RefURL: "https://docs.metal-stack.io/stable/installation/troubleshoot/#last-event-error", + } +} + +func (i *issueLastEventError) Evaluate(m metal.Machine, ec metal.ProvisioningEventContainer, c *Config) bool { + if c.LastErrorThreshold == 0 { + return false + } + + if ec.LastErrorEvent != nil { + timeSince := time.Since(time.Time(ec.LastErrorEvent.Time)) + if timeSince < c.LastErrorThreshold { + i.details = fmt.Sprintf("occurred %s ago", timeSince.String()) + return true + } + } + + return false +} + +func (i *issueLastEventError) Details() string { + return i.details +} diff --git a/cmd/metal-api/internal/issues/liveliness-dead.go b/cmd/metal-api/internal/issues/liveliness-dead.go new file mode 100644 index 000000000..c7c8d1407 --- /dev/null +++ b/cmd/metal-api/internal/issues/liveliness-dead.go @@ -0,0 +1,28 @@ +package issues + +import "github.com/metal-stack/metal-api/cmd/metal-api/internal/metal" + +const ( + 
TypeLivelinessDead Type = "liveliness-dead" +) + +type ( + issueLivelinessDead struct{} +) + +func (i *issueLivelinessDead) Spec() *spec { + return &spec{ + Type: TypeLivelinessDead, + Severity: SeverityMajor, + Description: "the machine is not sending events anymore", + RefURL: "https://docs.metal-stack.io/stable/installation/troubleshoot/#liveliness-dead", + } +} + +func (i *issueLivelinessDead) Evaluate(m metal.Machine, ec metal.ProvisioningEventContainer, c *Config) bool { + return ec.Liveliness == metal.MachineLivelinessDead +} + +func (i *issueLivelinessDead) Details() string { + return "" +} diff --git a/cmd/metal-api/internal/issues/liveliness-not-available.go b/cmd/metal-api/internal/issues/liveliness-not-available.go new file mode 100644 index 000000000..647eb468b --- /dev/null +++ b/cmd/metal-api/internal/issues/liveliness-not-available.go @@ -0,0 +1,33 @@ +package issues + +import "github.com/metal-stack/metal-api/cmd/metal-api/internal/metal" + +const ( + TypeLivelinessNotAvailable Type = "liveliness-not-available" +) + +type ( + issueLivelinessNotAvailable struct{} +) + +func (i *issueLivelinessNotAvailable) Spec() *spec { + return &spec{ + Type: TypeLivelinessNotAvailable, + Severity: SeverityMinor, + Description: "the machine liveliness is not available", + } +} + +func (i *issueLivelinessNotAvailable) Evaluate(m metal.Machine, ec metal.ProvisioningEventContainer, c *Config) bool { + allowed := map[metal.MachineLiveliness]bool{ + metal.MachineLivelinessAlive: true, + metal.MachineLivelinessDead: true, + metal.MachineLivelinessUnknown: true, + } + + return !allowed[ec.Liveliness] +} + +func (i *issueLivelinessNotAvailable) Details() string { + return "" +} diff --git a/cmd/metal-api/internal/issues/liveliness-unknown.go b/cmd/metal-api/internal/issues/liveliness-unknown.go new file mode 100644 index 000000000..73cf3bd18 --- /dev/null +++ b/cmd/metal-api/internal/issues/liveliness-unknown.go @@ -0,0 +1,28 @@ +package issues + +import 
"github.com/metal-stack/metal-api/cmd/metal-api/internal/metal" + +const ( + TypeLivelinessUnknown Type = "liveliness-unknown" +) + +type ( + issueLivelinessUnknown struct{} +) + +func (i *issueLivelinessUnknown) Spec() *spec { + return &spec{ + Type: TypeLivelinessUnknown, + Severity: SeverityMajor, + Description: "the machine is not sending LLDP alive messages anymore", + RefURL: "https://docs.metal-stack.io/stable/installation/troubleshoot/#liveliness-unknown", + } +} + +func (i *issueLivelinessUnknown) Evaluate(m metal.Machine, ec metal.ProvisioningEventContainer, c *Config) bool { + return ec.Liveliness == metal.MachineLivelinessUnknown +} + +func (i *issueLivelinessUnknown) Details() string { + return "" +} diff --git a/cmd/metal-api/internal/issues/no-event-container.go b/cmd/metal-api/internal/issues/no-event-container.go new file mode 100644 index 000000000..a5da32aca --- /dev/null +++ b/cmd/metal-api/internal/issues/no-event-container.go @@ -0,0 +1,30 @@ +package issues + +import ( + "github.com/metal-stack/metal-api/cmd/metal-api/internal/metal" +) + +const ( + TypeNoEventContainer Type = "no-event-container" +) + +type ( + issueNoEventContainer struct{} +) + +func (i *issueNoEventContainer) Spec() *spec { + return &spec{ + Type: TypeNoEventContainer, + Severity: SeverityMajor, + Description: "machine has no event container", + RefURL: "https://docs.metal-stack.io/stable/installation/troubleshoot/#no-event-container", + } +} + +func (i *issueNoEventContainer) Evaluate(m metal.Machine, ec metal.ProvisioningEventContainer, c *Config) bool { + return ec.Base.ID == "" +} + +func (i *issueNoEventContainer) Details() string { + return "" +} diff --git a/cmd/metal-api/internal/issues/no-partition.go b/cmd/metal-api/internal/issues/no-partition.go new file mode 100644 index 000000000..b4e79710d --- /dev/null +++ b/cmd/metal-api/internal/issues/no-partition.go @@ -0,0 +1,28 @@ +package issues + +import 
"github.com/metal-stack/metal-api/cmd/metal-api/internal/metal" + +const ( + TypeNoPartition Type = "no-partition" +) + +type ( + issueNoPartition struct{} +) + +func (i *issueNoPartition) Spec() *spec { + return &spec{ + Type: TypeNoPartition, + Severity: SeverityMajor, + Description: "machine with no partition", + RefURL: "https://docs.metal-stack.io/stable/installation/troubleshoot/#no-partition", + } +} + +func (i *issueNoPartition) Evaluate(m metal.Machine, ec metal.ProvisioningEventContainer, c *Config) bool { + return m.PartitionID == "" +} + +func (i *issueNoPartition) Details() string { + return "" +} diff --git a/cmd/metal-api/internal/issues/non-distinct-bmc-ip.go b/cmd/metal-api/internal/issues/non-distinct-bmc-ip.go new file mode 100644 index 000000000..15ebc6077 --- /dev/null +++ b/cmd/metal-api/internal/issues/non-distinct-bmc-ip.go @@ -0,0 +1,65 @@ +package issues + +import ( + "fmt" + + "github.com/metal-stack/metal-api/cmd/metal-api/internal/metal" +) + +const ( + TypeNonDistinctBMCIP Type = "bmc-no-distinct-ip" +) + +type ( + issueNonDistinctBMCIP struct { + details string + } +) + +func (i *issueNonDistinctBMCIP) Spec() *spec { + return &spec{ + Type: TypeNonDistinctBMCIP, + Description: "BMC IP address is not distinct", + RefURL: "https://docs.metal-stack.io/stable/installation/troubleshoot/#bmc-no-distinct-ip", + } +} + +func (i *issueNonDistinctBMCIP) Evaluate(m metal.Machine, ec metal.ProvisioningEventContainer, c *Config) bool { + if m.IPMI.Address == "" { + return false + } + + var ( + bmcIP = m.IPMI.Address + overlaps []string + ) + + for _, machineFromAll := range c.Machines { + machineFromAll := machineFromAll + + if machineFromAll.ID == m.ID { + continue + } + otherMachine := machineFromAll + + if otherMachine.IPMI.Address == "" { + continue + } + + if bmcIP == otherMachine.IPMI.Address { + overlaps = append(overlaps, otherMachine.ID) + } + } + + if len(overlaps) == 0 { + return false + } + + i.details = fmt.Sprintf("BMC IP (%s) not 
// Severity classifies how urgently a machine issue has to be handled.
type Severity string

const (
	// SeverityMinor is an issue that should be checked from time to time but has no bad effects for the user.
	SeverityMinor Severity = "minor"
	// SeverityMajor is an issue where user experience is affected or provider resources are wasted.
	// Overall functionality is still maintained though. Major issues should be resolved as soon as possible.
	SeverityMajor Severity = "major"
	// SeverityCritical is an issue that can lead to dysfunction of the system and needs to be handled as quickly as possible.
	SeverityCritical Severity = "critical"
)

// AllSevereties returns all known severities, ordered from the least to the
// most severe one.
func AllSevereties() []Severity {
	return []Severity{
		SeverityMinor,
		SeverityMajor,
		SeverityCritical,
	}
}

// SeverityFromString converts the input into a Severity. It returns an error
// if the input does not exactly match one of the known severities.
func SeverityFromString(input string) (Severity, error) {
	switch s := Severity(input); s {
	case SeverityCritical, SeverityMajor, SeverityMinor:
		return s, nil
	default:
		return "", fmt.Errorf("unknown issue severity: %s", input)
	}
}

// rank maps a severity to a number that grows with the severity. Unknown
// severities share the rank of SeverityMinor.
func (s Severity) rank() int {
	switch s {
	case SeverityCritical:
		return 10
	case SeverityMajor:
		return 5
	default:
		return 0
	}
}

// LowerThan reports whether s is strictly less severe than o.
func (s Severity) LowerThan(o Severity) bool {
	// Comparing ranks from a switch avoids building a lookup map on every call.
	return s.rank() < o.rank()
}
TypeLivelinessUnknown, + TypeLivelinessNotAvailable, + TypeFailedMachineReclaim, + TypeCrashLoop, + TypeLastEventError, + TypeBMCWithoutMAC, + TypeBMCWithoutIP, + TypeBMCInfoOutdated, + TypeASNUniqueness, + TypeNonDistinctBMCIP, + TypeNoEventContainer, + } +} + +func NotAllocatableIssueTypes() []Type { + return []Type{ + TypeNoPartition, + TypeLivelinessDead, + TypeLivelinessUnknown, + TypeLivelinessNotAvailable, + TypeFailedMachineReclaim, + TypeCrashLoop, + TypeNoEventContainer, + } +} + +func NewIssueFromType(t Type) (issue, error) { + switch t { + case TypeNoPartition: + return &issueNoPartition{}, nil + case TypeLivelinessDead: + return &issueLivelinessDead{}, nil + case TypeLivelinessUnknown: + return &issueLivelinessUnknown{}, nil + case TypeLivelinessNotAvailable: + return &issueLivelinessNotAvailable{}, nil + case TypeFailedMachineReclaim: + return &issueFailedMachineReclaim{}, nil + case TypeCrashLoop: + return &issueCrashLoop{}, nil + case TypeLastEventError: + return &issueLastEventError{}, nil + case TypeBMCWithoutMAC: + return &issueBMCWithoutMAC{}, nil + case TypeBMCWithoutIP: + return &issueBMCWithoutIP{}, nil + case TypeBMCInfoOutdated: + return &issueBMCInfoOutdated{}, nil + case TypeASNUniqueness: + return &issueASNUniqueness{}, nil + case TypeNonDistinctBMCIP: + return &issueNonDistinctBMCIP{}, nil + case TypeNoEventContainer: + return &issueNoEventContainer{}, nil + default: + return nil, fmt.Errorf("unknown issue type: %s", t) + } +} diff --git a/cmd/metal-api/internal/metal/machine.go b/cmd/metal-api/internal/metal/machine.go index 3a45ab765..0195ec646 100644 --- a/cmd/metal-api/internal/metal/machine.go +++ b/cmd/metal-api/internal/metal/machine.go @@ -304,11 +304,6 @@ const ( MachineResurrectAfter time.Duration = time.Hour ) -// Is return true if given liveliness is equal to specific Liveliness -func (l MachineLiveliness) Is(liveliness string) bool { - return string(l) == liveliness -} - // DiskCapacity calculates the capacity of all disks. 
func (hw *MachineHardware) DiskCapacity() uint64 { var c uint64 diff --git a/cmd/metal-api/internal/metal/provisioning.go b/cmd/metal-api/internal/metal/provisioning.go index 12503d7f5..618bf4c08 100644 --- a/cmd/metal-api/internal/metal/provisioning.go +++ b/cmd/metal-api/internal/metal/provisioning.go @@ -47,11 +47,6 @@ var ( // ProvisioningEvents is just a list of ProvisioningEvents type ProvisioningEvents []ProvisioningEvent -// Is return true if given event is equal to specific EventType -func (p ProvisioningEventType) Is(event string) bool { - return string(p) == event -} - // TrimEvents trim the events to maxCount func (p *ProvisioningEventContainer) TrimEvents(maxCount int) { if len(p.Events) > maxCount { diff --git a/cmd/metal-api/internal/metal/provisioning_test.go b/cmd/metal-api/internal/metal/provisioning_test.go index ac8b83ba6..1ce20cfe6 100644 --- a/cmd/metal-api/internal/metal/provisioning_test.go +++ b/cmd/metal-api/internal/metal/provisioning_test.go @@ -5,42 +5,6 @@ import ( "time" ) -func TestProvisioningEventType_Is(t *testing.T) { - tests := []struct { - name string - event string - p ProvisioningEventType - want bool - }{ - { - name: "simple", - event: "Waiting", - p: ProvisioningEventWaiting, - want: true, - }, - { - name: "simple", - event: "Waiting", - p: ProvisioningEventInstalling, - want: false, - }, - { - name: "simple", - event: "Alive", - p: ProvisioningEventAlive, - want: true, - }, - } - for i := range tests { - tt := tests[i] - t.Run(tt.name, func(t *testing.T) { - if got := tt.p.Is(tt.event); got != tt.want { - t.Errorf("ProvisioningEventType.Is() = %v, want %v", got, tt.want) - } - }) - } -} - func TestProvisioningEventContainer_Validate(t *testing.T) { now := time.Now() tests := []struct { diff --git a/cmd/metal-api/internal/service/machine-service.go b/cmd/metal-api/internal/service/machine-service.go index bf63ff9f2..54e9b4821 100644 --- a/cmd/metal-api/internal/service/machine-service.go +++ 
b/cmd/metal-api/internal/service/machine-service.go @@ -11,6 +11,7 @@ import ( "time" "github.com/metal-stack/metal-api/cmd/metal-api/internal/headscale" + "github.com/metal-stack/metal-api/cmd/metal-api/internal/issues" "github.com/metal-stack/metal-lib/auditing" "github.com/avast/retry-go/v4" @@ -243,6 +244,27 @@ func (r *machineResource) webService() *restful.WebService { Returns(http.StatusOK, "OK", v1.MachineResponse{}). DefaultReturns("Error", httperrors.HTTPErrorResponse{})) + ws.Route(ws.GET("/issues"). + To(viewer(r.listIssues)). + Operation("listIssues"). + Doc("returns the list of issues that exist in the API"). + Metadata(restfulspec.KeyOpenAPITags, tags). + Metadata(auditing.Exclude, true). + Writes([]v1.MachineIssue{}). + Returns(http.StatusOK, "OK", []v1.MachineIssue{}). + DefaultReturns("Error", httperrors.HTTPErrorResponse{})) + + ws.Route(ws.POST("/issues/evaluate"). + To(viewer(r.issues)). + Operation("issues"). + Doc("returns machine issues"). + Metadata(restfulspec.KeyOpenAPITags, tags). + // is an expensive call so we audit this as well even if it does not change anything + Reads(v1.MachineIssuesRequest{}). + Writes([]v1.MachineIssueResponse{}). + Returns(http.StatusOK, "OK", []v1.MachineIssueResponse{}). + DefaultReturns("Error", httperrors.HTTPErrorResponse{})) + ws.Route(ws.POST("/ipmi"). To(editor(r.ipmiReport)). Operation("ipmiReport"). 
@@ -483,6 +505,127 @@ func (r *machineResource) updateMachine(request *restful.Request, response *rest r.send(request, response, http.StatusOK, resp) } +func (r *machineResource) listIssues(request *restful.Request, response *restful.Response) { + issues := issues.All() + + var issueResponse []v1.MachineIssue + for _, issue := range issues { + issue := issue + + issueResponse = append(issueResponse, v1.MachineIssue{ + ID: string(issue.Type), + Severity: string(issue.Severity), + Description: issue.Description, + RefURL: issue.RefURL, + Details: issue.Details, + }) + } + + r.send(request, response, http.StatusOK, issueResponse) +} + +func (r *machineResource) issues(request *restful.Request, response *restful.Response) { + var requestPayload v1.MachineIssuesRequest + err := request.ReadEntity(&requestPayload) + if err != nil { + r.sendError(request, response, httperrors.BadRequest(err)) + return + } + + var ( + ms = metal.Machines{} + + severity = issues.SeverityMinor + only []issues.Type + omit []issues.Type + lastErrorThreshold = issues.DefaultLastErrorThreshold() + ) + + if requestPayload.Severity != "" { + severity, err = issues.SeverityFromString(requestPayload.Severity) + if err != nil { + r.sendError(request, response, httperrors.BadRequest(err)) + return + } + } + + if len(requestPayload.Omit) > 0 { + for _, o := range requestPayload.Omit { + o := o + + _, err := issues.NewIssueFromType(o) + if err != nil { + r.sendError(request, response, httperrors.BadRequest(err)) + return + } + + omit = append(omit, o) + } + } + + if len(requestPayload.Only) > 0 { + for _, o := range requestPayload.Only { + o := o + + _, err := issues.NewIssueFromType(o) + if err != nil { + r.sendError(request, response, httperrors.BadRequest(err)) + return + } + + only = append(only, o) + } + } + + if requestPayload.LastErrorThreshold > 0 { + lastErrorThreshold = requestPayload.LastErrorThreshold + } + + err = r.ds.SearchMachines(&requestPayload.MachineSearchQuery, &ms) + if err != nil 
{ + r.sendError(request, response, defaultError(err)) + return + } + + ecs, err := r.ds.ListProvisioningEventContainers() + if err != nil { + r.sendError(request, response, defaultError(err)) + return + } + + machinesWithIssues, err := issues.Find(&issues.Config{ + Machines: ms, + EventContainers: ecs, + Severity: severity, + Only: only, + Omit: omit, + LastErrorThreshold: lastErrorThreshold, + }) + if err != nil { + r.sendError(request, response, defaultError(err)) + return + } + + var issueResponse []*v1.MachineIssueResponse + for _, machineWithIssues := range machinesWithIssues.ToList() { + machineWithIssues := machineWithIssues + + entry := &v1.MachineIssueResponse{ + MachineID: machineWithIssues.Machine.ID, + } + + for _, issue := range machineWithIssues.Issues { + issue := issue + + entry.Issues = append(entry.Issues, string(issue.Type)) + } + + issueResponse = append(issueResponse, entry) + } + + r.send(request, response, http.StatusOK, issueResponse) +} + func (r *machineResource) getMachineConsolePassword(request *restful.Request, response *restful.Response) { var requestPayload v1.MachineConsolePasswordRequest err := request.ReadEntity(&requestPayload) @@ -1582,7 +1725,7 @@ func (r *machineResource) deleteMachine(request *restful.Request, response *rest r.sendError(request, response, defaultError(err)) return } - if err == nil && !ec.Liveliness.Is(string(metal.MachineLivelinessDead)) { + if err == nil && ec.Liveliness != metal.MachineLivelinessDead { r.sendError(request, response, defaultError(errors.New("can only delete dead machines"))) return } @@ -1790,7 +1933,7 @@ func evaluateMachineLiveliness(ds *datastore.RethinkStore, m metal.Machine) (met provisioningEvents, err := ds.FindProvisioningEventContainer(m.ID) if err != nil { // we have no provisioning events... 
we cannot tell - return metal.MachineLivelinessUnknown, fmt.Errorf("no provisioningEvents found for ID: %s", m.ID) + return metal.MachineLivelinessUnknown, fmt.Errorf("no provisioning event container found for machine: %s", m.ID) } old := *provisioningEvents @@ -1807,6 +1950,7 @@ func evaluateMachineLiveliness(ds *datastore.RethinkStore, m metal.Machine) (met } else { provisioningEvents.Liveliness = metal.MachineLivelinessAlive } + err = ds.UpdateProvisioningEventContainer(&old, provisioningEvents) if err != nil { return provisioningEvents.Liveliness, err @@ -1864,7 +2008,6 @@ func ResurrectMachines(ctx context.Context, ds *datastore.RethinkStore, publishe } continue } - } logger.Info("finished machine resurrection") @@ -2089,27 +2232,6 @@ func publishMachineCmd(logger *zap.SugaredLogger, m *metal.Machine, publisher bu return nil } -func machineHasIssues(m *v1.MachineResponse) bool { - if m.Partition == nil { - return true - } - if !metal.MachineLivelinessAlive.Is(m.Liveliness) { - return true - } - if m.Allocation == nil && len(m.RecentProvisioningEvents.Events) > 0 && metal.ProvisioningEventPhonedHome.Is(m.RecentProvisioningEvents.Events[0].Event) { - // not allocated, but phones home - return true - } - if m.RecentProvisioningEvents.CrashLoop || m.RecentProvisioningEvents.FailedMachineReclaim { - // Machines in crash loop but in "Waiting" state are considered available - if len(m.RecentProvisioningEvents.Events) > 0 && !metal.ProvisioningEventWaiting.Is(m.RecentProvisioningEvents.Events[0].Event) { - return true - } - } - - return false -} - func makeMachineResponse(m *metal.Machine, ds *datastore.RethinkStore) (*v1.MachineResponse, error) { s, p, i, ec, err := findMachineReferencedEntities(m, ds) if err != nil { diff --git a/cmd/metal-api/internal/service/partition-service.go b/cmd/metal-api/internal/service/partition-service.go index 4075a119e..ee1a90173 100644 --- a/cmd/metal-api/internal/service/partition-service.go +++ 
b/cmd/metal-api/internal/service/partition-service.go @@ -2,11 +2,14 @@ package service import ( "errors" + "fmt" "net/http" "github.com/metal-stack/metal-api/cmd/metal-api/internal/datastore" + "github.com/metal-stack/metal-api/cmd/metal-api/internal/issues" "github.com/metal-stack/metal-api/cmd/metal-api/internal/metal" "github.com/metal-stack/metal-lib/auditing" + "github.com/metal-stack/metal-lib/pkg/pointer" "go.uber.org/zap" v1 "github.com/metal-stack/metal-api/cmd/metal-api/internal/service/v1" @@ -97,18 +100,6 @@ func (r *partitionResource) webService() *restful.WebService { Returns(http.StatusConflict, "Conflict", httperrors.HTTPErrorResponse{}). DefaultReturns("Error", httperrors.HTTPErrorResponse{})) - // Deprecated, can be removed in the future - ws.Route(ws.GET("/capacity"). - To(r.partitionCapacityCompat). - Operation("partitionCapacityCompat"). - Doc("get partition capacity"). - Metadata(restfulspec.KeyOpenAPITags, tags). - Metadata(auditing.Exclude, true). - Writes([]v1.PartitionCapacity{}). - Returns(http.StatusOK, "OK", []v1.PartitionCapacity{}). - DefaultReturns("Error", httperrors.HTTPErrorResponse{}). - Deprecate()) - ws.Route(ws.POST("/capacity"). To(r.partitionCapacity). Operation("partitionCapacity"). 
@@ -314,16 +305,6 @@ func (r *partitionResource) updatePartition(request *restful.Request, response * r.send(request, response, http.StatusOK, v1.NewPartitionResponse(&newPartition)) } -func (r *partitionResource) partitionCapacityCompat(request *restful.Request, response *restful.Response) { - partitionCapacities, err := r.calcPartitionCapacity(nil) - if err != nil { - r.sendError(request, response, httperrors.BadRequest(err)) - return - } - - r.send(request, response, http.StatusOK, partitionCapacities) -} - func (r *partitionResource) partitionCapacity(request *restful.Request, response *restful.Response) { var requestPayload v1.PartitionCapacityRequest err := request.ReadEntity(&requestPayload) @@ -342,15 +323,13 @@ func (r *partitionResource) partitionCapacity(request *restful.Request, response } func (r *partitionResource) calcPartitionCapacity(pcr *v1.PartitionCapacityRequest) ([]v1.PartitionCapacity, error) { - // FIXME bad workaround to be able to run make spec - if r.ds == nil { - return nil, nil - } - var ( - ps metal.Partitions - ms metal.Machines - err error + ps metal.Partitions + ms metal.Machines + + pcs = map[string]*v1.PartitionCapacity{} + + machineQuery = datastore.MachineSearchQuery{} ) if pcr != nil && pcr.ID != nil { @@ -359,96 +338,112 @@ func (r *partitionResource) calcPartitionCapacity(pcr *v1.PartitionCapacityReque return nil, err } ps = metal.Partitions{*p} + + machineQuery.PartitionID = pcr.ID } else { + var err error ps, err = r.ds.ListPartitions() if err != nil { return nil, err } } - msq := datastore.MachineSearchQuery{} if pcr != nil && pcr.Size != nil { - msq.SizeID = pcr.Size + machineQuery.SizeID = pcr.Size } - err = r.ds.SearchMachines(&msq, &ms) + err := r.ds.SearchMachines(&machineQuery, &ms) if err != nil { return nil, err } - machines, err := makeMachineResponseList(ms, r.ds) + + ecs, err := r.ds.ListProvisioningEventContainers() if err != nil { - return nil, err + return nil, fmt.Errorf("unable to fetch provisioning event 
containers: %w", err) } - partitionCapacities := []v1.PartitionCapacity{} - for _, p := range ps { - p := p - capacities := make(map[string]*v1.ServerCapacity) - for _, m := range machines { - m := m - if m.Partition == nil { - continue - } - if m.Partition.ID != p.ID { - continue - } + machinesWithIssues, err := issues.Find(&issues.Config{ + Machines: ms, + EventContainers: ecs, + Only: issues.NotAllocatableIssueTypes(), + }) + if err != nil { + return nil, fmt.Errorf("unable to calculate machine issues: %w", err) + } - size := metal.UnknownSize.ID - if m.Size != nil { - size = m.Size.ID - } + partitionsByID := ps.ByID() + ecsByID := ecs.ByID() - available := false - if m.State.Value == string(metal.AvailableState) && len(m.RecentProvisioningEvents.Events) > 0 { - events := m.RecentProvisioningEvents.Events - if metal.ProvisioningEventWaiting.Is(events[0].Event) && metal.ProvisioningEventAlive.Is(m.Liveliness) { - available = true - } - } + for _, m := range ms { + m := m - cap, ok := capacities[size] - if !ok { - cap = &v1.ServerCapacity{Size: size} - capacities[size] = cap - } + ec, ok := ecsByID[m.ID] + if !ok { + continue + } + + p, ok := partitionsByID[m.PartitionID] + if !ok { + continue + } - if m.Allocation != nil { - cap.Allocated++ - } else if machineHasIssues(m) { - cap.Faulty++ - cap.FaultyMachines = append(cap.FaultyMachines, m.ID) - } else if available { - cap.Free++ - } else { - cap.Other++ - cap.OtherMachines = append(cap.OtherMachines, m.ID) + pc, ok := pcs[m.PartitionID] + if !ok { + pc = &v1.PartitionCapacity{ + Common: v1.Common{ + Identifiable: v1.Identifiable{ + ID: p.ID, + }, + Describable: v1.Describable{ + Name: &p.Name, + Description: &p.Description, + }, + }, + ServerCapacities: v1.ServerCapacities{}, } + } + pcs[m.PartitionID] = pc - cap.Total++ + size := metal.UnknownSize.ID + if m.SizeID != "" { + size = m.SizeID } - sc := []v1.ServerCapacity{} - for i := range capacities { - if capacities[i] == nil { - continue + + cap := 
pc.ServerCapacities.FindBySize(size) + if cap == nil { + cap = &v1.ServerCapacity{ + Size: size, } - sc = append(sc, *capacities[i]) + pc.ServerCapacities = append(pc.ServerCapacities, cap) } - pc := v1.PartitionCapacity{ - Common: v1.Common{ - Identifiable: v1.Identifiable{ - ID: p.ID, - }, - Describable: v1.Describable{ - Name: &p.Name, - Description: &p.Description, - }, - }, - ServerCapacities: sc, + cap.Total++ + + if m.Allocation != nil { + cap.Allocated++ + continue + } + + if _, ok := machinesWithIssues[m.ID]; ok { + cap.Faulty++ + cap.FaultyMachines = append(cap.FaultyMachines, m.ID) + continue + } + + if m.State.Value == metal.AvailableState && metal.ProvisioningEventWaiting == pointer.FirstOrZero(ec.Events).Event { + cap.Free++ + continue } - partitionCapacities = append(partitionCapacities, pc) + cap.Other++ + cap.OtherMachines = append(cap.OtherMachines, m.ID) + } + + res := []v1.PartitionCapacity{} + for _, pc := range pcs { + pc := pc + res = append(res, *pc) } - return partitionCapacities, err + return res, nil } diff --git a/cmd/metal-api/internal/service/partition-service_test.go b/cmd/metal-api/internal/service/partition-service_test.go index 503c5dce7..0606015ab 100644 --- a/cmd/metal-api/internal/service/partition-service_test.go +++ b/cmd/metal-api/internal/service/partition-service_test.go @@ -10,9 +10,11 @@ import ( "github.com/stretchr/testify/assert" "go.uber.org/zap/zaptest" + r "gopkg.in/rethinkdb/rethinkdb-go.v6" restful "github.com/emicklei/go-restful/v3" "github.com/metal-stack/metal-api/cmd/metal-api/internal/datastore" + "github.com/metal-stack/metal-api/cmd/metal-api/internal/metal" v1 "github.com/metal-stack/metal-api/cmd/metal-api/internal/service/v1" "github.com/metal-stack/metal-api/cmd/metal-api/internal/testdata" "github.com/metal-stack/metal-lib/httperrors" @@ -244,13 +246,28 @@ func TestUpdatePartition(t *testing.T) { func TestPartitionCapacity(t *testing.T) { ds, mock := datastore.InitMockDB(t) + + ecs := 
[]metal.ProvisioningEventContainer{} + for _, m := range testdata.TestMachines { + m := m + ecs = append(ecs, metal.ProvisioningEventContainer{ + Base: m.Base, + }) + } + mock.On(r.DB("mockdb").Table("event")).Return(ecs, nil) + testdata.InitMockDBData(mock) log := zaptest.NewLogger(t).Sugar() service := NewPartition(log, ds, &nopTopicCreater{}) container := restful.NewContainer().Add(service) - req := httptest.NewRequest("GET", "/v1/partition/capacity", nil) + pcRequest := &v1.PartitionCapacityRequest{} + js, err := json.Marshal(pcRequest) + require.NoError(t, err) + body := bytes.NewBuffer(js) + + req := httptest.NewRequest("POST", "/v1/partition/capacity", body) req.Header.Add("Content-Type", "application/json") container = injectAdmin(log, container, req) w := httptest.NewRecorder() @@ -260,12 +277,13 @@ func TestPartitionCapacity(t *testing.T) { defer resp.Body.Close() require.Equal(t, http.StatusOK, resp.StatusCode, w.Body.String()) var result []v1.PartitionCapacity - err := json.NewDecoder(resp.Body).Decode(&result) + err = json.NewDecoder(resp.Body).Decode(&result) require.NoError(t, err) + require.Len(t, result, 1) require.Equal(t, testdata.Partition1.ID, result[0].ID) require.NotNil(t, result[0].ServerCapacities) - require.Equal(t, 1, len(result[0].ServerCapacities)) + require.Len(t, result[0].ServerCapacities, 1) c := result[0].ServerCapacities[0] require.Equal(t, "1", c.Size) require.Equal(t, 5, c.Total) diff --git a/cmd/metal-api/internal/service/v1/machine.go b/cmd/metal-api/internal/service/v1/machine.go index 3a83c1801..d1cfbc9d4 100644 --- a/cmd/metal-api/internal/service/v1/machine.go +++ b/cmd/metal-api/internal/service/v1/machine.go @@ -4,6 +4,7 @@ import ( "time" "github.com/metal-stack/metal-api/cmd/metal-api/internal/datastore" + "github.com/metal-stack/metal-api/cmd/metal-api/internal/issues" "github.com/metal-stack/metal-api/cmd/metal-api/internal/metal" ) @@ -227,6 +228,7 @@ type MachineConsolePasswordRequest struct { ID string `json:"id" 
// MachineIssuesRequest is the request payload for evaluating machine issues.
// The embedded machine search query selects the machines to evaluate.
type MachineIssuesRequest struct {
	datastore.MachineSearchQuery

	// Only lists the issue types to include.
	Only []issues.Type `json:"only" description:"a list of machine issues to include"`
	// Omit lists the issue types to leave out.
	Omit []issues.Type `json:"omit" description:"a list of machine issues to omit"`

	// Severity filters the issues by severity. The issues handler of the machine
	// service falls back to the minor severity when this field is empty.
	Severity string `json:"severity" description:"filters issue for given severity"`
	// LastErrorThreshold is a time.Duration, which encoding/json transports as an
	// integer number of nanoseconds. The issues handler of the machine service
	// only applies values greater than zero and otherwise uses the default
	// threshold of the issues package.
	LastErrorThreshold time.Duration `json:"last_error_threshold" description:"defines the last error threshold"`
}
r.Nics { diff --git a/cmd/metal-api/internal/service/v1/partition.go b/cmd/metal-api/internal/service/v1/partition.go index 49f1bbff5..522075f31 100644 --- a/cmd/metal-api/internal/service/v1/partition.go +++ b/cmd/metal-api/internal/service/v1/partition.go @@ -39,9 +39,11 @@ type PartitionCapacityRequest struct { Size *string `json:"sizeid" description:"the size to filter for" optional:"true"` } +type ServerCapacities []*ServerCapacity + type PartitionCapacity struct { Common - ServerCapacities []ServerCapacity `json:"servers" description:"servers available in this partition"` + ServerCapacities ServerCapacities `json:"servers" description:"servers available in this partition"` } type ServerCapacity struct { @@ -85,3 +87,14 @@ func NewPartitionResponse(p *metal.Partition) *PartitionResponse { }, } } + +func (s ServerCapacities) FindBySize(size string) *ServerCapacity { + for _, sc := range s { + sc := sc + if sc.Size == size { + return sc + } + } + + return nil +} diff --git a/go.mod b/go.mod index 5b57d8dbe..ac20d0105 100644 --- a/go.mod +++ b/go.mod @@ -18,7 +18,7 @@ require ( github.com/looplab/fsm v0.3.0 github.com/metal-stack/go-ipam v1.8.5 github.com/metal-stack/masterdata-api v0.10.0 - github.com/metal-stack/metal-lib v0.13.2 + github.com/metal-stack/metal-lib v0.13.5 github.com/metal-stack/security v0.6.7 github.com/metal-stack/v v1.0.3 github.com/nsqio/go-nsq v1.1.0 @@ -45,6 +45,7 @@ replace ( ) require ( + connectrpc.com/connect v1.11.1 // indirect dario.cat/mergo v1.0.0 // indirect filippo.io/edwards25519 v1.0.0 // indirect github.com/Azure/go-ansiterm v0.0.0-20230124172434-306776ec8161 // indirect @@ -56,7 +57,6 @@ require ( github.com/avast/retry-go v3.0.0+incompatible // indirect github.com/benbjohnson/clock v1.3.5 // indirect github.com/beorn7/perks v1.0.1 // indirect - github.com/bufbuild/connect-go v1.10.0 // indirect github.com/cenkalti/backoff/v4 v4.2.1 // indirect github.com/cespare/xxhash/v2 v2.2.0 // indirect github.com/containerd/containerd 
v1.7.3 // indirect diff --git a/go.sum b/go.sum index d62756884..548b9416c 100644 --- a/go.sum +++ b/go.sum @@ -36,6 +36,8 @@ cloud.google.com/go/storage v1.6.0/go.mod h1:N7U0C8pVQ/+NIKOBQyamJIeKQKkZ+mxpohl cloud.google.com/go/storage v1.8.0/go.mod h1:Wv1Oy7z6Yz3DshWRJFhqM/UCfaWIRTdp0RXyy7KQOVs= cloud.google.com/go/storage v1.10.0/go.mod h1:FLPqc6j+Ki4BU591ie1oL6qBQGu2Bl/tZ9ullr3+Kg0= cloud.google.com/go/storage v1.14.0/go.mod h1:GrKmX003DSIwi9o29oFT7YDnHYwZoctc3fOKtUw0Xmo= +connectrpc.com/connect v1.11.1 h1:dqRwblixqkVh+OFBOOL1yIf1jS/yP0MSJLijRj29bFg= +connectrpc.com/connect v1.11.1/go.mod h1:3AGaO6RRGMx5IKFfqbe3hvK1NqLosFNP2BxDYTPmNPo= dario.cat/mergo v1.0.0 h1:AGCNq9Evsj31mOgNPcLyXc+4PNABt905YmuqPYYpBWk= dario.cat/mergo v1.0.0/go.mod h1:uNxQE+84aUszobStD9th8a29P2fMDhsBdgRYvZOxGmk= filippo.io/edwards25519 v1.0.0 h1:0wAIcmJUqRdI8IJ/3eGi5/HwXZWPujYXXlkrQogz0Ek= @@ -124,8 +126,6 @@ github.com/bmizerany/assert v0.0.0-20160611221934-b7ed37b82869/go.mod h1:Ekp36dR github.com/bmizerany/perks v0.0.0-20230307044200-03f9df79da1e h1:mWOqoK5jV13ChKf/aF3plwQ96laasTJgZi4f1aSOu+M= github.com/bmizerany/perks v0.0.0-20230307044200-03f9df79da1e/go.mod h1:ac9efd0D1fsDb3EJvhqgXRbFx7bs2wqZ10HQPeU8U/Q= github.com/bshuster-repo/logrus-logstash-hook v0.4.1/go.mod h1:zsTqEiSzDgAa/8GZR7E1qaXrhYNDKBYy5/dWPTIflbk= -github.com/bufbuild/connect-go v1.10.0 h1:QAJ3G9A1OYQW2Jbk3DeoJbkCxuKArrvZgDt47mjdTbg= -github.com/bufbuild/connect-go v1.10.0/go.mod h1:CAIePUgkDR5pAFaylSMtNK45ANQjp9JvpluG20rhpV8= github.com/buger/jsonparser v0.0.0-20180808090653-f4dd9f5a6b44/go.mod h1:bbYlZJ7hK1yFx9hf58LP0zeX7UjIGs20ufpu3evjr+s= github.com/bugsnag/bugsnag-go v0.0.0-20141110184014-b1d153021fcd/go.mod h1:2oa8nejYd4cQ/b0hMIopN0lCRxU0bueqREvZLWFrtK8= github.com/bugsnag/osext v0.0.0-20130617224835-0dd3f918b21b/go.mod h1:obH5gd0BsqsP2LwDJ9aOkm/6J86V6lyAXCoQWGw3K50= @@ -637,8 +637,8 @@ github.com/metal-stack/go-ipam v1.8.5 h1:XE1XfaU6Ck1Ucc7svTO25dlT7kEcE1oxOM3lBrW github.com/metal-stack/go-ipam v1.8.5/go.mod 
h1:JgsddJabu8A7lWD+4MJKqbQhmSA/zhBbO+Bp8pLhRZM= github.com/metal-stack/masterdata-api v0.10.0 h1:xcB8kd1FK5etmRbcTlAPk2bQXY6i9tTTvAeDsTjmh6E= github.com/metal-stack/masterdata-api v0.10.0/go.mod h1:xwaMDC9hhNjXep3ppD+Iqeg2OEFM6hwq2zFyBDnlXGc= -github.com/metal-stack/metal-lib v0.13.2 h1:gpsnKUxahT4r3N55QY1MTRBZy10CySTQCQKb9XFCDrA= -github.com/metal-stack/metal-lib v0.13.2/go.mod h1:l18VEuS1YkxnVE35iF8AMP6QRxoYjRZ9e2NE3aGxVY0= +github.com/metal-stack/metal-lib v0.13.5 h1:OX94H+Pw31MOE9xSr460kFBv6CNJ2Nhjf4GY5IcuCxM= +github.com/metal-stack/metal-lib v0.13.5/go.mod h1:BAR7fjdoV7DDg8i9GpJQBDaNSFirOcBs0vLYTBnhHQU= github.com/metal-stack/security v0.6.7 h1:8wstGy0pdUmphVclAlT+9RKQmx9lF+cIGklJZAB5cIc= github.com/metal-stack/security v0.6.7/go.mod h1:dXyrQ8PYZuUiodWFQ/NwSROxu6tajwRBc5yR/PoK5uE= github.com/metal-stack/v v1.0.3 h1:Sh2oBlnxrCUD+mVpzfC8HiqL045YWkxs0gpTvkjppqs= diff --git a/spec/metal-api.json b/spec/metal-api.json index dc97a467e..624aec38a 100644 --- a/spec/metal-api.json +++ b/spec/metal-api.json @@ -2736,6 +2736,267 @@ } } }, + "v1.MachineIssue": { + "properties": { + "description": { + "description": "a description of the issue", + "type": "string" + }, + "details": { + "description": "details of the issue", + "type": "string" + }, + "id": { + "description": "the id of the issue", + "type": "string" + }, + "ref_url": { + "description": "an issue reference to the issue in metal-stack docs", + "type": "string" + }, + "severity": { + "description": "the severity of the issue", + "type": "string" + } + }, + "required": [ + "description", + "details", + "id", + "ref_url", + "severity" + ] + }, + "v1.MachineIssueResponse": { + "properties": { + "issues": { + "description": "the list of issues (only issue ids) of this machine", + "items": { + "type": "string" + }, + "type": "array" + }, + "machineid": { + "description": "the machine id that has the given issues", + "type": "string" + } + }, + "required": [ + "issues", + "machineid" + ] + }, + 
"v1.MachineIssuesRequest": { + "properties": { + "allocation_hostname": { + "type": "string" + }, + "allocation_image_id": { + "type": "string" + }, + "allocation_name": { + "type": "string" + }, + "allocation_project": { + "type": "string" + }, + "allocation_role": { + "type": "string" + }, + "allocation_succeeded": { + "type": "boolean" + }, + "disk_names": { + "items": { + "type": "string" + }, + "type": "array" + }, + "disk_sizes": { + "items": { + "format": "int64", + "type": "integer" + }, + "type": "array" + }, + "fru_board_mfg": { + "type": "string" + }, + "fru_board_mfg_serial": { + "type": "string" + }, + "fru_board_part_number": { + "type": "string" + }, + "fru_chassis_part_number": { + "type": "string" + }, + "fru_chassis_part_serial": { + "type": "string" + }, + "fru_product_manufacturer": { + "type": "string" + }, + "fru_product_part_number": { + "type": "string" + }, + "fru_product_serial": { + "type": "string" + }, + "hardware_cpu_cores": { + "format": "int64", + "type": "integer" + }, + "hardware_memory": { + "format": "int64", + "type": "integer" + }, + "id": { + "type": "string" + }, + "ipmi_address": { + "type": "string" + }, + "ipmi_interface": { + "type": "string" + }, + "ipmi_mac_address": { + "type": "string" + }, + "ipmi_user": { + "type": "string" + }, + "last_error_threshold": { + "description": "defines the last error threshold", + "format": "int64", + "type": "integer" + }, + "name": { + "type": "string" + }, + "network_asns": { + "items": { + "format": "int64", + "type": "integer" + }, + "type": "array" + }, + "network_destination_prefixes": { + "items": { + "type": "string" + }, + "type": "array" + }, + "network_ids": { + "items": { + "type": "string" + }, + "type": "array" + }, + "network_ips": { + "items": { + "type": "string" + }, + "type": "array" + }, + "network_prefixes": { + "items": { + "type": "string" + }, + "type": "array" + }, + "network_vrfs": { + "items": { + "format": "int64", + "type": "integer" + }, + "type": "array" 
+ }, + "nics_mac_addresses": { + "items": { + "type": "string" + }, + "type": "array" + }, + "nics_names": { + "items": { + "type": "string" + }, + "type": "array" + }, + "nics_neighbor_mac_addresses": { + "items": { + "type": "string" + }, + "type": "array" + }, + "nics_neighbor_names": { + "items": { + "type": "string" + }, + "type": "array" + }, + "nics_neighbor_vrfs": { + "items": { + "type": "string" + }, + "type": "array" + }, + "nics_vrfs": { + "items": { + "type": "string" + }, + "type": "array" + }, + "omit": { + "description": "a list of machine issues to omit", + "items": { + "type": "string" + }, + "type": "array" + }, + "only": { + "description": "a list of machine issues to include", + "items": { + "type": "string" + }, + "type": "array" + }, + "partition_id": { + "type": "string" + }, + "rackid": { + "type": "string" + }, + "severity": { + "description": "filters issue for given severity", + "type": "string" + }, + "sizeid": { + "type": "string" + }, + "state_value": { + "enum": [ + "", + "LOCKED", + "RESERVED" + ], + "type": "string" + }, + "tags": { + "items": { + "type": "string" + }, + "type": "array" + } + }, + "required": [ + "last_error_threshold", + "omit", + "only", + "severity" + ] + }, "v1.MachineNetwork": { "description": "prefixes that are reachable within this network", "properties": { @@ -6507,6 +6768,80 @@ ] } }, + "/v1/machine/issues": { + "get": { + "consumes": [ + "application/json" + ], + "operationId": "listIssues", + "produces": [ + "application/json" + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "items": { + "$ref": "#/definitions/v1.MachineIssue" + }, + "type": "array" + } + }, + "default": { + "description": "Error", + "schema": { + "$ref": "#/definitions/httperrors.HTTPErrorResponse" + } + } + }, + "summary": "returns the list of issues that exist in the API", + "tags": [ + "machine" + ] + } + }, + "/v1/machine/issues/evaluate": { + "post": { + "consumes": [ + "application/json" + ], + 
"operationId": "issues", + "parameters": [ + { + "in": "body", + "name": "body", + "required": true, + "schema": { + "$ref": "#/definitions/v1.MachineIssuesRequest" + } + } + ], + "produces": [ + "application/json" + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "items": { + "$ref": "#/definitions/v1.MachineIssueResponse" + }, + "type": "array" + } + }, + "default": { + "description": "Error", + "schema": { + "$ref": "#/definitions/httperrors.HTTPErrorResponse" + } + } + }, + "summary": "returns machine issues", + "tags": [ + "machine" + ] + } + }, "/v1/machine/update-firmware/{id}": { "post": { "consumes": [ @@ -7713,37 +8048,6 @@ } }, "/v1/partition/capacity": { - "get": { - "consumes": [ - "application/json" - ], - "deprecated": true, - "operationId": "partitionCapacityCompat", - "produces": [ - "application/json" - ], - "responses": { - "200": { - "description": "OK", - "schema": { - "items": { - "$ref": "#/definitions/v1.PartitionCapacity" - }, - "type": "array" - } - }, - "default": { - "description": "Error", - "schema": { - "$ref": "#/definitions/httperrors.HTTPErrorResponse" - } - } - }, - "summary": "get partition capacity", - "tags": [ - "Partition" - ] - }, "post": { "consumes": [ "application/json"