Skip to content

Commit

Permalink
Allow multiple issues to hold a site/machine in maintenance mode at t…
Browse files Browse the repository at this point in the history
…he same time (#19)

* Allows multiple issues to create maintenance on a site/machine at the same time and only closes maintenance when all issues have released it.

* Fixes a bug where removeIssue() was removing an issue for all sites/machine regardless of what was actually flagged in the issue.

* Fixes a logging issue, and makes log messages clearer.

* Fixes string-in-slice search logic by creating a function to do it, adds a new unit test to be sure maintenance is closed once all issues are gone.
  • Loading branch information
nkinkade authored Jun 3, 2019
1 parent e6858db commit 4299352
Show file tree
Hide file tree
Showing 2 changed files with 105 additions and 44 deletions.
91 changes: 61 additions & 30 deletions gmx.go
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,6 @@ var (
},
[]string{
"machine",
"issue",
},
)
// Prometheus metric for exposing site maintenance status.
Expand All @@ -82,19 +81,18 @@ var (
},
[]string{
"site",
"issue",
},
)

state = maintenanceState{
Machines: make(map[string]string),
Sites: make(map[string]string),
Machines: make(map[string][]string),
Sites: make(map[string][]string),
}
)

// maintenanceState is a struct for storing both machine and site maintenance states.
type maintenanceState struct {
Machines, Sites map[string]string
Machines, Sites map[string][]string
}

// writeState serializes the content of a maintenanceState object into JSON and
Expand Down Expand Up @@ -124,53 +122,88 @@ func restoreState(r io.Reader, s *maintenanceState) error {
data, err := ioutil.ReadAll(r)
if err != nil {
log.Printf("ERROR: Failed to read state data from %s: %s", fStateFilePath, err)
metricError.WithLabelValues("readfile", "restoreState").Add(1)
metricError.WithLabelValues("readfile", "restoreState").Inc()
return err
}

err = json.Unmarshal(data, &s)
if err != nil {
log.Printf("ERROR: Failed to unmarshal JSON: %s", err)
metricError.WithLabelValues("unmarshaljson", "restoreState").Add(1)
metricError.WithLabelValues("unmarshaljson", "restoreState").Inc()
return err
}

// Restore machine maintenance state.
for machine, issue := range s.Machines {
metricMachine.WithLabelValues(machine, issue).Set(cEnterMaintenance)
for machine := range s.Machines {
metricMachine.WithLabelValues(machine).Set(cEnterMaintenance)
}

// Restore site maintenance state.
for site, issue := range state.Sites {
metricSite.WithLabelValues(site, issue).Set(cEnterMaintenance)
for site := range state.Sites {
metricSite.WithLabelValues(site).Set(cEnterMaintenance)
}

log.Printf("INFO: Successfully restored %s from disk.", fStateFilePath)
return nil
}

// Looks for a string a slice.
func stringInSlice(s string, list []string) int {
for i, v := range list {
if v == s {
return i
}
}
return -1
}

// Removes a single issue from a site/machine. If the issue was the last one
// associated with the site/machine, it will also remove the site/machine
// from maintenance.
func removeIssue(stateMap map[string][]string, mapKey string, metricState *prometheus.GaugeVec,
issueNumber string) int {
mux.Lock()
defer mux.Unlock()

var mods = 0
mapElement := stateMap[mapKey]

issueIndex := stringInSlice(issueNumber, mapElement)
if issueIndex >= 0 {
mapElement[issueIndex] = mapElement[len(mapElement)-1]
mapElement = mapElement[:len(mapElement)-1]
if len(mapElement) == 0 {
delete(stateMap, mapKey)
metricState.WithLabelValues(mapKey).Set(0)
} else {
stateMap[mapKey] = mapElement
}
log.Printf("INFO: %s was removed from maintenance for issue #%s", mapKey, issueNumber)
mods++
}
return mods
}

// closeIssue removes any machines and sites from maintenance mode when the
// issue that added them to maintenance mode is closed. The return value is the
// number of modifications that were made to the machine and site maintenance
// state.
func closeIssue(issueNumber string, s *maintenanceState) int {
var mods = 0
// Remove any machines from maintenance that were set by this issue.
for machine, issue := range s.Machines {
if issue == issueNumber {
delete(s.Machines, machine)
metricMachine.WithLabelValues(machine, issueNumber).Set(0)
log.Printf("INFO: Machine %s was removed from maintenance because issue was closed.", machine)
for machine, issues := range s.Machines {
issueIndex := stringInSlice(issueNumber, issues)
if issueIndex >= 0 {
removeIssue(s.Machines, machine, metricMachine, issueNumber)
mods++
}
}

// Remove any sites from maintenance that were set by this issue.
for site, issue := range s.Sites {
if issue == issueNumber {
delete(s.Sites, site)
metricSite.WithLabelValues(site, issueNumber).Set(0)
log.Printf("INFO: Site %s was removed from maintenance because issue was closed.", site)
for site, issues := range s.Sites {
issueIndex := stringInSlice(issueNumber, issues)
if issueIndex >= 0 {
removeIssue(s.Sites, site, metricSite, issueNumber)
mods++
}
}
Expand All @@ -180,20 +213,18 @@ func closeIssue(issueNumber string, s *maintenanceState) int {

// updateState modifies the maintenance state of a machine or site in the
// in-memory map as well as updating the Prometheus metric.
func updateState(stateMap map[string]string, mapKey string, metricState *prometheus.GaugeVec,
func updateState(stateMap map[string][]string, mapKey string, metricState *prometheus.GaugeVec,
issueNumber string, action float64) {
mux.Lock()
defer mux.Unlock()

switch action {
case cLeaveMaintenance:
delete(stateMap, mapKey)
metricState.WithLabelValues(mapKey, issueNumber).Set(action)
log.Printf("INFO: Machine %s was removed from maintenance.", mapKey)
removeIssue(stateMap, mapKey, metricState, issueNumber)
case cEnterMaintenance:
stateMap[mapKey] = issueNumber
metricState.WithLabelValues(mapKey, issueNumber).Set(action)
log.Printf("INFO: %s was added to maintenance.", mapKey)
mux.Lock()
stateMap[mapKey] = append(stateMap[mapKey], issueNumber)
metricState.WithLabelValues(mapKey).Set(action)
log.Printf("INFO: %s was added to maintenance for issue #%s", mapKey, issueNumber)
mux.Unlock()
default:
log.Printf("WARNING: Unknown action type: %f", action)
}
Expand Down
58 changes: 44 additions & 14 deletions gmx_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,15 +15,15 @@ import (
var savedState = `
{
"Machines": {
"mlab1.abc01.measurement-lab.org": "1",
"mlab2.xyz01.measurement-lab.org": "2",
"mlab3.def01.measurement-lab.org": "8"
"mlab1.abc01.measurement-lab.org": ["1"],
"mlab2.xyz01.measurement-lab.org": ["2"],
"mlab3.def01.measurement-lab.org": ["8"]
},
"Sites": {
"abc02": "8",
"def02": "8",
"uvw03": "4",
"xyz03": "5"
"abc02": ["8"],
"def02": ["8"],
"uvw03": ["4", "11"],
"xyz03": ["5"]
}
}
Expand Down Expand Up @@ -229,16 +229,46 @@ func TestReceiveHook(t *testing.T) {
}

func TestCloseIssue(t *testing.T) {
expectedMods := 3

r := strings.NewReader(savedState)
var s maintenanceState
restoreState(r, &s)
tests := []struct {
name string
issue string
expectedMods int
closedMaintenance int
}{
{
name: "one-issue-per-entity-closes-maintenance",
issue: "8",
expectedMods: 3,
closedMaintenance: 3,
},
{
name: "multiple-issues-per-entity-does-not-close-maintenance",
issue: "4",
expectedMods: 1,
closedMaintenance: 0,
},
}

for _, test := range tests {
r := strings.NewReader(savedState)
var s maintenanceState
restoreState(r, &s)

mods := closeIssue("8", &s)
totalEntitiesBefore := len(s.Machines) + len(s.Sites)
mods := closeIssue(test.issue, &s)
totalEntitiesAfter := len(s.Machines) + len(s.Sites)
closedMaintenance := totalEntitiesBefore - totalEntitiesAfter

if mods != expectedMods {
t.Errorf("closeIssue(): Expected %d state modifications; got %d", expectedMods, mods)
if mods != test.expectedMods {
t.Errorf("closeIssue(): Expected %d state modifications; got %d",
test.expectedMods, mods)
}

if closedMaintenance != test.closedMaintenance {
t.Errorf("closeIssue(): Expected %d closed maintenances; got %d",
test.closedMaintenance, closedMaintenance)
}
}
}

Expand Down

0 comments on commit 4299352

Please sign in to comment.