From b17fb66cd5d566331feb0c7ee2737e419b13335d Mon Sep 17 00:00:00 2001 From: nkinkade Date: Wed, 18 Sep 2019 10:38:33 -0600 Subject: [PATCH] Add --project flag for filtering + adds `node` label to all machine metrics (#23) * Adds a new flag --project, and uses the value of that to filter machines/sites on a per-project basis so we don't pollute mlab-oti's GMX instance with staging and sandbox machines, and vice versa. * Adds a 'node' label to all machine metrics. * Don't enter maintenance more than once for a particular issue. Uses maintenance constants instead of fixed integers. --- gmx.go | 56 +++++++++++++++++++++++++++++++++++++++++------------ gmx_test.go | 41 ++++++++++++++++++++++++++++++++++++--- 2 files changed, 82 insertions(+), 15 deletions(-) diff --git a/gmx.go b/gmx.go index 4d8e145..4720855 100644 --- a/gmx.go +++ b/gmx.go @@ -30,6 +30,7 @@ import ( "os" "regexp" "strconv" + "strings" "sync" "github.com/google/go-github/github" @@ -44,13 +45,23 @@ var ( fListenAddress string // Interface and port to listen on. fStateFilePath string // Filesystem path to write the maintenance state file. fGitHubSecretPath string // Filesystem path to file which contains the shared Github secret. + fProject string // GCP project where this instance is running. githubSecret []byte // The symetric secret used to validate that the webhook actually came from Github. 
mux sync.Mutex - machineRegExp = regexp.MustCompile(`\/machine (mlab[1-4]{1}\.[a-z]{3}[0-9c]{2})\s?(del)?`) - siteRegExp = regexp.MustCompile(`\/site ([a-z]{3}[0-9c]{2})\s?(del)?`) + machineRegExps = map[string]*regexp.Regexp{ + "mlab-sandbox": regexp.MustCompile(`\/machine\s+(mlab[1-4]\.[a-z]{3}[0-9]t)\s+(del)?`), + "mlab-staging": regexp.MustCompile(`\/machine\s+(mlab[4]\.[a-z]{3}[0-9c]{2})\s+(del)?`), + "mlab-oti": regexp.MustCompile(`\/machine\s+(mlab[1-3]\.[a-z]{3}[0-9c]{2})\s+(del)?`), + } + + siteRegExps = map[string]*regexp.Regexp{ + "mlab-sandbox": regexp.MustCompile(`\/site\s+([a-z]{3}[0-9]t)\s+(del)?`), + "mlab-staging": regexp.MustCompile(`\/site\s+([a-z]{3}[0-9c]{2})\s+(del)?`), + "mlab-oti": regexp.MustCompile(`\/site\s+([a-z]{3}[0-9c]{2})\s+(del)?`), + } // Prometheus metric for exposing any errors that the exporter encounters. metricError = prometheus.NewCounterVec( @@ -71,6 +82,7 @@ var ( }, []string{ "machine", + "node", }, ) // Prometheus metric for exposing site maintenance status. @@ -135,7 +147,7 @@ func restoreState(r io.Reader, s *maintenanceState) error { // Restore machine maintenance state. for machine := range s.Machines { - metricMachine.WithLabelValues(machine).Set(cEnterMaintenance) + metricMachine.WithLabelValues(machine, machine).Set(cEnterMaintenance) } // Restore site maintenance state. @@ -174,7 +186,13 @@ func removeIssue(stateMap map[string][]string, mapKey string, metricState *prome mapElement = mapElement[:len(mapElement)-1] if len(mapElement) == 0 { delete(stateMap, mapKey) - metricState.WithLabelValues(mapKey).Set(0) + // If this is a machine state, then we need to pass mapKey twice, once for the + // "machine" label and once for the "node" label. 
+ if strings.HasPrefix(mapKey, "mlab") { + metricState.WithLabelValues(mapKey, mapKey).Set(0) + } else { + metricState.WithLabelValues(mapKey).Set(0) + } } else { stateMap[mapKey] = mapElement } @@ -220,9 +238,21 @@ func updateState(stateMap map[string][]string, mapKey string, metricState *prome case cLeaveMaintenance: removeIssue(stateMap, mapKey, metricState, issueNumber) case cEnterMaintenance: + // Don't enter maintenance more than once for a given issue. + issueIndex := stringInSlice(issueNumber, stateMap[mapKey]) + if issueIndex >= 0 { + log.Printf("INFO: %s is already in maintenance for issue #%s", mapKey, issueNumber) + return + } mux.Lock() stateMap[mapKey] = append(stateMap[mapKey], issueNumber) - metricState.WithLabelValues(mapKey).Set(action) + // If this is a machine state, then we need to pass mapKey twice, once for the + // "machine" label and once for the "node" label. + if strings.HasPrefix(mapKey, "mlab") { + metricState.WithLabelValues(mapKey, mapKey).Set(action) + } else { + metricState.WithLabelValues(mapKey).Set(action) + } log.Printf("INFO: %s was added to maintenance for issue #%s", mapKey, issueNumber) mux.Unlock() default: @@ -235,9 +265,9 @@ func updateState(stateMap map[string][]string, mapKey string, metricState *prome // added to or removed from maintenance mode. If any matches are found, it // updates the state for the item. The return value is the number of // modifications that were made to the machine and site maintenance state. 
-func parseMessage(msg string, issueNumber string, s *maintenanceState) int { +func parseMessage(msg string, issueNumber string, s *maintenanceState, project string) int { var mods = 0 - machineMatches := machineRegExp.FindAllStringSubmatch(msg, -1) + machineMatches := machineRegExps[project].FindAllStringSubmatch(msg, -1) if len(machineMatches) > 0 { for _, machine := range machineMatches { log.Printf("INFO: Flag found for machine: %s", machine[1]) @@ -252,15 +282,15 @@ func parseMessage(msg string, issueNumber string, s *maintenanceState) int { } } - siteMatches := siteRegExp.FindAllStringSubmatch(msg, -1) + siteMatches := siteRegExps[project].FindAllStringSubmatch(msg, -1) if len(siteMatches) > 0 { for _, site := range siteMatches { log.Printf("INFO: Flag found for site: %s", site[1]) if site[2] == "del" { - updateState(s.Sites, site[1], metricSite, issueNumber, 0) + updateState(s.Sites, site[1], metricSite, issueNumber, cLeaveMaintenance) mods++ } else { - updateState(s.Sites, site[1], metricSite, issueNumber, 1) + updateState(s.Sites, site[1], metricSite, issueNumber, cEnterMaintenance) mods++ } } @@ -314,7 +344,7 @@ func receiveHook(resp http.ResponseWriter, req *http.Request) { log.Printf("INFO: Issue #%s was %s.", issueNumber, eventAction) mods = closeIssue(issueNumber, &state) case "opened", "edited": - mods = parseMessage(event.Issue.GetBody(), issueNumber, &state) + mods = parseMessage(event.Issue.GetBody(), issueNumber, &state, fProject) default: log.Printf("INFO: Unsupported IssueEvent action: %s.", eventAction) status = http.StatusNotImplemented @@ -324,7 +354,7 @@ func receiveHook(resp http.ResponseWriter, req *http.Request) { issueNumber = strconv.Itoa(event.Issue.GetNumber()) issueState := event.Issue.GetState() if issueState == "open" { - mods = parseMessage(event.Comment.GetBody(), issueNumber, &state) + mods = parseMessage(event.Comment.GetBody(), issueNumber, &state, fProject) } else { log.Printf("INFO: Ignoring IssueComment event on closed 
issue #%s.", issueNumber) status = http.StatusExpectationFailed @@ -382,6 +412,8 @@ func init() { "Filesystem path for the state file.") flag.StringVar(&fGitHubSecretPath, "storage.github-secret", "", "Filesystem path of file containing the shared Github webhook secret.") + flag.StringVar(&fProject, "project", "mlab-oti", + "GCP project where this instance is running.") prometheus.MustRegister(metricError) prometheus.MustRegister(metricMachine) prometheus.MustRegister(metricSite) diff --git a/gmx_test.go b/gmx_test.go index 8deb483..f859188 100644 --- a/gmx_test.go +++ b/gmx_test.go @@ -24,7 +24,6 @@ var savedState = ` "def02": ["8"], "uvw03": ["4", "11"], "xyz03": ["5"] - } } ` @@ -274,43 +273,79 @@ func TestCloseIssue(t *testing.T) { func TestParseMessage(t *testing.T) { r := strings.NewReader(savedState) - var s maintenanceState + var s = state restoreState(r, &s) tests := []struct { name string msg string + project string expectedMods int }{ { name: "add-1-machine-to-maintenance", msg: `/machine mlab1.abc01 is in maintenance mode.`, + project: `mlab-oti`, expectedMods: 1, }, { name: "add-2-sites-to-maintenance", msg: `Putting /site abc01 and /site xyz02 into maintenance mode.`, + project: `mlab-oti`, expectedMods: 2, }, { name: "add-1-sites-and-1-machine-to-maintenance", msg: `Putting /site abc01 and /machine mlab1.xyz02 into maintenance mode.`, + project: `mlab-oti`, expectedMods: 2, }, { name: "remove-1-machine-and-1-site-from-maintenance", msg: `Removing /machine mlab2.xyz01 del and /site uvw02 del from maintenance.`, + project: `mlab-oti`, expectedMods: 2, }, { name: "3-malformed-flags", msg: `Add /machine and /site vw02 to maintenance. 
Removing /site lol del.`, + project: `mlab-oti`, expectedMods: 0, }, + { + name: "1-production-machine-1-staging-machine-flag", + msg: `Add /machine mlab2.ghi01 and /machine mlab4.ghi01 to maintenance.`, + project: `mlab-oti`, + expectedMods: 1, + }, + { + name: "1-sandbox-machine-1-staging-machine-flag", + msg: `Add /machine mlab3.hij0t and /machine mlab4.qrs01 to maintenance.`, + project: `mlab-oti`, + expectedMods: 0, + }, + { + name: "1-sandbox-machine-flag", + msg: `Add /machine mlab1.abc0t to maintenance.`, + project: `mlab-sandbox`, + expectedMods: 1, + }, + { + name: "2-staging-machine-flags", + msg: `Add /machine mlab4.abc03 and /machine mlab4.wxy01 to maintenance.`, + project: `mlab-staging`, + expectedMods: 2, + }, + { + name: "1-sandbox-site-flag", + msg: `Add /site nop0t to maintenance.`, + project: `mlab-sandbox`, + expectedMods: 1, + }, } for _, test := range tests { - mods := parseMessage(test.msg, "99", &s) + mods := parseMessage(test.msg, "99", &s, test.project) if mods != test.expectedMods { t.Errorf("parseMessage(): %s: expected %d state modifications; got %d", test.name, test.expectedMods, mods)