Skip to content

Commit

Permalink
implement alartmanager endpoint receive and update db (no subscriber …
Browse files Browse the repository at this point in the history
…notification)
  • Loading branch information
pixelsoccupied committed Nov 27, 2024
1 parent fe77c68 commit 05e12b2
Show file tree
Hide file tree
Showing 8 changed files with 393 additions and 47 deletions.
36 changes: 29 additions & 7 deletions internal/service/alarms/internal/alarms_server.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ package internal
import (
"context"

"log/slog"
"github.com/openshift-kni/oran-o2ims/internal/service/alarms/internal/alertmanager"

"fmt"
"net/http"
Expand Down Expand Up @@ -92,10 +92,32 @@ func (a *AlarmsServer) GetProbableCause(ctx context.Context, request api.GetProb
}

func (a *AlarmsServer) AmNotification(ctx context.Context, request api.AmNotificationRequestObject) (api.AmNotificationResponseObject, error) {
// TODO: Implement the logic to handle the AM notification
slog.Debug("Received AM notification", "groupLabels", request.Body.GroupLabels)
for _, alert := range request.Body.Alerts {
slog.Debug("Alert", "fingerprint", alert.Fingerprint, "startsAt", alert.StartsAt, "status", alert.Status)
// Audit the table against the full list of alerts in the current payload; any stored notification missing from the payload is set to resolved
if err := a.AlarmsRepository.ResolveNotificationIfNotInCurrent(ctx, request.Body); err != nil {
return nil, fmt.Errorf("failed to resolve notification that are not present: %w", err)
}

// Get the definition data based on current set of Alert names and managed cluster ID
alarmDefinitions, err := a.AlarmsRepository.GetAlarmDefinitions(ctx, request.Body)
if err != nil {
return nil, fmt.Errorf("failed to get AlarmDefinitions: %w", err)
}

// Combine possible definitions with events
aerModels := alertmanager.ConvertAmToAlarmEventRecordModels(request.Body, alarmDefinitions)

// Insert and update AlarmEventRecord
if err := a.AlarmsRepository.UpsertAlarmEventRecord(ctx, aerModels); err != nil {
return nil, fmt.Errorf("failed to upsert AlarmEventRecord to db: %w", err)
}

//TODO: Get subscriber

//TODO: Notify subscriber

// Remove resolved events and move them to the archive
if err := a.AlarmsRepository.ArchiveResolvedAlarmEventRecord(ctx); err != nil {
return nil, fmt.Errorf("failed to archive AlarmEventRecord to db: %w", err)
}

return api.AmNotification200Response{}, nil
Expand All @@ -113,9 +135,9 @@ func convertAerModelToApi(aerModel models.AlarmEventRecord) api.AlarmEventRecord
AlarmChangedTime: aerModel.AlarmChangedTime,
AlarmClearedTime: aerModel.AlarmClearedTime,
AlarmDefinitionId: aerModel.AlarmDefinitionID,
AlarmEventRecordId: aerModel.AlarmEventRecordID,
AlarmEventRecordId: *aerModel.AlarmEventRecordID,
AlarmRaisedTime: aerModel.AlarmRaisedTime,
PerceivedSeverity: api.PerceivedSeverity(aerModel.PerceivedSeverity),
PerceivedSeverity: aerModel.PerceivedSeverity,
ProbableCauseId: aerModel.ProbableCauseID,
ResourceTypeID: aerModel.ResourceTypeID,
}
Expand Down
109 changes: 108 additions & 1 deletion internal/service/alarms/internal/alertmanager/alertmanager.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,14 @@ import (
_ "embed"
"fmt"
"log/slog"
template "text/template"

"text/template"

"maps"

"github.com/google/uuid"
api "github.com/openshift-kni/oran-o2ims/internal/service/alarms/api/generated"
"github.com/openshift-kni/oran-o2ims/internal/service/alarms/internal/db/models"
corev1 "k8s.io/api/core/v1"
"sigs.k8s.io/controller-runtime/pkg/client"

Expand Down Expand Up @@ -61,3 +67,104 @@ func Setup(ctx context.Context, cl client.Client) error {
slog.Info("Successfully configured alertmanager")
return nil
}

// ConvertAmToAlarmEventRecordModels builds one AlarmEventRecord per alert in the
// Alertmanager notification and, where an AlarmDefinition with the same alarm name
// exists, copies its definition ID and probable cause ID onto the record.
//
// The payload arrives from a webhook, so its optional (pointer) fields are not
// trusted: missing labels/annotations are treated as empty, and alerts without a
// start time, status or fingerprint are skipped with a warning instead of causing
// a nil-pointer panic. A nil notification yields a nil slice.
func ConvertAmToAlarmEventRecordModels(am *api.AlertmanagerNotification, aDefinitionRecords []models.AlarmDefinition) []models.AlarmEventRecord {
	if am == nil {
		return nil
	}

	var records []models.AlarmEventRecord

	for _, alert := range am.Alerts {
		// Labels and annotations are optional; fall back to empty values so the
		// helpers below can be called safely.
		labels := map[string]string{}
		if alert.Labels != nil && *alert.Labels != nil {
			labels = *alert.Labels
		}
		var annotations map[string]string
		if alert.Annotations != nil {
			annotations = *alert.Annotations
		}

		// Resolve the name once up front so it is not recomputed for every definition.
		alertName := GetAlertName(labels)

		// These fields are dereferenced below; without them no usable record can be built.
		if alert.StartsAt == nil || alert.Status == nil || alert.Fingerprint == nil {
			slog.Warn("Skipping Alertmanager alert with missing startsAt, status or fingerprint", "alert name", alertName)
			continue
		}

		slog.Info("Converting Alertmanager alert", "alert name", alertName)
		record := models.AlarmEventRecord{
			AlarmRaisedTime:       *alert.StartsAt, // nothing for Changed time
			AlarmClearedTime:      alert.EndsAt,
			PerceivedSeverity:     getPerceivedSeverity(labels),
			NotificationEventType: 0,
			AlarmStatus:           string(*alert.Status),
			Fingerprint:           *alert.Fingerprint,
			ResourceID:            GetResourceID(labels),
		}

		// Update Extensions with things we didn't really process
		record.Extensions = getExtensions(labels, annotations)

		// See if possible to pick up additional info from its definition.
		// No early exit: if several definitions share the name, the last one wins.
		for _, def := range aDefinitionRecords {
			if def.AlarmName == alertName { // TODO: match with the resourceTypeID too once init data is ready
				record.AlarmDefinitionID = def.AlarmDefinitionID
				record.ProbableCauseID = def.ProbableCauseID
			}
		}

		records = append(records, record)
	}

	return records
}

// GetResourceID returns the resource ID of an alert. For caas alerts it's the
// cluster ID, read from the "managed_cluster" label.
//
// When the label is absent or is not a valid UUID, a warning is logged and the
// zero UUID (uuid.Nil) is returned.
func GetResourceID(labels map[string]string) uuid.UUID {
	val, ok := labels["managed_cluster"]
	if !ok {
		slog.Warn("Could not find managed_cluster", "alertname", GetAlertName(labels))
		return uuid.Nil
	}

	id, err := uuid.Parse(val)
	if err != nil {
		// Include the alert name so the offending alert can be identified from the log.
		slog.Warn("Could not convert managed_cluster string to uuid", "alertname", GetAlertName(labels), "err", err.Error())
		return uuid.Nil
	}

	return id
}

// GetResourceTypeID returns the resource type ID for an alert.
// For now it simply reuses the value produced by GetResourceID.
// TODO: update this so that resourceTypeID is correctly mapped to resourceID.
func GetResourceTypeID(labels map[string]string) uuid.UUID {
	resourceTypeID := GetResourceID(labels)
	return resourceTypeID
}

// GetAlertName returns the value of the "alertname" label, or "Unknown" when
// the label is not present.
func GetAlertName(labels map[string]string) string {
	// The label is expected to always be there; the fallback is purely defensive.
	if name, found := labels["alertname"]; found {
		return name
	}

	return "Unknown"
}

// getPerceivedSeverity translates Alertmanager's `severity` label into the
// O-RAN PerceivedSeverity. A missing label or an unrecognised value is logged
// as a warning and reported as INDETERMINATE.
func getPerceivedSeverity(labels map[string]string) api.PerceivedSeverity {
	severity, found := labels["severity"]
	if !found {
		slog.Warn("Could not find severity label. This may impact the value of PerceivedSeverity")
		return api.INDETERMINATE
	}
	slog.Info("Found alert severity", "severity", severity)

	// TODO: "info" seems like a common one but has no O-RAN mapping
	known := map[string]api.PerceivedSeverity{
		"cleared":  api.CLEARED,
		"critical": api.CRITICAL,
		"major":    api.MAJOR,
		"minor":    api.MINOR,
		"warning":  api.WARNING,
	}
	if mapped, ok := known[severity]; ok {
		return mapped
	}

	slog.Warn("Could not map", "severity", severity)
	return api.INDETERMINATE
}

// getExtensions builds the O-RAN extensions of an alert by merging its labels
// and annotations into a new map. When a key is present in both, the
// annotation value wins.
//
// Neither input is modified, so the caller's labels stay intact, and nil inputs
// are accepted (the result is then an empty, non-nil map).
func getExtensions(labels, annotations map[string]string) map[string]string {
	extensions := make(map[string]string, len(labels)+len(annotations))
	maps.Copy(extensions, labels)
	maps.Copy(extensions, annotations)
	// TODO: delete anything from extension that we're already using
	return extensions
}
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,6 @@ route:
matchers:
# Always firing alert to verify alertmanager is working
- alertname=Watchdog
- receiver: oran_alarm_receiver
group_by:
# This label is guaranteed to be present in all alerts
- managed_cluster
matchers:
# Match alerts with a non-empty managed_cluster label
- managed_cluster=~".+"
receivers:
- name: oran_alarm_receiver
webhook_configs:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,8 @@ CREATE SEQUENCE IF NOT EXISTS alarm_sequence_seq
CREATE TABLE IF NOT EXISTS alarm_event_record (
-- O-RAN
alarm_event_record_id UUID PRIMARY KEY DEFAULT gen_random_uuid(), -- Unique identifier for each event record
alarm_definition_id UUID NOT NULL, -- From alarm_definition table
probable_cause_id UUID NOT NULL, -- From alarm_definition table
alarm_definition_id UUID, -- From alarm_definition table
probable_cause_id UUID, -- From alarm_definition table
alarm_raised_time TIMESTAMPTZ NOT NULL, -- From current alert notification
alarm_changed_time TIMESTAMPTZ, -- From current alert notification
alarm_cleared_time TIMESTAMPTZ, -- From current alert notification
Expand All @@ -21,8 +21,8 @@ CREATE TABLE IF NOT EXISTS alarm_event_record (
extensions JSONB, -- Additional data for extensibility

-- O-RAN additional data to create AlarmEventNotification
resource_id UUID NOT NULL, -- Same as manager_cluster_id for caas alerts
resource_type_id UUID NOT NULL, -- Derived from manager_cluster_id
resource_id UUID, -- Same as manager_cluster_id for caas alerts
resource_type_id UUID, -- Derived from manager_cluster_id
notification_event_type INT NOT NULL, -- Should be enum calculated from current alert

-- Internal
Expand All @@ -44,7 +44,7 @@ CREATE OR REPLACE FUNCTION update_alarm_event_sequence()
RETURNS TRIGGER AS $$
BEGIN
-- Update sequence if status changes to 'resolved' or if alarm_changed_time is updated
IF (NEW.status = 'resolved' AND OLD.status IS DISTINCT FROM 'resolved')
IF (NEW.alarm_status = 'resolved' AND OLD.alarm_status IS DISTINCT FROM 'resolved')
OR (NEW.alarm_changed_time IS DISTINCT FROM OLD.alarm_changed_time) THEN
NEW.alarm_sequence_number := nextval('alarm_sequence_seq');
END IF;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,20 +2,20 @@
-- Intentionally no constraints or triggers to update values, as this table is simply an archive
CREATE TABLE IF NOT EXISTS alarm_event_record_archive (
alarm_event_record_id UUID PRIMARY KEY,
alarm_definition_id UUID NOT NULL,
probable_cause_id UUID NOT NULL,
alarm_raised_time TIMESTAMPTZ NOT NULL,
alarm_definition_id UUID ,
probable_cause_id UUID ,
alarm_raised_time TIMESTAMPTZ ,
alarm_changed_time TIMESTAMPTZ,
alarm_cleared_time TIMESTAMPTZ,
alarm_acknowledged_time TIMESTAMPTZ,
alarm_acknowledged BOOLEAN NOT NULL DEFAULT FALSE,
perceived_severity INT NOT NULL,
alarm_acknowledged BOOLEAN DEFAULT FALSE,
perceived_severity INT,
extensions JSONB,
resource_id UUID NOT NULL,
resource_type_id UUID NOT NULL,
notification_event_type INT NOT NULL,
alarm_status VARCHAR(20) DEFAULT 'firing' NOT NULL,
finger_print TEXT NOT NULL,
resource_id UUID,
resource_type_id UUID,
notification_event_type INT,
alarm_status VARCHAR(20) DEFAULT 'firing',
fingerprint TEXT,
alarm_sequence_number BIGINT, -- Static; no auto-increment
created_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP
);
21 changes: 21 additions & 0 deletions internal/service/alarms/internal/db/models/alarm_definition.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
package models

import "github.com/google/uuid"

// AlarmDefinition is the database model for an alarm definition.
// Each field's `db` tag names the column the field is mapped to.
// NOTE(review): presumably backed by the alarm_definition table — confirm against the schema.
type AlarmDefinition struct {
AlarmDefinitionID uuid.UUID `db:"alarm_definition_id"`
AlarmName string `db:"alarm_name"`
AlarmLastChange string `db:"alarm_last_change"`
AlarmChangeType string `db:"alarm_change_type"`
AlarmDescription string `db:"alarm_description"`
ProposedRepairActions string `db:"proposed_repair_actions"`
ClearingType string `db:"clearing_type"`
ManagementInterfaceID []string `db:"management_interface_id"`
PkNotificationField []string `db:"pk_notification_field"`
AlarmAdditionalFields string `db:"alarm_additional_fields"`
AlarmDictionaryID uuid.UUID `db:"alarm_dictionary_id"`
ResourceTypeID uuid.UUID `db:"resource_type_id"`
ProbableCauseID uuid.UUID `db:"probable_cause_id"`
CreatedAt string `db:"created_at"`
UpdatedAt string `db:"updated_at"`
}
39 changes: 22 additions & 17 deletions internal/service/alarms/internal/db/models/alarm_event_record.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,30 +3,35 @@ package models
import (
"time"

"github.com/stephenafamo/bob/dialect/psql"

"github.com/google/uuid"
"github.com/openshift-kni/oran-o2ims/internal/service/alarms/api/generated"
)

// AlarmEventRecord represents a record in the alarm_event_record table.
type AlarmEventRecord struct {
AlarmEventRecordID uuid.UUID `db:"alarm_event_record_id"`
AlarmDefinitionID uuid.UUID `db:"alarm_definition_id"`
ProbableCauseID uuid.UUID `db:"probable_cause_id"`
AlarmRaisedTime time.Time `db:"alarm_raised_time"`
AlarmChangedTime *time.Time `db:"alarm_changed_time"`
AlarmClearedTime *time.Time `db:"alarm_cleared_time"`
AlarmAcknowledgedTime *time.Time `db:"alarm_acknowledged_time"`
AlarmAcknowledged bool `db:"alarm_acknowledged"`
PerceivedSeverity int `db:"perceived_severity"`
Extensions map[string]interface{} `db:"extensions"`
ResourceID uuid.UUID `db:"resource_id"`
ResourceTypeID uuid.UUID `db:"resource_type_id"`
NotificationEventType int `db:"notification_event_type"`
AlarmStatus string `db:"alarm_status"`
Fingerprint string `db:"fingerprint"`
AlarmSequenceNumber int64 `db:"alarm_sequence_number"`
CreatedAt time.Time `db:"created_at"`
AlarmEventRecordID *uuid.UUID `db:"alarm_event_record_id"`
AlarmDefinitionID uuid.UUID `db:"alarm_definition_id"`
ProbableCauseID uuid.UUID `db:"probable_cause_id"`
AlarmRaisedTime time.Time `db:"alarm_raised_time"`
AlarmChangedTime *time.Time `db:"alarm_changed_time"`
AlarmClearedTime *time.Time `db:"alarm_cleared_time"`
AlarmAcknowledgedTime *time.Time `db:"alarm_acknowledged_time"`
AlarmAcknowledged bool `db:"alarm_acknowledged"`
PerceivedSeverity generated.PerceivedSeverity `db:"perceived_severity"`
Extensions map[string]string `db:"extensions"`
ResourceID uuid.UUID `db:"resource_id"`
ResourceTypeID uuid.UUID `db:"resource_type_id"`
NotificationEventType int `db:"notification_event_type"`
AlarmStatus string `db:"alarm_status"`
Fingerprint string `db:"fingerprint"`
AlarmSequenceNumber int64 `db:"alarm_sequence_number"`
CreatedAt time.Time `db:"created_at"`
}

// TableName reports the name of the database table that stores AlarmEventRecord rows.
func (r *AlarmEventRecord) TableName() string {
	const table = "alarm_event_record"
	return table
}

// AlarmEventRecordView is the psql view typed to AlarmEventRecord, created for
// schema "public" and name "alarm_event_record".
var AlarmEventRecordView = psql.NewView[AlarmEventRecord]("public", "alarm_event_record")
Loading

0 comments on commit 05e12b2

Please sign in to comment.