Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

クラスターメトリクスに切断中のノードを含め、接続状態を gauge の値で返す #37

Merged
merged 3 commits into from
Jan 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,15 @@
# CHANGES

## develop

- [FIX] Sora 2023.2.0 で `ListClusterNodes` API の `include_all_known_nodes` のデフォルト値が変更されたことにより panic が発生する問題に対応する
- Sora 2023.2.0 以降で Sora Exporter 2023.5.0 以前のバージョンを使用し、クラスターメトリクスが有効になっている場合に発生する
- @tnamao
- [CHANGE] Sora の `ListClusterNodes` API を呼び出す際に、API リクエストの `include_all_known_nodes` を `true` にして切断中のノードも含め、接続状態を gauge で返すようにする
- **破壊的変更** になるため、バージョンアップの際に注意してください
- gauge の値は 1 が接続、0 が切断を表し、`ListClusterNodes` API のレスポンスに含まれる `connected` の値によって返す値を切り替えている
- @tnamao

## 2023.5.0

- [UPDATE] CI の staticcheck のバージョンを 2023.1.6 に上げる
Expand Down
14 changes: 10 additions & 4 deletions collector/cluster_node.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
package collector

import "github.com/prometheus/client_golang/prometheus"
import (
"github.com/prometheus/client_golang/prometheus"
)

var (
soraClusterMetrics = SoraClusterMetrics{
Expand All @@ -27,10 +29,14 @@ func (m *SoraClusterMetrics) Describe(ch chan<- *prometheus.Desc) {

func (m *SoraClusterMetrics) Collect(ch chan<- prometheus.Metric, nodeList []soraClusterNode, report soraClusterReport) {
for _, node := range nodeList {
if node.ClusterNodeName != nil {
ch <- newGauge(m.clusterNode, 1, *node.ClusterNodeName, *node.Mode)
value := 0.0
if node.Connected {
value = 1.0
}
if node.ClusterNodeName != "" {
ch <- newGauge(m.clusterNode, value, node.ClusterNodeName, node.Mode)
} else {
ch <- newGauge(m.clusterNode, 1, *node.NodeName, *node.Mode)
ch <- newGauge(m.clusterNode, value, node.NodeName, node.Mode)
}
}
ch <- newGauge(m.raftState, 1.0, report.RaftState)
Expand Down
17 changes: 16 additions & 1 deletion collector/collector.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package collector

import (
"bytes"
"context"
"crypto/tls"
"encoding/json"
Expand Down Expand Up @@ -50,6 +51,10 @@ type HTTPClient interface {
Do(*http.Request) (*http.Response, error)
}

// SoraListClusterNodesRequest is the request-parameter payload for the Sora
// ListClusterNodes API. The collector JSON-encodes it and sends it as the
// body of the POST request that fetches the cluster node list.
type SoraListClusterNodesRequest struct {
// IncludeAllKnownNodes is encoded as "include_all_known_nodes". The
// collector sets it to true; per CHANGES.md this makes the API include
// disconnected nodes as well, so their connection state can be reported.
IncludeAllKnownNodes bool `json:"include_all_known_nodes"`
}

func NewCollector(options *CollectorOptions) *Collector {
return &Collector{
URI: options.URI,
Expand Down Expand Up @@ -112,7 +117,17 @@ func (c *Collector) Collect(ch chan<- prometheus.Metric) {

var nodeList []soraClusterNode
if c.EnableSoraClusterMetrics {
req, err = http.NewRequestWithContext(ctx, http.MethodPost, c.URI, nil)
requestParams := SoraListClusterNodesRequest{
IncludeAllKnownNodes: true,
}
encodedParams, err := json.Marshal(requestParams)
if err != nil {
level.Error(c.logger).Log("msg", "failed to encode Sora ListClusterNodes API request parameters", "err", err)
ch <- newGauge(c.soraUp, 0)
return
}

req, err = http.NewRequestWithContext(ctx, http.MethodPost, c.URI, bytes.NewBuffer(encodedParams))
if err != nil {
level.Error(c.logger).Log("msg", "failed to create request to sora", "err", err)
ch <- newGauge(c.soraUp, 0)
Expand Down
7 changes: 4 additions & 3 deletions collector/sora_api.go
Original file line number Diff line number Diff line change
Expand Up @@ -136,9 +136,10 @@ type soraClusterReport struct {
}

type soraClusterNode struct {
ClusterNodeName *string `json:"cluster_node_name"`
NodeName *string `json:"node_name"`
Mode *string `json:"mode"`
ClusterNodeName string `json:"cluster_node_name"`
NodeName string `json:"node_name"`
Mode string `json:"mode"`
Connected bool `json:"connected"`
}

type soraLicenseInfo struct {
Expand Down
55 changes: 32 additions & 23 deletions main_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -150,17 +150,7 @@ var (
listClusterNodesJSONData = `[
{
"node_name": "[email protected]",
"epoch": 1,
"mode": "normal",
"cluster_signaling_url": "ws://127.0.0.1:5001/signaling",
"cluster_api_url": "http://127.0.0.1:3101/",
"member_since": "2022-05-09T07:44:52.973761Z",
"sora_version": "2022.1.0-canary.44",
"license_max_nodes": 10,
"license_max_connections": 100,
"license_serial_code": "SAMPLE-SRA-E001-202212-N10-100",
"license_type": "Experimental",
"connected": true
"connected": false
},
{
"node_name": "[email protected]",
Expand All @@ -175,20 +165,25 @@ var (
"license_serial_code": "SAMPLE-SRA-E001-202212-N10-100",
"license_type": "Experimental",
"connected": true
}
]`
},
{
"node_name": "[email protected]",
"epoch": 1,
"mode": "normal",
"cluster_signaling_url": "ws://127.0.0.1:5001/signaling",
"cluster_api_url": "http://127.0.0.1:3101/",
"member_since": "2022-05-09T07:44:54.160763Z",
"sora_version": "2022.1.0-canary.44",
"license_max_nodes": 10,
"license_max_connections": 100,
"license_serial_code": "SAMPLE-SRA-E001-202212-N10-100",
"license_type": "Experimental",
"connected": true
}
]`
listClusterNodesCurrentJSONData = `[
{
"cluster_node_name": "[email protected]",
"epoch": 1,
"mode": "normal",
"member_since": "2022-05-02T15:26:44.302363Z",
"sora_version": "2021.2.9",
"license_max_connections": 100,
"license_serial_code": "SAMPLE-SRA-E001-202212-N10-100",
"license_type": "Experimental",
"cluster_signaling_url": "ws://127.0.0.1:5001/signaling",
"cluster_api_url": "http://10.1.1.4:3000/",
"connected": false
},
{
Expand All @@ -203,7 +198,21 @@ var (
"cluster_signaling_url": "ws://127.0.0.1:5002/signaling",
"cluster_api_url": "http://10.1.1.3:3000/",
"connected": true
}
},
{
"node_name": "[email protected]",
"epoch": 1,
"mode": "normal",
"cluster_signaling_url": "ws://127.0.0.1:5001/signaling",
"cluster_api_url": "http://127.0.0.1:3101/",
"member_since": "2022-05-09T07:44:54.160763Z",
"sora_version": "2022.1.0-canary.44",
"license_max_nodes": 10,
"license_max_connections": 100,
"license_serial_code": "SAMPLE-SRA-E001-202212-N10-100",
"license_type": "Experimental",
"connected": true
}
]`
getLicenseJSONDATA = `{
"expired_at": "2025-09",
Expand Down
3 changes: 2 additions & 1 deletion test/maximum.metrics
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,9 @@ sora_average_duration_seconds 706
sora_average_setup_time_seconds 0
# HELP sora_cluster_node The sora server known cluster node.
# TYPE sora_cluster_node gauge
sora_cluster_node{mode="",node_name="[email protected]"} 0
sora_cluster_node{mode="block_new_connection",node_name="[email protected]"} 1
sora_cluster_node{mode="normal",node_name="node-01_canary_sora@10.211.55.42"} 1
sora_cluster_node{mode="normal",node_name="node-03_canary_sora@10.211.55.41"} 1
# HELP sora_cluster_raft_commit_index The latest committed Raft log index.
# TYPE sora_cluster_raft_commit_index counter
sora_cluster_raft_commit_index 10
Expand Down
3 changes: 2 additions & 1 deletion test/sora_cluster_metrics_enabled.metrics
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,9 @@ sora_average_duration_seconds 706
sora_average_setup_time_seconds 0
# HELP sora_cluster_node The sora server known cluster node.
# TYPE sora_cluster_node gauge
sora_cluster_node{mode="normal",node_name="[email protected]"} 1
sora_cluster_node{mode="block_new_connection",node_name="[email protected]"} 1
sora_cluster_node{mode="normal",node_name="[email protected]"} 1
sora_cluster_node{mode="",node_name="[email protected]"} 0
# HELP sora_cluster_raft_commit_index The latest committed Raft log index.
# TYPE sora_cluster_raft_commit_index counter
sora_cluster_raft_commit_index 10
Expand Down