Convert HA API service IDs (vm:106, ct:200) to the resource ID format used by /cluster/resources and the Python exporter (qemu/106, lxc/200). Rename label from "sid" to "id" so HA metrics can be joined with pve_ha_state, pve_guest_info, and other id-labeled metrics. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
175 lines
4.8 KiB
Go
175 lines
4.8 KiB
Go
package collector
|
|
|
|
import (
|
|
"encoding/json"
|
|
"fmt"
|
|
"log/slog"
|
|
"strconv"
|
|
"strings"
|
|
|
|
"github.com/prometheus/client_golang/prometheus"
|
|
)
|
|
|
|
func init() {
|
|
registerCollector("ha_status", func(logger *slog.Logger) Collector {
|
|
return newHAStatusCollector(logger)
|
|
})
|
|
}
|
|
|
|
// haStatusCollector collects HA metrics. Its only state is the logger it was
// constructed with.
type haStatusCollector struct {
	// logger is the structured logger supplied at construction time.
	logger *slog.Logger
}
|
|
|
|
func newHAStatusCollector(logger *slog.Logger) *haStatusCollector {
|
|
return &haStatusCollector{logger: logger}
|
|
}
|
|
|
|
type haManagerStatusResponse struct {
|
|
Data haManagerStatusData `json:"data"`
|
|
}
|
|
|
|
type haManagerStatusData struct {
|
|
ManagerStatus haManagerStatus `json:"manager_status"`
|
|
LRMStatus map[string]haLRMEntry `json:"lrm_status"`
|
|
}
|
|
|
|
type haManagerStatus struct {
|
|
MasterNode string `json:"master_node"`
|
|
NodeStatus map[string]string `json:"node_status"`
|
|
ServiceStatus map[string]haServiceRuntime `json:"service_status"`
|
|
}
|
|
|
|
// haLRMEntry is one value of the "lrm_status" map in the manager status
// response.
type haLRMEntry struct {
	// Mode is decoded from the "mode" field.
	Mode string `json:"mode"`
	// State is decoded from the "state" field.
	State string `json:"state"`
	// Timestamp is decoded from the "timestamp" field as a float64.
	Timestamp float64 `json:"timestamp"`
}
|
|
|
|
// haServiceRuntime is one value of the "service_status" map in the manager
// status response.
type haServiceRuntime struct {
	// Node is decoded from the "node" field.
	Node string `json:"node"`
	// Running is decoded from the "running" field as an integer.
	Running int `json:"running"`
	// State is decoded from the "state" field.
	State string `json:"state"`
}
|
|
|
|
type haResourcesResponse struct {
|
|
Data []haResourceEntry `json:"data"`
|
|
}
|
|
|
|
// haResourceEntry is one element of the "data" array in the HA resources
// response.
type haResourceEntry struct {
	// SID is the HA service ID, e.g. "vm:106" or "ct:200".
	SID string `json:"sid"`
	// Type is decoded from the "type" field.
	Type string `json:"type"`
	// State is decoded from the "state" field.
	State string `json:"state"`
	// MaxRestart is decoded from the "max_restart" field.
	MaxRestart int `json:"max_restart"`
	// MaxRelocate is decoded from the "max_relocate" field.
	MaxRelocate int `json:"max_relocate"`
	// Failback is decoded from the "failback" field as an integer.
	Failback int `json:"failback"`
}
|
|
|
|
var (
|
|
haCRMMasterDesc = prometheus.NewDesc(
|
|
prometheus.BuildFQName(namespace, "ha", "crm_master"),
|
|
"Whether a node is the CRM master.",
|
|
[]string{"node"}, nil,
|
|
)
|
|
haNodeStatusDesc = prometheus.NewDesc(
|
|
prometheus.BuildFQName(namespace, "ha", "node_status"),
|
|
"HA node status.",
|
|
[]string{"node", "status"}, nil,
|
|
)
|
|
haLRMTimestampDesc = prometheus.NewDesc(
|
|
prometheus.BuildFQName(namespace, "ha", "lrm_timestamp_seconds"),
|
|
"Last LRM heartbeat as Unix timestamp.",
|
|
[]string{"node"}, nil,
|
|
)
|
|
haLRMModeDesc = prometheus.NewDesc(
|
|
prometheus.BuildFQName(namespace, "ha", "lrm_mode"),
|
|
"LRM mode for a node.",
|
|
[]string{"node", "mode"}, nil,
|
|
)
|
|
haServiceConfigDesc = prometheus.NewDesc(
|
|
prometheus.BuildFQName(namespace, "ha", "service_config"),
|
|
"HA service configuration.",
|
|
[]string{"id", "type", "max_restart", "max_relocate", "failback"}, nil,
|
|
)
|
|
haServiceStatusDesc = prometheus.NewDesc(
|
|
prometheus.BuildFQName(namespace, "ha", "service_status"),
|
|
"HA service runtime status.",
|
|
[]string{"id", "node", "state"}, nil,
|
|
)
|
|
)
|
|
|
|
func (c *haStatusCollector) Update(client *Client, ch chan<- prometheus.Metric) error {
|
|
// Fetch manager status.
|
|
mgrBody, err := client.Get("/cluster/ha/status/manager_status")
|
|
if err != nil {
|
|
return fmt.Errorf("failed to get HA manager status: %w", err)
|
|
}
|
|
|
|
var mgrResp haManagerStatusResponse
|
|
if err := json.Unmarshal(mgrBody, &mgrResp); err != nil {
|
|
return fmt.Errorf("failed to parse HA manager status: %w", err)
|
|
}
|
|
|
|
mgr := mgrResp.Data.ManagerStatus
|
|
|
|
// CRM master — emit for all nodes in node_status.
|
|
for node := range mgr.NodeStatus {
|
|
var val float64
|
|
if node == mgr.MasterNode {
|
|
val = 1
|
|
}
|
|
ch <- prometheus.MustNewConstMetric(haCRMMasterDesc, prometheus.GaugeValue, val, node)
|
|
}
|
|
|
|
// Node status.
|
|
for node, status := range mgr.NodeStatus {
|
|
ch <- prometheus.MustNewConstMetric(haNodeStatusDesc, prometheus.GaugeValue, 1, node, status)
|
|
}
|
|
|
|
// LRM status.
|
|
for node, lrm := range mgrResp.Data.LRMStatus {
|
|
ch <- prometheus.MustNewConstMetric(haLRMTimestampDesc, prometheus.GaugeValue, lrm.Timestamp, node)
|
|
ch <- prometheus.MustNewConstMetric(haLRMModeDesc, prometheus.GaugeValue, 1, node, lrm.Mode)
|
|
}
|
|
|
|
// Service runtime status from manager_status.
|
|
for sid, svc := range mgr.ServiceStatus {
|
|
ch <- prometheus.MustNewConstMetric(haServiceStatusDesc, prometheus.GaugeValue, 1, haSIDToID(sid), svc.Node, svc.State)
|
|
}
|
|
|
|
// Fetch HA resources for service config.
|
|
resBody, err := client.Get("/cluster/ha/resources")
|
|
if err != nil {
|
|
return fmt.Errorf("failed to get HA resources: %w", err)
|
|
}
|
|
|
|
var resResp haResourcesResponse
|
|
if err := json.Unmarshal(resBody, &resResp); err != nil {
|
|
return fmt.Errorf("failed to parse HA resources: %w", err)
|
|
}
|
|
|
|
for _, res := range resResp.Data {
|
|
ch <- prometheus.MustNewConstMetric(haServiceConfigDesc, prometheus.GaugeValue, 1,
|
|
haSIDToID(res.SID), res.Type,
|
|
strconv.Itoa(res.MaxRestart), strconv.Itoa(res.MaxRelocate), strconv.Itoa(res.Failback),
|
|
)
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// haSIDToID translates an HA service ID into the resource ID format used by
// /cluster/resources: "vm:<rest>" becomes "qemu/<rest>" and "ct:<rest>"
// becomes "lxc/<rest>" (e.g. "vm:106" -> "qemu/106", "ct:200" -> "lxc/200").
//
// Only the first ':' is treated as the separator. Input without a ':' or
// with any other prefix is returned unchanged.
func haSIDToID(sid string) string {
	if kind, rest, found := strings.Cut(sid, ":"); found {
		switch kind {
		case "vm":
			return "qemu/" + rest
		case "ct":
			return "lxc/" + rest
		}
	}
	return sid
}
|