fix: normalize HA service IDs to match cluster_resources format
Convert HA API service IDs (vm:106, ct:200) to the resource ID format used by /cluster/resources and the Python exporter (qemu/106, lxc/200). Rename label from "sid" to "id" so HA metrics can be joined with pve_ha_state, pve_guest_info, and other id-labeled metrics. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
01dbc7cee4
commit
5e066a5c4b
3 changed files with 26 additions and 8 deletions
|
|
@ -165,8 +165,8 @@ Create a PVE API token with at least `PVEAuditor` role. Provide it via:
|
|||
| `pve_ha_node_status` | Gauge | `node`, `status` | Per-node HA status (always 1) |
|
||||
| `pve_ha_lrm_timestamp_seconds` | Gauge | `node` | Last LRM heartbeat as Unix timestamp |
|
||||
| `pve_ha_lrm_mode` | Gauge | `node`, `mode` | LRM mode per node (always 1) |
|
||||
| `pve_ha_service_config` | Gauge | `sid`, `type`, `max_restart`, `max_relocate`, `failback` | Service config (always 1) |
|
||||
| `pve_ha_service_status` | Gauge | `sid`, `node`, `state` | Service runtime state (always 1) |
|
||||
| `pve_ha_service_config` | Gauge | `id`, `type`, `max_restart`, `max_relocate`, `failback` | Service config (always 1) |
|
||||
| `pve_ha_service_status` | Gauge | `id`, `node`, `state` | Service runtime state (always 1) |
|
||||
|
||||
### Physical Disks
|
||||
|
||||
|
|
|
|||
|
|
@ -5,6 +5,7 @@ import (
|
|||
"fmt"
|
||||
"log/slog"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
)
|
||||
|
|
@ -87,12 +88,12 @@ var (
|
|||
haServiceConfigDesc = prometheus.NewDesc(
|
||||
prometheus.BuildFQName(namespace, "ha", "service_config"),
|
||||
"HA service configuration.",
|
||||
[]string{"sid", "type", "max_restart", "max_relocate", "failback"}, nil,
|
||||
[]string{"id", "type", "max_restart", "max_relocate", "failback"}, nil,
|
||||
)
|
||||
haServiceStatusDesc = prometheus.NewDesc(
|
||||
prometheus.BuildFQName(namespace, "ha", "service_status"),
|
||||
"HA service runtime status.",
|
||||
[]string{"sid", "node", "state"}, nil,
|
||||
[]string{"id", "node", "state"}, nil,
|
||||
)
|
||||
)
|
||||
|
||||
|
|
@ -132,7 +133,7 @@ func (c *haStatusCollector) Update(client *Client, ch chan<- prometheus.Metric)
|
|||
|
||||
// Service runtime status from manager_status.
|
||||
for sid, svc := range mgr.ServiceStatus {
|
||||
ch <- prometheus.MustNewConstMetric(haServiceStatusDesc, prometheus.GaugeValue, 1, sid, svc.Node, svc.State)
|
||||
ch <- prometheus.MustNewConstMetric(haServiceStatusDesc, prometheus.GaugeValue, 1, haSIDToID(sid), svc.Node, svc.State)
|
||||
}
|
||||
|
||||
// Fetch HA resources for service config.
|
||||
|
|
@ -148,10 +149,27 @@ func (c *haStatusCollector) Update(client *Client, ch chan<- prometheus.Metric)
|
|||
|
||||
for _, res := range resResp.Data {
|
||||
ch <- prometheus.MustNewConstMetric(haServiceConfigDesc, prometheus.GaugeValue, 1,
|
||||
res.SID, res.Type,
|
||||
haSIDToID(res.SID), res.Type,
|
||||
strconv.Itoa(res.MaxRestart), strconv.Itoa(res.MaxRelocate), strconv.Itoa(res.Failback),
|
||||
)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// haSIDToID converts an HA service ID as reported by the HA API (e.g.
// "vm:106", "ct:200") to the resource ID format used by /cluster/resources
// (e.g. "qemu/106", "lxc/200").
//
// Only the "vm" and "ct" prefixes are translated. Any other input — a string
// without a ":" separator, or one with an unrecognized prefix — is returned
// unchanged.
func haSIDToID(sid string) string {
	// Split on the first colon only; everything after it is kept verbatim.
	kind, guest, found := strings.Cut(sid, ":")
	if !found {
		return sid
	}
	switch kind {
	case "vm":
		return "qemu/" + guest
	case "ct":
		return "lxc/" + guest
	default:
		return sid
	}
}
|
||||
|
|
|
|||
|
|
@ -44,10 +44,10 @@ pve_ha_node_status{node="node02",status="online"} 1
|
|||
pve_ha_node_status{node="node03",status="online"} 1
|
||||
# HELP pve_ha_service_config HA service configuration.
|
||||
# TYPE pve_ha_service_config gauge
|
||||
pve_ha_service_config{failback="1",max_relocate="2",max_restart="2",sid="vm:106",type="vm"} 1
|
||||
pve_ha_service_config{failback="1",id="qemu/106",max_relocate="2",max_restart="2",type="vm"} 1
|
||||
# HELP pve_ha_service_status HA service runtime status.
|
||||
# TYPE pve_ha_service_status gauge
|
||||
pve_ha_service_status{node="node01",sid="vm:106",state="started"} 1
|
||||
pve_ha_service_status{id="qemu/106",node="node01",state="started"} 1
|
||||
`
|
||||
|
||||
if err := testutil.GatherAndCompare(reg, strings.NewReader(expected),
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue