feat: add cluster_resources collector (17 metrics)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Davíð Steinn Geirsson 2026-03-20 11:33:03 +00:00
parent 2a51e00fe1
commit a62264edf8
3 changed files with 364 additions and 0 deletions

View file

@ -0,0 +1,240 @@
package collector
import (
"encoding/json"
"fmt"
"log/slog"
"strconv"
"github.com/prometheus/client_golang/prometheus"
)
func init() {
registerCollector("cluster_resources", func(logger *slog.Logger) Collector {
return newClusterResourcesCollector(logger)
})
}
// clusterResourcesCollector exports metrics derived from the
// /cluster/resources endpoint: one group of series per node, guest
// (qemu or lxc) and storage entry found in the response.
type clusterResourcesCollector struct {
// logger is the logger supplied to newClusterResourcesCollector.
logger *slog.Logger
}
// newClusterResourcesCollector returns a collector that keeps the given
// logger. No other state is initialised.
func newClusterResourcesCollector(logger *slog.Logger) *clusterResourcesCollector {
	c := new(clusterResourcesCollector)
	c.logger = logger
	return c
}
// resourceEntry is one element of the "data" array in the /cluster/resources
// response. The same shape is decoded for every resource type, so any key that
// is absent from a given entry leaves the matching field at its zero value.
type resourceEntry struct {
	// Identity of the resource.
	ID   string `json:"id"`   // used as the "id" label on every emitted series
	Type string `json:"type"` // Update dispatches on "node", "qemu", "lxc" and "storage"
	Node string `json:"node"`
	Name string `json:"name"`
	VMID int    `json:"vmid"`

	// Current state.
	Status  string `json:"status"` // "online" (nodes) / "running" (guests) map to up=1
	HAState string `json:"hastate"`
	Lock    string `json:"lock"`
	Uptime  float64 `json:"uptime"`

	// Usage and capacity, exported as gauges.
	CPU     float64 `json:"cpu"`
	MaxCPU  float64 `json:"maxcpu"`
	Mem     float64 `json:"mem"`
	MaxMem  float64 `json:"maxmem"`
	Disk    float64 `json:"disk"`
	MaxDisk float64 `json:"maxdisk"`

	// Cumulative I/O, exported as counters for guests.
	NetIn     float64 `json:"netin"`
	NetOut    float64 `json:"netout"`
	DiskRead  float64 `json:"diskread"`
	DiskWrite float64 `json:"diskwrite"`

	// Guest descriptive fields, used as labels on the guest info series.
	Template int    `json:"template"`
	Tags     string `json:"tags"`

	// Storage descriptive fields, used by the storage series.
	Storage    string `json:"storage"`
	PluginType string `json:"plugintype"`
	Content    string `json:"content"`
	Shared     int    `json:"shared"`
}
// resourceResponse is the top-level envelope of the /cluster/resources reply;
// the list of resources is decoded from its "data" key.
type resourceResponse struct {
Data []resourceEntry `json:"data"`
}
// newResourceDesc builds the descriptor for one cluster_resources metric.
// The metric name is qualified with the package namespace and an empty
// subsystem; no constant labels are attached.
func newResourceDesc(name, help string, labels ...string) *prometheus.Desc {
	return prometheus.NewDesc(prometheus.BuildFQName(namespace, "", name), help, labels, nil)
}

// Descriptors for every series emitted by clusterResourcesCollector. All of
// them carry the resource "id" label; the info and state series add more.
var (
	// Gauges emitted for nodes and guests (disk gauges also for storage).
	upDesc               = newResourceDesc("up", "Whether the resource is up (1) or down (0).", "id")
	cpuUsageRatioDesc    = newResourceDesc("cpu_usage_ratio", "CPU usage ratio.", "id")
	cpuUsageLimitDesc    = newResourceDesc("cpu_usage_limit", "CPU usage limit (number of CPUs).", "id")
	memoryUsageBytesDesc = newResourceDesc("memory_usage_bytes", "Memory usage in bytes.", "id")
	memorySizeBytesDesc  = newResourceDesc("memory_size_bytes", "Memory size in bytes.", "id")
	diskUsageBytesDesc   = newResourceDesc("disk_usage_bytes", "Disk usage in bytes.", "id")
	diskSizeBytesDesc    = newResourceDesc("disk_size_bytes", "Disk size in bytes.", "id")
	uptimeSecondsDesc    = newResourceDesc("uptime_seconds", "Uptime in seconds.", "id")

	// Counters emitted for guests only.
	networkTransmitBytesTotalDesc = newResourceDesc("network_transmit_bytes_total", "Total bytes transmitted over the network.", "id")
	networkReceiveBytesTotalDesc  = newResourceDesc("network_receive_bytes_total", "Total bytes received over the network.", "id")
	diskWrittenBytesTotalDesc     = newResourceDesc("disk_written_bytes_total", "Total bytes written to disk.", "id")
	diskReadBytesTotalDesc        = newResourceDesc("disk_read_bytes_total", "Total bytes read from disk.", "id")

	// Guest info and state series; the state series are only emitted when the
	// corresponding field is non-empty.
	guestInfoDesc = newResourceDesc("guest_info", "Information about a guest (VM or container).",
		"id", "node", "name", "type", "template", "tags")
	haStateDesc   = newResourceDesc("ha_state", "HA manager state of the resource.", "id", "state")
	lockStateDesc = newResourceDesc("lock_state", "Lock state of the resource.", "id", "state")

	// Storage-only series.
	storageSharedDesc = newResourceDesc("storage_shared", "Whether the storage is shared (1) or local (0).", "id")
	storageInfoDesc   = newResourceDesc("storage_info", "Information about a storage resource.",
		"id", "node", "storage", "plugintype", "content")
)
// Update fetches /cluster/resources through client, decodes the response and
// sends the metrics for every node, guest (qemu or lxc) and storage entry to
// ch. Entries of any other type are ignored.
//
// It returns a wrapped error when the request or the JSON decoding fails; in
// that case nothing has been sent to ch.
func (c *clusterResourcesCollector) Update(client *Client, ch chan<- prometheus.Metric) error {
	payload, err := client.Get("/cluster/resources")
	if err != nil {
		return fmt.Errorf("failed to get /cluster/resources: %w", err)
	}

	parsed := resourceResponse{}
	if decodeErr := json.Unmarshal(payload, &parsed); decodeErr != nil {
		return fmt.Errorf("failed to parse /cluster/resources response: %w", decodeErr)
	}

	for i := range parsed.Data {
		res := parsed.Data[i]
		switch kind := res.Type; {
		case kind == "node":
			c.emitNode(ch, res)
		case kind == "qemu" || kind == "lxc":
			c.emitGuest(ch, res)
		case kind == "storage":
			c.emitStorage(ch, res)
		}
	}

	return nil
}
// emitNode sends the gauge series for a node entry to ch, each labelled with
// the entry ID. The up gauge is 1 only when the status is "online".
func (c *clusterResourcesCollector) emitNode(ch chan<- prometheus.Metric, e resourceEntry) {
	online := 0.0
	if e.Status == "online" {
		online = 1
	}

	// Listed in emission order.
	gauges := []struct {
		desc  *prometheus.Desc
		value float64
	}{
		{upDesc, online},
		{cpuUsageRatioDesc, e.CPU},
		{cpuUsageLimitDesc, e.MaxCPU},
		{memoryUsageBytesDesc, e.Mem},
		{memorySizeBytesDesc, e.MaxMem},
		{diskUsageBytesDesc, e.Disk},
		{diskSizeBytesDesc, e.MaxDisk},
		{uptimeSecondsDesc, e.Uptime},
	}
	for _, g := range gauges {
		ch <- prometheus.MustNewConstMetric(g.desc, prometheus.GaugeValue, g.value, e.ID)
	}
}
// emitGuest sends the series for a guest (qemu or lxc) entry to ch: the
// resource gauges, the network and disk I/O counters, one info series, and,
// when the respective field is non-empty, the HA and lock state series. The
// up gauge is 1 only when the status is "running".
func (c *clusterResourcesCollector) emitGuest(ch chan<- prometheus.Metric, e resourceEntry) {
	running := 0.0
	if e.Status == "running" {
		running = 1
	}

	// Series that carry only the "id" label, listed in emission order.
	samples := []struct {
		desc  *prometheus.Desc
		kind  prometheus.ValueType
		value float64
	}{
		{upDesc, prometheus.GaugeValue, running},
		{cpuUsageRatioDesc, prometheus.GaugeValue, e.CPU},
		{cpuUsageLimitDesc, prometheus.GaugeValue, e.MaxCPU},
		{memoryUsageBytesDesc, prometheus.GaugeValue, e.Mem},
		{memorySizeBytesDesc, prometheus.GaugeValue, e.MaxMem},
		{diskUsageBytesDesc, prometheus.GaugeValue, e.Disk},
		{diskSizeBytesDesc, prometheus.GaugeValue, e.MaxDisk},
		{uptimeSecondsDesc, prometheus.GaugeValue, e.Uptime},
		{networkTransmitBytesTotalDesc, prometheus.CounterValue, e.NetOut},
		{networkReceiveBytesTotalDesc, prometheus.CounterValue, e.NetIn},
		{diskWrittenBytesTotalDesc, prometheus.CounterValue, e.DiskWrite},
		{diskReadBytesTotalDesc, prometheus.CounterValue, e.DiskRead},
	}
	for _, s := range samples {
		ch <- prometheus.MustNewConstMetric(s.desc, s.kind, s.value, e.ID)
	}

	// Label values follow the order declared on guestInfoDesc.
	templateFlag := strconv.Itoa(e.Template)
	ch <- prometheus.MustNewConstMetric(guestInfoDesc, prometheus.GaugeValue, 1,
		e.ID, e.Node, e.Name, e.Type, templateFlag, e.Tags)

	if state := e.HAState; state != "" {
		ch <- prometheus.MustNewConstMetric(haStateDesc, prometheus.GaugeValue, 1, e.ID, state)
	}
	if lock := e.Lock; lock != "" {
		ch <- prometheus.MustNewConstMetric(lockStateDesc, prometheus.GaugeValue, 1, e.ID, lock)
	}
}
// emitStorage sends the series for a storage entry to ch: disk usage and
// size, the shared flag as a gauge, and one info series.
func (c *clusterResourcesCollector) emitStorage(ch chan<- prometheus.Metric, e resourceEntry) {
	gauge := func(desc *prometheus.Desc, value float64, labels ...string) prometheus.Metric {
		return prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, value, labels...)
	}

	id := e.ID
	ch <- gauge(diskUsageBytesDesc, e.Disk, id)
	ch <- gauge(diskSizeBytesDesc, e.MaxDisk, id)
	ch <- gauge(storageSharedDesc, float64(e.Shared), id)
	// Label values follow the order declared on storageInfoDesc.
	ch <- gauge(storageInfoDesc, 1, id, e.Node, e.Storage, e.PluginType, e.Content)
}

View file

@ -0,0 +1,123 @@
package collector
import (
"log/slog"
"strings"
"testing"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/testutil"
)
// TestClusterResourcesCollector runs the cluster_resources collector against
// a test client that maps /cluster/resources to cluster_resources.json
// (presumably a fixture file name; see newTestClient) and compares the
// gathered metrics with the expected exposition text below.
func TestClusterResourcesCollector(t *testing.T) {
	const golden = `
# HELP pve_cpu_usage_limit CPU usage limit (number of CPUs).
# TYPE pve_cpu_usage_limit gauge
pve_cpu_usage_limit{id="node/node01"} 16
pve_cpu_usage_limit{id="qemu/100"} 4
pve_cpu_usage_limit{id="qemu/101"} 2
# HELP pve_cpu_usage_ratio CPU usage ratio.
# TYPE pve_cpu_usage_ratio gauge
pve_cpu_usage_ratio{id="node/node01"} 0.05
pve_cpu_usage_ratio{id="qemu/100"} 0.1
pve_cpu_usage_ratio{id="qemu/101"} 0
# HELP pve_disk_read_bytes_total Total bytes read from disk.
# TYPE pve_disk_read_bytes_total counter
pve_disk_read_bytes_total{id="qemu/100"} 5.24288e+06
pve_disk_read_bytes_total{id="qemu/101"} 0
# HELP pve_disk_size_bytes Disk size in bytes.
# TYPE pve_disk_size_bytes gauge
pve_disk_size_bytes{id="node/node01"} 1.073741824e+11
pve_disk_size_bytes{id="qemu/100"} 5.36870912e+10
pve_disk_size_bytes{id="qemu/101"} 3.221225472e+10
pve_disk_size_bytes{id="storage/node01/local-lvm"} 1.073741824e+11
# HELP pve_disk_usage_bytes Disk usage in bytes.
# TYPE pve_disk_usage_bytes gauge
pve_disk_usage_bytes{id="node/node01"} 1.073741824e+10
pve_disk_usage_bytes{id="qemu/100"} 0
pve_disk_usage_bytes{id="qemu/101"} 0
pve_disk_usage_bytes{id="storage/node01/local-lvm"} 2.147483648e+10
# HELP pve_disk_written_bytes_total Total bytes written to disk.
# TYPE pve_disk_written_bytes_total counter
pve_disk_written_bytes_total{id="qemu/100"} 1.048576e+06
pve_disk_written_bytes_total{id="qemu/101"} 0
# HELP pve_guest_info Information about a guest (VM or container).
# TYPE pve_guest_info gauge
pve_guest_info{id="qemu/100",name="web-server",node="node01",tags="prod;web",template="0",type="qemu"} 1
pve_guest_info{id="qemu/101",name="db-backup",node="node01",tags="",template="0",type="qemu"} 1
# HELP pve_ha_state HA manager state of the resource.
# TYPE pve_ha_state gauge
pve_ha_state{id="qemu/100",state="started"} 1
# HELP pve_lock_state Lock state of the resource.
# TYPE pve_lock_state gauge
pve_lock_state{id="qemu/101",state="backup"} 1
# HELP pve_memory_size_bytes Memory size in bytes.
# TYPE pve_memory_size_bytes gauge
pve_memory_size_bytes{id="node/node01"} 3.4359738368e+10
pve_memory_size_bytes{id="qemu/100"} 4.294967296e+09
pve_memory_size_bytes{id="qemu/101"} 2.147483648e+09
# HELP pve_memory_usage_bytes Memory usage in bytes.
# TYPE pve_memory_usage_bytes gauge
pve_memory_usage_bytes{id="node/node01"} 8.589934592e+09
pve_memory_usage_bytes{id="qemu/100"} 2.147483648e+09
pve_memory_usage_bytes{id="qemu/101"} 0
# HELP pve_network_receive_bytes_total Total bytes received over the network.
# TYPE pve_network_receive_bytes_total counter
pve_network_receive_bytes_total{id="qemu/100"} 1.048576e+06
pve_network_receive_bytes_total{id="qemu/101"} 0
# HELP pve_network_transmit_bytes_total Total bytes transmitted over the network.
# TYPE pve_network_transmit_bytes_total counter
pve_network_transmit_bytes_total{id="qemu/100"} 2.097152e+06
pve_network_transmit_bytes_total{id="qemu/101"} 0
# HELP pve_storage_info Information about a storage resource.
# TYPE pve_storage_info gauge
pve_storage_info{content="images,rootdir",id="storage/node01/local-lvm",node="node01",plugintype="lvmthin",storage="local-lvm"} 1
# HELP pve_storage_shared Whether the storage is shared (1) or local (0).
# TYPE pve_storage_shared gauge
pve_storage_shared{id="storage/node01/local-lvm"} 0
# HELP pve_up Whether the resource is up (1) or down (0).
# TYPE pve_up gauge
pve_up{id="node/node01"} 1
pve_up{id="qemu/100"} 1
pve_up{id="qemu/101"} 0
# HELP pve_uptime_seconds Uptime in seconds.
# TYPE pve_uptime_seconds gauge
pve_uptime_seconds{id="node/node01"} 123456
pve_uptime_seconds{id="qemu/100"} 3600
pve_uptime_seconds{id="qemu/101"} 0
`

	// Only these metric families are compared.
	compared := []string{
		"pve_up",
		"pve_cpu_usage_ratio",
		"pve_cpu_usage_limit",
		"pve_memory_usage_bytes",
		"pve_memory_size_bytes",
		"pve_disk_usage_bytes",
		"pve_disk_size_bytes",
		"pve_uptime_seconds",
		"pve_network_transmit_bytes_total",
		"pve_network_receive_bytes_total",
		"pve_disk_written_bytes_total",
		"pve_disk_read_bytes_total",
		"pve_guest_info",
		"pve_ha_state",
		"pve_lock_state",
		"pve_storage_shared",
		"pve_storage_info",
	}

	fixtures := map[string]string{
		"/cluster/resources": "cluster_resources.json",
	}
	pveClient := newTestClient(t, fixtures)

	registry := prometheus.NewRegistry()
	registry.MustRegister(&testCollectorAdapter{
		client:    pveClient,
		collector: newClusterResourcesCollector(slog.Default()),
	})

	err := testutil.GatherAndCompare(registry, strings.NewReader(golden), compared...)
	if err != nil {
		t.Errorf("unexpected metrics: %s", err)
	}
}

View file

@ -0,0 +1 @@
{"data":[{"type":"node","id":"node/node01","node":"node01","status":"online","cpu":0.05,"maxcpu":16,"mem":8589934592,"maxmem":34359738368,"disk":10737418240,"maxdisk":107374182400,"uptime":123456},{"type":"qemu","id":"qemu/100","node":"node01","name":"web-server","status":"running","vmid":100,"cpu":0.1,"maxcpu":4,"mem":2147483648,"maxmem":4294967296,"disk":0,"maxdisk":53687091200,"uptime":3600,"netin":1048576,"netout":2097152,"diskread":5242880,"diskwrite":1048576,"template":0,"hastate":"started","tags":"prod;web","lock":""},{"type":"qemu","id":"qemu/101","node":"node01","name":"db-backup","status":"stopped","vmid":101,"cpu":0,"maxcpu":2,"mem":0,"maxmem":2147483648,"disk":0,"maxdisk":32212254720,"uptime":0,"netin":0,"netout":0,"diskread":0,"diskwrite":0,"template":0,"hastate":"","tags":"","lock":"backup"},{"type":"storage","id":"storage/node01/local-lvm","node":"node01","storage":"local-lvm","status":"available","disk":21474836480,"maxdisk":107374182400,"plugintype":"lvmthin","content":"images,rootdir","shared":0}]}