feat: add cluster_resources collector (16 metrics)
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
2a51e00fe1
commit
a62264edf8
3 changed files with 364 additions and 0 deletions
240
collector/cluster_resources.go
Normal file
240
collector/cluster_resources.go
Normal file
|
|
@ -0,0 +1,240 @@
|
|||
package collector
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"strconv"
|
||||
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
)
|
||||
|
||||
func init() {
|
||||
registerCollector("cluster_resources", func(logger *slog.Logger) Collector {
|
||||
return newClusterResourcesCollector(logger)
|
||||
})
|
||||
}
|
||||
|
||||
// clusterResourcesCollector scrapes the Proxmox /cluster/resources endpoint
// and exposes per-resource metrics for nodes, guests (qemu/lxc), and storage.
type clusterResourcesCollector struct {
	logger *slog.Logger // structured logger injected at construction time
}
|
||||
|
||||
func newClusterResourcesCollector(logger *slog.Logger) *clusterResourcesCollector {
|
||||
return &clusterResourcesCollector{logger: logger}
|
||||
}
|
||||
|
||||
// resourceEntry is one element of the /cluster/resources API response.
// The fields are a union across resource types: node, qemu, lxc, and storage
// entries each populate only a subset, and unused fields stay at their zero
// values.
type resourceEntry struct {
	Type       string  `json:"type"`       // resource kind: "node", "qemu", "lxc", "storage", ...
	ID         string  `json:"id"`         // unique resource id, e.g. "qemu/100" or "storage/node01/local-lvm"
	Node       string  `json:"node"`       // name of the node hosting the resource
	Name       string  `json:"name"`       // guest display name (guests only)
	Status     string  `json:"status"`     // e.g. "online" for nodes, "running"/"stopped" for guests
	VMID       int     `json:"vmid"`       // numeric guest id (guests only)
	CPU        float64 `json:"cpu"`        // current CPU usage ratio
	MaxCPU     float64 `json:"maxcpu"`     // CPU limit (number of CPUs)
	Mem        float64 `json:"mem"`        // memory usage in bytes
	MaxMem     float64 `json:"maxmem"`     // memory size in bytes
	Disk       float64 `json:"disk"`       // disk usage in bytes
	MaxDisk    float64 `json:"maxdisk"`    // disk size in bytes
	Uptime     float64 `json:"uptime"`     // uptime in seconds
	NetIn      float64 `json:"netin"`      // cumulative bytes received (guests only)
	NetOut     float64 `json:"netout"`     // cumulative bytes transmitted (guests only)
	DiskRead   float64 `json:"diskread"`   // cumulative bytes read from disk (guests only)
	DiskWrite  float64 `json:"diskwrite"`  // cumulative bytes written to disk (guests only)
	Template   int     `json:"template"`   // 1 when the guest is a template
	HAState    string  `json:"hastate"`    // HA manager state; empty when not HA-managed
	Tags       string  `json:"tags"`       // semicolon-separated guest tags
	Lock       string  `json:"lock"`       // active lock reason (e.g. "backup"); empty when unlocked
	Storage    string  `json:"storage"`    // storage name (storage entries only)
	PluginType string  `json:"plugintype"` // storage plugin type, e.g. "lvmthin" (storage entries only)
	Content    string  `json:"content"`    // comma-separated storage content types (storage entries only)
	Shared     int     `json:"shared"`     // 1 when the storage is shared, 0 when local
}
|
||||
|
||||
// resourceResponse is the top-level envelope of the /cluster/resources API
// response; the Proxmox API wraps its payload in a "data" field.
type resourceResponse struct {
	Data []resourceEntry `json:"data"`
}
|
||||
|
||||
var (
|
||||
upDesc = prometheus.NewDesc(
|
||||
prometheus.BuildFQName(namespace, "", "up"),
|
||||
"Whether the resource is up (1) or down (0).",
|
||||
[]string{"id"},
|
||||
nil,
|
||||
)
|
||||
cpuUsageRatioDesc = prometheus.NewDesc(
|
||||
prometheus.BuildFQName(namespace, "", "cpu_usage_ratio"),
|
||||
"CPU usage ratio.",
|
||||
[]string{"id"},
|
||||
nil,
|
||||
)
|
||||
cpuUsageLimitDesc = prometheus.NewDesc(
|
||||
prometheus.BuildFQName(namespace, "", "cpu_usage_limit"),
|
||||
"CPU usage limit (number of CPUs).",
|
||||
[]string{"id"},
|
||||
nil,
|
||||
)
|
||||
memoryUsageBytesDesc = prometheus.NewDesc(
|
||||
prometheus.BuildFQName(namespace, "", "memory_usage_bytes"),
|
||||
"Memory usage in bytes.",
|
||||
[]string{"id"},
|
||||
nil,
|
||||
)
|
||||
memorySizeBytesDesc = prometheus.NewDesc(
|
||||
prometheus.BuildFQName(namespace, "", "memory_size_bytes"),
|
||||
"Memory size in bytes.",
|
||||
[]string{"id"},
|
||||
nil,
|
||||
)
|
||||
diskUsageBytesDesc = prometheus.NewDesc(
|
||||
prometheus.BuildFQName(namespace, "", "disk_usage_bytes"),
|
||||
"Disk usage in bytes.",
|
||||
[]string{"id"},
|
||||
nil,
|
||||
)
|
||||
diskSizeBytesDesc = prometheus.NewDesc(
|
||||
prometheus.BuildFQName(namespace, "", "disk_size_bytes"),
|
||||
"Disk size in bytes.",
|
||||
[]string{"id"},
|
||||
nil,
|
||||
)
|
||||
uptimeSecondsDesc = prometheus.NewDesc(
|
||||
prometheus.BuildFQName(namespace, "", "uptime_seconds"),
|
||||
"Uptime in seconds.",
|
||||
[]string{"id"},
|
||||
nil,
|
||||
)
|
||||
networkTransmitBytesTotalDesc = prometheus.NewDesc(
|
||||
prometheus.BuildFQName(namespace, "", "network_transmit_bytes_total"),
|
||||
"Total bytes transmitted over the network.",
|
||||
[]string{"id"},
|
||||
nil,
|
||||
)
|
||||
networkReceiveBytesTotalDesc = prometheus.NewDesc(
|
||||
prometheus.BuildFQName(namespace, "", "network_receive_bytes_total"),
|
||||
"Total bytes received over the network.",
|
||||
[]string{"id"},
|
||||
nil,
|
||||
)
|
||||
diskWrittenBytesTotalDesc = prometheus.NewDesc(
|
||||
prometheus.BuildFQName(namespace, "", "disk_written_bytes_total"),
|
||||
"Total bytes written to disk.",
|
||||
[]string{"id"},
|
||||
nil,
|
||||
)
|
||||
diskReadBytesTotalDesc = prometheus.NewDesc(
|
||||
prometheus.BuildFQName(namespace, "", "disk_read_bytes_total"),
|
||||
"Total bytes read from disk.",
|
||||
[]string{"id"},
|
||||
nil,
|
||||
)
|
||||
guestInfoDesc = prometheus.NewDesc(
|
||||
prometheus.BuildFQName(namespace, "", "guest_info"),
|
||||
"Information about a guest (VM or container).",
|
||||
[]string{"id", "node", "name", "type", "template", "tags"},
|
||||
nil,
|
||||
)
|
||||
haStateDesc = prometheus.NewDesc(
|
||||
prometheus.BuildFQName(namespace, "", "ha_state"),
|
||||
"HA manager state of the resource.",
|
||||
[]string{"id", "state"},
|
||||
nil,
|
||||
)
|
||||
lockStateDesc = prometheus.NewDesc(
|
||||
prometheus.BuildFQName(namespace, "", "lock_state"),
|
||||
"Lock state of the resource.",
|
||||
[]string{"id", "state"},
|
||||
nil,
|
||||
)
|
||||
storageSharedDesc = prometheus.NewDesc(
|
||||
prometheus.BuildFQName(namespace, "", "storage_shared"),
|
||||
"Whether the storage is shared (1) or local (0).",
|
||||
[]string{"id"},
|
||||
nil,
|
||||
)
|
||||
storageInfoDesc = prometheus.NewDesc(
|
||||
prometheus.BuildFQName(namespace, "", "storage_info"),
|
||||
"Information about a storage resource.",
|
||||
[]string{"id", "node", "storage", "plugintype", "content"},
|
||||
nil,
|
||||
)
|
||||
)
|
||||
|
||||
func (c *clusterResourcesCollector) Update(client *Client, ch chan<- prometheus.Metric) error {
|
||||
body, err := client.Get("/cluster/resources")
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to get /cluster/resources: %w", err)
|
||||
}
|
||||
|
||||
var resp resourceResponse
|
||||
if err := json.Unmarshal(body, &resp); err != nil {
|
||||
return fmt.Errorf("failed to parse /cluster/resources response: %w", err)
|
||||
}
|
||||
|
||||
for _, entry := range resp.Data {
|
||||
switch entry.Type {
|
||||
case "node":
|
||||
c.emitNode(ch, entry)
|
||||
case "qemu", "lxc":
|
||||
c.emitGuest(ch, entry)
|
||||
case "storage":
|
||||
c.emitStorage(ch, entry)
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (c *clusterResourcesCollector) emitNode(ch chan<- prometheus.Metric, e resourceEntry) {
|
||||
var up float64
|
||||
if e.Status == "online" {
|
||||
up = 1
|
||||
}
|
||||
ch <- prometheus.MustNewConstMetric(upDesc, prometheus.GaugeValue, up, e.ID)
|
||||
ch <- prometheus.MustNewConstMetric(cpuUsageRatioDesc, prometheus.GaugeValue, e.CPU, e.ID)
|
||||
ch <- prometheus.MustNewConstMetric(cpuUsageLimitDesc, prometheus.GaugeValue, e.MaxCPU, e.ID)
|
||||
ch <- prometheus.MustNewConstMetric(memoryUsageBytesDesc, prometheus.GaugeValue, e.Mem, e.ID)
|
||||
ch <- prometheus.MustNewConstMetric(memorySizeBytesDesc, prometheus.GaugeValue, e.MaxMem, e.ID)
|
||||
ch <- prometheus.MustNewConstMetric(diskUsageBytesDesc, prometheus.GaugeValue, e.Disk, e.ID)
|
||||
ch <- prometheus.MustNewConstMetric(diskSizeBytesDesc, prometheus.GaugeValue, e.MaxDisk, e.ID)
|
||||
ch <- prometheus.MustNewConstMetric(uptimeSecondsDesc, prometheus.GaugeValue, e.Uptime, e.ID)
|
||||
}
|
||||
|
||||
func (c *clusterResourcesCollector) emitGuest(ch chan<- prometheus.Metric, e resourceEntry) {
|
||||
var up float64
|
||||
if e.Status == "running" {
|
||||
up = 1
|
||||
}
|
||||
ch <- prometheus.MustNewConstMetric(upDesc, prometheus.GaugeValue, up, e.ID)
|
||||
ch <- prometheus.MustNewConstMetric(cpuUsageRatioDesc, prometheus.GaugeValue, e.CPU, e.ID)
|
||||
ch <- prometheus.MustNewConstMetric(cpuUsageLimitDesc, prometheus.GaugeValue, e.MaxCPU, e.ID)
|
||||
ch <- prometheus.MustNewConstMetric(memoryUsageBytesDesc, prometheus.GaugeValue, e.Mem, e.ID)
|
||||
ch <- prometheus.MustNewConstMetric(memorySizeBytesDesc, prometheus.GaugeValue, e.MaxMem, e.ID)
|
||||
ch <- prometheus.MustNewConstMetric(diskUsageBytesDesc, prometheus.GaugeValue, e.Disk, e.ID)
|
||||
ch <- prometheus.MustNewConstMetric(diskSizeBytesDesc, prometheus.GaugeValue, e.MaxDisk, e.ID)
|
||||
ch <- prometheus.MustNewConstMetric(uptimeSecondsDesc, prometheus.GaugeValue, e.Uptime, e.ID)
|
||||
ch <- prometheus.MustNewConstMetric(networkTransmitBytesTotalDesc, prometheus.CounterValue, e.NetOut, e.ID)
|
||||
ch <- prometheus.MustNewConstMetric(networkReceiveBytesTotalDesc, prometheus.CounterValue, e.NetIn, e.ID)
|
||||
ch <- prometheus.MustNewConstMetric(diskWrittenBytesTotalDesc, prometheus.CounterValue, e.DiskWrite, e.ID)
|
||||
ch <- prometheus.MustNewConstMetric(diskReadBytesTotalDesc, prometheus.CounterValue, e.DiskRead, e.ID)
|
||||
ch <- prometheus.MustNewConstMetric(guestInfoDesc, prometheus.GaugeValue, 1,
|
||||
e.ID, e.Node, e.Name, e.Type, strconv.Itoa(e.Template), e.Tags,
|
||||
)
|
||||
|
||||
if e.HAState != "" {
|
||||
ch <- prometheus.MustNewConstMetric(haStateDesc, prometheus.GaugeValue, 1, e.ID, e.HAState)
|
||||
}
|
||||
|
||||
if e.Lock != "" {
|
||||
ch <- prometheus.MustNewConstMetric(lockStateDesc, prometheus.GaugeValue, 1, e.ID, e.Lock)
|
||||
}
|
||||
}
|
||||
|
||||
func (c *clusterResourcesCollector) emitStorage(ch chan<- prometheus.Metric, e resourceEntry) {
|
||||
ch <- prometheus.MustNewConstMetric(diskUsageBytesDesc, prometheus.GaugeValue, e.Disk, e.ID)
|
||||
ch <- prometheus.MustNewConstMetric(diskSizeBytesDesc, prometheus.GaugeValue, e.MaxDisk, e.ID)
|
||||
ch <- prometheus.MustNewConstMetric(storageSharedDesc, prometheus.GaugeValue, float64(e.Shared), e.ID)
|
||||
ch <- prometheus.MustNewConstMetric(storageInfoDesc, prometheus.GaugeValue, 1,
|
||||
e.ID, e.Node, e.Storage, e.PluginType, e.Content,
|
||||
)
|
||||
}
|
||||
123
collector/cluster_resources_test.go
Normal file
123
collector/cluster_resources_test.go
Normal file
|
|
@ -0,0 +1,123 @@
|
|||
package collector
|
||||
|
||||
import (
|
||||
"log/slog"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
"github.com/prometheus/client_golang/prometheus/testutil"
|
||||
)
|
||||
|
||||
// TestClusterResourcesCollector scrapes the collector against the canned
// /cluster/resources fixture and compares every emitted metric family
// against a golden text-format exposition.
func TestClusterResourcesCollector(t *testing.T) {
	// Serve the JSON fixture for the single endpoint the collector queries.
	client := newTestClient(t, map[string]string{
		"/cluster/resources": "cluster_resources.json",
	})

	collector := newClusterResourcesCollector(slog.Default())
	adapter := &testCollectorAdapter{client: client, collector: collector}

	reg := prometheus.NewRegistry()
	reg.MustRegister(adapter)

	// Golden output: fixture has one node, one running guest (qemu/100),
	// one stopped locked guest (qemu/101), and one local storage.
	expected := `
# HELP pve_cpu_usage_limit CPU usage limit (number of CPUs).
# TYPE pve_cpu_usage_limit gauge
pve_cpu_usage_limit{id="node/node01"} 16
pve_cpu_usage_limit{id="qemu/100"} 4
pve_cpu_usage_limit{id="qemu/101"} 2
# HELP pve_cpu_usage_ratio CPU usage ratio.
# TYPE pve_cpu_usage_ratio gauge
pve_cpu_usage_ratio{id="node/node01"} 0.05
pve_cpu_usage_ratio{id="qemu/100"} 0.1
pve_cpu_usage_ratio{id="qemu/101"} 0
# HELP pve_disk_read_bytes_total Total bytes read from disk.
# TYPE pve_disk_read_bytes_total counter
pve_disk_read_bytes_total{id="qemu/100"} 5.24288e+06
pve_disk_read_bytes_total{id="qemu/101"} 0
# HELP pve_disk_size_bytes Disk size in bytes.
# TYPE pve_disk_size_bytes gauge
pve_disk_size_bytes{id="node/node01"} 1.073741824e+11
pve_disk_size_bytes{id="qemu/100"} 5.36870912e+10
pve_disk_size_bytes{id="qemu/101"} 3.221225472e+10
pve_disk_size_bytes{id="storage/node01/local-lvm"} 1.073741824e+11
# HELP pve_disk_usage_bytes Disk usage in bytes.
# TYPE pve_disk_usage_bytes gauge
pve_disk_usage_bytes{id="node/node01"} 1.073741824e+10
pve_disk_usage_bytes{id="qemu/100"} 0
pve_disk_usage_bytes{id="qemu/101"} 0
pve_disk_usage_bytes{id="storage/node01/local-lvm"} 2.147483648e+10
# HELP pve_disk_written_bytes_total Total bytes written to disk.
# TYPE pve_disk_written_bytes_total counter
pve_disk_written_bytes_total{id="qemu/100"} 1.048576e+06
pve_disk_written_bytes_total{id="qemu/101"} 0
# HELP pve_guest_info Information about a guest (VM or container).
# TYPE pve_guest_info gauge
pve_guest_info{id="qemu/100",name="web-server",node="node01",tags="prod;web",template="0",type="qemu"} 1
pve_guest_info{id="qemu/101",name="db-backup",node="node01",tags="",template="0",type="qemu"} 1
# HELP pve_ha_state HA manager state of the resource.
# TYPE pve_ha_state gauge
pve_ha_state{id="qemu/100",state="started"} 1
# HELP pve_lock_state Lock state of the resource.
# TYPE pve_lock_state gauge
pve_lock_state{id="qemu/101",state="backup"} 1
# HELP pve_memory_size_bytes Memory size in bytes.
# TYPE pve_memory_size_bytes gauge
pve_memory_size_bytes{id="node/node01"} 3.4359738368e+10
pve_memory_size_bytes{id="qemu/100"} 4.294967296e+09
pve_memory_size_bytes{id="qemu/101"} 2.147483648e+09
# HELP pve_memory_usage_bytes Memory usage in bytes.
# TYPE pve_memory_usage_bytes gauge
pve_memory_usage_bytes{id="node/node01"} 8.589934592e+09
pve_memory_usage_bytes{id="qemu/100"} 2.147483648e+09
pve_memory_usage_bytes{id="qemu/101"} 0
# HELP pve_network_receive_bytes_total Total bytes received over the network.
# TYPE pve_network_receive_bytes_total counter
pve_network_receive_bytes_total{id="qemu/100"} 1.048576e+06
pve_network_receive_bytes_total{id="qemu/101"} 0
# HELP pve_network_transmit_bytes_total Total bytes transmitted over the network.
# TYPE pve_network_transmit_bytes_total counter
pve_network_transmit_bytes_total{id="qemu/100"} 2.097152e+06
pve_network_transmit_bytes_total{id="qemu/101"} 0
# HELP pve_storage_info Information about a storage resource.
# TYPE pve_storage_info gauge
pve_storage_info{content="images,rootdir",id="storage/node01/local-lvm",node="node01",plugintype="lvmthin",storage="local-lvm"} 1
# HELP pve_storage_shared Whether the storage is shared (1) or local (0).
# TYPE pve_storage_shared gauge
pve_storage_shared{id="storage/node01/local-lvm"} 0
# HELP pve_up Whether the resource is up (1) or down (0).
# TYPE pve_up gauge
pve_up{id="node/node01"} 1
pve_up{id="qemu/100"} 1
pve_up{id="qemu/101"} 0
# HELP pve_uptime_seconds Uptime in seconds.
# TYPE pve_uptime_seconds gauge
pve_uptime_seconds{id="node/node01"} 123456
pve_uptime_seconds{id="qemu/100"} 3600
pve_uptime_seconds{id="qemu/101"} 0
`

	// Restrict the comparison to the families this collector owns, so
	// unrelated registry metrics cannot cause false failures.
	metricNames := []string{
		"pve_up",
		"pve_cpu_usage_ratio",
		"pve_cpu_usage_limit",
		"pve_memory_usage_bytes",
		"pve_memory_size_bytes",
		"pve_disk_usage_bytes",
		"pve_disk_size_bytes",
		"pve_uptime_seconds",
		"pve_network_transmit_bytes_total",
		"pve_network_receive_bytes_total",
		"pve_disk_written_bytes_total",
		"pve_disk_read_bytes_total",
		"pve_guest_info",
		"pve_ha_state",
		"pve_lock_state",
		"pve_storage_shared",
		"pve_storage_info",
	}

	if err := testutil.GatherAndCompare(reg, strings.NewReader(expected), metricNames...); err != nil {
		t.Errorf("unexpected metrics: %s", err)
	}
}
|
||||
1
collector/fixtures/cluster_resources.json
Normal file
1
collector/fixtures/cluster_resources.json
Normal file
|
|
@ -0,0 +1 @@
|
|||
{"data":[{"type":"node","id":"node/node01","node":"node01","status":"online","cpu":0.05,"maxcpu":16,"mem":8589934592,"maxmem":34359738368,"disk":10737418240,"maxdisk":107374182400,"uptime":123456},{"type":"qemu","id":"qemu/100","node":"node01","name":"web-server","status":"running","vmid":100,"cpu":0.1,"maxcpu":4,"mem":2147483648,"maxmem":4294967296,"disk":0,"maxdisk":53687091200,"uptime":3600,"netin":1048576,"netout":2097152,"diskread":5242880,"diskwrite":1048576,"template":0,"hastate":"started","tags":"prod;web","lock":""},{"type":"qemu","id":"qemu/101","node":"node01","name":"db-backup","status":"stopped","vmid":101,"cpu":0,"maxcpu":2,"mem":0,"maxmem":2147483648,"disk":0,"maxdisk":32212254720,"uptime":0,"netin":0,"netout":0,"diskread":0,"diskwrite":0,"template":0,"hastate":"","tags":"","lock":"backup"},{"type":"storage","id":"storage/node01/local-lvm","node":"node01","storage":"local-lvm","status":"available","disk":21474836480,"maxdisk":107374182400,"plugintype":"lvmthin","content":"images,rootdir","shared":0}]}
|
||||
Loading…
Add table
Add a link
Reference in a new issue