diff --git a/collector/cluster_resources.go b/collector/cluster_resources.go
new file mode 100644
index 0000000..f86dd7f
--- /dev/null
+++ b/collector/cluster_resources.go
@@ -0,0 +1,240 @@
+package collector
+
+import (
+	"encoding/json"
+	"fmt"
+	"log/slog"
+	"strconv"
+
+	"github.com/prometheus/client_golang/prometheus"
+)
+
+func init() {
+	registerCollector("cluster_resources", func(logger *slog.Logger) Collector {
+		return newClusterResourcesCollector(logger)
+	})
+}
+
+type clusterResourcesCollector struct {
+	logger *slog.Logger
+}
+
+func newClusterResourcesCollector(logger *slog.Logger) *clusterResourcesCollector {
+	return &clusterResourcesCollector{logger: logger}
+}
+
+type resourceEntry struct {
+	Type       string  `json:"type"`
+	ID         string  `json:"id"`
+	Node       string  `json:"node"`
+	Name       string  `json:"name"`
+	Status     string  `json:"status"`
+	VMID       int     `json:"vmid"`
+	CPU        float64 `json:"cpu"`
+	MaxCPU     float64 `json:"maxcpu"`
+	Mem        float64 `json:"mem"`
+	MaxMem     float64 `json:"maxmem"`
+	Disk       float64 `json:"disk"`
+	MaxDisk    float64 `json:"maxdisk"`
+	Uptime     float64 `json:"uptime"`
+	NetIn      float64 `json:"netin"`
+	NetOut     float64 `json:"netout"`
+	DiskRead   float64 `json:"diskread"`
+	DiskWrite  float64 `json:"diskwrite"`
+	Template   int     `json:"template"`
+	HAState    string  `json:"hastate"`
+	Tags       string  `json:"tags"`
+	Lock       string  `json:"lock"`
+	Storage    string  `json:"storage"`
+	PluginType string  `json:"plugintype"`
+	Content    string  `json:"content"`
+	Shared     int     `json:"shared"`
+}
+
+type resourceResponse struct {
+	Data []resourceEntry `json:"data"`
+}
+
+var (
+	upDesc = prometheus.NewDesc(
+		prometheus.BuildFQName(namespace, "", "up"),
+		"Whether the resource is up (1) or down (0).",
+		[]string{"id"},
+		nil,
+	)
+	cpuUsageRatioDesc = prometheus.NewDesc(
+		prometheus.BuildFQName(namespace, "", "cpu_usage_ratio"),
+		"CPU usage ratio.",
+		[]string{"id"},
+		nil,
+	)
+	cpuUsageLimitDesc = prometheus.NewDesc(
+		prometheus.BuildFQName(namespace, "", "cpu_usage_limit"),
+		"CPU usage limit (number of CPUs).",
+		[]string{"id"},
+		nil,
+	)
+	memoryUsageBytesDesc = prometheus.NewDesc(
+		prometheus.BuildFQName(namespace, "", "memory_usage_bytes"),
+		"Memory usage in bytes.",
+		[]string{"id"},
+		nil,
+	)
+	memorySizeBytesDesc = prometheus.NewDesc(
+		prometheus.BuildFQName(namespace, "", "memory_size_bytes"),
+		"Memory size in bytes.",
+		[]string{"id"},
+		nil,
+	)
+	diskUsageBytesDesc = prometheus.NewDesc(
+		prometheus.BuildFQName(namespace, "", "disk_usage_bytes"),
+		"Disk usage in bytes.",
+		[]string{"id"},
+		nil,
+	)
+	diskSizeBytesDesc = prometheus.NewDesc(
+		prometheus.BuildFQName(namespace, "", "disk_size_bytes"),
+		"Disk size in bytes.",
+		[]string{"id"},
+		nil,
+	)
+	uptimeSecondsDesc = prometheus.NewDesc(
+		prometheus.BuildFQName(namespace, "", "uptime_seconds"),
+		"Uptime in seconds.",
+		[]string{"id"},
+		nil,
+	)
+	networkTransmitBytesTotalDesc = prometheus.NewDesc(
+		prometheus.BuildFQName(namespace, "", "network_transmit_bytes_total"),
+		"Total bytes transmitted over the network.",
+		[]string{"id"},
+		nil,
+	)
+	networkReceiveBytesTotalDesc = prometheus.NewDesc(
+		prometheus.BuildFQName(namespace, "", "network_receive_bytes_total"),
+		"Total bytes received over the network.",
+		[]string{"id"},
+		nil,
+	)
+	diskWrittenBytesTotalDesc = prometheus.NewDesc(
+		prometheus.BuildFQName(namespace, "", "disk_written_bytes_total"),
+		"Total bytes written to disk.",
+		[]string{"id"},
+		nil,
+	)
+	diskReadBytesTotalDesc = prometheus.NewDesc(
+		prometheus.BuildFQName(namespace, "", "disk_read_bytes_total"),
+		"Total bytes read from disk.",
+		[]string{"id"},
+		nil,
+	)
+	guestInfoDesc = prometheus.NewDesc(
+		prometheus.BuildFQName(namespace, "", "guest_info"),
+		"Information about a guest (VM or container).",
+		[]string{"id", "node", "name", "type", "template", "tags"},
+		nil,
+	)
+	haStateDesc = prometheus.NewDesc(
+		prometheus.BuildFQName(namespace, "", "ha_state"),
+		"HA manager state of the resource.",
+		[]string{"id", "state"},
+		nil,
+	)
+	lockStateDesc = prometheus.NewDesc(
+		prometheus.BuildFQName(namespace, "", "lock_state"),
+		"Lock state of the resource.",
+		[]string{"id", "state"},
+		nil,
+	)
+	storageSharedDesc = prometheus.NewDesc(
+		prometheus.BuildFQName(namespace, "", "storage_shared"),
+		"Whether the storage is shared (1) or local (0).",
+		[]string{"id"},
+		nil,
+	)
+	storageInfoDesc = prometheus.NewDesc(
+		prometheus.BuildFQName(namespace, "", "storage_info"),
+		"Information about a storage resource.",
+		[]string{"id", "node", "storage", "plugintype", "content"},
+		nil,
+	)
+)
+
+func (c *clusterResourcesCollector) Update(client *Client, ch chan<- prometheus.Metric) error {
+	body, err := client.Get("/cluster/resources")
+	if err != nil {
+		return fmt.Errorf("failed to get /cluster/resources: %w", err)
+	}
+
+	var resp resourceResponse
+	if err := json.Unmarshal(body, &resp); err != nil {
+		return fmt.Errorf("failed to parse /cluster/resources response: %w", err)
+	}
+
+	for _, entry := range resp.Data {
+		switch entry.Type {
+		case "node":
+			c.emitNode(ch, entry)
+		case "qemu", "lxc":
+			c.emitGuest(ch, entry)
+		case "storage":
+			c.emitStorage(ch, entry)
+		}
+	}
+
+	return nil
+}
+
+func (c *clusterResourcesCollector) emitNode(ch chan<- prometheus.Metric, e resourceEntry) {
+	var up float64
+	if e.Status == "online" {
+		up = 1
+	}
+	ch <- prometheus.MustNewConstMetric(upDesc, prometheus.GaugeValue, up, e.ID)
+	ch <- prometheus.MustNewConstMetric(cpuUsageRatioDesc, prometheus.GaugeValue, e.CPU, e.ID)
+	ch <- prometheus.MustNewConstMetric(cpuUsageLimitDesc, prometheus.GaugeValue, e.MaxCPU, e.ID)
+	ch <- prometheus.MustNewConstMetric(memoryUsageBytesDesc, prometheus.GaugeValue, e.Mem, e.ID)
+	ch <- prometheus.MustNewConstMetric(memorySizeBytesDesc, prometheus.GaugeValue, e.MaxMem, e.ID)
+	ch <- prometheus.MustNewConstMetric(diskUsageBytesDesc, prometheus.GaugeValue, e.Disk, e.ID)
+	ch <- prometheus.MustNewConstMetric(diskSizeBytesDesc, prometheus.GaugeValue, e.MaxDisk, e.ID)
+	ch <- prometheus.MustNewConstMetric(uptimeSecondsDesc, prometheus.GaugeValue, e.Uptime, e.ID)
+}
+
+func (c *clusterResourcesCollector) emitGuest(ch chan<- prometheus.Metric, e resourceEntry) {
+	var up float64
+	if e.Status == "running" {
+		up = 1
+	}
+	ch <- prometheus.MustNewConstMetric(upDesc, prometheus.GaugeValue, up, e.ID)
+	ch <- prometheus.MustNewConstMetric(cpuUsageRatioDesc, prometheus.GaugeValue, e.CPU, e.ID)
+	ch <- prometheus.MustNewConstMetric(cpuUsageLimitDesc, prometheus.GaugeValue, e.MaxCPU, e.ID)
+	ch <- prometheus.MustNewConstMetric(memoryUsageBytesDesc, prometheus.GaugeValue, e.Mem, e.ID)
+	ch <- prometheus.MustNewConstMetric(memorySizeBytesDesc, prometheus.GaugeValue, e.MaxMem, e.ID)
+	ch <- prometheus.MustNewConstMetric(diskUsageBytesDesc, prometheus.GaugeValue, e.Disk, e.ID)
+	ch <- prometheus.MustNewConstMetric(diskSizeBytesDesc, prometheus.GaugeValue, e.MaxDisk, e.ID)
+	ch <- prometheus.MustNewConstMetric(uptimeSecondsDesc, prometheus.GaugeValue, e.Uptime, e.ID)
+	ch <- prometheus.MustNewConstMetric(networkTransmitBytesTotalDesc, prometheus.CounterValue, e.NetOut, e.ID)
+	ch <- prometheus.MustNewConstMetric(networkReceiveBytesTotalDesc, prometheus.CounterValue, e.NetIn, e.ID)
+	ch <- prometheus.MustNewConstMetric(diskWrittenBytesTotalDesc, prometheus.CounterValue, e.DiskWrite, e.ID)
+	ch <- prometheus.MustNewConstMetric(diskReadBytesTotalDesc, prometheus.CounterValue, e.DiskRead, e.ID)
+	ch <- prometheus.MustNewConstMetric(guestInfoDesc, prometheus.GaugeValue, 1,
+		e.ID, e.Node, e.Name, e.Type, strconv.Itoa(e.Template), e.Tags,
+	)
+
+	if e.HAState != "" {
+		ch <- prometheus.MustNewConstMetric(haStateDesc, prometheus.GaugeValue, 1, e.ID, e.HAState)
+	}
+
+	if e.Lock != "" {
+		ch <- prometheus.MustNewConstMetric(lockStateDesc, prometheus.GaugeValue, 1, e.ID, e.Lock)
+	}
+}
+
+func (c *clusterResourcesCollector) emitStorage(ch chan<- prometheus.Metric, e resourceEntry) {
+	ch <- prometheus.MustNewConstMetric(diskUsageBytesDesc, prometheus.GaugeValue, e.Disk, e.ID)
+	ch <- prometheus.MustNewConstMetric(diskSizeBytesDesc, prometheus.GaugeValue, e.MaxDisk, e.ID)
+	ch <- prometheus.MustNewConstMetric(storageSharedDesc, prometheus.GaugeValue, float64(e.Shared), e.ID)
+	ch <- prometheus.MustNewConstMetric(storageInfoDesc, prometheus.GaugeValue, 1,
+		e.ID, e.Node, e.Storage, e.PluginType, e.Content,
+	)
+}
diff --git a/collector/cluster_resources_test.go b/collector/cluster_resources_test.go
new file mode 100644
index 0000000..3426de6
--- /dev/null
+++ b/collector/cluster_resources_test.go
@@ -0,0 +1,123 @@
+package collector
+
+import (
+	"log/slog"
+	"strings"
+	"testing"
+
+	"github.com/prometheus/client_golang/prometheus"
+	"github.com/prometheus/client_golang/prometheus/testutil"
+)
+
+func TestClusterResourcesCollector(t *testing.T) {
+	client := newTestClient(t, map[string]string{
+		"/cluster/resources": "cluster_resources.json",
+	})
+
+	collector := newClusterResourcesCollector(slog.Default())
+	adapter := &testCollectorAdapter{client: client, collector: collector}
+
+	reg := prometheus.NewRegistry()
+	reg.MustRegister(adapter)
+
+	expected := `
+# HELP pve_cpu_usage_limit CPU usage limit (number of CPUs).
+# TYPE pve_cpu_usage_limit gauge
+pve_cpu_usage_limit{id="node/node01"} 16
+pve_cpu_usage_limit{id="qemu/100"} 4
+pve_cpu_usage_limit{id="qemu/101"} 2
+# HELP pve_cpu_usage_ratio CPU usage ratio.
+# TYPE pve_cpu_usage_ratio gauge
+pve_cpu_usage_ratio{id="node/node01"} 0.05
+pve_cpu_usage_ratio{id="qemu/100"} 0.1
+pve_cpu_usage_ratio{id="qemu/101"} 0
+# HELP pve_disk_read_bytes_total Total bytes read from disk.
+# TYPE pve_disk_read_bytes_total counter
+pve_disk_read_bytes_total{id="qemu/100"} 5.24288e+06
+pve_disk_read_bytes_total{id="qemu/101"} 0
+# HELP pve_disk_size_bytes Disk size in bytes.
+# TYPE pve_disk_size_bytes gauge
+pve_disk_size_bytes{id="node/node01"} 1.073741824e+11
+pve_disk_size_bytes{id="qemu/100"} 5.36870912e+10
+pve_disk_size_bytes{id="qemu/101"} 3.221225472e+10
+pve_disk_size_bytes{id="storage/node01/local-lvm"} 1.073741824e+11
+# HELP pve_disk_usage_bytes Disk usage in bytes.
+# TYPE pve_disk_usage_bytes gauge
+pve_disk_usage_bytes{id="node/node01"} 1.073741824e+10
+pve_disk_usage_bytes{id="qemu/100"} 0
+pve_disk_usage_bytes{id="qemu/101"} 0
+pve_disk_usage_bytes{id="storage/node01/local-lvm"} 2.147483648e+10
+# HELP pve_disk_written_bytes_total Total bytes written to disk.
+# TYPE pve_disk_written_bytes_total counter
+pve_disk_written_bytes_total{id="qemu/100"} 1.048576e+06
+pve_disk_written_bytes_total{id="qemu/101"} 0
+# HELP pve_guest_info Information about a guest (VM or container).
+# TYPE pve_guest_info gauge
+pve_guest_info{id="qemu/100",name="web-server",node="node01",tags="prod;web",template="0",type="qemu"} 1
+pve_guest_info{id="qemu/101",name="db-backup",node="node01",tags="",template="0",type="qemu"} 1
+# HELP pve_ha_state HA manager state of the resource.
+# TYPE pve_ha_state gauge
+pve_ha_state{id="qemu/100",state="started"} 1
+# HELP pve_lock_state Lock state of the resource.
+# TYPE pve_lock_state gauge
+pve_lock_state{id="qemu/101",state="backup"} 1
+# HELP pve_memory_size_bytes Memory size in bytes.
+# TYPE pve_memory_size_bytes gauge
+pve_memory_size_bytes{id="node/node01"} 3.4359738368e+10
+pve_memory_size_bytes{id="qemu/100"} 4.294967296e+09
+pve_memory_size_bytes{id="qemu/101"} 2.147483648e+09
+# HELP pve_memory_usage_bytes Memory usage in bytes.
+# TYPE pve_memory_usage_bytes gauge
+pve_memory_usage_bytes{id="node/node01"} 8.589934592e+09
+pve_memory_usage_bytes{id="qemu/100"} 2.147483648e+09
+pve_memory_usage_bytes{id="qemu/101"} 0
+# HELP pve_network_receive_bytes_total Total bytes received over the network.
+# TYPE pve_network_receive_bytes_total counter
+pve_network_receive_bytes_total{id="qemu/100"} 1.048576e+06
+pve_network_receive_bytes_total{id="qemu/101"} 0
+# HELP pve_network_transmit_bytes_total Total bytes transmitted over the network.
+# TYPE pve_network_transmit_bytes_total counter
+pve_network_transmit_bytes_total{id="qemu/100"} 2.097152e+06
+pve_network_transmit_bytes_total{id="qemu/101"} 0
+# HELP pve_storage_info Information about a storage resource.
+# TYPE pve_storage_info gauge
+pve_storage_info{content="images,rootdir",id="storage/node01/local-lvm",node="node01",plugintype="lvmthin",storage="local-lvm"} 1
+# HELP pve_storage_shared Whether the storage is shared (1) or local (0).
+# TYPE pve_storage_shared gauge
+pve_storage_shared{id="storage/node01/local-lvm"} 0
+# HELP pve_up Whether the resource is up (1) or down (0).
+# TYPE pve_up gauge
+pve_up{id="node/node01"} 1
+pve_up{id="qemu/100"} 1
+pve_up{id="qemu/101"} 0
+# HELP pve_uptime_seconds Uptime in seconds.
+# TYPE pve_uptime_seconds gauge
+pve_uptime_seconds{id="node/node01"} 123456
+pve_uptime_seconds{id="qemu/100"} 3600
+pve_uptime_seconds{id="qemu/101"} 0
+`
+
+	metricNames := []string{
+		"pve_up",
+		"pve_cpu_usage_ratio",
+		"pve_cpu_usage_limit",
+		"pve_memory_usage_bytes",
+		"pve_memory_size_bytes",
+		"pve_disk_usage_bytes",
+		"pve_disk_size_bytes",
+		"pve_uptime_seconds",
+		"pve_network_transmit_bytes_total",
+		"pve_network_receive_bytes_total",
+		"pve_disk_written_bytes_total",
+		"pve_disk_read_bytes_total",
+		"pve_guest_info",
+		"pve_ha_state",
+		"pve_lock_state",
+		"pve_storage_shared",
+		"pve_storage_info",
+	}
+
+	if err := testutil.GatherAndCompare(reg, strings.NewReader(expected), metricNames...); err != nil {
+		t.Errorf("unexpected metrics: %s", err)
+	}
+}
diff --git a/collector/fixtures/cluster_resources.json b/collector/fixtures/cluster_resources.json
new file mode 100644
index 0000000..a1b0146
--- /dev/null
+++ b/collector/fixtures/cluster_resources.json
@@ -0,0 +1 @@
+{"data":[{"type":"node","id":"node/node01","node":"node01","status":"online","cpu":0.05,"maxcpu":16,"mem":8589934592,"maxmem":34359738368,"disk":10737418240,"maxdisk":107374182400,"uptime":123456},{"type":"qemu","id":"qemu/100","node":"node01","name":"web-server","status":"running","vmid":100,"cpu":0.1,"maxcpu":4,"mem":2147483648,"maxmem":4294967296,"disk":0,"maxdisk":53687091200,"uptime":3600,"netin":1048576,"netout":2097152,"diskread":5242880,"diskwrite":1048576,"template":0,"hastate":"started","tags":"prod;web","lock":""},{"type":"qemu","id":"qemu/101","node":"node01","name":"db-backup","status":"stopped","vmid":101,"cpu":0,"maxcpu":2,"mem":0,"maxmem":2147483648,"disk":0,"maxdisk":32212254720,"uptime":0,"netin":0,"netout":0,"diskread":0,"diskwrite":0,"template":0,"hastate":"","tags":"","lock":"backup"},{"type":"storage","id":"storage/node01/local-lvm","node":"node01","storage":"local-lvm","status":"available","disk":21474836480,"maxdisk":107374182400,"plugintype":"lvmthin","content":"images,rootdir","shared":0}]}