feat: add node_status collector (load, swap, rootfs, ksm, boot mode)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Davíð Steinn Geirsson 2026-03-20 15:23:09 +00:00
parent 2097451d15
commit 496a46460c

205
collector/node_status.go Normal file
View file

@ -0,0 +1,205 @@
package collector
import (
	"encoding/json"
	"errors"
	"fmt"
	"log/slog"
	"strconv"
	"sync"

	"github.com/prometheus/client_golang/prometheus"
)
func init() {
registerCollector("node_status", func(logger *slog.Logger) Collector {
return newNodeStatusCollector(logger)
})
}
// nodeStatusCollector collects per-node status metrics for the nodes
// configured through SetNodes.
type nodeStatusCollector struct {
	logger *slog.Logger

	mu    sync.Mutex // guards nodes
	nodes []string   // names of the nodes to query
}

// newNodeStatusCollector returns a collector with an empty node list that
// logs through logger.
func newNodeStatusCollector(logger *slog.Logger) *nodeStatusCollector {
	return &nodeStatusCollector{logger: logger}
}

// SetNodes replaces the list of nodes to query.
//
// The slice is copied before it is stored, so the caller may keep modifying
// its own slice afterwards without changing (or racing with) the collector's
// view of the node list.
func (c *nodeStatusCollector) SetNodes(nodes []string) {
	// Copy outside the critical section; only the pointer swap needs the lock.
	snapshot := append([]string(nil), nodes...)

	c.mu.Lock()
	defer c.mu.Unlock()
	c.nodes = snapshot
}
// JSON shapes decoded from the /nodes/{node}/status reply.
type (
	// nodeStatusResponse is the top-level envelope; the payload sits under "data".
	nodeStatusResponse struct {
		Data nodeStatusData `json:"data"`
	}

	// nodeStatusData holds the sections of the status payload that are
	// turned into metrics.
	nodeStatusData struct {
		// LoadAvg carries the load averages as strings; the first three
		// entries are exported as the 1, 5 and 15 minute values.
		LoadAvg  []string       `json:"loadavg"`
		Swap     nodeStatusMem  `json:"swap"`
		RootFS   nodeStatusFS   `json:"rootfs"`
		KSM      nodeStatusKSM  `json:"ksm"`
		BootInfo nodeStatusBoot `json:"boot-info"`
	}

	// nodeStatusMem is a total/used/free triple, exported as byte gauges.
	nodeStatusMem struct {
		Total float64 `json:"total"`
		Used  float64 `json:"used"`
		Free  float64 `json:"free"`
	}

	// nodeStatusFS is a total/used/available triple for a filesystem,
	// exported as byte gauges.
	nodeStatusFS struct {
		Total float64 `json:"total"`
		Used  float64 `json:"used"`
		Avail float64 `json:"avail"`
	}

	// nodeStatusKSM holds the KSM shared-memory figure.
	nodeStatusKSM struct {
		Shared float64 `json:"shared"`
	}

	// nodeStatusBoot describes how the node was booted. SecureBoot is kept
	// as the integer sent by the API and is exported as a label value.
	nodeStatusBoot struct {
		Mode       string `json:"mode"`
		SecureBoot int    `json:"secureboot"`
	}
)
// nodeStatusDesc builds the descriptor for one node_status metric. The
// fully-qualified name is built from namespace, the "node" subsystem and
// name. Every metric carries the "node" label; extraLabels follow it.
func nodeStatusDesc(name, help string, extraLabels ...string) *prometheus.Desc {
	labels := append([]string{"node"}, extraLabels...)
	return prometheus.NewDesc(prometheus.BuildFQName(namespace, "node", name), help, labels, nil)
}

// Metric descriptors emitted by the node_status collector.
var (
	// Load averages.
	nodeLoad1Desc  = nodeStatusDesc("load1", "1-minute load average.")
	nodeLoad5Desc  = nodeStatusDesc("load5", "5-minute load average.")
	nodeLoad15Desc = nodeStatusDesc("load15", "15-minute load average.")

	// Swap.
	nodeSwapTotalDesc = nodeStatusDesc("swap_total_bytes", "Total swap in bytes.")
	nodeSwapUsedDesc  = nodeStatusDesc("swap_used_bytes", "Used swap in bytes.")
	nodeSwapFreeDesc  = nodeStatusDesc("swap_free_bytes", "Free swap in bytes.")

	// Root filesystem.
	nodeRootfsTotalDesc = nodeStatusDesc("rootfs_total_bytes", "Root filesystem total size in bytes.")
	nodeRootfsUsedDesc  = nodeStatusDesc("rootfs_used_bytes", "Root filesystem used space in bytes.")
	nodeRootfsAvailDesc = nodeStatusDesc("rootfs_available_bytes", "Root filesystem available space in bytes.")

	// KSM.
	nodeKSMSharedDesc = nodeStatusDesc("ksm_shared_bytes", "KSM shared memory in bytes.")

	// Boot mode: an info-style metric whose value is carried in its labels.
	nodeBootModeDesc = nodeStatusDesc("boot_mode_info", "Node boot mode information.", "mode", "secureboot")
)
// Update collects status metrics for every configured node and sends them
// to ch.
//
// Nodes are queried concurrently, one goroutine per node, with the number of
// in-flight collections bounded by client.MaxConcurrent(). A failure on one
// node does not stop collection for the others. All failures are joined into
// the returned error, so each one stays reachable through errors.Is and
// errors.As; the result is nil when every node succeeded.
func (c *nodeStatusCollector) Update(client *Client, ch chan<- prometheus.Metric) error {
	// Snapshot the node list so the lock is not held while collecting.
	c.mu.Lock()
	nodes := make([]string, len(c.nodes))
	copy(nodes, c.nodes)
	c.mu.Unlock()

	// A zero-capacity semaphore would block every worker forever, and a
	// negative capacity makes make panic, so always allow at least one
	// collection at a time.
	limit := client.MaxConcurrent()
	if limit < 1 {
		limit = 1
	}
	sem := make(chan struct{}, limit)

	var (
		wg   sync.WaitGroup
		emu  sync.Mutex // guards errs
		errs []error
	)
	for _, node := range nodes {
		wg.Add(1)
		go func(node string) {
			defer wg.Done()

			// Take a slot before collecting; release it when done.
			sem <- struct{}{}
			defer func() { <-sem }()

			if err := c.collectNode(client, ch, node); err != nil {
				emu.Lock()
				errs = append(errs, err)
				emu.Unlock()
			}
		}(node)
	}
	wg.Wait()

	if len(errs) > 0 {
		// errors.Join keeps every per-node error in the wrap chain, unlike
		// formatting the slice with %v.
		return fmt.Errorf("node_status collection errors: %w", errors.Join(errs...))
	}
	return nil
}
// collectNode requests /nodes/<node>/status through client, decodes the JSON
// reply and sends the node's metrics to ch: load averages, swap, root
// filesystem, KSM sharing and a boot-mode info series.
//
// It returns a wrapped error when the request or the JSON decoding fails. A
// load-average entry that does not parse as a float is logged and skipped
// without failing the node.
func (c *nodeStatusCollector) collectNode(client *Client, ch chan<- prometheus.Metric, node string) error {
	raw, err := client.Get(fmt.Sprintf("/nodes/%s/status", node))
	if err != nil {
		return fmt.Errorf("failed to get status for node %s: %w", node, err)
	}

	var status nodeStatusResponse
	if err := json.Unmarshal(raw, &status); err != nil {
		return fmt.Errorf("failed to parse status response for node %s: %w", node, err)
	}
	data := status.Data

	// emit sends one gauge sample labelled with this node.
	emit := func(desc *prometheus.Desc, value float64) {
		ch <- prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, value, node)
	}

	// Load averages arrive as strings and are only exported when at least
	// three entries are present.
	if len(data.LoadAvg) >= 3 {
		loadDescs := [3]*prometheus.Desc{nodeLoad1Desc, nodeLoad5Desc, nodeLoad15Desc}
		for i := 0; i < len(loadDescs); i++ {
			load, perr := strconv.ParseFloat(data.LoadAvg[i], 64)
			if perr != nil {
				c.logger.Warn("failed to parse load average", "node", node, "index", i, "err", perr)
				continue
			}
			emit(loadDescs[i], load)
		}
	}

	// Plain numeric gauges, sent in a fixed order: swap, root filesystem, KSM.
	gauges := []struct {
		desc  *prometheus.Desc
		value float64
	}{
		{nodeSwapTotalDesc, data.Swap.Total},
		{nodeSwapUsedDesc, data.Swap.Used},
		{nodeSwapFreeDesc, data.Swap.Free},
		{nodeRootfsTotalDesc, data.RootFS.Total},
		{nodeRootfsUsedDesc, data.RootFS.Used},
		{nodeRootfsAvailDesc, data.RootFS.Avail},
		{nodeKSMSharedDesc, data.KSM.Shared},
	}
	for _, g := range gauges {
		emit(g.desc, g.value)
	}

	// Boot mode is an info metric: the value is always 1 and the details
	// travel in the "mode" and "secureboot" labels.
	ch <- prometheus.MustNewConstMetric(
		nodeBootModeDesc, prometheus.GaugeValue, 1,
		node, data.BootInfo.Mode, strconv.Itoa(data.BootInfo.SecureBoot),
	)
	return nil
}