feat: add node_status collector (load, swap, rootfs, ksm, boot mode)
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
2097451d15
commit
496a46460c
1 changed file with 205 additions and 0 deletions
205
collector/node_status.go
Normal file
205
collector/node_status.go
Normal file
|
|
@ -0,0 +1,205 @@
|
|||
package collector
|
||||
|
||||
import (
	"encoding/json"
	"errors"
	"fmt"
	"log/slog"
	"strconv"
	"sync"

	"github.com/prometheus/client_golang/prometheus"
)
|
||||
|
||||
func init() {
|
||||
registerCollector("node_status", func(logger *slog.Logger) Collector {
|
||||
return newNodeStatusCollector(logger)
|
||||
})
|
||||
}
|
||||
|
||||
// nodeStatusCollector is the "node_status" collector. It holds the list of
// node names to query and a logger for non-fatal problems.
type nodeStatusCollector struct {
	logger *slog.Logger

	mu    sync.Mutex // guards nodes
	nodes []string   // node names to query; replaced wholesale by SetNodes
}

// newNodeStatusCollector returns a collector with an empty node list that
// logs through logger.
func newNodeStatusCollector(logger *slog.Logger) *nodeStatusCollector {
	return &nodeStatusCollector{logger: logger}
}

// SetNodes replaces the list of nodes to query.
//
// The slice is copied before it is stored: keeping the caller's backing
// array would let a later write by the caller change (and race with) state
// that is supposed to be protected by c.mu.
func (c *nodeStatusCollector) SetNodes(nodes []string) {
	owned := append([]string(nil), nodes...)

	c.mu.Lock()
	defer c.mu.Unlock()
	c.nodes = owned
}
|
||||
|
||||
// JSON model for the body returned by the /nodes/<node>/status request.
// Only the fields that are turned into metrics are declared.
type (
	// nodeStatusResponse is the top-level envelope; the payload sits under
	// the "data" key.
	nodeStatusResponse struct {
		Data nodeStatusData `json:"data"`
	}

	// nodeStatusData is the status payload of a single node. LoadAvg is
	// decoded as strings and converted to floats by the consumer.
	nodeStatusData struct {
		LoadAvg  []string       `json:"loadavg"`
		Swap     nodeStatusMem  `json:"swap"`
		RootFS   nodeStatusFS   `json:"rootfs"`
		KSM      nodeStatusKSM  `json:"ksm"`
		BootInfo nodeStatusBoot `json:"boot-info"`
	}

	// nodeStatusMem holds the total/used/free figures of a memory-like
	// resource; it is used for the "swap" object.
	nodeStatusMem struct {
		Total float64 `json:"total"`
		Used  float64 `json:"used"`
		Free  float64 `json:"free"`
	}

	// nodeStatusFS holds the total/used/avail figures of a filesystem; it
	// is used for the "rootfs" object.
	nodeStatusFS struct {
		Total float64 `json:"total"`
		Used  float64 `json:"used"`
		Avail float64 `json:"avail"`
	}

	// nodeStatusKSM holds the "shared" figure of the "ksm" object.
	nodeStatusKSM struct {
		Shared float64 `json:"shared"`
	}

	// nodeStatusBoot mirrors the "boot-info" object: the boot mode string
	// and the secure-boot flag, which is decoded as an integer.
	nodeStatusBoot struct {
		Mode       string `json:"mode"`
		SecureBoot int    `json:"secureboot"`
	}
)
|
||||
|
||||
var (
|
||||
nodeLoad1Desc = prometheus.NewDesc(
|
||||
prometheus.BuildFQName(namespace, "node", "load1"),
|
||||
"1-minute load average.",
|
||||
[]string{"node"}, nil,
|
||||
)
|
||||
nodeLoad5Desc = prometheus.NewDesc(
|
||||
prometheus.BuildFQName(namespace, "node", "load5"),
|
||||
"5-minute load average.",
|
||||
[]string{"node"}, nil,
|
||||
)
|
||||
nodeLoad15Desc = prometheus.NewDesc(
|
||||
prometheus.BuildFQName(namespace, "node", "load15"),
|
||||
"15-minute load average.",
|
||||
[]string{"node"}, nil,
|
||||
)
|
||||
nodeSwapTotalDesc = prometheus.NewDesc(
|
||||
prometheus.BuildFQName(namespace, "node", "swap_total_bytes"),
|
||||
"Total swap in bytes.",
|
||||
[]string{"node"}, nil,
|
||||
)
|
||||
nodeSwapUsedDesc = prometheus.NewDesc(
|
||||
prometheus.BuildFQName(namespace, "node", "swap_used_bytes"),
|
||||
"Used swap in bytes.",
|
||||
[]string{"node"}, nil,
|
||||
)
|
||||
nodeSwapFreeDesc = prometheus.NewDesc(
|
||||
prometheus.BuildFQName(namespace, "node", "swap_free_bytes"),
|
||||
"Free swap in bytes.",
|
||||
[]string{"node"}, nil,
|
||||
)
|
||||
nodeRootfsTotalDesc = prometheus.NewDesc(
|
||||
prometheus.BuildFQName(namespace, "node", "rootfs_total_bytes"),
|
||||
"Root filesystem total size in bytes.",
|
||||
[]string{"node"}, nil,
|
||||
)
|
||||
nodeRootfsUsedDesc = prometheus.NewDesc(
|
||||
prometheus.BuildFQName(namespace, "node", "rootfs_used_bytes"),
|
||||
"Root filesystem used space in bytes.",
|
||||
[]string{"node"}, nil,
|
||||
)
|
||||
nodeRootfsAvailDesc = prometheus.NewDesc(
|
||||
prometheus.BuildFQName(namespace, "node", "rootfs_available_bytes"),
|
||||
"Root filesystem available space in bytes.",
|
||||
[]string{"node"}, nil,
|
||||
)
|
||||
nodeKSMSharedDesc = prometheus.NewDesc(
|
||||
prometheus.BuildFQName(namespace, "node", "ksm_shared_bytes"),
|
||||
"KSM shared memory in bytes.",
|
||||
[]string{"node"}, nil,
|
||||
)
|
||||
nodeBootModeDesc = prometheus.NewDesc(
|
||||
prometheus.BuildFQName(namespace, "node", "boot_mode_info"),
|
||||
"Node boot mode information.",
|
||||
[]string{"node", "mode", "secureboot"}, nil,
|
||||
)
|
||||
)
|
||||
|
||||
func (c *nodeStatusCollector) Update(client *Client, ch chan<- prometheus.Metric) error {
|
||||
c.mu.Lock()
|
||||
nodes := make([]string, len(c.nodes))
|
||||
copy(nodes, c.nodes)
|
||||
c.mu.Unlock()
|
||||
|
||||
var (
|
||||
wg sync.WaitGroup
|
||||
errs []error
|
||||
emu sync.Mutex
|
||||
)
|
||||
|
||||
sem := make(chan struct{}, client.MaxConcurrent())
|
||||
|
||||
for _, node := range nodes {
|
||||
wg.Add(1)
|
||||
go func(node string) {
|
||||
defer wg.Done()
|
||||
sem <- struct{}{}
|
||||
defer func() { <-sem }()
|
||||
|
||||
if err := c.collectNode(client, ch, node); err != nil {
|
||||
emu.Lock()
|
||||
errs = append(errs, err)
|
||||
emu.Unlock()
|
||||
}
|
||||
}(node)
|
||||
}
|
||||
wg.Wait()
|
||||
|
||||
if len(errs) > 0 {
|
||||
return fmt.Errorf("node_status collection errors: %v", errs)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (c *nodeStatusCollector) collectNode(client *Client, ch chan<- prometheus.Metric, node string) error {
|
||||
body, err := client.Get(fmt.Sprintf("/nodes/%s/status", node))
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to get status for node %s: %w", node, err)
|
||||
}
|
||||
|
||||
var resp nodeStatusResponse
|
||||
if err := json.Unmarshal(body, &resp); err != nil {
|
||||
return fmt.Errorf("failed to parse status response for node %s: %w", node, err)
|
||||
}
|
||||
|
||||
d := resp.Data
|
||||
|
||||
// Load averages (strings in API).
|
||||
if len(d.LoadAvg) >= 3 {
|
||||
for i, desc := range []*prometheus.Desc{nodeLoad1Desc, nodeLoad5Desc, nodeLoad15Desc} {
|
||||
val, err := strconv.ParseFloat(d.LoadAvg[i], 64)
|
||||
if err != nil {
|
||||
c.logger.Warn("failed to parse load average", "node", node, "index", i, "err", err)
|
||||
continue
|
||||
}
|
||||
ch <- prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, val, node)
|
||||
}
|
||||
}
|
||||
|
||||
// Swap.
|
||||
ch <- prometheus.MustNewConstMetric(nodeSwapTotalDesc, prometheus.GaugeValue, d.Swap.Total, node)
|
||||
ch <- prometheus.MustNewConstMetric(nodeSwapUsedDesc, prometheus.GaugeValue, d.Swap.Used, node)
|
||||
ch <- prometheus.MustNewConstMetric(nodeSwapFreeDesc, prometheus.GaugeValue, d.Swap.Free, node)
|
||||
|
||||
// Root filesystem.
|
||||
ch <- prometheus.MustNewConstMetric(nodeRootfsTotalDesc, prometheus.GaugeValue, d.RootFS.Total, node)
|
||||
ch <- prometheus.MustNewConstMetric(nodeRootfsUsedDesc, prometheus.GaugeValue, d.RootFS.Used, node)
|
||||
ch <- prometheus.MustNewConstMetric(nodeRootfsAvailDesc, prometheus.GaugeValue, d.RootFS.Avail, node)
|
||||
|
||||
// KSM.
|
||||
ch <- prometheus.MustNewConstMetric(nodeKSMSharedDesc, prometheus.GaugeValue, d.KSM.Shared, node)
|
||||
|
||||
// Boot mode info.
|
||||
secureboot := strconv.Itoa(d.BootInfo.SecureBoot)
|
||||
ch <- prometheus.MustNewConstMetric(nodeBootModeDesc, prometheus.GaugeValue, 1, node, d.BootInfo.Mode, secureboot)
|
||||
|
||||
return nil
|
||||
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue