From 18bb43394e378516ee8f4833a7b2ecf2f46c7dfa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dav=C3=AD=C3=B0=20Steinn=20Geirsson?= Date: Fri, 20 Mar 2026 15:20:17 +0000 Subject: [PATCH] docs: add implementation plan for remaining collectors 9 tasks covering node_status, vm_pressure, ha_status, and physical_disk collectors with TDD approach, fixtures, and README update. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../2026-03-20-remaining-collectors-plan.md | 1217 +++++++++++++++++ 1 file changed, 1217 insertions(+) create mode 100644 docs/superpowers/plans/2026-03-20-remaining-collectors-plan.md diff --git a/docs/superpowers/plans/2026-03-20-remaining-collectors-plan.md b/docs/superpowers/plans/2026-03-20-remaining-collectors-plan.md new file mode 100644 index 0000000..6702efc --- /dev/null +++ b/docs/superpowers/plans/2026-03-20-remaining-collectors-plan.md @@ -0,0 +1,1217 @@ +# Remaining Collectors Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Add 4 new collectors (node_status, vm_pressure, ha_status, physical_disk) covering all remaining TODO metrics from the README. + +**Architecture:** Each collector is a standalone file following the existing pattern: init() self-registration, NodeAwareCollector or plain Collector interface, per-node fan-out with semaphore-bounded concurrency. Tests use JSON fixtures and testutil.GatherAndCompare. + +**Tech Stack:** Go, prometheus/client_golang, slog + +**Spec:** `docs/superpowers/specs/2026-03-20-remaining-collectors-design.md` + +--- + +### Task 1: Node Status Collector — Test + Fixture + +**Files:** +- Create: `collector/fixtures/node_status.json` +- Create: `collector/node_status_test.go` + +- [ ] **Step 1: Create the test fixture** + +Create `collector/fixtures/node_status.json`: + +```json +{"data":{"loadavg":["3.12","2.88","2.79"],"swap":{"total":8589930496,"used":1048576,"free":8588881920},"rootfs":{"used":28747304960,"total":100861726720,"avail":66943684608},"ksm":{"shared":4096},"boot-info":{"mode":"efi","secureboot":0}}} +``` + +- [ ] **Step 2: Write the test** + +Create `collector/node_status_test.go`: + +```go +package collector + +import ( + "log/slog" + "strings" + "testing" + + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/testutil" +) + +func TestNodeStatusCollector(t *testing.T) { + client := newTestClient(t, map[string]string{ + "/nodes/node01/status": "node_status.json", + }) + + collector := newNodeStatusCollector(slog.Default()) + collector.SetNodes([]string{"node01"}) + adapter := &testCollectorAdapter{client: client, collector: collector} + + reg := prometheus.NewRegistry() + reg.MustRegister(adapter) + + expected := ` +# HELP pve_node_boot_mode_info Node boot mode information. +# TYPE pve_node_boot_mode_info gauge +pve_node_boot_mode_info{mode="efi",node="node01",secureboot="0"} 1 +# HELP pve_node_ksm_shared_bytes KSM shared memory in bytes. +# TYPE pve_node_ksm_shared_bytes gauge +pve_node_ksm_shared_bytes{node="node01"} 4096 +# HELP pve_node_load1 1-minute load average. +# TYPE pve_node_load1 gauge +pve_node_load1{node="node01"} 3.12 +# HELP pve_node_load15 15-minute load average. +# TYPE pve_node_load15 gauge +pve_node_load15{node="node01"} 2.79 +# HELP pve_node_load5 5-minute load average. +# TYPE pve_node_load5 gauge +pve_node_load5{node="node01"} 2.88 +# HELP pve_node_rootfs_available_bytes Root filesystem available space in bytes. +# TYPE pve_node_rootfs_available_bytes gauge +pve_node_rootfs_available_bytes{node="node01"} 6.6943684608e+10 +# HELP pve_node_rootfs_total_bytes Root filesystem total size in bytes. +# TYPE pve_node_rootfs_total_bytes gauge +pve_node_rootfs_total_bytes{node="node01"} 1.0086172672e+11 +# HELP pve_node_rootfs_used_bytes Root filesystem used space in bytes. +# TYPE pve_node_rootfs_used_bytes gauge +pve_node_rootfs_used_bytes{node="node01"} 2.874730496e+10 +# HELP pve_node_swap_free_bytes Free swap in bytes. +# TYPE pve_node_swap_free_bytes gauge +pve_node_swap_free_bytes{node="node01"} 8.58888192e+09 +# HELP pve_node_swap_total_bytes Total swap in bytes. +# TYPE pve_node_swap_total_bytes gauge +pve_node_swap_total_bytes{node="node01"} 8.589930496e+09 +# HELP pve_node_swap_used_bytes Used swap in bytes. +# TYPE pve_node_swap_used_bytes gauge +pve_node_swap_used_bytes{node="node01"} 1.048576e+06 +` + + if err := testutil.GatherAndCompare(reg, strings.NewReader(expected), + "pve_node_load1", "pve_node_load5", "pve_node_load15", + "pve_node_swap_total_bytes", "pve_node_swap_used_bytes", "pve_node_swap_free_bytes", + "pve_node_rootfs_total_bytes", "pve_node_rootfs_used_bytes", "pve_node_rootfs_available_bytes", + "pve_node_ksm_shared_bytes", "pve_node_boot_mode_info", + ); err != nil { + t.Errorf("unexpected metrics: %s", err) + } +} +``` + +- [ ] **Step 3: Run test to verify it fails** + +Run: `cd /home/user/git/pve-exporter && go test ./collector/ -run TestNodeStatusCollector -v` +Expected: FAIL — `newNodeStatusCollector` not defined. + +- [ ] **Step 4: Commit** + +```bash +git add collector/fixtures/node_status.json collector/node_status_test.go +git commit -m "test: add node_status collector test and fixture" +``` + +--- + +### Task 2: Node Status Collector — Implementation + +**Files:** +- Create: `collector/node_status.go` + +- [ ] **Step 1: Implement the collector** + +Create `collector/node_status.go`: + +```go +package collector + +import ( + "encoding/json" + "fmt" + "log/slog" + "strconv" + "sync" + + "github.com/prometheus/client_golang/prometheus" +) + +func init() { + registerCollector("node_status", func(logger *slog.Logger) Collector { + return newNodeStatusCollector(logger) + }) +} + +type nodeStatusCollector struct { + logger *slog.Logger + mu sync.Mutex + nodes []string +} + +func newNodeStatusCollector(logger *slog.Logger) *nodeStatusCollector { + return &nodeStatusCollector{logger: logger} +} + +func (c *nodeStatusCollector) SetNodes(nodes []string) { + c.mu.Lock() + defer c.mu.Unlock() + c.nodes = nodes +} + +type nodeStatusResponse struct { + Data nodeStatusData `json:"data"` +} + +type nodeStatusData struct { + LoadAvg []string `json:"loadavg"` + Swap nodeStatusMem `json:"swap"` + RootFS nodeStatusFS `json:"rootfs"` + KSM nodeStatusKSM `json:"ksm"` + BootInfo nodeStatusBoot `json:"boot-info"` +} + +type nodeStatusMem struct { + Total float64 `json:"total"` + Used float64 `json:"used"` + Free float64 `json:"free"` +} + +type nodeStatusFS struct { + Total float64 `json:"total"` + Used float64 `json:"used"` + Avail float64 `json:"avail"` +} + +type nodeStatusKSM struct { + Shared float64 `json:"shared"` +} + +type nodeStatusBoot struct { + Mode string `json:"mode"` + SecureBoot int `json:"secureboot"` +} + +var ( + nodeLoad1Desc = prometheus.NewDesc( + prometheus.BuildFQName(namespace, "node", "load1"), + "1-minute load average.", + []string{"node"}, nil, + ) + nodeLoad5Desc = prometheus.NewDesc( + prometheus.BuildFQName(namespace, "node", "load5"), + "5-minute load average.", + []string{"node"}, nil, + ) + nodeLoad15Desc = prometheus.NewDesc( + prometheus.BuildFQName(namespace, "node", "load15"), + "15-minute load average.", + []string{"node"}, nil, + ) + nodeSwapTotalDesc = prometheus.NewDesc( + prometheus.BuildFQName(namespace, "node", "swap_total_bytes"), + "Total swap in bytes.", + []string{"node"}, nil, + ) + nodeSwapUsedDesc = prometheus.NewDesc( + prometheus.BuildFQName(namespace, "node", "swap_used_bytes"), + "Used swap in bytes.", + []string{"node"}, nil, + ) + nodeSwapFreeDesc = prometheus.NewDesc( + prometheus.BuildFQName(namespace, "node", "swap_free_bytes"), + "Free swap in bytes.", + []string{"node"}, nil, + ) + nodeRootfsTotalDesc = prometheus.NewDesc( + prometheus.BuildFQName(namespace, "node", "rootfs_total_bytes"), + "Root filesystem total size in bytes.", + []string{"node"}, nil, + ) + nodeRootfsUsedDesc = prometheus.NewDesc( + prometheus.BuildFQName(namespace, "node", "rootfs_used_bytes"), + "Root filesystem used space in bytes.", + []string{"node"}, nil, + ) + nodeRootfsAvailDesc = prometheus.NewDesc( + prometheus.BuildFQName(namespace, "node", "rootfs_available_bytes"), + "Root filesystem available space in bytes.", + []string{"node"}, nil, + ) + nodeKSMSharedDesc = prometheus.NewDesc( + prometheus.BuildFQName(namespace, "node", "ksm_shared_bytes"), + "KSM shared memory in bytes.", + []string{"node"}, nil, + ) + nodeBootModeDesc = prometheus.NewDesc( + prometheus.BuildFQName(namespace, "node", "boot_mode_info"), + "Node boot mode information.", + []string{"node", "mode", "secureboot"}, nil, + ) +) + +func (c *nodeStatusCollector) Update(client *Client, ch chan<- prometheus.Metric) error { + c.mu.Lock() + nodes := make([]string, len(c.nodes)) + copy(nodes, c.nodes) + c.mu.Unlock() + + var ( + wg sync.WaitGroup + errs []error + emu sync.Mutex + ) + + sem := make(chan struct{}, client.MaxConcurrent()) + + for _, node := range nodes { + wg.Add(1) + go func(node string) { + defer wg.Done() + sem <- struct{}{} + defer func() { <-sem }() + + if err := c.collectNode(client, ch, node); err != nil { + emu.Lock() + errs = append(errs, err) + emu.Unlock() + } + }(node) + } + wg.Wait() + + if len(errs) > 0 { + return fmt.Errorf("node_status collection errors: %v", errs) + } + return nil +} + +func (c *nodeStatusCollector) collectNode(client *Client, ch chan<- prometheus.Metric, node string) error { + body, err := client.Get(fmt.Sprintf("/nodes/%s/status", node)) + if err != nil { + return fmt.Errorf("failed to get status for node %s: %w", node, err) + } + + var resp nodeStatusResponse + if err := json.Unmarshal(body, &resp); err != nil { + return fmt.Errorf("failed to parse status response for node %s: %w", node, err) + } + + d := resp.Data + + // Load averages (strings in API). + if len(d.LoadAvg) >= 3 { + for i, desc := range []*prometheus.Desc{nodeLoad1Desc, nodeLoad5Desc, nodeLoad15Desc} { + val, err := strconv.ParseFloat(d.LoadAvg[i], 64) + if err != nil { + c.logger.Warn("failed to parse load average", "node", node, "index", i, "err", err) + continue + } + ch <- prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, val, node) + } + } + + // Swap. + ch <- prometheus.MustNewConstMetric(nodeSwapTotalDesc, prometheus.GaugeValue, d.Swap.Total, node) + ch <- prometheus.MustNewConstMetric(nodeSwapUsedDesc, prometheus.GaugeValue, d.Swap.Used, node) + ch <- prometheus.MustNewConstMetric(nodeSwapFreeDesc, prometheus.GaugeValue, d.Swap.Free, node) + + // Root filesystem. + ch <- prometheus.MustNewConstMetric(nodeRootfsTotalDesc, prometheus.GaugeValue, d.RootFS.Total, node) + ch <- prometheus.MustNewConstMetric(nodeRootfsUsedDesc, prometheus.GaugeValue, d.RootFS.Used, node) + ch <- prometheus.MustNewConstMetric(nodeRootfsAvailDesc, prometheus.GaugeValue, d.RootFS.Avail, node) + + // KSM. + ch <- prometheus.MustNewConstMetric(nodeKSMSharedDesc, prometheus.GaugeValue, d.KSM.Shared, node) + + // Boot mode info. + secureboot := strconv.Itoa(d.BootInfo.SecureBoot) + ch <- prometheus.MustNewConstMetric(nodeBootModeDesc, prometheus.GaugeValue, 1, node, d.BootInfo.Mode, secureboot) + + return nil +} +``` + +- [ ] **Step 2: Run tests** + +Run: `cd /home/user/git/pve-exporter && go test ./collector/ -run TestNodeStatusCollector -v` +Expected: PASS + +- [ ] **Step 3: Run all tests to check for regressions** + +Run: `cd /home/user/git/pve-exporter && go test ./collector/ -v` +Expected: All tests PASS + +- [ ] **Step 4: Commit** + +```bash +git add collector/node_status.go +git commit -m "feat: add node_status collector (load, swap, rootfs, ksm, boot mode)" +``` + +--- + +### Task 3: VM Pressure Collector — Test + Fixture + +**Files:** +- Create: `collector/fixtures/node_qemu_pressure.json` +- Create: `collector/vm_pressure_test.go` + +- [ ] **Step 1: Create the test fixture** + +Create `collector/fixtures/node_qemu_pressure.json` with one running VM (has pressure fields) and one stopped VM (no pressure fields): + +```json +{"data":[{"vmid":112,"status":"running","name":"karmada","cpus":4,"pressurecpusome":1.5,"pressurecpufull":0.3,"pressurememorysome":2.1,"pressurememoryfull":0.0,"pressureiosome":0.8,"pressureiofull":0.1,"mem":6287664128,"maxmem":8589934592},{"vmid":104,"status":"stopped","name":"test3","cpus":2,"mem":0,"maxmem":4294967296}]} +``` + +- [ ] **Step 2: Write the test** + +Create `collector/vm_pressure_test.go`: + +```go +package collector + +import ( + "log/slog" + "strings" + "testing" + + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/testutil" +) + +func TestVMPressureCollector(t *testing.T) { + client := newTestClient(t, map[string]string{ + "/nodes/node01/qemu": "node_qemu_pressure.json", + }) + + collector := newVMPressureCollector(slog.Default()) + collector.SetNodes([]string{"node01"}) + adapter := &testCollectorAdapter{client: client, collector: collector} + + reg := prometheus.NewRegistry() + reg.MustRegister(adapter) + + expected := ` +# HELP pve_vm_pressure_cpu_full_ratio CPU pressure (full) ratio. +# TYPE pve_vm_pressure_cpu_full_ratio gauge +pve_vm_pressure_cpu_full_ratio{id="qemu/112",node="node01"} 0.003 +# HELP pve_vm_pressure_cpu_some_ratio CPU pressure (some) ratio. +# TYPE pve_vm_pressure_cpu_some_ratio gauge +pve_vm_pressure_cpu_some_ratio{id="qemu/112",node="node01"} 0.015 +# HELP pve_vm_pressure_io_full_ratio I/O pressure (full) ratio. +# TYPE pve_vm_pressure_io_full_ratio gauge +pve_vm_pressure_io_full_ratio{id="qemu/112",node="node01"} 0.001 +# HELP pve_vm_pressure_io_some_ratio I/O pressure (some) ratio. +# TYPE pve_vm_pressure_io_some_ratio gauge +pve_vm_pressure_io_some_ratio{id="qemu/112",node="node01"} 0.008 +# HELP pve_vm_pressure_memory_full_ratio Memory pressure (full) ratio. +# TYPE pve_vm_pressure_memory_full_ratio gauge +pve_vm_pressure_memory_full_ratio{id="qemu/112",node="node01"} 0 +# HELP pve_vm_pressure_memory_some_ratio Memory pressure (some) ratio. +# TYPE pve_vm_pressure_memory_some_ratio gauge +pve_vm_pressure_memory_some_ratio{id="qemu/112",node="node01"} 0.021 +` + + if err := testutil.GatherAndCompare(reg, strings.NewReader(expected), + "pve_vm_pressure_cpu_some_ratio", "pve_vm_pressure_cpu_full_ratio", + "pve_vm_pressure_memory_some_ratio", "pve_vm_pressure_memory_full_ratio", + "pve_vm_pressure_io_some_ratio", "pve_vm_pressure_io_full_ratio", + ); err != nil { + t.Errorf("unexpected metrics: %s", err) + } +} +``` + +- [ ] **Step 3: Run test to verify it fails** + +Run: `cd /home/user/git/pve-exporter && go test ./collector/ -run TestVMPressureCollector -v` +Expected: FAIL — `newVMPressureCollector` not defined. + +- [ ] **Step 4: Commit** + +```bash +git add collector/fixtures/node_qemu_pressure.json collector/vm_pressure_test.go +git commit -m "test: add vm_pressure collector test and fixture" +``` + +--- + +### Task 4: VM Pressure Collector — Implementation + +**Files:** +- Create: `collector/vm_pressure.go` + +- [ ] **Step 1: Implement the collector** + +Create `collector/vm_pressure.go`: + +```go +package collector + +import ( + "encoding/json" + "fmt" + "log/slog" + "sync" + + "github.com/prometheus/client_golang/prometheus" +) + +func init() { + registerCollector("vm_pressure", func(logger *slog.Logger) Collector { + return newVMPressureCollector(logger) + }) +} + +type vmPressureCollector struct { + logger *slog.Logger + mu sync.Mutex + nodes []string +} + +func newVMPressureCollector(logger *slog.Logger) *vmPressureCollector { + return &vmPressureCollector{logger: logger} +} + +func (c *vmPressureCollector) SetNodes(nodes []string) { + c.mu.Lock() + defer c.mu.Unlock() + c.nodes = nodes +} + +type vmPressureResponse struct { + Data []vmPressureEntry `json:"data"` +} + +type vmPressureEntry struct { + VMID int `json:"vmid"` + Status string `json:"status"` + PressureCPUSome float64 `json:"pressurecpusome"` + PressureCPUFull float64 `json:"pressurecpufull"` + PressureMemSome float64 `json:"pressurememorysome"` + PressureMemFull float64 `json:"pressurememoryfull"` + PressureIOSome float64 `json:"pressureiosome"` + PressureIOFull float64 `json:"pressureiofull"` +} + +var ( + vmPressureCPUSomeDesc = prometheus.NewDesc( + prometheus.BuildFQName(namespace, "vm_pressure", "cpu_some_ratio"), + "CPU pressure (some) ratio.", + []string{"id", "node"}, nil, + ) + vmPressureCPUFullDesc = prometheus.NewDesc( + prometheus.BuildFQName(namespace, "vm_pressure", "cpu_full_ratio"), + "CPU pressure (full) ratio.", + []string{"id", "node"}, nil, + ) + vmPressureMemSomeDesc = prometheus.NewDesc( + prometheus.BuildFQName(namespace, "vm_pressure", "memory_some_ratio"), + "Memory pressure (some) ratio.", + []string{"id", "node"}, nil, + ) + vmPressureMemFullDesc = prometheus.NewDesc( + prometheus.BuildFQName(namespace, "vm_pressure", "memory_full_ratio"), + "Memory pressure (full) ratio.", + []string{"id", "node"}, nil, + ) + vmPressureIOSomeDesc = prometheus.NewDesc( + prometheus.BuildFQName(namespace, "vm_pressure", "io_some_ratio"), + "I/O pressure (some) ratio.", + []string{"id", "node"}, nil, + ) + vmPressureIOFullDesc = prometheus.NewDesc( + prometheus.BuildFQName(namespace, "vm_pressure", "io_full_ratio"), + "I/O pressure (full) ratio.", + []string{"id", "node"}, nil, + ) +) + +func (c *vmPressureCollector) Update(client *Client, ch chan<- prometheus.Metric) error { + c.mu.Lock() + nodes := make([]string, len(c.nodes)) + copy(nodes, c.nodes) + c.mu.Unlock() + + var ( + wg sync.WaitGroup + errs []error + emu sync.Mutex + ) + + sem := make(chan struct{}, client.MaxConcurrent()) + + for _, node := range nodes { + wg.Add(1) + go func(node string) { + defer wg.Done() + sem <- struct{}{} + defer func() { <-sem }() + + if err := c.collectNode(client, ch, node); err != nil { + emu.Lock() + errs = append(errs, err) + emu.Unlock() + } + }(node) + } + wg.Wait() + + if len(errs) > 0 { + return fmt.Errorf("vm_pressure collection errors: %v", errs) + } + return nil +} + +func (c *vmPressureCollector) collectNode(client *Client, ch chan<- prometheus.Metric, node string) error { + body, err := client.Get(fmt.Sprintf("/nodes/%s/qemu", node)) + if err != nil { + return fmt.Errorf("failed to get qemu list for node %s: %w", node, err) + } + + var resp vmPressureResponse + if err := json.Unmarshal(body, &resp); err != nil { + return fmt.Errorf("failed to parse qemu list for node %s: %w", node, err) + } + + for _, vm := range resp.Data { + if vm.Status != "running" { + continue + } + + id := fmt.Sprintf("qemu/%d", vm.VMID) + + ch <- prometheus.MustNewConstMetric(vmPressureCPUSomeDesc, prometheus.GaugeValue, vm.PressureCPUSome/100, id, node) + ch <- prometheus.MustNewConstMetric(vmPressureCPUFullDesc, prometheus.GaugeValue, vm.PressureCPUFull/100, id, node) + ch <- prometheus.MustNewConstMetric(vmPressureMemSomeDesc, prometheus.GaugeValue, vm.PressureMemSome/100, id, node) + ch <- prometheus.MustNewConstMetric(vmPressureMemFullDesc, prometheus.GaugeValue, vm.PressureMemFull/100, id, node) + ch <- prometheus.MustNewConstMetric(vmPressureIOSomeDesc, prometheus.GaugeValue, vm.PressureIOSome/100, id, node) + ch <- prometheus.MustNewConstMetric(vmPressureIOFullDesc, prometheus.GaugeValue, vm.PressureIOFull/100, id, node) + } + + return nil +} +``` + +- [ ] **Step 2: Run tests** + +Run: `cd /home/user/git/pve-exporter && go test ./collector/ -run TestVMPressureCollector -v` +Expected: PASS + +- [ ] **Step 3: Run all tests** + +Run: `cd /home/user/git/pve-exporter && go test ./collector/ -v` +Expected: All tests PASS + +- [ ] **Step 4: Commit** + +```bash +git add collector/vm_pressure.go +git commit -m "feat: add vm_pressure collector (PSI cpu/memory/io for QEMU VMs)" +``` + +--- + +### Task 5: HA Status Collector — Test + Fixtures + +**Files:** +- Create: `collector/fixtures/ha_manager_status.json` +- Create: `collector/fixtures/ha_resources.json` +- Create: `collector/ha_status_test.go` + +- [ ] **Step 1: Create the manager_status fixture** + +Create `collector/fixtures/ha_manager_status.json`: + +```json +{"data":{"manager_status":{"master_node":"node03","node_status":{"node01":"online","node02":"online","node03":"online"},"service_status":{"vm:106":{"node":"node01","running":1,"state":"started"}}},"lrm_status":{"node01":{"mode":"active","state":"active","timestamp":1774016350},"node02":{"mode":"active","state":"wait_for_agent_lock","timestamp":1774016351},"node03":{"mode":"active","state":"wait_for_agent_lock","timestamp":1774016351}}}} +``` + +- [ ] **Step 2: Create the ha_resources fixture** + +Create `collector/fixtures/ha_resources.json`: + +```json +{"data":[{"sid":"vm:106","type":"vm","state":"started","max_restart":2,"max_relocate":2,"failback":1}]} +``` + +- [ ] **Step 3: Write the test** + +Create `collector/ha_status_test.go`: + +```go +package collector + +import ( + "log/slog" + "strings" + "testing" + + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/testutil" +) + +func TestHAStatusCollector(t *testing.T) { + client := newTestClient(t, map[string]string{ + "/cluster/ha/status/manager_status": "ha_manager_status.json", + "/cluster/ha/resources": "ha_resources.json", + }) + + collector := newHAStatusCollector(slog.Default()) + adapter := &testCollectorAdapter{client: client, collector: collector} + + reg := prometheus.NewRegistry() + reg.MustRegister(adapter) + + expected := ` +# HELP pve_ha_crm_master Whether a node is the CRM master. +# TYPE pve_ha_crm_master gauge +pve_ha_crm_master{node="node01"} 0 +pve_ha_crm_master{node="node02"} 0 +pve_ha_crm_master{node="node03"} 1 +# HELP pve_ha_lrm_mode LRM mode for a node. +# TYPE pve_ha_lrm_mode gauge +pve_ha_lrm_mode{mode="active",node="node01"} 1 +pve_ha_lrm_mode{mode="active",node="node02"} 1 +pve_ha_lrm_mode{mode="active",node="node03"} 1 +# HELP pve_ha_lrm_timestamp_seconds Last LRM heartbeat as Unix timestamp. +# TYPE pve_ha_lrm_timestamp_seconds gauge +pve_ha_lrm_timestamp_seconds{node="node01"} 1.77401635e+09 +pve_ha_lrm_timestamp_seconds{node="node02"} 1.774016351e+09 +pve_ha_lrm_timestamp_seconds{node="node03"} 1.774016351e+09 +# HELP pve_ha_node_status HA node status. +# TYPE pve_ha_node_status gauge +pve_ha_node_status{node="node01",status="online"} 1 +pve_ha_node_status{node="node02",status="online"} 1 +pve_ha_node_status{node="node03",status="online"} 1 +# HELP pve_ha_service_config HA service configuration. +# TYPE pve_ha_service_config gauge +pve_ha_service_config{failback="1",max_relocate="2",max_restart="2",sid="vm:106",type="vm"} 1 +# HELP pve_ha_service_status HA service runtime status. +# TYPE pve_ha_service_status gauge +pve_ha_service_status{node="node01",sid="vm:106",state="started"} 1 +` + + if err := testutil.GatherAndCompare(reg, strings.NewReader(expected), + "pve_ha_crm_master", "pve_ha_node_status", + "pve_ha_lrm_timestamp_seconds", "pve_ha_lrm_mode", + "pve_ha_service_config", "pve_ha_service_status", + ); err != nil { + t.Errorf("unexpected metrics: %s", err) + } +} +``` + +- [ ] **Step 4: Run test to verify it fails** + +Run: `cd /home/user/git/pve-exporter && go test ./collector/ -run TestHAStatusCollector -v` +Expected: FAIL — `newHAStatusCollector` not defined. + +- [ ] **Step 5: Commit** + +```bash +git add collector/fixtures/ha_manager_status.json collector/fixtures/ha_resources.json collector/ha_status_test.go +git commit -m "test: add ha_status collector test and fixtures" +``` + +--- + +### Task 6: HA Status Collector — Implementation + +**Files:** +- Create: `collector/ha_status.go` + +- [ ] **Step 1: Implement the collector** + +Create `collector/ha_status.go`: + +```go +package collector + +import ( + "encoding/json" + "fmt" + "log/slog" + "strconv" + + "github.com/prometheus/client_golang/prometheus" +) + +func init() { + registerCollector("ha_status", func(logger *slog.Logger) Collector { + return newHAStatusCollector(logger) + }) +} + +type haStatusCollector struct { + logger *slog.Logger +} + +func newHAStatusCollector(logger *slog.Logger) *haStatusCollector { + return &haStatusCollector{logger: logger} +} + +type haManagerStatusResponse struct { + Data haManagerStatusData `json:"data"` +} + +type haManagerStatusData struct { + ManagerStatus haManagerStatus `json:"manager_status"` + LRMStatus map[string]haLRMEntry `json:"lrm_status"` +} + +type haManagerStatus struct { + MasterNode string `json:"master_node"` + NodeStatus map[string]string `json:"node_status"` + ServiceStatus map[string]haServiceRuntime `json:"service_status"` +} + +type haLRMEntry struct { + Mode string `json:"mode"` + State string `json:"state"` + Timestamp float64 `json:"timestamp"` +} + +type haServiceRuntime struct { + Node string `json:"node"` + Running int `json:"running"` + State string `json:"state"` +} + +type haResourcesResponse struct { + Data []haResourceEntry `json:"data"` +} + +type haResourceEntry struct { + SID string `json:"sid"` + Type string `json:"type"` + State string `json:"state"` + MaxRestart int `json:"max_restart"` + MaxRelocate int `json:"max_relocate"` + Failback int `json:"failback"` +} + +var ( + haCRMMasterDesc = prometheus.NewDesc( + prometheus.BuildFQName(namespace, "ha", "crm_master"), + "Whether a node is the CRM master.", + []string{"node"}, nil, + ) + haNodeStatusDesc = prometheus.NewDesc( + prometheus.BuildFQName(namespace, "ha", "node_status"), + "HA node status.", + []string{"node", "status"}, nil, + ) + haLRMTimestampDesc = prometheus.NewDesc( + prometheus.BuildFQName(namespace, "ha", "lrm_timestamp_seconds"), + "Last LRM heartbeat as Unix timestamp.", + []string{"node"}, nil, + ) + haLRMModeDesc = prometheus.NewDesc( + prometheus.BuildFQName(namespace, "ha", "lrm_mode"), + "LRM mode for a node.", + []string{"node", "mode"}, nil, + ) + haServiceConfigDesc = prometheus.NewDesc( + prometheus.BuildFQName(namespace, "ha", "service_config"), + "HA service configuration.", + []string{"sid", "type", "max_restart", "max_relocate", "failback"}, nil, + ) + haServiceStatusDesc = prometheus.NewDesc( + prometheus.BuildFQName(namespace, "ha", "service_status"), + "HA service runtime status.", + []string{"sid", "node", "state"}, nil, + ) +) + +func (c *haStatusCollector) Update(client *Client, ch chan<- prometheus.Metric) error { + // Fetch manager status. + mgrBody, err := client.Get("/cluster/ha/status/manager_status") + if err != nil { + return fmt.Errorf("failed to get HA manager status: %w", err) + } + + var mgrResp haManagerStatusResponse + if err := json.Unmarshal(mgrBody, &mgrResp); err != nil { + return fmt.Errorf("failed to parse HA manager status: %w", err) + } + + mgr := mgrResp.Data.ManagerStatus + + // CRM master — emit for all nodes in node_status. + for node := range mgr.NodeStatus { + var val float64 + if node == mgr.MasterNode { + val = 1 + } + ch <- prometheus.MustNewConstMetric(haCRMMasterDesc, prometheus.GaugeValue, val, node) + } + + // Node status. + for node, status := range mgr.NodeStatus { + ch <- prometheus.MustNewConstMetric(haNodeStatusDesc, prometheus.GaugeValue, 1, node, status) + } + + // LRM status. + for node, lrm := range mgrResp.Data.LRMStatus { + ch <- prometheus.MustNewConstMetric(haLRMTimestampDesc, prometheus.GaugeValue, lrm.Timestamp, node) + ch <- prometheus.MustNewConstMetric(haLRMModeDesc, prometheus.GaugeValue, 1, node, lrm.Mode) + } + + // Service runtime status from manager_status. + for sid, svc := range mgr.ServiceStatus { + ch <- prometheus.MustNewConstMetric(haServiceStatusDesc, prometheus.GaugeValue, 1, sid, svc.Node, svc.State) + } + + // Fetch HA resources for service config. + resBody, err := client.Get("/cluster/ha/resources") + if err != nil { + return fmt.Errorf("failed to get HA resources: %w", err) + } + + var resResp haResourcesResponse + if err := json.Unmarshal(resBody, &resResp); err != nil { + return fmt.Errorf("failed to parse HA resources: %w", err) + } + + for _, res := range resResp.Data { + ch <- prometheus.MustNewConstMetric(haServiceConfigDesc, prometheus.GaugeValue, 1, + res.SID, res.Type, + strconv.Itoa(res.MaxRestart), strconv.Itoa(res.MaxRelocate), strconv.Itoa(res.Failback), + ) + } + + return nil +} +``` + +- [ ] **Step 2: Run tests** + +Run: `cd /home/user/git/pve-exporter && go test ./collector/ -run TestHAStatusCollector -v` +Expected: PASS + +- [ ] **Step 3: Run all tests** + +Run: `cd /home/user/git/pve-exporter && go test ./collector/ -v` +Expected: All tests PASS + +- [ ] **Step 4: Commit** + +```bash +git add collector/ha_status.go +git commit -m "feat: add ha_status collector (CRM master, node/LRM status, service config)" +``` + +--- + +### Task 7: Physical Disk Collector — Test + Fixture + +**Files:** +- Create: `collector/fixtures/node_disks.json` +- Create: `collector/physical_disk_test.go` + +- [ ] **Step 1: Create the test fixture** + +Create `collector/fixtures/node_disks.json` with one OSD disk, one multi-OSD disk, and one non-OSD disk with `"N/A"` wearout: + +```json +{"data":[{"devpath":"/dev/nvme0n1","health":"PASSED","wearout":95,"size":7681501126656,"model":"VV007680KYFFL","serial":"ADD3NA317I0104K2N","type":"nvme","used":"LVM","osdid":"8","osdid-list":["8"],"bluestore":1},{"devpath":"/dev/nvme1n1","health":"PASSED","wearout":88,"size":7681501126656,"model":"VV007680KYFFL","serial":"ADD3NA317I0104K2O","type":"nvme","used":"LVM","osdid":"9","osdid-list":["9","10"],"bluestore":1},{"devpath":"/dev/nvme4n1","health":"UNKNOWN","wearout":"N/A","size":480036519936,"model":"HPE NS204i-u","serial":"PXTYH0ARHJS080","type":"nvme","used":"BIOS boot","osdid":-1,"osdid-list":null}]} +``` + +- [ ] **Step 2: Write the test** + +Create `collector/physical_disk_test.go`: + +```go +package collector + +import ( + "log/slog" + "strings" + "testing" + + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/testutil" +) + +func TestPhysicalDiskCollector(t *testing.T) { + client := newTestClient(t, map[string]string{ + "/nodes/node01/disks/list": "node_disks.json", + }) + + collector := newPhysicalDiskCollector(slog.Default()) + collector.SetNodes([]string{"node01"}) + adapter := &testCollectorAdapter{client: client, collector: collector} + + reg := prometheus.NewRegistry() + reg.MustRegister(adapter) + + expected := ` +# HELP pve_physical_disk_health 1 if SMART health is PASSED, 0 otherwise. +# TYPE pve_physical_disk_health gauge +pve_physical_disk_health{devpath="/dev/nvme0n1",model="VV007680KYFFL",node="node01",serial="ADD3NA317I0104K2N",type="nvme"} 1 +pve_physical_disk_health{devpath="/dev/nvme1n1",model="VV007680KYFFL",node="node01",serial="ADD3NA317I0104K2O",type="nvme"} 1 +pve_physical_disk_health{devpath="/dev/nvme4n1",model="HPE NS204i-u",node="node01",serial="PXTYH0ARHJS080",type="nvme"} 0 +# HELP pve_physical_disk_info Physical disk information. +# TYPE pve_physical_disk_info gauge +pve_physical_disk_info{devpath="/dev/nvme0n1",model="VV007680KYFFL",node="node01",serial="ADD3NA317I0104K2N",type="nvme",used="LVM"} 1 +pve_physical_disk_info{devpath="/dev/nvme1n1",model="VV007680KYFFL",node="node01",serial="ADD3NA317I0104K2O",type="nvme",used="LVM"} 1 +pve_physical_disk_info{devpath="/dev/nvme4n1",model="HPE NS204i-u",node="node01",serial="PXTYH0ARHJS080",type="nvme",used="BIOS boot"} 1 +# HELP pve_physical_disk_osd Disk-to-OSD mapping. +# TYPE pve_physical_disk_osd gauge +pve_physical_disk_osd{devpath="/dev/nvme0n1",node="node01",osd="osd.8"} 1 +pve_physical_disk_osd{devpath="/dev/nvme1n1",node="node01",osd="osd.9"} 1 +pve_physical_disk_osd{devpath="/dev/nvme1n1",node="node01",osd="osd.10"} 1 +# HELP pve_physical_disk_size_bytes Physical disk size in bytes. +# TYPE pve_physical_disk_size_bytes gauge +pve_physical_disk_size_bytes{devpath="/dev/nvme0n1",node="node01"} 7.681501126656e+12 +pve_physical_disk_size_bytes{devpath="/dev/nvme1n1",node="node01"} 7.681501126656e+12 +pve_physical_disk_size_bytes{devpath="/dev/nvme4n1",node="node01"} 4.80036519936e+11 +# HELP pve_physical_disk_wearout_remaining_ratio Wearout remaining as a ratio (1.0 = new). +# TYPE pve_physical_disk_wearout_remaining_ratio gauge +pve_physical_disk_wearout_remaining_ratio{devpath="/dev/nvme0n1",node="node01"} 0.95 +pve_physical_disk_wearout_remaining_ratio{devpath="/dev/nvme1n1",node="node01"} 0.88 +` + + if err := testutil.GatherAndCompare(reg, strings.NewReader(expected), + "pve_physical_disk_health", "pve_physical_disk_wearout_remaining_ratio", + "pve_physical_disk_size_bytes", "pve_physical_disk_info", "pve_physical_disk_osd", + ); err != nil { + t.Errorf("unexpected metrics: %s", err) + } +} +``` + +- [ ] **Step 3: Run test to verify it fails** + +Run: `cd /home/user/git/pve-exporter && go test ./collector/ -run TestPhysicalDiskCollector -v` +Expected: FAIL — `newPhysicalDiskCollector` not defined. + +- [ ] **Step 4: Commit** + +```bash +git add collector/fixtures/node_disks.json collector/physical_disk_test.go +git commit -m "test: add physical_disk collector test and fixture" +``` + +--- + +### Task 8: Physical Disk Collector — Implementation + +**Files:** +- Create: `collector/physical_disk.go` + +- [ ] **Step 1: Implement the collector** + +Create `collector/physical_disk.go`: + +```go +package collector + +import ( + "encoding/json" + "fmt" + "log/slog" + "sync" + + "github.com/prometheus/client_golang/prometheus" +) + +func init() { + registerCollector("physical_disk", func(logger *slog.Logger) Collector { + return newPhysicalDiskCollector(logger) + }) +} + +type physicalDiskCollector struct { + logger *slog.Logger + mu sync.Mutex + nodes []string +} + +func newPhysicalDiskCollector(logger *slog.Logger) *physicalDiskCollector { + return &physicalDiskCollector{logger: logger} +} + +func (c *physicalDiskCollector) SetNodes(nodes []string) { + c.mu.Lock() + defer c.mu.Unlock() + c.nodes = nodes +} + +type diskListResponse struct { + Data []diskEntry `json:"data"` +} + +type diskEntry struct { + DevPath string `json:"devpath"` + Health string `json:"health"` + Wearout json.RawMessage `json:"wearout"` + Size float64 `json:"size"` + Model string `json:"model"` + Serial string `json:"serial"` + Type string `json:"type"` + Used string `json:"used"` + OSDList []string `json:"osdid-list"` +} + +var ( + physDiskHealthDesc = prometheus.NewDesc( + prometheus.BuildFQName(namespace, "physical_disk", "health"), + "1 if SMART health is PASSED, 0 otherwise.", + []string{"node", "devpath", "model", "serial", "type"}, nil, + ) + physDiskWearoutDesc = prometheus.NewDesc( + prometheus.BuildFQName(namespace, "physical_disk", "wearout_remaining_ratio"), + "Wearout remaining as a ratio (1.0 = new).", + []string{"node", "devpath"}, nil, + ) + physDiskSizeDesc = prometheus.NewDesc( + prometheus.BuildFQName(namespace, "physical_disk", "size_bytes"), + "Physical disk size in bytes.", + []string{"node", "devpath"}, nil, + ) + physDiskInfoDesc = prometheus.NewDesc( + prometheus.BuildFQName(namespace, "physical_disk", "info"), + "Physical disk information.", + []string{"node", "devpath", "model", "serial", "type", "used"}, nil, + ) + physDiskOSDDesc = prometheus.NewDesc( + prometheus.BuildFQName(namespace, "physical_disk", "osd"), + "Disk-to-OSD mapping.", + []string{"node", "devpath", "osd"}, nil, + ) +) + +func (c *physicalDiskCollector) Update(client *Client, ch chan<- prometheus.Metric) error { + c.mu.Lock() + nodes := make([]string, len(c.nodes)) + copy(nodes, c.nodes) + c.mu.Unlock() + + var ( + wg sync.WaitGroup + errs []error + emu sync.Mutex + ) + + sem := make(chan struct{}, client.MaxConcurrent()) + + for _, node := range nodes { + wg.Add(1) + go func(node string) { + defer wg.Done() + sem <- struct{}{} + defer func() { <-sem }() + + if err := c.collectNode(client, ch, node); err != nil { + emu.Lock() + errs = append(errs, err) + emu.Unlock() + } + }(node) + } + wg.Wait() + + if len(errs) > 0 { + return fmt.Errorf("physical_disk collection errors: %v", errs) + } + return nil +} + +func (c *physicalDiskCollector) collectNode(client *Client, ch chan<- prometheus.Metric, node string) error { + body, err := client.Get(fmt.Sprintf("/nodes/%s/disks/list", node)) + if err != nil { + return fmt.Errorf("failed to get disks for node %s: %w", node, err) + } + + var resp diskListResponse + if err := json.Unmarshal(body, &resp); err != nil { + return fmt.Errorf("failed to parse disks response for node %s: %w", node, err) + } + + for _, disk := range resp.Data { + // Health: 1 if PASSED, 0 otherwise. + var health float64 + if disk.Health == "PASSED" { + health = 1 + } + ch <- prometheus.MustNewConstMetric(physDiskHealthDesc, prometheus.GaugeValue, health, + node, disk.DevPath, disk.Model, disk.Serial, disk.Type) + + // Wearout: try to parse as number. Skip if "N/A" or not a number. + if len(disk.Wearout) > 0 { + var wearout float64 + if err := json.Unmarshal(disk.Wearout, &wearout); err == nil { + ch <- prometheus.MustNewConstMetric(physDiskWearoutDesc, prometheus.GaugeValue, wearout/100, node, disk.DevPath) + } + } + + // Size. + ch <- prometheus.MustNewConstMetric(physDiskSizeDesc, prometheus.GaugeValue, disk.Size, node, disk.DevPath) + + // Info. + ch <- prometheus.MustNewConstMetric(physDiskInfoDesc, prometheus.GaugeValue, 1, + node, disk.DevPath, disk.Model, disk.Serial, disk.Type, disk.Used) + + // OSD mapping. + for _, osdID := range disk.OSDList { + osd := fmt.Sprintf("osd.%s", osdID) + ch <- prometheus.MustNewConstMetric(physDiskOSDDesc, prometheus.GaugeValue, 1, node, disk.DevPath, osd) + } + } + + return nil +} +``` + +- [ ] **Step 2: Run tests** + +Run: `cd /home/user/git/pve-exporter && go test ./collector/ -run TestPhysicalDiskCollector -v` +Expected: PASS + +- [ ] **Step 3: Run all tests** + +Run: `cd /home/user/git/pve-exporter && go test ./collector/ -v` +Expected: All tests PASS + +- [ ] **Step 4: Commit** + +```bash +git add collector/physical_disk.go +git commit -m "feat: add physical_disk collector (health, wearout, size, OSD mapping)" +``` + +--- + +### Task 9: Update README + +**Files:** +- Modify: `README.md` + +- [ ] **Step 1: Add new metrics tables to README** + +Add sections for all 4 new collectors after the existing Subscription section, following the same table format. Each section should have the metrics table matching the spec. + +- [ ] **Step 2: Remove TODO section** + +Remove the entire "TODO: Future Metrics" section from the README since all planned metrics are now implemented (SDN, kernel version, and CPU model were excluded by design). + +- [ ] **Step 3: Run all tests one final time** + +Run: `cd /home/user/git/pve-exporter && go test ./collector/ -v` +Expected: All tests PASS + +- [ ] **Step 4: Build to verify** + +Run: `cd /home/user/git/pve-exporter && make build` +Expected: Binary builds successfully + +- [ ] **Step 5: Commit** + +```bash +git add README.md +git commit -m "docs: update README with new collector metrics, remove TODO section" +```