fix: resolve deadlock in node_config collector causing request exhaustion

Each per-node outer goroutine acquired a semaphore slot and held it while
collectNode spawned inner goroutines that needed slots from the same
semaphore. With maxConc=5 and 5+ nodes, all slots were consumed by outer
goroutines, the inner goroutines blocked forever, and Collect() never
returned. Each stuck scrape permanently consumed an HTTP MaxRequestsInFlight
slot, until the server stopped responding.

Remove the redundant outer semaphore acquire (inner goroutines already manage
their own slots) and add a 120s HTTP timeout as defense-in-depth.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Davíð Steinn Geirsson 2026-03-24 11:30:54 +00:00
parent 5e066a5c4b
commit 3bad7963af
2 changed files with 12 additions and 9 deletions

View file

@ -75,8 +75,6 @@ func (c *nodeConfigCollector) Update(client *Client, ch chan<- prometheus.Metric
wg.Add(1)
go func(node string) {
defer wg.Done()
sem <- struct{}{}
defer func() { <-sem }()
if err := c.collectNode(client, ch, node, sem); err != nil {
emu.Lock()

View file

@ -6,6 +6,7 @@ import (
"net/http"
"os"
"strings"
"time"
"github.com/alecthomas/kingpin/v2"
"github.com/prometheus/client_golang/prometheus"
@ -67,13 +68,17 @@ func main() {
registry.MustRegister(versioncollector.NewCollector("pve_exporter"))
registry.MustRegister(pveCollector)
http.Handle(*metricsPath, promhttp.HandlerFor(
http.Handle(*metricsPath, http.TimeoutHandler(
promhttp.HandlerFor(
registry,
promhttp.HandlerOpts{
ErrorLog: slog.NewLogLogger(logger.Handler(), slog.LevelError),
ErrorHandling: promhttp.ContinueOnError,
MaxRequestsInFlight: 5,
},
),
120*time.Second,
"Scrape timed out",
))
if *metricsPath != "/" {