fix: resolve deadlock in node_config collector causing request exhaustion

Each per-node outer goroutine acquired a semaphore slot and held it while collectNode spawned inner goroutines that needed slots from the same semaphore. With maxConc=5 and 5 or more nodes, the outer goroutines consumed every slot, the inner goroutines blocked forever, and Collect() never returned — permanently consuming an HTTP MaxRequestsInFlight slot until the server stopped responding.

Remove the redundant outer semaphore acquire (the inner goroutines already manage their own slots) and add a 120s HTTP timeout as defense-in-depth.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-24 11:30:54 +00:00 · 2026-03-24 11:30:54 +00:00 · 3bad7963af
commit 3bad7963af
parent 5e066a5c4b
2 changed files with 12 additions and 9 deletions
--- a/collector/node_config.go
+++ b/collector/node_config.go
@@ -75,8 +75,6 @@ func (c *nodeConfigCollector) Update(client *Client, ch chan<- prometheus.Metric
 		wg.Add(1)
 		go func(node string) {
 			defer wg.Done()
-			sem <- struct{}{}
-			defer func() { <-sem }()

 			if err := c.collectNode(client, ch, node, sem); err != nil {
 				emu.Lock()
--- a/main.go
+++ b/main.go
@@ -6,6 +6,7 @@ import (
 	"net/http"
 	"os"
 	"strings"
+	"time"

 	"github.com/alecthomas/kingpin/v2"
 	"github.com/prometheus/client_golang/prometheus"
@@ -67,13 +68,17 @@ func main() {
 	registry.MustRegister(versioncollector.NewCollector("pve_exporter"))
 	registry.MustRegister(pveCollector)

-	http.Handle(*metricsPath, promhttp.HandlerFor(
+	http.Handle(*metricsPath, http.TimeoutHandler(
+		promhttp.HandlerFor(
 			registry,
 			promhttp.HandlerOpts{
 				ErrorLog:            slog.NewLogLogger(logger.Handler(), slog.LevelError),
 				ErrorHandling:       promhttp.ContinueOnError,
 				MaxRequestsInFlight: 5,
 			},
+		),
+		120*time.Second,
+		"Scrape timed out",
 	))

 	if *metricsPath != "/" {