From 0a1dc905efcd5e5dc3f99ddf846f1ac55a620e22 Mon Sep 17 00:00:00 2001 From: Mateusz Kowalski Date: Mon, 15 Jun 2026 14:32:13 +0200 Subject: [PATCH] Increase machineset-controller startup probe failure threshold MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit During major upgrades on Single Node OpenShift (SNO), the machineset-controller container repeatedly fails its startup probe and enters CrashLoopBackOff. On SNO, all pods restart simultaneously on a single node (~1000 containers), overloading the API server. The machineset-controller needs API server round-trips for cache sync and leader election, which exceed the current 5-minute startup probe window (30 failures × 10s period). Increase FailureThreshold from 30 to 60, extending the startup probe tolerance from 5 to 10 minutes. This gives the machineset-controller enough time to complete cache sync under heavy API server load without entering CrashLoopBackOff. Bug: https://redhat.atlassian.net/browse/OCPBUGS-88561 Signed-off-by: Mateusz Kowalski Generated-by: AI --- pkg/operator/sync.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/operator/sync.go b/pkg/operator/sync.go index 986f86d91..2c9026094 100644 --- a/pkg/operator/sync.go +++ b/pkg/operator/sync.go @@ -730,7 +730,7 @@ func newContainers(config *OperatorConfig, features map[string]bool, tlsArgs []s StartupProbe: &corev1.Probe{ PeriodSeconds: 10, TimeoutSeconds: 10, - FailureThreshold: 30, + FailureThreshold: 60, SuccessThreshold: 1, InitialDelaySeconds: 0, ProbeHandler: corev1.ProbeHandler{