@@ -152,6 +152,14 @@ func (r *UpgradeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct
152152 return ctrl.Result {}, err
153153 }
154154
155+ // Clear stale upgrade labels from nodes that no longer have driver pods
156+ // Use the built state to avoid duplicate API calls and to skip nodes actively being upgraded
157+ // This is best-effort cleanup that runs every reconciliation
158+ if err := r .clearUpgradeLabelsWhereDriverNotRunning (ctx , state , driverLabel , clusterPolicyCtrl .operatorNamespace ); err != nil {
159+ // Log the error but continue with the upgrade process, as this is a best-effort cleanup and should not block upgrades
160+ r .Log .Error (err , "Failed to clear stale upgrade labels" )
161+ }
162+
155163 reqLogger .Info ("Propagate state to state manager" )
156164 reqLogger .V (consts .LogLevelDebug ).Info ("Current cluster upgrade state" , "state" , state )
157165
@@ -198,6 +206,108 @@ func (r *UpgradeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct
198206 return ctrl.Result {Requeue : true , RequeueAfter : plannedRequeueInterval }, nil
199207}
200208
209+ // clearUpgradeLabelsWhereDriverNotRunning removes upgrade labels from nodes where driver pods are no longer scheduled.
210+ // This handles the case where a nodeSelector change causes pods to be terminated from certain nodes,
211+ // but the upgrade labels remain. It skips nodes that are actively being managed by the upgrade process.
212+ func (r * UpgradeReconciler ) clearUpgradeLabelsWhereDriverNotRunning (ctx context.Context , state * upgrade.ClusterUpgradeState , driverLabel map [string ]string , namespace string ) error {
213+ upgradeStateLabel := upgrade .GetUpgradeStateLabelKey ()
214+
215+ // Build a set of nodes being actively managed by the upgrade process first
216+ managedNodes := make (map [string ]bool )
217+ for _ , nodeStates := range state .NodeStates {
218+ for _ , nodeState := range nodeStates {
219+ if nodeState .Node != nil {
220+ managedNodes [nodeState .Node .Name ] = true
221+ }
222+ }
223+ }
224+
225+ // List only nodes that have the upgrade label
226+ nodeList := & corev1.NodeList {}
227+ if err := r .List (ctx , nodeList , client.HasLabels {upgradeStateLabel }); err != nil {
228+ return fmt .Errorf ("failed to list nodes with upgrade labels: %w" , err )
229+ }
230+
231+ if len (nodeList .Items ) == 0 {
232+ return nil
233+ }
234+
235+ // Filter out nodes being actively managed by upgrade process
236+ var nodesToCheck []corev1.Node
237+ for _ , node := range nodeList .Items {
238+ if ! managedNodes [node .Name ] {
239+ nodesToCheck = append (nodesToCheck , node )
240+ }
241+ }
242+
243+ if len (nodesToCheck ) == 0 {
244+ return nil
245+ }
246+
247+ // List driver DaemonSets to check which nodes should have pods
248+ // This protects against removing labels during pod recreation (e.g., during rolling updates)
249+ dsList := & appsv1.DaemonSetList {}
250+ if err := r .List (ctx , dsList , client .InNamespace (namespace ), client .MatchingLabels (driverLabel )); err != nil {
251+ return fmt .Errorf ("failed to list driver DaemonSets: %w" , err )
252+ }
253+
254+ // Build a set of nodes that DaemonSets would schedule to
255+ // Check against ALL DaemonSets to protect nodes during pod recreation (rolling updates)
256+ nodesShouldHavePods := make (map [string ]bool )
257+ for _ , ds := range dsList .Items {
258+ // DaemonSet nodeSelector is a map[string]string in spec.template.spec.nodeSelector
259+ nodeSelector := ds .Spec .Template .Spec .NodeSelector
260+ selector := labels .SelectorFromSet (nodeSelector )
261+
262+ for _ , node := range nodesToCheck {
263+ // Check if this DaemonSet would schedule to this node
264+ if selector .Matches (labels .Set (node .Labels )) {
265+ nodesShouldHavePods [node .Name ] = true
266+ }
267+ }
268+ }
269+
270+ // Extract node-to-pod mapping from the already-built state (avoids duplicate API call)
271+ // The state was built by BuildState() which already listed all driver pods
272+ nodesWithPods := make (map [string ]bool )
273+ for _ , nodeStates := range state .NodeStates {
274+ for _ , nodeState := range nodeStates {
275+ if nodeState .DriverPod != nil && nodeState .DriverPod .Spec .NodeName != "" {
276+ nodesWithPods [nodeState .DriverPod .Spec .NodeName ] = true
277+ }
278+ }
279+ }
280+
281+ // Clear upgrade label from nodes that don't have driver pods AND aren't targeted by any DaemonSet
282+ // The nodesShouldHavePods check protects against removing labels during pod recreation windows
283+ // (e.g., during rolling updates when old pod is deleted but new pod not yet scheduled)
284+ // Note: There is a small race condition window where a DaemonSet's nodeSelector is updated
285+ // after we list DaemonSets but before we patch the node. This is acceptable because:
286+ // 1. This is best-effort cleanup that runs every 2 minutes
287+ // 2. The primary use case (nodeSelector narrowing to exclude nodes) is correctly handled
288+ // 3. False negatives (not removing when we should) are safer than false positives
289+ for i := range nodesToCheck {
290+ node := & nodesToCheck [i ]
291+ hasDriverPod := nodesWithPods [node .Name ]
292+ shouldHaveDriverPod := nodesShouldHavePods [node .Name ]
293+
294+ // Only remove label if node has no pod AND no DaemonSet intends to schedule there
295+ if ! hasDriverPod && ! shouldHaveDriverPod {
296+ r .Log .Info ("Clearing stale upgrade label from node" , "node" , node .Name )
297+
298+ nodeCopy := node .DeepCopy ()
299+ delete (node .Labels , upgradeStateLabel )
300+ if err := r .Patch (ctx , node , client .MergeFrom (nodeCopy )); err != nil {
301+ r .Log .Error (err , "Failed to clear upgrade label from node" , "node" , node .Name )
302+ // Continue with other nodes even if one fails
303+ continue
304+ }
305+ }
306+ }
307+
308+ return nil
309+ }
310+
201311// removeNodeUpgradeStateLabels loops over nodes in the cluster and removes "nvidia.com/gpu-driver-upgrade-state"
202312// It is used for cleanup when autoUpgrade feature gets disabled
203313func (r * UpgradeReconciler ) removeNodeUpgradeStateLabels (ctx context.Context ) error {
0 commit comments