@@ -152,6 +152,13 @@ func (r *UpgradeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct
152152 return ctrl.Result {}, err
153153 }
154154
155+ // Clear stale upgrade labels from nodes that no longer have driver pods
156+ // Use the built state so we can avoid removing labels from nodes actively being upgraded
157+ if err := r .clearUpgradeLabelsWhereDriverNotRunning (ctx , state , driverLabel , clusterPolicyCtrl .operatorNamespace ); err != nil {
158+ // Log the error but continue with the upgrade process, as this is a best-effort cleanup and should not block upgrades
159+ r .Log .Error (err , "Failed to clear stale upgrade labels" )
160+ }
161+
155162 reqLogger .Info ("Propagate state to state manager" )
156163 reqLogger .V (consts .LogLevelDebug ).Info ("Current cluster upgrade state" , "state" , state )
157164
@@ -198,6 +205,78 @@ func (r *UpgradeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct
198205 return ctrl.Result {Requeue : true , RequeueAfter : plannedRequeueInterval }, nil
199206}
200207
208+ // clearUpgradeLabelsWhereDriverNotRunning removes upgrade labels from nodes where driver pods are no longer scheduled.
209+ // This handles the case where a nodeSelector change causes pods to be terminated from certain nodes,
210+ // but the upgrade labels remain. It skips nodes that are actively being managed by the upgrade process.
211+ func (r * UpgradeReconciler ) clearUpgradeLabelsWhereDriverNotRunning (ctx context.Context , state * upgrade.ClusterUpgradeState , driverLabel map [string ]string , namespace string ) error {
212+ upgradeStateLabel := upgrade .GetUpgradeStateLabelKey ()
213+
214+ // Build a set of nodes being actively managed by the upgrade process first
215+ // This allows us to skip API calls if all labeled nodes are already managed
216+ managedNodes := make (map [string ]bool )
217+ for _ , nodeStates := range state .NodeStates {
218+ for _ , nodeState := range nodeStates {
219+ if nodeState .Node != nil {
220+ managedNodes [nodeState .Node .Name ] = true
221+ }
222+ }
223+ }
224+
225+ // List only nodes that have the upgrade label
226+ nodeList := & corev1.NodeList {}
227+ if err := r .List (ctx , nodeList , client.HasLabels {upgradeStateLabel }); err != nil {
228+ return fmt .Errorf ("failed to list nodes with upgrade labels: %w" , err )
229+ }
230+
231+ if len (nodeList .Items ) == 0 {
232+ return nil
233+ }
234+
235+ // Filter out nodes being actively managed by upgrade process
236+ var nodesToCheck []corev1.Node
237+ for _ , node := range nodeList .Items {
238+ if ! managedNodes [node .Name ] {
239+ nodesToCheck = append (nodesToCheck , node )
240+ }
241+ }
242+
243+ if len (nodesToCheck ) == 0 {
244+ return nil
245+ }
246+
247+ // List driver pods only if we have nodes to check (optimization)
248+ podList := & corev1.PodList {}
249+ if err := r .List (ctx , podList , client .InNamespace (namespace ), client .MatchingLabels (driverLabel )); err != nil {
250+ return fmt .Errorf ("failed to list driver pods: %w" , err )
251+ }
252+
253+ // Create a set of nodes that have driver pods (any driver pods)
254+ nodesWithPods := make (map [string ]bool )
255+ for _ , pod := range podList .Items {
256+ if pod .Spec .NodeName != "" {
257+ nodesWithPods [pod .Spec .NodeName ] = true
258+ }
259+ }
260+
261+ // Clear upgrade label from nodes that don't have driver pods
262+ for i := range nodesToCheck {
263+ node := & nodesToCheck [i ]
264+ if _ , hasDriverPod := nodesWithPods [node .Name ]; ! hasDriverPod {
265+ r .Log .Info ("Clearing stale upgrade label from node" , "node" , node .Name )
266+
267+ nodeCopy := node .DeepCopy ()
268+ delete (node .Labels , upgradeStateLabel )
269+ if err := r .Patch (ctx , node , client .MergeFrom (nodeCopy )); err != nil {
270+ r .Log .Error (err , "Failed to clear upgrade label from node" , "node" , node .Name )
271+ // Continue with other nodes even if one fails
272+ continue
273+ }
274+ }
275+ }
276+
277+ return nil
278+ }
279+
201280// removeNodeUpgradeStateLabels loops over nodes in the cluster and removes "nvidia.com/gpu-driver-upgrade-state"
202281// It is used for cleanup when autoUpgrade feature gets disabled
203282func (r * UpgradeReconciler ) removeNodeUpgradeStateLabels (ctx context.Context ) error {
0 commit comments