Skip to content

Commit f5b193c

Browse files
committed
remove driver upgrade label from nodes
This commit removes the driver upgrade label from nodes that don't have any driver pod running on them. Signed-off-by: Rahul Sharma <rahulsharm@nvidia.com>
1 parent 688e38d commit f5b193c

File tree

1 file changed

+79
-0
lines changed

1 file changed

+79
-0
lines changed

controllers/upgrade_controller.go

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -152,6 +152,13 @@ func (r *UpgradeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct
152152
return ctrl.Result{}, err
153153
}
154154

155+
// Clear stale upgrade labels from nodes that no longer have driver pods
156+
// Use the built state so we can avoid removing labels from nodes actively being upgraded
157+
if err := r.clearUpgradeLabelsWhereDriverNotRunning(ctx, state, driverLabel, clusterPolicyCtrl.operatorNamespace); err != nil {
158+
// Log the error but continue with the upgrade process, as this is a best-effort cleanup and should not block upgrades
159+
r.Log.Error(err, "Failed to clear stale upgrade labels")
160+
}
161+
155162
reqLogger.Info("Propagate state to state manager")
156163
reqLogger.V(consts.LogLevelDebug).Info("Current cluster upgrade state", "state", state)
157164

@@ -198,6 +205,78 @@ func (r *UpgradeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct
198205
return ctrl.Result{Requeue: true, RequeueAfter: plannedRequeueInterval}, nil
199206
}
200207

208+
// clearUpgradeLabelsWhereDriverNotRunning removes upgrade labels from nodes where driver pods are no longer scheduled.
209+
// This handles the case where a nodeSelector change causes pods to be terminated from certain nodes,
210+
// but the upgrade labels remain. It skips nodes that are actively being managed by the upgrade process.
211+
func (r *UpgradeReconciler) clearUpgradeLabelsWhereDriverNotRunning(ctx context.Context, state *upgrade.ClusterUpgradeState, driverLabel map[string]string, namespace string) error {
212+
upgradeStateLabel := upgrade.GetUpgradeStateLabelKey()
213+
214+
// Build a set of nodes being actively managed by the upgrade process first
215+
// This allows us to skip API calls if all labeled nodes are already managed
216+
managedNodes := make(map[string]bool)
217+
for _, nodeStates := range state.NodeStates {
218+
for _, nodeState := range nodeStates {
219+
if nodeState.Node != nil {
220+
managedNodes[nodeState.Node.Name] = true
221+
}
222+
}
223+
}
224+
225+
// List only nodes that have the upgrade label
226+
nodeList := &corev1.NodeList{}
227+
if err := r.List(ctx, nodeList, client.HasLabels{upgradeStateLabel}); err != nil {
228+
return fmt.Errorf("failed to list nodes with upgrade labels: %w", err)
229+
}
230+
231+
if len(nodeList.Items) == 0 {
232+
return nil
233+
}
234+
235+
// Filter out nodes being actively managed by upgrade process
236+
var nodesToCheck []corev1.Node
237+
for _, node := range nodeList.Items {
238+
if !managedNodes[node.Name] {
239+
nodesToCheck = append(nodesToCheck, node)
240+
}
241+
}
242+
243+
if len(nodesToCheck) == 0 {
244+
return nil
245+
}
246+
247+
// List driver pods only if we have nodes to check (optimization)
248+
podList := &corev1.PodList{}
249+
if err := r.List(ctx, podList, client.InNamespace(namespace), client.MatchingLabels(driverLabel)); err != nil {
250+
return fmt.Errorf("failed to list driver pods: %w", err)
251+
}
252+
253+
// Create a set of nodes that have driver pods (any driver pods)
254+
nodesWithPods := make(map[string]bool)
255+
for _, pod := range podList.Items {
256+
if pod.Spec.NodeName != "" {
257+
nodesWithPods[pod.Spec.NodeName] = true
258+
}
259+
}
260+
261+
// Clear upgrade label from nodes that don't have driver pods
262+
for i := range nodesToCheck {
263+
node := &nodesToCheck[i]
264+
if _, hasDriverPod := nodesWithPods[node.Name]; !hasDriverPod {
265+
r.Log.Info("Clearing stale upgrade label from node", "node", node.Name)
266+
267+
nodeCopy := node.DeepCopy()
268+
delete(node.Labels, upgradeStateLabel)
269+
if err := r.Patch(ctx, node, client.MergeFrom(nodeCopy)); err != nil {
270+
r.Log.Error(err, "Failed to clear upgrade label from node", "node", node.Name)
271+
// Continue with other nodes even if one fails
272+
continue
273+
}
274+
}
275+
}
276+
277+
return nil
278+
}
279+
201280
// removeNodeUpgradeStateLabels loops over nodes in the cluster and removes the "nvidia.com/gpu-driver-upgrade-state" label.
202281
// It is used for cleanup when autoUpgrade feature gets disabled
203282
func (r *UpgradeReconciler) removeNodeUpgradeStateLabels(ctx context.Context) error {

0 commit comments

Comments
 (0)