Skip to content

Commit 604eaa6

Browse files
committed
remove driver upgrade label from nodes
this commit removes driver upgrade label from nodes which don't have any driver pod running on them Signed-off-by: Rahul Sharma <rahulsharm@nvidia.com>
1 parent 688e38d commit 604eaa6

File tree

2 files changed

+550
-0
lines changed

2 files changed

+550
-0
lines changed

controllers/upgrade_controller.go

Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -152,6 +152,14 @@ func (r *UpgradeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct
152152
return ctrl.Result{}, err
153153
}
154154

155+
// Clear stale upgrade labels from nodes that no longer have driver pods
156+
// Use the built state to avoid duplicate API calls and to skip nodes actively being upgraded
157+
// This is best-effort cleanup that runs every reconciliation
158+
if err := r.clearUpgradeLabelsWhereDriverNotRunning(ctx, state, driverLabel, clusterPolicyCtrl.operatorNamespace); err != nil {
159+
// Log the error but continue with the upgrade process, as this is a best-effort cleanup and should not block upgrades
160+
r.Log.Error(err, "Failed to clear stale upgrade labels")
161+
}
162+
155163
reqLogger.Info("Propagate state to state manager")
156164
reqLogger.V(consts.LogLevelDebug).Info("Current cluster upgrade state", "state", state)
157165

@@ -198,6 +206,108 @@ func (r *UpgradeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct
198206
return ctrl.Result{Requeue: true, RequeueAfter: plannedRequeueInterval}, nil
199207
}
200208

209+
// clearUpgradeLabelsWhereDriverNotRunning removes upgrade labels from nodes where driver pods are no longer scheduled.
210+
// This handles the case where a nodeSelector change causes pods to be terminated from certain nodes,
211+
// but the upgrade labels remain. It skips nodes that are actively being managed by the upgrade process.
212+
func (r *UpgradeReconciler) clearUpgradeLabelsWhereDriverNotRunning(ctx context.Context, state *upgrade.ClusterUpgradeState, driverLabel map[string]string, namespace string) error {
213+
upgradeStateLabel := upgrade.GetUpgradeStateLabelKey()
214+
215+
// Build a set of nodes being actively managed by the upgrade process first
216+
managedNodes := make(map[string]bool)
217+
for _, nodeStates := range state.NodeStates {
218+
for _, nodeState := range nodeStates {
219+
if nodeState.Node != nil {
220+
managedNodes[nodeState.Node.Name] = true
221+
}
222+
}
223+
}
224+
225+
// List only nodes that have the upgrade label
226+
nodeList := &corev1.NodeList{}
227+
if err := r.List(ctx, nodeList, client.HasLabels{upgradeStateLabel}); err != nil {
228+
return fmt.Errorf("failed to list nodes with upgrade labels: %w", err)
229+
}
230+
231+
if len(nodeList.Items) == 0 {
232+
return nil
233+
}
234+
235+
// Filter out nodes being actively managed by upgrade process
236+
var nodesToCheck []corev1.Node
237+
for _, node := range nodeList.Items {
238+
if !managedNodes[node.Name] {
239+
nodesToCheck = append(nodesToCheck, node)
240+
}
241+
}
242+
243+
if len(nodesToCheck) == 0 {
244+
return nil
245+
}
246+
247+
// List driver DaemonSets to check which nodes should have pods
248+
// This protects against removing labels during pod recreation (e.g., during rolling updates)
249+
dsList := &appsv1.DaemonSetList{}
250+
if err := r.List(ctx, dsList, client.InNamespace(namespace), client.MatchingLabels(driverLabel)); err != nil {
251+
return fmt.Errorf("failed to list driver DaemonSets: %w", err)
252+
}
253+
254+
// Build a set of nodes that DaemonSets would schedule to
255+
// Check against ALL DaemonSets to protect nodes during pod recreation (rolling updates)
256+
nodesShouldHavePods := make(map[string]bool)
257+
for _, ds := range dsList.Items {
258+
// DaemonSet nodeSelector is a map[string]string in spec.template.spec.nodeSelector
259+
nodeSelector := ds.Spec.Template.Spec.NodeSelector
260+
selector := labels.SelectorFromSet(nodeSelector)
261+
262+
for _, node := range nodesToCheck {
263+
// Check if this DaemonSet would schedule to this node
264+
if selector.Matches(labels.Set(node.Labels)) {
265+
nodesShouldHavePods[node.Name] = true
266+
}
267+
}
268+
}
269+
270+
// Extract node-to-pod mapping from the already-built state (avoids duplicate API call)
271+
// The state was built by BuildState() which already listed all driver pods
272+
nodesWithPods := make(map[string]bool)
273+
for _, nodeStates := range state.NodeStates {
274+
for _, nodeState := range nodeStates {
275+
if nodeState.DriverPod != nil && nodeState.DriverPod.Spec.NodeName != "" {
276+
nodesWithPods[nodeState.DriverPod.Spec.NodeName] = true
277+
}
278+
}
279+
}
280+
281+
// Clear upgrade label from nodes that don't have driver pods AND aren't targeted by any DaemonSet
282+
// The nodesShouldHavePods check protects against removing labels during pod recreation windows
283+
// (e.g., during rolling updates when old pod is deleted but new pod not yet scheduled)
284+
// Note: There is a small race condition window where a DaemonSet's nodeSelector is updated
285+
// after we list DaemonSets but before we patch the node. This is acceptable because:
286+
// 1. This is best-effort cleanup that runs every 2 minutes
287+
// 2. The primary use case (nodeSelector narrowing to exclude nodes) is correctly handled
288+
// 3. False negatives (not removing when we should) are safer than false positives
289+
for i := range nodesToCheck {
290+
node := &nodesToCheck[i]
291+
hasDriverPod := nodesWithPods[node.Name]
292+
shouldHaveDriverPod := nodesShouldHavePods[node.Name]
293+
294+
// Only remove label if node has no pod AND no DaemonSet intends to schedule there
295+
if !hasDriverPod && !shouldHaveDriverPod {
296+
r.Log.Info("Clearing stale upgrade label from node", "node", node.Name)
297+
298+
nodeCopy := node.DeepCopy()
299+
delete(node.Labels, upgradeStateLabel)
300+
if err := r.Patch(ctx, node, client.MergeFrom(nodeCopy)); err != nil {
301+
r.Log.Error(err, "Failed to clear upgrade label from node", "node", node.Name)
302+
// Continue with other nodes even if one fails
303+
continue
304+
}
305+
}
306+
}
307+
308+
return nil
309+
}
310+
201311
// removeNodeUpgradeStateLabels loops over nodes in the cluster and removes "nvidia.com/gpu-driver-upgrade-state"
202312
// It is used for cleanup when autoUpgrade feature gets disabled
203313
func (r *UpgradeReconciler) removeNodeUpgradeStateLabels(ctx context.Context) error {

0 commit comments

Comments
 (0)