From 66c5d29b5ac6cbe2769f4bcb6e9e7de201486129 Mon Sep 17 00:00:00 2001 From: Rahul Sharma Date: Thu, 11 Jun 2026 20:53:26 -0700 Subject: [PATCH] add support for default nvidiadriver and migration from clusterpolicy to nvidiadriver changes include: * added default nvidiadriver which doesn't conflict with other user-specified nvidiadrivers * added migration support from clusterpolicy to nvidiadriver * added/updated e2e tests for nvidiadriver workflow * use new field to mark a nvidiadriver as default instead of using a label * don't allow adding nodeselector to default nvidiadriver, it should cover entire cluster * move default NVIDIADriver ownership helpers into internal/nvidiadriver and re-use * requeue after node owner label changes so later reconcile steps read updated cache state * use patch instead of update for node label updates Signed-off-by: Rahul Sharma --- api/nvidia/v1/clusterpolicy_types.go | 5 + api/nvidia/v1/clusterpolicy_types_test.go | 76 ++++ api/nvidia/v1alpha1/nvidiadriver_types.go | 16 +- .../manifests/nvidia.com_nvidiadrivers.yaml | 19 +- .../crd/bases/nvidia.com_nvidiadrivers.yaml | 19 +- controllers/nvidiadriver_controller.go | 93 ++++- controllers/nvidiadriver_controller_test.go | 177 ++++++++ controllers/nvidiadriver_migration.go | 110 +++++ controllers/object_controls.go | 8 +- controllers/object_controls_test.go | 115 +++++ controllers/state_manager.go | 19 +- controllers/upgrade_controller.go | 10 +- .../crds/nvidia.com_nvidiadrivers.yaml | 19 +- .../gpu-operator/templates/clusterpolicy.yaml | 2 + .../gpu-operator/templates/nvidiadriver.yaml | 4 +- internal/consts/consts.go | 5 + internal/nvidiadriver/nvidiadriver.go | 143 +++++++ internal/nvidiadriver/nvidiadriver_test.go | 393 ++++++++++++++++++ internal/state/driver.go | 8 +- internal/state/driver_test.go | 27 ++ internal/state/nodepool.go | 18 +- internal/validator/validator.go | 24 +- internal/validator/validator_test.go | 162 +++++--- tests/scripts/checks.sh | 51 ++- tests/scripts/end-to-end-nvidia-driver.sh | 40 +- tests/scripts/end-to-end.sh | 13 +- .../migrate-clusterpolicy-to-nvidiadriver.sh | 193 +++++++++ tests/scripts/update-nvidiadriver.sh | 283 +++++++++++-- 28 files changed, 1928 insertions(+), 124 deletions(-) create mode 100644 api/nvidia/v1/clusterpolicy_types_test.go create mode 100644 controllers/nvidiadriver_migration.go create mode 100644 internal/nvidiadriver/nvidiadriver.go create mode 100644 internal/nvidiadriver/nvidiadriver_test.go create mode 100755 tests/scripts/migrate-clusterpolicy-to-nvidiadriver.sh diff --git a/api/nvidia/v1/clusterpolicy_types.go b/api/nvidia/v1/clusterpolicy_types.go index 12f8e9dcd..300b8f261 100644 --- a/api/nvidia/v1/clusterpolicy_types.go +++ b/api/nvidia/v1/clusterpolicy_types.go @@ -2183,6 +2183,11 @@ func (d *DriverSpec) UseNvidiaDriverCRDType() bool { return *d.UseNvidiaDriverCRD } +// IsNVIDIADriverCRDEnabled returns true if driver management is enabled through NVIDIADriver CRs. +func (d *DriverSpec) IsNVIDIADriverCRDEnabled() bool { + return d != nil && d.IsEnabled() && d.UseNvidiaDriverCRDType() +} + // UsePrecompiledDrivers returns true if driver install is enabled using pre-compiled modules func (d *DriverSpec) UsePrecompiledDrivers() bool { if d.UsePrecompiled == nil { diff --git a/api/nvidia/v1/clusterpolicy_types_test.go b/api/nvidia/v1/clusterpolicy_types_test.go new file mode 100644 index 000000000..ef66eed4c --- /dev/null +++ b/api/nvidia/v1/clusterpolicy_types_test.go @@ -0,0 +1,76 @@ +/* + * Copyright (c) NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package v1 + +import ( + "testing" + + "github.com/stretchr/testify/require" +) + +func TestDriverSpecIsNVIDIADriverCRDEnabled(t *testing.T) { + driverEnabled := true + driverDisabled := false + crdEnabled := true + crdDisabled := false + + tests := []struct { + name string + driverEnabled *bool + crdEnabled *bool + expected bool + }{ + { + name: "driver enabled by default and CRD enabled", + crdEnabled: &crdEnabled, + expected: true, + }, + { + name: "CRD disabled", + crdEnabled: &crdDisabled, + expected: false, + }, + { + name: "driver disabled", + driverEnabled: &driverDisabled, + crdEnabled: &crdEnabled, + expected: false, + }, + { + name: "driver explicitly enabled and CRD enabled", + driverEnabled: &driverEnabled, + crdEnabled: &crdEnabled, + expected: true, + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + driverSpec := DriverSpec{ + Enabled: tc.driverEnabled, + UseNvidiaDriverCRD: tc.crdEnabled, + } + + require.Equal(t, tc.expected, driverSpec.IsNVIDIADriverCRDEnabled()) + }) + } + + t.Run("nil driver spec", func(t *testing.T) { + var driverSpec *DriverSpec + require.False(t, driverSpec.IsNVIDIADriverCRDEnabled()) + }) +} diff --git a/api/nvidia/v1alpha1/nvidiadriver_types.go b/api/nvidia/v1alpha1/nvidiadriver_types.go index 613355ca6..0527d1efb 100644 --- a/api/nvidia/v1alpha1/nvidiadriver_types.go +++ b/api/nvidia/v1alpha1/nvidiadriver_types.go @@ -36,11 +36,19 @@ const ( // EDIT THIS FILE! THIS IS SCAFFOLDING FOR YOU TO OWN! // NOTE: json tags are required. Any new fields you add must have json tags for the fields to be serialized. -// NVIDIADriverSpec defines the desired state of NVIDIADriver +// NVIDIADriverSpec defines the desired state of NVIDIADriver. +// The CEL validation allows non-default drivers to use nodeSelector, but requires +// default drivers to leave nodeSelector unset or empty. +// +kubebuilder:validation:XValidation:rule="has(self.default) && self.default ? !has(self.nodeSelector) || size(self.nodeSelector) == 0 : true",message="default NVIDIADriver must not use nodeSelector" type NVIDIADriverSpec struct { // INSERT ADDITIONAL SPEC FIELDS - desired state of cluster // Important: Run "make" to regenerate code after modifying this file + // Default indicates that this NVIDIADriver acts as the fallback driver for GPU nodes + // that do not match any non-default NVIDIADriver nodeSelector. + // +kubebuilder:default=false + Default bool `json:"default"` + // +kubebuilder:validation:Enum=gpu;vgpu;vgpu-host-manager // +kubebuilder:default=gpu // +kubebuilder:validation:XValidation:rule="self == oldSelf",message="driverType is an immutable field. Please create a new NvidiaDriver resource instead when you want to change this setting." @@ -504,6 +512,7 @@ type NVIDIADriverStatus struct { //+kubebuilder:subresource:status //+kubebuilder:resource:scope=Cluster,shortName={"nvd","nvdriver","nvdrivers"} //+kubebuilder:printcolumn:name="Status",type=string,JSONPath=`.status.state`,priority=0 +//+kubebuilder:printcolumn:name="Default",type=boolean,JSONPath=`.spec.default`,priority=0 //+kubebuilder:printcolumn:name="Age",type=string,JSONPath=`.metadata.creationTimestamp`,priority=0 // NVIDIADriver is the Schema for the nvidiadrivers API @@ -524,6 +533,11 @@ type NVIDIADriverList struct { Items []NVIDIADriver `json:"items"` } +// IsDefault returns true when the NVIDIADriver is marked as the fallback driver. +func (d *NVIDIADriver) IsDefault() bool { + return d != nil && d.Spec.Default +} + // UsePrecompiledDrivers returns true if usePrecompiled option is enabled in spec func (d *NVIDIADriverSpec) UsePrecompiledDrivers() bool { if d.UsePrecompiled == nil { diff --git a/bundle/manifests/nvidia.com_nvidiadrivers.yaml b/bundle/manifests/nvidia.com_nvidiadrivers.yaml index 81c234157..02ebd2b8d 100644 --- a/bundle/manifests/nvidia.com_nvidiadrivers.yaml +++ b/bundle/manifests/nvidia.com_nvidiadrivers.yaml @@ -22,6 +22,9 @@ spec: - jsonPath: .status.state name: Status type: string + - jsonPath: .spec.default + name: Default + type: boolean - jsonPath: .metadata.creationTimestamp name: Age type: string @@ -48,7 +51,10 @@ spec: metadata: type: object spec: - description: NVIDIADriverSpec defines the desired state of NVIDIADriver + description: |- + NVIDIADriverSpec defines the desired state of NVIDIADriver. + The CEL validation allows non-default drivers to use nodeSelector, but requires + default drivers to leave nodeSelector unset or empty. properties: annotations: additionalProperties: @@ -70,6 +76,12 @@ spec: name: type: string type: object + default: + default: false + description: |- + Default indicates that this NVIDIADriver acts as the fallback driver for GPU nodes + that do not match any non-default NVIDIADriver nodeSelector. + type: boolean driverType: default: gpu description: DriverType defines NVIDIA driver type @@ -968,9 +980,14 @@ spec: type: string type: object required: + - default - driverType - image type: object + x-kubernetes-validations: + - message: default NVIDIADriver must not use nodeSelector + rule: 'has(self.default) && self.default ? !has(self.nodeSelector) || + size(self.nodeSelector) == 0 : true' status: description: NVIDIADriverStatus defines the observed state of NVIDIADriver properties: diff --git a/config/crd/bases/nvidia.com_nvidiadrivers.yaml b/config/crd/bases/nvidia.com_nvidiadrivers.yaml index 81c234157..02ebd2b8d 100644 --- a/config/crd/bases/nvidia.com_nvidiadrivers.yaml +++ b/config/crd/bases/nvidia.com_nvidiadrivers.yaml @@ -22,6 +22,9 @@ spec: - jsonPath: .status.state name: Status type: string + - jsonPath: .spec.default + name: Default + type: boolean - jsonPath: .metadata.creationTimestamp name: Age type: string @@ -48,7 +51,10 @@ spec: metadata: type: object spec: - description: NVIDIADriverSpec defines the desired state of NVIDIADriver + description: |- + NVIDIADriverSpec defines the desired state of NVIDIADriver. + The CEL validation allows non-default drivers to use nodeSelector, but requires + default drivers to leave nodeSelector unset or empty. properties: annotations: additionalProperties: @@ -70,6 +76,12 @@ spec: name: type: string type: object + default: + default: false + description: |- + Default indicates that this NVIDIADriver acts as the fallback driver for GPU nodes + that do not match any non-default NVIDIADriver nodeSelector. + type: boolean driverType: default: gpu description: DriverType defines NVIDIA driver type @@ -968,9 +980,14 @@ spec: type: string type: object required: + - default - driverType - image type: object + x-kubernetes-validations: + - message: default NVIDIADriver must not use nodeSelector + rule: 'has(self.default) && self.default ? !has(self.nodeSelector) || + size(self.nodeSelector) == 0 : true' status: description: NVIDIADriverStatus defines the observed state of NVIDIADriver properties: diff --git a/controllers/nvidiadriver_controller.go b/controllers/nvidiadriver_controller.go index 637b9cec9..84c98dd75 100644 --- a/controllers/nvidiadriver_controller.go +++ b/controllers/nvidiadriver_controller.go @@ -45,6 +45,7 @@ import ( "github.com/NVIDIA/gpu-operator/controllers/clusterinfo" "github.com/NVIDIA/gpu-operator/internal/conditions" "github.com/NVIDIA/gpu-operator/internal/consts" + nvidiadriverutil "github.com/NVIDIA/gpu-operator/internal/nvidiadriver" "github.com/NVIDIA/gpu-operator/internal/state" "github.com/NVIDIA/gpu-operator/internal/validator" ) @@ -83,9 +84,8 @@ func (r *NVIDIADriverReconciler) Reconcile(ctx context.Context, req ctrl.Request if err := r.Get(ctx, req.NamespacedName, instance); err != nil { if apierrors.IsNotFound(err) { // Request object not found, could have been deleted after reconcile request. - // Owned objects are automatically garbage collected. For additional cleanup logic use finalizers. - // Return and don't requeue - return reconcile.Result{}, nil + // Re-run owner assignment so deleting the last NVIDIADriver clears stale node owner labels. + return r.cleanupNVIDIADriverOwnerLabelsAndReturn(ctx) } wrappedErr := fmt.Errorf("error getting NVIDIADriver object: %w", err) logger.Error(err, "error getting NVIDIADriver object") @@ -96,6 +96,10 @@ func (r *NVIDIADriverReconciler) Reconcile(ctx context.Context, req ctrl.Request // Error reading the object - requeue the request. return reconcile.Result{}, wrappedErr } + if !instance.GetDeletionTimestamp().IsZero() { + logger.Info("NVIDIADriver delete requested; cleaning up owner labels") + return r.cleanupNVIDIADriverOwnerLabelsAndReturn(ctx) + } // Get the singleton NVIDIA ClusterPolicy object in the cluster. clusterPolicyList := &gpuv1.ClusterPolicyList{} @@ -121,7 +125,7 @@ func (r *NVIDIADriverReconciler) Reconcile(ctx context.Context, req ctrl.Request clusterPolicyInstance := clusterPolicyList.Items[0] // Ensure that ClusterPolicy is configured to use NVIDIADriver CRD - if !clusterPolicyInstance.Spec.Driver.UseNvidiaDriverCRDType() { + if !clusterPolicyInstance.Spec.Driver.IsNVIDIADriverCRDEnabled() { msg := "useNvidiaDriverCRD is not enabled in ClusterPolicy" logger.V(consts.LogLevelWarning).Info("NVIDIADriver reconciliation skipped", "reason", msg) instance.Status.State = nvidiav1alpha1.Disabled @@ -152,6 +156,19 @@ func (r *NVIDIADriverReconciler) Reconcile(ctx context.Context, req ctrl.Request return reconcile.Result{}, nil } + changed, err := nvidiadriverutil.AssignOwners(ctx, r.Client) + if err != nil { + logger.Error(err, "failed to assign NVIDIADriver owners to nodes") + instance.Status.State = nvidiav1alpha1.NotReady + if condErr := r.conditionUpdater.SetConditionsError(ctx, instance, conditions.ReconcileFailed, err.Error()); condErr != nil { + logger.Error(condErr, "failed to set condition") + } + return reconcile.Result{}, err + } + if changed { + return reconcile.Result{RequeueAfter: time.Second}, nil + } + if instance.Spec.UsePrecompiledDrivers() && (instance.Spec.IsGDSEnabled() || instance.Spec.IsGDRCopyEnabled()) { err := errors.New("GPUDirect Storage driver (nvidia-fs) and/or GDRCopy driver is not supported along with pre-compiled NVIDIA drivers") logger.Error(err, "unsupported driver combination detected") @@ -188,6 +205,11 @@ func (r *NVIDIADriverReconciler) Reconcile(ctx context.Context, req ctrl.Request // Sync state and update status managerStatus := r.stateManager.SyncState(ctx, instance, infoCatalog) + if err := r.labelNodesWithOrphanedDriverPods(ctx); err != nil { + logger.Error(err, "failed to label nodes with orphaned NVIDIA driver pods") + return reconcile.Result{}, err + } + // update CR status if err := r.updateCrStatus(ctx, instance, managerStatus); err != nil { return ctrl.Result{}, err @@ -276,6 +298,65 @@ func (r *NVIDIADriverReconciler) enqueueAllNVIDIADrivers(ctx context.Context) [] return reconcileRequests } +// enqueueNVIDIADriverReconcilers enqueues the NVIDIADriver that triggered the +// event and all current NVIDIADriver instances. The triggering object is +// included even for delete events so the NotFound reconcile path can clear +// stale node owner labels. +func (r *NVIDIADriverReconciler) enqueueNVIDIADriverReconcilers(ctx context.Context, driver *nvidiav1alpha1.NVIDIADriver) []reconcile.Request { + requests := r.enqueueAllNVIDIADrivers(ctx) + if driver != nil { + requests = append(requests, reconcile.Request{ + NamespacedName: types.NamespacedName{ + Name: driver.GetName(), + Namespace: driver.GetNamespace(), + }, + }) + } + return dedupeReconcileRequests(requests) +} + +func dedupeReconcileRequests(requests []reconcile.Request) []reconcile.Request { + seen := map[types.NamespacedName]struct{}{} + deduped := make([]reconcile.Request, 0, len(requests)) + for _, request := range requests { + if _, ok := seen[request.NamespacedName]; ok { + continue + } + seen[request.NamespacedName] = struct{}{} + deduped = append(deduped, request) + } + return deduped +} + +func (r *NVIDIADriverReconciler) cleanupNVIDIADriverOwnerLabelsAndReturn(ctx context.Context) (reconcile.Result, error) { + if err := r.cleanupNVIDIADriverOwnerLabels(ctx); err != nil { + log.FromContext(ctx).Error(err, "failed to cleanup NVIDIADriver owner labels") + return reconcile.Result{}, err + } + return reconcile.Result{}, nil +} + +// cleanupNVIDIADriverOwnerLabels re-runs owner assignment after a delete event +// when ClusterPolicy is still configured for NVIDIADriver mode. This lets +// AssignOwners remove stale nvidia.com/gpu-operator.driver.owner labels when no remaining +// NVIDIADriver matches a GPU node, including deletion of the last NVIDIADriver. +func (r *NVIDIADriverReconciler) cleanupNVIDIADriverOwnerLabels(ctx context.Context) error { + clusterPolicyList := &gpuv1.ClusterPolicyList{} + if err := r.List(ctx, clusterPolicyList); err != nil { + return fmt.Errorf("error getting ClusterPolicy list: %w", err) + } + + for i := range clusterPolicyList.Items { + if !clusterPolicyList.Items[i].Spec.Driver.IsNVIDIADriverCRDEnabled() { + continue + } + _, err := nvidiadriverutil.AssignOwners(ctx, r.Client) + return err + } + + return nil +} + // SetupWithManager sets up the controller with the Manager. func (r *NVIDIADriverReconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager) error { // Create state manager @@ -307,8 +388,8 @@ func (r *NVIDIADriverReconciler) SetupWithManager(ctx context.Context, mgr ctrl. // Watch for changes to NVIDIADriver CRs. Whenever an event is generated for a NVIDIADriver CR, // enqueue a reconcile request for all NVIDIADriver instances. - nvidiaDriverMapFn := func(ctx context.Context, _ *nvidiav1alpha1.NVIDIADriver) []reconcile.Request { - return r.enqueueAllNVIDIADrivers(ctx) + nvidiaDriverMapFn := func(ctx context.Context, driver *nvidiav1alpha1.NVIDIADriver) []reconcile.Request { + return r.enqueueNVIDIADriverReconcilers(ctx, driver) } // Watch for changes to the primary resource NVIDIADriver diff --git a/controllers/nvidiadriver_controller_test.go b/controllers/nvidiadriver_controller_test.go index 7630e7e4b..b82484fee 100644 --- a/controllers/nvidiadriver_controller_test.go +++ b/controllers/nvidiadriver_controller_test.go @@ -28,15 +28,18 @@ import ( "github.com/stretchr/testify/require" "go.uber.org/zap" "go.uber.org/zap/zapcore" + corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/types" "k8s.io/utils/ptr" ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/fake" gpuv1 "github.com/NVIDIA/gpu-operator/api/nvidia/v1" nvidiav1alpha1 "github.com/NVIDIA/gpu-operator/api/nvidia/v1alpha1" + "github.com/NVIDIA/gpu-operator/internal/consts" "github.com/NVIDIA/gpu-operator/internal/validator" ) @@ -70,6 +73,15 @@ func (f *FakeNodeSelectorValidator) Validate(ctx context.Context, cr *nvidiav1al return f.CustomError } +type patchFailingClient struct { + client.Client + patchErr error +} + +func (c *patchFailingClient) Patch(ctx context.Context, obj client.Object, patch client.Patch, opts ...client.PatchOption) error { + return c.patchErr +} + // newTestLogger creates a zap.Logger that writes to an in-memory buffer for testing func newTestLogger() (logr.Logger, *bytes.Buffer) { buf := &bytes.Buffer{} @@ -97,6 +109,7 @@ func TestReconcile(t *testing.T) { scheme := runtime.NewScheme() require.NoError(t, nvidiav1alpha1.AddToScheme(scheme)) require.NoError(t, gpuv1.AddToScheme(scheme)) + require.NoError(t, corev1.AddToScheme(scheme)) tests := []struct { name string @@ -247,6 +260,143 @@ func TestReconcileConflictSetsNotReadyState(t *testing.T) { require.Equal(t, nvidiav1alpha1.NotReady, updater.LastErrorState) } +func TestReconcileReturnsErrorWhenOwnerAssignmentFails(t *testing.T) { + scheme := runtime.NewScheme() + require.NoError(t, nvidiav1alpha1.AddToScheme(scheme)) + require.NoError(t, gpuv1.AddToScheme(scheme)) + require.NoError(t, corev1.AddToScheme(scheme)) + + driver := &nvidiav1alpha1.NVIDIADriver{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-driver", + Namespace: "default", + }, + Spec: nvidiav1alpha1.NVIDIADriverSpec{ + NodeSelector: map[string]string{"nodepool": "a"}, + }, + } + cp := &gpuv1.ClusterPolicy{ + ObjectMeta: metav1.ObjectMeta{Name: "default"}, + Spec: gpuv1.ClusterPolicySpec{ + Driver: gpuv1.DriverSpec{ + UseNvidiaDriverCRD: ptr.To(true), + }, + }, + } + node := &corev1.Node{ObjectMeta: metav1.ObjectMeta{ + Name: "gpu-node", + Labels: map[string]string{ + "nodepool": "a", + "nvidia.com/gpu.present": "true", + }, + }} + patchErr := errors.New("patch failed") + k8sClient := fake.NewClientBuilder().WithScheme(scheme).WithObjects(cp, driver, node).Build() + updater := &FakeConditionUpdater{} + + reconciler := &NVIDIADriverReconciler{ + Client: &patchFailingClient{Client: k8sClient, patchErr: patchErr}, + Scheme: scheme, + conditionUpdater: updater, + nodeSelectorValidator: &FakeNodeSelectorValidator{}, + } + + _, err := reconciler.Reconcile(context.Background(), ctrl.Request{ + NamespacedName: types.NamespacedName{ + Name: driver.Name, + Namespace: driver.Namespace, + }, + }) + + require.Error(t, err) + require.ErrorContains(t, err, patchErr.Error()) + require.Equal(t, nvidiav1alpha1.NotReady, updater.LastErrorState) +} + +func TestReconcileCleansOwnerLabelsWhenDeletedNVIDIADriverWasLast(t *testing.T) { + scheme := runtime.NewScheme() + require.NoError(t, nvidiav1alpha1.AddToScheme(scheme)) + require.NoError(t, gpuv1.AddToScheme(scheme)) + require.NoError(t, corev1.AddToScheme(scheme)) + + cp := &gpuv1.ClusterPolicy{ + ObjectMeta: metav1.ObjectMeta{Name: "default"}, + Spec: gpuv1.ClusterPolicySpec{ + Driver: gpuv1.DriverSpec{ + UseNvidiaDriverCRD: ptr.To(true), + }, + }, + } + node := &corev1.Node{ObjectMeta: metav1.ObjectMeta{ + Name: "gpu-node", + Labels: map[string]string{ + consts.GPUPresentLabel: "true", + consts.NVIDIADriverOwnerLabel: "demo-gold", + }, + }} + k8sClient := fake.NewClientBuilder().WithScheme(scheme).WithObjects(cp, node).Build() + reconciler := &NVIDIADriverReconciler{ + Client: k8sClient, + Scheme: scheme, + } + + _, err := reconciler.Reconcile(context.Background(), ctrl.Request{ + NamespacedName: types.NamespacedName{Name: "demo-gold"}, + }) + require.NoError(t, err) + + require.NoError(t, k8sClient.Get(context.Background(), types.NamespacedName{Name: "gpu-node"}, node)) + require.NotContains(t, node.Labels, consts.NVIDIADriverOwnerLabel) +} + +func TestReconcileCleansOwnerLabelsForTerminatingNVIDIADriver(t *testing.T) { + scheme := runtime.NewScheme() + require.NoError(t, nvidiav1alpha1.AddToScheme(scheme)) + require.NoError(t, gpuv1.AddToScheme(scheme)) + require.NoError(t, corev1.AddToScheme(scheme)) + + deleteTime := metav1.Now() + driver := &nvidiav1alpha1.NVIDIADriver{ + ObjectMeta: metav1.ObjectMeta{ + Name: "demo-gold", + DeletionTimestamp: &deleteTime, + Finalizers: []string{"test-finalizer"}, + }, + Spec: nvidiav1alpha1.NVIDIADriverSpec{ + NodeSelector: map[string]string{"nodepool": "gold"}, + }, + } + cp := &gpuv1.ClusterPolicy{ + ObjectMeta: metav1.ObjectMeta{Name: "default"}, + Spec: gpuv1.ClusterPolicySpec{ + Driver: gpuv1.DriverSpec{ + UseNvidiaDriverCRD: ptr.To(true), + }, + }, + } + node := &corev1.Node{ObjectMeta: metav1.ObjectMeta{ + Name: "gpu-node", + Labels: map[string]string{ + consts.GPUPresentLabel: "true", + consts.NVIDIADriverOwnerLabel: "demo-gold", + "nodepool": "gold", + }, + }} + k8sClient := fake.NewClientBuilder().WithScheme(scheme).WithObjects(cp, driver, node).Build() + reconciler := &NVIDIADriverReconciler{ + Client: k8sClient, + Scheme: scheme, + } + + _, err := reconciler.Reconcile(context.Background(), ctrl.Request{ + NamespacedName: types.NamespacedName{Name: "demo-gold"}, + }) + require.NoError(t, err) + + require.NoError(t, k8sClient.Get(context.Background(), types.NamespacedName{Name: "gpu-node"}, node)) + require.NotContains(t, node.Labels, consts.NVIDIADriverOwnerLabel) +} + func TestEnqueueAllNVIDIADrivers(t *testing.T) { scheme := runtime.NewScheme() require.NoError(t, nvidiav1alpha1.AddToScheme(scheme)) @@ -267,3 +417,30 @@ func TestEnqueueAllNVIDIADrivers(t *testing.T) { sort.Strings(got) require.Equal(t, []string{"default/driver-a", "default/driver-b"}, got) } + +func TestEnqueueNVIDIADriverReconcilersIncludesDeletedDriver(t *testing.T) { + scheme := runtime.NewScheme() + require.NoError(t, nvidiav1alpha1.AddToScheme(scheme)) + + client := fake.NewClientBuilder().WithScheme(scheme).Build() + reconciler := &NVIDIADriverReconciler{Client: client} + requests := reconciler.enqueueNVIDIADriverReconcilers(context.Background(), &nvidiav1alpha1.NVIDIADriver{ + ObjectMeta: metav1.ObjectMeta{Name: "deleted-driver", Namespace: "default"}, + }) + + require.Len(t, requests, 1) + require.Equal(t, "default/deleted-driver", requests[0].String()) +} + +func TestEnqueueNVIDIADriverReconcilersDedupesEventDriver(t *testing.T) { + scheme := runtime.NewScheme() + require.NoError(t, nvidiav1alpha1.AddToScheme(scheme)) + + driver := &nvidiav1alpha1.NVIDIADriver{ObjectMeta: metav1.ObjectMeta{Name: "driver-a", Namespace: "default"}} + client := fake.NewClientBuilder().WithScheme(scheme).WithObjects(driver).Build() + reconciler := &NVIDIADriverReconciler{Client: client} + requests := reconciler.enqueueNVIDIADriverReconcilers(context.Background(), driver) + + require.Len(t, requests, 1) + require.Equal(t, "default/driver-a", requests[0].String()) +} diff --git a/controllers/nvidiadriver_migration.go b/controllers/nvidiadriver_migration.go new file mode 100644 index 000000000..295ef5955 --- /dev/null +++ b/controllers/nvidiadriver_migration.go @@ -0,0 +1,110 @@ +/** +# Copyright (c) NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +**/ + +package controllers + +import ( + "context" + "fmt" + + "github.com/NVIDIA/k8s-operator-libs/pkg/upgrade" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/log" + + nvidiav1alpha1 "github.com/NVIDIA/gpu-operator/api/nvidia/v1alpha1" + "github.com/NVIDIA/gpu-operator/internal/consts" +) + +// labelNodesWithOrphanedDriverPods marks NVIDIADriver-owned nodes that still have orphaned +// ClusterPolicy driver pods so the driver upgrade controller can replace those pods in +// the normal controlled upgrade flow. +func (r *NVIDIADriverReconciler) labelNodesWithOrphanedDriverPods(ctx context.Context) error { + nvidiaDrivers := &nvidiav1alpha1.NVIDIADriverList{} + if err := r.List(ctx, nvidiaDrivers); err != nil { + return fmt.Errorf("failed to list NVIDIADriver CRs: %w", err) + } + if len(nvidiaDrivers.Items) == 0 { + return nil + } + + pods := &corev1.PodList{} + if err := r.List(ctx, pods, + client.InNamespace(r.Namespace), + client.MatchingLabels{AppComponentLabelKey: DriverAppComponentLabelValue}, + ); err != nil { + return fmt.Errorf("failed to list NVIDIA driver pods: %w", err) + } + + for i := range pods.Items { + pod := &pods.Items[i] + if len(pod.OwnerReferences) > 0 || pod.Status.Phase != corev1.PodRunning || pod.Spec.NodeName == "" { + continue + } + + node := &corev1.Node{} + if err := r.Get(ctx, types.NamespacedName{Name: pod.Spec.NodeName}, node); err != nil { + log.FromContext(ctx).Error(err, "failed to get node for orphaned NVIDIA driver pod", "pod", pod.Name, "node", pod.Spec.NodeName) + continue + } + if !nodeOwnedByNVIDIADriver(node, nvidiaDrivers.Items) { + continue + } + + upgradeStateLabel := upgrade.GetUpgradeStateLabelKey() + upgradeState := upgrade.UpgradeStateUnknown + if node.Labels != nil { + upgradeState = node.Labels[upgradeStateLabel] + } + if !isDriverUpgradeRequestAllowed(upgradeState) { + continue + } + originalNode := node.DeepCopy() + if node.Labels == nil { + node.Labels = map[string]string{} + } + node.Labels[upgradeStateLabel] = upgrade.UpgradeStateUpgradeRequired + if err := r.Patch(ctx, node, client.MergeFrom(originalNode)); err != nil { + log.FromContext(ctx).Error(err, "failed to label node with orphaned NVIDIA driver pod", "pod", pod.Name, "node", node.Name) + } + } + + return nil +} + +// isDriverUpgradeRequestAllowed returns true when migration can request a +// driver upgrade without rewinding an active or failed upgrade state. +func isDriverUpgradeRequestAllowed(upgradeState string) bool { + return upgradeState == upgrade.UpgradeStateUnknown || upgradeState == upgrade.UpgradeStateDone +} + +// nodeOwnedByNVIDIADriver returns true when the node has an owner label for a live NVIDIADriver. +func nodeOwnedByNVIDIADriver(node *corev1.Node, nvidiaDrivers []nvidiav1alpha1.NVIDIADriver) bool { + if node.Labels == nil || node.Labels[consts.NVIDIADriverOwnerLabel] == "" { + return false + } + + for _, nvidiaDriver := range nvidiaDrivers { + if !nvidiaDriver.GetDeletionTimestamp().IsZero() { + continue + } + if node.Labels[consts.NVIDIADriverOwnerLabel] == nvidiaDriver.Name { + return true + } + } + return false +} diff --git a/controllers/object_controls.go b/controllers/object_controls.go index 964f3785b..299da2a45 100644 --- a/controllers/object_controls.go +++ b/controllers/object_controls.go @@ -4264,7 +4264,7 @@ func ocpHasDriverToolkitImageStream(n *ClusterPolicyController) (bool, error) { return true, nil } -func (n ClusterPolicyController) cleanupAllDriverDaemonSets(ctx context.Context) error { +func (n ClusterPolicyController) cleanupAllDriverDaemonSets(ctx context.Context, propagationPolicy metav1.DeletionPropagation) error { // Get all DaemonSets owned by ClusterPolicy // // (cdesiniotis) There is a limitation with the controller-runtime client where only a single field selector @@ -4280,8 +4280,10 @@ func (n ClusterPolicyController) cleanupAllDriverDaemonSets(ctx context.Context) ds := ds // filter out DaemonSets which are not the NVIDIA driver/vgpu-manager if strings.HasPrefix(ds.Name, commonDriverDaemonsetName) || strings.HasPrefix(ds.Name, commonVGPUManagerDaemonsetName) { - n.logger.Info("Deleting NVIDIA driver daemonset owned by ClusterPolicy", "Name", ds.Name) - err = n.client.Delete(ctx, &ds) + n.logger.Info("Deleting NVIDIA driver daemonset owned by ClusterPolicy", "Name", ds.Name, "PropagationPolicy", propagationPolicy) + err = n.client.Delete(ctx, &ds, &client.DeleteOptions{ + PropagationPolicy: &propagationPolicy, + }) if err != nil { return fmt.Errorf("error deleting NVIDIA driver daemonset: %w", err) } diff --git a/controllers/object_controls_test.go b/controllers/object_controls_test.go index 692486b37..bec477c0d 100644 --- a/controllers/object_controls_test.go +++ b/controllers/object_controls_test.go @@ -26,6 +26,7 @@ import ( "strings" "testing" + "github.com/NVIDIA/k8s-operator-libs/pkg/upgrade" secv1 "github.com/openshift/api/security/v1" promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" "github.com/stretchr/testify/require" @@ -49,6 +50,8 @@ import ( "sigs.k8s.io/controller-runtime/pkg/log/zap" gpuv1 "github.com/NVIDIA/gpu-operator/api/nvidia/v1" + nvidiav1alpha1 "github.com/NVIDIA/gpu-operator/api/nvidia/v1alpha1" + "github.com/NVIDIA/gpu-operator/internal/consts" ) const ( @@ -313,6 +316,118 @@ func TestIsDaemonSetReadyUsesSelectorLabelsForOnDeletePods(t *testing.T) { require.Equal(t, gpuv1.NotReady, isDaemonSetReady(dsName, controller)) } +func TestLabelNodesWithOrphanedDriverPodsRequestsUpgradeOnlyForOwnedAllowedStates(t *testing.T) { + const namespace = "test-namespace" + const driverName = "demo-gold" + + testScheme := runtime.NewScheme() + require.NoError(t, corev1.AddToScheme(testScheme)) + require.NoError(t, nvidiav1alpha1.AddToScheme(testScheme)) + + upgradeStateLabel := upgrade.GetUpgradeStateLabelKey() + driver := &nvidiav1alpha1.NVIDIADriver{ + ObjectMeta: metav1.ObjectMeta{Name: driverName}, + Spec: nvidiav1alpha1.NVIDIADriverSpec{ + NodeSelector: map[string]string{"gpu": "true"}, + }, + } + nodeWithoutUpgradeState := &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "node-without-upgrade-state", + Labels: map[string]string{ + "gpu": "true", + consts.NVIDIADriverOwnerLabel: driverName, + }, + }, + } + nodeWithDoneState := &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "node-with-done-state", + Labels: map[string]string{ + "gpu": "true", + consts.NVIDIADriverOwnerLabel: driverName, + upgradeStateLabel: upgrade.UpgradeStateDone, + }, + }, + } + nodeWithActiveState := &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "node-with-active-state", + Labels: map[string]string{ + "gpu": "true", + consts.NVIDIADriverOwnerLabel: driverName, + upgradeStateLabel: upgrade.UpgradeStatePodRestartRequired, + }, + }, + } + nodeWithFailedState := &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "node-with-failed-state", + Labels: map[string]string{ + "gpu": "true", + consts.NVIDIADriverOwnerLabel: driverName, + upgradeStateLabel: upgrade.UpgradeStateFailed, + }, + }, + } + unownedMatchingNode := &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "unowned-matching-node", + Labels: map[string]string{"gpu": "true"}, + }, + } + pods := []client.Object{ + orphanedDriverPod("pod-without-upgrade-state", namespace, nodeWithoutUpgradeState.Name), + orphanedDriverPod("pod-with-done-state", namespace, nodeWithDoneState.Name), + orphanedDriverPod("pod-with-active-state", namespace, nodeWithActiveState.Name), + orphanedDriverPod("pod-with-failed-state", namespace, nodeWithFailedState.Name), + orphanedDriverPod("pod-on-unowned-matching-node", namespace, unownedMatchingNode.Name), + } + + objects := []client.Object{driver, nodeWithoutUpgradeState, nodeWithDoneState, nodeWithActiveState, nodeWithFailedState, unownedMatchingNode} + objects = append(objects, pods...) + k8sClient := fake.NewClientBuilder().WithScheme(testScheme).WithObjects(objects...).Build() + reconciler := &NVIDIADriverReconciler{ + Client: k8sClient, + Namespace: namespace, + } + + require.NoError(t, reconciler.labelNodesWithOrphanedDriverPods(context.Background())) + + require.NoError(t, k8sClient.Get(context.Background(), types.NamespacedName{Name: nodeWithoutUpgradeState.Name}, nodeWithoutUpgradeState)) + require.Equal(t, upgrade.UpgradeStateUpgradeRequired, nodeWithoutUpgradeState.Labels[upgradeStateLabel]) + + require.NoError(t, k8sClient.Get(context.Background(), types.NamespacedName{Name: nodeWithDoneState.Name}, nodeWithDoneState)) + require.Equal(t, upgrade.UpgradeStateUpgradeRequired, nodeWithDoneState.Labels[upgradeStateLabel]) + + require.NoError(t, k8sClient.Get(context.Background(), types.NamespacedName{Name: nodeWithActiveState.Name}, nodeWithActiveState)) + require.Equal(t, upgrade.UpgradeStatePodRestartRequired, nodeWithActiveState.Labels[upgradeStateLabel]) + + require.NoError(t, k8sClient.Get(context.Background(), types.NamespacedName{Name: nodeWithFailedState.Name}, nodeWithFailedState)) + require.Equal(t, upgrade.UpgradeStateFailed, nodeWithFailedState.Labels[upgradeStateLabel]) + + require.NoError(t, k8sClient.Get(context.Background(), types.NamespacedName{Name: unownedMatchingNode.Name}, unownedMatchingNode)) + require.Empty(t, unownedMatchingNode.Labels[upgradeStateLabel]) +} + +func orphanedDriverPod(name, namespace, nodeName string) *corev1.Pod { + return &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: namespace, + Labels: map[string]string{ + AppComponentLabelKey: DriverAppComponentLabelValue, + }, + }, + Spec: corev1.PodSpec{ + NodeName: nodeName, + }, + Status: corev1.PodStatus{ + Phase: corev1.PodRunning, + }, + } +} + func getModuleRoot(dir string) (string, error) { if dir == "" || dir == "/" { return "", fmt.Errorf("module root not found") diff --git a/controllers/state_manager.go b/controllers/state_manager.go index 29eef5cf2..ba95cb5c9 100644 --- a/controllers/state_manager.go +++ b/controllers/state_manager.go @@ -1058,12 +1058,23 @@ func (n *ClusterPolicyController) step() (gpuv1.State, error) { // - In object_controls.go, check the OwnerRef for existing objects // before managing them. Clusterpolicy controller should not be creating / // updating / deleting objects owned by another controller. - if (n.stateNames[n.idx] == "state-driver" || n.stateNames[n.idx] == "state-vgpu-manager") && - n.singleton.Spec.Driver.UseNvidiaDriverCRDType() { + if n.stateNames[n.idx] == "state-driver" && n.singleton.Spec.Driver.IsNVIDIADriverCRDEnabled() { n.logger.Info("NVIDIADriver CRD is enabled, cleaning up all NVIDIA driver daemonsets owned by ClusterPolicy") n.idx++ - // Cleanup all driver daemonsets owned by ClusterPolicy. - err := n.cleanupAllDriverDaemonSets(n.ctx) + // Cleanup all driver daemonsets owned by ClusterPolicy while keeping the + // running driver pods available until NVIDIADriver rolls replacements. + err := n.cleanupAllDriverDaemonSets(n.ctx, metav1.DeletePropagationOrphan) + if err != nil { + return gpuv1.NotReady, fmt.Errorf("failed to cleanup all NVIDIA driver daemonsets owned by ClusterPolicy: %w", err) + } + return gpuv1.Disabled, nil + } + if n.stateNames[n.idx] == "state-vgpu-manager" && n.singleton.Spec.Driver.IsNVIDIADriverCRDEnabled() { + n.logger.Info("NVIDIADriver CRD is enabled, cleaning up all NVIDIA driver daemonsets owned by ClusterPolicy") + n.idx++ + // Cleanup all driver daemonsets owned by ClusterPolicy while keeping the + // running driver pods available until NVIDIADriver rolls replacements. + err := n.cleanupAllDriverDaemonSets(n.ctx, metav1.DeletePropagationOrphan) if err != nil { return gpuv1.NotReady, fmt.Errorf("failed to cleanup all NVIDIA driver daemonsets owned by ClusterPolicy: %w", err) } diff --git a/controllers/upgrade_controller.go b/controllers/upgrade_controller.go index e82e2de35..5dcdf4160 100644 --- a/controllers/upgrade_controller.go +++ b/controllers/upgrade_controller.go @@ -67,8 +67,8 @@ const ( UpgradeSkipDrainLabelSelector = "nvidia.com/gpu-driver-upgrade-drain.skip!=true" // AppComponentLabelKey indicates the label key of the component AppComponentLabelKey = "app.kubernetes.io/component" - // AppComponentLabelValue indicates the label values of the nvidia-gpu-driver component - AppComponentLabelValue = "nvidia-driver" + // DriverAppComponentLabelValue indicates the label value of the NVIDIA driver component + DriverAppComponentLabelValue = "nvidia-driver" ) //nolint @@ -132,10 +132,10 @@ func (r *UpgradeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct driverLabelKey := DriverLabelKey driverLabelValue := DriverLabelValue - if clusterPolicy.Spec.Driver.UseNvidiaDriverCRDType() { + if clusterPolicy.Spec.Driver.IsNVIDIADriverCRDEnabled() { // app component label is added for all new driver daemonsets deployed by NVIDIADriver controller driverLabelKey = AppComponentLabelKey - driverLabelValue = AppComponentLabelValue + driverLabelValue = DriverAppComponentLabelValue } else if clusterPolicyCtrl.openshift != "" && clusterPolicyCtrl.ocpDriverToolkit.enabled { // For OCP, when DTK is enabled app=nvidia-driver-daemonset label is not constant and changes // based on rhcos version. Hence use DTK label instead @@ -318,7 +318,7 @@ func (r *UpgradeReconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manag }) componentLabelSelector := predicate.NewTypedPredicateFuncs(func(ds *appsv1.DaemonSet) bool { - ls := metav1.LabelSelector{MatchLabels: map[string]string{AppComponentLabelKey: AppComponentLabelValue}} + ls := metav1.LabelSelector{MatchLabels: map[string]string{AppComponentLabelKey: DriverAppComponentLabelValue}} selector, _ := metav1.LabelSelectorAsSelector(&ls) return selector.Matches(labels.Set(ds.GetLabels())) }) diff --git a/deployments/gpu-operator/crds/nvidia.com_nvidiadrivers.yaml b/deployments/gpu-operator/crds/nvidia.com_nvidiadrivers.yaml index 81c234157..02ebd2b8d 100644 --- a/deployments/gpu-operator/crds/nvidia.com_nvidiadrivers.yaml +++ b/deployments/gpu-operator/crds/nvidia.com_nvidiadrivers.yaml @@ -22,6 +22,9 @@ spec: - jsonPath: .status.state name: Status type: string + - jsonPath: .spec.default + name: Default + type: boolean - jsonPath: .metadata.creationTimestamp name: Age type: string @@ -48,7 +51,10 @@ spec: metadata: type: object spec: - description: NVIDIADriverSpec defines the desired state of NVIDIADriver + description: |- + NVIDIADriverSpec defines the desired state of NVIDIADriver. + The CEL validation allows non-default drivers to use nodeSelector, but requires + default drivers to leave nodeSelector unset or empty. properties: annotations: additionalProperties: @@ -70,6 +76,12 @@ spec: name: type: string type: object + default: + default: false + description: |- + Default indicates that this NVIDIADriver acts as the fallback driver for GPU nodes + that do not match any non-default NVIDIADriver nodeSelector. + type: boolean driverType: default: gpu description: DriverType defines NVIDIA driver type @@ -968,9 +980,14 @@ spec: type: string type: object required: + - default - driverType - image type: object + x-kubernetes-validations: + - message: default NVIDIADriver must not use nodeSelector + rule: 'has(self.default) && self.default ? !has(self.nodeSelector) || + size(self.nodeSelector) == 0 : true' status: description: NVIDIADriverStatus defines the observed state of NVIDIADriver properties: diff --git a/deployments/gpu-operator/templates/clusterpolicy.yaml b/deployments/gpu-operator/templates/clusterpolicy.yaml index 121a00f15..38189b8a3 100644 --- a/deployments/gpu-operator/templates/clusterpolicy.yaml +++ b/deployments/gpu-operator/templates/clusterpolicy.yaml @@ -167,6 +167,7 @@ spec: driver: enabled: {{ .Values.driver.enabled }} useNvidiaDriverCRD: {{ .Values.driver.nvidiaDriverCRD.enabled }} + {{- if not .Values.driver.nvidiaDriverCRD.enabled }} kernelModuleType: {{ .Values.driver.kernelModuleType }} usePrecompiled: {{ .Values.driver.usePrecompiled }} {{- if .Values.driver.repository }} @@ -239,6 +240,7 @@ spec: {{- if .Values.driver.args }} args: {{ toYaml .Values.driver.args | nindent 6 }} {{- end }} + {{- end }} {{- if .Values.driver.upgradePolicy }} upgradePolicy: autoUpgrade: {{ .Values.driver.upgradePolicy.autoUpgrade | default false }} diff --git a/deployments/gpu-operator/templates/nvidiadriver.yaml b/deployments/gpu-operator/templates/nvidiadriver.yaml index 1a059a490..ecf656e84 100644 --- a/deployments/gpu-operator/templates/nvidiadriver.yaml +++ b/deployments/gpu-operator/templates/nvidiadriver.yaml @@ -5,6 +5,7 @@ kind: NVIDIADriver metadata: name: default spec: + default: true repository: {{ .Values.driver.repository }} image: {{ .Values.driver.image }} version: {{ .Values.driver.version }} @@ -17,9 +18,6 @@ spec: {{- if .Values.daemonsets.labels }} labels: {{ toYaml .Values.daemonsets.labels | nindent 6 }} {{- end }} - {{- if .Values.driver.nvidiaDriverCRD.nodeSelector }} - nodeSelector: {{ toYaml .Values.driver.nvidiaDriverCRD.nodeSelector | nindent 6 }} - {{- end }} {{- if .Values.driver.imagePullSecrets }} imagePullSecrets: {{ toYaml .Values.driver.imagePullSecrets | nindent 4 }} {{- end }} diff --git a/internal/consts/consts.go b/internal/consts/consts.go index c2850f419..f28c50794 100644 --- a/internal/consts/consts.go +++ b/internal/consts/consts.go @@ -65,6 +65,11 @@ const ( // NVIDIADriverControllerIndexKey provides quick lookups for DaemonSets owned by an NVIDIADriver instance NVIDIADriverControllerIndexKey = "metadata.nvidiadriver.controller" + // DefaultNVIDIADriverName is the Helm-managed fallback NVIDIADriver. + DefaultNVIDIADriverName = "default" + // NVIDIADriverOwnerLabel is an operator-managed node label used to route each GPU node to one NVIDIADriver. + NVIDIADriverOwnerLabel = "nvidia.com/gpu-operator.driver.owner" + // MinimumGDSVersionForOpenRM indicates the minimum GDS version that is supported only with OpenRM driver MinimumGDSVersionForOpenRM = "v2.17.5" ) diff --git a/internal/nvidiadriver/nvidiadriver.go b/internal/nvidiadriver/nvidiadriver.go new file mode 100644 index 000000000..f3f9ad3ac --- /dev/null +++ b/internal/nvidiadriver/nvidiadriver.go @@ -0,0 +1,143 @@ +/** +# Copyright (c) NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +**/ + +package nvidiadriver + +import ( + "context" + "fmt" + "strings" + + corev1 "k8s.io/api/core/v1" + "sigs.k8s.io/controller-runtime/pkg/client" + + nvidiav1alpha1 "github.com/NVIDIA/gpu-operator/api/nvidia/v1alpha1" + "github.com/NVIDIA/gpu-operator/internal/consts" +) + +// ValidateNodeSelector rejects selectors that use operator-managed routing labels +// or scope the default fallback driver. +func ValidateNodeSelector(driver *nvidiav1alpha1.NVIDIADriver) error { + if driver == nil || driver.Spec.NodeSelector == nil { + return nil + } + if driver.IsDefault() && len(driver.Spec.NodeSelector) > 0 { + return fmt.Errorf("default NVIDIADriver %q cannot use nodeSelector", driver.Name) + } + if _, ok := driver.Spec.NodeSelector[consts.NVIDIADriverOwnerLabel]; ok { + return fmt.Errorf("NVIDIADriver %q nodeSelector cannot use reserved label %q", driver.Name, consts.NVIDIADriverOwnerLabel) + } + return nil +} + +// NodeMatchesSelector returns true when all selector labels are present on the node. +func NodeMatchesSelector(node *corev1.Node, selector map[string]string) bool { + for key, value := range selector { + if node.Labels[key] != value { + return false + } + } + return true +} + +// AssignOwners labels GPU nodes with the NVIDIADriver that should manage their driver pods. +// Non-default NVIDIADrivers take precedence over the default fallback, and conflicts fail closed before +// node owner labels are changed. It returns true when any node label was changed. +func AssignOwners(ctx context.Context, c client.Client) (bool, error) { + drivers := &nvidiav1alpha1.NVIDIADriverList{} + if err := c.List(ctx, drivers); err != nil { + return false, fmt.Errorf("failed to list NVIDIADriver CRs: %w", err) + } + + var defaultDriver *nvidiav1alpha1.NVIDIADriver + defaultDriverNames := []string{} + nonDefaultDrivers := make([]nvidiav1alpha1.NVIDIADriver, 0, len(drivers.Items)) + for i := range drivers.Items { + if !drivers.Items[i].GetDeletionTimestamp().IsZero() { + continue + } + if err := ValidateNodeSelector(&drivers.Items[i]); err != nil { + return false, err + } + if drivers.Items[i].IsDefault() { + defaultDriverNames = append(defaultDriverNames, drivers.Items[i].Name) + defaultDriver = &drivers.Items[i] + continue + } + nonDefaultDrivers = append(nonDefaultDrivers, drivers.Items[i]) + } + if len(defaultDriverNames) > 1 { + return false, fmt.Errorf("multiple default NVIDIADrivers found: %s", strings.Join(defaultDriverNames, ", ")) + } + nodes := &corev1.NodeList{} + if err := c.List(ctx, nodes, client.MatchingLabels{consts.GPUPresentLabel: "true"}); err != nil { + return false, fmt.Errorf("failed to list GPU nodes: %w", err) + } + + for i := range nodes.Items { + matchingDrivers := []string{} + for _, driver := range nonDefaultDrivers { + if NodeMatchesSelector(&nodes.Items[i], driver.GetNodeSelector()) { + matchingDrivers = append(matchingDrivers, driver.Name) + } + } + if len(matchingDrivers) > 1 { + return false, fmt.Errorf("multiple NVIDIADrivers match the same node %s: %v", nodes.Items[i].Name, matchingDrivers) + } + } + + changed := false + for i := range nodes.Items { + node := &nodes.Items[i] + originalNode := node.DeepCopy() + owner := "" + for _, driver := range nonDefaultDrivers { + if NodeMatchesSelector(node, driver.GetNodeSelector()) { + owner = driver.Name + } + } + if owner == "" && defaultDriver != nil && NodeMatchesSelector(node, defaultDriver.GetNodeSelector()) { + owner = defaultDriver.Name + } + if owner == "" { + if node.Labels == nil { + continue + } + if _, ok := node.Labels[consts.NVIDIADriverOwnerLabel]; !ok { + continue + } + delete(node.Labels, consts.NVIDIADriverOwnerLabel) + if err := c.Patch(ctx, node, client.MergeFrom(originalNode)); err != nil { + return false, fmt.Errorf("failed to remove NVIDIADriver owner label for node %s: %w", node.Name, err) + } + changed = true + continue + } + if node.Labels != nil && node.Labels[consts.NVIDIADriverOwnerLabel] == owner { + continue + } + if node.Labels == nil { + node.Labels = map[string]string{} + } + node.Labels[consts.NVIDIADriverOwnerLabel] = owner + if err := c.Patch(ctx, node, client.MergeFrom(originalNode)); err != nil { + return false, fmt.Errorf("failed to update NVIDIADriver owner label for node %s: %w", node.Name, err) + } + changed = true + } + + return changed, nil +} diff --git a/internal/nvidiadriver/nvidiadriver_test.go b/internal/nvidiadriver/nvidiadriver_test.go new file mode 100644 index 000000000..d7cbbdf37 --- /dev/null +++ b/internal/nvidiadriver/nvidiadriver_test.go @@ -0,0 +1,393 @@ +/** +# Copyright (c) NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +**/ + +package nvidiadriver + +import ( + "context" + "testing" + + "github.com/stretchr/testify/require" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/fake" + + nvidiav1alpha1 "github.com/NVIDIA/gpu-operator/api/nvidia/v1alpha1" + "github.com/NVIDIA/gpu-operator/internal/consts" +) + +func TestAssignNVIDIADriverOwnersGivesSpecificDriversPrecedence(t *testing.T) { + scheme := runtime.NewScheme() + require.NoError(t, nvidiav1alpha1.AddToScheme(scheme)) + require.NoError(t, corev1.AddToScheme(scheme)) + + defaultDriver := &nvidiav1alpha1.NVIDIADriver{ + ObjectMeta: metav1.ObjectMeta{Name: consts.DefaultNVIDIADriverName}, + Spec: nvidiav1alpha1.NVIDIADriverSpec{Default: true}, + } + specificDriver := &nvidiav1alpha1.NVIDIADriver{ + ObjectMeta: metav1.ObjectMeta{Name: "h100-driver"}, + Spec: nvidiav1alpha1.NVIDIADriverSpec{ + NodeSelector: map[string]string{"nodepool": "h100"}, + }, + } + defaultNode := &corev1.Node{ObjectMeta: metav1.ObjectMeta{ + Name: "default-node", + Labels: map[string]string{consts.GPUPresentLabel: "true"}, + }} + specificNode := &corev1.Node{ObjectMeta: metav1.ObjectMeta{ + Name: "specific-node", + Labels: map[string]string{consts.GPUPresentLabel: "true", "nodepool": "h100"}, + }} + + k8sClient := fake.NewClientBuilder().WithScheme(scheme).WithObjects(defaultDriver, specificDriver, defaultNode, specificNode).Build() + + changed, err := AssignOwners(context.Background(), k8sClient) + require.NoError(t, err) + require.True(t, changed) + + require.NoError(t, k8sClient.Get(context.Background(), client.ObjectKey{Name: "default-node"}, defaultNode)) + require.NoError(t, k8sClient.Get(context.Background(), client.ObjectKey{Name: "specific-node"}, specificNode)) + require.Equal(t, consts.DefaultNVIDIADriverName, defaultNode.Labels[consts.NVIDIADriverOwnerLabel]) + require.Equal(t, "h100-driver", specificNode.Labels[consts.NVIDIADriverOwnerLabel]) +} + +func TestAssignNVIDIADriverOwnersAllowsMissingDefaultDriver(t *testing.T) { + scheme := runtime.NewScheme() + require.NoError(t, nvidiav1alpha1.AddToScheme(scheme)) + require.NoError(t, corev1.AddToScheme(scheme)) + + specificDriver := &nvidiav1alpha1.NVIDIADriver{ + ObjectMeta: metav1.ObjectMeta{Name: "h100-driver"}, + Spec: nvidiav1alpha1.NVIDIADriverSpec{ + NodeSelector: map[string]string{"nodepool": "h100"}, + }, + } + unmatchedNode := &corev1.Node{ObjectMeta: metav1.ObjectMeta{ + Name: "unmatched-node", + Labels: map[string]string{consts.GPUPresentLabel: "true", consts.NVIDIADriverOwnerLabel: consts.DefaultNVIDIADriverName}, + }} + specificNode := &corev1.Node{ObjectMeta: metav1.ObjectMeta{ + Name: "specific-node", + Labels: map[string]string{consts.GPUPresentLabel: "true", "nodepool": "h100"}, + }} + + k8sClient := fake.NewClientBuilder().WithScheme(scheme).WithObjects(specificDriver, unmatchedNode, specificNode).Build() + + changed, err := AssignOwners(context.Background(), k8sClient) + require.NoError(t, err) + require.True(t, changed) + + require.NoError(t, k8sClient.Get(context.Background(), client.ObjectKey{Name: "unmatched-node"}, unmatchedNode)) + require.NoError(t, k8sClient.Get(context.Background(), client.ObjectKey{Name: "specific-node"}, specificNode)) + require.NotContains(t, unmatchedNode.Labels, consts.NVIDIADriverOwnerLabel) + require.Equal(t, "h100-driver", specificNode.Labels[consts.NVIDIADriverOwnerLabel]) +} + +func TestAssignNVIDIADriverOwnersIgnoresDeletingDrivers(t *testing.T) { + scheme := runtime.NewScheme() + require.NoError(t, nvidiav1alpha1.AddToScheme(scheme)) + require.NoError(t, corev1.AddToScheme(scheme)) + + deleteTime := metav1.Now() + deletingDriver := &nvidiav1alpha1.NVIDIADriver{ + ObjectMeta: metav1.ObjectMeta{ + Name: "demo-gold", + DeletionTimestamp: &deleteTime, + Finalizers: []string{"test-finalizer"}, + }, + Spec: nvidiav1alpha1.NVIDIADriverSpec{ + NodeSelector: map[string]string{"nodepool": "gold"}, + }, + } + node := &corev1.Node{ObjectMeta: metav1.ObjectMeta{ + Name: "gpu-node", + Labels: map[string]string{ + consts.GPUPresentLabel: "true", + consts.NVIDIADriverOwnerLabel: "demo-gold", + "nodepool": "gold", + }, + }} + + k8sClient := fake.NewClientBuilder().WithScheme(scheme).WithObjects(deletingDriver, node).Build() + + changed, err := AssignOwners(context.Background(), k8sClient) + require.NoError(t, err) + require.True(t, changed) + + require.NoError(t, k8sClient.Get(context.Background(), client.ObjectKey{Name: "gpu-node"}, node)) + require.NotContains(t, node.Labels, consts.NVIDIADriverOwnerLabel) +} + +func TestAssignNVIDIADriverOwnersUsesDefaultDriverWithArbitraryName(t *testing.T) { + scheme := runtime.NewScheme() + require.NoError(t, nvidiav1alpha1.AddToScheme(scheme)) + require.NoError(t, corev1.AddToScheme(scheme)) + + defaultDriver := &nvidiav1alpha1.NVIDIADriver{ + ObjectMeta: metav1.ObjectMeta{Name: "fallback-driver"}, + Spec: nvidiav1alpha1.NVIDIADriverSpec{Default: true}, + } + node := &corev1.Node{ObjectMeta: metav1.ObjectMeta{ + Name: "gpu-node", + Labels: map[string]string{consts.GPUPresentLabel: "true"}, + }} + + k8sClient := fake.NewClientBuilder().WithScheme(scheme).WithObjects(defaultDriver, node).Build() + + changed, err := AssignOwners(context.Background(), k8sClient) + require.NoError(t, err) + require.True(t, changed) + + require.NoError(t, k8sClient.Get(context.Background(), client.ObjectKey{Name: "gpu-node"}, node)) + require.Equal(t, "fallback-driver", node.Labels[consts.NVIDIADriverOwnerLabel]) +} + +func TestAssignNVIDIADriverOwnersReturnsFalseWhenOwnersAreCurrent(t *testing.T) { + scheme := runtime.NewScheme() + require.NoError(t, nvidiav1alpha1.AddToScheme(scheme)) + require.NoError(t, corev1.AddToScheme(scheme)) + + defaultDriver := &nvidiav1alpha1.NVIDIADriver{ + ObjectMeta: metav1.ObjectMeta{Name: consts.DefaultNVIDIADriverName}, + Spec: nvidiav1alpha1.NVIDIADriverSpec{Default: true}, + } + specificDriver := &nvidiav1alpha1.NVIDIADriver{ + ObjectMeta: metav1.ObjectMeta{Name: "h100-driver"}, + Spec: nvidiav1alpha1.NVIDIADriverSpec{ + NodeSelector: map[string]string{"nodepool": "h100"}, + }, + } + defaultNode := &corev1.Node{ObjectMeta: metav1.ObjectMeta{ + Name: "default-node", + Labels: map[string]string{ + consts.GPUPresentLabel: "true", + consts.NVIDIADriverOwnerLabel: consts.DefaultNVIDIADriverName, + }, + }} + specificNode := &corev1.Node{ObjectMeta: metav1.ObjectMeta{ + Name: "specific-node", + Labels: map[string]string{ + consts.GPUPresentLabel: "true", + consts.NVIDIADriverOwnerLabel: "h100-driver", + "nodepool": "h100", + }, + }} + + k8sClient := fake.NewClientBuilder().WithScheme(scheme).WithObjects(defaultDriver, specificDriver, defaultNode, specificNode).Build() + + changed, err := AssignOwners(context.Background(), k8sClient) + require.NoError(t, err) + require.False(t, changed) +} + +func TestAssignNVIDIADriverOwnersErrorsOnMultipleDefaultDrivers(t *testing.T) { + scheme := runtime.NewScheme() + require.NoError(t, nvidiav1alpha1.AddToScheme(scheme)) + require.NoError(t, corev1.AddToScheme(scheme)) + + defaultDriverA := &nvidiav1alpha1.NVIDIADriver{ + ObjectMeta: metav1.ObjectMeta{Name: "fallback-a"}, + Spec: nvidiav1alpha1.NVIDIADriverSpec{Default: true}, + } + defaultDriverB := &nvidiav1alpha1.NVIDIADriver{ + ObjectMeta: metav1.ObjectMeta{Name: "fallback-b"}, + Spec: nvidiav1alpha1.NVIDIADriverSpec{Default: true}, + } + node := &corev1.Node{ObjectMeta: metav1.ObjectMeta{ + Name: "gpu-node", + Labels: map[string]string{consts.GPUPresentLabel: "true"}, + }} + + k8sClient := fake.NewClientBuilder().WithScheme(scheme).WithObjects(defaultDriverA, defaultDriverB, node).Build() + + changed, err := AssignOwners(context.Background(), k8sClient) + require.Error(t, err) + require.False(t, changed) + require.Contains(t, err.Error(), "multiple default NVIDIADrivers found") + + require.NoError(t, k8sClient.Get(context.Background(), client.ObjectKey{Name: "gpu-node"}, node)) + require.NotContains(t, node.Labels, consts.NVIDIADriverOwnerLabel) +} + +func TestAssignNVIDIADriverOwnersRejectsReservedOwnerLabelSelector(t *testing.T) { + scheme := runtime.NewScheme() + require.NoError(t, nvidiav1alpha1.AddToScheme(scheme)) + require.NoError(t, corev1.AddToScheme(scheme)) + + driver := &nvidiav1alpha1.NVIDIADriver{ + ObjectMeta: metav1.ObjectMeta{Name: "bad-driver"}, + Spec: nvidiav1alpha1.NVIDIADriverSpec{ + NodeSelector: map[string]string{consts.NVIDIADriverOwnerLabel: "other-driver"}, + }, + } + node := &corev1.Node{ObjectMeta: metav1.ObjectMeta{ + Name: "gpu-node", + Labels: map[string]string{ + consts.GPUPresentLabel: "true", + consts.NVIDIADriverOwnerLabel: "existing-driver", + }, + }} + + k8sClient := fake.NewClientBuilder().WithScheme(scheme).WithObjects(driver, node).Build() + + changed, err := AssignOwners(context.Background(), k8sClient) + require.Error(t, err) + require.False(t, changed) + require.Contains(t, err.Error(), "reserved label") + require.Contains(t, err.Error(), consts.NVIDIADriverOwnerLabel) + + require.NoError(t, k8sClient.Get(context.Background(), client.ObjectKey{Name: "gpu-node"}, node)) + require.Equal(t, "existing-driver", node.Labels[consts.NVIDIADriverOwnerLabel]) +} + +func TestAssignNVIDIADriverOwnersRejectsDefaultDriverNodeSelector(t *testing.T) { + scheme := runtime.NewScheme() + require.NoError(t, nvidiav1alpha1.AddToScheme(scheme)) + require.NoError(t, corev1.AddToScheme(scheme)) + + defaultDriver := &nvidiav1alpha1.NVIDIADriver{ + ObjectMeta: metav1.ObjectMeta{Name: consts.DefaultNVIDIADriverName}, + Spec: nvidiav1alpha1.NVIDIADriverSpec{ + Default: true, + NodeSelector: map[string]string{"driver-default": "true"}, + }, + } + specificDriver := &nvidiav1alpha1.NVIDIADriver{ + ObjectMeta: metav1.ObjectMeta{Name: "h100-driver"}, + Spec: nvidiav1alpha1.NVIDIADriverSpec{ + NodeSelector: map[string]string{"nodepool": "h100"}, + }, + } + defaultNode := &corev1.Node{ObjectMeta: metav1.ObjectMeta{ + Name: "default-node", + Labels: map[string]string{consts.GPUPresentLabel: "true", "driver-default": "true"}, + }} + unmatchedNode := &corev1.Node{ObjectMeta: metav1.ObjectMeta{ + Name: "unmatched-node", + Labels: map[string]string{consts.GPUPresentLabel: "true", consts.NVIDIADriverOwnerLabel: consts.DefaultNVIDIADriverName}, + }} + specificNode := &corev1.Node{ObjectMeta: metav1.ObjectMeta{ + Name: "specific-node", + Labels: map[string]string{consts.GPUPresentLabel: "true", "driver-default": "true", "nodepool": "h100"}, + }} + + k8sClient := fake.NewClientBuilder().WithScheme(scheme).WithObjects(defaultDriver, specificDriver, defaultNode, unmatchedNode, specificNode).Build() + + changed, err := AssignOwners(context.Background(), k8sClient) + require.Error(t, err) + require.False(t, changed) + require.Contains(t, err.Error(), "default NVIDIADriver") + require.Contains(t, err.Error(), "cannot use nodeSelector") + + require.NoError(t, k8sClient.Get(context.Background(), client.ObjectKey{Name: "default-node"}, defaultNode)) + require.NoError(t, k8sClient.Get(context.Background(), client.ObjectKey{Name: "unmatched-node"}, unmatchedNode)) + require.NoError(t, k8sClient.Get(context.Background(), client.ObjectKey{Name: "specific-node"}, specificNode)) + require.NotContains(t, defaultNode.Labels, consts.NVIDIADriverOwnerLabel) + require.Equal(t, consts.DefaultNVIDIADriverName, unmatchedNode.Labels[consts.NVIDIADriverOwnerLabel]) + require.NotContains(t, specificNode.Labels, consts.NVIDIADriverOwnerLabel) +} + +func TestAssignNVIDIADriverOwnersDoesNotFallbackToDefaultOnUserDriverConflict(t *testing.T) { + scheme := runtime.NewScheme() + require.NoError(t, nvidiav1alpha1.AddToScheme(scheme)) + require.NoError(t, corev1.AddToScheme(scheme)) + + defaultDriver := &nvidiav1alpha1.NVIDIADriver{ + ObjectMeta: metav1.ObjectMeta{Name: consts.DefaultNVIDIADriverName}, + Spec: nvidiav1alpha1.NVIDIADriverSpec{Default: true}, + } + driverA := &nvidiav1alpha1.NVIDIADriver{ + ObjectMeta: metav1.ObjectMeta{Name: "driver-a"}, + Spec: nvidiav1alpha1.NVIDIADriverSpec{ + NodeSelector: map[string]string{"nodepool": "shared"}, + }, + } + driverB := &nvidiav1alpha1.NVIDIADriver{ + ObjectMeta: metav1.ObjectMeta{Name: "driver-b"}, + Spec: nvidiav1alpha1.NVIDIADriverSpec{ + NodeSelector: map[string]string{"nodepool": "shared"}, + }, + } + conflictedNode := &corev1.Node{ObjectMeta: metav1.ObjectMeta{ + Name: "conflicted-node", + Labels: map[string]string{ + consts.GPUPresentLabel: "true", + consts.NVIDIADriverOwnerLabel: "driver-a", + "nodepool": "shared", + }, + }} + + k8sClient := fake.NewClientBuilder().WithScheme(scheme).WithObjects(defaultDriver, driverA, driverB, conflictedNode).Build() + + changed, err := AssignOwners(context.Background(), k8sClient) + require.Error(t, err) + require.False(t, changed) + require.Contains(t, err.Error(), "multiple NVIDIADrivers match the same node") + + require.NoError(t, k8sClient.Get(context.Background(), client.ObjectKey{Name: "conflicted-node"}, conflictedNode)) + require.Equal(t, "driver-a", conflictedNode.Labels[consts.NVIDIADriverOwnerLabel]) +} + +func TestAssignNVIDIADriverOwnersDoesNotChangeOwnersWhenAnyUserDriverConflicts(t *testing.T) { + scheme := runtime.NewScheme() + require.NoError(t, nvidiav1alpha1.AddToScheme(scheme)) + require.NoError(t, corev1.AddToScheme(scheme)) + + defaultDriver := &nvidiav1alpha1.NVIDIADriver{ + ObjectMeta: metav1.ObjectMeta{Name: consts.DefaultNVIDIADriverName}, + Spec: nvidiav1alpha1.NVIDIADriverSpec{Default: true}, + } + goldDriver := &nvidiav1alpha1.NVIDIADriver{ + ObjectMeta: metav1.ObjectMeta{Name: "demo-gold"}, + Spec: nvidiav1alpha1.NVIDIADriverSpec{ + NodeSelector: map[string]string{"region": "us-east-1"}, + }, + } + silverDriver := &nvidiav1alpha1.NVIDIADriver{ + ObjectMeta: metav1.ObjectMeta{Name: "demo-silver"}, + } + goldNode := &corev1.Node{ObjectMeta: metav1.ObjectMeta{ + Name: "gold-node", + Labels: map[string]string{ + consts.GPUPresentLabel: "true", + consts.NVIDIADriverOwnerLabel: "demo-gold", + "region": "us-east-1", + }, + }} + defaultNode := &corev1.Node{ObjectMeta: metav1.ObjectMeta{ + Name: "default-node", + Labels: map[string]string{ + consts.GPUPresentLabel: "true", + consts.NVIDIADriverOwnerLabel: consts.DefaultNVIDIADriverName, + "region": "us-east-2", + }, + }} + + k8sClient := fake.NewClientBuilder().WithScheme(scheme).WithObjects(defaultDriver, goldDriver, silverDriver, goldNode, defaultNode).Build() + + changed, err := AssignOwners(context.Background(), k8sClient) + require.Error(t, err) + require.False(t, changed) + require.Contains(t, err.Error(), "multiple NVIDIADrivers match the same node") + + require.NoError(t, k8sClient.Get(context.Background(), client.ObjectKey{Name: "gold-node"}, goldNode)) + require.NoError(t, k8sClient.Get(context.Background(), client.ObjectKey{Name: "default-node"}, defaultNode)) + require.Equal(t, "demo-gold", goldNode.Labels[consts.NVIDIADriverOwnerLabel]) + require.Equal(t, consts.DefaultNVIDIADriverName, defaultNode.Labels[consts.NVIDIADriverOwnerLabel]) +} diff --git a/internal/state/driver.go b/internal/state/driver.go index 355910a8e..0488b0efb 100644 --- a/internal/state/driver.go +++ b/internal/state/driver.go @@ -56,8 +56,8 @@ const ( // AppComponentLabelKey indicates the label key of the component AppComponentLabelKey = "app.kubernetes.io/component" - // AppComponentLabelValue indicates the label values of the nvidia-gpu-driver component - AppComponentLabelValue = "nvidia-driver" + // DriverAppComponentLabelValue indicates the label value of the NVIDIA driver component + DriverAppComponentLabelValue = "nvidia-driver" ) type stateDriver struct { @@ -169,7 +169,7 @@ func (s *stateDriver) Sync(ctx context.Context, customResource interface{}, info func (s *stateDriver) GetWatchSources(mgr ctrlManager) map[string]SyncingSource { wr := make(map[string]SyncingSource) nvDriverPredicate := predicate.NewTypedPredicateFuncs(func(ds *appsv1.DaemonSet) bool { - ls := metav1.LabelSelector{MatchLabels: map[string]string{AppComponentLabelKey: AppComponentLabelValue}} + ls := metav1.LabelSelector{MatchLabels: map[string]string{AppComponentLabelKey: DriverAppComponentLabelValue}} selector, _ := metav1.LabelSelectorAsSelector(&ls) return selector.Matches(labels.Set(ds.GetLabels())) }) @@ -270,7 +270,7 @@ func (s *stateDriver) getManifestObjects(ctx context.Context, cr *nvidiav1alpha1 } isOpenshift := runtimeSpec.OpenshiftVersion != "" - nodePools, err := getNodePools(ctx, s.client, cr.Spec.NodeSelector, cr.Spec.UsePrecompiledDrivers(), isOpenshift) + nodePools, err := getNodePools(ctx, s.client, cr, isOpenshift) if err != nil { return nil, fmt.Errorf("failed to get node pools: %w", err) } diff --git a/internal/state/driver_test.go b/internal/state/driver_test.go index 02b2736f2..07690c04d 100644 --- a/internal/state/driver_test.go +++ b/internal/state/driver_test.go @@ -603,6 +603,33 @@ func TestDriverOpenshiftDriverToolkit(t *testing.T) { require.Equal(t, string(o), actual) } +func TestGetNodePoolsDoesNotAllowSelectorToOverrideOwnerLabel(t *testing.T) { + require.NoError(t, corev1.AddToScheme(scheme.Scheme)) + + node := &corev1.Node{ObjectMeta: metav1.ObjectMeta{ + Name: "gpu-node", + Labels: map[string]string{ + consts.GPUPresentLabel: "true", + consts.NVIDIADriverOwnerLabel: "driver-a", + nfdOSReleaseIDLabelKey: "ubuntu", + nfdOSVersionIDLabelKey: "22.04", + }, + }} + k8sClient := fake.NewClientBuilder().WithScheme(scheme.Scheme).WithObjects(node).Build() + driver := &nvidiav1alpha1.NVIDIADriver{ + ObjectMeta: metav1.ObjectMeta{Name: "driver-a"}, + Spec: nvidiav1alpha1.NVIDIADriverSpec{ + NodeSelector: map[string]string{consts.NVIDIADriverOwnerLabel: "driver-b"}, + }, + } + + nodePools, err := getNodePools(context.Background(), k8sClient, driver, false) + + require.NoError(t, err) + require.Len(t, nodePools, 1) + require.Equal(t, "driver-a", nodePools[0].nodeSelector[consts.NVIDIADriverOwnerLabel]) +} + func TestDriverPrecompiled(t *testing.T) { const ( testName = "driver-precompiled" diff --git a/internal/state/nodepool.go b/internal/state/nodepool.go index c9781cce0..5f0dd5309 100644 --- a/internal/state/nodepool.go +++ b/internal/state/nodepool.go @@ -26,6 +26,9 @@ import ( corev1 "k8s.io/api/core/v1" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/log" + + nvidiav1alpha1 "github.com/NVIDIA/gpu-operator/api/nvidia/v1alpha1" + "github.com/NVIDIA/gpu-operator/internal/consts" ) const ( @@ -55,16 +58,15 @@ type nodePool struct { // // Each nodePool object contains information needed to identify the corresonding node pool. // Most importantly, it contains a nodeSelector used to identify the node pool. -func getNodePools(ctx context.Context, k8sClient client.Client, selector map[string]string, precompiled bool, openshift bool) ([]nodePool, error) { +func getNodePools(ctx context.Context, k8sClient client.Client, cr *nvidiav1alpha1.NVIDIADriver, openshift bool) ([]nodePool, error) { nodePoolMap := make(map[string]nodePool) logger := log.FromContext(ctx) - nodeSelector := map[string]string{ - "nvidia.com/gpu.present": "true", - } - - maps.Copy(nodeSelector, selector) + nodeSelector := map[string]string{} + maps.Copy(nodeSelector, cr.Spec.NodeSelector) + nodeSelector[consts.GPUPresentLabel] = "true" + nodeSelector[consts.NVIDIADriverOwnerLabel] = cr.Name nodeList := &corev1.NodeList{} err := k8sClient.List(ctx, nodeList, client.MatchingLabels(nodeSelector)) @@ -104,7 +106,7 @@ func getNodePools(ctx context.Context, k8sClient client.Client, selector map[str nodePool.osTag = osTag nodePool.name = osTag - if precompiled { + if cr.Spec.UsePrecompiledDrivers() { kernelVersion, ok := nodeLabels[nfdKernelLabelKey] if !ok { logger.Info("WARNING: Could not find NFD labels for node. Is NFD installed?", "Node", node.Name) @@ -115,7 +117,7 @@ func getNodePools(ctx context.Context, k8sClient client.Client, selector map[str nodePool.name = fmt.Sprintf("%s-%s", nodePool.name, getSanitizedKernelVersion(kernelVersion)) } - if !precompiled && openshift { + if !cr.Spec.UsePrecompiledDrivers() && openshift { rhcosVersion, ok := nodeLabels[nfdOSTreeVersionLabelKey] if !ok { logger.Info("WARNING: Could not find NFD labels for node. Is NFD installed?", "Node", node.Name) diff --git a/internal/validator/validator.go b/internal/validator/validator.go index e43c8127c..25ee5d922 100644 --- a/internal/validator/validator.go +++ b/internal/validator/validator.go @@ -19,12 +19,14 @@ package validator import ( "context" "fmt" + "sort" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/labels" "sigs.k8s.io/controller-runtime/pkg/client" nvidiav1alpha1 "github.com/NVIDIA/gpu-operator/api/nvidia/v1alpha1" + nvidiadriverutil "github.com/NVIDIA/gpu-operator/internal/nvidiadriver" ) // Validator provides interface to validate NVIDIADriver fields @@ -45,25 +47,37 @@ func NewNodeSelectorValidator(c client.Client) Validator { // Check returns error when nodes matching with the selector labels of current instance of NVIDIADriver // are conflicting with other instances of NVIDIADriver func (nsv *nodeSelectorValidator) Validate(ctx context.Context, cr *nvidiav1alpha1.NVIDIADriver) error { + if err := nvidiadriverutil.ValidateNodeSelector(cr); err != nil { + return err + } + drivers := &nvidiav1alpha1.NVIDIADriverList{} err := nsv.client.List(ctx, drivers) if err != nil { return err } - names := map[string]struct{}{} + selectedNodeOwners := map[string][]string{} for di := range drivers.Items { + if err := nvidiadriverutil.ValidateNodeSelector(&drivers.Items[di]); err != nil { + return err + } + if drivers.Items[di].IsDefault() { + continue + } + driverName := drivers.Items[di].Name nodeList, err := nsv.getNVIDIADriverSelectedNodes(ctx, &drivers.Items[di]) if err != nil { return err } for ni := range nodeList.Items { - if _, ok := names[nodeList.Items[ni].Name]; ok { - return fmt.Errorf("conflicting NVIDIADriver NodeSelectors found for resource: %s, nodeSelector: %q", cr.Name, cr.Spec.NodeSelector) + nodeName := nodeList.Items[ni].Name + selectedNodeOwners[nodeName] = append(selectedNodeOwners[nodeName], driverName) + if len(selectedNodeOwners[nodeName]) > 1 { + sort.Strings(selectedNodeOwners[nodeName]) + return fmt.Errorf("multiple NVIDIADrivers match the same node %s: %v", nodeName, selectedNodeOwners[nodeName]) } - - names[nodeList.Items[ni].Name] = struct{}{} } } diff --git a/internal/validator/validator_test.go b/internal/validator/validator_test.go index 13fc899e4..75b737e2c 100644 --- a/internal/validator/validator_test.go +++ b/internal/validator/validator_test.go @@ -18,7 +18,6 @@ package validator import ( "context" - "reflect" "testing" "github.com/stretchr/testify/assert" @@ -29,6 +28,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client/fake" nvidiav1alpha1 "github.com/NVIDIA/gpu-operator/api/nvidia/v1alpha1" + "github.com/NVIDIA/gpu-operator/internal/consts" ) const ( @@ -36,68 +36,42 @@ const ( testNodeName = "my-test-node" ) -type driverOptions func(*nvidiav1alpha1.NVIDIADriver) - -func makeTestDriver(opts ...driverOptions) *nvidiav1alpha1.NVIDIADriver { - c := &nvidiav1alpha1.NVIDIADriver{ +func makeTestDriver(name string, labels map[string]string, isDefault bool) *nvidiav1alpha1.NVIDIADriver { + if name == "" { + name = testDriverName + } + driver := &nvidiav1alpha1.NVIDIADriver{ + TypeMeta: metav1.TypeMeta{ + Kind: "NVIDIADriver", + APIVersion: nvidiav1alpha1.SchemeGroupVersion.String(), + }, ObjectMeta: metav1.ObjectMeta{ - Name: testDriverName, + Name: name, }, Spec: nvidiav1alpha1.NVIDIADriverSpec{ - Image: "", - Version: "", + Image: "", + Version: "", + Default: isDefault, + NodeSelector: labels, }, } - - c.Kind = reflect.TypeOf(nvidiav1alpha1.NVIDIADriver{}).Name() - - gvk := nvidiav1alpha1.SchemeGroupVersion.WithKind(c.Kind) - - c.APIVersion = gvk.GroupVersion().String() - - for _, o := range opts { - o(c) - } - return c -} - -func named(name string) driverOptions { - return func(c *nvidiav1alpha1.NVIDIADriver) { - c.Name = name - } -} - -func nodeSelector(labels map[string]string) driverOptions { - return func(c *nvidiav1alpha1.NVIDIADriver) { - c.Spec.NodeSelector = labels - } + return driver } -func labelled(labels map[string]string) nodeOptions { - return func(n *corev1.Node) { - n.Labels = labels - } -} - -type nodeOptions func(*corev1.Node) - -func makeTestNode(opts ...nodeOptions) *corev1.Node { - n := &corev1.Node{ +func makeTestNode(labels map[string]string) *corev1.Node { + return &corev1.Node{ ObjectMeta: metav1.ObjectMeta{ - Name: testNodeName, + Name: testNodeName, + Labels: labels, }, } - for _, o := range opts { - o(n) - } - return n } func TestCheckNodeSelector(t *testing.T) { - node := makeTestNode(labelled(map[string]string{"os-version": "ubuntu20.04"})) - driver := makeTestDriver(nodeSelector(node.Labels)) - conflictingDriver := makeTestDriver(named("conflictingDriver"), nodeSelector(node.Labels)) - nonconflictingDriver := makeTestDriver(named("nonconflictingDriver")) + node := makeTestNode(map[string]string{"os-version": "ubuntu20.04"}) + driver := makeTestDriver("", node.Labels, false) + conflictingDriver := makeTestDriver("conflictingDriver", node.Labels, false) + nonconflictingDriver := makeTestDriver("nonconflictingDriver", nil, false) tests := []struct { node *corev1.Node @@ -123,8 +97,96 @@ func TestCheckNodeSelector(t *testing.T) { err = nsv.Validate(context.Background(), tc.requestedDriver) if tc.shouldError { assert.Error(t, err) + assert.Contains(t, err.Error(), "multiple NVIDIADrivers match the same node my-test-node") + assert.Contains(t, err.Error(), "conflictingDriver") + assert.Contains(t, err.Error(), testDriverName) } else { assert.NoError(t, err) } } } + +func TestCheckNodeSelectorIgnoresDefaultDriver(t *testing.T) { + node := makeTestNode(map[string]string{ + "nvidia.com/gpu.present": "true", + "nodepool": "a", + }) + defaultDriver := makeTestDriver("", nil, true) + requestedDriver := makeTestDriver("specificDriver", map[string]string{"nodepool": "a"}, false) + + s := scheme.Scheme + err := nvidiav1alpha1.AddToScheme(s) + require.NoError(t, err) + c := fake. + NewClientBuilder(). + WithScheme(s). + WithObjects(node, defaultDriver, requestedDriver). + Build() + nsv := NewNodeSelectorValidator(c) + + err = nsv.Validate(context.Background(), requestedDriver) + assert.NoError(t, err) +} + +func TestCheckNodeSelectorRejectsReservedOwnerLabel(t *testing.T) { + driver := makeTestDriver("", map[string]string{consts.NVIDIADriverOwnerLabel: "other-driver"}, false) + + s := scheme.Scheme + err := nvidiav1alpha1.AddToScheme(s) + require.NoError(t, err) + c := fake. + NewClientBuilder(). + WithScheme(s). + WithObjects(driver). + Build() + nsv := NewNodeSelectorValidator(c) + + err = nsv.Validate(context.Background(), driver) + assert.Error(t, err) + assert.Contains(t, err.Error(), "reserved label") + assert.Contains(t, err.Error(), consts.NVIDIADriverOwnerLabel) +} + +func TestCheckNodeSelectorRejectsDefaultDriverNodeSelector(t *testing.T) { + driver := makeTestDriver("", map[string]string{"nodepool": "default"}, true) + + s := scheme.Scheme + err := nvidiav1alpha1.AddToScheme(s) + require.NoError(t, err) + c := fake. + NewClientBuilder(). + WithScheme(s). + WithObjects(driver). + Build() + nsv := NewNodeSelectorValidator(c) + + err = nsv.Validate(context.Background(), driver) + assert.Error(t, err) + assert.Contains(t, err.Error(), "default NVIDIADriver") + assert.Contains(t, err.Error(), "cannot use nodeSelector") +} + +func TestCheckNodeSelectorDoesNotIgnoreDefaultNameWithoutDefaultField(t *testing.T) { + node := makeTestNode(map[string]string{ + "nvidia.com/gpu.present": "true", + "nodepool": "a", + }) + nonDefaultNameDriver := makeTestDriver(consts.DefaultNVIDIADriverName, map[string]string{"nodepool": "a"}, false) + requestedDriver := makeTestDriver("specificDriver", map[string]string{"nodepool": "a"}, false) + + s := scheme.Scheme + err := nvidiav1alpha1.AddToScheme(s) + require.NoError(t, err) + c := fake. + NewClientBuilder(). + WithScheme(s). + WithObjects(node, nonDefaultNameDriver, requestedDriver). + Build() + nsv := NewNodeSelectorValidator(c) + + err = nsv.Validate(context.Background(), requestedDriver) + assert.Error(t, err) + assert.Contains(t, err.Error(), "multiple NVIDIADrivers match the same node my-test-node") + assert.Contains(t, err.Error(), consts.DefaultNVIDIADriverName) + assert.Contains(t, err.Error(), "specificDriver") +} diff --git a/tests/scripts/checks.sh b/tests/scripts/checks.sh index c0b2a0b7b..5056d48b2 100755 --- a/tests/scripts/checks.sh +++ b/tests/scripts/checks.sh @@ -200,6 +200,46 @@ check_no_driver_pod_restarts() { return 0 } +print_driver_upgrade_debug() { + echo "current state of driver upgrade" + kubectl get node -l nvidia.com/gpu.present \ + -o custom-columns=NODE:.metadata.name,OWNER:.metadata.labels.nvidia\\.com/gpu-operator\\.driver\\.owner,UPGRADE_STATE:.metadata.labels.nvidia\\.com/gpu-driver-upgrade-state --no-headers + + echo "" + echo "driver pods" + kubectl get pods -l "app.kubernetes.io/component=nvidia-driver" -n ${TEST_NAMESPACE} -o wide || true + + echo "" + echo "gpu operator operands" + kubectl get pods -n ${TEST_NAMESPACE} -o wide || true + + echo "" + echo "driver daemonsets" + kubectl get daemonsets -l "app.kubernetes.io/component=nvidia-driver" -n ${TEST_NAMESPACE} -o wide || true + + echo "" + echo "NVIDIADriver status" + local nvidiadriver_status + if nvidiadriver_status=$(kubectl get nvidiadriver -o json 2>/dev/null); then + echo "${nvidiadriver_status}" | jq -r ' + (["NAME", "DEFAULT", "STATE", "REASON", "MESSAGE"] | @tsv), + ( + .items[] + | (.status.conditions // []) as $conditions + | ($conditions[-1] // {}) as $latest + | [ + .metadata.name, + (if .spec.default == true then "true" else "-" end), + (.status.state // "-"), + ($latest.reason // "-"), + ($latest.message // "-") + ] + | @tsv + ) + ' + fi +} + wait_for_driver_upgrade_done() { gpu_node_count=$(kubectl get node -l nvidia.com/gpu.present --no-headers | wc -l) local current_time=0 @@ -221,13 +261,16 @@ wait_for_driver_upgrade_done() { if [[ "${current_time}" -gt $((60 * 45)) ]]; then echo "timeout reached" + print_driver_upgrade_debug exit 1; fi - echo "current state of driver upgrade" - kubectl get node -l nvidia.com/gpu.present \ - -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.metadata.labels.nvidia\.com/gpu-driver-upgrade-state}{"\n"}{end}' - + if [[ $((current_time % 30)) -eq 0 ]]; then + print_driver_upgrade_debug + else + kubectl get node -l nvidia.com/gpu.present \ + -o custom-columns=NODE:.metadata.name,OWNER:.metadata.labels.nvidia\\.com/gpu-operator\\.driver\\.owner,UPGRADE_STATE:.metadata.labels.nvidia\\.com/gpu-driver-upgrade-state --no-headers + fi echo "Sleeping 5 seconds" current_time=$((${current_time} + 5)) diff --git a/tests/scripts/end-to-end-nvidia-driver.sh b/tests/scripts/end-to-end-nvidia-driver.sh index bab6e9ac0..cd5408645 100755 --- a/tests/scripts/end-to-end-nvidia-driver.sh +++ b/tests/scripts/end-to-end-nvidia-driver.sh @@ -3,12 +3,50 @@ SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" source "${SCRIPT_DIR}"/.definitions.sh +test_nvidiadriver_helm_render_options() { + local render_file + render_file=$(mktemp) + + ${HELM} template gpu-operator "${PROJECT_DIR}/deployments/gpu-operator" \ + -n "${TEST_NAMESPACE}" \ + --set driver.nvidiaDriverCRD.enabled=true \ + --set driver.nvidiaDriverCRD.deployDefaultCR=true > "${render_file}" + grep -q "kind: NVIDIADriver" "${render_file}" + grep -q "default: true" "${render_file}" + + ${HELM} template gpu-operator "${PROJECT_DIR}/deployments/gpu-operator" \ + -n "${TEST_NAMESPACE}" \ + --set driver.nvidiaDriverCRD.enabled=true \ + --set driver.nvidiaDriverCRD.deployDefaultCR=false > "${render_file}" + if grep -q "kind: NVIDIADriver" "${render_file}"; then + echo "NVIDIADriver rendered when driver.nvidiaDriverCRD.deployDefaultCR=false" + exit 1 + fi + + ${HELM} template gpu-operator "${PROJECT_DIR}/deployments/gpu-operator" \ + -n "${TEST_NAMESPACE}" \ + --set driver.nvidiaDriverCRD.enabled=false \ + --set driver.nvidiaDriverCRD.deployDefaultCR=true > "${render_file}" + if grep -q "kind: NVIDIADriver" "${render_file}"; then + echo "NVIDIADriver rendered when driver.nvidiaDriverCRD.enabled=false" + exit 1 + fi + + rm -f "${render_file}" +} + +echo "" +echo "" +echo "------------------------Checking NvidiaDriver Helm Rendering------------------------" +echo "-----------------------------------------------------------------------------------" +test_nvidiadriver_helm_render_options + echo "" echo "" echo "--------------Installing the GPU Operator with NvidiaDriverCR Enabled--------------" echo "-----------------------------------------------------------------------------------" # Install the operator with nvidiaDriver mode set to true -OPERATOR_OPTIONS="--set driver.nvidiaDriverCRD.enabled=true" ${SCRIPT_DIR}/install-operator.sh +OPERATOR_OPTIONS="${OPERATOR_OPTIONS:-} --set driver.nvidiaDriverCRD.enabled=true --set driver.nvidiaDriverCRD.deployDefaultCR=true" ${SCRIPT_DIR}/install-operator.sh USE_NVIDIA_DRIVER_CR="true" "${SCRIPT_DIR}"/verify-operator.sh USE_NVIDIA_DRIVER_CR="true" "${SCRIPT_DIR}"/verify-operand-restarts.sh diff --git a/tests/scripts/end-to-end.sh b/tests/scripts/end-to-end.sh index 46fceb949..03ce90d18 100755 --- a/tests/scripts/end-to-end.sh +++ b/tests/scripts/end-to-end.sh @@ -43,7 +43,18 @@ test_restart_operator ${TEST_NAMESPACE} ${CONTAINER_RUNTIME} "${SCRIPT_DIR}"/enable-operands.sh "${SCRIPT_DIR}"/verify-operator.sh -# Uninstall the workload and operator +echo "" +echo "" +echo "--------------ClusterPolicy to NVIDIADriver Migration Test--------------------------" +echo "------------------------------------------------------------------------------------" +"${SCRIPT_DIR}"/migrate-clusterpolicy-to-nvidiadriver.sh + +echo "Run GPU test workload after migration" +"${SCRIPT_DIR}"/uninstall-workload.sh +"${SCRIPT_DIR}"/install-workload.sh +"${SCRIPT_DIR}"/verify-workload.sh + +# Remove the completed workload before the next workload check. "${SCRIPT_DIR}"/uninstall-workload.sh "${SCRIPT_DIR}"/uninstall-operator.sh diff --git a/tests/scripts/migrate-clusterpolicy-to-nvidiadriver.sh b/tests/scripts/migrate-clusterpolicy-to-nvidiadriver.sh new file mode 100755 index 000000000..8cbaae72c --- /dev/null +++ b/tests/scripts/migrate-clusterpolicy-to-nvidiadriver.sh @@ -0,0 +1,193 @@ +#!/bin/bash + +if [[ "${SKIP_MIGRATION}" == "true" ]]; then + echo "Skipping migration: SKIP_MIGRATION=${SKIP_MIGRATION}" + exit 0 +fi + +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" +source ${SCRIPT_DIR}/.definitions.sh + +# Import the check definitions +source ${SCRIPT_DIR}/checks.sh + +OPERATOR_REPOSITORY=$(dirname "${OPERATOR_IMAGE}") +OPERATOR_OPTIONS="${OPERATOR_OPTIONS:-} --set operator.repository=${OPERATOR_REPOSITORY} --set validator.repository=${OPERATOR_REPOSITORY}" +if [[ -n "${OPERATOR_VERSION}" ]]; then + OPERATOR_OPTIONS="${OPERATOR_OPTIONS} --set operator.version=${OPERATOR_VERSION} --set validator.version=${OPERATOR_VERSION}" +fi +OPERATOR_OPTIONS="${OPERATOR_OPTIONS} --set operator.defaultRuntime=${CONTAINER_RUNTIME}" + +get_helm_release_name() { + ${HELM} list -n "${TEST_NAMESPACE}" | grep gpu-operator | awk '{print $1}' +} + +wait_for_legacy_driver_daemonset_deleted() { + local elapsed_time=0 + + echo "Waiting for ClusterPolicy-owned driver DaemonSet to be deleted" + while :; do + daemonset_count=$(kubectl get daemonset -l app=nvidia-driver-daemonset -n "${TEST_NAMESPACE}" --no-headers 2>/dev/null | wc -l) + if [[ "${daemonset_count}" -eq 0 ]]; then + break + fi + + if [[ "${elapsed_time}" -gt 300 ]]; then + echo "timeout reached waiting for legacy driver DaemonSet deletion" + kubectl get daemonset -n "${TEST_NAMESPACE}" -o wide + exit 1 + fi + + sleep 5 + elapsed_time=$((${elapsed_time} + 5)) + done +} + +wait_for_orphaned_legacy_driver_pod() { + local pod_name=$1 + local elapsed_time=0 + + echo "Waiting for legacy driver pod/${pod_name} to become orphaned" + while :; do + owner_count=$(kubectl get pod "${pod_name}" -n "${TEST_NAMESPACE}" -o json | jq '.metadata.ownerReferences | length') + if [[ "${owner_count}" -eq 0 ]]; then + echo "legacy driver pod/${pod_name} is orphaned" + break + fi + + if [[ "${elapsed_time}" -gt 300 ]]; then + echo "timeout reached waiting for legacy driver pod to become orphaned" + kubectl get pod "${pod_name}" -n "${TEST_NAMESPACE}" -o yaml + exit 1 + fi + + sleep 5 + elapsed_time=$((${elapsed_time} + 5)) + done +} + +wait_for_default_nvidiadriver() { + local elapsed_time=0 + + echo "Waiting for default NVIDIADriver to be rendered" + while :; do + default_count=$(kubectl get nvidiadriver -o json 2>/dev/null | jq '[.items[] | select(.spec.default == true)] | length') + if [[ "${default_count}" -eq 1 ]]; then + break + fi + + if [[ "${elapsed_time}" -gt 300 ]]; then + echo "timeout reached waiting for default NVIDIADriver" + kubectl get nvidiadriver || true + exit 1 + fi + + sleep 5 + elapsed_time=$((${elapsed_time} + 5)) + done +} + +wait_for_nvidiadriver_owner_labels() { + local driver_name=$1 + local elapsed_time=0 + local gpu_node_count + + gpu_node_count=$(kubectl get node -l nvidia.com/gpu.present=true --no-headers | wc -l) + echo "Waiting for ${gpu_node_count} GPU node(s) to be owned by NVIDIADriver/${driver_name}" + + while :; do + owned_count=$(kubectl get nodes -l "nvidia.com/gpu.present=true,nvidia.com/gpu-operator.driver.owner=${driver_name}" --no-headers | wc -l) + if [[ "${owned_count}" -eq "${gpu_node_count}" ]]; then + break + fi + + if [[ "${elapsed_time}" -gt 300 ]]; then + echo "timeout reached waiting for NVIDIADriver owner labels" + kubectl get nodes -l nvidia.com/gpu.present=true -o json | + jq -r '.items[] | [.metadata.name, (.metadata.labels["nvidia.com/gpu-operator.driver.owner"] // "-")] | @tsv' + exit 1 + fi + + sleep 5 + elapsed_time=$((${elapsed_time} + 5)) + done +} + +wait_for_nvidiadriver_daemonset() { + local driver_name=$1 + local elapsed_time=0 + + echo "Waiting for NVIDIADriver-owned driver DaemonSet" + while :; do + daemonset_count=$(kubectl get daemonset -l "app.kubernetes.io/component=nvidia-driver" -n "${TEST_NAMESPACE}" -o json | + jq --arg driver_name "${driver_name}" '[.items[] | select(.spec.template.spec.nodeSelector["nvidia.com/gpu-operator.driver.owner"] == $driver_name)] | length') + if [[ "${daemonset_count}" -gt 0 ]]; then + break + fi + + if [[ "${elapsed_time}" -gt 300 ]]; then + echo "timeout reached waiting for NVIDIADriver-owned driver DaemonSet" + kubectl get daemonset -n "${TEST_NAMESPACE}" -o yaml + exit 1 + fi + + sleep 5 + elapsed_time=$((${elapsed_time} + 5)) + done +} + +wait_for_legacy_driver_pod_deleted() { + local pod_name=$1 + local elapsed_time=0 + + echo "Waiting for orphaned legacy driver pod/${pod_name} to be deleted by the upgrade flow" + while :; do + if ! kubectl get pod "${pod_name}" -n "${TEST_NAMESPACE}" >/dev/null 2>&1; then + break + fi + + if [[ "${elapsed_time}" -gt 300 ]]; then + echo "timeout reached waiting for orphaned legacy driver pod deletion" + print_driver_upgrade_debug + kubectl get pod "${pod_name}" -n "${TEST_NAMESPACE}" -o yaml || true + exit 1 + fi + + print_driver_upgrade_debug + sleep 5 + elapsed_time=$((${elapsed_time} + 5)) + done +} + +legacy_driver_pod=$(kubectl get pod -l app=nvidia-driver-daemonset -n "${TEST_NAMESPACE}" -o jsonpath='{.items[0].metadata.name}') +if [[ -z "${legacy_driver_pod}" ]]; then + echo "legacy ClusterPolicy driver pod not found" + kubectl get pods -n "${TEST_NAMESPACE}" -o wide + exit 1 +fi + +operator_name=$(get_helm_release_name) +if [[ -z "${operator_name}" ]]; then + echo "GPU Operator Helm release not found in namespace ${TEST_NAMESPACE}" + ${HELM} list -n "${TEST_NAMESPACE}" + exit 1 +fi + +echo "Migrating Helm release/${operator_name} from ClusterPolicy driver management to NVIDIADriver" +${HELM} upgrade "${operator_name}" "${PROJECT_DIR}/deployments/gpu-operator" \ + -n "${TEST_NAMESPACE}" \ + --reuse-values \ + ${OPERATOR_OPTIONS:-} \ + --set driver.nvidiaDriverCRD.enabled=true \ + --set driver.nvidiaDriverCRD.deployDefaultCR=true \ + --wait + +wait_for_legacy_driver_daemonset_deleted +wait_for_orphaned_legacy_driver_pod "${legacy_driver_pod}" +wait_for_default_nvidiadriver +wait_for_nvidiadriver_owner_labels default +wait_for_nvidiadriver_daemonset default + +wait_for_legacy_driver_pod_deleted "${legacy_driver_pod}" +wait_for_driver_upgrade_done +check_nvidia_driver_pods_ready diff --git a/tests/scripts/update-nvidiadriver.sh b/tests/scripts/update-nvidiadriver.sh index 70a49ff5b..384bb3c99 100755 --- a/tests/scripts/update-nvidiadriver.sh +++ b/tests/scripts/update-nvidiadriver.sh @@ -11,23 +11,193 @@ source ${SCRIPT_DIR}/.definitions.sh # Import the check definitions source ${SCRIPT_DIR}/checks.sh +NVIDIA_DRIVER_NAME="${NVIDIA_DRIVER_NAME:-e2e-driver}" +DEFAULT_NVIDIA_DRIVER_NAME="${DEFAULT_NVIDIA_DRIVER_NAME:-e2e-default-driver}" + +get_default_nvidiadriver_name() { + kubectl get nvidiadriver -o json | + jq -r '.items[] | select(.spec.default == true) | .metadata.name' | head -n 1 +} + +get_default_nvidiadriver_count() { + kubectl get nvidiadriver -o json | + jq '[.items[] | select(.spec.default == true)] | length' +} + +create_nvidiadriver_from() { + local source_name=$1 + local target_name=$2 + local default_driver=${3:-false} + + kubectl get nvidiadriver/"${source_name}" -o json | jq --arg name "${target_name}" --argjson default_driver "${default_driver}" ' + { + apiVersion: .apiVersion, + kind: .kind, + metadata: { + name: $name + }, + spec: (.spec + {default: $default_driver}) + } + ' | kubectl apply -f - +} + +unset_default_driver() { + local driver_name=$1 + + kubectl patch nvidiadriver/"${driver_name}" --type='merge' -p='{"spec":{"default":false}}' +} + +set_default_driver() { + local driver_name=$1 + + kubectl patch nvidiadriver/"${driver_name}" --type='merge' -p='{"spec":{"default":true}}' +} + +wait_for_default_nvidiadriver() { + local expected_name=$1 + local current_time=0 + + echo "Waiting for NVIDIADriver/${expected_name} to be the only default" + while :; do + default_count=$(get_default_nvidiadriver_count) + default_name=$(get_default_nvidiadriver_name) + if [[ "${default_count}" -eq 1 && "${default_name}" == "${expected_name}" ]]; then + echo "NVIDIADriver/${expected_name} is the only default" + break + fi + + if [[ "${current_time}" -gt 120 ]]; then + echo "timeout reached waiting for NVIDIADriver/${expected_name} to be the only default" + kubectl get nvidiadriver + exit 1 + fi + + sleep 5 + current_time=$((${current_time} + 5)) + done +} + +test_arbitrary_name_default_nvidiadriver() { + local current_default + current_default=$(get_default_nvidiadriver_name) + if [[ -z "${current_default}" ]]; then + echo "default NVIDIADriver not found" + kubectl get nvidiadriver + exit 1 + fi + + if [[ "${current_default}" == "${DEFAULT_NVIDIA_DRIVER_NAME}" ]]; then + wait_for_default_nvidiadriver "${DEFAULT_NVIDIA_DRIVER_NAME}" + return + fi + + echo "Moving default field from NVIDIADriver/${current_default} to NVIDIADriver/${DEFAULT_NVIDIA_DRIVER_NAME}" + unset_default_driver "${current_default}" + if ! create_nvidiadriver_from "${current_default}" "${DEFAULT_NVIDIA_DRIVER_NAME}" true; then + echo "failed to create NVIDIADriver/${DEFAULT_NVIDIA_DRIVER_NAME}; restoring NVIDIADriver/${current_default} default field before failing" + set_default_driver "${current_default}" + exit 1 + fi + kubectl delete nvidiadriver/"${current_default}" + wait_for_default_nvidiadriver "${DEFAULT_NVIDIA_DRIVER_NAME}" + wait_for_nvidiadriver_owner "${DEFAULT_NVIDIA_DRIVER_NAME}" + wait_for_nvidiadriver_daemonsets "${DEFAULT_NVIDIA_DRIVER_NAME}" +} + +create_nvidiadriver() { + local default_name + default_name=$(get_default_nvidiadriver_name) + if [[ -z "${default_name}" ]]; then + echo "default NVIDIADriver not found" + kubectl get nvidiadriver + exit 1 + fi + + echo "Creating user-defined NVIDIADriver/${NVIDIA_DRIVER_NAME} from NVIDIADriver/${default_name}" + create_nvidiadriver_from "${default_name}" "${NVIDIA_DRIVER_NAME}" false +} + +wait_for_nvidiadriver_owner() { + local driver_name=$1 + local current_time=0 + local gpu_node_count + + gpu_node_count=$(kubectl get node -l nvidia.com/gpu.present=true --no-headers | wc -l) + echo "Waiting for ${gpu_node_count} GPU node(s) to be owned by NVIDIADriver/${driver_name}" + + while :; do + owned_count=$(kubectl get nodes -l "nvidia.com/gpu.present=true,nvidia.com/gpu-operator.driver.owner=${driver_name}" --no-headers | wc -l) + if [[ "${owned_count}" -eq "${gpu_node_count}" ]]; then + echo "All GPU nodes are owned by NVIDIADriver/${driver_name}" + break + fi + + if [[ "${current_time}" -gt $((60 * 15)) ]]; then + echo "timeout reached waiting for NVIDIADriver/${driver_name} ownership" + kubectl get nodes -l nvidia.com/gpu.present=true -o json | + jq -r '.items[] | [.metadata.name, (.metadata.labels["nvidia.com/gpu-operator.driver.owner"] // "-")] | @tsv' + exit 1 + fi + + echo "NVIDIADriver/${driver_name} owns ${owned_count}/${gpu_node_count} GPU node(s)" + sleep 5 + current_time=$((${current_time} + 5)) + done +} + +get_nvidiadriver_daemonsets() { + local driver_name=$1 + kubectl get daemonset -l "app.kubernetes.io/component=nvidia-driver" -n "$TEST_NAMESPACE" -o json | + jq --arg driver_name "${driver_name}" '.items | map(select(.spec.template.spec.nodeSelector["nvidia.com/gpu-operator.driver.owner"] == $driver_name))' +} + +wait_for_nvidiadriver_daemonsets() { + local driver_name=$1 + local current_time=0 + + echo "Waiting for daemonsets owned by NVIDIADriver/${driver_name}" + while :; do + daemonset_count=$(get_nvidiadriver_daemonsets "${driver_name}" | jq length) + if [[ "${daemonset_count}" -gt 0 ]]; then + echo "Found ${daemonset_count} daemonset(s) owned by NVIDIADriver/${driver_name}" + break + fi + + if [[ "${current_time}" -gt $((60 * 15)) ]]; then + echo "timeout reached waiting for daemonsets owned by NVIDIADriver/${driver_name}" + kubectl get daemonset -l "app.kubernetes.io/component=nvidia-driver" -n "$TEST_NAMESPACE" -o yaml + exit 1 + fi + + sleep 5 + current_time=$((${current_time} + 5)) + done +} + test_driver_image_updates() { # Update driver image version - kubectl patch nvidiadriver/default --type='json' -p='[{"op": "replace", "path": "/spec/version", "value": '"$TARGET_DRIVER_VERSION"'}]' + kubectl patch nvidiadriver/"${NVIDIA_DRIVER_NAME}" --type='json' -p='[{"op": "replace", "path": "/spec/version", "value": '"$TARGET_DRIVER_VERSION"'}]' if [ "$?" -ne 0 ]; then echo "cannot update driver image with version $TARGET_DRIVER_VERSION for driver-daemonset" exit 1 fi - # Wait for 10 seconds for the change to be applied by operator - sleep 10 - # Verify update is applied to Driver Daemonset - UPDATED_IMAGE=$(kubectl get daemonset -l "app.kubernetes.io/component=nvidia-driver" -n "$TEST_NAMESPACE" -o json | jq '.items[0].spec.template.spec.containers[0].image') - if [[ "$UPDATED_IMAGE" != *"$TARGET_DRIVER_VERSION"* ]]; then - echo "Image update failed for driver daemonset to version $TARGET_DRIVER_VERSION" - exit 1 - fi + local current_time=0 + while :; do + if get_nvidiadriver_daemonsets "${NVIDIA_DRIVER_NAME}" | jq -e --arg version "${TARGET_DRIVER_VERSION}" 'length > 0 and all(.[]; .spec.template.spec.containers[0].image | contains($version))' >/dev/null; then + break + fi + + if [[ "${current_time}" -gt 120 ]]; then + echo "Image update failed for driver daemonset to version $TARGET_DRIVER_VERSION" + get_nvidiadriver_daemonsets "${NVIDIA_DRIVER_NAME}" + exit 1 + fi + + sleep 5 + current_time=$((${current_time} + 5)) + done echo "driver daemonset image updated successfully to version $TARGET_DRIVER_VERSION" # Delete driver pod to trigger update due to OnDelete policy @@ -44,7 +214,7 @@ test_driver_image_updates() { } test_custom_labels_override() { - if ! kubectl patch nvidiadriver/default --type='json' -p='[{"op": "add", "path": "/spec/labels", "value": {"cloudprovider": "aws", "platform": "kubernetes"}}]'; + if ! kubectl patch nvidiadriver/"${NVIDIA_DRIVER_NAME}" --type='json' -p='[{"op": "add", "path": "/spec/labels", "value": {"cloudprovider": "aws", "platform": "kubernetes"}}]'; then echo "cannot update the labels of the NVIDIADriver resource" exit 1 @@ -52,11 +222,21 @@ test_custom_labels_override() { # Wait for the operator to update the pod template with new labels echo "Waiting for DaemonSet pod template to be updated with new labels..." - kubectl wait daemonset \ - -l "app.kubernetes.io/component=nvidia-driver" \ - -n "$TEST_NAMESPACE" \ - --for=jsonpath='{.spec.template.metadata.labels.cloudprovider}'=aws \ - --timeout=120s + local current_time=0 + while :; do + if get_nvidiadriver_daemonsets "${NVIDIA_DRIVER_NAME}" | jq -e 'length > 0 and all(.[]; .spec.template.metadata.labels.cloudprovider == "aws" and .spec.template.metadata.labels.platform == "kubernetes")' >/dev/null; then + break + fi + + if [[ "${current_time}" -gt 120 ]]; then + echo "timeout reached waiting for DaemonSet pod template labels" + get_nvidiadriver_daemonsets "${NVIDIA_DRIVER_NAME}" + exit 1 + fi + + sleep 5 + current_time=$((${current_time} + 5)) + done # Delete driver pod to force recreation with updated labels. Existing pods are not automatically restarted due to the DaemonSet's 'OnDelete` updateStrategy. echo "Deleting driver pod to trigger recreation with updated labels..." @@ -68,20 +248,71 @@ test_custom_labels_override() { check_nvidia_driver_pods_ready echo "checking nvidia-driver-daemonset labels" - for pod in $(kubectl get pods -n "$TEST_NAMESPACE" -l "app.kubernetes.io/component=nvidia-driver" --output=jsonpath={.items..metadata.name}) - do - cp_label_value=$(kubectl get pod -n "$TEST_NAMESPACE" "$pod" --output jsonpath={.metadata.labels.cloudprovider}) - if [ "$cp_label_value" != "aws" ]; then - echo "Custom Label cloudprovider is incorrect when nvidiadriver labels are overridden - $pod" - exit 1 - fi - platform_label_value=$(kubectl get pod -n "$TEST_NAMESPACE" "$pod" --output jsonpath={.metadata.labels.platform}) - if [ "$platform_label_value" != "kubernetes" ]; then - echo "Custom Label platform is incorrect when nvidiadriver labels are overridden - $pod" + labeled_pod_count=$(kubectl get pods -n "$TEST_NAMESPACE" -l "app.kubernetes.io/component=nvidia-driver,cloudprovider=aws,platform=kubernetes" --no-headers | wc -l) + gpu_node_count=$(kubectl get node -l nvidia.com/gpu.present=true --no-headers | wc -l) + if [[ "${labeled_pod_count}" -ne "${gpu_node_count}" ]]; then + echo "Custom labels are missing from one or more NVIDIADriver/${NVIDIA_DRIVER_NAME} pods" + kubectl get pods -n "$TEST_NAMESPACE" -l "app.kubernetes.io/component=nvidia-driver" --show-labels + exit 1 + fi +} + +assert_nvidiadriver_owner_count() { + local driver_name=$1 + local gpu_node_count + local owned_count + + gpu_node_count=$(kubectl get node -l nvidia.com/gpu.present=true --no-headers | wc -l) + owned_count=$(kubectl get nodes -l "nvidia.com/gpu.present=true,nvidia.com/gpu-operator.driver.owner=${driver_name}" --no-headers | wc -l) + if [[ "${owned_count}" -ne "${gpu_node_count}" ]]; then + echo "Expected ${gpu_node_count} GPU node(s) to remain owned by NVIDIADriver/${driver_name}, found ${owned_count}" + kubectl get nodes -l nvidia.com/gpu.present=true -o json | + jq -r '.items[] | [.metadata.name, (.metadata.labels["nvidia.com/gpu-operator.driver.owner"] // "-")] | @tsv' exit 1 fi - done } +wait_for_nvidiadriver_condition_message() { + local driver_name=$1 + local message=$2 + local current_time=0 + + echo "Waiting for NVIDIADriver/${driver_name} status message to contain: ${message}" + while :; do + if kubectl get nvidiadriver/"${driver_name}" -o json | jq -e --arg message "${message}" ' + (.status.state // "") == "notReady" and + ([.status.conditions[]?.message // ""] | any(contains($message))) + ' >/dev/null; then + break + fi + + if [[ "${current_time}" -gt 120 ]]; then + echo "timeout reached waiting for NVIDIADriver/${driver_name} status message" + kubectl get nvidiadriver/"${driver_name}" -o yaml + exit 1 + fi + + sleep 5 + current_time=$((${current_time} + 5)) + done +} + +test_removed_default_field_conflict_preserves_owners() { + echo "Testing that clearing the default field makes the CR a normal conflicting NVIDIADriver" + unset_default_driver "${DEFAULT_NVIDIA_DRIVER_NAME}" + wait_for_nvidiadriver_condition_message "${NVIDIA_DRIVER_NAME}" "multiple NVIDIADrivers match the same node" + wait_for_nvidiadriver_condition_message "${DEFAULT_NVIDIA_DRIVER_NAME}" "multiple NVIDIADrivers match the same node" + assert_nvidiadriver_owner_count "${NVIDIA_DRIVER_NAME}" + set_default_driver "${DEFAULT_NVIDIA_DRIVER_NAME}" + wait_for_default_nvidiadriver "${DEFAULT_NVIDIA_DRIVER_NAME}" + wait_for_nvidiadriver_owner "${NVIDIA_DRIVER_NAME}" +} + +test_arbitrary_name_default_nvidiadriver +create_nvidiadriver +wait_for_nvidiadriver_owner "${NVIDIA_DRIVER_NAME}" +wait_for_nvidiadriver_daemonsets "${NVIDIA_DRIVER_NAME}" +check_nvidia_driver_pods_ready test_driver_image_updates test_custom_labels_override +test_removed_default_field_conflict_preserves_owners