Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions api/nvidia/v1/clusterpolicy_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -2183,6 +2183,11 @@ func (d *DriverSpec) UseNvidiaDriverCRDType() bool {
return *d.UseNvidiaDriverCRD
}

// IsNVIDIADriverCRDEnabled returns true if driver management is enabled through NVIDIADriver CRs.
func (d *DriverSpec) IsNVIDIADriverCRDEnabled() bool {
return d != nil && d.IsEnabled() && d.UseNvidiaDriverCRDType()
}

// UsePrecompiledDrivers returns true if driver install is enabled using pre-compiled modules
func (d *DriverSpec) UsePrecompiledDrivers() bool {
if d.UsePrecompiled == nil {
Expand Down
76 changes: 76 additions & 0 deletions api/nvidia/v1/clusterpolicy_types_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
/*
* Copyright (c) NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package v1

import (
"testing"

"github.com/stretchr/testify/require"
)

func TestDriverSpecIsNVIDIADriverCRDEnabled(t *testing.T) {
driverEnabled := true
driverDisabled := false
crdEnabled := true
crdDisabled := false

tests := []struct {
name string
driverEnabled *bool
crdEnabled *bool
expected bool
}{
{
name: "driver enabled by default and CRD enabled",
crdEnabled: &crdEnabled,
expected: true,
},
{
name: "CRD disabled",
crdEnabled: &crdDisabled,
expected: false,
},
{
name: "driver disabled",
driverEnabled: &driverDisabled,
crdEnabled: &crdEnabled,
expected: false,
},
{
name: "driver explicitly enabled and CRD enabled",
driverEnabled: &driverEnabled,
crdEnabled: &crdEnabled,
expected: true,
},
}

for _, tc := range tests {
t.Run(tc.name, func(t *testing.T) {
driverSpec := DriverSpec{
Enabled: tc.driverEnabled,
UseNvidiaDriverCRD: tc.crdEnabled,
}

require.Equal(t, tc.expected, driverSpec.IsNVIDIADriverCRDEnabled())
})
}

t.Run("nil driver spec", func(t *testing.T) {
var driverSpec *DriverSpec
require.False(t, driverSpec.IsNVIDIADriverCRDEnabled())
})
}
16 changes: 15 additions & 1 deletion api/nvidia/v1alpha1/nvidiadriver_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,11 +36,19 @@ const (
// EDIT THIS FILE! THIS IS SCAFFOLDING FOR YOU TO OWN!
// NOTE: json tags are required. Any new fields you add must have json tags for the fields to be serialized.

// NVIDIADriverSpec defines the desired state of NVIDIADriver
// NVIDIADriverSpec defines the desired state of NVIDIADriver.
// The CEL validation allows non-default drivers to use nodeSelector, but requires
// default drivers to leave nodeSelector unset or empty.
// +kubebuilder:validation:XValidation:rule="has(self.default) && self.default ? !has(self.nodeSelector) || size(self.nodeSelector) == 0 : true",message="default NVIDIADriver must not use nodeSelector"
type NVIDIADriverSpec struct {
// INSERT ADDITIONAL SPEC FIELDS - desired state of cluster
// Important: Run "make" to regenerate code after modifying this file

// Default indicates that this NVIDIADriver acts as the fallback driver for GPU nodes
// that do not match any non-default NVIDIADriver nodeSelector.
// +kubebuilder:default=false
Default bool `json:"default"`

// +kubebuilder:validation:Enum=gpu;vgpu;vgpu-host-manager
// +kubebuilder:default=gpu
// +kubebuilder:validation:XValidation:rule="self == oldSelf",message="driverType is an immutable field. Please create a new NvidiaDriver resource instead when you want to change this setting."
Expand Down Expand Up @@ -504,6 +512,7 @@ type NVIDIADriverStatus struct {
//+kubebuilder:subresource:status
//+kubebuilder:resource:scope=Cluster,shortName={"nvd","nvdriver","nvdrivers"}
//+kubebuilder:printcolumn:name="Status",type=string,JSONPath=`.status.state`,priority=0
//+kubebuilder:printcolumn:name="Default",type=boolean,JSONPath=`.spec.default`,priority=0
//+kubebuilder:printcolumn:name="Age",type=string,JSONPath=`.metadata.creationTimestamp`,priority=0

// NVIDIADriver is the Schema for the nvidiadrivers API
Expand All @@ -524,6 +533,11 @@ type NVIDIADriverList struct {
Items []NVIDIADriver `json:"items"`
}

// IsDefault returns true when the NVIDIADriver is marked as the fallback driver.
func (d *NVIDIADriver) IsDefault() bool {
return d != nil && d.Spec.Default
}

// UsePrecompiledDrivers returns true if usePrecompiled option is enabled in spec
func (d *NVIDIADriverSpec) UsePrecompiledDrivers() bool {
if d.UsePrecompiled == nil {
Expand Down
19 changes: 18 additions & 1 deletion bundle/manifests/nvidia.com_nvidiadrivers.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,9 @@ spec:
- jsonPath: .status.state
name: Status
type: string
- jsonPath: .spec.default
name: Default
type: boolean
- jsonPath: .metadata.creationTimestamp
name: Age
type: string
Expand All @@ -48,7 +51,10 @@ spec:
metadata:
type: object
spec:
description: NVIDIADriverSpec defines the desired state of NVIDIADriver
description: |-
NVIDIADriverSpec defines the desired state of NVIDIADriver.
The CEL validation allows non-default drivers to use nodeSelector, but requires
default drivers to leave nodeSelector unset or empty.
properties:
annotations:
additionalProperties:
Expand All @@ -70,6 +76,12 @@ spec:
name:
type: string
type: object
default:
default: false
description: |-
Default indicates that this NVIDIADriver acts as the fallback driver for GPU nodes
that do not match any non-default NVIDIADriver nodeSelector.
type: boolean
driverType:
default: gpu
description: DriverType defines NVIDIA driver type
Expand Down Expand Up @@ -968,9 +980,14 @@ spec:
type: string
type: object
required:
- default
- driverType
- image
type: object
x-kubernetes-validations:
- message: default NVIDIADriver must not use nodeSelector
rule: 'has(self.default) && self.default ? !has(self.nodeSelector) ||
size(self.nodeSelector) == 0 : true'
status:
description: NVIDIADriverStatus defines the observed state of NVIDIADriver
properties:
Expand Down
19 changes: 18 additions & 1 deletion config/crd/bases/nvidia.com_nvidiadrivers.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,9 @@ spec:
- jsonPath: .status.state
name: Status
type: string
- jsonPath: .spec.default
name: Default
type: boolean
- jsonPath: .metadata.creationTimestamp
name: Age
type: string
Expand All @@ -48,7 +51,10 @@ spec:
metadata:
type: object
spec:
description: NVIDIADriverSpec defines the desired state of NVIDIADriver
description: |-
NVIDIADriverSpec defines the desired state of NVIDIADriver.
The CEL validation allows non-default drivers to use nodeSelector, but requires
default drivers to leave nodeSelector unset or empty.
properties:
annotations:
additionalProperties:
Expand All @@ -70,6 +76,12 @@ spec:
name:
type: string
type: object
default:
default: false
description: |-
Default indicates that this NVIDIADriver acts as the fallback driver for GPU nodes
that do not match any non-default NVIDIADriver nodeSelector.
type: boolean
driverType:
default: gpu
description: DriverType defines NVIDIA driver type
Expand Down Expand Up @@ -968,9 +980,14 @@ spec:
type: string
type: object
required:
- default
- driverType
- image
type: object
x-kubernetes-validations:
- message: default NVIDIADriver must not use nodeSelector
rule: 'has(self.default) && self.default ? !has(self.nodeSelector) ||
size(self.nodeSelector) == 0 : true'
status:
description: NVIDIADriverStatus defines the observed state of NVIDIADriver
properties:
Expand Down
93 changes: 87 additions & 6 deletions controllers/nvidiadriver_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ import (
"github.com/NVIDIA/gpu-operator/controllers/clusterinfo"
"github.com/NVIDIA/gpu-operator/internal/conditions"
"github.com/NVIDIA/gpu-operator/internal/consts"
nvidiadriverutil "github.com/NVIDIA/gpu-operator/internal/nvidiadriver"
"github.com/NVIDIA/gpu-operator/internal/state"
"github.com/NVIDIA/gpu-operator/internal/validator"
)
Expand Down Expand Up @@ -83,9 +84,8 @@ func (r *NVIDIADriverReconciler) Reconcile(ctx context.Context, req ctrl.Request
if err := r.Get(ctx, req.NamespacedName, instance); err != nil {
if apierrors.IsNotFound(err) {
// Request object not found, could have been deleted after reconcile request.
// Owned objects are automatically garbage collected. For additional cleanup logic use finalizers.
// Return and don't requeue
return reconcile.Result{}, nil
// Re-run owner assignment so deleting the last NVIDIADriver clears stale node owner labels.
return r.cleanupNVIDIADriverOwnerLabelsAndReturn(ctx)
}
wrappedErr := fmt.Errorf("error getting NVIDIADriver object: %w", err)
logger.Error(err, "error getting NVIDIADriver object")
Expand All @@ -96,6 +96,10 @@ func (r *NVIDIADriverReconciler) Reconcile(ctx context.Context, req ctrl.Request
// Error reading the object - requeue the request.
return reconcile.Result{}, wrappedErr
}
if !instance.GetDeletionTimestamp().IsZero() {
logger.Info("NVIDIADriver delete requested; cleaning up owner labels")
return r.cleanupNVIDIADriverOwnerLabelsAndReturn(ctx)
}

// Get the singleton NVIDIA ClusterPolicy object in the cluster.
clusterPolicyList := &gpuv1.ClusterPolicyList{}
Expand All @@ -121,7 +125,7 @@ func (r *NVIDIADriverReconciler) Reconcile(ctx context.Context, req ctrl.Request
clusterPolicyInstance := clusterPolicyList.Items[0]

// Ensure that ClusterPolicy is configured to use NVIDIADriver CRD
if !clusterPolicyInstance.Spec.Driver.UseNvidiaDriverCRDType() {
if !clusterPolicyInstance.Spec.Driver.IsNVIDIADriverCRDEnabled() {
msg := "useNvidiaDriverCRD is not enabled in ClusterPolicy"
logger.V(consts.LogLevelWarning).Info("NVIDIADriver reconciliation skipped", "reason", msg)
instance.Status.State = nvidiav1alpha1.Disabled
Expand Down Expand Up @@ -152,6 +156,19 @@ func (r *NVIDIADriverReconciler) Reconcile(ctx context.Context, req ctrl.Request
return reconcile.Result{}, nil
}

changed, err := nvidiadriverutil.AssignOwners(ctx, r.Client)
if err != nil {
logger.Error(err, "failed to assign NVIDIADriver owners to nodes")
instance.Status.State = nvidiav1alpha1.NotReady
if condErr := r.conditionUpdater.SetConditionsError(ctx, instance, conditions.ReconcileFailed, err.Error()); condErr != nil {
logger.Error(condErr, "failed to set condition")
}
return reconcile.Result{}, err
}
if changed {
return reconcile.Result{RequeueAfter: time.Second}, nil
}
Comment thread
cdesiniotis marked this conversation as resolved.

if instance.Spec.UsePrecompiledDrivers() && (instance.Spec.IsGDSEnabled() || instance.Spec.IsGDRCopyEnabled()) {
err := errors.New("GPUDirect Storage driver (nvidia-fs) and/or GDRCopy driver is not supported along with pre-compiled NVIDIA drivers")
logger.Error(err, "unsupported driver combination detected")
Expand Down Expand Up @@ -188,6 +205,11 @@ func (r *NVIDIADriverReconciler) Reconcile(ctx context.Context, req ctrl.Request
// Sync state and update status
managerStatus := r.stateManager.SyncState(ctx, instance, infoCatalog)

if err := r.labelNodesWithOrphanedDriverPods(ctx); err != nil {
logger.Error(err, "failed to label nodes with orphaned NVIDIA driver pods")
return reconcile.Result{}, err
}

// update CR status
if err := r.updateCrStatus(ctx, instance, managerStatus); err != nil {
return ctrl.Result{}, err
Expand Down Expand Up @@ -276,6 +298,65 @@ func (r *NVIDIADriverReconciler) enqueueAllNVIDIADrivers(ctx context.Context) []
return reconcileRequests
}

// enqueueNVIDIADriverReconcilers enqueues the NVIDIADriver that triggered the
// event and all current NVIDIADriver instances. The triggering object is
// included even for delete events so the NotFound reconcile path can clear
// stale node owner labels.
func (r *NVIDIADriverReconciler) enqueueNVIDIADriverReconcilers(ctx context.Context, driver *nvidiav1alpha1.NVIDIADriver) []reconcile.Request {
requests := r.enqueueAllNVIDIADrivers(ctx)
if driver != nil {
requests = append(requests, reconcile.Request{
NamespacedName: types.NamespacedName{
Name: driver.GetName(),
Namespace: driver.GetNamespace(),
},
})
}
return dedupeReconcileRequests(requests)
}

func dedupeReconcileRequests(requests []reconcile.Request) []reconcile.Request {
seen := map[types.NamespacedName]struct{}{}
deduped := make([]reconcile.Request, 0, len(requests))
for _, request := range requests {
if _, ok := seen[request.NamespacedName]; ok {
continue
}
seen[request.NamespacedName] = struct{}{}
deduped = append(deduped, request)
}
return deduped
}

func (r *NVIDIADriverReconciler) cleanupNVIDIADriverOwnerLabelsAndReturn(ctx context.Context) (reconcile.Result, error) {
if err := r.cleanupNVIDIADriverOwnerLabels(ctx); err != nil {
log.FromContext(ctx).Error(err, "failed to cleanup NVIDIADriver owner labels")
return reconcile.Result{}, err
}
return reconcile.Result{}, nil
}

// cleanupNVIDIADriverOwnerLabels re-runs owner assignment after a delete event
// when ClusterPolicy is still configured for NVIDIADriver mode. This lets
// AssignOwners remove stale nvidia.com/gpu-operator.driver.owner labels when no remaining
// NVIDIADriver matches a GPU node, including deletion of the last NVIDIADriver.
func (r *NVIDIADriverReconciler) cleanupNVIDIADriverOwnerLabels(ctx context.Context) error {
clusterPolicyList := &gpuv1.ClusterPolicyList{}
if err := r.List(ctx, clusterPolicyList); err != nil {
return fmt.Errorf("error getting ClusterPolicy list: %w", err)
}

for i := range clusterPolicyList.Items {
if !clusterPolicyList.Items[i].Spec.Driver.IsNVIDIADriverCRDEnabled() {
continue
}
_, err := nvidiadriverutil.AssignOwners(ctx, r.Client)
return err
}

return nil
}

// SetupWithManager sets up the controller with the Manager.
func (r *NVIDIADriverReconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager) error {
// Create state manager
Expand Down Expand Up @@ -307,8 +388,8 @@ func (r *NVIDIADriverReconciler) SetupWithManager(ctx context.Context, mgr ctrl.

// Watch for changes to NVIDIADriver CRs. Whenever an event is generated for a NVIDIADriver CR,
// enqueue a reconcile request for all NVIDIADriver instances.
nvidiaDriverMapFn := func(ctx context.Context, _ *nvidiav1alpha1.NVIDIADriver) []reconcile.Request {
return r.enqueueAllNVIDIADrivers(ctx)
nvidiaDriverMapFn := func(ctx context.Context, driver *nvidiav1alpha1.NVIDIADriver) []reconcile.Request {
return r.enqueueNVIDIADriverReconcilers(ctx, driver)
}

// Watch for changes to the primary resource NVIDIADriver
Expand Down
Loading
Loading