Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
2bbe2ea
Backport DNS-ready graceful shutdown fixes to 25.8.0
Al-assad May 28, 2026
7b5a75b
feat: implement two-phase graceful BE restart for disaggregated compu…
Al-assad May 28, 2026
e37c65a
chore: regenerate CRD manifests and apply gofmt for two-phase gracefu…
Al-assad May 28, 2026
cdf225f
fix: resolve graceful rollout status not converging to Ready
Al-assad May 29, 2026
ff9d6a7
fix: create external service for disaggregated BE
Al-assad May 29, 2026
63acdbe
fix: wait be alive before advancing rollout
Al-assad May 29, 2026
ffd194a
fix: requeue reconciling disaggregated compute groups
Al-assad Jun 1, 2026
8eca95e
fix: avoid stale graceful rollout recovery
Al-assad Jun 1, 2026
03e2b55
fix: keep graceful rolling status active
Al-assad Jun 1, 2026
e754963
fix: explicitly clear graceful rollout state
Al-assad Jun 1, 2026
c8156b6
fix: preserve graceful compute group phase
Al-assad Jun 1, 2026
fc9a655
fix: disable be query before graceful drain
Al-assad Jun 1, 2026
1617c3d
fix: skip disable query in cloud mode
Al-assad Jun 1, 2026
337aa80
Revert "fix: skip disable query in cloud mode"
Al-assad Jun 1, 2026
60737cd
Revert "fix: disable be query before graceful drain"
Al-assad Jun 1, 2026
be2a465
fix: keep ddc be statefulset on ondelete
Al-assad Jun 3, 2026
9044c57
fix: harden ddc graceful rollout state machine
Al-assad Jun 4, 2026
9733ef7
feat: harden graceful be replacement generation checks
Al-assad Jun 4, 2026
4ad541b
chore: expand graceful rollout diagnostic logging
Al-assad Jun 4, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
103 changes: 103 additions & 0 deletions api/disaggregated/v1/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -384,8 +384,107 @@ const (
ResumeFailed Phase = "ResumeFailed"
SuspendFailed Phase = "SuspendFailed"
Suspended Phase = "Suspended"

// Graceful two-phase restart/shutdown phases
GracefulRolling Phase = "GracefulRolling"
GracefulScaling Phase = "GracefulScaling"
GracefulDeleting Phase = "GracefulDeleting"
)

// GracefulActionType describes the type of graceful action being performed.
// The value is stored in GracefulAction.Type and serialized under the JSON key "type".
type GracefulActionType string

const (
// GracefulActionRollingUpdate identifies a graceful rolling update.
// GracefulAction.TargetRevision is documented as applying to this type.
GracefulActionRollingUpdate GracefulActionType = "RollingUpdate"
// GracefulActionScaleDown identifies a graceful scale-down.
// GracefulAction.DesiredReplicas is documented as applying to this type.
GracefulActionScaleDown GracefulActionType = "ScaleDown"
// GracefulActionDelete identifies a graceful delete.
// NOTE(review): presumably paired with the GracefulDeleting phase; confirm against the controller.
GracefulActionDelete GracefulActionType = "Delete"
)

// GracefulActionPhase describes the current phase of a graceful action on a single pod.
// The value is stored in GracefulAction.Phase and serialized under the JSON key "phase".
// NOTE(review): the constants below are presumably listed in the order the state machine
// visits them, with Done and Failed as terminal states; confirm against the controller.
type GracefulActionPhase string

const (
// GracefulPhaseTriggerDrain is the step in which the drain is triggered for the current pod.
GracefulPhaseTriggerDrain GracefulActionPhase = "TriggerDrain"
// GracefulPhaseWaitDrain is the step that waits for the triggered drain;
// GracefulAction.DeadlineAt records when the drain timeout expires.
GracefulPhaseWaitDrain GracefulActionPhase = "WaitDrain"
// GracefulPhaseDeletePod is the step in which the current pod is deleted.
GracefulPhaseDeletePod GracefulActionPhase = "DeletePod"
// GracefulPhaseWaitPodReady is the step that waits for the pod to become ready.
GracefulPhaseWaitPodReady GracefulActionPhase = "WaitPodReady"
// GracefulPhaseWaitBEAlive is the step that polls for the BE to be alive;
// GracefulAction.StableBackendObservations counts consecutive polls in this phase.
GracefulPhaseWaitBEAlive GracefulActionPhase = "WaitBEAlive"
// GracefulPhaseDone marks the action as finished.
GracefulPhaseDone GracefulActionPhase = "Done"
// GracefulPhaseFailed marks the action as failed.
GracefulPhaseFailed GracefulActionPhase = "Failed"
)

// GracefulAction tracks the state of an in-progress graceful two-phase restart/shutdown operation.
//
// The fields fall into three groups:
//   - what is being done and to which pod (Type, Phase, CurrentPod, CurrentOrdinal,
//     TargetRevision, DesiredReplicas, StartedAt, DeadlineAt, LastMessage);
//   - bookkeeping for the drain of the current pod (DrainTriggered, InitialRestartCount,
//     SentinelWritten, RestartAnomalyDetected);
//   - identity of the generation being drained (Initial*) and of the generation that
//     replaces it (Replacement*, StableBackendObservations).
//
// Every field is tagged omitempty, so zero values are left out of the serialized form
// and decode back to the zero value.
type GracefulAction struct {
// Type is the kind of graceful action: RollingUpdate, ScaleDown, or Delete.
Type GracefulActionType `json:"type,omitempty"`

// Phase is the current step in the graceful action state machine.
Phase GracefulActionPhase `json:"phase,omitempty"`

// CurrentPod is the name of the pod currently being processed.
CurrentPod string `json:"currentPod,omitempty"`

// CurrentOrdinal is the ordinal index of the pod currently being processed.
// Because of omitempty, ordinal 0 is omitted when serialized and reads back as 0.
CurrentOrdinal int32 `json:"currentOrdinal,omitempty"`

// TargetRevision is the StatefulSet updateRevision being rolled out to (for RollingUpdate).
TargetRevision string `json:"targetRevision,omitempty"`

// DesiredReplicas is the target replica count (for ScaleDown).
// It is a pointer so that an unset value can be told apart from a target of 0.
DesiredReplicas *int32 `json:"desiredReplicas,omitempty"`

// StartedAt is when the current pod's graceful action began.
// NOTE(review): encoding/json ignores omitempty on non-pointer struct values, so a zero
// StartedAt is presumably still emitted (as null); confirm that is acceptable.
StartedAt metav1.Time `json:"startedAt,omitempty"`

// DeadlineAt is when the current pod's drain timeout expires.
// NOTE(review): same omitempty caveat as StartedAt.
DeadlineAt metav1.Time `json:"deadlineAt,omitempty"`

// LastMessage is a human-readable message about the current action state.
LastMessage string `json:"lastMessage,omitempty"`

// DrainTriggered indicates whether the drain exec has been triggered for the current pod.
DrainTriggered bool `json:"drainTriggered,omitempty"`

// InitialRestartCount records the BE container's restart count before drain, to detect kubelet restarts.
InitialRestartCount int32 `json:"initialRestartCount,omitempty"`

// SentinelWritten indicates whether the operator has written the terminating sentinel
// into the current pod before triggering graceful drain.
SentinelWritten bool `json:"sentinelWritten,omitempty"`

// RestartAnomalyDetected indicates that kubelet restarted the main container
// after the graceful drain was triggered.
RestartAnomalyDetected bool `json:"restartAnomalyDetected,omitempty"`

// InitialPodUID is the UID of the pod generation being drained.
InitialPodUID string `json:"initialPodUID,omitempty"`

// InitialContainerID is the main container ID of the pod generation being drained.
InitialContainerID string `json:"initialContainerID,omitempty"`

// InitialBackendStartTime is the FE-observed LastStartTime for the backend generation
// being drained. It is used to reject stale alive=true views during replacement.
// It is kept as a string, not a timestamp type.
InitialBackendStartTime string `json:"initialBackendStartTime,omitempty"`

// InitialBackendEpoch is reserved for FE-observed backend process epoch when available.
InitialBackendEpoch string `json:"initialBackendEpoch,omitempty"`

// ReplacementPodUID tracks the replacement pod generation once it is observed ready.
ReplacementPodUID string `json:"replacementPodUID,omitempty"`

// ReplacementContainerID tracks the replacement pod's main container ID.
ReplacementContainerID string `json:"replacementContainerID,omitempty"`

// ReplacementBackendStartTime records the FE-observed LastStartTime accepted for the replacement generation.
ReplacementBackendStartTime string `json:"replacementBackendStartTime,omitempty"`

// ReplacementBackendEpoch records the FE-observed backend process epoch accepted for the replacement generation.
ReplacementBackendEpoch string `json:"replacementBackendEpoch,omitempty"`

// StableBackendObservations counts consecutive WaitBEAlive polls that observed the same accepted replacement generation.
StableBackendObservations int32 `json:"stableBackendObservations,omitempty"`
}

type AvailableStatus string

const (
Expand Down Expand Up @@ -424,6 +523,10 @@ type ComputeGroupStatus struct {
// Total number of available pods (ready for at least minReadySeconds) targeted by this statefulset.
// +optional
AvailableReplicas int32 `json:"availableReplicas,omitempty"`

// GracefulAction tracks the state of an in-progress graceful two-phase restart/shutdown action.
// +optional
GracefulAction *GracefulAction `json:"gracefulAction,omitempty"`
}

type FEStatus struct {
Expand Down
12 changes: 8 additions & 4 deletions api/disaggregated/v1/unique_id.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,9 @@
package v1

import (
"crypto/sha256"
"math/big"
"strings"
"crypto/sha256"
"math/big"
"strings"
)

/*
Expand Down Expand Up @@ -64,6 +64,10 @@ func (ddc *DorisDisaggregatedCluster) GetCGServiceName(cg *ComputeGroup) string
return svcName
}

// GetCGExternalServiceName returns the external service name for the given compute group:
// the name produced by GetCGServiceName with the "-external" suffix appended.
func (ddc *DorisDisaggregatedCluster) GetCGExternalServiceName(cg *ComputeGroup) string {
	const externalSuffix = "-external"

	baseName := ddc.GetCGServiceName(cg)
	return baseName + externalSuffix
}

// GetFEServiceName returns the FE service name, which is the cluster name followed by "-fe".
func (ddc *DorisDisaggregatedCluster) GetFEServiceName() string {
	const feSuffix = "-fe"

	return ddc.Name + feSuffix
}
Expand All @@ -80,7 +84,7 @@ func (ddc *DorisDisaggregatedCluster) GetMSServiceName() string {
return ddc.Name + "-" + "ms"
}

//the first deployed used computegroup name, when user rename the compute group name by sql command `ALTER SYSTEM RENAME COMPUTE GROUP <old_name> <new_name>`, this function will not right.
// GetCGName returns the compute group name that was used when the compute group was first deployed. If the user later renames the compute group with the SQL command `ALTER SYSTEM RENAME COMPUTE GROUP <old_name> <new_name>`, the name returned by this function will no longer be correct.
func (ddc *DorisDisaggregatedCluster) GetCGName(cg *ComputeGroup) string {
// use uniqueId as compute group name, the uniqueId restrict not empty, and the computegroup's name should use "_" not "-"
return strings.ReplaceAll(cg.UniqueId, "-", "_")
Expand Down
31 changes: 30 additions & 1 deletion api/disaggregated/v1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

54 changes: 54 additions & 0 deletions config/crd/bases/crds.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16381,6 +16381,60 @@ spec:
description: the compute group id in doris meta, this response
to the backend's tag "compute_group_id";
type: string
gracefulAction:
  description: GracefulAction tracks the state of an in-progress
    graceful two-phase restart/shutdown action.
  properties:
    currentOrdinal:
      description: CurrentOrdinal is the ordinal index of the
        pod currently being processed.
      format: int32
      type: integer
    currentPod:
      description: CurrentPod is the name of the pod currently
        being processed.
      type: string
    deadlineAt:
      description: DeadlineAt is when the current pod's drain
        timeout expires.
      format: date-time
      type: string
    desiredReplicas:
      description: DesiredReplicas is the target replica count
        (for ScaleDown).
      format: int32
      type: integer
    drainTriggered:
      description: DrainTriggered indicates whether the drain
        exec has been triggered for the current pod.
      type: boolean
    initialBackendEpoch:
      description: InitialBackendEpoch is reserved for FE-observed
        backend process epoch when available.
      type: string
    initialBackendStartTime:
      description: InitialBackendStartTime is the FE-observed
        LastStartTime for the backend generation being drained.
        It is used to reject stale alive=true views during replacement.
      type: string
    initialContainerID:
      description: InitialContainerID is the main container ID
        of the pod generation being drained.
      type: string
    initialPodUID:
      description: InitialPodUID is the UID of the pod generation
        being drained.
      type: string
    initialRestartCount:
      description: InitialRestartCount records the BE container's
        restart count before drain, to detect kubelet restarts.
      format: int32
      type: integer
    lastMessage:
      description: LastMessage is a human-readable message about
        the current action state.
      type: string
    phase:
      description: Phase is the current step in the graceful action
        state machine.
      type: string
    replacementBackendEpoch:
      description: ReplacementBackendEpoch records the FE-observed
        backend process epoch accepted for the replacement generation.
      type: string
    replacementBackendStartTime:
      description: ReplacementBackendStartTime records the FE-observed
        LastStartTime accepted for the replacement generation.
      type: string
    replacementContainerID:
      description: ReplacementContainerID tracks the replacement
        pod's main container ID.
      type: string
    replacementPodUID:
      description: ReplacementPodUID tracks the replacement pod
        generation once it is observed ready.
      type: string
    restartAnomalyDetected:
      description: RestartAnomalyDetected indicates that kubelet
        restarted the main container after the graceful drain was
        triggered.
      type: boolean
    sentinelWritten:
      description: SentinelWritten indicates whether the operator
        has written the terminating sentinel into the current pod
        before triggering graceful drain.
      type: boolean
    stableBackendObservations:
      description: StableBackendObservations counts consecutive
        WaitBEAlive polls that observed the same accepted replacement
        generation.
      format: int32
      type: integer
    startedAt:
      description: StartedAt is when the current pod's graceful
        action began.
      format: date-time
      type: string
    targetRevision:
      description: TargetRevision is the StatefulSet updateRevision
        being rolled out to (for RollingUpdate).
      type: string
    type:
      description: 'Type is the kind of graceful action: RollingUpdate,
        ScaleDown, or Delete.'
      type: string
  type: object
phase:
description: Phase represent the stage of reconciling.
type: string
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7020,6 +7020,60 @@ spec:
description: the compute group id in doris meta, this response
to the backend's tag "compute_group_id";
type: string
gracefulAction:
  description: GracefulAction tracks the state of an in-progress
    graceful two-phase restart/shutdown action.
  properties:
    currentOrdinal:
      description: CurrentOrdinal is the ordinal index of the
        pod currently being processed.
      format: int32
      type: integer
    currentPod:
      description: CurrentPod is the name of the pod currently
        being processed.
      type: string
    deadlineAt:
      description: DeadlineAt is when the current pod's drain
        timeout expires.
      format: date-time
      type: string
    desiredReplicas:
      description: DesiredReplicas is the target replica count
        (for ScaleDown).
      format: int32
      type: integer
    drainTriggered:
      description: DrainTriggered indicates whether the drain
        exec has been triggered for the current pod.
      type: boolean
    initialBackendEpoch:
      description: InitialBackendEpoch is reserved for FE-observed
        backend process epoch when available.
      type: string
    initialBackendStartTime:
      description: InitialBackendStartTime is the FE-observed
        LastStartTime for the backend generation being drained.
        It is used to reject stale alive=true views during replacement.
      type: string
    initialContainerID:
      description: InitialContainerID is the main container ID
        of the pod generation being drained.
      type: string
    initialPodUID:
      description: InitialPodUID is the UID of the pod generation
        being drained.
      type: string
    initialRestartCount:
      description: InitialRestartCount records the BE container's
        restart count before drain, to detect kubelet restarts.
      format: int32
      type: integer
    lastMessage:
      description: LastMessage is a human-readable message about
        the current action state.
      type: string
    phase:
      description: Phase is the current step in the graceful action
        state machine.
      type: string
    replacementBackendEpoch:
      description: ReplacementBackendEpoch records the FE-observed
        backend process epoch accepted for the replacement generation.
      type: string
    replacementBackendStartTime:
      description: ReplacementBackendStartTime records the FE-observed
        LastStartTime accepted for the replacement generation.
      type: string
    replacementContainerID:
      description: ReplacementContainerID tracks the replacement
        pod's main container ID.
      type: string
    replacementPodUID:
      description: ReplacementPodUID tracks the replacement pod
        generation once it is observed ready.
      type: string
    restartAnomalyDetected:
      description: RestartAnomalyDetected indicates that kubelet
        restarted the main container after the graceful drain was
        triggered.
      type: boolean
    sentinelWritten:
      description: SentinelWritten indicates whether the operator
        has written the terminating sentinel into the current pod
        before triggering graceful drain.
      type: boolean
    stableBackendObservations:
      description: StableBackendObservations counts consecutive
        WaitBEAlive polls that observed the same accepted replacement
        generation.
      format: int32
      type: integer
    startedAt:
      description: StartedAt is when the current pod's graceful
        action began.
      format: date-time
      type: string
    targetRevision:
      description: TargetRevision is the StatefulSet updateRevision
        being rolled out to (for RollingUpdate).
      type: string
    type:
      description: 'Type is the kind of graceful action: RollingUpdate,
        ScaleDown, or Delete.'
      type: string
  type: object
phase:
description: Phase represent the stage of reconciling.
type: string
Expand Down
8 changes: 8 additions & 0 deletions config/operator/disaggregated-operator.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ rules:
verbs:
- create
- patch
- update
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
Expand Down Expand Up @@ -154,6 +155,13 @@ rules:
- list
- watch
- update
- delete
- apiGroups:
- ""
resources:
- pods/exec
verbs:
- create
- apiGroups:
- ""
resources:
Expand Down
3 changes: 3 additions & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -51,15 +51,18 @@ require (
github.com/google/gofuzz v1.2.0 // indirect
github.com/google/pprof v0.0.0-20241029153458-d1b30febd7db // indirect
github.com/google/uuid v1.6.0 // indirect
github.com/gorilla/websocket v1.5.0 // indirect
github.com/hashicorp/hcl v1.0.0 // indirect
github.com/inconshreveable/mousetrap v1.1.0 // indirect
github.com/josharian/intern v1.0.0 // indirect
github.com/json-iterator/go v1.1.12 // indirect
github.com/mailru/easyjson v0.7.7 // indirect
github.com/mitchellh/mapstructure v1.5.0 // indirect
github.com/moby/spdystream v0.5.0 // indirect
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
github.com/modern-go/reflect2 v1.0.2 // indirect
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f // indirect
github.com/pelletier/go-toml/v2 v2.0.8 // indirect
github.com/pkg/errors v0.9.1 // indirect
github.com/prometheus/client_golang v1.19.1 // indirect
Expand Down
Loading
Loading