Skip to content

Commit 544df6f

Browse files
Merge branch 'master' into agent-skills
2 parents 530bb7b + 9c07b4c commit 544df6f

File tree

17 files changed

+8018
-3887
lines changed

17 files changed

+8018
-3887
lines changed

.secrets.baseline

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -934,7 +934,7 @@
934934
"filename": "infra/feast-operator/api/v1/featurestore_types.go",
935935
"hashed_secret": "44e17306b837162269a410204daaa5ecee4ec22c",
936936
"is_verified": false,
937-
"line_number": 695
937+
"line_number": 725
938938
}
939939
],
940940
"infra/feast-operator/api/v1/zz_generated.deepcopy.go": [
@@ -943,21 +943,21 @@
943943
"filename": "infra/feast-operator/api/v1/zz_generated.deepcopy.go",
944944
"hashed_secret": "f914fc9324de1bec1ad13dec94a8ea2ddb41fc87",
945945
"is_verified": false,
946-
"line_number": 663
946+
"line_number": 681
947947
},
948948
{
949949
"type": "Secret Keyword",
950950
"filename": "infra/feast-operator/api/v1/zz_generated.deepcopy.go",
951951
"hashed_secret": "44e17306b837162269a410204daaa5ecee4ec22c",
952952
"is_verified": false,
953-
"line_number": 1206
953+
"line_number": 1249
954954
},
955955
{
956956
"type": "Secret Keyword",
957957
"filename": "infra/feast-operator/api/v1/zz_generated.deepcopy.go",
958958
"hashed_secret": "c2028031c154bbe86fd69bef740855c74b927dcf",
959959
"is_verified": false,
960-
"line_number": 1211
960+
"line_number": 1254
961961
}
962962
],
963963
"infra/feast-operator/api/v1alpha1/featurestore_types.go": [
@@ -1156,7 +1156,7 @@
11561156
"filename": "infra/feast-operator/internal/controller/services/services.go",
11571157
"hashed_secret": "36dc326eb15c7bdd8d91a6b87905bcea20b637d1",
11581158
"is_verified": false,
1159-
"line_number": 173
1159+
"line_number": 176
11601160
}
11611161
],
11621162
"infra/feast-operator/internal/controller/services/tls_test.go": [

docs/how-to-guides/feast-on-kubernetes.md

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -65,9 +65,11 @@ spec:
6565
> _More advanced FeatureStore CR examples can be found in the feast-operator [samples directory](../../infra/feast-operator/config/samples)._
6666
6767
{% hint style="success" %}
68-
**Scaling:** The Feast Operator supports horizontal scaling via static replicas, HPA autoscaling, or external autoscalers like [KEDA](https://keda.sh). Scaling requires DB-backed persistence for all enabled services.
68+
**Scaling & High Availability:** The Feast Operator supports horizontal scaling via static replicas, HPA autoscaling, or external autoscalers like [KEDA](https://keda.sh). Scaling requires DB-backed persistence for all enabled services.
6969
70-
See the [Horizontal Scaling with the Feast Operator](./scaling-feast.md#horizontal-scaling-with-the-feast-operator) guide for configuration details, or check the general recommendations on [how to scale Feast](./scaling-feast.md).
70+
When scaling is enabled, the operator auto-injects soft pod anti-affinity and zone topology spread constraints for resilience. You can also configure a PodDisruptionBudget to protect against voluntary disruptions.
71+
72+
See the [Horizontal Scaling with the Feast Operator](./scaling-feast.md#horizontal-scaling-with-the-feast-operator) guide for configuration details, including [HA options](./scaling-feast.md#high-availability), or check the general recommendations on [how to scale Feast](./scaling-feast.md).
7173
{% endhint %}
7274
7375
> _Sample scaling CRs are available at [`v1_featurestore_scaling_static.yaml`](../../infra/feast-operator/config/samples/v1_featurestore_scaling_static.yaml) and [`v1_featurestore_scaling_hpa.yaml`](../../infra/feast-operator/config/samples/v1_featurestore_scaling_hpa.yaml)._

docs/how-to-guides/scaling-feast.md

Lines changed: 69 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,8 @@ spec:
8686
target:
8787
type: Utilization
8888
averageUtilization: 70
89+
podDisruptionBudgets:
90+
maxUnavailable: 1
8991
onlineStore:
9092
persistence:
9193
store:
@@ -107,7 +109,7 @@ spec:
107109
```
108110

109111
{% hint style="info" %}
110-
When autoscaling is configured, the operator automatically sets the deployment strategy to `RollingUpdate` (instead of the default `Recreate`) to ensure zero-downtime scaling. You can override this by explicitly setting `deploymentStrategy` in the CR.
112+
When autoscaling is configured, the operator automatically sets the deployment strategy to `RollingUpdate` (instead of the default `Recreate`) to ensure zero-downtime scaling, and auto-injects soft pod anti-affinity and zone topology spread constraints. You can override any of these by explicitly setting `deploymentStrategy`, `affinity`, or `topologySpreadConstraints` in the CR.
111113
{% endhint %}
112114

113115
#### Validation Rules
@@ -117,6 +119,72 @@ The operator enforces the following rules:
117119
- Scaling with `replicas > 1` or any `autoscaling` config is **rejected** if any enabled service uses file-based persistence.
118120
- S3 (`s3://`) and GCS (`gs://`) backed registry file persistence is allowed with scaling, since these object stores support concurrent readers.
119121

122+
#### High Availability
123+
124+
When scaling is enabled (`replicas > 1` or `autoscaling`), the operator provides HA features to improve resilience:
125+
126+
**Pod Anti-Affinity** — The operator automatically injects a soft (`preferredDuringSchedulingIgnoredDuringExecution`) pod anti-affinity rule that prefers spreading pods across different nodes. Because the rule is a preference rather than a requirement, it makes co-locating multiple replicas on the same node less likely (improving resilience to node failures) without blocking scheduling when no other node is available. You can override this by providing your own `affinity` configuration:
127+
128+
```yaml
129+
spec:
130+
replicas: 3
131+
services:
132+
# Override with custom affinity (e.g. strict anti-affinity)
133+
affinity:
134+
podAntiAffinity:
135+
requiredDuringSchedulingIgnoredDuringExecution:
136+
- topologyKey: kubernetes.io/hostname
137+
labelSelector:
138+
matchLabels:
139+
feast.dev/name: my-feast
140+
# ...
141+
```
142+
143+
**Topology Spread Constraints** — The operator automatically injects a soft zone-spread constraint (`whenUnsatisfiable: ScheduleAnyway`) that distributes pods across availability zones. This is a best-effort spread — if the constraint cannot be satisfied (for example, only one zone has schedulable nodes), pods will still be scheduled. You can override this with explicit constraints or disable it with an empty array:
144+
145+
```yaml
146+
spec:
147+
replicas: 3
148+
services:
149+
# Override with custom topology spread (e.g. strict zone spreading)
150+
topologySpreadConstraints:
151+
- maxSkew: 1
152+
topologyKey: topology.kubernetes.io/zone
153+
whenUnsatisfiable: DoNotSchedule
154+
labelSelector:
155+
matchLabels:
156+
feast.dev/name: my-feast
157+
# ...
158+
```
159+
160+
To disable the auto-injected topology spread:
161+
162+
```yaml
163+
spec:
164+
replicas: 3
165+
services:
166+
topologySpreadConstraints: []
167+
# ...
168+
```
169+
170+
**PodDisruptionBudget** — You can configure a PDB to limit voluntary disruptions (e.g. during node drains or cluster upgrades). The PDB is only created when scaling is enabled. Exactly one of `minAvailable` or `maxUnavailable` must be set:
171+
172+
```yaml
173+
spec:
174+
replicas: 3
175+
services:
176+
podDisruptionBudgets:
177+
maxUnavailable: 1 # at most 1 pod unavailable during disruptions
178+
# -- OR --
179+
# podDisruptionBudgets:
180+
# minAvailable: "50%" # at least 50% of pods must remain available
181+
# ...
182+
```
183+
184+
{% hint style="info" %}
185+
The PDB is not auto-injected — you must explicitly configure it. This is intentional because a misconfigured PDB (e.g. `minAvailable` equal to the replica count) can block node drains and cluster upgrades.
186+
{% endhint %}
187+
120188
#### Using KEDA (Kubernetes Event-Driven Autoscaling)
121189

122190
[KEDA](https://keda.sh) is also supported as an external autoscaler. KEDA should target the FeatureStore's scale sub-resource directly (since it implements the Kubernetes scale API). This is the recommended approach because the operator manages the Deployment's replica count from `spec.replicas` — targeting the Deployment directly would conflict with the operator's reconciliation.

infra/feast-operator/api/v1/featurestore_types.go

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ import (
2222
batchv1 "k8s.io/api/batch/v1"
2323
corev1 "k8s.io/api/core/v1"
2424
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
25+
"k8s.io/apimachinery/pkg/util/intstr"
2526
)
2627

2728
const (
@@ -314,6 +315,21 @@ type FeatureStoreServices struct {
314315
// Scaling configures horizontal scaling for the FeatureStore deployment (e.g. HPA autoscaling).
315316
// For static replicas, use spec.replicas instead.
316317
Scaling *ScalingConfig `json:"scaling,omitempty"`
318+
// PodDisruptionBudgets configures a PodDisruptionBudget for the FeatureStore deployment.
319+
// Only created when scaling is enabled (replicas > 1 or autoscaling).
320+
// +optional
321+
PodDisruptionBudgets *PDBConfig `json:"podDisruptionBudgets,omitempty"`
322+
// TopologySpreadConstraints defines how pods are spread across topology domains.
323+
// When scaling is enabled and this is not set, the operator auto-injects a soft
324+
// zone-spread constraint (whenUnsatisfiable: ScheduleAnyway).
325+
// Set to an empty array to disable auto-injection.
326+
// +optional
327+
TopologySpreadConstraints []corev1.TopologySpreadConstraint `json:"topologySpreadConstraints,omitempty"`
328+
// Affinity defines the pod scheduling constraints for the FeatureStore deployment.
329+
// When scaling is enabled and this is not set, the operator auto-injects a soft
330+
// pod anti-affinity rule to prefer spreading pods across nodes.
331+
// +optional
332+
Affinity *corev1.Affinity `json:"affinity,omitempty"`
317333
}
318334

319335
// ScalingConfig configures horizontal scaling for the FeatureStore deployment.
@@ -342,6 +358,20 @@ type AutoscalingConfig struct {
342358
Behavior *autoscalingv2.HorizontalPodAutoscalerBehavior `json:"behavior,omitempty"`
343359
}
344360

361+
// PDBConfig configures a PodDisruptionBudget for the FeatureStore deployment.
362+
// Exactly one of minAvailable or maxUnavailable must be set.
363+
// +kubebuilder:validation:XValidation:rule="[has(self.minAvailable), has(self.maxUnavailable)].exists_one(c, c)",message="Exactly one of minAvailable or maxUnavailable must be set."
364+
type PDBConfig struct {
365+
// MinAvailable specifies the minimum number/percentage of pods that must remain available.
366+
// Mutually exclusive with maxUnavailable.
367+
// +optional
368+
MinAvailable *intstr.IntOrString `json:"minAvailable,omitempty"`
369+
// MaxUnavailable specifies the maximum number/percentage of pods that can be unavailable.
370+
// Mutually exclusive with minAvailable.
371+
// +optional
372+
MaxUnavailable *intstr.IntOrString `json:"maxUnavailable,omitempty"`
373+
}
374+
345375
// OfflineStore configures the offline store service
346376
type OfflineStore struct {
347377
// Creates a remote offline server container

infra/feast-operator/api/v1/zz_generated.deepcopy.go

Lines changed: 43 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)