From 51edda0619d4ae0c2e511f2c86598133dca937df Mon Sep 17 00:00:00 2001 From: Ignas Baranauskas Date: Fri, 17 Apr 2026 22:40:16 +0100 Subject: [PATCH] refactor(mlflow): use external gateway URL for tracking Use status.url (gateway) instead of status.address.url (internal service) for the MLflow tracking URI. This avoids requiring the OpenShift service-serving CA in agent containers. Remove MLFLOW_TRACKING_SERVER_CERT_PATH, DefaultCACertPath, custom CA loading, and MLflowAddressStatus type. Signed-off-by: Ignas Baranauskas --- .../internal/controller/mlflow_controller.go | 25 +++++++--------- .../controller/mlflow_controller_test.go | 1 - kagenti-operator/internal/mlflow/client.go | 30 +------------------ kagenti-operator/internal/mlflow/types.go | 12 ++------ 4 files changed, 13 insertions(+), 55 deletions(-) diff --git a/kagenti-operator/internal/controller/mlflow_controller.go b/kagenti-operator/internal/controller/mlflow_controller.go index 9e71aec..33658f0 100644 --- a/kagenti-operator/internal/controller/mlflow_controller.go +++ b/kagenti-operator/internal/controller/mlflow_controller.go @@ -184,13 +184,11 @@ func (r *MLflowReconciler) resolveTrackingURI(ctx context.Context) string { for i := range list.Items { cr := &list.Items[i] if meta.IsStatusConditionTrue(cr.Status.Conditions, "Available") { - if cr.Status.Address == nil || cr.Status.Address.URL == "" { - logger.Info("MLflow CR is Available but status.address.url is not set, skipping", "cr", cr.GetName()) - continue + if cr.Status.URL != "" { + logger.V(1).Info("Auto-discovered MLflow gateway URL", "uri", cr.Status.URL, "cr", cr.GetName()) + return cr.Status.URL } - uri := cr.Status.Address.URL - logger.V(1).Info("Auto-discovered MLflow tracking URI", "uri", uri, "cr", cr.GetName()) - return uri + logger.Info("MLflow CR is Available but status.url is not set, skipping", "cr", cr.GetName()) } } @@ -198,17 +196,14 @@ func (r *MLflowReconciler) resolveTrackingURI(ctx context.Context) string { } // 
mlflowEnvVars returns the environment variables to inject into agent containers. -// TODO(mlflow): MLFLOW_TRACKING_SERVER_CERT_PATH is OpenShift-specific — the -// service-ca operator injects service-ca.crt into the SA volume. On vanilla -// Kubernetes this file does not exist and MLflow clients will fail TLS verification. -// This should be made configurable (Helm value / annotation) before supporting non-OpenShift clusters. +// The tracking URI is typically the external gateway URL, which uses a publicly trusted +// TLS certificate, so no custom CA cert path is needed. func mlflowEnvVars(trackingURI, experimentID, experimentName string) map[string]string { return map[string]string{ - "MLFLOW_TRACKING_URI": trackingURI, - "MLFLOW_TRACKING_AUTH": "kubernetes-namespaced", - "MLFLOW_EXPERIMENT_ID": experimentID, - "MLFLOW_EXPERIMENT_NAME": experimentName, - "MLFLOW_TRACKING_SERVER_CERT_PATH": "/var/run/secrets/kubernetes.io/serviceaccount/service-ca.crt", + "MLFLOW_TRACKING_URI": trackingURI, + "MLFLOW_TRACKING_AUTH": "kubernetes-namespaced", + "MLFLOW_EXPERIMENT_ID": experimentID, + "MLFLOW_EXPERIMENT_NAME": experimentName, } } diff --git a/kagenti-operator/internal/controller/mlflow_controller_test.go b/kagenti-operator/internal/controller/mlflow_controller_test.go index 19ccb04..150e788 100644 --- a/kagenti-operator/internal/controller/mlflow_controller_test.go +++ b/kagenti-operator/internal/controller/mlflow_controller_test.go @@ -232,7 +232,6 @@ var _ = Describe("MLflow Controller", func() { Expect(envMap["MLFLOW_TRACKING_AUTH"]).To(Equal("kubernetes-namespaced")) Expect(envMap["MLFLOW_EXPERIMENT_ID"]).To(Equal("exp-123")) Expect(envMap["MLFLOW_EXPERIMENT_NAME"]).To(Equal("mlflow-full")) - Expect(envMap["MLFLOW_TRACKING_SERVER_CERT_PATH"]).To(Equal("/var/run/secrets/kubernetes.io/serviceaccount/service-ca.crt")) Expect(updated.Spec.Template.Annotations[AnnotationMLflowExperimentID]).To(Equal("exp-123")) 
Expect(updated.Spec.Template.Annotations[AnnotationMLflowExperimentName]).To(Equal("mlflow-full")) diff --git a/kagenti-operator/internal/mlflow/client.go b/kagenti-operator/internal/mlflow/client.go index 0787658..76480dd 100644 --- a/kagenti-operator/internal/mlflow/client.go +++ b/kagenti-operator/internal/mlflow/client.go @@ -22,8 +22,6 @@ package mlflow import ( "bytes" "context" - "crypto/tls" - "crypto/x509" "encoding/json" "errors" "fmt" @@ -40,10 +38,6 @@ const ( // DefaultTokenPath is the projected SA token path in a pod. DefaultTokenPath = "/var/run/secrets/kubernetes.io/serviceaccount/token" - // DefaultCACertPath is the service-serving CA certificate path. - // On OpenShift, the service-ca.crt is projected into the SA token volume. - DefaultCACertPath = "/var/run/secrets/kubernetes.io/serviceaccount/service-ca.crt" - // WorkspaceHeader is the MLflow workspace header (namespace-based isolation). WorkspaceHeader = "X-MLFLOW-WORKSPACE" ) @@ -56,10 +50,6 @@ type Client struct { // TokenPath is the path to the SA token file. Defaults to DefaultTokenPath. TokenPath string - // CACertPath is the path to the CA certificate for TLS verification. - // Defaults to the in-cluster SA CA cert. - CACertPath string - // HTTPClient is the HTTP client to use. If nil, a default client with 30s timeout is used. HTTPClient *http.Client @@ -111,32 +101,14 @@ func IsResourceAlreadyExists(err error) bool { func (c *Client) httpClient() *http.Client { c.httpOnce.Do(func() { if c.HTTPClient == nil { - tlsCfg := &tls.Config{MinVersion: tls.VersionTLS12} - if caCert, err := os.ReadFile(c.caCertPath()); err == nil { - pool, err := x509.SystemCertPool() - if err != nil { - // Fall back to an empty pool; the service-CA cert will still be appended. 
- pool = x509.NewCertPool() - } - pool.AppendCertsFromPEM(caCert) - tlsCfg.RootCAs = pool - } c.HTTPClient = &http.Client{ - Timeout: 30 * time.Second, - Transport: &http.Transport{TLSClientConfig: tlsCfg}, + Timeout: 30 * time.Second, } } }) return c.HTTPClient } -func (c *Client) caCertPath() string { - if c.CACertPath != "" { - return c.CACertPath - } - return DefaultCACertPath -} - func (c *Client) tokenPath() string { if c.TokenPath != "" { return c.TokenPath diff --git a/kagenti-operator/internal/mlflow/types.go b/kagenti-operator/internal/mlflow/types.go index f18824a..1649bae 100644 --- a/kagenti-operator/internal/mlflow/types.go +++ b/kagenti-operator/internal/mlflow/types.go @@ -51,13 +51,8 @@ type MLflow struct { } type MLflowStatus struct { - Conditions []metav1.Condition `json:"conditions,omitempty"` - Address *MLflowAddressStatus `json:"address,omitempty"` -} - -// MLflowAddressStatus holds the internal in-cluster endpoint for the MLflow Service. -type MLflowAddressStatus struct { - // URL is the in-cluster HTTPS URL for the managed MLflow Service. + Conditions []metav1.Condition `json:"conditions,omitempty"` + // URL is the external gateway URL for the MLflow server (e.g. via the RHOAI data-science gateway). URL string `json:"url,omitempty"` } @@ -95,9 +90,6 @@ func (in *MLflowStatus) DeepCopyInto(out *MLflowStatus) { in.Conditions[i].DeepCopyInto(&out.Conditions[i]) } } - if in.Address != nil { - out.Address = &MLflowAddressStatus{URL: in.Address.URL} - } } func (in *MLflowList) DeepCopyObject() runtime.Object {