Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
52 commits
Select commit Hold shift + click to select a range
a9e111b
desktop: add ModelQoS tier system for centralized model configuration…
beastoin Apr 19, 2026
f26c9b4
desktop: wire ChatProvider to ModelQoS for chat and floating bar mode…
beastoin Apr 19, 2026
eb39d39
desktop: wire FloatingControlBarState to ModelQoS (#6834)
beastoin Apr 19, 2026
b2f8d83
desktop: wire ShortcutSettings to ModelQoS (#6834)
beastoin Apr 19, 2026
b149dcc
desktop: wire FloatingControlBarWindow to ModelQoS (#6834)
beastoin Apr 19, 2026
9de181f
desktop: wire CalendarReaderService to ModelQoS.Claude.synthesis (#6834)
beastoin Apr 19, 2026
1ee5538
desktop: wire GmailReaderService to ModelQoS.Claude.synthesis (#6834)
beastoin Apr 19, 2026
8e1c764
desktop: wire AppleNotesReaderService to ModelQoS.Claude.synthesis (#…
beastoin Apr 19, 2026
ce10d41
desktop: wire OnboardingMemoryLogImportService to ModelQoS.Claude.syn…
beastoin Apr 19, 2026
575d2c2
desktop: wire OnboardingPagedIntroCoordinator to ModelQoS.Claude.synt…
beastoin Apr 19, 2026
5822c77
desktop: wire OnboardingChatView to ModelQoS.Claude.synthesis (#6834)
beastoin Apr 19, 2026
a0e3208
desktop: wire ChatLabView to ModelQoS.Claude.chatLabQuery/chatLabGrad…
beastoin Apr 19, 2026
6308e90
desktop: wire GeminiClient default model to ModelQoS.Gemini.proactive…
beastoin Apr 19, 2026
3b1cd8e
desktop: wire InsightAssistant to ModelQoS.Gemini.insight (#6834)
beastoin Apr 19, 2026
7ade1d3
desktop: wire TaskAssistant to ModelQoS.Gemini.taskExtraction (#6834)
beastoin Apr 19, 2026
22987ca
desktop: wire EmbeddingService to ModelQoS.Gemini.embedding (#6834)
beastoin Apr 19, 2026
a1c055a
desktop: sanitize persisted model against active QoS tier (#6834)
beastoin Apr 19, 2026
f59fd91
desktop: add ModelQoS unit tests for tier switching and model accesso…
beastoin Apr 19, 2026
6616f91
desktop: extract sanitizedSelection helper for stale model fallback (…
beastoin Apr 19, 2026
9941f20
desktop: add stale model sanitization regression tests (#6834)
beastoin Apr 19, 2026
1db9355
desktop: add ModelQoS tier system for Rust backend (#6834)
beastoin Apr 19, 2026
9ddba3c
desktop: register model_qos module in llm mod.rs (#6834)
beastoin Apr 19, 2026
e123d5b
desktop: wire LlmClient::new() to model_qos::gemini_default() (#6834)
beastoin Apr 19, 2026
fd3503f
desktop: wire Gemini proxy allowlist to model_qos (#6834)
beastoin Apr 19, 2026
87829fa
desktop: wire rate limiter degrade target to model_qos (#6834)
beastoin Apr 19, 2026
3f428c3
desktop: log active QoS tier on backend startup (#6834)
beastoin Apr 19, 2026
d18bf21
fix(backend): wire gemini_extraction() to structured extraction routes
beastoin Apr 19, 2026
8fa3b3b
fix(backend): use extraction model for conversation processing routes
beastoin Apr 19, 2026
e2aa9e7
fix(backend): use extraction model for knowledge graph rebuild
beastoin Apr 19, 2026
e4bb1d3
fix(backend): wire extraction model for merged-conversation reprocess…
beastoin Apr 19, 2026
292a5f1
test(backend): expand model_qos tests for premium tier and boundary c…
beastoin Apr 19, 2026
91f0e7a
test(backend): add LlmClient wiring tests for QoS model selection
beastoin Apr 19, 2026
09fd96f
fix(backend): serialize env-var tests with mutex to prevent flakiness
beastoin Apr 19, 2026
4b1fed3
feat(backend): add tier-aware rate limit thresholds to model_qos
beastoin Apr 19, 2026
ca2d324
feat(backend): wire rate limiter to tier-aware QoS thresholds
beastoin Apr 19, 2026
4a83112
feat(backend): log rate limit thresholds at startup
beastoin Apr 19, 2026
81caf85
fix(backend): keep daily hard limit at 1500 for both tiers
beastoin Apr 19, 2026
a1a7feb
refactor: rename ModelTier standard→premium, premium→max (Swift)
beastoin Apr 19, 2026
c315bbd
refactor: update ModelQoS tests for premium/max tier names
beastoin Apr 19, 2026
cf2bd18
refactor: rename ModelTier Standard→Premium, Premium→Max (Rust)
beastoin Apr 19, 2026
231ad7d
fix(backend): update stale comments from standard→premium and hard=50…
beastoin Apr 19, 2026
d9a062b
fix(backend): update stale comment from standard→premium tier in clie…
beastoin Apr 19, 2026
015a186
feat(desktop): post notification on ModelTier change for re-sanitization
beastoin Apr 19, 2026
c7572ec
fix(desktop): re-sanitize selectedModel when ModelTier changes at run…
beastoin Apr 19, 2026
85d9e8b
test(backend): add boundary tests for rate limit just-below soft and …
beastoin Apr 19, 2026
c316570
test(desktop): add tier change notification test for ModelQoS
beastoin Apr 19, 2026
5bd39c6
refactor(desktop): optimize model palette — 7 IDs down to 5
beastoin Apr 19, 2026
f74d7d9
fix(desktop): use chat model for onboarding conversation, not synthesis
beastoin Apr 19, 2026
d0fc2e8
fix(desktop): use chat model for onboarding intro coordinator
beastoin Apr 19, 2026
6aa6f12
refactor(backend): simplify Gemini models — Flash for all workloads
beastoin Apr 19, 2026
72b00bf
refactor(backend): remove gemini-pro-latest from proxy allowlist and …
beastoin Apr 19, 2026
5a63097
test(desktop): update tests for optimized 5-model palette
beastoin Apr 19, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 28 additions & 3 deletions desktop/Backend-Rust/src/llm/client.rs
Original file line number Diff line number Diff line change
Expand Up @@ -211,17 +211,16 @@ struct GeminiPartResponse {
}

impl LlmClient {
/// Create a new Gemini client
/// Create a new Gemini client with the QoS-configured default model.
pub fn new(api_key: String) -> Self {
Self {
client: Client::new(),
api_key,
model: "gemini-3-flash-preview".to_string(),
model: super::model_qos::gemini_default().to_string(),
}
}

/// Set the model to use
#[allow(dead_code)]
pub fn with_model(mut self, model: &str) -> Self {
self.model = model.to_string();
self
Expand Down Expand Up @@ -1162,3 +1161,29 @@ Return relationships as source -> relationship -> target triples."#,
Ok(result)
}
}

#[cfg(test)]
mod tests {
    use super::*;

    // Wiring tests for QoS-driven model selection (issue #6834).
    // These construct LlmClient directly; no network calls are made.

    #[test]
    fn new_uses_qos_default_model() {
        // `new()` must pick up the tier-configured default from model_qos
        // rather than a hardcoded model string.
        let client = LlmClient::new("test-key".to_string());
        assert_eq!(client.model, super::super::model_qos::gemini_default());
    }

    #[test]
    fn with_model_overrides_default() {
        // Builder-style `with_model` wins over the QoS default.
        let client = LlmClient::new("test-key".to_string())
            .with_model("gemini-pro-latest");
        assert_eq!(client.model, "gemini-pro-latest");
    }

    #[test]
    fn with_model_extraction_uses_extraction_accessor() {
        let client = LlmClient::new("test-key".to_string())
            .with_model(super::super::model_qos::gemini_extraction());
        // In test env (premium tier), extraction == default == flash
        // NOTE(review): this pins the literal model id, so it must be
        // updated if model_qos ever points extraction at a different model.
        assert_eq!(client.model, "gemini-3-flash-preview");
    }
}
1 change: 1 addition & 0 deletions desktop/Backend-Rust/src/llm/mod.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
// LLM module

pub mod client;
pub mod model_qos;
pub mod persona;
pub mod prompts;

Expand Down
215 changes: 215 additions & 0 deletions desktop/Backend-Rust/src/llm/model_qos.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,215 @@
// Model QoS Tier System for Rust Backend
//
// Central model configuration with switchable tiers, mirroring the Swift ModelQoS.
// All LlmClient call sites should use these accessors instead of hardcoded model strings.
//
// Tier is read from OMI_MODEL_TIER env var at startup (default: "premium").

use std::sync::OnceLock;

/// Active tier, resolved once from the `OMI_MODEL_TIER` env var and cached
/// for the lifetime of the process.
static ACTIVE_TIER: OnceLock<ModelTier> = OnceLock::new();

/// Quality-of-service tier controlling model selection and rate limits.
///
/// Mirrors the Swift `ModelQoS` tier names; keep the two in sync.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum ModelTier {
    /// Cost-optimized: Flash for all Gemini workloads, lower rate limits
    Premium,
    /// Quality-optimized: same models, higher rate limits
    Max,
}

impl ModelTier {
    /// Parse the tier from the `OMI_MODEL_TIER` env var.
    ///
    /// Accepts "max" case-insensitively, ignoring surrounding whitespace
    /// (so "MAX" and " Max " work too). Anything else — including an unset
    /// or empty variable — falls back to `Premium`, the safe cost-optimized
    /// default.
    fn from_env() -> Self {
        match std::env::var("OMI_MODEL_TIER") {
            Ok(value) if value.trim().eq_ignore_ascii_case("max") => ModelTier::Max,
            _ => ModelTier::Premium,
        }
    }
}

/// Get the active model tier (resolved once from env, then cached).
///
/// Note: because the result is memoized in a `OnceLock`, env changes made
/// after the first call are not observed.
pub fn active_tier() -> ModelTier {
    *ACTIVE_TIER.get_or_init(ModelTier::from_env)
}

// MARK: - Gemini Models

/// Default model for LlmClient (used by chat, conversations, personas, knowledge graph).
pub fn gemini_default() -> &'static str {
    gemini_default_for(active_tier())
}

/// Tier-specific default model.
///
/// Flash is the cheap baseline for both tiers today; the exhaustive match
/// keeps the per-tier hook in place should they ever diverge.
fn gemini_default_for(tier: ModelTier) -> &'static str {
    match tier {
        ModelTier::Premium | ModelTier::Max => "gemini-3-flash-preview",
    }
}

/// Model for structured extraction tasks (conversations, knowledge graph).
pub fn gemini_extraction() -> &'static str {
    gemini_extraction_for(active_tier())
}

/// Extraction is pinned to Flash regardless of tier; the tier parameter is
/// kept (and ignored) so the call shape matches the other `_for` helpers.
fn gemini_extraction_for(_tier: ModelTier) -> &'static str {
    const EXTRACTION_MODEL: &'static str = "gemini-3-flash-preview";
    EXTRACTION_MODEL
}

/// Allowed models for the Gemini proxy (passthrough from Swift app).
/// These are the models the desktop app is allowed to request.
pub fn gemini_proxy_allowed() -> &'static [&'static str] {
    // Keep this list in lockstep with the desktop model palette:
    // one text model plus the embedding model.
    static ALLOWED: [&'static str; 2] = [
        "gemini-3-flash-preview",
        "gemini-embedding-001",
    ];
    &ALLOWED
}

/// Model that rate-limited Pro requests degrade to.
///
/// Degradation always lands on Flash — the cheapest text model in the
/// palette — independent of the active tier.
pub fn gemini_degrade_target() -> &'static str {
    "gemini-3-flash-preview"
}

// MARK: - Rate Limit Thresholds (tier-aware)

/// Daily soft limit — at or above this, Pro requests degrade to Flash.
///
/// Premium is aggressive (30) since premium already sends Flash for
/// everything; Max is generous (300) to leave headroom for Pro usage.
pub fn daily_soft_limit() -> u32 {
    daily_soft_limit_for(active_tier())
}

/// Tier-specific soft threshold (see [`daily_soft_limit`]).
fn daily_soft_limit_for(tier: ModelTier) -> u32 {
    match tier {
        ModelTier::Max => 300,
        ModelTier::Premium => 30,
    }
}

/// Daily hard limit — at or above this, all requests are rejected (429).
pub fn daily_hard_limit() -> u32 {
    daily_hard_limit_for(active_tier())
}

/// The hard ceiling is deliberately identical for both tiers; the tier
/// parameter is ignored but kept for symmetry with the other `_for` helpers.
fn daily_hard_limit_for(_tier: ModelTier) -> u32 {
    const HARD_LIMIT: u32 = 1500;
    HARD_LIMIT
}

/// Human-readable description of the active tier, for startup logging.
pub fn tier_description() -> &'static str {
    tier_description_for(active_tier())
}

/// Tier-specific log label.
fn tier_description_for(tier: ModelTier) -> &'static str {
    match tier {
        ModelTier::Max => "Max (quality-optimized)",
        ModelTier::Premium => "Premium (cost-optimized)",
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use std::sync::Mutex;

    /// Serialize env-var-mutating tests to avoid races under parallel execution.
    /// (The test harness runs tests on multiple threads by default, and the
    /// process environment is shared global state.)
    static ENV_LOCK: Mutex<()> = Mutex::new(());

    // --- ModelTier::from_env (serialized — shares process env) ---
    // These target `from_env` directly rather than `active_tier()`: the
    // latter caches its first result in a OnceLock and so cannot observe
    // env changes made after initialization.

    #[test]
    fn from_env_all_cases() {
        let _guard = ENV_LOCK.lock().unwrap();

        // Default (unset) → Premium
        std::env::remove_var("OMI_MODEL_TIER");
        assert_eq!(ModelTier::from_env(), ModelTier::Premium);

        // Explicit max → Max
        std::env::set_var("OMI_MODEL_TIER", "max");
        assert_eq!(ModelTier::from_env(), ModelTier::Max);

        // Invalid value → Premium fallback
        std::env::set_var("OMI_MODEL_TIER", "garbage");
        assert_eq!(ModelTier::from_env(), ModelTier::Premium);

        // Empty string → Premium fallback
        std::env::set_var("OMI_MODEL_TIER", "");
        assert_eq!(ModelTier::from_env(), ModelTier::Premium);

        // Leave the process env clean for any later env-sensitive code.
        std::env::remove_var("OMI_MODEL_TIER");
    }

    // --- gemini_default_for (both tiers) ---

    #[test]
    fn gemini_default_premium_is_flash() {
        assert_eq!(gemini_default_for(ModelTier::Premium), "gemini-3-flash-preview");
    }

    #[test]
    fn gemini_default_max_is_flash() {
        // Default model is Flash for both tiers (cheap baseline)
        assert_eq!(gemini_default_for(ModelTier::Max), "gemini-3-flash-preview");
    }

    // --- gemini_extraction_for (the tier-dependent branch) ---

    #[test]
    fn gemini_extraction_is_flash_for_both_tiers() {
        assert_eq!(gemini_extraction_for(ModelTier::Premium), "gemini-3-flash-preview");
        assert_eq!(gemini_extraction_for(ModelTier::Max), "gemini-3-flash-preview");
    }

    // --- tier_description_for ---

    #[test]
    fn tier_description_premium() {
        assert!(tier_description_for(ModelTier::Premium).contains("Premium"));
    }

    #[test]
    fn tier_description_max() {
        assert!(tier_description_for(ModelTier::Max).contains("Max"));
    }

    // --- Static accessors (pinned models) ---

    #[test]
    fn proxy_allowed_contains_expected_models() {
        let allowed = gemini_proxy_allowed();
        assert!(allowed.contains(&"gemini-3-flash-preview"));
        assert!(allowed.contains(&"gemini-embedding-001"));
        // Pro was dropped from the palette; make sure it stays out.
        assert!(!allowed.contains(&"gemini-pro-latest"), "pro removed from allowlist");
        assert!(!allowed.contains(&"gemini-ultra"));
    }

    #[test]
    fn degrade_target_is_flash() {
        assert_eq!(gemini_degrade_target(), "gemini-3-flash-preview");
    }

    // --- Rate limit thresholds ---

    #[test]
    fn daily_soft_limit_premium_is_lower() {
        assert_eq!(daily_soft_limit_for(ModelTier::Premium), 30);
    }

    #[test]
    fn daily_soft_limit_max_is_higher() {
        assert_eq!(daily_soft_limit_for(ModelTier::Max), 300);
    }

    #[test]
    fn daily_hard_limit_same_for_both_tiers() {
        assert_eq!(daily_hard_limit_for(ModelTier::Premium), 1500);
        assert_eq!(daily_hard_limit_for(ModelTier::Max), 1500);
    }

    #[test]
    fn soft_limit_always_below_hard_limit() {
        // Invariant: the soft (degrade) threshold must trip before the hard
        // (reject) threshold, for every tier.
        for tier in [ModelTier::Premium, ModelTier::Max] {
            assert!(daily_soft_limit_for(tier) < daily_hard_limit_for(tier));
        }
    }
}
7 changes: 7 additions & 0 deletions desktop/Backend-Rust/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,13 @@ async fn main() {
// Load environment variables
dotenvy::dotenv().ok();

// Log active QoS tier
tracing::info!("Model QoS tier: {} | rate limits: soft={}, hard={}",
llm::model_qos::tier_description(),
llm::model_qos::daily_soft_limit(),
llm::model_qos::daily_hard_limit(),
);

// Load and validate config
let config = Config::from_env();
if let Err(e) = config.validate() {
Expand Down
5 changes: 4 additions & 1 deletion desktop/Backend-Rust/src/routes/conversations.rs
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,7 @@ async fn create_conversation_from_segments(
// Get LLM client (Gemini)
let llm_client = if let Some(api_key) = &state.config.gemini_api_key {
LlmClient::new(api_key.clone())
.with_model(crate::llm::model_qos::gemini_extraction())
} else {
return Err((
StatusCode::INTERNAL_SERVER_ERROR,
Expand Down Expand Up @@ -441,6 +442,7 @@ async fn reprocess_conversation(
// Get LLM client (Gemini)
let llm_client = if let Some(api_key) = &state.config.gemini_api_key {
LlmClient::new(api_key.clone())
.with_model(crate::llm::model_qos::gemini_extraction())
} else {
return Err((
StatusCode::INTERNAL_SERVER_ERROR,
Expand Down Expand Up @@ -841,7 +843,8 @@ async fn merge_conversations(
// If reprocessing is requested and we have an LLM client, process the merged conversation
if request.reprocess {
if let Some(api_key) = &state.config.gemini_api_key {
let llm = LlmClient::new(api_key.clone());
let llm = LlmClient::new(api_key.clone())
.with_model(crate::llm::model_qos::gemini_extraction());

// Get existing data for deduplication
let existing_memories = state
Expand Down
3 changes: 2 additions & 1 deletion desktop/Backend-Rust/src/routes/knowledge_graph.rs
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,8 @@ async fn rebuild_knowledge_graph(
tracing::info!("Processing {} memories for knowledge graph", memories.len());

// Create LLM client
let llm = LlmClient::new(api_key);
let llm = LlmClient::new(api_key)
.with_model(crate::llm::model_qos::gemini_extraction());

// Track nodes by lowercase label for deduplication
let mut node_map: HashMap<String, KnowledgeGraphNode> = HashMap::new();
Expand Down
17 changes: 6 additions & 11 deletions desktop/Backend-Rust/src/routes/proxy.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,14 +27,9 @@ const GEMINI_ALLOWED_ACTIONS: &[&str] = &[
"batchEmbedContents",
];

// Allowed Gemini models — only these can be requested through the proxy.
// Desktop app uses: gemini-3-flash-preview (default), gemini-pro-latest (tasks/insights),
// gemini-embedding-001 (embeddings). Rate limiting may rewrite pro → flash.
const GEMINI_ALLOWED_MODELS: &[&str] = &[
"gemini-3-flash-preview",
"gemini-pro-latest",
"gemini-embedding-001",
];
// Allowed Gemini models — driven by model_qos (issue #6834).
// Desktop app uses: gemini-3-flash-preview (all features), gemini-embedding-001 (embeddings).
// Rate limiting may degrade requests above soft limit.

/// Maximum request body size for Gemini proxy routes (5 MB).
/// Normal app payloads are 300-600 KB (base64 JPEG + prompt); 5 MB gives ~8x headroom.
Expand Down Expand Up @@ -458,9 +453,9 @@ fn is_gemini_action_allowed(action: &str) -> bool {
GEMINI_ALLOWED_ACTIONS.contains(&action)
}

/// Check if a Gemini model is in the allowlist (issue #6624)
/// Check if a Gemini model is in the allowlist (issue #6624, #6834)
fn is_gemini_model_allowed(model: &str) -> bool {
GEMINI_ALLOWED_MODELS.contains(&model)
crate::llm::model_qos::gemini_proxy_allowed().contains(&model)
}

/// Sanitize a Gemini request body (issue #6624).
Expand Down Expand Up @@ -722,12 +717,12 @@ mod tests {
#[test]
fn model_allowlist_permits_valid_models() {
assert!(is_gemini_model_allowed("gemini-3-flash-preview"));
assert!(is_gemini_model_allowed("gemini-pro-latest"));
assert!(is_gemini_model_allowed("gemini-embedding-001"));
}

#[test]
fn model_allowlist_blocks_unknown() {
assert!(!is_gemini_model_allowed("gemini-pro-latest"), "pro removed from allowlist");
assert!(!is_gemini_model_allowed("gemini-2.5-pro"));
assert!(!is_gemini_model_allowed("gemini-1.5-pro"));
assert!(!is_gemini_model_allowed("gemini-ultra"));
Expand Down
Loading
Loading