Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
52 commits
Select commit Hold shift + click to select a range
a9e111b
desktop: add ModelQoS tier system for centralized model configuration…
beastoin Apr 19, 2026
f26c9b4
desktop: wire ChatProvider to ModelQoS for chat and floating bar mode…
beastoin Apr 19, 2026
eb39d39
desktop: wire FloatingControlBarState to ModelQoS (#6834)
beastoin Apr 19, 2026
b2f8d83
desktop: wire ShortcutSettings to ModelQoS (#6834)
beastoin Apr 19, 2026
b149dcc
desktop: wire FloatingControlBarWindow to ModelQoS (#6834)
beastoin Apr 19, 2026
9de181f
desktop: wire CalendarReaderService to ModelQoS.Claude.synthesis (#6834)
beastoin Apr 19, 2026
1ee5538
desktop: wire GmailReaderService to ModelQoS.Claude.synthesis (#6834)
beastoin Apr 19, 2026
8e1c764
desktop: wire AppleNotesReaderService to ModelQoS.Claude.synthesis (#…
beastoin Apr 19, 2026
ce10d41
desktop: wire OnboardingMemoryLogImportService to ModelQoS.Claude.syn…
beastoin Apr 19, 2026
575d2c2
desktop: wire OnboardingPagedIntroCoordinator to ModelQoS.Claude.synt…
beastoin Apr 19, 2026
5822c77
desktop: wire OnboardingChatView to ModelQoS.Claude.synthesis (#6834)
beastoin Apr 19, 2026
a0e3208
desktop: wire ChatLabView to ModelQoS.Claude.chatLabQuery/chatLabGrad…
beastoin Apr 19, 2026
6308e90
desktop: wire GeminiClient default model to ModelQoS.Gemini.proactive…
beastoin Apr 19, 2026
3b1cd8e
desktop: wire InsightAssistant to ModelQoS.Gemini.insight (#6834)
beastoin Apr 19, 2026
7ade1d3
desktop: wire TaskAssistant to ModelQoS.Gemini.taskExtraction (#6834)
beastoin Apr 19, 2026
22987ca
desktop: wire EmbeddingService to ModelQoS.Gemini.embedding (#6834)
beastoin Apr 19, 2026
a1c055a
desktop: sanitize persisted model against active QoS tier (#6834)
beastoin Apr 19, 2026
f59fd91
desktop: add ModelQoS unit tests for tier switching and model accesso…
beastoin Apr 19, 2026
6616f91
desktop: extract sanitizedSelection helper for stale model fallback (…
beastoin Apr 19, 2026
9941f20
desktop: add stale model sanitization regression tests (#6834)
beastoin Apr 19, 2026
1db9355
desktop: add ModelQoS tier system for Rust backend (#6834)
beastoin Apr 19, 2026
9ddba3c
desktop: register model_qos module in llm mod.rs (#6834)
beastoin Apr 19, 2026
e123d5b
desktop: wire LlmClient::new() to model_qos::gemini_default() (#6834)
beastoin Apr 19, 2026
fd3503f
desktop: wire Gemini proxy allowlist to model_qos (#6834)
beastoin Apr 19, 2026
87829fa
desktop: wire rate limiter degrade target to model_qos (#6834)
beastoin Apr 19, 2026
3f428c3
desktop: log active QoS tier on backend startup (#6834)
beastoin Apr 19, 2026
d18bf21
fix(backend): wire gemini_extraction() to structured extraction routes
beastoin Apr 19, 2026
8fa3b3b
fix(backend): use extraction model for conversation processing routes
beastoin Apr 19, 2026
e2aa9e7
fix(backend): use extraction model for knowledge graph rebuild
beastoin Apr 19, 2026
e4bb1d3
fix(backend): wire extraction model for merged-conversation reprocess…
beastoin Apr 19, 2026
292a5f1
test(backend): expand model_qos tests for premium tier and boundary c…
beastoin Apr 19, 2026
91f0e7a
test(backend): add LlmClient wiring tests for QoS model selection
beastoin Apr 19, 2026
09fd96f
fix(backend): serialize env-var tests with mutex to prevent flakiness
beastoin Apr 19, 2026
4b1fed3
feat(backend): add tier-aware rate limit thresholds to model_qos
beastoin Apr 19, 2026
ca2d324
feat(backend): wire rate limiter to tier-aware QoS thresholds
beastoin Apr 19, 2026
4a83112
feat(backend): log rate limit thresholds at startup
beastoin Apr 19, 2026
81caf85
fix(backend): keep daily hard limit at 1500 for both tiers
beastoin Apr 19, 2026
a1a7feb
refactor: rename ModelTier standard→premium, premium→max (Swift)
beastoin Apr 19, 2026
c315bbd
refactor: update ModelQoS tests for premium/max tier names
beastoin Apr 19, 2026
cf2bd18
refactor: rename ModelTier Standard→Premium, Premium→Max (Rust)
beastoin Apr 19, 2026
231ad7d
fix(backend): update stale comments from standard→premium and hard=50…
beastoin Apr 19, 2026
d9a062b
fix(backend): update stale comment from standard→premium tier in clie…
beastoin Apr 19, 2026
015a186
feat(desktop): post notification on ModelTier change for re-sanitization
beastoin Apr 19, 2026
c7572ec
fix(desktop): re-sanitize selectedModel when ModelTier changes at run…
beastoin Apr 19, 2026
85d9e8b
test(backend): add boundary tests for rate limit just-below soft and …
beastoin Apr 19, 2026
c316570
test(desktop): add tier change notification test for ModelQoS
beastoin Apr 19, 2026
5bd39c6
refactor(desktop): optimize model palette — 7 IDs down to 5
beastoin Apr 19, 2026
f74d7d9
fix(desktop): use chat model for onboarding conversation, not synthesis
beastoin Apr 19, 2026
d0fc2e8
fix(desktop): use chat model for onboarding intro coordinator
beastoin Apr 19, 2026
6aa6f12
refactor(backend): simplify Gemini models — Flash for all workloads
beastoin Apr 19, 2026
72b00bf
refactor(backend): remove gemini-pro-latest from proxy allowlist and …
beastoin Apr 19, 2026
5a63097
test(desktop): update tests for optimized 5-model palette
beastoin Apr 19, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 28 additions & 3 deletions desktop/Backend-Rust/src/llm/client.rs
Original file line number Diff line number Diff line change
Expand Up @@ -211,17 +211,16 @@ struct GeminiPartResponse {
}

impl LlmClient {
/// Create a new Gemini client
/// Create a new Gemini client with the QoS-configured default model.
pub fn new(api_key: String) -> Self {
Self {
client: Client::new(),
api_key,
model: "gemini-3-flash-preview".to_string(),
model: super::model_qos::gemini_default().to_string(),
}
}

/// Set the model to use
#[allow(dead_code)]
pub fn with_model(mut self, model: &str) -> Self {
self.model = model.to_string();
self
Expand Down Expand Up @@ -1162,3 +1161,29 @@ Return relationships as source -> relationship -> target triples."#,
Ok(result)
}
}

#[cfg(test)]
mod tests {
    use super::*;

    // Wiring tests for QoS-driven model selection (issue #6834).
    // These construct LlmClient directly; no network calls are made.

    #[test]
    fn new_uses_qos_default_model() {
        // `new()` must pick up the tier-configured default from model_qos
        // rather than a hardcoded model string.
        let client = LlmClient::new("test-key".to_string());
        assert_eq!(client.model, super::super::model_qos::gemini_default());
    }

    #[test]
    fn with_model_overrides_default() {
        // Builder-style `with_model` wins over the QoS default.
        let client = LlmClient::new("test-key".to_string())
            .with_model("gemini-pro-latest");
        assert_eq!(client.model, "gemini-pro-latest");
    }

    #[test]
    fn with_model_extraction_uses_extraction_accessor() {
        let client = LlmClient::new("test-key".to_string())
            .with_model(super::super::model_qos::gemini_extraction());
        // In test env (premium tier), extraction == default == flash
        // NOTE(review): this pins the literal model id, so it must be
        // updated if model_qos ever points extraction at a different model.
        assert_eq!(client.model, "gemini-3-flash-preview");
    }
}
1 change: 1 addition & 0 deletions desktop/Backend-Rust/src/llm/mod.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
// LLM module

pub mod client;
pub mod model_qos;
pub mod persona;
pub mod prompts;

Expand Down
215 changes: 215 additions & 0 deletions desktop/Backend-Rust/src/llm/model_qos.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,215 @@
// Model QoS Tier System for Rust Backend
//
// Central model configuration with switchable tiers, mirroring the Swift ModelQoS.
// All LlmClient call sites should use these accessors instead of hardcoded model strings.
//
// Tier is read from OMI_MODEL_TIER env var at startup (default: "premium").

use std::sync::OnceLock;

/// Active tier, resolved once from the `OMI_MODEL_TIER` env var and cached
/// for the lifetime of the process.
static ACTIVE_TIER: OnceLock<ModelTier> = OnceLock::new();

/// Quality-of-service tier controlling model selection and rate limits.
///
/// Mirrors the Swift `ModelQoS` tier names; keep the two in sync.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum ModelTier {
    /// Cost-optimized: Flash for all Gemini workloads, lower rate limits
    Premium,
    /// Quality-optimized: same models, higher rate limits
    Max,
}

impl ModelTier {
    /// Parse the tier from the `OMI_MODEL_TIER` env var.
    ///
    /// Accepts "max" case-insensitively, ignoring surrounding whitespace
    /// (so "MAX" and " Max " work too). Anything else — including an unset
    /// or empty variable — falls back to `Premium`, the safe cost-optimized
    /// default.
    fn from_env() -> Self {
        match std::env::var("OMI_MODEL_TIER") {
            Ok(value) if value.trim().eq_ignore_ascii_case("max") => ModelTier::Max,
            _ => ModelTier::Premium,
        }
    }
}

/// Get the active model tier (resolved once from env, then cached).
///
/// Note: because the result is memoized in a `OnceLock`, env changes made
/// after the first call are not observed.
pub fn active_tier() -> ModelTier {
    *ACTIVE_TIER.get_or_init(ModelTier::from_env)
}

// MARK: - Gemini Models

/// Default model for LlmClient (used by chat, conversations, personas, knowledge graph).
pub fn gemini_default() -> &'static str {
    gemini_default_for(active_tier())
}

/// Tier-specific default model.
///
/// Flash is the cheap baseline for both tiers today; the exhaustive match
/// keeps the per-tier hook in place should they ever diverge.
fn gemini_default_for(tier: ModelTier) -> &'static str {
    match tier {
        ModelTier::Premium | ModelTier::Max => "gemini-3-flash-preview",
    }
}

/// Model for structured extraction tasks (conversations, knowledge graph).
pub fn gemini_extraction() -> &'static str {
    gemini_extraction_for(active_tier())
}

/// Extraction is pinned to Flash regardless of tier; the tier parameter is
/// kept (and ignored) so the call shape matches the other `_for` helpers.
fn gemini_extraction_for(_tier: ModelTier) -> &'static str {
    const EXTRACTION_MODEL: &'static str = "gemini-3-flash-preview";
    EXTRACTION_MODEL
}

/// Allowed models for the Gemini proxy (passthrough from Swift app).
/// These are the models the desktop app is allowed to request.
pub fn gemini_proxy_allowed() -> &'static [&'static str] {
    // Keep this list in lockstep with the desktop model palette:
    // one text model plus the embedding model.
    static ALLOWED: [&'static str; 2] = [
        "gemini-3-flash-preview",
        "gemini-embedding-001",
    ];
    &ALLOWED
}

/// Model that rate-limited Pro requests degrade to.
///
/// Degradation always lands on Flash — the cheapest text model in the
/// palette — independent of the active tier.
pub fn gemini_degrade_target() -> &'static str {
    "gemini-3-flash-preview"
}

// MARK: - Rate Limit Thresholds (tier-aware)

/// Daily soft limit — at or above this, Pro requests degrade to Flash.
///
/// Premium is aggressive (30) since premium already sends Flash for
/// everything; Max is generous (300) to leave headroom for Pro usage.
pub fn daily_soft_limit() -> u32 {
    daily_soft_limit_for(active_tier())
}

/// Tier-specific soft threshold (see [`daily_soft_limit`]).
fn daily_soft_limit_for(tier: ModelTier) -> u32 {
    match tier {
        ModelTier::Max => 300,
        ModelTier::Premium => 30,
    }
}

/// Daily hard limit — at or above this, all requests are rejected (429).
pub fn daily_hard_limit() -> u32 {
    daily_hard_limit_for(active_tier())
}

/// The hard ceiling is deliberately identical for both tiers; the tier
/// parameter is ignored but kept for symmetry with the other `_for` helpers.
fn daily_hard_limit_for(_tier: ModelTier) -> u32 {
    const HARD_LIMIT: u32 = 1500;
    HARD_LIMIT
}

/// Human-readable description of the active tier, for startup logging.
pub fn tier_description() -> &'static str {
    tier_description_for(active_tier())
}

/// Tier-specific log label.
fn tier_description_for(tier: ModelTier) -> &'static str {
    match tier {
        ModelTier::Max => "Max (quality-optimized)",
        ModelTier::Premium => "Premium (cost-optimized)",
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use std::sync::Mutex;

    /// Serialize env-var-mutating tests to avoid races under parallel execution.
    /// (The test harness runs tests on multiple threads by default, and the
    /// process environment is shared global state.)
    static ENV_LOCK: Mutex<()> = Mutex::new(());

    // --- ModelTier::from_env (serialized — shares process env) ---
    // These target `from_env` directly rather than `active_tier()`: the
    // latter caches its first result in a OnceLock and so cannot observe
    // env changes made after initialization.

    #[test]
    fn from_env_all_cases() {
        let _guard = ENV_LOCK.lock().unwrap();

        // Default (unset) → Premium
        std::env::remove_var("OMI_MODEL_TIER");
        assert_eq!(ModelTier::from_env(), ModelTier::Premium);

        // Explicit max → Max
        std::env::set_var("OMI_MODEL_TIER", "max");
        assert_eq!(ModelTier::from_env(), ModelTier::Max);

        // Invalid value → Premium fallback
        std::env::set_var("OMI_MODEL_TIER", "garbage");
        assert_eq!(ModelTier::from_env(), ModelTier::Premium);

        // Empty string → Premium fallback
        std::env::set_var("OMI_MODEL_TIER", "");
        assert_eq!(ModelTier::from_env(), ModelTier::Premium);

        // Leave the process env clean for any later env-sensitive code.
        std::env::remove_var("OMI_MODEL_TIER");
    }

    // --- gemini_default_for (both tiers) ---

    #[test]
    fn gemini_default_premium_is_flash() {
        assert_eq!(gemini_default_for(ModelTier::Premium), "gemini-3-flash-preview");
    }

    #[test]
    fn gemini_default_max_is_flash() {
        // Default model is Flash for both tiers (cheap baseline)
        assert_eq!(gemini_default_for(ModelTier::Max), "gemini-3-flash-preview");
    }

    // --- gemini_extraction_for (the tier-dependent branch) ---

    #[test]
    fn gemini_extraction_is_flash_for_both_tiers() {
        assert_eq!(gemini_extraction_for(ModelTier::Premium), "gemini-3-flash-preview");
        assert_eq!(gemini_extraction_for(ModelTier::Max), "gemini-3-flash-preview");
    }

    // --- tier_description_for ---

    #[test]
    fn tier_description_premium() {
        assert!(tier_description_for(ModelTier::Premium).contains("Premium"));
    }

    #[test]
    fn tier_description_max() {
        assert!(tier_description_for(ModelTier::Max).contains("Max"));
    }

    // --- Static accessors (pinned models) ---

    #[test]
    fn proxy_allowed_contains_expected_models() {
        let allowed = gemini_proxy_allowed();
        assert!(allowed.contains(&"gemini-3-flash-preview"));
        assert!(allowed.contains(&"gemini-embedding-001"));
        // Pro was dropped from the palette; make sure it stays out.
        assert!(!allowed.contains(&"gemini-pro-latest"), "pro removed from allowlist");
        assert!(!allowed.contains(&"gemini-ultra"));
    }

    #[test]
    fn degrade_target_is_flash() {
        assert_eq!(gemini_degrade_target(), "gemini-3-flash-preview");
    }

    // --- Rate limit thresholds ---

    #[test]
    fn daily_soft_limit_premium_is_lower() {
        assert_eq!(daily_soft_limit_for(ModelTier::Premium), 30);
    }

    #[test]
    fn daily_soft_limit_max_is_higher() {
        assert_eq!(daily_soft_limit_for(ModelTier::Max), 300);
    }

    #[test]
    fn daily_hard_limit_same_for_both_tiers() {
        assert_eq!(daily_hard_limit_for(ModelTier::Premium), 1500);
        assert_eq!(daily_hard_limit_for(ModelTier::Max), 1500);
    }

    #[test]
    fn soft_limit_always_below_hard_limit() {
        // Invariant: the soft (degrade) threshold must trip before the hard
        // (reject) threshold, for every tier.
        for tier in [ModelTier::Premium, ModelTier::Max] {
            assert!(daily_soft_limit_for(tier) < daily_hard_limit_for(tier));
        }
    }
}
7 changes: 7 additions & 0 deletions desktop/Backend-Rust/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,13 @@ async fn main() {
// Load environment variables
dotenvy::dotenv().ok();

// Log active QoS tier
tracing::info!("Model QoS tier: {} | rate limits: soft={}, hard={}",
llm::model_qos::tier_description(),
llm::model_qos::daily_soft_limit(),
llm::model_qos::daily_hard_limit(),
);

// Load and validate config
let config = Config::from_env();
if let Err(e) = config.validate() {
Expand Down
5 changes: 4 additions & 1 deletion desktop/Backend-Rust/src/routes/conversations.rs
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,7 @@ async fn create_conversation_from_segments(
// Get LLM client (Gemini)
let llm_client = if let Some(api_key) = &state.config.gemini_api_key {
LlmClient::new(api_key.clone())
.with_model(crate::llm::model_qos::gemini_extraction())
} else {
return Err((
StatusCode::INTERNAL_SERVER_ERROR,
Expand Down Expand Up @@ -441,6 +442,7 @@ async fn reprocess_conversation(
// Get LLM client (Gemini)
let llm_client = if let Some(api_key) = &state.config.gemini_api_key {
LlmClient::new(api_key.clone())
.with_model(crate::llm::model_qos::gemini_extraction())
} else {
return Err((
StatusCode::INTERNAL_SERVER_ERROR,
Expand Down Expand Up @@ -841,7 +843,8 @@ async fn merge_conversations(
// If reprocessing is requested and we have an LLM client, process the merged conversation
if request.reprocess {
if let Some(api_key) = &state.config.gemini_api_key {
let llm = LlmClient::new(api_key.clone());
let llm = LlmClient::new(api_key.clone())
.with_model(crate::llm::model_qos::gemini_extraction());

// Get existing data for deduplication
let existing_memories = state
Expand Down
3 changes: 2 additions & 1 deletion desktop/Backend-Rust/src/routes/knowledge_graph.rs
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,8 @@ async fn rebuild_knowledge_graph(
tracing::info!("Processing {} memories for knowledge graph", memories.len());

// Create LLM client
let llm = LlmClient::new(api_key);
let llm = LlmClient::new(api_key)
.with_model(crate::llm::model_qos::gemini_extraction());

// Track nodes by lowercase label for deduplication
let mut node_map: HashMap<String, KnowledgeGraphNode> = HashMap::new();
Expand Down
17 changes: 6 additions & 11 deletions desktop/Backend-Rust/src/routes/proxy.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,14 +27,9 @@ const GEMINI_ALLOWED_ACTIONS: &[&str] = &[
"batchEmbedContents",
];

// Allowed Gemini models — only these can be requested through the proxy.
// Desktop app uses: gemini-3-flash-preview (default), gemini-pro-latest (tasks/insights),
// gemini-embedding-001 (embeddings). Rate limiting may rewrite pro → flash.
const GEMINI_ALLOWED_MODELS: &[&str] = &[
"gemini-3-flash-preview",
"gemini-pro-latest",
"gemini-embedding-001",
];
// Allowed Gemini models — driven by model_qos (issue #6834).
// Desktop app uses: gemini-3-flash-preview (all features), gemini-embedding-001 (embeddings).
// Rate limiting may degrade requests above soft limit.

/// Maximum request body size for Gemini proxy routes (5 MB).
/// Normal app payloads are 300-600 KB (base64 JPEG + prompt); 5 MB gives ~8x headroom.
Expand Down Expand Up @@ -458,9 +453,9 @@ fn is_gemini_action_allowed(action: &str) -> bool {
GEMINI_ALLOWED_ACTIONS.contains(&action)
}

/// Check if a Gemini model is in the allowlist (issue #6624)
/// Check if a Gemini model is in the allowlist (issue #6624, #6834)
fn is_gemini_model_allowed(model: &str) -> bool {
GEMINI_ALLOWED_MODELS.contains(&model)
crate::llm::model_qos::gemini_proxy_allowed().contains(&model)
}

/// Sanitize a Gemini request body (issue #6624).
Expand Down Expand Up @@ -722,12 +717,12 @@ mod tests {
#[test]
fn model_allowlist_permits_valid_models() {
assert!(is_gemini_model_allowed("gemini-3-flash-preview"));
assert!(is_gemini_model_allowed("gemini-pro-latest"));
assert!(is_gemini_model_allowed("gemini-embedding-001"));
}

#[test]
fn model_allowlist_blocks_unknown() {
assert!(!is_gemini_model_allowed("gemini-pro-latest"), "pro removed from allowlist");
assert!(!is_gemini_model_allowed("gemini-2.5-pro"));
assert!(!is_gemini_model_allowed("gemini-1.5-pro"));
assert!(!is_gemini_model_allowed("gemini-ultra"));
Expand Down
Loading
Loading