Skip to content

Commit 0370cb6

Browse files
Add qwen 3/3.5 series support (#445)
2 parents 7ed42a7 + 6526e86 commit 0370cb6

File tree

5 files changed

+94
-16
lines changed

5 files changed

+94
-16
lines changed

examples/ios/RunAnywhereAI/RunAnywhereAI/App/RunAnywhereAIApp.swift

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -267,6 +267,64 @@ struct RunAnywhereAIApp: App {
267267
)
268268
}
269269

270+
// Qwen3 models
271+
if let qwen3_06bURL = URL(string: "https://huggingface.co/unsloth/Qwen3-0.6B-GGUF/resolve/main/Qwen3-0.6B-Q4_K_M.gguf") {
272+
RunAnywhere.registerModel(
273+
id: "qwen3-0.6b-q4_k_m",
274+
name: "Qwen3 0.6B Q4_K_M",
275+
url: qwen3_06bURL,
276+
framework: .llamaCpp,
277+
memoryRequirement: 500_000_000
278+
)
279+
}
280+
if let qwen3_17bURL = URL(string: "https://huggingface.co/unsloth/Qwen3-1.7B-GGUF/resolve/main/Qwen3-1.7B-Q4_K_M.gguf") {
281+
RunAnywhere.registerModel(
282+
id: "qwen3-1.7b-q4_k_m",
283+
name: "Qwen3 1.7B Q4_K_M",
284+
url: qwen3_17bURL,
285+
framework: .llamaCpp,
286+
memoryRequirement: 1_200_000_000
287+
)
288+
}
289+
if let qwen3_4bURL = URL(string: "https://huggingface.co/unsloth/Qwen3-4B-GGUF/resolve/main/Qwen3-4B-Q4_K_M.gguf") {
290+
RunAnywhere.registerModel(
291+
id: "qwen3-4b-q4_k_m",
292+
name: "Qwen3 4B Q4_K_M",
293+
url: qwen3_4bURL,
294+
framework: .llamaCpp,
295+
memoryRequirement: 2_800_000_000
296+
)
297+
}
298+
299+
// Qwen3.5 models
300+
if let qwen35_08bURL = URL(string: "https://huggingface.co/unsloth/Qwen3.5-0.8B-GGUF/resolve/main/Qwen3.5-0.8B-Q4_K_M.gguf") {
301+
RunAnywhere.registerModel(
302+
id: "qwen3.5-0.8b-q4_k_m",
303+
name: "Qwen3.5 0.8B Q4_K_M",
304+
url: qwen35_08bURL,
305+
framework: .llamaCpp,
306+
memoryRequirement: 600_000_000
307+
)
308+
}
309+
if let qwen35_2bURL = URL(string: "https://huggingface.co/unsloth/Qwen3.5-2B-GGUF/resolve/main/Qwen3.5-2B-Q4_K_M.gguf") {
310+
RunAnywhere.registerModel(
311+
id: "qwen3.5-2b-q4_k_m",
312+
name: "Qwen3.5 2B Q4_K_M",
313+
url: qwen35_2bURL,
314+
framework: .llamaCpp,
315+
memoryRequirement: 1_500_000_000
316+
)
317+
}
318+
if let qwen35_4bURL = URL(string: "https://huggingface.co/unsloth/Qwen3.5-4B-GGUF/resolve/main/Qwen3.5-4B-Q4_K_M.gguf") {
319+
RunAnywhere.registerModel(
320+
id: "qwen3.5-4b-q4_k_m",
321+
name: "Qwen3.5 4B Q4_K_M",
322+
url: qwen35_4bURL,
323+
framework: .llamaCpp,
324+
memoryRequirement: 2_800_000_000
325+
)
326+
}
327+
270328
logger.info("✅ LLM models registered (including tool-calling optimized models)")
271329

272330
// Register VLM (Vision Language) models

sdk/runanywhere-commons/VERSIONS

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -72,9 +72,9 @@ SHERPA_ONNX_VERSION_LINUX=1.12.23
7272
# =============================================================================
7373
# llama.cpp (LLM inference)
7474
# =============================================================================
75-
# b8011 - latest stable release (Feb 2026), includes GGML_WEBGPU backend
75+
# b8201 - latest stable release (Feb 2026), includes GGML_WEBGPU backend
7676
# NOTE: Previously bumped from b7650 to b8011 to enable WebGPU acceleration for WASM builds
77-
LLAMACPP_VERSION=b8011
77+
LLAMACPP_VERSION=b8201
7878

7979
# =============================================================================
8080
# nlohmann/json

sdk/runanywhere-commons/src/backends/llamacpp/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -153,6 +153,8 @@ if(RAC_VLM_USE_MTMD)
153153
${llamacpp_SOURCE_DIR}/tools/mtmd/models/whisper-enc.cpp
154154
${llamacpp_SOURCE_DIR}/tools/mtmd/models/kimik25.cpp
155155
${llamacpp_SOURCE_DIR}/tools/mtmd/models/mobilenetv5.cpp
156+
${llamacpp_SOURCE_DIR}/tools/mtmd/models/paddleocr.cpp
157+
${llamacpp_SOURCE_DIR}/tools/mtmd/models/nemotron-v2-vl.cpp
156158
)
157159
endif()
158160

sdk/runanywhere-commons/src/backends/llamacpp/llamacpp_backend.cpp

Lines changed: 31 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -371,7 +371,7 @@ bool LlamaCppTextGeneration::unload_model_internal() {
371371
// Clear LoRA adapters from context before freeing
372372
// (adapter memory is freed automatically with the model per llama.cpp API)
373373
if (context_ && !lora_adapters_.empty()) {
374-
llama_clear_adapter_lora(context_);
374+
llama_set_adapters_lora(context_, nullptr, 0, nullptr);
375375
}
376376
lora_adapters_.clear();
377377

@@ -828,13 +828,32 @@ bool LlamaCppTextGeneration::recreate_context() {
828828
}
829829

830830
bool LlamaCppTextGeneration::apply_lora_adapters() {
831+
if (lora_adapters_.empty()) {
832+
// No adapters are tracked: pass an empty set to clear all adapters from the context
833+
llama_set_adapters_lora(context_, nullptr, 0, nullptr);
834+
return true;
835+
}
836+
837+
std::vector<llama_adapter_lora*> adapters;
838+
std::vector<float> scales;
839+
adapters.reserve(lora_adapters_.size());
840+
scales.reserve(lora_adapters_.size());
841+
831842
for (auto& entry : lora_adapters_) {
832-
int32_t result = llama_set_adapter_lora(context_, entry.adapter, entry.scale);
833-
if (result != 0) {
834-
LOGE("Failed to apply LoRA adapter: %s (error=%d)", entry.path.c_str(), result);
843+
adapters.push_back(entry.adapter);
844+
scales.push_back(entry.scale);
845+
}
846+
847+
int32_t result = llama_set_adapters_lora(context_, adapters.data(), adapters.size(), scales.data());
848+
if (result != 0) {
849+
LOGE("Failed to apply LoRA adapters (error=%d)", result);
850+
for (auto& entry : lora_adapters_) {
835851
entry.applied = false;
836-
return false;
837852
}
853+
return false;
854+
}
855+
856+
for (auto& entry : lora_adapters_) {
838857
entry.applied = true;
839858
LOGI("Applied LoRA adapter: %s (scale=%.2f)", entry.path.c_str(), entry.scale);
840859
}
@@ -911,17 +930,16 @@ bool LlamaCppTextGeneration::remove_lora_adapter(const std::string& adapter_path
911930
return false;
912931
}
913932

914-
// Remove from context
915-
int32_t result = llama_rm_adapter_lora(context_, it->adapter);
916-
if (result != 0) {
917-
LOGE("Failed to remove LoRA adapter from context: %s (error=%d)", adapter_path.c_str(), result);
918-
return false;
919-
}
920-
921933
// Remove from tracking (adapter memory is freed automatically with the model
922934
// per llama.cpp API — llama_adapter_lora_free is deprecated since b8011)
923935
lora_adapters_.erase(it);
924936

937+
// Re-apply remaining adapters (or clear if none left)
938+
if (!apply_lora_adapters()) {
939+
LOGE("Failed to re-apply remaining LoRA adapters after removal");
940+
return false;
941+
}
942+
925943
// Clear KV cache after adapter changes
926944
llama_memory_clear(llama_get_memory(context_), true);
927945

@@ -937,7 +955,7 @@ void LlamaCppTextGeneration::clear_lora_adapters() {
937955
}
938956

939957
if (context_) {
940-
llama_clear_adapter_lora(context_);
958+
llama_set_adapters_lora(context_, nullptr, 0, nullptr);
941959
llama_memory_clear(llama_get_memory(context_), true);
942960
}
943961

sdk/runanywhere-commons/src/backends/llamacpp/llamacpp_backend.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -163,7 +163,7 @@ class LlamaCppTextGeneration {
163163
nlohmann::json model_config_;
164164

165165
int context_size_ = 0;
166-
int max_default_context_ = 8192;
166+
int max_default_context_ = 1024;
167167

168168
std::vector<LoraAdapterEntry> lora_adapters_;
169169

0 commit comments

Comments
 (0)