[feature] arm: speed up exp_ps floor step on aarch64 (#6657)

crafcat7 · web-flow · commit 16beb34966c1 · 2026-04-07T19:09:51.000+08:00
Summary:
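  Use vrndmq_f32 (round to integral, toward minus infinity) for the floor computation in exp_ps on aarch64, replacing the convert-to-int / compare / subtract sequence with a single intrinsic, while keeping that legacy sequence as the fallback path for non-aarch64 targets. This reduces the exp_ps hot-path cost on ARM without changing approximation behavior.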
diff --git a/src/layer/arm/neon_mathfun.h b/src/layer/arm/neon_mathfun.h
@@ -141,13 +141,17 @@ static inline float32x4_t exp_ps(float32x4_t x)
     fx = VFMAQ_F32(vdupq_n_f32(0.5f), x, vdupq_n_f32(c_cephes_LOG2EF));
 
     /* perform a floorf */
+#if defined(__aarch64__)
+    fx = vrndmq_f32(fx);
+#else
     tmp = vcvtq_f32_s32(vcvtq_s32_f32(fx));
 
     /* if greater, substract 1 */
     uint32x4_t mask = vcgtq_f32(tmp, fx);
     mask = vandq_u32(mask, vreinterpretq_u32_f32(one));
 
     fx = vsubq_f32(tmp, vreinterpretq_f32_u32(mask));
+#endif
 
     tmp = vmulq_f32(fx, vdupq_n_f32(c_cephes_exp_C1));
     float32x4_t z = vmulq_f32(fx, vdupq_n_f32(c_cephes_exp_C2));