Skip to content

Commit 7a8b9a5

Browse files
authored
[feature] arm: speed up fp16 exp_ps floor step on aarch64 (#6659)
1 parent 16beb34 commit 7a8b9a5

File tree

1 file changed

+2
-14
lines changed

1 file changed

+2
-14
lines changed

src/layer/arm/neon_mathfun_fp16s.h

Lines changed: 2 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -197,13 +197,7 @@ static inline float16x4_t exp_ps_f16(float16x4_t x)
197 197
#endif
198 198

199 199
/* perform a floorf */
200-
tmp = vcvt_f16_s16(vcvt_s16_f16(fx));
201-
202-
/* if greater, substract 1 */
203-
uint16x4_t mask = vcgt_f16(tmp, fx);
204-
mask = vand_u16(mask, (uint16x4_t)(one));
205-
206-
fx = vsub_f16(tmp, (float16x4_t)(mask));
200+
fx = vrndm_f16(fx);
207 201

208 202
#if defined(_MSC_VER) && !defined(__clang__)
209 203
tmp = vmul_f16(fx, vcvt_f16_f32(vdupq_n_f32(c_cephes_exp_C1)));
@@ -255,13 +249,7 @@ static inline float16x8_t exp_ps_f16(float16x8_t x)
255 249
#endif
256 250

257 251
/* perform a floorf */
258-
tmp = vcvtq_f16_s16(vcvtq_s16_f16(fx));
259-
260-
/* if greater, substract 1 */
261-
uint16x8_t mask = vcgtq_f16(tmp, fx);
262-
mask = vandq_u16(mask, vreinterpretq_u16_f16(one));
263-
264-
fx = vsubq_f16(tmp, vreinterpretq_f16_u16(mask));
252+
fx = vrndmq_f16(fx);
265 253

266 254
#if defined(_MSC_VER) && !defined(__clang__)
267 255
float16x4_t _c_cephes_exp_C1 = vcvt_f16_f32(vdupq_n_f32(c_cephes_exp_C1));

0 commit comments

Comments
 (0)