302 unsigned int num_points)
307 unsigned int number = num_points;
308 unsigned int quarter_points = num_points / 4;
310 float32x4x2_t a_val, b_val, c_val, scalar_val;
311 float32x4x2_t tmp_val;
313 scalar_val.val[0] = vld1q_dup_f32((
const float*)scalar);
314 scalar_val.val[1] = vld1q_dup_f32(((
const float*)scalar) + 1);
316 for (number = 0; number < quarter_points; ++number) {
317 a_val = vld2q_f32((
float*)aPtr);
318 b_val = vld2q_f32((
float*)bPtr);
319 b_val.val[1] = vnegq_f32(b_val.val[1]);
323 tmp_val.val[1] = vmulq_f32(b_val.val[1], scalar_val.val[0]);
324 tmp_val.val[0] = vmulq_f32(b_val.val[0], scalar_val.val[0]);
326 tmp_val.val[1] = vmlaq_f32(tmp_val.val[1], b_val.val[0], scalar_val.val[1]);
327 tmp_val.val[0] = vmlsq_f32(tmp_val.val[0], b_val.val[1], scalar_val.val[1]);
329 c_val.val[1] = vaddq_f32(a_val.val[1], tmp_val.val[1]);
330 c_val.val[0] = vaddq_f32(a_val.val[0], tmp_val.val[0]);
332 vst2q_f32((
float*)cPtr, c_val);
339 for (number = quarter_points * 4; number < num_points; number++) {
340 *cPtr++ = (*aPtr++) +
lv_conj(*bPtr++) * (*scalar);