#ifdef LV_HAVE_AVX
#include <immintrin.h>

static inline void volk_32f_x2_divide_32f_a_avx(float* cVector,
                                                const float* aVector,
                                                const float* bVector,
                                                unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    float* cPtr = cVector;
    const float* aPtr = aVector;
    const float* bPtr = bVector;

    __m256 aVal, bVal, cVal;
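    // Each iteration divides eight floats at once in a 256-bit AVX register.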
    for (; number < eighthPoints; number++) {
        aVal = _mm256_load_ps(aPtr);
        bVal = _mm256_load_ps(bPtr);
        cVal = _mm256_div_ps(aVal, bVal);
        _mm256_store_ps(cPtr, cVal); // store the results back into the C container
        aPtr += 8;
        bPtr += 8;
        cPtr += 8;
    }

    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *cPtr++ = (*aPtr++) / (*bPtr++);
    }
}
#endif /* LV_HAVE_AVX */

#ifdef LV_HAVE_SSE
#include <xmmintrin.h>

static inline void volk_32f_x2_divide_32f_a_sse(float* cVector,
                                                const float* aVector,
                                                const float* bVector,
                                                unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    float* cPtr = cVector;
    const float* aPtr = aVector;
    const float* bPtr = bVector;

    __m128 aVal, bVal, cVal;
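    // Each iteration divides four floats at once in a 128-bit SSE register.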
    for (; number < quarterPoints; number++) {
        aVal = _mm_load_ps(aPtr);
        bVal = _mm_load_ps(bPtr);
        cVal = _mm_div_ps(aVal, bVal);
        _mm_store_ps(cPtr, cVal); // store the results back into the C container
        aPtr += 4;
        bPtr += 4;
        cPtr += 4;
    }

    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *cPtr++ = (*aPtr++) / (*bPtr++);
    }
}
#endif /* LV_HAVE_SSE */

#ifdef LV_HAVE_NEON
#include <arm_neon.h>

static inline void volk_32f_x2_divide_32f_neon(float* cVector,
                                               const float* aVector,
                                               const float* bVector,
                                               unsigned int num_points)
{
    float* cPtr = cVector;
    const float* aPtr = aVector;
    const float* bPtr = bVector;

    float32x4x4_t aVal, bVal, bInv, cVal;
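    // vld4q_f32/vst4q_f32 move 16 floats (four float32x4 lanes) per iteration.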
    const unsigned int sixteenthPoints = num_points / 16;
    unsigned int number = 0;
    for (; number < sixteenthPoints; number++) {
        aVal = vld4q_f32(aPtr);
        aPtr += 16;
        bVal = vld4q_f32(bPtr);
        bPtr += 16;
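
        // ARMv7 NEON has no vector divide: approximate 1/b with vrecpeq_f32,
        // refine the estimate with two Newton-Raphson steps (vrecpsq_f32),
        // then multiply by a.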
        bInv.val[0] = vrecpeq_f32(bVal.val[0]);
        bInv.val[0] = vmulq_f32(bInv.val[0], vrecpsq_f32(bInv.val[0], bVal.val[0]));
        bInv.val[0] = vmulq_f32(bInv.val[0], vrecpsq_f32(bInv.val[0], bVal.val[0]));
        cVal.val[0] = vmulq_f32(aVal.val[0], bInv.val[0]);

        bInv.val[1] = vrecpeq_f32(bVal.val[1]);
        bInv.val[1] = vmulq_f32(bInv.val[1], vrecpsq_f32(bInv.val[1], bVal.val[1]));
        bInv.val[1] = vmulq_f32(bInv.val[1], vrecpsq_f32(bInv.val[1], bVal.val[1]));
        cVal.val[1] = vmulq_f32(aVal.val[1], bInv.val[1]);

        bInv.val[2] = vrecpeq_f32(bVal.val[2]);
        bInv.val[2] = vmulq_f32(bInv.val[2], vrecpsq_f32(bInv.val[2], bVal.val[2]));
        bInv.val[2] = vmulq_f32(bInv.val[2], vrecpsq_f32(bInv.val[2], bVal.val[2]));
        cVal.val[2] = vmulq_f32(aVal.val[2], bInv.val[2]);

        bInv.val[3] = vrecpeq_f32(bVal.val[3]);
        bInv.val[3] = vmulq_f32(bInv.val[3], vrecpsq_f32(bInv.val[3], bVal.val[3]));
        bInv.val[3] = vmulq_f32(bInv.val[3], vrecpsq_f32(bInv.val[3], bVal.val[3]));
        cVal.val[3] = vmulq_f32(aVal.val[3], bInv.val[3]);

        vst4q_f32(cPtr, cVal);
        cPtr += 16;
    }

    // Divide the remaining num_points % 16 values one at a time.
    for (number = sixteenthPoints * 16; number < num_points; number++) {
        *cPtr++ = (*aPtr++) / (*bPtr++);
    }
}
#endif /* LV_HAVE_NEON */

    volk_32f_x2_divide_32f_a_orc_impl(cVector, aVector, bVector, num_points); // forward to the ORC-generated kernel

#ifdef LV_HAVE_AVX
#include <immintrin.h>

static inline void volk_32f_x2_divide_32f_u_avx(float* cVector,
                                                const float* aVector,
                                                const float* bVector,
                                                unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    float* cPtr = cVector;
    const float* aPtr = aVector;
    const float* bPtr = bVector;

    __m256 aVal, bVal, cVal;
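    // Same as the aligned AVX kernel, but with unaligned _mm256_loadu_ps/_mm256_storeu_ps accesses.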
    for (; number < eighthPoints; number++) {
        aVal = _mm256_loadu_ps(aPtr);
        bVal = _mm256_loadu_ps(bPtr);
        cVal = _mm256_div_ps(aVal, bVal);
        _mm256_storeu_ps(cPtr, cVal); // store the results back into the C container
        aPtr += 8;
        bPtr += 8;
        cPtr += 8;
    }

    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *cPtr++ = (*aPtr++) / (*bPtr++);
    }
}
#endif /* LV_HAVE_AVX */