/**
 * Dot product of two f32 arrays, written so that a vectorizing compiler
 * can map it onto AVX (256-bit, 8 x f32) registers.
 *
 * The main loop consumes 16 elements per iteration: two independent groups
 * of 8 lanes, each with its own accumulator array, so the two accumulation
 * chains do not depend on each other. The elements left over when size is
 * not a multiple of 16 are handled by a scalar tail loop.
 *
 * Because the products are accumulated per lane and only combined at the
 * end, the rounding of the result can differ from a plain left-to-right
 * scalar dot product.
 *
 * @param x     first input array, at least `size` elements (read only)
 * @param y     second input array, at least `size` elements (read only)
 * @param size  number of elements to process; 0 yields 0
 * @return      sum of x[i] * y[i] for i in [0, size)
 */
f32 dp_avx_vec(f32 *x, f32 *y, u32 size)
{
    // Two independent 8-lane accumulators, zeroed up front.
    // Intended register mapping (author's hint, not enforced by this code):
    // v_sum in ymm0, w_sum in ymm2.
    f32 v_sum[8] = {0};
    f32 w_sum[8] = {0};

    // Largest multiple of 16 that does not exceed size.
    const u32 vec_end = size & ~(u32)15;
    u32 i;

    // 16 elements per iteration: lanes 0..7 go to v_sum (x[i..i+7], y[i..i+7]),
    // lanes 8..15 go to w_sum (x[i+8..i+15], y[i+8..i+15]).
    for (i = 0; i < vec_end; i += 16) {
        v_sum[0] += x[i + 0] * y[i + 0];
        v_sum[1] += x[i + 1] * y[i + 1];
        v_sum[2] += x[i + 2] * y[i + 2];
        v_sum[3] += x[i + 3] * y[i + 3];
        v_sum[4] += x[i + 4] * y[i + 4];
        v_sum[5] += x[i + 5] * y[i + 5];
        v_sum[6] += x[i + 6] * y[i + 6];
        v_sum[7] += x[i + 7] * y[i + 7];

        w_sum[0] += x[i + 8] * y[i + 8];
        w_sum[1] += x[i + 9] * y[i + 9];
        w_sum[2] += x[i + 10] * y[i + 10];
        w_sum[3] += x[i + 11] * y[i + 11];
        w_sum[4] += x[i + 12] * y[i + 12];
        w_sum[5] += x[i + 13] * y[i + 13];
        w_sum[6] += x[i + 14] * y[i + 14];
        w_sum[7] += x[i + 15] * y[i + 15];
    }

    // Horizontal reduction: every one of the 8 lanes of each accumulator
    // must contribute, so each sum is a single expression.
    f32 v_sum_total = v_sum[0] + v_sum[1] + v_sum[2] + v_sum[3] +
                      v_sum[4] + v_sum[5] + v_sum[6] + v_sum[7];
    f32 w_sum_total = w_sum[0] + w_sum[1] + w_sum[2] + w_sum[3] +
                      w_sum[4] + w_sum[5] + w_sum[6] + w_sum[7];
    f32 sum = v_sum_total + w_sum_total;

    // Scalar tail: the remaining (size % 16) elements.
    while (i < size) {
        sum += x[i] * y[i];
        ++i;
    }

    return sum;
}