/**
 * Dot product of x and y, written as plain scalar C but organised the way a
 * 4-lane SSE implementation would be: four independent partial sums are
 * accumulated over blocks of four elements, reduced to a single value, and
 * the remaining (size % 4) elements are added one at a time.
 *
 * @param x     first input array; elements [0, size) are read
 * @param y     second input array; elements [0, size) are read
 * @param size  number of elements to process from each array
 * @return      the accumulated sum of x[k] * y[k] for k in [0, size)
 */
f32 dp_sse_vec(f32 *x, f32 *y, u32 size)
{
    // Four partial sums, one per "lane". In an SSE version these could live
    // in a single register such as xmm0, cleared with xorps xmm0, xmm0.
    f32 lanes[4] = { 0.0, 0.0, 0.0, 0.0 };

    // Largest multiple of 4 that does not exceed size: end of the blocked part.
    const u32 blocked_end = size & ~3;

    // Declared outside the loops because the tail loop resumes where the
    // blocked loop stopped.
    u32 idx;

    // Blocked part: each pass consumes x[idx..idx+3] and y[idx..idx+3],
    // feeding element k of the block into lane k.
    for (idx = 0; idx < blocked_end; idx += 4) {
        for (int lane = 0; lane < 4; ++lane) {
            lanes[lane] += x[idx + lane] * y[idx + lane];
        }
    }

    // Horizontal reduction of the four lanes, added left to right.
    float total = lanes[0] + lanes[1] + lanes[2] + lanes[3];

    // Tail: the last (size % 4) elements that did not fill a whole block.
    for (; idx < size; ++idx) {
        total += x[idx] * y[idx];
    }

    return total;
}