/**
 * Dot product of two float arrays, laid out to encourage SSE vectorization.
 *
 * The main loop is unrolled by a factor of 2 x 4: each iteration consumes
 * eight elements and feeds two independent 4-lane accumulators. Keeping the
 * two accumulators independent removes the dependency between the two halves
 * of an iteration, so a vectorizing compiler can keep each one in its own
 * vector register (for example xmm0 for v_sum and xmm2 for w_sum) -- whether
 * it actually does so depends on the compiler and its flags.
 *
 * Because the products are accumulated per lane and only reduced at the end,
 * the summation order differs from a plain left-to-right loop; the result may
 * therefore differ from a naive dot product in the last bits.
 *
 * @param x    first input array, at least `size` elements
 * @param y    second input array, at least `size` elements
 * @param size number of elements to process
 * @return     sum of x[i] * y[i] for i in [0, size); 0 when size is 0
 */
f32 dp_sse_vec(f32 *x, f32 *y, u32 size)
{
    // Two independent 4-lane accumulators, one per unrolled half.
    f32 v_sum[4] = {0.0f};
    f32 w_sum[4] = {0.0f};

    // Largest multiple of 8 that is <= size: the part the unrolled loop covers.
    const u32 unrolled_end = size & ~(u32)7;

    u32 i;

    // Unrolled by 2 x 4: x[i..i+3] feeds v_sum, x[i+4..i+7] feeds w_sum.
    for (i = 0; i < unrolled_end; i += 8) {
        v_sum[0] += x[i + 0] * y[i + 0];
        v_sum[1] += x[i + 1] * y[i + 1];
        v_sum[2] += x[i + 2] * y[i + 2];
        v_sum[3] += x[i + 3] * y[i + 3];

        w_sum[0] += x[i + 4] * y[i + 4];
        w_sum[1] += x[i + 5] * y[i + 5];
        w_sum[2] += x[i + 6] * y[i + 6];
        w_sum[3] += x[i + 7] * y[i + 7];
    }

    // Horizontal reduction of each accumulator, then of both together.
    f32 v_sum_total = v_sum[0] + v_sum[1] + v_sum[2] + v_sum[3];
    f32 w_sum_total = w_sum[0] + w_sum[1] + w_sum[2] + w_sum[3];
    f32 sum = v_sum_total + w_sum_total;

    // Remaining (size % 8) elements that did not fill a whole unrolled step.
    while (i < size) {
        sum += x[i] * y[i];
        ++i;
    }

    return sum;
}