// ================================================================== // Program: dot_product.cpp // Date: July 2020 // Author: Jean-Michel Richer // Email: jean-michel.richer@univ-angers.fr // ================================================================== // Description: // This program enables to test different implementations of the // dot product of two vectors of floats // ================================================================== #include #include #include #include #include #include #include "timer.hpp" #include using namespace std; // ------------------------------------------------------------------ // // TYPES // // ------------------------------------------------------------------ typedef float f32; typedef uint32_t u32; typedef f32 (*DotProductFn)(f32 *x, f32 *y, u32 size); // ------------------------------------------------------------------ // // GLOBAL VARIABLES // // ------------------------------------------------------------------ // use 16 for SSE, 32 for AVX, 64 for AVX512 const int ALIGNMENT = 64; // method to test u32 method = 1; // number of times the method is called u32 zillions = 20000; // size of each vector u32 vector_size = 64; // the two vectors used for the dot product f32 *vector_x, *vector_y; // print all methods numbers and names bool list_methods_flag = false; // ------------------------------------------------------------------ // // IMPLEMENTATION OF METHODS // // ------------------------------------------------------------------ /** * Method that computes the dot product of two vectors * @param x pointer to first vector * @param y pointer to second vector * @param size size of the vectors * @return the dot product of x and y, i.e. x_0 * y_0 + ... + x_(n-1) * y_(n-1) */ f32 dp_ref(f32 *x, f32 *y, u32 size) { f32 sum = 0.0; for (u32 i = 0; i < size; ++i) { sum += x[i] * y[i]; } return sum; } f32 dp_sse_intrin(f32 *x, f32 *y, u32 size) { f32 sum = 0.0; u32 i = 0; __m128 vsum, vx, vy; vsum = _mm_setzero_ps(); for ( ; i < (size & ~3); i = i + 4) { vx = _mm_load_ps (&x[i]); vy = _mm_load_ps (&y[i]); vx = _mm_mul_ps( vx, vy ); vsum = _mm_add_ps( vsum, vx ); } vsum = _mm_hadd_ps( vsum, vsum); vsum = _mm_hadd_ps( vsum, vsum); _mm_store_ss(&sum, vsum); while (i < size) { sum += x[i] * y[i]; ++i; } return sum; } // ------------------------------------------------------------------ // // METHODS // // ------------------------------------------------------------------ // register assembly methods here extern "C" { f32 dp_fpu(f32 *x, f32 *y, u32 size); f32 dp_sse_low(f32 *x, f32 *y, u32 size); f32 dp_sse(f32 *x, f32 *y, u32 size); f32 dp_avx(f32 *x, f32 *y, u32 size); f32 dp_avx_ur2(f32 *x, f32 *y, u32 size); f32 dp_avx_ur4(f32 *x, f32 *y, u32 size); f32 dp_avx512(f32 *x, f32 *y, u32 size); f32 dp_avx512_ur2(f32 *x, f32 *y, u32 size); f32 dp_avx512_ur4(f32 *x, f32 *y, u32 size); f32 dp_avx512_ur4_fma(f32 *x, f32 *y, u32 size); } typedef struct { DotProductFn function; string name; } Method ; #define entry(name) { name, #name } // define methods here Method methods [ ] = { { nullptr, "undefined" }, entry(dp_ref), entry(dp_fpu), entry(dp_sse_low), entry(dp_sse), entry(dp_sse_intrin), entry(dp_avx), entry(dp_avx_ur2), entry(dp_avx_ur4), entry(dp_avx512), entry(dp_avx512_ur2), entry(dp_avx512_ur4), entry(dp_avx512_ur4_fma), { nullptr, "undefined" } }; /** * main subprogram */ int main(int argc, char *argv[]) { // get command line arguments // use getopt int opt; while ((opt = getopt(argc, argv, "m:s:l")) != -1) { switch(opt) { case 'm': method = std::stoi(optarg); break; case 's': vector_size = std::stoi(optarg); break; case 'l': list_methods_flag = true; break; default: cerr << "error !!!!" << endl; exit(EXIT_FAILURE); } } if (list_methods_flag == true) { int i = 1; for ( ; methods[i].function != nullptr; ++i) { cout << setw(2) << i << ": " << methods[i].name << endl; } cout << "nbr_methods=" << (i-1) << endl; exit(EXIT_SUCCESS); } // // create vectors // vector_x = (f32 *) _mm_malloc( vector_size * sizeof(f32), ALIGNMENT); vector_y = (f32 *) _mm_malloc( vector_size * sizeof(f32), ALIGNMENT); // // initialize vectors // const u32 modulo = 17; for (u32 i = 0; i < vector_size; ++i) { vector_x[i] = ((i % 3) == 0) ? -1.0 : 1.0; vector_y[i] = (f32) 1 + (i % modulo); } cout << "method=" << method << endl; cout << "method_name=" << methods[ method ].name << endl; // ============================================================== // Measure of the time needed to execute the treatment // ============================================================== Timer timer; timer.start(); float total = 0; float result = 0; u32 z; result = methods[ method ].function( vector_x, vector_y, vector_size ); total += result; cout << "result=" << std::fixed << result << endl; for (z = 2; z <= zillions; ++z) { result = methods[ method ].function( vector_x, vector_y, vector_size ); total += result; } timer.stop(); cout << timer << endl; cout << "total=" << total << endl; return EXIT_SUCCESS; }