;-----------------------------------------------------------------------
; void vector_sum_prod_avx(float *x, float *y, float *z, size_t size)
;
; Computes z[i] += x[i] * y[i] for every i in [0, size).
; Elements are processed eight at a time with 256-bit AVX registers;
; the remaining (size % 8) elements are processed one at a time with
; scalar AVX instructions, so every element sees the same rounding.
;
; Syntax: NASM (Intel operand order), 32-bit code.
; ABI:    arguments are read from the stack relative to ebp and the
;         function returns with a plain `ret` (caller cleans the stack).
; In:     [ebp + 8]  = x     -> esi
;         [ebp + 12] = y     -> edi
;         [ebp + 16] = z     -> ebx
;         [ebp + 20] = size  -> edx (element count, treated as unsigned)
; Regs:   ecx = i (element index)
; Clobb:  ecx, edx, ymm0-ymm2, flags
;         (ebp, esi, edi, ebx are saved and restored)
; Align:  none required; all vector loads/stores are unaligned-safe.
;-----------------------------------------------------------------------
vector_sum_prod_avx:
        push    ebp
        mov     ebp, esp
        push    esi
        push    edi
        push    ebx

        mov     esi, [ebp + 8]                  ; esi = x
        mov     edi, [ebp + 12]                 ; edi = y
        mov     ebx, [ebp + 16]                 ; ebx = z
        mov     edx, [ebp + 20]                 ; edx = size
        xor     ecx, ecx                        ; i = 0
        test    edx, edx
        jz      .end                            ; size == 0: nothing to do

        and     edx, ~7                         ; edx = size rounded down to a multiple of 8
        jz      .last                           ; fewer than 8 elements: scalar tail only

        ; Invariant: ecx < edx, both multiples of 8.
.while_u8:
        vmovups ymm0, [esi + ecx * 4]           ; x[i .. i+7]   (unaligned-safe)
        vmovups ymm1, [edi + ecx * 4]           ; y[i .. i+7]
        vmovups ymm2, [ebx + ecx * 4]           ; z[i .. i+7]
        vmulps  ymm0, ymm0, ymm1                ; x * y
        vaddps  ymm2, ymm2, ymm0                ; z + x * y
        vmovups [ebx + ecx * 4], ymm2
        add     ecx, 8
        cmp     ecx, edx
        jb      .while_u8                       ; unsigned: size is a size_t

        vzeroupper                              ; leave upper ymm halves clean for SSE callers

.last:
        mov     edx, [ebp + 20]                 ; edx = full size again for the tail

        ; Scalar tail: i in [size & ~7, size). VEX.128 scalar ops keep the
        ; upper ymm halves zeroed, so no second vzeroupper is needed.
.while_u1:
        cmp     ecx, edx
        jae     .end
        vmovss  xmm0, dword [esi + ecx * 4]     ; x[i]
        vmulss  xmm0, xmm0, dword [edi + ecx * 4] ; x[i] * y[i]
        vmovss  xmm2, dword [ebx + ecx * 4]     ; z[i]
        vaddss  xmm2, xmm2, xmm0                ; z[i] + x[i] * y[i]
        vmovss  dword [ebx + ecx * 4], xmm2
        inc     ecx
        jmp     .while_u1

.end:
        pop     ebx
        pop     edi
        pop     esi
        leave                                   ; mov esp, ebp / pop ebp
        ret