; void vector_sum_prod_x87(float *x, float *y, float *z, size_t size)
;
; For every i in [0, size):  z[i] = x[i] * y[i] + z[i]
;
; Each element is handled on the x87 stack: load x[i], multiply by y[i],
; add z[i], then store and pop back to z[i]. The product stays in st0 and is
; only written out as a dword after the add. Every fld is matched by an fstp,
; so the x87 stack depth on return equals the depth on entry.
;
; Arguments are read from the stack relative to ebp:
;   [ebp +  8]  x     -> esi   (read)
;   [ebp + 12]  y     -> edi   (read)
;   [ebp + 16]  z     -> ebx   (read and written)
;   [ebp + 20]  size  -> edx   (element count, unsigned)
; ecx holds the element index i.
;
; esi, edi, ebx and ebp are saved and restored; ecx and edx are clobbered.
;
; The main loop is unrolled by 4; the remaining 0..3 elements are handled
; one at a time. All index/bound comparisons use unsigned branches because
; size is a size_t.
vector_sum_prod_x87:
        push    ebp
        mov     ebp, esp
        push    esi
        push    edi
        push    ebx

        mov     esi, [ebp + 8]          ; x
        mov     edi, [ebp + 12]         ; y
        mov     ebx, [ebp + 16]         ; z
        mov     edx, [ebp + 20]         ; size
        xor     ecx, ecx                ; i = 0

        ; edx = size rounded down to a multiple of 4. The result is either 0
        ; or >= 4, so ZF from the AND tells us whether the unrolled loop has
        ; any work (this also covers size == 0).
        and     edx, ~3
        jz      .last

.while_u4:
        ; element i
        fld     dword [esi + ecx * 4]
        fmul    dword [edi + ecx * 4]
        fadd    dword [ebx + ecx * 4]
        fstp    dword [ebx + ecx * 4]

        ; element i + 1
        fld     dword [esi + ecx * 4 + 4]
        fmul    dword [edi + ecx * 4 + 4]
        fadd    dword [ebx + ecx * 4 + 4]
        fstp    dword [ebx + ecx * 4 + 4]

        ; element i + 2
        fld     dword [esi + ecx * 4 + 8]
        fmul    dword [edi + ecx * 4 + 8]
        fadd    dword [ebx + ecx * 4 + 8]
        fstp    dword [ebx + ecx * 4 + 8]

        ; element i + 3
        fld     dword [esi + ecx * 4 + 12]
        fmul    dword [edi + ecx * 4 + 12]
        fadd    dword [ebx + ecx * 4 + 12]
        fstp    dword [ebx + ecx * 4 + 12]

        add     ecx, 4
        cmp     ecx, edx
        jb      .while_u4               ; unsigned: i < (size & ~3)

.last:
        mov     edx, [ebp + 20]         ; restore the full element count

.while_u1:
        cmp     ecx, edx
        jae     .end                    ; unsigned: stop once i >= size

        fld     dword [esi + ecx * 4]
        fmul    dword [edi + ecx * 4]
        fadd    dword [ebx + ecx * 4]
        fstp    dword [ebx + ecx * 4]

        inc     ecx
        jmp     .while_u1

.end:
        ; The three pops bring esp back to ebp, so no "mov esp, ebp" is needed.
        pop     ebx
        pop     edi
        pop     esi
        pop     ebp
        ret