;-----------------------------------------------------------------------
; void vector_sum_prod_avx(float *x, float *y, float *z, size_t size)
;
; Multiply-accumulate over three float arrays:
;     z[i] = z[i] + x[i] * y[i]      for every i in [0, size)
;
; Calling convention: 32-bit, arguments on the stack, caller cleans up
; (the routine ends with a plain ret).
;
; In:    [ebp +  8] = x     -> esi
;        [ebp + 12] = y     -> edi
;        [ebp + 16] = z     -> ebx
;        [ebp + 20] = size  -> edx (element count, not bytes)
; Out:   nothing in registers; z is updated in place
; Saved: ebp, esi, edi, ebx (pushed on entry, popped on exit)
; Clobb: ecx (element index i), edx, ymm0, ymm1, flags
;
; No alignment is required of x, y or z: every vector access uses the
; unaligned move form.  Both the 8-wide loop and the scalar tail round
; the product to single precision before the add, so an element's
; result does not depend on which loop handled it.
;-----------------------------------------------------------------------
vector_sum_prod_avx:
	push	ebp
	mov		ebp, esp
	push	esi
	push	edi
	push	ebx

	mov		esi, [ebp + 8]				; esi = x
	mov		edi, [ebp + 12]				; edi = y
	mov		ebx, [ebp + 16]				; ebx = z
	mov		edx, [ebp + 20]				; edx = size

	xor		ecx, ecx					; i = 0

	; edx = size rounded down to a multiple of 8 = number of elements
	; the vector loop may touch.  Zero means there is no full block.
	and		edx, ~7
	jz		.tail

.block8:
	; Invariant: ecx < edx, both multiples of 8.
	vmovups	ymm0, [esi + ecx * 4]		; x[i .. i+7]
	vmulps	ymm0, ymm0, [edi + ecx * 4]	; x * y
	vmovups	ymm1, [ebx + ecx * 4]		; z[i .. i+7]
	vaddps	ymm1, ymm1, ymm0			; z + x * y
	vmovups	[ebx + ecx * 4], ymm1
	add		ecx, 8
	cmp		ecx, edx
	jb		.block8						; unsigned: size is a size_t

.tail:
	mov		edx, [ebp + 20]				; back to the real element count
	jmp		.tail_check

.scalar:
	; Same operation order as the vector loop, one element at a time.
	vmovss	xmm0, [esi + ecx * 4]
	vmulss	xmm0, xmm0, [edi + ecx * 4]
	vmovss	xmm1, [ebx + ecx * 4]
	vaddss	xmm1, xmm1, xmm0
	vmovss	[ebx + ecx * 4], xmm1
	inc		ecx
.tail_check:
	cmp		ecx, edx
	jb		.scalar

.end:
	vzeroupper							; leave upper ymm halves clean for SSE code in the caller
	pop		ebx
	pop		edi
	pop		esi
	mov		esp, ebp
	pop		ebp
	ret