;-----------------------------------------------------------------------
; Syntax: NASM (Intel operand order).
; Target: 32-bit x86, arguments passed on the stack (C calling convention).
;-----------------------------------------------------------------------

        global  char_replace_sse20

; Argument accessors; valid only after the `push ebp / mov ebp, esp` frame
; has been established.
; NOTE(review): this file uses PARAM_* but does not define them, so they are
; presumably supplied by the build (-D / pre-include) -- confirm.  The
; fallbacks below assume the usual cdecl stack layout for the prototype
; documented on the function; an externally supplied definition always wins.
%ifndef PARAM_DST
%define PARAM_DST       [ebp + 8]
%endif
%ifndef PARAM_SRC
%define PARAM_SRC       [ebp + 12]
%endif
%ifndef PARAM_SIZE
%define PARAM_SIZE      [ebp + 16]
%endif
%ifndef PARAM_C
%define PARAM_C         [ebp + 20]
%endif
%ifndef PARAM_D
%define PARAM_D         [ebp + 24]
%endif

        section .text

;-----------------------------------------------------------------------
; int char_replace_sse20(char *dst, char *src, int n, char c, char d);
;
; Copies n bytes from src to dst, writing d wherever the source byte
; equals c, and returns how many bytes were replaced.
;
; In:     dst, src, n, c, d on the stack (see PARAM_* above).
;         dst/src need no particular alignment.  dst == src works because
;         every block/byte is loaded before it is stored.
;         n <= 0 copies nothing and returns 0.
; Out:    eax = number of replaced bytes
; Clobb:  ecx, edx, xmm0-xmm4, flags
; Saved:  ebx, esi, edi, ebp (pushed/popped here)
; CPU:    SSE2 plus POPCNT (popcnt is NOT part of SSE2).
;
; Register roles:
;   esi  = source cursor            edi  = destination cursor
;   edx  = 16-byte blocks left, later tail bytes left
;   eax  = running replacement count
;   ebx  = per-block match mask / its population count
;   cl   = current byte in the scalar tail loop
;   xmm2 = c in all 16 byte lanes   xmm3 = d in all 16 byte lanes
;-----------------------------------------------------------------------
char_replace_sse20:
        push    ebp
        mov     ebp, esp
        push    esi
        push    edi
        push    ebx

        ; xmm2 = [c, c, ..., c]
        ; movd gives [c,0,0,0,...]; each punpcklbw doubles the run of c
        ; (-> [c,c,0,0,...] -> [c,c,c,c,...]); pshufd replicates dword 0.
        movzx   eax, byte PARAM_C
        movd    xmm2, eax
        punpcklbw xmm2, xmm2
        punpcklbw xmm2, xmm2
        pshufd  xmm2, xmm2, 0

        ; xmm3 = [d, d, ..., d], built the same way.
        movzx   eax, byte PARAM_D
        movd    xmm3, eax
        punpcklbw xmm3, xmm3
        punpcklbw xmm3, xmm3
        pshufd  xmm3, xmm3, 0

        xor     eax, eax                ; replacement count = 0
        xor     ecx, ecx                ; clear ecx; only cl is written below
        mov     esi, PARAM_SRC
        mov     edi, PARAM_DST

        mov     edx, PARAM_SIZE
        test    edx, edx
        jle     .end                    ; n is signed: reject n <= 0 before the
                                        ; logical shift turns it into a huge count
        shr     edx, 4                  ; edx = n / 16 whole blocks (sets ZF)
        jz      .next_x1

        ; ---- vector loop: 16 bytes per iteration; edx > 0 on entry ----
.loop_x16:
        movdqu  xmm0, [esi]             ; unaligned load: char* has no alignment guarantee
        movdqa  xmm1, xmm0              ; xmm1 = untouched copy of the source bytes

        pcmpeqb xmm0, xmm2              ; xmm0[i] = (src[i] == c) ? 0xFF : 0x00
        pmovmskb ebx, xmm0              ; ebx bit i = top bit of mask byte i (1 = match)

        ; Blend: result = (mask & d) | (~mask & src)
        movdqa  xmm4, xmm0              ; xmm4 = mask
        pand    xmm0, xmm3              ; xmm0 = d where matched, 0 elsewhere
        pandn   xmm4, xmm1              ; xmm4 = NOT(mask) & src = src where not matched
        por     xmm0, xmm4
        movdqu  [edi], xmm0             ; unaligned store, same reason as the load

        popcnt  ebx, ebx                ; number of matches in this block
        add     eax, ebx

        add     edi, 16
        add     esi, 16
        dec     edx
        jnz     .loop_x16

        ; ---- scalar tail: the remaining n % 16 bytes ----
.next_x1:
        mov     edx, PARAM_SIZE
        and     edx, 15                 ; edx = n % 16 (sets ZF)
        jz      .end

.loop_x1:
        mov     cl, byte [esi]
        cmp     cl, byte PARAM_C
        jne     .next
        mov     cl, byte PARAM_D        ; matched: substitute d and count it
        add     eax, 1
.next:
        mov     byte [edi], cl
        inc     esi
        inc     edi
        dec     edx
        jnz     .loop_x1

.end:
        pop     ebx
        pop     edi
        pop     esi
        mov     esp, ebp
        pop     ebp
        ret