Vectorisation - Parcimonie

There's art to all this madness
Tho' it seems insane to you
Have you any imagination
Of what I'm goin' through

Michael Jackson, Art of Madness

This page shows how to optimize the computation of fitch's score when dealing with maximum parsimony. We use the SSE instructions implemented on Intel or AMD processors to speed up the computations (see this page for time improvement for different architectures).

source code is available :

for Unix/Linux using gcc and nasm : popcount.tgz
for Windows 98/XP : popcount_win.zip, you might consider using dev-cpp and nasm for dos 32 bits if you want to compile those files.

Algorithme de base / basic algorithm

int parsimony(char *x, char *y, char *z, int size) {
  int i, changes=0;

  for (i = 0; i < size; i++) {
    z[i] = x[i] & y[i];
    if (z[i] == 0) {
      z[i] = x[i] | y[i];
      ++changes;
    }
  }
  return changes;
}

Amélioration : suppression du if (version 1)

int parsimony(char *x, char *y, char *z, int n) {
	int i, changes=0;
	for (i=0; i < size; ++i) {
		char x_, y_;
		int c;
		x_ = x[i] & y[i];
		y_ = x[i] | y[i];
		z[i] = x_ | ((!x_) * y_);
		c = ((!x_) & 1);
		changes += c;
	}
	return changes;
}

Amélioration : suppression du if (version 2)

int parsimony(char *x, char *y, char *z, int n) {
	int i, changes=0;
	for (i=0; i < size; ++i) {
		char x_, y_;
		int c;
		x_ = x[i] & y[i];
		y_ = x[i] | y[i];
		z[i] = (x_ == 0) ? y_ : x_;  // change here !
		c = ((!x_) & 1);
		changes += c;
	}
	return changes;
}

Amélioration : vectorisation (version 1) / Improvement : vectorization

On choisit de stocker / we choose to store :

x dans / in esi
y dans / in edi
z dans / in ebx

comportement des opérations sse sur les registres xmm
behaviour of the sse operations over xmm registers
(les valeurs sont données en hexadécimal
values are given in hexadecimal)
registre / register	0	1	2	3	4	5	6	7	8	9	10	11	12	13	14	15
p	02	10	08	02	20	02	10	01	08	02	04	02	04	02	20	10
q	01	01	02	10	02	20	08	20	08	10	10	02	20	02	01	04
p & q	00	00	00	00	00	00	00	00	08	00	00	02	00	02	00	00
p \| q	03	11	0A	12	22	22	18	21	08	12	14	02	24	02	21	14
pcmpeqb xmm4,xmm3	FF	FF	FF	FF	FF	FF	FF	FF	00	FF	FF	00	FF	00	FF	FF
pmovmskb edx,xmm4	1	1	1	1	1	1	1	1	0	1	1	0	1	0	1	1
not	0000.0000b = 00h = 0 #=0								1001.0100b = 94h = 148 #=3
	1111.1111b = FFh = 255								0110.1011b = D6h = 214

section .data

; 128 bits set to 1, this value will be used to get a
; complementary value
align 16
one: dd 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF

section .code

;
; only valid for 16x data
;

func_byte_vect1_16x:
	push 	ebp	
	mov	ebp,esp
	push	esi
	push	edi
	push	ebx
	push	ecx
	push	edx

	mov	esi,[ebp+8]  ; x
	mov	edi,[ebp+12] ; y
	mov	ebx,[ebp+16] ; z

	xor	eax,eax  ; fitch's score
        xor	ecx,ecx  ; i

	movdqu	xmm6,[one]

vect1_loop:
	movdqa	xmm0,[esi+ecx] ; xmm0 <- p (x)

	movdqa	xmm2,[edi+ecx] ; xmm2 <- q (y)
	movdqa	xmm1,xmm0      ; xmm1 <- p
	pxor	xmm3,xmm3
	pxor	xmm5,xmm5
	pand	xmm0,xmm2      ; xmm0 <- p & q
	por	xmm1,xmm2      ; xmm1 <- p | q
	movdqa	xmm4,xmm0      ; xmm4 <- p & q
	pcmpeqb xmm4,xmm3      ; xmm4[i] ?= 0 for all i
	pmovmskb edx,xmm4

	movdqa	xmm5,xmm6      ; xmm5 = 0xFF...F
	pxor	xmm5,xmm4      ; xmm5 = ~xmm4

	pand	xmm1,xmm4
	pand	xmm0,xmm5
	por	xmm0,xmm1

	movdqa	[ebx+ecx],xmm0 ; z

	; eax contains fitch score
	; compute number of bits set to 1
	; bt will set carry flag to 1 if bit is set
	bt      edx,0
	adc	eax,0
	bt      edx,1
	adc	eax,0
	bt      edx,2
	adc	eax,0
	bt      edx,3
	adc	eax,0
	bt      edx,4
	adc	eax,0
	bt      edx,5
	adc	eax,0
	bt      edx,6
	adc	eax,0
	bt      edx,7

	adc	eax,0
	bt      edx,8
	adc	eax,0
	bt      edx,9
	adc	eax,0
	bt      edx,10
	adc	eax,0
	bt      edx,11
	adc	eax,0
	bt      edx,12
	adc	eax,0
	bt      edx,13
	adc	eax,0
	bt      edx,14
	adc	eax,0
	bt      edx,15
	adc	eax,0

	add	ecx,16
	cmp	ecx,[ebp+20]
	jl 	vect1_loop

	pop	edx
	pop	ecx
	pop	ebx
	pop	edi
	pop	esi

	mov	esp,ebp
	pop	ebp
	ret

PCMPEQB xmmDst,xmmSrc/m128
(Packed compare if equal to byte)
compare chacun des 16 octets des registres xmm. Si Src[i] == Dst[i] alors Dst[i] = 0xFF, sinon Dst[i] = 0x00. /
compare each of the 16 bytes of xmm register. if Src[i] == Dst[i] then Dst[i] = 0xFF else Dst[i] = 0x00.
PMOVMSKB r(32/64), mmxSrc
(Extract Packed Byte Mask)
copie les bits de poids fort de chaque octet dans le registre spécifié. /
copy the left most bit of each byte to the given register

Amélioration : vectorisation (version 2) / Improvement

La partie qui pose le plus de problème concerne le calcul du nombre de bits à 1 dans le registre edx après exécution de l'instruction pmovmskb (cf. ci-après popcount = population count). L'utilisation de l'instruction bt associée à adc ralentit considérablement les calculs. On envisage donc d'utiliser une table de conversion.

The most difficult problem is how can we find the number of bits set to 1 in register edx (see popcount below = population count). The instruction pair bt and adc is not efficient enough, so we decide to use a conversion table.

;
; only valid for 16x data
;

func_byte_vect2_16x:
	push 	ebp	
	mov	ebp,esp
	push	esi
	push	edi
	push	ebx
	push	ecx
	push	edx

	mov	esi,[ebp+8]  ; x
	mov	edi,[ebp+12] ; y
	mov	ebx,[ebp+16] ; z

	xor	eax,eax
	xor	ecx,ecx

	movdqu	xmm6,[one]

vect2_loop:
	movdqa	xmm0,[esi+ecx] ; xmm0 <- p (x)

	movdqa	xmm2,[edi+ecx] ; xmm2 <- q (y)
	movdqa	xmm1,xmm0      ; xmm1 <- p
	pxor	xmm3,xmm3
	pxor	xmm5,xmm5
	pand	xmm0,xmm2      ; xmm0 <- p & q
	por	xmm1,xmm2      ; xmm1 <- p | q
	movdqa	xmm4,xmm0      ; xmm4 <- p & q
	pcmpeqb xmm4,xmm3      ; xmm4[i]
	pmovmskb edx,xmm4

	movdqa	xmm5,xmm6
	pxor	xmm5,xmm4      ; xmm5 = ~xmm4

	pand	xmm1,xmm4
	pand	xmm0,xmm5
	por	xmm0,xmm1

	movdqa	[ebx+ecx],xmm0 ; put result in z


	; improvement to compute number of bits set to 1 in edx
	; use a table to convert dl (and then dh) into number of
	; bits

	push	ebx     ; temporary use of ebx, so save ebx in stack
	mov	ebx,edx
	xor	edx,edx
	mov	dl,bl   ; get first lower 8 bits
	mov	dl,[translation_table+edx] 
	; note : we could use movzx edx,byte [translation_table+edx] 
        ; but it seems slower than the mov dl,[translation_table+edx] 
	; on Core 2 Duo architecture
	add	eax,edx
	mov	dl,bh   ; get upper 8 bits
	mov	dl,[translation_table+edx]
	add	eax,edx
	pop	ebx

	
	add	ecx,16
	cmp	ecx,[ebp+20]
	jl 	vect2_loop

	pop	edx
	pop	ecx
	pop	ebx
	pop	edi
	pop	esi

	mov	esp,ebp
	pop	ebp
	ret

translation_table:
db 0, 1, 1, 2, 1, 2, 2, 3
db 1, 2, 2, 3, 2, 3, 3, 4
db 1, 2, 2, 3, 2, 3, 3, 4
db 2, 3, 3, 4, 3, 4, 4, 5
db 1, 2, 2, 3, 2, 3, 3, 4
db 2, 3, 3, 4, 3, 4, 4, 5
db 2, 3, 3, 4, 3, 4, 4, 5
db 3, 4, 4, 5, 4, 5, 5, 6
db 1, 2, 2, 3, 2, 3, 3, 4
db 2, 3, 3, 4, 3, 4, 4, 5
db 2, 3, 3, 4, 3, 4, 4, 5
db 3, 4, 4, 5, 4, 5, 5, 6
db 2, 3, 3, 4, 3, 4, 4, 5
db 3, 4, 4, 5, 4, 5, 5, 6
db 3, 4, 4, 5, 4, 5, 5, 6
db 4, 5, 5, 6, 5, 6, 6, 7
db 1, 2, 2, 3, 2, 3, 3, 4
db 2, 3, 3, 4, 3, 4, 4, 5
db 2, 3, 3, 4, 3, 4, 4, 5
db 3, 4, 4, 5, 4, 5, 5, 6
db 2, 3, 3, 4, 3, 4, 4, 5
db 3, 4, 4, 5, 4, 5, 5, 6
db 3, 4, 4, 5, 4, 5, 5, 6
db 4, 5, 5, 6, 5, 6, 6, 7
db 2, 3, 3, 4, 3, 4, 4, 5
db 3, 4, 4, 5, 4, 5, 5, 6
db 3, 4, 4, 5, 4, 5, 5, 6
db 4, 5, 5, 6, 5, 6, 6, 7
db 3, 4, 4, 5, 4, 5, 5, 6
db 4, 5, 5, 6, 5, 6, 6, 7
db 4, 5, 5, 6, 5, 6, 6, 7
db 5, 6, 6, 7, 6, 7, 7, 8

Quelques latences d'instructions (source Athlon 64 X2) :

pand xmm1,xmm2 : 2 cycles
por xmm1,xmm2 : 2 cycles
pxor xmm1,xmm2 : 2 cycles
movdqa xmm1, mem128 : 2 cycles
pcmpeqb xmm1,xmm2 : 2 cycles
pmovmskb reg32,xmm1 : 3 cycles

Résultats (parsi_c)

temps d'exécution sur Core i7 2600 k
méthode	temps (s)	gain
C (basique)	43.090	-
C (optimisé 1)	39.770	7%
C (optimisé 2)	29.310	32%
assembleur 1	25.360	41%
assembleur 2	21.750	51%
assembleur 3	21.420	51%
SSE	2.230	95%

assembleur 1, 2 et 3 sont des versions assembleur des versions optimisées

Résultats (versions SSE)

temps d'exécution sur Core i7 2600 k
méthode	temps (s)
version 1	54.520
version 2	48.590
version 3	45.640
version 4 (optimized)	44.600
version 5 (optimized AVX)	45.190

Compter le nombre de bits à 1 : popcount

La fonction qui consiste à compter le nombre de bits à 1 stockés dans un mot (WORD : 16 bits) ou un double mot (DWORD : 32 bits) s'appelle popcount. Elle n'a été implantée nativement que sur certains processeurs et peut être étendue à 64 ou 128 bits.

Elle devrait normalement être implantée au niveau des instructions SSSE4 (Supplementary SSE3 = SSE4) chez Intel à partir du Core 2 Nehalem (fin 2008) et sera identifiée par le mnémonique POPCNT.

On peut cependant implanter en C ou en assembleur cette fonction. Voici quelques exemples d'implantations trouvés sur le net. Note : this code was found on internet.

/* very naive : student's style ;-) */

int popcount0(unsigned long x) {
  int n=0;
  while (x) {
    if ((x&1)) ++n;
    x = (x >> 1);
  }
  return n;
}

/* basic implementation */

int popcount1(unsigned long x) {   
  int n = 0;
  if (x) do ++n; while ((x &= x-1)!=0);
  return n;
}


/* optimized version for all kind of architectures */

int popcount2(unsigned long x) {   
  x = ((x>>1) & 0x55555555) + (x & 0x55555555);
  x = ((x>>2) & 0x33333333) + (x & 0x33333333);
  return (((x>>4)+x) & 0x0F0F0F0F) % 255;
}

/* use of a conversion table */

int popcount3(unsigned long x) {   
  int n = 0;
  do n += "\0\1\1\2\1\2\2\3\1\2\2\3\2\3\3\4"	\
    "\1\2\2\3\2\3\3\4\2\3\3\4\3\4\4\5"		\
    "\1\2\2\3\2\3\3\4\2\3\3\4\3\4\4\5"		\
    "\2\3\3\4\3\4\4\5\3\4\4\5\4\5\5\6"		\
    "\1\2\2\3\2\3\3\4\2\3\3\4\3\4\4\5"		\
    "\2\3\3\4\3\4\4\5\3\4\4\5\4\5\5\6"		\
    "\2\3\3\4\3\4\4\5\3\4\4\5\4\5\5\6"		\
    "\3\4\4\5\4\5\5\6\4\5\5\6\5\6\6\7"		\
    "\1\2\2\3\2\3\3\4\2\3\3\4\3\4\4\5"		\
    "\2\3\3\4\3\4\4\5\3\4\4\5\4\5\5\6"		\
    "\2\3\3\4\3\4\4\5\3\4\4\5\4\5\5\6"		\
    "\3\4\4\5\4\5\5\6\4\5\5\6\5\6\6\7"		\
    "\2\3\3\4\3\4\4\5\3\4\4\5\4\5\5\6"		\
    "\3\4\4\5\4\5\5\6\4\5\5\6\5\6\6\7"		\
    "\3\4\4\5\4\5\5\6\4\5\5\6\5\6\6\7"		\
    "\4\5\5\6\5\6\6\7\5\6\6\7\6\7\7\10" [x & 0xFF];
  while ((x >>= 8)!=0);
  return n;
}

/* improvement of popcount3 */

int popcount4(unsigned long x) {   
  static char __attribute__((aligned(16))) *table4 = 
    "\0\1\1\2\1\2\2\3\1\2\2\3\2\3\3\4"		\
    "\1\2\2\3\2\3\3\4\2\3\3\4\3\4\4\5"		\
    "\1\2\2\3\2\3\3\4\2\3\3\4\3\4\4\5"		\
    "\2\3\3\4\3\4\4\5\3\4\4\5\4\5\5\6"		\
    "\1\2\2\3\2\3\3\4\2\3\3\4\3\4\4\5"		\
    "\2\3\3\4\3\4\4\5\3\4\4\5\4\5\5\6"		\
    "\2\3\3\4\3\4\4\5\3\4\4\5\4\5\5\6"		\
    "\3\4\4\5\4\5\5\6\4\5\5\6\5\6\6\7"		\
    "\1\2\2\3\2\3\3\4\2\3\3\4\3\4\4\5"		\
    "\2\3\3\4\3\4\4\5\3\4\4\5\4\5\5\6"		\
    "\2\3\3\4\3\4\4\5\3\4\4\5\4\5\5\6"		\
    "\3\4\4\5\4\5\5\6\4\5\5\6\5\6\6\7"		\
    "\2\3\3\4\3\4\4\5\3\4\4\5\4\5\5\6"		\
    "\3\4\4\5\4\5\5\6\4\5\5\6\5\6\6\7"		\
    "\3\4\4\5\4\5\5\6\4\5\5\6\5\6\6\7"		\
    "\4\5\5\6\5\6\6\7\5\6\6\7\6\7\7\10" ;
  int n = 0;
  n =  table4[x & 0xFF];
  x >>= 8;
  n +=  table4[x & 0xFF];
  x >>= 8;
  n +=  table4[x & 0xFF];
  x >>= 8;
  n +=  table4[x & 0xFF];
  return n;
}

Amélioration	Pentium-M	Pentium III	Pentium 4	Pentium D	C2D E6420	Athlon 64	Sempron
popcount 0	44		45.66	42.87	30.41	30.27	38.78
popcount 1	25		21.18	23.10	19.06	22.08	28.63
popcount 2	12		20.50²	9.76	8.08	9.51	12.33
popcount 3	7¹		9.76	8.12	5.27	7.16	9.35
popcount 4 (/popcount0)	6 (-86%)		8.68 (-81%)	7.78 (-81%)	5.29 (-82%)	6.38 (-78%)	8.27 (-78%)
popcount 5 (popcount3_1)	21		10.07	8.08	21.27	7.14	9.31
popcount 6 (popcount3_2)	7		9.92	8.23	6.02	7.16	9.28
popcount 7 (popcount2_1) mul	12		13.86	10.04	9.48	9.06	11.74
popcount 8 (popcount2_2) div	17		20.47	20.73	12.93	25.81	33.46
POPCNT	NA	NA	NA	NA	NA	NA	NA

Temps en secondes pour 1000 itérations pour vérification 0 à 1.000.000
les sources ont été compilés uniquement avec l'option -O2 de gcc
Amélioration	Pentium II	P9500
popcount 0	320	23.81
popcount 1	170	15.51
popcount 2	103	7.25
popcount 3	156	4.20
popcount 4	62 (-80%)	4.70 (-80%)
popcount 5	156	4.86
popcount 6	75	4.86
popcount 7	101	7.43
popcount 8	160	6.58

Pentium II Celeron, 300 Mhz, 640 Mo SDRAM, 4 Go disque dur (Solo 2500, Gateway, 1999, under Windows 98)
Pentium III Tualatin, 1.4 Ghz, FSB 100/133 Mhz, 256 Mo SDRAM PC 100
Pentium-M, 2.0 Ghz, FSB 533 Mhz, 1 Go DDR-SDRAM
Pentium 4 Northwood, 2.8 Ghz, FSB 800 Mhz, 256 Mo DDR-SDRAM PC3200
Pentium D 830 Smithfield, 3 Ghz, FSB 800 Mhz, 1 Go DDR2-SDRAM PC4300
Core 2 Duo E6420, 2.13 Ghz, FSB 1066 Mhz, 2 Go DDR2-SDRAM PC 6400
Athlon 64 X2 4200+, 4.2 Ghz (2.2 Ghz), FSB 400 Mhz, 512 Mo DDR-SDRAM PC 3200
Sempron 3000+, FSB 400 Mhz, 512 Mo DDR-SDRAM PC 3200

Quelques commentaires :

une implantation naive peut parfois être grandement améliorée
les implantations les plus efficaces sont popcount3 ou 4
utiliser la division entière avec parcimonie sur les Athlon 64 / Sempron car elle est 3 fois plus pénalisante qu'une multiplication (cf. popcount7 / popcount8)

(1) Note : en compilant sous Ubuntu Linux avec gcc version 4.1.2 avec un Pentium-M, je me suis aperçu que si on utilisait les options -march=pentium-m -mtune=pentium-m on obtenait pour popcount 3 un temps d'exécution de 23s, alors qu'en supprimant ces options, on obtient un temps d'exécution de 8s !! Voici le code de chaque sous-programme (objdump -d -j .text -M intel popcount.exe) :

23s -march=pentium-m 8s

popcount3_1

  push   ebp
  xor    ecx,ecx
  mov    ebp,esp
  mov    edx,DWORD PTR [ebp+8]
  nop    
  lea    esi,[esi]
L1:
  movzx  eax,dl
  movsx  eax,BYTE PTR [eax+0x8048bd0]
  add    ecx,eax
  shr    edx,0x8
  jne    L1
  pop    ebp
  mov    eax,ecx
  ret

popcount3_2

  push   ebp
  xor    ecx,ecx
  mov    ebp,esp
  mov    edx,DWORD PTR [ebp+8]


L1:
  movzx  eax,dl
  movsx  eax,BYTE PTR [eax+0x8048bc0]
  shr    edx,0x8
  add    ecx,eax
  test   edx,edx
  jne    L1
  pop    ebp
  mov    eax,ecx
  ret

Restait à déterminer si les 2 instructions nop et lea étaient pénalisantes ou si les instructions à l'intérieur de la boucle L1 influençaient le temps d'exécution. J'ai donc réécrit popcount3_1 en assembleur en supprimant les 2 instructions étranges (en fait elles permettent de réaliser l'alignement des instructions de la boucle L1).

Supprimer les 2 instructions permet de gagner moins d'une seconde. En fait, l'introduction d'une instruction test edx,edx ou cmp edx,0 avant l'instruction jne L1 permet de diminuer le temps d'exécution à 8s ! Pretty strange isn't it !

Ce phénomène apparaît sur Pentium-M 760 et Core 2 Duo E6420, mais pas sur Pentium 4 ou Athlon 64. Il s'agit en fait d'un EFLAGS partial register stall (cf chapitre 6). Il s'agit probablement d'un bug corrigé depuis, car il ne se reproduit pas sur un Core 2 Duo E6750.

(2) sur Pentium 4 avec gcc 4.0.2, en fonction que l'on utilise ou pas l'option -march=pentium4, on obtient une version de popcount2 avec multiplication (13s) ou division (20s) .

13s 20s

popcount2_1

push   ebp
mov    ebp,esp
push   ebx
mov    eax,DWORD PTR [ebp+8]
mov    ecx,eax
shr    ecx,1
and    ecx,0x55555555
and    eax,0x55555555
add    ecx,eax
mov    eax,ecx
shr    eax,0x2
and    eax,0x33333333
and    ecx,0x33333333
add    eax,ecx
mov    ecx,eax
shr    ecx,0x4
lea    ebx,[eax+ecx]
and    ebx,0xf0f0f0f
mov    edx,0x80808081
mov    eax,ebx
mul    edx
shr    edx,0x7
mov    ecx,edx
shl    ecx,0x8
sub    ecx,edx
sub    ebx,ecx
mov    eax,ebx
pop    ebx
pop    ebp
ret

popcount2_2

push   ebp
mov    ebp,esp
mov    eax,DWORD PTR [ebp+8]
mov    edx,eax
shr    edx,1
and    edx,0x55555555
and    eax,0x55555555
add    edx,eax
mov    eax,edx
shr    eax,0x2
and    eax,0x33333333
and    edx,0x33333333
add    eax,edx
mov    edx,eax
shr    edx,0x4
add    eax,edx
and    eax,0xf0f0f0f
mov    edx,0xff
mov    ecx,edx
xor    edx,edx
div    ecx
mov    eax,edx
pop    ebp
ret

Les latences des instructions div et mul sur Athlon 64 sont :

div mreg 16/32/64 : 23/39/71 cycles
mul mreg32 : 3 cycles

Page de Jean-Michel Richer