|
|
@@ -59,11 +59,11 @@ |
|
|
|
# longer. A CPU with higher pclmulqdq issue rate would also benefit |
|
|
|
# from higher aggregate factor... |
|
|
|
# |
|
|
|
# Westmere 1.76(+14%) |
|
|
|
# Sandy Bridge 1.79(+9%) |
|
|
|
# Ivy Bridge 1.79(+8%) |
|
|
|
# Westmere 1.78(+13%) |
|
|
|
# Sandy Bridge 1.80(+8%) |
|
|
|
# Ivy Bridge 1.80(+7%) |
|
|
|
# Haswell 0.55(+93%) (if system doesn't support AVX) |
|
|
|
# Bulldozer 1.52(+25%) |
|
|
|
# Bulldozer 1.49(+27%) |
|
|
|
|
|
|
|
# March 2013 |
|
|
|
# |
|
|
@@ -673,8 +673,8 @@ $code.=<<___; |
|
|
|
pxor $Xl,$Xm |
|
|
|
pclmulqdq \$0x00,$Hkey2,$Xl |
|
|
|
pclmulqdq \$0x11,$Hkey2,$Xh |
|
|
|
xorps $Xl,$Xln |
|
|
|
pclmulqdq \$0x10,$HK,$Xm |
|
|
|
xorps $Xl,$Xln |
|
|
|
xorps $Xh,$Xhn |
|
|
|
movups 0x50($Htbl),$HK |
|
|
|
xorps $Xm,$Xmn |
|
|
@@ -692,8 +692,8 @@ $code.=<<___; |
|
|
|
pshufd \$0b01001110,$Xi,$T1 |
|
|
|
pxor $Xi,$T1 |
|
|
|
pclmulqdq \$0x11,$Hkey3,$Xh |
|
|
|
xorps $Xl,$Xln |
|
|
|
pclmulqdq \$0x00,$HK,$Xm |
|
|
|
xorps $Xl,$Xln |
|
|
|
xorps $Xh,$Xhn |
|
|
|
|
|
|
|
lea 0x40($inp),$inp |
|
|
@@ -711,23 +711,23 @@ $code.=<<___; |
|
|
|
xorps $Xln,$Xi |
|
|
|
movdqu 0x20($inp),$Xln |
|
|
|
movdqa $Xl,$Xh |
|
|
|
pshufd \$0b01001110,$Xl,$Xm |
|
|
|
pclmulqdq \$0x10,$HK,$T1 |
|
|
|
pshufd \$0b01001110,$Xl,$Xm |
|
|
|
xorps $Xhn,$Xhi |
|
|
|
pxor $Xl,$Xm |
|
|
|
pshufb $T3,$Xln |
|
|
|
movups 0x20($Htbl),$HK |
|
|
|
pclmulqdq \$0x00,$Hkey,$Xl |
|
|
|
xorps $Xmn,$T1 |
|
|
|
movdqa $Xln,$Xhn |
|
|
|
pclmulqdq \$0x00,$Hkey,$Xl |
|
|
|
pshufd \$0b01001110,$Xln,$Xmn |
|
|
|
|
|
|
|
pxor $Xi,$T1 # aggregated Karatsuba post-processing |
|
|
|
pxor $Xln,$Xmn |
|
|
|
movdqa $Xln,$Xhn |
|
|
|
pxor $Xhi,$T1 # |
|
|
|
pxor $Xln,$Xmn |
|
|
|
movdqa $T1,$T2 # |
|
|
|
pslldq \$8,$T1 |
|
|
|
pclmulqdq \$0x11,$Hkey,$Xh |
|
|
|
pslldq \$8,$T1 |
|
|
|
psrldq \$8,$T2 # |
|
|
|
pxor $T1,$Xi |
|
|
|
movdqa .L7_mask(%rip),$T1 |
|
|
@@ -736,8 +736,8 @@ $code.=<<___; |
|
|
|
|
|
|
|
pand $Xi,$T1 # 1st phase |
|
|
|
pshufb $T1,$T2 # |
|
|
|
pclmulqdq \$0x00,$HK,$Xm |
|
|
|
pxor $Xi,$T2 # |
|
|
|
pclmulqdq \$0x00,$HK,$Xm |
|
|
|
psllq \$57,$T2 # |
|
|
|
movdqa $T2,$T1 # |
|
|
|
pslldq \$8,$T2 |
|
|
@@ -764,32 +764,31 @@ $code.=<<___; |
|
|
|
movdqa $Xl,$Xh |
|
|
|
pxor $Xm,$Xmn |
|
|
|
pshufd \$0b01001110,$Xl,$Xm |
|
|
|
pxor $Xl,$Xm |
|
|
|
pclmulqdq \$0x00,$Hkey3,$Xl |
|
|
|
pxor $T2,$Xi # |
|
|
|
pxor $T1,$Xhi |
|
|
|
pxor $Xl,$Xm |
|
|
|
pclmulqdq \$0x00,$Hkey3,$Xl |
|
|
|
psrlq \$1,$Xi # |
|
|
|
pxor $Xhi,$Xi # |
|
|
|
movdqa $Xi,$Xhi |
|
|
|
pclmulqdq \$0x11,$Hkey3,$Xh |
|
|
|
xorps $Xl,$Xln |
|
|
|
pxor $Xhi,$Xi # |
|
|
|
pshufd \$0b01001110,$Xi,$T1 |
|
|
|
pxor $Xi,$T1 |
|
|
|
|
|
|
|
pclmulqdq \$0x00,$HK,$Xm |
|
|
|
xorps $Xh,$Xhn |
|
|
|
|
|
|
|
movdqa $Xi,$Xhi |
|
|
|
pshufd \$0b01001110,$Xi,$T1 |
|
|
|
pxor $Xi,$T1 |
|
|
|
|
|
|
|
lea 0x40($inp),$inp |
|
|
|
sub \$0x40,$len |
|
|
|
jnc .Lmod4_loop |
|
|
|
|
|
|
|
.Ltail4x: |
|
|
|
pclmulqdq \$0x00,$Hkey4,$Xi |
|
|
|
xorps $Xm,$Xmn |
|
|
|
pclmulqdq \$0x11,$Hkey4,$Xhi |
|
|
|
xorps $Xln,$Xi |
|
|
|
pclmulqdq \$0x10,$HK,$T1 |
|
|
|
xorps $Xm,$Xmn |
|
|
|
xorps $Xln,$Xi |
|
|
|
xorps $Xhn,$Xhi |
|
|
|
pxor $Xi,$Xhi # aggregated Karatsuba post-processing |
|
|
|
pxor $Xmn,$T1 |
|
|
@@ -852,13 +851,13 @@ $code.=<<___; |
|
|
|
|
|
|
|
pxor $Xln,$Xi # (H*Ii+1) + H^2*(Ii+Xi) |
|
|
|
pxor $Xhn,$Xhi |
|
|
|
movdqu ($inp),$Xhn # Ii |
|
|
|
movdqu ($inp),$T2 # Ii |
|
|
|
pxor $Xi,$T1 # aggregated Karatsuba post-processing |
|
|
|
pshufb $T3,$Xhn |
|
|
|
pshufb $T3,$T2 |
|
|
|
movdqu 16($inp),$Xln # Ii+1 |
|
|
|
|
|
|
|
pxor $Xhi,$T1 |
|
|
|
pxor $Xhn,$Xhi # "Ii+Xi", consume early |
|
|
|
pxor $T2,$Xhi # "Ii+Xi", consume early |
|
|
|
pxor $T1,$Xmn |
|
|
|
pshufb $T3,$Xln |
|
|
|
movdqa $Xmn,$T1 # |
|
|
@@ -885,9 +884,9 @@ $code.=<<___; |
|
|
|
pxor $T1,$Xhi # |
|
|
|
pxor $Xhn,$Xmn # |
|
|
|
|
|
|
|
pclmulqdq \$0x11,$Hkey,$Xhn ####### |
|
|
|
movdqa $Xi,$T2 # 2nd phase |
|
|
|
psrlq \$1,$Xi |
|
|
|
pclmulqdq \$0x11,$Hkey,$Xhn ####### |
|
|
|
pxor $T2,$Xhi # |
|
|
|
pxor $Xi,$T2 |
|
|
|
psrlq \$5,$Xi |
|
|
@@ -896,7 +895,6 @@ $code.=<<___; |
|
|
|
psrlq \$1,$Xi # |
|
|
|
pclmulqdq \$0x00,$HK,$Xmn ####### |
|
|
|
pxor $Xhi,$Xi # |
|
|
|
.byte 0x66,0x90 |
|
|
|
|
|
|
|
sub \$0x20,$len |
|
|
|
ja .Lmod_loop |
|
|
|