sha512-x86_64.pl: improve performance by 15% on Westmere and, incidentally, on Atom.

Other Intel processors gain 5%; Opteron loses 2%.
This commit is contained in:
Andy Polyakov 2011-09-17 11:30:28 +00:00
parent 819cf4b886
commit d2fd65f6f6

View file

@ -95,50 +95,44 @@ sub ROUND_00_15()
{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; { my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
$code.=<<___; $code.=<<___;
mov $e,$a0 ror \$`$Sigma1[2]-$Sigma1[1]`,$a0
mov $e,$a1
mov $f,$a2 mov $f,$a2
ror \$$Sigma1[0],$a0
ror \$$Sigma1[1],$a1
xor $g,$a2 # f^g
xor $a1,$a0
ror \$`$Sigma1[2]-$Sigma1[1]`,$a1
and $e,$a2 # (f^g)&e
mov $T1,`$SZ*($i&0xf)`(%rsp) mov $T1,`$SZ*($i&0xf)`(%rsp)
xor $a1,$a0 # Sigma1(e) ror \$`$Sigma0[2]-$Sigma0[1]`,$a1
xor $g,$a2 # Ch(e,f,g)=((f^g)&e)^g xor $e,$a0
xor $g,$a2 # f^g
ror \$`$Sigma1[1]-$Sigma1[0]`,$a0
add $h,$T1 # T1+=h add $h,$T1 # T1+=h
xor $a,$a1
mov $a,$h
add $a0,$T1 # T1+=Sigma1(e)
add $a2,$T1 # T1+=Ch(e,f,g)
mov $a,$a0
mov $a,$a1
ror \$$Sigma0[0],$h
ror \$$Sigma0[1],$a0
mov $a,$a2
add ($Tbl,$round,$SZ),$T1 # T1+=K[round] add ($Tbl,$round,$SZ),$T1 # T1+=K[round]
and $e,$a2 # (f^g)&e
mov $b,$h
xor $a0,$h ror \$`$Sigma0[1]-$Sigma0[0]`,$a1
ror \$`$Sigma0[2]-$Sigma0[1]`,$a0 xor $e,$a0
or $c,$a1 # a|c xor $g,$a2 # Ch(e,f,g)=((f^g)&e)^g
xor $c,$h # b^c
xor $a,$a1
add $a2,$T1 # T1+=Ch(e,f,g)
mov $b,$a2
ror \$$Sigma1[0],$a0 # Sigma1(e)
and $a,$h # h=(b^c)&a
and $c,$a2 # b&c
ror \$$Sigma0[0],$a1 # Sigma0(a)
add $a0,$T1 # T1+=Sigma1(e)
add $a2,$h # h+=b&c (completes +=Maj(a,b,c)
xor $a0,$h # h=Sigma0(a)
and $c,$a2 # a&c
add $T1,$d # d+=T1 add $T1,$d # d+=T1
and $b,$a1 # (a|c)&b
add $T1,$h # h+=T1 add $T1,$h # h+=T1
or $a2,$a1 # Maj(a,b,c)=((a|c)&b)|(a&c)
lea 1($round),$round # round++ lea 1($round),$round # round++
add $a1,$h # h+=Sigma0(a)
add $a1,$h # h+=Maj(a,b,c)
___ ___
} }
@ -147,32 +141,30 @@ sub ROUND_16_XX()
$code.=<<___; $code.=<<___;
mov `$SZ*(($i+1)&0xf)`(%rsp),$a0 mov `$SZ*(($i+1)&0xf)`(%rsp),$a0
mov `$SZ*(($i+14)&0xf)`(%rsp),$T1 mov `$SZ*(($i+14)&0xf)`(%rsp),$a1
mov $a0,$T1
mov $a0,$a2 mov $a1,$a2
ror \$`$sigma0[1]-$sigma0[0]`,$T1
xor $a0,$T1
shr \$$sigma0[2],$a0 shr \$$sigma0[2],$a0
ror \$$sigma0[0],$a2
xor $a2,$a0 ror \$$sigma0[0],$T1
ror \$`$sigma0[1]-$sigma0[0]`,$a2 xor $T1,$a0 # sigma0(X[(i+1)&0xf])
mov `$SZ*(($i+9)&0xf)`(%rsp),$T1
xor $a2,$a0 # sigma0(X[(i+1)&0xf]) ror \$`$sigma1[1]-$sigma1[0]`,$a2
mov $T1,$a1 xor $a1,$a2
shr \$$sigma1[2],$a1
shr \$$sigma1[2],$T1
ror \$$sigma1[0],$a1
xor $a1,$T1
ror \$`$sigma1[1]-$sigma1[0]`,$a1
xor $a1,$T1 # sigma1(X[(i+14)&0xf])
ror \$$sigma1[0],$a2
add $a0,$T1 add $a0,$T1
xor $a2,$a1 # sigma1(X[(i+14)&0xf])
add `$SZ*(($i+9)&0xf)`(%rsp),$T1
add `$SZ*($i&0xf)`(%rsp),$T1 add `$SZ*($i&0xf)`(%rsp),$T1
mov $e,$a0
add $a1,$T1
mov $a,$a1
___ ___
&ROUND_00_15(@_); &ROUND_00_15(@_);
} }
@ -219,6 +211,8 @@ $func:
___ ___
for($i=0;$i<16;$i++) { for($i=0;$i<16;$i++) {
$code.=" mov $SZ*$i($inp),$T1\n"; $code.=" mov $SZ*$i($inp),$T1\n";
$code.=" mov @ROT[4],$a0\n";
$code.=" mov @ROT[0],$a1\n";
$code.=" bswap $T1\n"; $code.=" bswap $T1\n";
&ROUND_00_15($i,@ROT); &ROUND_00_15($i,@ROT);
unshift(@ROT,pop(@ROT)); unshift(@ROT,pop(@ROT));