sha512-x86_64.pl: +15% better performance on Westmere and incidentally Atom.
Other Intel processors +5%, Opteron -2%.
This commit is contained in:
parent
819cf4b886
commit
d2fd65f6f6
1 changed files with 44 additions and 50 deletions
|
@@ -95,50 +95,44 @@ sub ROUND_00_15()
|
|||
{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
|
||||
|
||||
$code.=<<___;
|
||||
mov $e,$a0
|
||||
mov $e,$a1
|
||||
ror \$`$Sigma1[2]-$Sigma1[1]`,$a0
|
||||
mov $f,$a2
|
||||
|
||||
ror \$$Sigma1[0],$a0
|
||||
ror \$$Sigma1[1],$a1
|
||||
xor $g,$a2 # f^g
|
||||
|
||||
xor $a1,$a0
|
||||
ror \$`$Sigma1[2]-$Sigma1[1]`,$a1
|
||||
and $e,$a2 # (f^g)&e
|
||||
mov $T1,`$SZ*($i&0xf)`(%rsp)
|
||||
|
||||
xor $a1,$a0 # Sigma1(e)
|
||||
xor $g,$a2 # Ch(e,f,g)=((f^g)&e)^g
|
||||
ror \$`$Sigma0[2]-$Sigma0[1]`,$a1
|
||||
xor $e,$a0
|
||||
xor $g,$a2 # f^g
|
||||
|
||||
ror \$`$Sigma1[1]-$Sigma1[0]`,$a0
|
||||
add $h,$T1 # T1+=h
|
||||
xor $a,$a1
|
||||
|
||||
mov $a,$h
|
||||
add $a0,$T1 # T1+=Sigma1(e)
|
||||
|
||||
add $a2,$T1 # T1+=Ch(e,f,g)
|
||||
mov $a,$a0
|
||||
mov $a,$a1
|
||||
|
||||
ror \$$Sigma0[0],$h
|
||||
ror \$$Sigma0[1],$a0
|
||||
mov $a,$a2
|
||||
add ($Tbl,$round,$SZ),$T1 # T1+=K[round]
|
||||
and $e,$a2 # (f^g)&e
|
||||
mov $b,$h
|
||||
|
||||
xor $a0,$h
|
||||
ror \$`$Sigma0[2]-$Sigma0[1]`,$a0
|
||||
or $c,$a1 # a|c
|
||||
ror \$`$Sigma0[1]-$Sigma0[0]`,$a1
|
||||
xor $e,$a0
|
||||
xor $g,$a2 # Ch(e,f,g)=((f^g)&e)^g
|
||||
|
||||
xor $c,$h # b^c
|
||||
xor $a,$a1
|
||||
add $a2,$T1 # T1+=Ch(e,f,g)
|
||||
mov $b,$a2
|
||||
|
||||
ror \$$Sigma1[0],$a0 # Sigma1(e)
|
||||
and $a,$h # h=(b^c)&a
|
||||
and $c,$a2 # b&c
|
||||
|
||||
ror \$$Sigma0[0],$a1 # Sigma0(a)
|
||||
add $a0,$T1 # T1+=Sigma1(e)
|
||||
add $a2,$h # h+=b&c (completes +=Maj(a,b,c)
|
||||
|
||||
xor $a0,$h # h=Sigma0(a)
|
||||
and $c,$a2 # a&c
|
||||
add $T1,$d # d+=T1
|
||||
|
||||
and $b,$a1 # (a|c)&b
|
||||
add $T1,$h # h+=T1
|
||||
|
||||
or $a2,$a1 # Maj(a,b,c)=((a|c)&b)|(a&c)
|
||||
lea 1($round),$round # round++
|
||||
add $a1,$h # h+=Sigma0(a)
|
||||
|
||||
add $a1,$h # h+=Maj(a,b,c)
|
||||
___
|
||||
}
|
||||
|
||||
|
@@ -147,32 +141,30 @@ sub ROUND_16_XX()
|
|||
|
||||
$code.=<<___;
|
||||
mov `$SZ*(($i+1)&0xf)`(%rsp),$a0
|
||||
mov `$SZ*(($i+14)&0xf)`(%rsp),$T1
|
||||
|
||||
mov $a0,$a2
|
||||
mov `$SZ*(($i+14)&0xf)`(%rsp),$a1
|
||||
mov $a0,$T1
|
||||
mov $a1,$a2
|
||||
|
||||
ror \$`$sigma0[1]-$sigma0[0]`,$T1
|
||||
xor $a0,$T1
|
||||
shr \$$sigma0[2],$a0
|
||||
ror \$$sigma0[0],$a2
|
||||
|
||||
xor $a2,$a0
|
||||
ror \$`$sigma0[1]-$sigma0[0]`,$a2
|
||||
ror \$$sigma0[0],$T1
|
||||
xor $T1,$a0 # sigma0(X[(i+1)&0xf])
|
||||
mov `$SZ*(($i+9)&0xf)`(%rsp),$T1
|
||||
|
||||
xor $a2,$a0 # sigma0(X[(i+1)&0xf])
|
||||
mov $T1,$a1
|
||||
|
||||
shr \$$sigma1[2],$T1
|
||||
ror \$$sigma1[0],$a1
|
||||
|
||||
xor $a1,$T1
|
||||
ror \$`$sigma1[1]-$sigma1[0]`,$a1
|
||||
|
||||
xor $a1,$T1 # sigma1(X[(i+14)&0xf])
|
||||
ror \$`$sigma1[1]-$sigma1[0]`,$a2
|
||||
xor $a1,$a2
|
||||
shr \$$sigma1[2],$a1
|
||||
|
||||
ror \$$sigma1[0],$a2
|
||||
add $a0,$T1
|
||||
|
||||
add `$SZ*(($i+9)&0xf)`(%rsp),$T1
|
||||
xor $a2,$a1 # sigma1(X[(i+14)&0xf])
|
||||
|
||||
add `$SZ*($i&0xf)`(%rsp),$T1
|
||||
mov $e,$a0
|
||||
add $a1,$T1
|
||||
mov $a,$a1
|
||||
___
|
||||
&ROUND_00_15(@_);
|
||||
}
|
||||
|
@@ -219,6 +211,8 @@ $func:
|
|||
___
|
||||
for($i=0;$i<16;$i++) {
|
||||
$code.=" mov $SZ*$i($inp),$T1\n";
|
||||
$code.=" mov @ROT[4],$a0\n";
|
||||
$code.=" mov @ROT[0],$a1\n";
|
||||
$code.=" bswap $T1\n";
|
||||
&ROUND_00_15($i,@ROT);
|
||||
unshift(@ROT,pop(@ROT));
|
||||
|
|
Loading…
Reference in a new issue