bn/asm/sparcv9-mont.pl: fix squaring code path.

This module is used only with odd input lengths, i.e. not used in normal
PKI cases, on contemporary processors. The problem was "illuminated" by
fuzzing tests.

Reviewed-by: Richard Levitte <levitte@openssl.org>
This commit is contained in:
Andy Polyakov 2017-03-22 10:51:25 +01:00
parent 3ba4dac67a
commit 120a9e1a82

View file

@ -300,7 +300,7 @@ ___
######## .Lbn_sqr_mont gives up to 20% *overall* improvement over
######## code without following dedicated squaring procedure.
########
$sbit="%i2"; # re-use $bp!
$sbit="%o5";
$code.=<<___;
.align 32
@ -413,7 +413,7 @@ $code.=<<___;
mulx $apj,$mul0,$acc0
mulx $npj,$mul1,$acc1
add $acc0,$car0,$car0
add $tpj,$car1,$car1
add $tpj,$sbit,$sbit
ld [$ap+$j],$apj ! ap[j]
and $car0,$mask,$acc0
ld [$np+$j],$npj ! np[j]
@ -422,7 +422,7 @@ $code.=<<___;
ld [$tp+8],$tpj ! tp[j]
add $acc0,$acc0,$acc0
add $j,4,$j ! j++
or $sbit,$acc0,$acc0
add $sbit,$acc0,$acc0
srlx $acc0,32,$sbit
and $acc0,$mask,$acc0
cmp $j,$num
@ -436,12 +436,12 @@ $code.=<<___;
mulx $apj,$mul0,$acc0
mulx $npj,$mul1,$acc1
add $acc0,$car0,$car0
add $tpj,$car1,$car1
add $tpj,$sbit,$sbit
and $car0,$mask,$acc0
srlx $car0,32,$car0
add $acc1,$car1,$car1
add $acc0,$acc0,$acc0
or $sbit,$acc0,$acc0
add $sbit,$acc0,$acc0
srlx $acc0,32,$sbit
and $acc0,$mask,$acc0
add $acc0,$car1,$car1
@ -449,7 +449,7 @@ $code.=<<___;
srlx $car1,32,$car1
add $car0,$car0,$car0
or $sbit,$car0,$car0
add $sbit,$car0,$car0
add $car0,$car1,$car1
add $car2,$car1,$car1
st $car1,[$tp+4]
@ -509,7 +509,7 @@ $code.=<<___;
.Lsqr_inner2:
mulx $apj,$mul0,$acc0
mulx $npj,$mul1,$acc1
add $tpj,$car1,$car1
add $tpj,$sbit,$sbit
add $acc0,$car0,$car0
ld [$ap+$j],$apj ! ap[j]
and $car0,$mask,$acc0
@ -517,7 +517,7 @@ $code.=<<___;
srlx $car0,32,$car0
add $acc0,$acc0,$acc0
ld [$tp+8],$tpj ! tp[j]
or $sbit,$acc0,$acc0
add $sbit,$acc0,$acc0
add $j,4,$j ! j++
srlx $acc0,32,$sbit
and $acc0,$mask,$acc0
@ -532,12 +532,12 @@ $code.=<<___;
.Lsqr_no_inner2:
mulx $apj,$mul0,$acc0
mulx $npj,$mul1,$acc1
add $tpj,$car1,$car1
add $tpj,$sbit,$sbit
add $acc0,$car0,$car0
and $car0,$mask,$acc0
srlx $car0,32,$car0
add $acc0,$acc0,$acc0
or $sbit,$acc0,$acc0
add $sbit,$acc0,$acc0
srlx $acc0,32,$sbit
and $acc0,$mask,$acc0
add $acc0,$car1,$car1
@ -546,7 +546,7 @@ $code.=<<___;
srlx $car1,32,$car1
add $car0,$car0,$car0
or $sbit,$car0,$car0
add $sbit,$car0,$car0
add $car0,$car1,$car1
add $car2,$car1,$car1
st $car1,[$tp+4]
@ -591,14 +591,17 @@ $code.=<<___;
!.Lsqr_last
mulx $npj,$mul1,$acc1
add $tpj,$car1,$car1
add $tpj,$acc0,$acc0
srlx $acc0,32,$tmp0
and $acc0,$mask,$acc0
add $tmp0,$sbit,$sbit
add $acc0,$car1,$car1
add $acc1,$car1,$car1
st $car1,[$tp]
srlx $car1,32,$car1
add $car0,$car0,$car0 ! recover $car0
or $sbit,$car0,$car0
add $sbit,$car0,$car0
add $car0,$car1,$car1
add $car2,$car1,$car1
st $car1,[$tp+4]