FIPS MIPS assembly pack refresh.

Backport CVE-2014-3570 bug and postability fixes.

Reviewed-by: Rich Salz <rsalz@openssl.org>
(cherry picked from commit 10fa6736b1)
This commit is contained in:
Andy Polyakov 2016-06-24 12:53:54 +02:00
parent 894c04aa05
commit 2e691f8fa8
5 changed files with 165 additions and 516 deletions

View file

@ -47,7 +47,7 @@
# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
#
$flavour = shift; # supported flavours are o32,n32,64,nubi32,nubi64
$flavour = shift || "o32"; # supported flavours are o32,n32,64,nubi32,nubi64
if ($flavour =~ /64|n32/i) {
$PTR_ADD="dadd"; # incidentally works even on n32
@ -70,7 +70,7 @@ $pf = ($flavour =~ /nubi/i) ? $t0 : $t2;
#
######################################################################
$big_endian=(`echo MIPSEL | $ENV{CC} -E -P -`=~/MIPSEL/)?1:0;
$big_endian=(`echo MIPSEL | $ENV{CC} -E -`=~/MIPSEL/)?1:0;
for (@ARGV) { $output=$_ if (/^\w[\w\-]*\.\w+$/); }
open STDOUT,">$output";
@ -89,7 +89,7 @@ $code.=<<___;
# include <openssl/fipssyms.h>
#endif
#if !defined(__vxworks) || defined(__pic__)
#if !defined(__mips_eabi) && (!defined(__vxworks) || defined(__pic__))
.option pic2
#endif
.set noat

View file

@ -46,7 +46,7 @@
# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
#
$flavour = shift; # supported flavours are o32,n32,64,nubi32,nubi64
$flavour = shift || "o32"; # supported flavours are o32,n32,64,nubi32,nubi64
if ($flavour =~ /64|n32/i) {
$PTR_ADD="dadd"; # incidentally works even on n32
@ -133,7 +133,7 @@ $code.=<<___;
bnez $at,1f
li $t0,0
slt $at,$num,17 # on in-order CPU
bnezl $at,bn_mul_mont_internal
bnez $at,bn_mul_mont_internal
nop
1: jr $ra
li $a0,0

View file

@ -48,7 +48,7 @@
# has to content with 40-85% improvement depending on benchmark and
# key length, more for longer keys.
$flavour = shift;
$flavour = shift || "o32";
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";
@ -140,10 +140,10 @@ $code.=<<___;
.set reorder
li $minus4,-4
and $ta0,$a2,$minus4
$LD $t0,0($a1)
beqz $ta0,.L_bn_mul_add_words_tail
.L_bn_mul_add_words_loop:
$LD $t0,0($a1)
$MULTU $t0,$a3
$LD $t1,0($a0)
$LD $t2,$BNSZ($a1)
@ -200,10 +200,9 @@ $code.=<<___;
$ADDU $v0,$ta2
sltu $at,$ta3,$at
$ST $ta3,-$BNSZ($a0)
$ADDU $v0,$at
.set noreorder
bgtzl $ta0,.L_bn_mul_add_words_loop
$LD $t0,0($a1)
bgtz $ta0,.L_bn_mul_add_words_loop
$ADDU $v0,$at
beqz $a2,.L_bn_mul_add_words_return
nop
@ -300,10 +299,10 @@ $code.=<<___;
.set reorder
li $minus4,-4
and $ta0,$a2,$minus4
$LD $t0,0($a1)
beqz $ta0,.L_bn_mul_words_tail
.L_bn_mul_words_loop:
$LD $t0,0($a1)
$MULTU $t0,$a3
$LD $t2,$BNSZ($a1)
$LD $ta0,2*$BNSZ($a1)
@ -341,10 +340,9 @@ $code.=<<___;
$ADDU $v0,$at
sltu $ta3,$v0,$at
$ST $v0,-$BNSZ($a0)
$ADDU $v0,$ta3,$ta2
.set noreorder
bgtzl $ta0,.L_bn_mul_words_loop
$LD $t0,0($a1)
bgtz $ta0,.L_bn_mul_words_loop
$ADDU $v0,$ta3,$ta2
beqz $a2,.L_bn_mul_words_return
nop
@ -429,10 +427,10 @@ $code.=<<___;
.set reorder
li $minus4,-4
and $ta0,$a2,$minus4
$LD $t0,0($a1)
beqz $ta0,.L_bn_sqr_words_tail
.L_bn_sqr_words_loop:
$LD $t0,0($a1)
$MULTU $t0,$t0
$LD $t2,$BNSZ($a1)
$LD $ta0,2*$BNSZ($a1)
@ -463,11 +461,10 @@ $code.=<<___;
mflo $ta3
mfhi $ta2
$ST $ta3,-2*$BNSZ($a0)
$ST $ta2,-$BNSZ($a0)
.set noreorder
bgtzl $ta0,.L_bn_sqr_words_loop
$LD $t0,0($a1)
bgtz $ta0,.L_bn_sqr_words_loop
$ST $ta2,-$BNSZ($a0)
beqz $a2,.L_bn_sqr_words_return
nop
@ -547,10 +544,10 @@ $code.=<<___;
.set reorder
li $minus4,-4
and $at,$a3,$minus4
$LD $t0,0($a1)
beqz $at,.L_bn_add_words_tail
.L_bn_add_words_loop:
$LD $t0,0($a1)
$LD $ta0,0($a2)
subu $a3,4
$LD $t1,$BNSZ($a1)
@ -589,11 +586,10 @@ $code.=<<___;
$ADDU $t3,$ta3,$v0
sltu $v0,$t3,$ta3
$ST $t3,-$BNSZ($a0)
$ADDU $v0,$t9
.set noreorder
bgtzl $at,.L_bn_add_words_loop
$LD $t0,0($a1)
bgtz $at,.L_bn_add_words_loop
$ADDU $v0,$t9
beqz $a3,.L_bn_add_words_return
nop
@ -679,10 +675,10 @@ $code.=<<___;
.set reorder
li $minus4,-4
and $at,$a3,$minus4
$LD $t0,0($a1)
beqz $at,.L_bn_sub_words_tail
.L_bn_sub_words_loop:
$LD $t0,0($a1)
$LD $ta0,0($a2)
subu $a3,4
$LD $t1,$BNSZ($a1)
@ -722,11 +718,10 @@ $code.=<<___;
$SUBU $t3,$ta3,$v0
sgtu $v0,$t3,$ta3
$ST $t3,-$BNSZ($a0)
$ADDU $v0,$t9
.set noreorder
bgtzl $at,.L_bn_sub_words_loop
$LD $t0,0($a1)
bgtz $at,.L_bn_sub_words_loop
$ADDU $v0,$t9
beqz $a3,.L_bn_sub_words_return
nop
@ -819,7 +814,7 @@ ___
$code.=<<___;
.set reorder
move $ta3,$ra
bal bn_div_words
bal bn_div_words_internal
move $ra,$ta3
$MULTU $ta2,$v0
$LD $t2,-2*$BNSZ($a3)
@ -840,8 +835,9 @@ $code.=<<___;
sltu $ta0,$a1,$a2
or $t8,$ta0
.set noreorder
beqzl $at,.L_bn_div_3_words_inner_loop
beqz $at,.L_bn_div_3_words_inner_loop
$SUBU $v0,1
$ADDU $v0,1
.set reorder
.L_bn_div_3_words_inner_loop_done:
.set noreorder
@ -902,7 +898,8 @@ $code.=<<___;
and $t2,$a0
$SRL $at,$a1,$t1
.set noreorder
bnezl $t2,.+8
beqz $t2,.+12
nop
break 6 # signal overflow
.set reorder
$SLL $a0,$t9
@ -917,7 +914,8 @@ $code.=<<___;
$SRL $DH,$a2,4*$BNSZ # bits
sgeu $at,$a0,$a2
.set noreorder
bnezl $at,.+8
beqz $at,.+12
nop
$SUBU $a0,$a2
.set reorder
@ -1874,6 +1872,41 @@ ___
($a_4,$a_5,$a_6,$a_7)=($b_0,$b_1,$b_2,$b_3);
sub add_c2 () {
my ($hi,$lo,$c0,$c1,$c2,
$warm, # !$warm denotes first call with specific sequence of
# $c_[XYZ] when there is no Z-carry to accumulate yet;
$an,$bn # these two are arguments for multiplication which
# result is used in *next* step [which is why it's
# commented as "forward multiplication" below];
)=@_;
$code.=<<___;
mflo $lo
mfhi $hi
$ADDU $c0,$lo
sltu $at,$c0,$lo
$MULTU $an,$bn # forward multiplication
$ADDU $c0,$lo
$ADDU $at,$hi
sltu $lo,$c0,$lo
$ADDU $c1,$at
$ADDU $hi,$lo
___
$code.=<<___ if (!$warm);
sltu $c2,$c1,$at
$ADDU $c1,$hi
sltu $hi,$c1,$hi
$ADDU $c2,$hi
___
$code.=<<___ if ($warm);
sltu $at,$c1,$at
$ADDU $c1,$hi
$ADDU $c2,$at
sltu $hi,$c1,$hi
$ADDU $c2,$hi
___
}
$code.=<<___;
.align 5
@ -1922,21 +1955,10 @@ $code.=<<___;
sltu $at,$c_2,$t_1
$ADDU $c_3,$t_2,$at
$ST $c_2,$BNSZ($a0)
mflo $t_1
mfhi $t_2
slt $c_2,$t_2,$zero
$SLL $t_2,1
$MULTU $a_1,$a_1 # mul_add_c(a[1],b[1],c3,c1,c2);
slt $a2,$t_1,$zero
$ADDU $t_2,$a2
$SLL $t_1,1
$ADDU $c_3,$t_1
sltu $at,$c_3,$t_1
$ADDU $t_2,$at
$ADDU $c_1,$t_2
sltu $at,$c_1,$t_2
$ADDU $c_2,$at
___
&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
$a_1,$a_1); # mul_add_c(a[1],b[1],c3,c1,c2);
$code.=<<___;
mflo $t_1
mfhi $t_2
$ADDU $c_3,$t_1
@ -1947,67 +1969,19 @@ $code.=<<___;
sltu $at,$c_1,$t_2
$ADDU $c_2,$at
$ST $c_3,2*$BNSZ($a0)
mflo $t_1
mfhi $t_2
slt $c_3,$t_2,$zero
$SLL $t_2,1
$MULTU $a_1,$a_2 # mul_add_c2(a[1],b[2],c1,c2,c3);
slt $a2,$t_1,$zero
$ADDU $t_2,$a2
$SLL $t_1,1
$ADDU $c_1,$t_1
sltu $at,$c_1,$t_1
$ADDU $t_2,$at
$ADDU $c_2,$t_2
sltu $at,$c_2,$t_2
$ADDU $c_3,$at
mflo $t_1
mfhi $t_2
slt $at,$t_2,$zero
$ADDU $c_3,$at
$MULTU $a_4,$a_0 # mul_add_c2(a[4],b[0],c2,c3,c1);
$SLL $t_2,1
slt $a2,$t_1,$zero
$ADDU $t_2,$a2
$SLL $t_1,1
$ADDU $c_1,$t_1
sltu $at,$c_1,$t_1
$ADDU $t_2,$at
$ADDU $c_2,$t_2
sltu $at,$c_2,$t_2
$ADDU $c_3,$at
___
&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
$a_1,$a_2); # mul_add_c2(a[1],b[2],c1,c2,c3);
&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
$a_4,$a_0); # mul_add_c2(a[4],b[0],c2,c3,c1);
$code.=<<___;
$ST $c_1,3*$BNSZ($a0)
mflo $t_1
mfhi $t_2
slt $c_1,$t_2,$zero
$SLL $t_2,1
$MULTU $a_3,$a_1 # mul_add_c2(a[3],b[1],c2,c3,c1);
slt $a2,$t_1,$zero
$ADDU $t_2,$a2
$SLL $t_1,1
$ADDU $c_2,$t_1
sltu $at,$c_2,$t_1
$ADDU $t_2,$at
$ADDU $c_3,$t_2
sltu $at,$c_3,$t_2
$ADDU $c_1,$at
mflo $t_1
mfhi $t_2
slt $at,$t_2,$zero
$ADDU $c_1,$at
$MULTU $a_2,$a_2 # mul_add_c(a[2],b[2],c2,c3,c1);
$SLL $t_2,1
slt $a2,$t_1,$zero
$ADDU $t_2,$a2
$SLL $t_1,1
$ADDU $c_2,$t_1
sltu $at,$c_2,$t_1
$ADDU $t_2,$at
$ADDU $c_3,$t_2
sltu $at,$c_3,$t_2
$ADDU $c_1,$at
___
&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
$a_3,$a_1); # mul_add_c2(a[3],b[1],c2,c3,c1);
&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
$a_2,$a_2); # mul_add_c(a[2],b[2],c2,c3,c1);
$code.=<<___;
mflo $t_1
mfhi $t_2
$ADDU $c_2,$t_1
@ -2018,97 +1992,23 @@ $code.=<<___;
sltu $at,$c_3,$t_2
$ADDU $c_1,$at
$ST $c_2,4*$BNSZ($a0)
mflo $t_1
mfhi $t_2
slt $c_2,$t_2,$zero
$SLL $t_2,1
$MULTU $a_1,$a_4 # mul_add_c2(a[1],b[4],c3,c1,c2);
slt $a2,$t_1,$zero
$ADDU $t_2,$a2
$SLL $t_1,1
$ADDU $c_3,$t_1
sltu $at,$c_3,$t_1
$ADDU $t_2,$at
$ADDU $c_1,$t_2
sltu $at,$c_1,$t_2
$ADDU $c_2,$at
mflo $t_1
mfhi $t_2
slt $at,$t_2,$zero
$ADDU $c_2,$at
$MULTU $a_2,$a_3 # mul_add_c2(a[2],b[3],c3,c1,c2);
$SLL $t_2,1
slt $a2,$t_1,$zero
$ADDU $t_2,$a2
$SLL $t_1,1
$ADDU $c_3,$t_1
sltu $at,$c_3,$t_1
$ADDU $t_2,$at
$ADDU $c_1,$t_2
sltu $at,$c_1,$t_2
$ADDU $c_2,$at
mflo $t_1
mfhi $t_2
slt $at,$t_2,$zero
$MULTU $a_6,$a_0 # mul_add_c2(a[6],b[0],c1,c2,c3);
$ADDU $c_2,$at
$SLL $t_2,1
slt $a2,$t_1,$zero
$ADDU $t_2,$a2
$SLL $t_1,1
$ADDU $c_3,$t_1
sltu $at,$c_3,$t_1
$ADDU $t_2,$at
$ADDU $c_1,$t_2
sltu $at,$c_1,$t_2
$ADDU $c_2,$at
___
&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
$a_1,$a_4); # mul_add_c2(a[1],b[4],c3,c1,c2);
&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
$a_2,$a_3); # mul_add_c2(a[2],b[3],c3,c1,c2);
&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
$a_6,$a_0); # mul_add_c2(a[6],b[0],c1,c2,c3);
$code.=<<___;
$ST $c_3,5*$BNSZ($a0)
mflo $t_1
mfhi $t_2
slt $c_3,$t_2,$zero
$SLL $t_2,1
$MULTU $a_5,$a_1 # mul_add_c2(a[5],b[1],c1,c2,c3);
slt $a2,$t_1,$zero
$ADDU $t_2,$a2
$SLL $t_1,1
$ADDU $c_1,$t_1
sltu $at,$c_1,$t_1
$ADDU $t_2,$at
$ADDU $c_2,$t_2
sltu $at,$c_2,$t_2
$ADDU $c_3,$at
mflo $t_1
mfhi $t_2
slt $at,$t_2,$zero
$ADDU $c_3,$at
$MULTU $a_4,$a_2 # mul_add_c2(a[4],b[2],c1,c2,c3);
$SLL $t_2,1
slt $a2,$t_1,$zero
$ADDU $t_2,$a2
$SLL $t_1,1
$ADDU $c_1,$t_1
sltu $at,$c_1,$t_1
$ADDU $t_2,$at
$ADDU $c_2,$t_2
sltu $at,$c_2,$t_2
$ADDU $c_3,$at
mflo $t_1
mfhi $t_2
slt $at,$t_2,$zero
$ADDU $c_3,$at
$MULTU $a_3,$a_3 # mul_add_c(a[3],b[3],c1,c2,c3);
$SLL $t_2,1
slt $a2,$t_1,$zero
$ADDU $t_2,$a2
$SLL $t_1,1
$ADDU $c_1,$t_1
sltu $at,$c_1,$t_1
$ADDU $t_2,$at
$ADDU $c_2,$t_2
sltu $at,$c_2,$t_2
$ADDU $c_3,$at
___
&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
$a_5,$a_1); # mul_add_c2(a[5],b[1],c1,c2,c3);
&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
$a_4,$a_2); # mul_add_c2(a[4],b[2],c1,c2,c3);
&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
$a_3,$a_3); # mul_add_c(a[3],b[3],c1,c2,c3);
$code.=<<___;
mflo $t_1
mfhi $t_2
$ADDU $c_1,$t_1
@ -2119,112 +2019,25 @@ $code.=<<___;
sltu $at,$c_2,$t_2
$ADDU $c_3,$at
$ST $c_1,6*$BNSZ($a0)
mflo $t_1
mfhi $t_2
slt $c_1,$t_2,$zero
$SLL $t_2,1
$MULTU $a_1,$a_6 # mul_add_c2(a[1],b[6],c2,c3,c1);
slt $a2,$t_1,$zero
$ADDU $t_2,$a2
$SLL $t_1,1
$ADDU $c_2,$t_1
sltu $at,$c_2,$t_1
$ADDU $t_2,$at
$ADDU $c_3,$t_2
sltu $at,$c_3,$t_2
$ADDU $c_1,$at
mflo $t_1
mfhi $t_2
slt $at,$t_2,$zero
$ADDU $c_1,$at
$MULTU $a_2,$a_5 # mul_add_c2(a[2],b[5],c2,c3,c1);
$SLL $t_2,1
slt $a2,$t_1,$zero
$ADDU $t_2,$a2
$SLL $t_1,1
$ADDU $c_2,$t_1
sltu $at,$c_2,$t_1
$ADDU $t_2,$at
$ADDU $c_3,$t_2
sltu $at,$c_3,$t_2
$ADDU $c_1,$at
mflo $t_1
mfhi $t_2
slt $at,$t_2,$zero
$ADDU $c_1,$at
$MULTU $a_3,$a_4 # mul_add_c2(a[3],b[4],c2,c3,c1);
$SLL $t_2,1
slt $a2,$t_1,$zero
$ADDU $t_2,$a2
$SLL $t_1,1
$ADDU $c_2,$t_1
sltu $at,$c_2,$t_1
$ADDU $t_2,$at
$ADDU $c_3,$t_2
sltu $at,$c_3,$t_2
$ADDU $c_1,$at
mflo $t_1
mfhi $t_2
slt $at,$t_2,$zero
$ADDU $c_1,$at
$MULTU $a_7,$a_1 # mul_add_c2(a[7],b[1],c3,c1,c2);
$SLL $t_2,1
slt $a2,$t_1,$zero
$ADDU $t_2,$a2
$SLL $t_1,1
$ADDU $c_2,$t_1
sltu $at,$c_2,$t_1
$ADDU $t_2,$at
$ADDU $c_3,$t_2
sltu $at,$c_3,$t_2
$ADDU $c_1,$at
___
&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
$a_1,$a_6); # mul_add_c2(a[1],b[6],c2,c3,c1);
&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
$a_2,$a_5); # mul_add_c2(a[2],b[5],c2,c3,c1);
&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
$a_3,$a_4); # mul_add_c2(a[3],b[4],c2,c3,c1);
&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
$a_7,$a_1); # mul_add_c2(a[7],b[1],c3,c1,c2);
$code.=<<___;
$ST $c_2,7*$BNSZ($a0)
mflo $t_1
mfhi $t_2
slt $c_2,$t_2,$zero
$SLL $t_2,1
$MULTU $a_6,$a_2 # mul_add_c2(a[6],b[2],c3,c1,c2);
slt $a2,$t_1,$zero
$ADDU $t_2,$a2
$SLL $t_1,1
$ADDU $c_3,$t_1
sltu $at,$c_3,$t_1
$ADDU $t_2,$at
$ADDU $c_1,$t_2
sltu $at,$c_1,$t_2
$ADDU $c_2,$at
mflo $t_1
mfhi $t_2
slt $at,$t_2,$zero
$ADDU $c_2,$at
$MULTU $a_5,$a_3 # mul_add_c2(a[5],b[3],c3,c1,c2);
$SLL $t_2,1
slt $a2,$t_1,$zero
$ADDU $t_2,$a2
$SLL $t_1,1
$ADDU $c_3,$t_1
sltu $at,$c_3,$t_1
$ADDU $t_2,$at
$ADDU $c_1,$t_2
sltu $at,$c_1,$t_2
$ADDU $c_2,$at
mflo $t_1
mfhi $t_2
slt $at,$t_2,$zero
$ADDU $c_2,$at
$MULTU $a_4,$a_4 # mul_add_c(a[4],b[4],c3,c1,c2);
$SLL $t_2,1
slt $a2,$t_1,$zero
$ADDU $t_2,$a2
$SLL $t_1,1
$ADDU $c_3,$t_1
sltu $at,$c_3,$t_1
$ADDU $t_2,$at
$ADDU $c_1,$t_2
sltu $at,$c_1,$t_2
$ADDU $c_2,$at
___
&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
$a_6,$a_2); # mul_add_c2(a[6],b[2],c3,c1,c2);
&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
$a_5,$a_3); # mul_add_c2(a[5],b[3],c3,c1,c2);
&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
$a_4,$a_4); # mul_add_c(a[4],b[4],c3,c1,c2);
$code.=<<___;
mflo $t_1
mfhi $t_2
$ADDU $c_3,$t_1
@ -2235,82 +2048,21 @@ $code.=<<___;
sltu $at,$c_1,$t_2
$ADDU $c_2,$at
$ST $c_3,8*$BNSZ($a0)
mflo $t_1
mfhi $t_2
slt $c_3,$t_2,$zero
$SLL $t_2,1
$MULTU $a_3,$a_6 # mul_add_c2(a[3],b[6],c1,c2,c3);
slt $a2,$t_1,$zero
$ADDU $t_2,$a2
$SLL $t_1,1
$ADDU $c_1,$t_1
sltu $at,$c_1,$t_1
$ADDU $t_2,$at
$ADDU $c_2,$t_2
sltu $at,$c_2,$t_2
$ADDU $c_3,$at
mflo $t_1
mfhi $t_2
slt $at,$t_2,$zero
$ADDU $c_3,$at
$MULTU $a_4,$a_5 # mul_add_c2(a[4],b[5],c1,c2,c3);
$SLL $t_2,1
slt $a2,$t_1,$zero
$ADDU $t_2,$a2
$SLL $t_1,1
$ADDU $c_1,$t_1
sltu $at,$c_1,$t_1
$ADDU $t_2,$at
$ADDU $c_2,$t_2
sltu $at,$c_2,$t_2
$ADDU $c_3,$at
mflo $t_1
mfhi $t_2
slt $at,$t_2,$zero
$ADDU $c_3,$at
$MULTU $a_7,$a_3 # mul_add_c2(a[7],b[3],c2,c3,c1);
$SLL $t_2,1
slt $a2,$t_1,$zero
$ADDU $t_2,$a2
$SLL $t_1,1
$ADDU $c_1,$t_1
sltu $at,$c_1,$t_1
$ADDU $t_2,$at
$ADDU $c_2,$t_2
sltu $at,$c_2,$t_2
$ADDU $c_3,$at
___
&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
$a_3,$a_6); # mul_add_c2(a[3],b[6],c1,c2,c3);
&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
$a_4,$a_5); # mul_add_c2(a[4],b[5],c1,c2,c3);
&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
$a_7,$a_3); # mul_add_c2(a[7],b[3],c2,c3,c1);
$code.=<<___;
$ST $c_1,9*$BNSZ($a0)
mflo $t_1
mfhi $t_2
slt $c_1,$t_2,$zero
$SLL $t_2,1
$MULTU $a_6,$a_4 # mul_add_c2(a[6],b[4],c2,c3,c1);
slt $a2,$t_1,$zero
$ADDU $t_2,$a2
$SLL $t_1,1
$ADDU $c_2,$t_1
sltu $at,$c_2,$t_1
$ADDU $t_2,$at
$ADDU $c_3,$t_2
sltu $at,$c_3,$t_2
$ADDU $c_1,$at
mflo $t_1
mfhi $t_2
slt $at,$t_2,$zero
$ADDU $c_1,$at
$MULTU $a_5,$a_5 # mul_add_c(a[5],b[5],c2,c3,c1);
$SLL $t_2,1
slt $a2,$t_1,$zero
$ADDU $t_2,$a2
$SLL $t_1,1
$ADDU $c_2,$t_1
sltu $at,$c_2,$t_1
$ADDU $t_2,$at
$ADDU $c_3,$t_2
sltu $at,$c_3,$t_2
$ADDU $c_1,$at
___
&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
$a_6,$a_4); # mul_add_c2(a[6],b[4],c2,c3,c1);
&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
$a_5,$a_5); # mul_add_c(a[5],b[5],c2,c3,c1);
$code.=<<___;
mflo $t_1
mfhi $t_2
$ADDU $c_2,$t_1
@ -2321,52 +2073,17 @@ $code.=<<___;
sltu $at,$c_3,$t_2
$ADDU $c_1,$at
$ST $c_2,10*$BNSZ($a0)
mflo $t_1
mfhi $t_2
slt $c_2,$t_2,$zero
$SLL $t_2,1
$MULTU $a_5,$a_6 # mul_add_c2(a[5],b[6],c3,c1,c2);
slt $a2,$t_1,$zero
$ADDU $t_2,$a2
$SLL $t_1,1
$ADDU $c_3,$t_1
sltu $at,$c_3,$t_1
$ADDU $t_2,$at
$ADDU $c_1,$t_2
sltu $at,$c_1,$t_2
$ADDU $c_2,$at
mflo $t_1
mfhi $t_2
slt $at,$t_2,$zero
$ADDU $c_2,$at
$MULTU $a_7,$a_5 # mul_add_c2(a[7],b[5],c1,c2,c3);
$SLL $t_2,1
slt $a2,$t_1,$zero
$ADDU $t_2,$a2
$SLL $t_1,1
$ADDU $c_3,$t_1
sltu $at,$c_3,$t_1
$ADDU $t_2,$at
$ADDU $c_1,$t_2
sltu $at,$c_1,$t_2
$ADDU $c_2,$at
___
&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
$a_5,$a_6); # mul_add_c2(a[5],b[6],c3,c1,c2);
&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
$a_7,$a_5); # mul_add_c2(a[7],b[5],c1,c2,c3);
$code.=<<___;
$ST $c_3,11*$BNSZ($a0)
mflo $t_1
mfhi $t_2
slt $c_3,$t_2,$zero
$SLL $t_2,1
$MULTU $a_6,$a_6 # mul_add_c(a[6],b[6],c1,c2,c3);
slt $a2,$t_1,$zero
$ADDU $t_2,$a2
$SLL $t_1,1
$ADDU $c_1,$t_1
sltu $at,$c_1,$t_1
$ADDU $t_2,$at
$ADDU $c_2,$t_2
sltu $at,$c_2,$t_2
$ADDU $c_3,$at
___
&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
$a_6,$a_6); # mul_add_c(a[6],b[6],c1,c2,c3);
$code.=<<___;
mflo $t_1
mfhi $t_2
$ADDU $c_1,$t_1
@ -2377,21 +2094,10 @@ $code.=<<___;
sltu $at,$c_2,$t_2
$ADDU $c_3,$at
$ST $c_1,12*$BNSZ($a0)
mflo $t_1
mfhi $t_2
slt $c_1,$t_2,$zero
$SLL $t_2,1
$MULTU $a_7,$a_7 # mul_add_c(a[7],b[7],c3,c1,c2);
slt $a2,$t_1,$zero
$ADDU $t_2,$a2
$SLL $t_1,1
$ADDU $c_2,$t_1
sltu $at,$c_2,$t_1
$ADDU $t_2,$at
$ADDU $c_3,$t_2
sltu $at,$c_3,$t_2
$ADDU $c_1,$at
___
&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
$a_7,$a_7); # mul_add_c(a[7],b[7],c3,c1,c2);
$code.=<<___;
$ST $c_2,13*$BNSZ($a0)
mflo $t_1
@ -2459,21 +2165,10 @@ $code.=<<___;
sltu $at,$c_2,$t_1
$ADDU $c_3,$t_2,$at
$ST $c_2,$BNSZ($a0)
mflo $t_1
mfhi $t_2
slt $c_2,$t_2,$zero
$SLL $t_2,1
$MULTU $a_1,$a_1 # mul_add_c(a[1],b[1],c3,c1,c2);
slt $a2,$t_1,$zero
$ADDU $t_2,$a2
$SLL $t_1,1
$ADDU $c_3,$t_1
sltu $at,$c_3,$t_1
$ADDU $t_2,$at
$ADDU $c_1,$t_2
sltu $at,$c_1,$t_2
$ADDU $c_2,$at
___
&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
$a_1,$a_1); # mul_add_c(a[1],b[1],c3,c1,c2);
$code.=<<___;
mflo $t_1
mfhi $t_2
$ADDU $c_3,$t_1
@ -2484,52 +2179,17 @@ $code.=<<___;
sltu $at,$c_1,$t_2
$ADDU $c_2,$at
$ST $c_3,2*$BNSZ($a0)
mflo $t_1
mfhi $t_2
slt $c_3,$t_2,$zero
$SLL $t_2,1
$MULTU $a_1,$a_2 # mul_add_c(a2[1],b[2],c1,c2,c3);
slt $a2,$t_1,$zero
$ADDU $t_2,$a2
$SLL $t_1,1
$ADDU $c_1,$t_1
sltu $at,$c_1,$t_1
$ADDU $t_2,$at
$ADDU $c_2,$t_2
sltu $at,$c_2,$t_2
$ADDU $c_3,$at
mflo $t_1
mfhi $t_2
slt $at,$t_2,$zero
$ADDU $c_3,$at
$MULTU $a_3,$a_1 # mul_add_c2(a[3],b[1],c2,c3,c1);
$SLL $t_2,1
slt $a2,$t_1,$zero
$ADDU $t_2,$a2
$SLL $t_1,1
$ADDU $c_1,$t_1
sltu $at,$c_1,$t_1
$ADDU $t_2,$at
$ADDU $c_2,$t_2
sltu $at,$c_2,$t_2
$ADDU $c_3,$at
___
&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
$a_1,$a_2); # mul_add_c2(a2[1],b[2],c1,c2,c3);
&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
$a_3,$a_1); # mul_add_c2(a[3],b[1],c2,c3,c1);
$code.=<<___;
$ST $c_1,3*$BNSZ($a0)
mflo $t_1
mfhi $t_2
slt $c_1,$t_2,$zero
$SLL $t_2,1
$MULTU $a_2,$a_2 # mul_add_c(a[2],b[2],c2,c3,c1);
slt $a2,$t_1,$zero
$ADDU $t_2,$a2
$SLL $t_1,1
$ADDU $c_2,$t_1
sltu $at,$c_2,$t_1
$ADDU $t_2,$at
$ADDU $c_3,$t_2
sltu $at,$c_3,$t_2
$ADDU $c_1,$at
___
&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
$a_2,$a_2); # mul_add_c(a[2],b[2],c2,c3,c1);
$code.=<<___;
mflo $t_1
mfhi $t_2
$ADDU $c_2,$t_1
@ -2540,21 +2200,10 @@ $code.=<<___;
sltu $at,$c_3,$t_2
$ADDU $c_1,$at
$ST $c_2,4*$BNSZ($a0)
mflo $t_1
mfhi $t_2
slt $c_2,$t_2,$zero
$SLL $t_2,1
$MULTU $a_3,$a_3 # mul_add_c(a[3],b[3],c1,c2,c3);
slt $a2,$t_1,$zero
$ADDU $t_2,$a2
$SLL $t_1,1
$ADDU $c_3,$t_1
sltu $at,$c_3,$t_1
$ADDU $t_2,$at
$ADDU $c_1,$t_2
sltu $at,$c_1,$t_2
$ADDU $c_2,$at
___
&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
$a_3,$a_3); # mul_add_c(a[3],b[3],c1,c2,c3);
$code.=<<___;
$ST $c_3,5*$BNSZ($a0)
mflo $t_1

View file

@ -42,7 +42,7 @@
# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
#
$flavour = shift; # supported flavours are o32,n32,64,nubi32,nubi64
$flavour = shift || "o32"; # supported flavours are o32,n32,64,nubi32,nubi64
if ($flavour =~ /64|n32/i) {
$PTR_ADD="dadd"; # incidentally works even on n32
@ -64,7 +64,7 @@ if ($flavour =~ /64|n32/i) {
#
######################################################################
$big_endian=(`echo MIPSEL | $ENV{CC} -E -P -`=~/MIPSEL/)?1:0;
$big_endian=(`echo MIPSEL | $ENV{CC} -E -`=~/MIPSEL/)?1:0;
for (@ARGV) { $output=$_ if (/^\w[\w\-]*\.\w+$/); }
open STDOUT,">$output";

View file

@ -45,7 +45,7 @@
# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
#
$flavour = shift; # supported flavours are o32,n32,64,nubi32,nubi64
$flavour = shift || "o32"; # supported flavours are o32,n32,64,nubi32,nubi64
if ($flavour =~ /64|n32/i) {
$PTR_ADD="dadd"; # incidentally works even on n32
@ -68,7 +68,7 @@ $pf = ($flavour =~ /nubi/i) ? $t0 : $t2;
#
######################################################################
$big_endian=(`echo MIPSEL | $ENV{CC} -E -P -`=~/MIPSEL/)?1:0;
$big_endian=(`echo MIPSEL | $ENV{CC} -E -`=~/MIPSEL/)?1:0;
for (@ARGV) { $output=$_ if (/^\w[\w\-]*\.\w+$/); }
open STDOUT,">$output";
@ -244,7 +244,7 @@ $code.=<<___;
.text
.set noat
#if !defined(__vxworks) || defined(__pic__)
#if !defined(__mips_eabi) && (!defined(__vxworks) || defined(__pic__))
.option pic2
#endif
@ -351,7 +351,7 @@ $code.=<<___;
$ST $G,6*$SZ($ctx)
$ST $H,7*$SZ($ctx)
bnel $inp,@X[15],.Loop
bne $inp,@X[15],.Loop
$PTR_SUB $Ktbl,`($rounds-16)*$SZ` # rewind $Ktbl
$REG_L $ra,$FRAMESIZE-1*$SZREG($sp)