#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# March 2015
#
# "Teaser" Montgomery multiplication module for ARMv8. Needs more
# work. While it does improve RSA sign performance by 20-30% (less for
# longer keys) on most processors, for some reason RSA2048 is not
# faster and RSA4096 goes 15-20% slower on Cortex-A57. Multiplication
# instruction issue rate is limited on processor in question, meaning
# that dedicated squaring procedure is a must. Well, actually all
# contemporary AArch64 processors seem to have limited multiplication
# issue rate, i.e. they can't issue multiplication every cycle, which
# explains moderate improvement coefficients in comparison to
# compiler-generated code. Recall that compiler is instructed to use
# umulh and therefore uses same amount of multiplication instructions
# to do the job. Assembly's edge is to minimize number of "collateral"
# instructions and of course instruction scheduling.

# Command line: <flavour> <output>.  Both values are interpolated as-is into
# the translator command line built below.
$flavour = shift;
$output  = shift;

# Locate arm-xlate.pl: first in the directory this script was invoked from,
# then in ../../perlasm relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

# From here on STDOUT is a pipe into the translator, run with the same perl
# binary ($^X) that is executing this script.  A failed fork/exec must abort
# instead of silently producing no output.
open OUT,"| \"$^X\" $xlate $flavour $output"
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

# Scratch registers used by the multiplication loops.  The names are bound,
# in order, to x6..x17 and then x19..x23; x18 is skipped.  The range yields
# one element more (x24) than there are names, and that surplus is dropped.
($lo0,$hi0,$aj,$m0,$alo,$ahi,
 $lo1,$hi1,$nj,$m1,$nlo,$nhi,
 $ovf, $i,$j,$tp,$tj) = map { "x$_" } (6..17, 19..24);

# Argument registers, in the order of the C prototype:
#   int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
#                   const BN_ULONG *np, const BN_ULONG *n0, int num);
($rp,$ap,$bp,$np,$n0,$num) = map { "x$_" } (0..5);

# bn_mul_mont(rp, ap, bp, np, n0, num) -- Montgomery multiplication.
#
# Layout of the emitted routine:
#   * prologue: 64-byte frame holding x29/x30 and x19..x24, then a scratch
#     vector tp[] of num 64-bit words is carved out below it (sp rounded
#     down to 16 bytes);
#   * first pass (.L1st): tp[] = (ap[]*bp[0] + np[]*m1) shifted down one
#     word, where m1 = low64(ap[0]*bp[0]) * *n0;
#   * .Louter/.Linner: the same step for bp[1..num-1], additionally
#     accumulating the previous tp[]; the overflow register keeps the bit
#     that does not fit into tp[num-1];
#   * .Lsub/.Lcond_copy: rp[] = tp[] - np[], then rp[] is overwritten with
#     tp[] if that subtraction borrowed; tp[] is zeroed on the way out.
#
# Changes relative to the previous revision:
#   * .Linner_skip: nhi + ovf + carry can wrap around 64 bits; the carry out
#     of that addition used to be dropped, losing the top overflow bit.  It
#     is now captured and accumulated.
#   * The function is declared to return int but never wrote x0 (it leaked
#     the advanced rp pointer).  It now returns 1.
#     NOTE(review): 1 is assumed to be the "handled" value callers test
#     for -- confirm against the C caller.
#
# NOTE(review): ap[0..1]/np[0..1] are loaded unconditionally and j starts at
# num-2, so num >= 2 is assumed.  num is declared int but consumed as the
# full 64-bit x5 -- confirm callers pass it extended.
$code.=<<___;
.text

.globl	bn_mul_mont
.type	bn_mul_mont,%function
.align	5
bn_mul_mont:
	stp	x29,x30,[sp,#-64]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]

	ldr	$m0,[$bp],#8		// bp[0]
	sub	$tp,sp,$num,lsl#3
	ldp	$hi0,$aj,[$ap],#16	// ap[0..1]
	lsl	$num,$num,#3
	ldr	$n0,[$n0]		// *n0
	and	$tp,$tp,#-16		// ABI says so
	ldp	$hi1,$nj,[$np],#16	// np[0..1]

	mul	$lo0,$hi0,$m0		// ap[0]*bp[0]
	sub	$j,$num,#16		// j=num-2
	umulh	$hi0,$hi0,$m0
	mul	$alo,$aj,$m0		// ap[1]*bp[0]
	umulh	$ahi,$aj,$m0

	mul	$m1,$lo0,$n0		// "tp[0]"*n0
	mov	sp,$tp			// alloca

	mul	$lo1,$hi1,$m1		// np[0]*m1
	umulh	$hi1,$hi1,$m1
	mul	$nlo,$nj,$m1		// np[1]*m1
	adds	$lo1,$lo1,$lo0		// discarded
	umulh	$nhi,$nj,$m1
	adc	$hi1,$hi1,xzr
	cbz	$j,.L1st_skip

.L1st:
	ldr	$aj,[$ap],#8
	adds	$lo0,$alo,$hi0
	sub	$j,$j,#8		// j--
	adc	$hi0,$ahi,xzr

	ldr	$nj,[$np],#8
	adds	$lo1,$nlo,$hi1
	mul	$alo,$aj,$m0		// ap[j]*bp[0]
	adc	$hi1,$nhi,xzr
	umulh	$ahi,$aj,$m0

	adds	$lo1,$lo1,$lo0
	mul	$nlo,$nj,$m1		// np[j]*m1
	adc	$hi1,$hi1,xzr
	umulh	$nhi,$nj,$m1
	str	$lo1,[$tp],#8		// tp[j-1]
	cbnz	$j,.L1st

.L1st_skip:
	adds	$lo0,$alo,$hi0
	sub	$ap,$ap,$num		// rewind $ap
	adc	$hi0,$ahi,xzr

	adds	$lo1,$nlo,$hi1
	sub	$np,$np,$num		// rewind $np
	adc	$hi1,$nhi,xzr

	adds	$lo1,$lo1,$lo0
	sub	$i,$num,#8		// i=num-1
	adcs	$hi1,$hi1,$hi0

	adc	$ovf,xzr,xzr		// upmost overflow bit
	stp	$lo1,$hi1,[$tp]

.Louter:
	ldr	$m0,[$bp],#8		// bp[i]
	ldp	$hi0,$aj,[$ap],#16
	ldr	$tj,[sp]		// tp[0]
	add	$tp,sp,#8

	mul	$lo0,$hi0,$m0		// ap[0]*bp[i]
	sub	$j,$num,#16		// j=num-2
	umulh	$hi0,$hi0,$m0
	ldp	$hi1,$nj,[$np],#16
	mul	$alo,$aj,$m0		// ap[1]*bp[i]
	adds	$lo0,$lo0,$tj
	umulh	$ahi,$aj,$m0
	adc	$hi0,$hi0,xzr

	mul	$m1,$lo0,$n0
	sub	$i,$i,#8		// i--

	mul	$lo1,$hi1,$m1		// np[0]*m1
	umulh	$hi1,$hi1,$m1
	mul	$nlo,$nj,$m1		// np[1]*m1
	adds	$lo1,$lo1,$lo0
	umulh	$nhi,$nj,$m1
	cbz	$j,.Linner_skip

.Linner:
	ldr	$aj,[$ap],#8
	adc	$hi1,$hi1,xzr
	ldr	$tj,[$tp],#8		// tp[j]
	adds	$lo0,$alo,$hi0
	sub	$j,$j,#8		// j--
	adc	$hi0,$ahi,xzr

	adds	$lo1,$nlo,$hi1
	ldr	$nj,[$np],#8
	adc	$hi1,$nhi,xzr

	mul	$alo,$aj,$m0		// ap[j]*bp[i]
	adds	$lo0,$lo0,$tj
	umulh	$ahi,$aj,$m0
	adc	$hi0,$hi0,xzr

	mul	$nlo,$nj,$m1		// np[j]*m1
	adds	$lo1,$lo1,$lo0
	umulh	$nhi,$nj,$m1
	str	$lo1,[$tp,#-16]		// tp[j-1]
	cbnz	$j,.Linner

.Linner_skip:
	ldr	$tj,[$tp],#8		// tp[j]
	adc	$hi1,$hi1,xzr
	adds	$lo0,$alo,$hi0
	sub	$ap,$ap,$num		// rewind $ap
	adc	$hi0,$ahi,xzr

	adds	$lo1,$nlo,$hi1
	sub	$np,$np,$num		// rewind $np
	adcs	$hi1,$nhi,$ovf		// may wrap once previous overflow is added
	adc	$ovf,xzr,xzr		// keep that carry

	adds	$lo0,$lo0,$tj
	adc	$hi0,$hi0,xzr

	adds	$lo1,$lo1,$lo0
	adcs	$hi1,$hi1,$hi0
	adc	$ovf,$ovf,xzr		// upmost overflow bit
	stp	$lo1,$hi1,[$tp,#-16]

	cbnz	$i,.Louter

	// Final step. We see if result is larger than modulus, and
	// if it is, subtract the modulus. But comparison implies
	// subtraction. So we subtract modulus, see if it borrowed,
	// and conditionally copy original value.
	ldr	$tj,[sp]		// tp[0]
	add	$tp,sp,#8
	ldr	$nj,[$np],#8		// np[0]
	subs	$j,$num,#8		// j=num-1 and clear borrow
	mov	$ap,$rp
.Lsub:
	sbcs	$aj,$tj,$nj		// tp[j]-np[j]
	ldr	$tj,[$tp],#8
	sub	$j,$j,#8		// j--
	ldr	$nj,[$np],#8
	str	$aj,[$ap],#8		// rp[j]=tp[j]-np[j]
	cbnz	$j,.Lsub

	sbcs	$aj,$tj,$nj
	sbcs	$ovf,$ovf,xzr		// did it borrow?
	str	$aj,[$ap],#8		// rp[num-1]

	ldr	$tj,[sp]		// tp[0]
	add	$tp,sp,#8
	ldr	$aj,[$rp],#8		// rp[0]
	sub	$num,$num,#8		// num--
	nop
.Lcond_copy:
	sub	$num,$num,#8		// num--
	csel	$nj,$aj,$tj,cs		// did it borrow?
	ldr	$tj,[$tp],#8
	ldr	$aj,[$rp],#8
	str	xzr,[$tp,#-16]		// wipe tp
	str	$nj,[$rp,#-16]
	cbnz	$num,.Lcond_copy

	csel	$nj,$aj,$tj,cs
	str	xzr,[$tp,#-8]		// wipe tp
	str	$nj,[$rp,#-8]

	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1			// return value
	ldp	x23,x24,[x29,#48]
	ldr	x29,[sp],#64
	ret
.size	bn_mul_mont,.-bn_mul_mont

.asciz	"Montgomery Multiplication for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align	4
___

# Send the accumulated assembly text to STDOUT.
print $code;

# Check the close: a buffered write error, or (when STDOUT is a pipe to a
# child process) a non-zero child exit status, surfaces only here.
close STDOUT or die "error closing STDOUT: $! (child status $?)";