Add ARMv8 Montgomery multiplication module.
Reviewed-by: Rich Salz <rsalz@openssl.org>
This commit is contained in:
parent
35141544e2
commit
cb2ed54582
1 changed files with 244 additions and 0 deletions
244
crypto/bn/asm/armv8-mont.pl
Executable file
244
crypto/bn/asm/armv8-mont.pl
Executable file
|
@ -0,0 +1,244 @@
|
|||
#!/usr/bin/env perl
|
||||
|
||||
# ====================================================================
|
||||
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
|
||||
# project. The module is, however, dual licensed under OpenSSL and
|
||||
# CRYPTOGAMS licenses depending on where you obtain it. For further
|
||||
# details see http://www.openssl.org/~appro/cryptogams/.
|
||||
# ====================================================================
|
||||
|
||||
# March 2015
|
||||
#
|
||||
# "Teaser" Montgomery multiplication module for ARMv8. Needs more
|
||||
# work. While it does improve RSA sign performance by 20-30% (less for
|
||||
# longer keys) on most processors, for some reason RSA2048 is not
|
||||
# faster and RSA4096 goes 15-20% slower on Cortex-A57. Multiplication
|
||||
# instruction issue rate is limited on processor in question, meaning
|
||||
# that dedicated squaring procedure is a must. Well, actually all
|
||||
# contemporary AArch64 processors seem to have limited multiplication
|
||||
# issue rate, i.e. they can't issue multiplication every cycle, which
|
||||
# explains moderate improvement coefficients in comparison to
|
||||
# compiler-generated code. Recall that compiler is instructed to use
|
||||
# umulh and therefore uses same amount of multiplication instructions
|
||||
# to do the job. Assembly's edge is to minimize number of "collateral"
|
||||
# instructions and of course instruction scheduling.
|
||||
|
||||
$flavour = shift;
|
||||
$output = shift;
|
||||
|
||||
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
|
||||
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
|
||||
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
|
||||
die "can't locate arm-xlate.pl";
|
||||
|
||||
open OUT,"| \"$^X\" $xlate $flavour $output";
|
||||
*STDOUT=*OUT;
|
||||
|
||||
($lo0,$hi0,$aj,$m0,$alo,$ahi,
|
||||
$lo1,$hi1,$nj,$m1,$nlo,$nhi,
|
||||
$ovf, $i,$j,$tp,$tj) = map("x$_",6..17,19..24);
|
||||
|
||||
# int bn_mul_mont(
|
||||
$rp="x0"; # BN_ULONG *rp,
|
||||
$ap="x1"; # const BN_ULONG *ap,
|
||||
$bp="x2"; # const BN_ULONG *bp,
|
||||
$np="x3"; # const BN_ULONG *np,
|
||||
$n0="x4"; # const BN_ULONG *n0,
|
||||
$num="x5"; # int num);
|
||||
|
||||
$code.=<<___;
|
||||
.text
|
||||
|
||||
.globl bn_mul_mont
|
||||
.type bn_mul_mont,%function
|
||||
.align 5
|
||||
bn_mul_mont:
|
||||
stp x29,x30,[sp,#-64]!
|
||||
add x29,sp,#0
|
||||
stp x19,x20,[sp,#16]
|
||||
stp x21,x22,[sp,#32]
|
||||
stp x23,x24,[sp,#48]
|
||||
|
||||
ldr $m0,[$bp],#8 // bp[0]
|
||||
sub $tp,sp,$num,lsl#3
|
||||
ldp $hi0,$aj,[$ap],#16 // ap[0..1]
|
||||
lsl $num,$num,#3
|
||||
ldr $n0,[$n0] // *n0
|
||||
and $tp,$tp,#-16 // ABI says so
|
||||
ldp $hi1,$nj,[$np],#16 // np[0..1]
|
||||
|
||||
mul $lo0,$hi0,$m0 // ap[0]*bp[0]
|
||||
sub $j,$num,#16 // j=num-2
|
||||
umulh $hi0,$hi0,$m0
|
||||
mul $alo,$aj,$m0 // ap[1]*bp[0]
|
||||
umulh $ahi,$aj,$m0
|
||||
|
||||
mul $m1,$lo0,$n0 // "tp[0]"*n0
|
||||
mov sp,$tp // alloca
|
||||
|
||||
mul $lo1,$hi1,$m1 // np[0]*m1
|
||||
umulh $hi1,$hi1,$m1
|
||||
mul $nlo,$nj,$m1 // np[1]*m1
|
||||
adds $lo1,$lo1,$lo0 // discarded
|
||||
umulh $nhi,$nj,$m1
|
||||
adc $hi1,$hi1,xzr
|
||||
cbz $j,.L1st_skip
|
||||
|
||||
.L1st:
|
||||
ldr $aj,[$ap],#8
|
||||
adds $lo0,$alo,$hi0
|
||||
sub $j,$j,#8 // j--
|
||||
adc $hi0,$ahi,xzr
|
||||
|
||||
ldr $nj,[$np],#8
|
||||
adds $lo1,$nlo,$hi1
|
||||
mul $alo,$aj,$m0 // ap[j]*bp[0]
|
||||
adc $hi1,$nhi,xzr
|
||||
umulh $ahi,$aj,$m0
|
||||
|
||||
adds $lo1,$lo1,$lo0
|
||||
mul $nlo,$nj,$m1 // np[j]*m1
|
||||
adc $hi1,$hi1,xzr
|
||||
umulh $nhi,$nj,$m1
|
||||
str $lo1,[$tp],#8 // tp[j-1]
|
||||
cbnz $j,.L1st
|
||||
|
||||
.L1st_skip:
|
||||
adds $lo0,$alo,$hi0
|
||||
sub $ap,$ap,$num // rewind $ap
|
||||
adc $hi0,$ahi,xzr
|
||||
|
||||
adds $lo1,$nlo,$hi1
|
||||
sub $np,$np,$num // rewind $np
|
||||
adc $hi1,$nhi,xzr
|
||||
|
||||
adds $lo1,$lo1,$lo0
|
||||
sub $i,$num,#8 // i=num-1
|
||||
adcs $hi1,$hi1,$hi0
|
||||
|
||||
adc $ovf,xzr,xzr // upmost overflow bit
|
||||
stp $lo1,$hi1,[$tp]
|
||||
|
||||
.Louter:
|
||||
ldr $m0,[$bp],#8 // bp[i]
|
||||
ldp $hi0,$aj,[$ap],#16
|
||||
ldr $tj,[sp] // tp[0]
|
||||
add $tp,sp,#8
|
||||
|
||||
mul $lo0,$hi0,$m0 // ap[0]*bp[i]
|
||||
sub $j,$num,#16 // j=num-2
|
||||
umulh $hi0,$hi0,$m0
|
||||
ldp $hi1,$nj,[$np],#16
|
||||
mul $alo,$aj,$m0 // ap[1]*bp[i]
|
||||
adds $lo0,$lo0,$tj
|
||||
umulh $ahi,$aj,$m0
|
||||
adc $hi0,$hi0,xzr
|
||||
|
||||
mul $m1,$lo0,$n0
|
||||
sub $i,$i,#8 // i--
|
||||
|
||||
mul $lo1,$hi1,$m1 // np[0]*m1
|
||||
umulh $hi1,$hi1,$m1
|
||||
mul $nlo,$nj,$m1 // np[1]*m1
|
||||
adds $lo1,$lo1,$lo0
|
||||
umulh $nhi,$nj,$m1
|
||||
cbz $j,.Linner_skip
|
||||
|
||||
.Linner:
|
||||
ldr $aj,[$ap],#8
|
||||
adc $hi1,$hi1,xzr
|
||||
ldr $tj,[$tp],#8 // tp[j]
|
||||
adds $lo0,$alo,$hi0
|
||||
sub $j,$j,#8 // j--
|
||||
adc $hi0,$ahi,xzr
|
||||
|
||||
adds $lo1,$nlo,$hi1
|
||||
ldr $nj,[$np],#8
|
||||
adc $hi1,$nhi,xzr
|
||||
|
||||
mul $alo,$aj,$m0 // ap[j]*bp[i]
|
||||
adds $lo0,$lo0,$tj
|
||||
umulh $ahi,$aj,$m0
|
||||
adc $hi0,$hi0,xzr
|
||||
|
||||
mul $nlo,$nj,$m1 // np[j]*m1
|
||||
adds $lo1,$lo1,$lo0
|
||||
umulh $nhi,$nj,$m1
|
||||
str $lo1,[$tp,#-16] // tp[j-1]
|
||||
cbnz $j,.Linner
|
||||
|
||||
.Linner_skip:
|
||||
ldr $tj,[$tp],#8 // tp[j]
|
||||
adc $hi1,$hi1,xzr
|
||||
adds $lo0,$alo,$hi0
|
||||
sub $ap,$ap,$num // rewind $ap
|
||||
adc $hi0,$ahi,xzr
|
||||
|
||||
adds $lo1,$nlo,$hi1
|
||||
sub $np,$np,$num // rewind $np
|
||||
adc $hi1,$nhi,$ovf
|
||||
|
||||
adds $lo0,$lo0,$tj
|
||||
adc $hi0,$hi0,xzr
|
||||
|
||||
adds $lo1,$lo1,$lo0
|
||||
adcs $hi1,$hi1,$hi0
|
||||
adc $ovf,xzr,xzr // upmost overflow bit
|
||||
stp $lo1,$hi1,[$tp,#-16]
|
||||
|
||||
cbnz $i,.Louter
|
||||
|
||||
// Final step. We see if result is larger than modulus, and
|
||||
// if it is, subtract the modulus. But comparison implies
|
||||
// subtraction. So we subtract modulus, see if it borrowed,
|
||||
// and conditionally copy original value.
|
||||
ldr $tj,[sp] // tp[0]
|
||||
add $tp,sp,#8
|
||||
ldr $nj,[$np],#8 // np[0]
|
||||
subs $j,$num,#8 // j=num-1 and clear borrow
|
||||
mov $ap,$rp
|
||||
.Lsub:
|
||||
sbcs $aj,$tj,$nj // tp[j]-np[j]
|
||||
ldr $tj,[$tp],#8
|
||||
sub $j,$j,#8 // j--
|
||||
ldr $nj,[$np],#8
|
||||
str $aj,[$ap],#8 // rp[j]=tp[j]-np[j]
|
||||
cbnz $j,.Lsub
|
||||
|
||||
sbcs $aj,$tj,$nj
|
||||
sbcs $ovf,$ovf,xzr // did it borrow?
|
||||
str $aj,[$ap],#8 // rp[num-1]
|
||||
|
||||
ldr $tj,[sp] // tp[0]
|
||||
add $tp,sp,#8
|
||||
ldr $aj,[$rp],#8 // rp[0]
|
||||
sub $num,$num,#8 // num--
|
||||
nop
|
||||
.Lcond_copy:
|
||||
sub $num,$num,#8 // num--
|
||||
csel $nj,$aj,$tj,cs // did it borrow?
|
||||
ldr $tj,[$tp],#8
|
||||
ldr $aj,[$rp],#8
|
||||
str xzr,[$tp,#-16] // wipe tp
|
||||
str $nj,[$rp,#-16]
|
||||
cbnz $num,.Lcond_copy
|
||||
|
||||
csel $nj,$aj,$tj,cs
|
||||
str xzr,[$tp,#-8] // wipe tp
|
||||
str $nj,[$rp,#-8]
|
||||
|
||||
ldp x19,x20,[x29,#16]
|
||||
mov sp,x29
|
||||
ldp x21,x22,[x29,#32]
|
||||
ldp x23,x24,[x29,#48]
|
||||
ldr x29,[sp],#64
|
||||
ret
|
||||
.size bn_mul_mont,.-bn_mul_mont
|
||||
|
||||
.asciz "Montgomery Multiplication for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
|
||||
.align 4
|
||||
___
|
||||
|
||||
print $code;
|
||||
|
||||
close STDOUT;
|
Loading…
Reference in a new issue