openssl/crypto/bn/asm/armv8-mont.pl

#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# March 2015
#
# "Teaser" Montgomery multiplication module for ARMv8. Needs more
# work. While it does improve RSA sign performance by 20-30% (less for
# longer keys) on most processors, for some reason RSA2048 is not
# faster and RSA4096 goes 15-20% slower on Cortex-A57. Multiplication
# instruction issue rate is limited on processor in question, meaning
# that dedicated squaring procedure is a must. Well, actually all
# contemporary AArch64 processors seem to have limited multiplication
# issue rate, i.e. they can't issue multiplication every cycle, which
# explains moderate improvement coefficients in comparison to
# compiler-generated code. Recall that compiler is instructed to use
# umulh and therefore uses same amount of multiplication instructions
# to do the job. Assembly's edge is to minimize number of "collateral"
# instructions and of course instruction scheduling.

# Command line: the first argument is the perlasm "flavour", the second
# is the output file name.  Both are passed on to arm-xlate.pl as-is.
$flavour = shift;
$output  = shift;

# Locate the translator: first in the directory this script was invoked
# from, then in ../../perlasm/ relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

# From here on everything written to STDOUT is piped through the
# translator, run with the same perl binary ($^X).  Abort if the pipe
# cannot be started instead of silently printing into a dead handle.
open OUT,"| \"$^X\" $xlate $flavour $output"
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

# Symbolic names for the registers used by the generated code.

# int bn_mul_mont(
#	BN_ULONG *rp,		-> x0
#	const BN_ULONG *ap,	-> x1
#	const BN_ULONG *bp,	-> x2
#	const BN_ULONG *np,	-> x3
#	const BN_ULONG *n0,	-> x4
#	int num);		-> x5
($rp,$ap,$bp,$np,$n0,$num) = map { "x$_" } 0..5;

# Working registers: x6-x17 and x19-x23.  x18 is deliberately skipped
# (presumably because it is the platform register -- confirm against
# the target ABI).
($lo0,$hi0,$aj,$m0,$alo,$ahi) = map { "x$_" } 6..11;
($lo1,$hi1,$nj,$m1,$nlo,$nhi) = map { "x$_" } 12..17;
($ovf,$i,$j,$tp,$tj)          = map { "x$_" } 19..23;

# Append the AArch64 implementation of bn_mul_mont to $code.
#
# Layout of the generated routine:
#   * prologue: saves x29/x30 and x19-x24 in a 64-byte frame;
#   * "alloca": carves num*8 bytes (rounded down to 16-byte alignment)
#     below sp for the temporary vector tp[];
#   * first pass (.L1st): tp = ap*bp[0] + np*m1, without reading tp;
#   * outer loop (.Louter/.Linner): one pass per remaining bp[i],
#     accumulating into tp and tracking the top carry bit in $ovf;
#   * final step (.Lsub/.Lcond_copy): rp = tp - np, then the original
#     tp is copied back over rp if the subtraction borrowed; tp is
#     zeroed while it is being read;
#   * epilogue: restores registers and returns 1 in x0.
#
# $num is scaled by 8 right after the alloca, so all loop counters
# ($i, $j, $num) count bytes and are stepped by 8.
$code.=<<___;
.text

.globl	bn_mul_mont
.type	bn_mul_mont,%function
.align	5
bn_mul_mont:
	stp	x29,x30,[sp,#-64]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]

	ldr	$m0,[$bp],#8		// bp[0]
	sub	$tp,sp,$num,lsl#3
	ldp	$hi0,$aj,[$ap],#16	// ap[0..1]
	lsl	$num,$num,#3
	ldr	$n0,[$n0]		// *n0
	and	$tp,$tp,#-16		// ABI says so
	ldp	$hi1,$nj,[$np],#16	// np[0..1]

	mul	$lo0,$hi0,$m0		// ap[0]*bp[0]
	sub	$j,$num,#16		// j=num-2
	umulh	$hi0,$hi0,$m0
	mul	$alo,$aj,$m0		// ap[1]*bp[0]
	umulh	$ahi,$aj,$m0

	mul	$m1,$lo0,$n0		// "tp[0]"*n0
	mov	sp,$tp			// alloca

	mul	$lo1,$hi1,$m1		// np[0]*m1
	umulh	$hi1,$hi1,$m1
	mul	$nlo,$nj,$m1		// np[1]*m1
	adds	$lo1,$lo1,$lo0		// discarded
	umulh	$nhi,$nj,$m1
	adc	$hi1,$hi1,xzr
	cbz	$j,.L1st_skip

.L1st:
	ldr	$aj,[$ap],#8
	adds	$lo0,$alo,$hi0
	sub	$j,$j,#8		// j--
	adc	$hi0,$ahi,xzr

	ldr	$nj,[$np],#8
	adds	$lo1,$nlo,$hi1
	mul	$alo,$aj,$m0		// ap[j]*bp[0]
	adc	$hi1,$nhi,xzr
	umulh	$ahi,$aj,$m0

	adds	$lo1,$lo1,$lo0
	mul	$nlo,$nj,$m1		// np[j]*m1
	adc	$hi1,$hi1,xzr
	umulh	$nhi,$nj,$m1
	str	$lo1,[$tp],#8		// tp[j-1]
	cbnz	$j,.L1st

.L1st_skip:
	adds	$lo0,$alo,$hi0
	sub	$ap,$ap,$num		// rewind $ap
	adc	$hi0,$ahi,xzr

	adds	$lo1,$nlo,$hi1
	sub	$np,$np,$num		// rewind $np
	adc	$hi1,$nhi,xzr

	adds	$lo1,$lo1,$lo0
	sub	$i,$num,#8		// i=num-1
	adcs	$hi1,$hi1,$hi0

	adc	$ovf,xzr,xzr		// upmost overflow bit
	stp	$lo1,$hi1,[$tp]

.Louter:
	ldr	$m0,[$bp],#8		// bp[i]
	ldp	$hi0,$aj,[$ap],#16
	ldr	$tj,[sp]		// tp[0]
	add	$tp,sp,#8

	mul	$lo0,$hi0,$m0		// ap[0]*bp[i]
	sub	$j,$num,#16		// j=num-2
	umulh	$hi0,$hi0,$m0
	ldp	$hi1,$nj,[$np],#16
	mul	$alo,$aj,$m0		// ap[1]*bp[i]
	adds	$lo0,$lo0,$tj
	umulh	$ahi,$aj,$m0
	adc	$hi0,$hi0,xzr

	mul	$m1,$lo0,$n0
	sub	$i,$i,#8		// i--

	mul	$lo1,$hi1,$m1		// np[0]*m1
	umulh	$hi1,$hi1,$m1
	mul	$nlo,$nj,$m1		// np[1]*m1
	adds	$lo1,$lo1,$lo0		// carry is consumed by the next adc
	umulh	$nhi,$nj,$m1
	cbz	$j,.Linner_skip

.Linner:
	ldr	$aj,[$ap],#8
	adc	$hi1,$hi1,xzr
	ldr	$tj,[$tp],#8		// tp[j]
	adds	$lo0,$alo,$hi0
	sub	$j,$j,#8		// j--
	adc	$hi0,$ahi,xzr

	adds	$lo1,$nlo,$hi1
	ldr	$nj,[$np],#8
	adc	$hi1,$nhi,xzr

	mul	$alo,$aj,$m0		// ap[j]*bp[i]
	adds	$lo0,$lo0,$tj
	umulh	$ahi,$aj,$m0
	adc	$hi0,$hi0,xzr

	mul	$nlo,$nj,$m1		// np[j]*m1
	adds	$lo1,$lo1,$lo0
	umulh	$nhi,$nj,$m1
	str	$lo1,[$tp,#-16]		// tp[j-1]
	cbnz	$j,.Linner

.Linner_skip:
	ldr	$tj,[$tp],#8		// tp[j]
	adc	$hi1,$hi1,xzr
	adds	$lo0,$alo,$hi0
	sub	$ap,$ap,$num		// rewind $ap
	adc	$hi0,$ahi,xzr

	adds	$lo1,$nlo,$hi1
	sub	$np,$np,$num		// rewind $np
	adcs	$hi1,$nhi,$ovf		// high word + carry can be all-ones,
	adc	$ovf,xzr,xzr		// so keep the carry out of this add

	adds	$lo0,$lo0,$tj
	adc	$hi0,$hi0,xzr

	adds	$lo1,$lo1,$lo0
	adcs	$hi1,$hi1,$hi0
	adc	$ovf,$ovf,xzr		// upmost overflow bit
	stp	$lo1,$hi1,[$tp,#-16]

	cbnz	$i,.Louter

	// Final step. We see if result is larger than modulus, and
	// if it is, subtract the modulus. But comparison implies
	// subtraction. So we subtract modulus, see if it borrowed,
	// and conditionally copy original value.
	ldr	$tj,[sp]		// tp[0]
	add	$tp,sp,#8
	ldr	$nj,[$np],#8		// np[0]
	subs	$j,$num,#8		// j=num-1 and clear borrow
	mov	$ap,$rp
.Lsub:
	sbcs	$aj,$tj,$nj		// tp[j]-np[j]
	ldr	$tj,[$tp],#8
	sub	$j,$j,#8		// j--
	ldr	$nj,[$np],#8
	str	$aj,[$ap],#8		// rp[j]=tp[j]-np[j]
	cbnz	$j,.Lsub

	sbcs	$aj,$tj,$nj
	sbcs	$ovf,$ovf,xzr		// did it borrow?
	str	$aj,[$ap],#8		// rp[num-1]

	ldr	$tj,[sp]		// tp[0]
	add	$tp,sp,#8
	ldr	$aj,[$rp],#8		// rp[0]
	sub	$num,$num,#8		// num--
	nop
.Lcond_copy:
	sub	$num,$num,#8		// num--
	csel	$nj,$aj,$tj,cs		// did it borrow?
	ldr	$tj,[$tp],#8
	ldr	$aj,[$rp],#8
	str	xzr,[$tp,#-16]		// wipe tp
	str	$nj,[$rp,#-16]
	cbnz	$num,.Lcond_copy

	csel	$nj,$aj,$tj,cs
	str	xzr,[$tp,#-8]		// wipe tp
	str	$nj,[$rp,#-8]

	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1			// int return value: 1
	ldp	x23,x24,[x29,#48]
	ldr	x29,[sp],#64
	ret
.size	bn_mul_mont,.-bn_mul_mont

.asciz	"Montgomery Multiplication for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align	4
___

# Send the generated assembly to STDOUT, which at this point is the
# pipe into arm-xlate.pl.
print $code;

# Closing a piped handle waits for the child and reports failure if a
# write failed or the child exited non-zero; turn that into a hard
# error so a broken translation does not pass unnoticed.
close STDOUT or die "error closing STDOUT: $! (exit status $?)";
Add ARMv8 Montgomery multiplication module. Reviewed-by: Rich Salz <rsalz@openssl.org> 2015-03-21 12:54:17 +00:00			`#!/usr/bin/env perl`

			`# ====================================================================`
			`# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL`
			`# project. The module is, however, dual licensed under OpenSSL and`
			`# CRYPTOGAMS licenses depending on where you obtain it. For further`
			`# details see http://www.openssl.org/~appro/cryptogams/.`
			`# ====================================================================`

			`# March 2015`
			`#`
			`# "Teaser" Montgomery multiplication module for ARMv8. Needs more`
			`# work. While it does improve RSA sign performance by 20-30% (less for`
			`# longer keys) on most processors, for some reason RSA2048 is not`
			`# faster and RSA4096 goes 15-20% slower on Cortex-A57. Multiplication`
			`# instruction issue rate is limited on processor in question, meaning`
			`# that dedicated squaring procedure is a must. Well, actually all`
			`# contemporary AArch64 processors seem to have limited multiplication`
			`# issue rate, i.e. they can't issue multiplication every cycle, which`
			`# explains moderate improvement coefficients in comparison to`
			`# compiler-generated code. Recall that compiler is instructed to use`
			`# umulh and therefore uses same amount of multiplication instructions`
			`# to do the job. Assembly's edge is to minimize number of "collateral"`
			`# instructions and of course instruction scheduling.`

			`$flavour = shift;`
			`$output = shift;`

			`$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;`
			`( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or`
			`( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or`
			`die "can't locate arm-xlate.pl";`

			`open OUT,"\| \"$^X\" $xlate $flavour $output";`
			`STDOUT=OUT;`

			`($lo0,$hi0,$aj,$m0,$alo,$ahi,`
			`$lo1,$hi1,$nj,$m1,$nlo,$nhi,`
			`$ovf, $i,$j,$tp,$tj) = map("x$_",6..17,19..24);`

			`# int bn_mul_mont(`
			`$rp="x0"; # BN_ULONG *rp,`
			`$ap="x1"; # const BN_ULONG *ap,`
			`$bp="x2"; # const BN_ULONG *bp,`
			`$np="x3"; # const BN_ULONG *np,`
			`$n0="x4"; # const BN_ULONG *n0,`
			`$num="x5"; # int num);`

			`$code.=<<___;`
			`.text`

			`.globl bn_mul_mont`
			`.type bn_mul_mont,%function`
			`.align 5`
			`bn_mul_mont:`
			`stp x29,x30,[sp,#-64]!`
			`add x29,sp,#0`
			`stp x19,x20,[sp,#16]`
			`stp x21,x22,[sp,#32]`
			`stp x23,x24,[sp,#48]`

			`ldr $m0,[$bp],#8 // bp[0]`
			`sub $tp,sp,$num,lsl#3`
			`ldp $hi0,$aj,[$ap],#16 // ap[0..1]`
			`lsl $num,$num,#3`
			`ldr $n0,[$n0] // *n0`
			`and $tp,$tp,#-16 // ABI says so`
			`ldp $hi1,$nj,[$np],#16 // np[0..1]`

			`mul $lo0,$hi0,$m0 // ap[0]*bp[0]`
			`sub $j,$num,#16 // j=num-2`
			`umulh $hi0,$hi0,$m0`
			`mul $alo,$aj,$m0 // ap[1]*bp[0]`
			`umulh $ahi,$aj,$m0`

			`mul $m1,$lo0,$n0 // "tp[0]"*n0`
			`mov sp,$tp // alloca`

			`mul $lo1,$hi1,$m1 // np[0]*m1`
			`umulh $hi1,$hi1,$m1`
			`mul $nlo,$nj,$m1 // np[1]*m1`
			`adds $lo1,$lo1,$lo0 // discarded`
			`umulh $nhi,$nj,$m1`
			`adc $hi1,$hi1,xzr`
			`cbz $j,.L1st_skip`

			`.L1st:`
			`ldr $aj,[$ap],#8`
			`adds $lo0,$alo,$hi0`
			`sub $j,$j,#8 // j--`
			`adc $hi0,$ahi,xzr`

			`ldr $nj,[$np],#8`
			`adds $lo1,$nlo,$hi1`
			`mul $alo,$aj,$m0 // ap[j]*bp[0]`
			`adc $hi1,$nhi,xzr`
			`umulh $ahi,$aj,$m0`

			`adds $lo1,$lo1,$lo0`
			`mul $nlo,$nj,$m1 // np[j]*m1`
			`adc $hi1,$hi1,xzr`
			`umulh $nhi,$nj,$m1`
			`str $lo1,[$tp],#8 // tp[j-1]`
			`cbnz $j,.L1st`

			`.L1st_skip:`
			`adds $lo0,$alo,$hi0`
			`sub $ap,$ap,$num // rewind $ap`
			`adc $hi0,$ahi,xzr`

			`adds $lo1,$nlo,$hi1`
			`sub $np,$np,$num // rewind $np`
			`adc $hi1,$nhi,xzr`

			`adds $lo1,$lo1,$lo0`
			`sub $i,$num,#8 // i=num-1`
			`adcs $hi1,$hi1,$hi0`

			`adc $ovf,xzr,xzr // upmost overflow bit`
			`stp $lo1,$hi1,[$tp]`

			`.Louter:`
			`ldr $m0,[$bp],#8 // bp[i]`
			`ldp $hi0,$aj,[$ap],#16`
			`ldr $tj,[sp] // tp[0]`
			`add $tp,sp,#8`

			`mul $lo0,$hi0,$m0 // ap[0]*bp[i]`
			`sub $j,$num,#16 // j=num-2`
			`umulh $hi0,$hi0,$m0`
			`ldp $hi1,$nj,[$np],#16`
			`mul $alo,$aj,$m0 // ap[1]*bp[i]`
			`adds $lo0,$lo0,$tj`
			`umulh $ahi,$aj,$m0`
			`adc $hi0,$hi0,xzr`

			`mul $m1,$lo0,$n0`
			`sub $i,$i,#8 // i--`

			`mul $lo1,$hi1,$m1 // np[0]*m1`
			`umulh $hi1,$hi1,$m1`
			`mul $nlo,$nj,$m1 // np[1]*m1`
			`adds $lo1,$lo1,$lo0`
			`umulh $nhi,$nj,$m1`
			`cbz $j,.Linner_skip`

			`.Linner:`
			`ldr $aj,[$ap],#8`
			`adc $hi1,$hi1,xzr`
			`ldr $tj,[$tp],#8 // tp[j]`
			`adds $lo0,$alo,$hi0`
			`sub $j,$j,#8 // j--`
			`adc $hi0,$ahi,xzr`

			`adds $lo1,$nlo,$hi1`
			`ldr $nj,[$np],#8`
			`adc $hi1,$nhi,xzr`

			`mul $alo,$aj,$m0 // ap[j]*bp[i]`
			`adds $lo0,$lo0,$tj`
			`umulh $ahi,$aj,$m0`
			`adc $hi0,$hi0,xzr`

			`mul $nlo,$nj,$m1 // np[j]*m1`
			`adds $lo1,$lo1,$lo0`
			`umulh $nhi,$nj,$m1`
			`str $lo1,[$tp,#-16] // tp[j-1]`
			`cbnz $j,.Linner`

			`.Linner_skip:`
			`ldr $tj,[$tp],#8 // tp[j]`
			`adc $hi1,$hi1,xzr`
			`adds $lo0,$alo,$hi0`
			`sub $ap,$ap,$num // rewind $ap`
			`adc $hi0,$ahi,xzr`

			`adds $lo1,$nlo,$hi1`
			`sub $np,$np,$num // rewind $np`
			`adc $hi1,$nhi,$ovf`

			`adds $lo0,$lo0,$tj`
			`adc $hi0,$hi0,xzr`

			`adds $lo1,$lo1,$lo0`
			`adcs $hi1,$hi1,$hi0`
			`adc $ovf,xzr,xzr // upmost overflow bit`
			`stp $lo1,$hi1,[$tp,#-16]`

			`cbnz $i,.Louter`

			`// Final step. We see if result is larger than modulus, and`
			`// if it is, subtract the modulus. But comparison implies`
			`// subtraction. So we subtract modulus, see if it borrowed,`
			`// and conditionally copy original value.`
			`ldr $tj,[sp] // tp[0]`
			`add $tp,sp,#8`
			`ldr $nj,[$np],#8 // np[0]`
			`subs $j,$num,#8 // j=num-1 and clear borrow`
			`mov $ap,$rp`
			`.Lsub:`
			`sbcs $aj,$tj,$nj // tp[j]-np[j]`
			`ldr $tj,[$tp],#8`
			`sub $j,$j,#8 // j--`
			`ldr $nj,[$np],#8`
			`str $aj,[$ap],#8 // rp[j]=tp[j]-np[j]`
			`cbnz $j,.Lsub`

			`sbcs $aj,$tj,$nj`
			`sbcs $ovf,$ovf,xzr // did it borrow?`
			`str $aj,[$ap],#8 // rp[num-1]`

			`ldr $tj,[sp] // tp[0]`
			`add $tp,sp,#8`
			`ldr $aj,[$rp],#8 // rp[0]`
			`sub $num,$num,#8 // num--`
			`nop`
			`.Lcond_copy:`
			`sub $num,$num,#8 // num--`
			`csel $nj,$aj,$tj,cs // did it borrow?`
			`ldr $tj,[$tp],#8`
			`ldr $aj,[$rp],#8`
			`str xzr,[$tp,#-16] // wipe tp`
			`str $nj,[$rp,#-16]`
			`cbnz $num,.Lcond_copy`

			`csel $nj,$aj,$tj,cs`
			`str xzr,[$tp,#-8] // wipe tp`
			`str $nj,[$rp,#-8]`

			`ldp x19,x20,[x29,#16]`
			`mov sp,x29`
			`ldp x21,x22,[x29,#32]`
			`ldp x23,x24,[x29,#48]`
			`ldr x29,[sp],#64`
			`ret`
			`.size bn_mul_mont,.-bn_mul_mont`

			`.asciz "Montgomery Multiplication for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"`
			`.align 4`
			`___`

			`print $code;`

			`close STDOUT;`