This commit completes recent modular exponentiation optimizations on
x86_64 platform. It targets specifically RSA1024 sign (using ideas from http://eprint.iacr.org/2011/239) and adds more than 10% on most platforms. Overall performance improvement relative to 1.0.0 is ~40% in average, with best result of 54% on Westmere. Incidentally ~40% is average improvement even for longer key lengths.
This commit is contained in:
parent
20735f4c81
commit
361512da0d
6 changed files with 1027 additions and 81 deletions
|
@ -127,7 +127,7 @@ my $x86_asm="x86cpuid.o:bn-586.o co-586.o x86-mont.o x86-gf2m.o:des-586.o crypt5
|
|||
|
||||
my $x86_elf_asm="$x86_asm:elf";
|
||||
|
||||
my $x86_64_asm="x86_64cpuid.o:x86_64-gcc.o x86_64-mont.o x86_64-gf2m.o modexp512-x86_64.o::aes-x86_64.o aesni-x86_64.o::md5-x86_64.o:sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o::rc4-x86_64.o:::wp-x86_64.o:cmll-x86_64.o cmll_misc.o:ghash-x86_64.o";
|
||||
my $x86_64_asm="x86_64cpuid.o:x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o::aes-x86_64.o aesni-x86_64.o::md5-x86_64.o:sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o::rc4-x86_64.o:::wp-x86_64.o:cmll-x86_64.o cmll_misc.o:ghash-x86_64.o";
|
||||
my $ia64_asm="ia64cpuid.o:bn-ia64.o ia64-mont.o::aes_core.o aes_cbc.o aes-ia64.o::md5-ia64.o:sha1-ia64.o sha256-ia64.o sha512-ia64.o::rc4-ia64.o rc4_skey.o:::::ghash-ia64.o:void";
|
||||
my $sparcv9_asm="sparcv9cap.o sparccpuid.o:bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o:des_enc-sparc.o fcrypt_b.o:aes_core.o aes_cbc.o aes-sparcv9.o:::sha1-sparcv9.o sha256-sparcv9.o sha512-sparcv9.o:::::::ghash-sparcv9.o:void";
|
||||
my $sparcv8_asm=":sparcv8.o:des_enc-sparc.o fcrypt_b.o::::::::::::void";
|
||||
|
@ -513,9 +513,9 @@ my %table=(
|
|||
#
|
||||
# Win64 targets, WIN64I denotes IA-64 and WIN64A - AMD64
|
||||
"VC-WIN64I","cl:-W3 -Gs0 -Gy -nologo -DOPENSSL_SYSNAME_WIN32 -DWIN32_LEAN_AND_MEAN -DL_ENDIAN -DUNICODE -D_UNICODE -D_CRT_SECURE_NO_DEPRECATE:::WIN64I::SIXTY_FOUR_BIT RC4_CHUNK_LL DES_INT EXPORT_VAR_AS_FN:ia64cpuid.o:ia64.o ia64-mont.o::aes_core.o aes_cbc.o aes-ia64.o::md5-ia64.o:sha1-ia64.o sha256-ia64.o sha512-ia64.o:::::::ghash-ia64.o:ias:win32",
|
||||
"VC-WIN64A","cl:-W3 -Gs0 -Gy -nologo -DOPENSSL_SYSNAME_WIN32 -DWIN32_LEAN_AND_MEAN -DL_ENDIAN -DUNICODE -D_UNICODE -D_CRT_SECURE_NO_DEPRECATE:::WIN64A::SIXTY_FOUR_BIT RC4_CHUNK_LL DES_INT EXPORT_VAR_AS_FN:x86_64cpuid.o:bn_asm.o x86_64-mont.o::aes-x86_64.o aesni-x86_64.o::md5-x86_64.o:sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o::rc4-x86_64.o:::wp-x86_64.o:cmll-x86_64.o cmll_misc.o:ghash-x86_64.o:auto:win32",
|
||||
"VC-WIN64A","cl:-W3 -Gs0 -Gy -nologo -DOPENSSL_SYSNAME_WIN32 -DWIN32_LEAN_AND_MEAN -DL_ENDIAN -DUNICODE -D_UNICODE -D_CRT_SECURE_NO_DEPRECATE:::WIN64A::SIXTY_FOUR_BIT RC4_CHUNK_LL DES_INT EXPORT_VAR_AS_FN:".eval{my $asm=$x86_64_asm;$asm=~s/x86_64-gcc\.o/bn_asm.o/;$asm}.":auto:win32",
|
||||
"debug-VC-WIN64I","cl:-W3 -Gs0 -Gy -Zi -nologo -DOPENSSL_SYSNAME_WIN32 -DWIN32_LEAN_AND_MEAN -DL_ENDIAN -DUNICODE -D_UNICODE -D_CRT_SECURE_NO_DEPRECATE:::WIN64I::SIXTY_FOUR_BIT RC4_CHUNK_LL DES_INT EXPORT_VAR_AS_FN:ia64cpuid.o:ia64.o::aes_core.o aes_cbc.o aes-ia64.o::md5-ia64.o:sha1-ia64.o sha256-ia64.o sha512-ia64.o:::::::ghash-ia64.o:ias:win32",
|
||||
"debug-VC-WIN64A","cl:-W3 -Gs0 -Gy -Zi -nologo -DOPENSSL_SYSNAME_WIN32 -DWIN32_LEAN_AND_MEAN -DL_ENDIAN -DUNICODE -D_UNICODE -D_CRT_SECURE_NO_DEPRECATE:::WIN64A::SIXTY_FOUR_BIT RC4_CHUNK_LL DES_INT EXPORT_VAR_AS_FN:x86_64cpuid.o:bn_asm.o x86_64-mont.o::aes-x86_64.o::md5-x86_64.o:sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o::rc4-x86_64.o:::wp-x86_64.o:cmll-x86_64.o cmll_misc.o:ghash-x86_64.o:auto:win32",
|
||||
"debug-VC-WIN64A","cl:-W3 -Gs0 -Gy -Zi -nologo -DOPENSSL_SYSNAME_WIN32 -DWIN32_LEAN_AND_MEAN -DL_ENDIAN -DUNICODE -D_UNICODE -D_CRT_SECURE_NO_DEPRECATE:::WIN64A::SIXTY_FOUR_BIT RC4_CHUNK_LL DES_INT EXPORT_VAR_AS_FN:".eval{my $asm=$x86_64_asm;$asm=~s/x86_64-gcc\.o/bn_asm.o/;$asm}.":auto:win32",
|
||||
# x86 Win32 target defaults to ANSI API, if you want UNICODE, complement
|
||||
# 'perl Configure VC-WIN32' with '-DUNICODE -D_UNICODE'
|
||||
"VC-WIN32","cl:-W3 -Gs0 -GF -Gy -nologo -DOPENSSL_SYSNAME_WIN32 -DWIN32_LEAN_AND_MEAN -DL_ENDIAN -D_CRT_SECURE_NO_DEPRECATE:::WIN32::BN_LLONG RC4_INDEX EXPORT_VAR_AS_FN ${x86_gcc_opts}:${x86_asm}:win32n:win32",
|
||||
|
@ -1510,6 +1510,7 @@ $cflags.=" -DOPENSSL_BN_ASM_PART_WORDS" if ($bn_obj =~ /bn-586/);
|
|||
$cflags.=" -DOPENSSL_IA32_SSE2" if (!$no_sse2 && $bn_obj =~ /86/);
|
||||
|
||||
$cflags.=" -DOPENSSL_BN_ASM_MONT" if ($bn_obj =~ /-mont/);
|
||||
$cflags.=" -DOPENSSL_BN_ASM_MONT5" if ($bn_obj =~ /-mont5/);
|
||||
$cflags.=" -DOPENSSL_BN_ASM_GF2m" if ($bn_obj =~ /-gf2m/);
|
||||
|
||||
if ($fips)
|
||||
|
|
24
TABLE
24
TABLE
|
@ -297,7 +297,7 @@ $sys_id =
|
|||
$lflags =
|
||||
$bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL
|
||||
$cpuid_obj = x86_64cpuid.o
|
||||
$bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-gf2m.o modexp512-x86_64.o
|
||||
$bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o
|
||||
$des_obj =
|
||||
$aes_obj = aes-x86_64.o aesni-x86_64.o
|
||||
$bf_obj =
|
||||
|
@ -777,7 +777,7 @@ $sys_id = WIN64A
|
|||
$lflags =
|
||||
$bn_ops = SIXTY_FOUR_BIT RC4_CHUNK_LL DES_INT EXPORT_VAR_AS_FN
|
||||
$cpuid_obj = x86_64cpuid.o
|
||||
$bn_obj = bn_asm.o x86_64-mont.o
|
||||
$bn_obj = bn_asm.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o
|
||||
$des_obj =
|
||||
$aes_obj = aes-x86_64.o aesni-x86_64.o
|
||||
$bf_obj =
|
||||
|
@ -1385,7 +1385,7 @@ $sys_id = MACOSX
|
|||
$lflags = -Wl,-search_paths_first%
|
||||
$bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL
|
||||
$cpuid_obj = x86_64cpuid.o
|
||||
$bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-gf2m.o modexp512-x86_64.o
|
||||
$bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o
|
||||
$des_obj =
|
||||
$aes_obj = aes-x86_64.o aesni-x86_64.o
|
||||
$bf_obj =
|
||||
|
@ -1545,9 +1545,9 @@ $sys_id = WIN64A
|
|||
$lflags =
|
||||
$bn_ops = SIXTY_FOUR_BIT RC4_CHUNK_LL DES_INT EXPORT_VAR_AS_FN
|
||||
$cpuid_obj = x86_64cpuid.o
|
||||
$bn_obj = bn_asm.o x86_64-mont.o
|
||||
$bn_obj = bn_asm.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o
|
||||
$des_obj =
|
||||
$aes_obj = aes-x86_64.o
|
||||
$aes_obj = aes-x86_64.o aesni-x86_64.o
|
||||
$bf_obj =
|
||||
$md5_obj = md5-x86_64.o
|
||||
$sha1_obj = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o
|
||||
|
@ -2313,7 +2313,7 @@ $sys_id =
|
|||
$lflags = -ldl
|
||||
$bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL
|
||||
$cpuid_obj = x86_64cpuid.o
|
||||
$bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-gf2m.o modexp512-x86_64.o
|
||||
$bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o
|
||||
$des_obj =
|
||||
$aes_obj = aes-x86_64.o aesni-x86_64.o
|
||||
$bf_obj =
|
||||
|
@ -2505,7 +2505,7 @@ $sys_id =
|
|||
$lflags = -ldl
|
||||
$bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL
|
||||
$cpuid_obj = x86_64cpuid.o
|
||||
$bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-gf2m.o modexp512-x86_64.o
|
||||
$bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o
|
||||
$des_obj =
|
||||
$aes_obj = aes-x86_64.o aesni-x86_64.o
|
||||
$bf_obj =
|
||||
|
@ -2569,7 +2569,7 @@ $sys_id =
|
|||
$lflags = -ldl
|
||||
$bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL
|
||||
$cpuid_obj = x86_64cpuid.o
|
||||
$bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-gf2m.o modexp512-x86_64.o
|
||||
$bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o
|
||||
$des_obj =
|
||||
$aes_obj = aes-x86_64.o aesni-x86_64.o
|
||||
$bf_obj =
|
||||
|
@ -4073,7 +4073,7 @@ $sys_id =
|
|||
$lflags = -ldl
|
||||
$bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL
|
||||
$cpuid_obj = x86_64cpuid.o
|
||||
$bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-gf2m.o modexp512-x86_64.o
|
||||
$bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o
|
||||
$des_obj =
|
||||
$aes_obj = aes-x86_64.o aesni-x86_64.o
|
||||
$bf_obj =
|
||||
|
@ -4233,7 +4233,7 @@ $sys_id = MINGW64
|
|||
$lflags = -lws2_32 -lgdi32 -lcrypt32
|
||||
$bn_ops = SIXTY_FOUR_BIT RC4_CHUNK_LL DES_INT EXPORT_VAR_AS_FN
|
||||
$cpuid_obj = x86_64cpuid.o
|
||||
$bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-gf2m.o modexp512-x86_64.o
|
||||
$bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o
|
||||
$des_obj =
|
||||
$aes_obj = aes-x86_64.o aesni-x86_64.o
|
||||
$bf_obj =
|
||||
|
@ -5193,7 +5193,7 @@ $sys_id =
|
|||
$lflags = -lsocket -lnsl -ldl
|
||||
$bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL
|
||||
$cpuid_obj = x86_64cpuid.o
|
||||
$bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-gf2m.o modexp512-x86_64.o
|
||||
$bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o
|
||||
$des_obj =
|
||||
$aes_obj = aes-x86_64.o aesni-x86_64.o
|
||||
$bf_obj =
|
||||
|
@ -5225,7 +5225,7 @@ $sys_id =
|
|||
$lflags = -lsocket -lnsl -ldl
|
||||
$bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL
|
||||
$cpuid_obj = x86_64cpuid.o
|
||||
$bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-gf2m.o modexp512-x86_64.o
|
||||
$bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o
|
||||
$des_obj =
|
||||
$aes_obj = aes-x86_64.o aesni-x86_64.o
|
||||
$bf_obj =
|
||||
|
|
|
@ -98,6 +98,8 @@ x86_64-gcc.o: asm/x86_64-gcc.c
|
|||
$(CC) $(CFLAGS) -c -o $@ asm/x86_64-gcc.c
|
||||
x86_64-mont.s: asm/x86_64-mont.pl
|
||||
$(PERL) asm/x86_64-mont.pl $(PERLASM_SCHEME) > $@
|
||||
x86_64-mont5.s: asm/x86_64-mont5.pl
|
||||
$(PERL) asm/x86_64-mont5.pl $(PERLASM_SCHEME) > $@
|
||||
x86_64-gf2m.s: asm/x86_64-gf2m.pl
|
||||
$(PERL) asm/x86_64-gf2m.pl $(PERLASM_SCHEME) > $@
|
||||
modexp512-x86_64.s: asm/modexp512-x86_64.pl
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
#!/usr/bin/env perl
|
||||
|
||||
# ====================================================================
|
||||
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
|
||||
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
|
||||
# project. The module is, however, dual licensed under OpenSSL and
|
||||
# CRYPTOGAMS licenses depending on where you obtain it. For further
|
||||
# details see http://www.openssl.org/~appro/cryptogams/.
|
||||
|
@ -142,6 +142,7 @@ $code.=<<___;
|
|||
jne .L1st
|
||||
|
||||
add %rax,$hi1
|
||||
mov ($ap),%rax # ap[0]
|
||||
adc \$0,%rdx
|
||||
add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0]
|
||||
adc \$0,%rdx
|
||||
|
@ -161,7 +162,6 @@ $code.=<<___;
|
|||
.Louter:
|
||||
mov ($bp,$i,8),$m0 # m0=bp[i]
|
||||
xor $j,$j # j=0
|
||||
mov ($ap),%rax # ap[0]
|
||||
mov $n0,$m1
|
||||
mov (%rsp),$lo0
|
||||
mulq $m0 # ap[0]*bp[i]
|
||||
|
@ -208,6 +208,7 @@ $code.=<<___;
|
|||
jne .Linner
|
||||
|
||||
add %rax,$hi1
|
||||
mov ($ap),%rax # ap[0]
|
||||
adc \$0,%rdx
|
||||
add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j]
|
||||
mov (%rsp,$j,8),$lo0
|
||||
|
|
834
crypto/bn/asm/x86_64-mont5.pl
Executable file
834
crypto/bn/asm/x86_64-mont5.pl
Executable file
|
@ -0,0 +1,834 @@
|
|||
#!/usr/bin/env perl
|
||||
|
||||
# ====================================================================
|
||||
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
|
||||
# project. The module is, however, dual licensed under OpenSSL and
|
||||
# CRYPTOGAMS licenses depending on where you obtain it. For further
|
||||
# details see http://www.openssl.org/~appro/cryptogams/.
|
||||
# ====================================================================
|
||||
|
||||
# August 2011.
|
||||
#
|
||||
# Companion to x86_64-mont.pl that optimizes cache-timing attack
|
||||
# countermeasures. The subroutines are produced by replacing bp[i]
|
||||
# references in their x86_64-mont.pl counterparts with cache-neutral
|
||||
# references to powers table computed in BN_mod_exp_mont_consttime.
|
||||
# In addition subroutine that scatters elements of the powers table
|
||||
# is implemented, so that scatter-/gathering can be tuned without
|
||||
# bn_exp.c modifications.
|
||||
|
||||
$flavour = shift;
|
||||
$output = shift;
|
||||
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
|
||||
|
||||
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
|
||||
|
||||
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
|
||||
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
|
||||
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
|
||||
die "can't locate x86_64-xlate.pl";
|
||||
|
||||
open STDOUT,"| $^X $xlate $flavour $output";
|
||||
|
||||
# int bn_mul_mont_gather5(
|
||||
$rp="%rdi"; # BN_ULONG *rp,
|
||||
$ap="%rsi"; # const BN_ULONG *ap,
|
||||
$bp="%rdx"; # const BN_ULONG *bp,
|
||||
$np="%rcx"; # const BN_ULONG *np,
|
||||
$n0="%r8"; # const BN_ULONG *n0,
|
||||
$num="%r9"; # int num,
|
||||
# int idx); # 0 to 2^5-1, "index" in $bp holding
|
||||
# pre-computed powers of a', interlaced
|
||||
# in such manner that b[0] is $bp[idx],
|
||||
# b[1] is [2^5+idx], etc.
|
||||
$lo0="%r10";
|
||||
$hi0="%r11";
|
||||
$hi1="%r13";
|
||||
$i="%r14";
|
||||
$j="%r15";
|
||||
$m0="%rbx";
|
||||
$m1="%rbp";
|
||||
|
||||
$code=<<___;
|
||||
.text
|
||||
|
||||
.globl bn_mul_mont_gather5
|
||||
.type bn_mul_mont_gather5,\@function,6
|
||||
.align 64
|
||||
bn_mul_mont_gather5:
|
||||
test \$3,${num}d
|
||||
jnz .Lmul_enter
|
||||
cmp \$8,${num}d
|
||||
jb .Lmul_enter
|
||||
jmp .Lmul4x_enter
|
||||
|
||||
.align 16
|
||||
.Lmul_enter:
|
||||
mov `($win64?56:8)`(%rsp),%r10d # load 7th argument
|
||||
push %rbx
|
||||
push %rbp
|
||||
push %r12
|
||||
push %r13
|
||||
push %r14
|
||||
push %r15
|
||||
|
||||
mov ${num}d,${num}d
|
||||
lea 2($num),%r11
|
||||
mov %rsp,%rax
|
||||
neg %r11
|
||||
lea (%rsp,%r11,8),%rsp # tp=alloca(8*(num+2))
|
||||
and \$-1024,%rsp # minimize TLB usage
|
||||
|
||||
mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp
|
||||
.Lmul_body:
|
||||
mov $bp,%r12 # reassign $bp
|
||||
___
|
||||
$bp="%r12";
|
||||
$STRIDE=2**5*8; # 5 is "window size"
|
||||
$N=$STRIDE/4; # should match cache line size
|
||||
$code.=<<___;
|
||||
mov %r10,%r11
|
||||
shr \$`log($N/8)/log(2)`,%r10
|
||||
and \$`$N/8-1`,%r11
|
||||
not %r10
|
||||
lea .Lmagic_masks(%rip),%rax
|
||||
and \$`2**5/($N/8)-1`,%r10 # 5 is "window size"
|
||||
lea 96($bp,%r11,8),$bp # pointer within 1st cache line
|
||||
movq 0(%rax,%r10,8),%xmm4 # set of masks denoting which
|
||||
movq 8(%rax,%r10,8),%xmm5 # cache line contains element
|
||||
movq 16(%rax,%r10,8),%xmm6 # denoted by 7th argument
|
||||
movq 24(%rax,%r10,8),%xmm7
|
||||
|
||||
movq `0*$STRIDE/4-96`($bp),%xmm0
|
||||
movq `1*$STRIDE/4-96`($bp),%xmm1
|
||||
pand %xmm4,%xmm0
|
||||
movq `2*$STRIDE/4-96`($bp),%xmm2
|
||||
pand %xmm5,%xmm1
|
||||
movq `3*$STRIDE/4-96`($bp),%xmm3
|
||||
pand %xmm6,%xmm2
|
||||
por %xmm1,%xmm0
|
||||
pand %xmm7,%xmm3
|
||||
por %xmm2,%xmm0
|
||||
lea $STRIDE($bp),$bp
|
||||
por %xmm3,%xmm0
|
||||
|
||||
movq %xmm0,$m0 # m0=bp[0]
|
||||
|
||||
mov ($n0),$n0 # pull n0[0] value
|
||||
mov ($ap),%rax
|
||||
|
||||
xor $i,$i # i=0
|
||||
xor $j,$j # j=0
|
||||
|
||||
movq `0*$STRIDE/4-96`($bp),%xmm0
|
||||
movq `1*$STRIDE/4-96`($bp),%xmm1
|
||||
pand %xmm4,%xmm0
|
||||
movq `2*$STRIDE/4-96`($bp),%xmm2
|
||||
pand %xmm5,%xmm1
|
||||
|
||||
mov $n0,$m1
|
||||
mulq $m0 # ap[0]*bp[0]
|
||||
mov %rax,$lo0
|
||||
mov ($np),%rax
|
||||
|
||||
movq `3*$STRIDE/4-96`($bp),%xmm3
|
||||
pand %xmm6,%xmm2
|
||||
por %xmm1,%xmm0
|
||||
pand %xmm7,%xmm3
|
||||
|
||||
imulq $lo0,$m1 # "tp[0]"*n0
|
||||
mov %rdx,$hi0
|
||||
|
||||
por %xmm2,%xmm0
|
||||
lea $STRIDE($bp),$bp
|
||||
por %xmm3,%xmm0
|
||||
|
||||
mulq $m1 # np[0]*m1
|
||||
add %rax,$lo0 # discarded
|
||||
mov 8($ap),%rax
|
||||
adc \$0,%rdx
|
||||
mov %rdx,$hi1
|
||||
|
||||
lea 1($j),$j # j++
|
||||
jmp .L1st_enter
|
||||
|
||||
.align 16
|
||||
.L1st:
|
||||
add %rax,$hi1
|
||||
mov ($ap,$j,8),%rax
|
||||
adc \$0,%rdx
|
||||
add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0]
|
||||
mov $lo0,$hi0
|
||||
adc \$0,%rdx
|
||||
mov $hi1,-16(%rsp,$j,8) # tp[j-1]
|
||||
mov %rdx,$hi1
|
||||
|
||||
.L1st_enter:
|
||||
mulq $m0 # ap[j]*bp[0]
|
||||
add %rax,$hi0
|
||||
mov ($np,$j,8),%rax
|
||||
adc \$0,%rdx
|
||||
lea 1($j),$j # j++
|
||||
mov %rdx,$lo0
|
||||
|
||||
mulq $m1 # np[j]*m1
|
||||
cmp $num,$j
|
||||
jne .L1st
|
||||
|
||||
movq %xmm0,$m0 # bp[1]
|
||||
|
||||
add %rax,$hi1
|
||||
mov ($ap),%rax # ap[0]
|
||||
adc \$0,%rdx
|
||||
add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0]
|
||||
adc \$0,%rdx
|
||||
mov $hi1,-16(%rsp,$j,8) # tp[j-1]
|
||||
mov %rdx,$hi1
|
||||
mov $lo0,$hi0
|
||||
|
||||
xor %rdx,%rdx
|
||||
add $hi0,$hi1
|
||||
adc \$0,%rdx
|
||||
mov $hi1,-8(%rsp,$num,8)
|
||||
mov %rdx,(%rsp,$num,8) # store upmost overflow bit
|
||||
|
||||
lea 1($i),$i # i++
|
||||
jmp .Louter
|
||||
.align 16
|
||||
.Louter:
|
||||
xor $j,$j # j=0
|
||||
mov $n0,$m1
|
||||
mov (%rsp),$lo0
|
||||
|
||||
movq `0*$STRIDE/4-96`($bp),%xmm0
|
||||
movq `1*$STRIDE/4-96`($bp),%xmm1
|
||||
pand %xmm4,%xmm0
|
||||
movq `2*$STRIDE/4-96`($bp),%xmm2
|
||||
pand %xmm5,%xmm1
|
||||
|
||||
mulq $m0 # ap[0]*bp[i]
|
||||
add %rax,$lo0 # ap[0]*bp[i]+tp[0]
|
||||
mov ($np),%rax
|
||||
adc \$0,%rdx
|
||||
|
||||
movq `3*$STRIDE/4-96`($bp),%xmm3
|
||||
pand %xmm6,%xmm2
|
||||
por %xmm1,%xmm0
|
||||
pand %xmm7,%xmm3
|
||||
|
||||
imulq $lo0,$m1 # tp[0]*n0
|
||||
mov %rdx,$hi0
|
||||
|
||||
por %xmm2,%xmm0
|
||||
lea $STRIDE($bp),$bp
|
||||
por %xmm3,%xmm0
|
||||
|
||||
mulq $m1 # np[0]*m1
|
||||
add %rax,$lo0 # discarded
|
||||
mov 8($ap),%rax
|
||||
adc \$0,%rdx
|
||||
mov 8(%rsp),$lo0 # tp[1]
|
||||
mov %rdx,$hi1
|
||||
|
||||
lea 1($j),$j # j++
|
||||
jmp .Linner_enter
|
||||
|
||||
.align 16
|
||||
.Linner:
|
||||
add %rax,$hi1
|
||||
mov ($ap,$j,8),%rax
|
||||
adc \$0,%rdx
|
||||
add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j]
|
||||
mov (%rsp,$j,8),$lo0
|
||||
adc \$0,%rdx
|
||||
mov $hi1,-16(%rsp,$j,8) # tp[j-1]
|
||||
mov %rdx,$hi1
|
||||
|
||||
.Linner_enter:
|
||||
mulq $m0 # ap[j]*bp[i]
|
||||
add %rax,$hi0
|
||||
mov ($np,$j,8),%rax
|
||||
adc \$0,%rdx
|
||||
add $hi0,$lo0 # ap[j]*bp[i]+tp[j]
|
||||
mov %rdx,$hi0
|
||||
adc \$0,$hi0
|
||||
lea 1($j),$j # j++
|
||||
|
||||
mulq $m1 # np[j]*m1
|
||||
cmp $num,$j
|
||||
jne .Linner
|
||||
|
||||
movq %xmm0,$m0 # bp[i+1]
|
||||
|
||||
add %rax,$hi1
|
||||
mov ($ap),%rax # ap[0]
|
||||
adc \$0,%rdx
|
||||
add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j]
|
||||
mov (%rsp,$j,8),$lo0
|
||||
adc \$0,%rdx
|
||||
mov $hi1,-16(%rsp,$j,8) # tp[j-1]
|
||||
mov %rdx,$hi1
|
||||
|
||||
xor %rdx,%rdx
|
||||
add $hi0,$hi1
|
||||
adc \$0,%rdx
|
||||
add $lo0,$hi1 # pull upmost overflow bit
|
||||
adc \$0,%rdx
|
||||
mov $hi1,-8(%rsp,$num,8)
|
||||
mov %rdx,(%rsp,$num,8) # store upmost overflow bit
|
||||
|
||||
lea 1($i),$i # i++
|
||||
cmp $num,$i
|
||||
jl .Louter
|
||||
|
||||
xor $i,$i # i=0 and clear CF!
|
||||
mov (%rsp),%rax # tp[0]
|
||||
lea (%rsp),$ap # borrow ap for tp
|
||||
mov $num,$j # j=num
|
||||
jmp .Lsub
|
||||
.align 16
|
||||
.Lsub: sbb ($np,$i,8),%rax
|
||||
mov %rax,($rp,$i,8) # rp[i]=tp[i]-np[i]
|
||||
mov 8($ap,$i,8),%rax # tp[i+1]
|
||||
lea 1($i),$i # i++
|
||||
dec $j # doesnn't affect CF!
|
||||
jnz .Lsub
|
||||
|
||||
sbb \$0,%rax # handle upmost overflow bit
|
||||
xor $i,$i
|
||||
and %rax,$ap
|
||||
not %rax
|
||||
mov $rp,$np
|
||||
and %rax,$np
|
||||
mov $num,$j # j=num
|
||||
or $np,$ap # ap=borrow?tp:rp
|
||||
.align 16
|
||||
.Lcopy: # copy or in-place refresh
|
||||
mov ($ap,$i,8),%rax
|
||||
mov $i,(%rsp,$i,8) # zap temporary vector
|
||||
mov %rax,($rp,$i,8) # rp[i]=tp[i]
|
||||
lea 1($i),$i
|
||||
sub \$1,$j
|
||||
jnz .Lcopy
|
||||
|
||||
mov 8(%rsp,$num,8),%rsi # restore %rsp
|
||||
mov \$1,%rax
|
||||
mov (%rsi),%r15
|
||||
mov 8(%rsi),%r14
|
||||
mov 16(%rsi),%r13
|
||||
mov 24(%rsi),%r12
|
||||
mov 32(%rsi),%rbp
|
||||
mov 40(%rsi),%rbx
|
||||
lea 48(%rsi),%rsp
|
||||
.Lmul_epilogue:
|
||||
ret
|
||||
.size bn_mul_mont_gather5,.-bn_mul_mont_gather5
|
||||
___
|
||||
{{{
|
||||
my @A=("%r10","%r11");
|
||||
my @N=("%r13","%rdi");
|
||||
$code.=<<___;
|
||||
.type bn_mul4x_mont_gather5,\@function,6
|
||||
.align 16
|
||||
bn_mul4x_mont_gather5:
|
||||
.Lmul4x_enter:
|
||||
mov `($win64?56:8)`(%rsp),%r10d # load 7th argument
|
||||
push %rbx
|
||||
push %rbp
|
||||
push %r12
|
||||
push %r13
|
||||
push %r14
|
||||
push %r15
|
||||
|
||||
mov ${num}d,${num}d
|
||||
lea 4($num),%r11
|
||||
mov %rsp,%rax # !!!!
|
||||
neg %r11
|
||||
lea (%rsp,%r11,8),%rsp # tp=alloca(8*(num+4))
|
||||
and \$-1024,%rsp # minimize TLB usage
|
||||
|
||||
mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp
|
||||
.Lmul4x_body:
|
||||
mov $rp,16(%rsp,$num,8) # tp[num+2]=$rp
|
||||
mov %rdx,%r12 # reassign $bp
|
||||
___
|
||||
$bp="%r12";
|
||||
$STRIDE=2**5*8; # 5 is "window size"
|
||||
$N=$STRIDE/4; # should match cache line size
|
||||
$code.=<<___;
|
||||
mov %r10,%r11
|
||||
shr \$`log($N/8)/log(2)`,%r10
|
||||
and \$`$N/8-1`,%r11
|
||||
not %r10
|
||||
lea .Lmagic_masks(%rip),%rax
|
||||
and \$`2**5/($N/8)-1`,%r10 # 5 is "window size"
|
||||
lea 96($bp,%r11,8),$bp # pointer within 1st cache line
|
||||
movq 0(%rax,%r10,8),%xmm4 # set of masks denoting which
|
||||
movq 8(%rax,%r10,8),%xmm5 # cache line contains element
|
||||
movq 16(%rax,%r10,8),%xmm6 # denoted by 7th argument
|
||||
movq 24(%rax,%r10,8),%xmm7
|
||||
|
||||
movq `0*$STRIDE/4-96`($bp),%xmm0
|
||||
movq `1*$STRIDE/4-96`($bp),%xmm1
|
||||
pand %xmm4,%xmm0
|
||||
movq `2*$STRIDE/4-96`($bp),%xmm2
|
||||
pand %xmm5,%xmm1
|
||||
movq `3*$STRIDE/4-96`($bp),%xmm3
|
||||
pand %xmm6,%xmm2
|
||||
por %xmm1,%xmm0
|
||||
pand %xmm7,%xmm3
|
||||
por %xmm2,%xmm0
|
||||
lea $STRIDE($bp),$bp
|
||||
por %xmm3,%xmm0
|
||||
|
||||
movq %xmm0,$m0 # m0=bp[0]
|
||||
mov ($n0),$n0 # pull n0[0] value
|
||||
mov ($ap),%rax
|
||||
|
||||
xor $i,$i # i=0
|
||||
xor $j,$j # j=0
|
||||
|
||||
movq `0*$STRIDE/4-96`($bp),%xmm0
|
||||
movq `1*$STRIDE/4-96`($bp),%xmm1
|
||||
pand %xmm4,%xmm0
|
||||
movq `2*$STRIDE/4-96`($bp),%xmm2
|
||||
pand %xmm5,%xmm1
|
||||
|
||||
mov $n0,$m1
|
||||
mulq $m0 # ap[0]*bp[0]
|
||||
mov %rax,$A[0]
|
||||
mov ($np),%rax
|
||||
|
||||
movq `3*$STRIDE/4-96`($bp),%xmm3
|
||||
pand %xmm6,%xmm2
|
||||
por %xmm1,%xmm0
|
||||
pand %xmm7,%xmm3
|
||||
|
||||
imulq $A[0],$m1 # "tp[0]"*n0
|
||||
mov %rdx,$A[1]
|
||||
|
||||
por %xmm2,%xmm0
|
||||
lea $STRIDE($bp),$bp
|
||||
por %xmm3,%xmm0
|
||||
|
||||
mulq $m1 # np[0]*m1
|
||||
add %rax,$A[0] # discarded
|
||||
mov 8($ap),%rax
|
||||
adc \$0,%rdx
|
||||
mov %rdx,$N[1]
|
||||
|
||||
mulq $m0
|
||||
add %rax,$A[1]
|
||||
mov 8($np),%rax
|
||||
adc \$0,%rdx
|
||||
mov %rdx,$A[0]
|
||||
|
||||
mulq $m1
|
||||
add %rax,$N[1]
|
||||
mov 16($ap),%rax
|
||||
adc \$0,%rdx
|
||||
add $A[1],$N[1]
|
||||
lea 4($j),$j # j++
|
||||
adc \$0,%rdx
|
||||
mov $N[1],(%rsp)
|
||||
mov %rdx,$N[0]
|
||||
jmp .L1st4x
|
||||
.align 16
|
||||
.L1st4x:
|
||||
mulq $m0 # ap[j]*bp[0]
|
||||
add %rax,$A[0]
|
||||
mov -16($np,$j,8),%rax
|
||||
adc \$0,%rdx
|
||||
mov %rdx,$A[1]
|
||||
|
||||
mulq $m1 # np[j]*m1
|
||||
add %rax,$N[0]
|
||||
mov -8($ap,$j,8),%rax
|
||||
adc \$0,%rdx
|
||||
add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
|
||||
adc \$0,%rdx
|
||||
mov $N[0],-24(%rsp,$j,8) # tp[j-1]
|
||||
mov %rdx,$N[1]
|
||||
|
||||
mulq $m0 # ap[j]*bp[0]
|
||||
add %rax,$A[1]
|
||||
mov -8($np,$j,8),%rax
|
||||
adc \$0,%rdx
|
||||
mov %rdx,$A[0]
|
||||
|
||||
mulq $m1 # np[j]*m1
|
||||
add %rax,$N[1]
|
||||
mov ($ap,$j,8),%rax
|
||||
adc \$0,%rdx
|
||||
add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
|
||||
adc \$0,%rdx
|
||||
mov $N[1],-16(%rsp,$j,8) # tp[j-1]
|
||||
mov %rdx,$N[0]
|
||||
|
||||
mulq $m0 # ap[j]*bp[0]
|
||||
add %rax,$A[0]
|
||||
mov ($np,$j,8),%rax
|
||||
adc \$0,%rdx
|
||||
mov %rdx,$A[1]
|
||||
|
||||
mulq $m1 # np[j]*m1
|
||||
add %rax,$N[0]
|
||||
mov 8($ap,$j,8),%rax
|
||||
adc \$0,%rdx
|
||||
add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
|
||||
adc \$0,%rdx
|
||||
mov $N[0],-8(%rsp,$j,8) # tp[j-1]
|
||||
mov %rdx,$N[1]
|
||||
|
||||
mulq $m0 # ap[j]*bp[0]
|
||||
add %rax,$A[1]
|
||||
mov 8($np,$j,8),%rax
|
||||
adc \$0,%rdx
|
||||
lea 4($j),$j # j++
|
||||
mov %rdx,$A[0]
|
||||
|
||||
mulq $m1 # np[j]*m1
|
||||
add %rax,$N[1]
|
||||
mov -16($ap,$j,8),%rax
|
||||
adc \$0,%rdx
|
||||
add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
|
||||
adc \$0,%rdx
|
||||
mov $N[1],-32(%rsp,$j,8) # tp[j-1]
|
||||
mov %rdx,$N[0]
|
||||
cmp $num,$j
|
||||
jl .L1st4x
|
||||
|
||||
mulq $m0 # ap[j]*bp[0]
|
||||
add %rax,$A[0]
|
||||
mov -16($np,$j,8),%rax
|
||||
adc \$0,%rdx
|
||||
mov %rdx,$A[1]
|
||||
|
||||
mulq $m1 # np[j]*m1
|
||||
add %rax,$N[0]
|
||||
mov -8($ap,$j,8),%rax
|
||||
adc \$0,%rdx
|
||||
add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
|
||||
adc \$0,%rdx
|
||||
mov $N[0],-24(%rsp,$j,8) # tp[j-1]
|
||||
mov %rdx,$N[1]
|
||||
|
||||
mulq $m0 # ap[j]*bp[0]
|
||||
add %rax,$A[1]
|
||||
mov -8($np,$j,8),%rax
|
||||
adc \$0,%rdx
|
||||
mov %rdx,$A[0]
|
||||
|
||||
mulq $m1 # np[j]*m1
|
||||
add %rax,$N[1]
|
||||
mov ($ap),%rax # ap[0]
|
||||
adc \$0,%rdx
|
||||
add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
|
||||
adc \$0,%rdx
|
||||
mov $N[1],-16(%rsp,$j,8) # tp[j-1]
|
||||
mov %rdx,$N[0]
|
||||
|
||||
movq %xmm0,$m0 # bp[1]
|
||||
|
||||
xor $N[1],$N[1]
|
||||
add $A[0],$N[0]
|
||||
adc \$0,$N[1]
|
||||
mov $N[0],-8(%rsp,$j,8)
|
||||
mov $N[1],(%rsp,$j,8) # store upmost overflow bit
|
||||
|
||||
lea 1($i),$i # i++
|
||||
.align 4
|
||||
.Louter4x:
|
||||
xor $j,$j # j=0
|
||||
movq `0*$STRIDE/4-96`($bp),%xmm0
|
||||
movq `1*$STRIDE/4-96`($bp),%xmm1
|
||||
pand %xmm4,%xmm0
|
||||
movq `2*$STRIDE/4-96`($bp),%xmm2
|
||||
pand %xmm5,%xmm1
|
||||
|
||||
mov (%rsp),$A[0]
|
||||
mov $n0,$m1
|
||||
mulq $m0 # ap[0]*bp[i]
|
||||
add %rax,$A[0] # ap[0]*bp[i]+tp[0]
|
||||
mov ($np),%rax
|
||||
adc \$0,%rdx
|
||||
|
||||
movq `3*$STRIDE/4-96`($bp),%xmm3
|
||||
pand %xmm6,%xmm2
|
||||
por %xmm1,%xmm0
|
||||
pand %xmm7,%xmm3
|
||||
|
||||
imulq $A[0],$m1 # tp[0]*n0
|
||||
mov %rdx,$A[1]
|
||||
|
||||
por %xmm2,%xmm0
|
||||
lea $STRIDE($bp),$bp
|
||||
por %xmm3,%xmm0
|
||||
|
||||
mulq $m1 # np[0]*m1
|
||||
add %rax,$A[0] # "$N[0]", discarded
|
||||
mov 8($ap),%rax
|
||||
adc \$0,%rdx
|
||||
mov %rdx,$N[1]
|
||||
|
||||
mulq $m0 # ap[j]*bp[i]
|
||||
add %rax,$A[1]
|
||||
mov 8($np),%rax
|
||||
adc \$0,%rdx
|
||||
add 8(%rsp),$A[1] # +tp[1]
|
||||
adc \$0,%rdx
|
||||
mov %rdx,$A[0]
|
||||
|
||||
mulq $m1 # np[j]*m1
|
||||
add %rax,$N[1]
|
||||
mov 16($ap),%rax
|
||||
adc \$0,%rdx
|
||||
add $A[1],$N[1] # np[j]*m1+ap[j]*bp[i]+tp[j]
|
||||
lea 4($j),$j # j+=2
|
||||
adc \$0,%rdx
|
||||
mov $N[1],(%rsp) # tp[j-1]
|
||||
mov %rdx,$N[0]
|
||||
jmp .Linner4x
|
||||
.align 16
|
||||
.Linner4x:
|
||||
mulq $m0 # ap[j]*bp[i]
|
||||
add %rax,$A[0]
|
||||
mov -16($np,$j,8),%rax
|
||||
adc \$0,%rdx
|
||||
add -16(%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j]
|
||||
adc \$0,%rdx
|
||||
mov %rdx,$A[1]
|
||||
|
||||
mulq $m1 # np[j]*m1
|
||||
add %rax,$N[0]
|
||||
mov -8($ap,$j,8),%rax
|
||||
adc \$0,%rdx
|
||||
add $A[0],$N[0]
|
||||
adc \$0,%rdx
|
||||
mov $N[0],-24(%rsp,$j,8) # tp[j-1]
|
||||
mov %rdx,$N[1]
|
||||
|
||||
mulq $m0 # ap[j]*bp[i]
|
||||
add %rax,$A[1]
|
||||
mov -8($np,$j,8),%rax
|
||||
adc \$0,%rdx
|
||||
add -8(%rsp,$j,8),$A[1]
|
||||
adc \$0,%rdx
|
||||
mov %rdx,$A[0]
|
||||
|
||||
mulq $m1 # np[j]*m1
|
||||
add %rax,$N[1]
|
||||
mov ($ap,$j,8),%rax
|
||||
adc \$0,%rdx
|
||||
add $A[1],$N[1]
|
||||
adc \$0,%rdx
|
||||
mov $N[1],-16(%rsp,$j,8) # tp[j-1]
|
||||
mov %rdx,$N[0]
|
||||
|
||||
mulq $m0 # ap[j]*bp[i]
|
||||
add %rax,$A[0]
|
||||
mov ($np,$j,8),%rax
|
||||
adc \$0,%rdx
|
||||
add (%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j]
|
||||
adc \$0,%rdx
|
||||
mov %rdx,$A[1]
|
||||
|
||||
mulq $m1 # np[j]*m1
|
||||
add %rax,$N[0]
|
||||
mov 8($ap,$j,8),%rax
|
||||
adc \$0,%rdx
|
||||
add $A[0],$N[0]
|
||||
adc \$0,%rdx
|
||||
mov $N[0],-8(%rsp,$j,8) # tp[j-1]
|
||||
mov %rdx,$N[1]
|
||||
|
||||
mulq $m0 # ap[j]*bp[i]
|
||||
add %rax,$A[1]
|
||||
mov 8($np,$j,8),%rax
|
||||
adc \$0,%rdx
|
||||
add 8(%rsp,$j,8),$A[1]
|
||||
adc \$0,%rdx
|
||||
lea 4($j),$j # j++
|
||||
mov %rdx,$A[0]
|
||||
|
||||
mulq $m1 # np[j]*m1
|
||||
add %rax,$N[1]
|
||||
mov -16($ap,$j,8),%rax
|
||||
adc \$0,%rdx
|
||||
add $A[1],$N[1]
|
||||
adc \$0,%rdx
|
||||
mov $N[1],-32(%rsp,$j,8) # tp[j-1]
|
||||
mov %rdx,$N[0]
|
||||
cmp $num,$j
|
||||
jl .Linner4x
|
||||
|
||||
mulq $m0 # ap[j]*bp[i]
|
||||
add %rax,$A[0]
|
||||
mov -16($np,$j,8),%rax
|
||||
adc \$0,%rdx
|
||||
add -16(%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j]
|
||||
adc \$0,%rdx
|
||||
mov %rdx,$A[1]
|
||||
|
||||
mulq $m1 # np[j]*m1
|
||||
add %rax,$N[0]
|
||||
mov -8($ap,$j,8),%rax
|
||||
adc \$0,%rdx
|
||||
add $A[0],$N[0]
|
||||
adc \$0,%rdx
|
||||
mov $N[0],-24(%rsp,$j,8) # tp[j-1]
|
||||
mov %rdx,$N[1]
|
||||
|
||||
mulq $m0 # ap[j]*bp[i]
|
||||
add %rax,$A[1]
|
||||
mov -8($np,$j,8),%rax
|
||||
adc \$0,%rdx
|
||||
add -8(%rsp,$j,8),$A[1]
|
||||
adc \$0,%rdx
|
||||
lea 1($i),$i # i++
|
||||
mov %rdx,$A[0]
|
||||
|
||||
mulq $m1 # np[j]*m1
|
||||
add %rax,$N[1]
|
||||
mov ($ap),%rax # ap[0]
|
||||
adc \$0,%rdx
|
||||
add $A[1],$N[1]
|
||||
adc \$0,%rdx
|
||||
mov $N[1],-16(%rsp,$j,8) # tp[j-1]
|
||||
mov %rdx,$N[0]
|
||||
|
||||
movq %xmm0,$m0 # bp[i+1]
|
||||
|
||||
xor $N[1],$N[1]
|
||||
add $A[0],$N[0]
|
||||
adc \$0,$N[1]
|
||||
add (%rsp,$num,8),$N[0] # pull upmost overflow bit
|
||||
adc \$0,$N[1]
|
||||
mov $N[0],-8(%rsp,$j,8)
|
||||
mov $N[1],(%rsp,$j,8) # store upmost overflow bit
|
||||
|
||||
cmp $num,$i
|
||||
jl .Louter4x
|
||||
___
|
||||
{
|
||||
my @ri=("%rax","%rdx",$m0,$m1);
|
||||
$code.=<<___;
|
||||
mov 16(%rsp,$num,8),$rp # restore $rp
|
||||
mov 0(%rsp),@ri[0] # tp[0]
|
||||
pxor %xmm0,%xmm0
|
||||
mov 8(%rsp),@ri[1] # tp[1]
|
||||
shr \$2,$num # num/=4
|
||||
lea (%rsp),$ap # borrow ap for tp
|
||||
xor $i,$i # i=0 and clear CF!
|
||||
|
||||
sub 0($np),@ri[0]
|
||||
mov 16($ap),@ri[2] # tp[2]
|
||||
mov 24($ap),@ri[3] # tp[3]
|
||||
sbb 8($np),@ri[1]
|
||||
lea -1($num),$j # j=num/4-1
|
||||
jmp .Lsub4x
|
||||
.align 16
|
||||
.Lsub4x:
|
||||
mov @ri[0],0($rp,$i,8) # rp[i]=tp[i]-np[i]
|
||||
mov @ri[1],8($rp,$i,8) # rp[i]=tp[i]-np[i]
|
||||
sbb 16($np,$i,8),@ri[2]
|
||||
mov 32($ap,$i,8),@ri[0] # tp[i+1]
|
||||
mov 40($ap,$i,8),@ri[1]
|
||||
sbb 24($np,$i,8),@ri[3]
|
||||
mov @ri[2],16($rp,$i,8) # rp[i]=tp[i]-np[i]
|
||||
mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i]
|
||||
sbb 32($np,$i,8),@ri[0]
|
||||
mov 48($ap,$i,8),@ri[2]
|
||||
mov 56($ap,$i,8),@ri[3]
|
||||
sbb 40($np,$i,8),@ri[1]
|
||||
lea 4($i),$i # i++
|
||||
dec $j # doesnn't affect CF!
|
||||
jnz .Lsub4x
|
||||
|
||||
mov @ri[0],0($rp,$i,8) # rp[i]=tp[i]-np[i]
|
||||
mov 32($ap,$i,8),@ri[0] # load overflow bit
|
||||
sbb 16($np,$i,8),@ri[2]
|
||||
mov @ri[1],8($rp,$i,8) # rp[i]=tp[i]-np[i]
|
||||
sbb 24($np,$i,8),@ri[3]
|
||||
mov @ri[2],16($rp,$i,8) # rp[i]=tp[i]-np[i]
|
||||
|
||||
sbb \$0,@ri[0] # handle upmost overflow bit
|
||||
mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i]
|
||||
xor $i,$i # i=0
|
||||
and @ri[0],$ap
|
||||
not @ri[0]
|
||||
mov $rp,$np
|
||||
and @ri[0],$np
|
||||
lea -1($num),$j
|
||||
or $np,$ap # ap=borrow?tp:rp
|
||||
|
||||
movdqu ($ap),%xmm1
|
||||
movdqa %xmm0,(%rsp)
|
||||
movdqu %xmm1,($rp)
|
||||
jmp .Lcopy4x
|
||||
.align 16
|
||||
.Lcopy4x: # copy or in-place refresh
|
||||
movdqu 16($ap,$i),%xmm2
|
||||
movdqu 32($ap,$i),%xmm1
|
||||
movdqa %xmm0,16(%rsp,$i)
|
||||
movdqu %xmm2,16($rp,$i)
|
||||
movdqa %xmm0,32(%rsp,$i)
|
||||
movdqu %xmm1,32($rp,$i)
|
||||
lea 32($i),$i
|
||||
dec $j
|
||||
jnz .Lcopy4x
|
||||
|
||||
shl \$2,$num
|
||||
movdqu 16($ap,$i),%xmm2
|
||||
movdqa %xmm0,16(%rsp,$i)
|
||||
movdqu %xmm2,16($rp,$i)
|
||||
___
|
||||
}
|
||||
$code.=<<___;
|
||||
mov 8(%rsp,$num,8),%rsi # restore %rsp
|
||||
mov \$1,%rax
|
||||
mov (%rsi),%r15
|
||||
mov 8(%rsi),%r14
|
||||
mov 16(%rsi),%r13
|
||||
mov 24(%rsi),%r12
|
||||
mov 32(%rsi),%rbp
|
||||
mov 40(%rsi),%rbx
|
||||
lea 48(%rsi),%rsp
|
||||
.Lmul4x_epilogue:
|
||||
ret
|
||||
.size bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5
|
||||
___
|
||||
}}}
|
||||
|
||||
{
|
||||
my ($inp,$num,$tbl,$idx)=$win64?("%rcx","%rdx","%r8", "%r9") : # Win64 order
|
||||
("%rdi","%rsi","%rdx","%rcx"); # Unix order
|
||||
$code.=<<___;
|
||||
.globl bn_scatter5
|
||||
.type bn_scatter5,\@abi-omnipotent
|
||||
.align 16
|
||||
bn_scatter5:
|
||||
lea ($tbl,$idx,8),$tbl
|
||||
.Lscatter:
|
||||
mov ($inp),%rax
|
||||
lea 8($inp),$inp
|
||||
mov %rax,($tbl)
|
||||
lea 32*8($tbl),$tbl
|
||||
sub \$1,$num
|
||||
jnz .Lscatter
|
||||
ret
|
||||
.size bn_scatter5,.-bn_scatter5
|
||||
___
|
||||
}
|
||||
$code.=<<___;
|
||||
.align 64
|
||||
.Lmagic_masks:
|
||||
.long 0,0, 0,0, 0,0, -1,-1
|
||||
.long 0,0, 0,0, 0,0, 0,0
|
||||
.asciz "Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
|
||||
___
|
||||
|
||||
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
|
||||
|
||||
print $code;
|
||||
close STDOUT;
|
|
@ -114,6 +114,18 @@
|
|||
#include "cryptlib.h"
|
||||
#include "bn_lcl.h"
|
||||
|
||||
#include <stdlib.h>
|
||||
#ifdef _WIN32
|
||||
# include <malloc.h>
|
||||
# ifndef alloca
|
||||
# define alloca _alloca
|
||||
# endif
|
||||
#elif defined(__GNUC__)
|
||||
# ifndef alloca
|
||||
# define alloca(s) __builtin_alloca((s))
|
||||
# endif
|
||||
#endif
|
||||
|
||||
/* maximum precomputation table size for *variable* sliding windows */
|
||||
#define TABLE_SIZE 32
|
||||
|
||||
|
@ -562,7 +574,7 @@ static int MOD_EXP_CTIME_COPY_FROM_PREBUF(BIGNUM *b, int top, unsigned char *buf
|
|||
|
||||
/* Given a pointer value, compute the next address that is a cache line multiple. */
|
||||
#define MOD_EXP_CTIME_ALIGN(x_) \
|
||||
((unsigned char*)(x_) + (MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH - (((BN_ULONG)(x_)) & (MOD_EXP_CTIME_MIN_CACHE_LINE_MASK))))
|
||||
((unsigned char*)(x_) + (MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH - (((size_t)(x_)) & (MOD_EXP_CTIME_MIN_CACHE_LINE_MASK))))
|
||||
|
||||
/* This variant of BN_mod_exp_mont() uses fixed windows and the special
|
||||
* precomputation memory layout to limit data-dependency to a minimum
|
||||
|
@ -573,17 +585,16 @@ static int MOD_EXP_CTIME_COPY_FROM_PREBUF(BIGNUM *b, int top, unsigned char *buf
|
|||
int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
|
||||
const BIGNUM *m, BN_CTX *ctx, BN_MONT_CTX *in_mont)
|
||||
{
|
||||
int i,bits,ret=0,idx,window,wvalue;
|
||||
int i,bits,ret=0,window,wvalue;
|
||||
int top;
|
||||
BIGNUM *r;
|
||||
const BIGNUM *aa;
|
||||
BN_MONT_CTX *mont=NULL;
|
||||
|
||||
int numPowers;
|
||||
unsigned char *powerbufFree=NULL;
|
||||
int powerbufLen = 0;
|
||||
unsigned char *powerbuf=NULL;
|
||||
BIGNUM *computeTemp=NULL, *am=NULL;
|
||||
BIGNUM computeTemp, *am=NULL;
|
||||
|
||||
bn_check_top(a);
|
||||
bn_check_top(p);
|
||||
|
@ -621,57 +632,9 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
|
|||
|
||||
/* Get the window size to use with size of p. */
|
||||
window = BN_window_bits_for_ctime_exponent_size(bits);
|
||||
|
||||
/* Allocate a buffer large enough to hold all of the pre-computed
|
||||
* powers of a.
|
||||
*/
|
||||
numPowers = 1 << window;
|
||||
powerbufLen = sizeof(m->d[0])*top*numPowers;
|
||||
if ((powerbufFree=(unsigned char*)OPENSSL_malloc(powerbufLen+MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH)) == NULL)
|
||||
goto err;
|
||||
|
||||
powerbuf = MOD_EXP_CTIME_ALIGN(powerbufFree);
|
||||
memset(powerbuf, 0, powerbufLen);
|
||||
|
||||
/* Initialize the intermediate result. Do this early to save double conversion,
|
||||
* once each for a^0 and intermediate result.
|
||||
*/
|
||||
if (!BN_to_montgomery(r,BN_value_one(),mont,ctx)) goto err;
|
||||
if (!MOD_EXP_CTIME_COPY_TO_PREBUF(r, top, powerbuf, 0, numPowers)) goto err;
|
||||
|
||||
/* Initialize computeTemp as a^1 with montgomery precalcs */
|
||||
computeTemp = BN_CTX_get(ctx);
|
||||
am = BN_CTX_get(ctx);
|
||||
if (computeTemp==NULL || am==NULL) goto err;
|
||||
|
||||
if (a->neg || BN_ucmp(a,m) >= 0)
|
||||
{
|
||||
if (!BN_mod(am,a,m,ctx))
|
||||
goto err;
|
||||
aa= am;
|
||||
}
|
||||
else
|
||||
aa=a;
|
||||
if (!BN_to_montgomery(am,aa,mont,ctx)) goto err;
|
||||
if (!BN_copy(computeTemp, am)) goto err;
|
||||
if (!MOD_EXP_CTIME_COPY_TO_PREBUF(am, top, powerbuf, 1, numPowers)) goto err;
|
||||
|
||||
/* If the window size is greater than 1, then calculate
|
||||
* val[i=2..2^winsize-1]. Powers are computed as a*a^(i-1)
|
||||
* (even powers could instead be computed as (a^(i/2))^2
|
||||
* to use the slight performance advantage of sqr over mul).
|
||||
*/
|
||||
if (window > 1)
|
||||
{
|
||||
for (i=2; i<numPowers; i++)
|
||||
{
|
||||
/* Calculate a^i = a^(i-1) * a */
|
||||
if (!BN_mod_mul_montgomery(computeTemp,am,computeTemp,mont,ctx))
|
||||
goto err;
|
||||
if (!MOD_EXP_CTIME_COPY_TO_PREBUF(computeTemp, top, powerbuf, i, numPowers)) goto err;
|
||||
}
|
||||
}
|
||||
|
||||
#if defined(OPENSSL_BN_ASM_MONT5)
|
||||
if (window==6 && bits<=1024) window=5; /* ~5% improvement of 2048-bit RSA sign */
|
||||
#endif
|
||||
/* Adjust the number of bits up to a multiple of the window size.
|
||||
* If the exponent length is not a multiple of the window size, then
|
||||
* this pads the most significant bits with zeros to normalize the
|
||||
|
@ -684,28 +647,174 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
|
|||
* * to be taken in the BN_is_bit_set function.
|
||||
*/
|
||||
bits = ((bits+window-1)/window)*window;
|
||||
idx=bits-1; /* The top bit of the window */
|
||||
|
||||
/* Scan the exponent one window at a time starting from the most
|
||||
/* Allocate a buffer large enough to hold all of the pre-computed
|
||||
* powers of a, plus computeTemp.
|
||||
*/
|
||||
numPowers = 1 << window;
|
||||
powerbufLen = sizeof(m->d[0])*(top*numPowers +
|
||||
(top>numPowers?top:numPowers));
|
||||
if (powerbufLen < 3072)
|
||||
powerbufFree = alloca(powerbufLen+MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH);
|
||||
else if ((powerbufFree=(unsigned char*)OPENSSL_malloc(powerbufLen+MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH)) == NULL)
|
||||
goto err;
|
||||
|
||||
powerbuf = MOD_EXP_CTIME_ALIGN(powerbufFree);
|
||||
memset(powerbuf, 0, powerbufLen);
|
||||
|
||||
if (powerbufLen < 3072)
|
||||
powerbufFree = NULL;
|
||||
|
||||
computeTemp.d = (BN_ULONG *)(powerbuf + sizeof(m->d[0])*top*numPowers);
|
||||
computeTemp.top = computeTemp.dmax = top;
|
||||
computeTemp.neg = 0;
|
||||
computeTemp.flags = BN_FLG_STATIC_DATA;
|
||||
|
||||
/* Initialize the intermediate result. Do this early to save double conversion,
|
||||
* once each for a^0 and intermediate result.
|
||||
*/
|
||||
if (!BN_to_montgomery(r,BN_value_one(),mont,ctx)) goto err;
|
||||
|
||||
/* Initialize computeTemp as a^1 with montgomery precalcs */
|
||||
am = BN_CTX_get(ctx);
|
||||
if (am==NULL) goto err;
|
||||
|
||||
if (a->neg || BN_ucmp(a,m) >= 0)
|
||||
{
|
||||
if (!BN_mod(am,a,m,ctx)) goto err;
|
||||
if (!BN_to_montgomery(am,am,mont,ctx)) goto err;
|
||||
}
|
||||
else if (!BN_to_montgomery(am,a,mont,ctx)) goto err;
|
||||
|
||||
if (!BN_copy(&computeTemp, am)) goto err;
|
||||
|
||||
if (bn_wexpand(am,top)==NULL || bn_wexpand(r,top)==NULL)
|
||||
goto err;
|
||||
|
||||
#if defined(OPENSSL_BN_ASM_MONT5)
|
||||
/* This optimization uses ideas from http://eprint.iacr.org/2011/239,
|
||||
* specifically optimization of cache-timing attack countermeasures
|
||||
* and pre-computation optimization. */
|
||||
|
||||
/* Dedicated window==4 case improves 512-bit RSA sign by ~15%, but as
|
||||
* 512-bit RSA is hardly relevant, we omit it to spare size... */
|
||||
if (window==5)
|
||||
{
|
||||
void bn_mul_mont_gather5(BN_ULONG *rp,const BN_ULONG *ap,
|
||||
const void *table,const BN_ULONG *np,
|
||||
const BN_ULONG *n0,int num,int power);
|
||||
void bn_scatter5(const BN_ULONG *inp,size_t num,
|
||||
void *table,size_t power);
|
||||
|
||||
BN_ULONG *acc, *np=mont->N.d, *n0=mont->n0;
|
||||
|
||||
bn_scatter5(r->d,r->top,powerbuf,0);
|
||||
bn_scatter5(am->d,am->top,powerbuf,1);
|
||||
|
||||
acc = computeTemp.d;
|
||||
#if 0
|
||||
for (i=2; i<32; i++)
|
||||
{
|
||||
bn_mul_mont_gather5(acc,am->d,powerbuf,np,n0,top,i-1);
|
||||
bn_scatter5(acc,top,powerbuf,i);
|
||||
}
|
||||
#else
|
||||
/* same as above, but uses squaring for 1/2 of operations */
|
||||
for (i=2; i<32; i*=2)
|
||||
{
|
||||
bn_mul_mont(acc,acc,acc,np,n0,top);
|
||||
bn_scatter5(acc,top,powerbuf,i);
|
||||
}
|
||||
for (i=3; i<8; i+=2)
|
||||
{
|
||||
int j;
|
||||
bn_mul_mont_gather5(acc,am->d,powerbuf,np,n0,top,i-1);
|
||||
bn_scatter5(acc,top,powerbuf,i);
|
||||
for (j=2*i; j<32; j*=2)
|
||||
{
|
||||
bn_mul_mont(acc,acc,acc,np,n0,top);
|
||||
bn_scatter5(acc,top,powerbuf,j);
|
||||
}
|
||||
}
|
||||
for (; i<16; i+=2)
|
||||
{
|
||||
bn_mul_mont_gather5(acc,am->d,powerbuf,np,n0,top,i-1);
|
||||
bn_scatter5(acc,top,powerbuf,i);
|
||||
bn_mul_mont(acc,acc,acc,np,n0,top);
|
||||
bn_scatter5(acc,top,powerbuf,2*i);
|
||||
}
|
||||
for (; i<32; i+=2)
|
||||
{
|
||||
bn_mul_mont_gather5(acc,am->d,powerbuf,np,n0,top,i-1);
|
||||
bn_scatter5(acc,top,powerbuf,i);
|
||||
}
|
||||
#endif
|
||||
acc = r->d;
|
||||
|
||||
/* Scan the exponent one window at a time starting from the most
|
||||
* significant bits.
|
||||
*/
|
||||
bits--;
|
||||
while (bits >= 0)
|
||||
{
|
||||
for (wvalue=0, i=0; i<5; i++,bits--)
|
||||
wvalue = (wvalue<<1)+BN_is_bit_set(p,bits);
|
||||
|
||||
bn_mul_mont(acc,acc,acc,np,n0,top);
|
||||
bn_mul_mont(acc,acc,acc,np,n0,top);
|
||||
bn_mul_mont(acc,acc,acc,np,n0,top);
|
||||
bn_mul_mont(acc,acc,acc,np,n0,top);
|
||||
bn_mul_mont(acc,acc,acc,np,n0,top);
|
||||
bn_mul_mont_gather5(acc,acc,powerbuf,np,n0,top,wvalue);
|
||||
}
|
||||
|
||||
r->top=top;
|
||||
bn_correct_top(r);
|
||||
}
|
||||
else
|
||||
#endif
|
||||
{
|
||||
if (!MOD_EXP_CTIME_COPY_TO_PREBUF(r, top, powerbuf, 0, numPowers)) goto err;
|
||||
if (!MOD_EXP_CTIME_COPY_TO_PREBUF(am, top, powerbuf, 1, numPowers)) goto err;
|
||||
|
||||
/* If the window size is greater than 1, then calculate
|
||||
* val[i=2..2^winsize-1]. Powers are computed as a*a^(i-1)
|
||||
* (even powers could instead be computed as (a^(i/2))^2
|
||||
* to use the slight performance advantage of sqr over mul).
|
||||
*/
|
||||
if (window > 1)
|
||||
{
|
||||
for (i=2; i<numPowers; i++)
|
||||
{
|
||||
/* Calculate a^i = a^(i-1) * a */
|
||||
if (!BN_mod_mul_montgomery(&computeTemp,am,&computeTemp,mont,ctx))
|
||||
goto err;
|
||||
if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&computeTemp, top, powerbuf, i, numPowers)) goto err;
|
||||
}
|
||||
}
|
||||
|
||||
/* Scan the exponent one window at a time starting from the most
|
||||
* significant bits.
|
||||
*/
|
||||
while (idx >= 0)
|
||||
bits--;
|
||||
while (bits >= 0)
|
||||
{
|
||||
wvalue=0; /* The 'value' of the window */
|
||||
|
||||
/* Scan the window, squaring the result as we go */
|
||||
for (i=0; i<window; i++,idx--)
|
||||
for (i=0; i<window; i++,bits--)
|
||||
{
|
||||
if (!BN_mod_mul_montgomery(r,r,r,mont,ctx)) goto err;
|
||||
wvalue = (wvalue<<1)+BN_is_bit_set(p,idx);
|
||||
wvalue = (wvalue<<1)+BN_is_bit_set(p,bits);
|
||||
}
|
||||
|
||||
/* Fetch the appropriate pre-computed value from the pre-buf */
|
||||
if (!MOD_EXP_CTIME_COPY_FROM_PREBUF(computeTemp, top, powerbuf, wvalue, numPowers)) goto err;
|
||||
if (!MOD_EXP_CTIME_COPY_FROM_PREBUF(&computeTemp, top, powerbuf, wvalue, numPowers)) goto err;
|
||||
|
||||
/* Multiply the result into the intermediate result */
|
||||
if (!BN_mod_mul_montgomery(r,r,computeTemp,mont,ctx)) goto err;
|
||||
if (!BN_mod_mul_montgomery(r,r,&computeTemp,mont,ctx)) goto err;
|
||||
}
|
||||
}
|
||||
|
||||
/* Convert the final result from montgomery to standard format */
|
||||
if (!BN_from_montgomery(rr,r,mont,ctx)) goto err;
|
||||
|
@ -715,10 +824,9 @@ err:
|
|||
if (powerbuf!=NULL)
|
||||
{
|
||||
OPENSSL_cleanse(powerbuf,powerbufLen);
|
||||
OPENSSL_free(powerbufFree);
|
||||
if (powerbufFree) OPENSSL_free(powerbufFree);
|
||||
}
|
||||
if (am!=NULL) BN_clear(am);
|
||||
if (computeTemp!=NULL) BN_clear(computeTemp);
|
||||
BN_CTX_end(ctx);
|
||||
return(ret);
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue