This commit completes recent modular exponentiation optimizations on

x86_64 platform. It targets specifically RSA1024 sign (using ideas
from http://eprint.iacr.org/2011/239) and adds more than 10% on most
platforms. Overall performance improvement relative to 1.0.0 is ~40%
in average, with best result of 54% on Westmere. Incidentally ~40%
is average improvement even for longer key lengths.
This commit is contained in:
Andy Polyakov 2011-08-12 16:44:32 +00:00
parent 20735f4c81
commit 361512da0d
6 changed files with 1027 additions and 81 deletions

View file

@ -127,7 +127,7 @@ my $x86_asm="x86cpuid.o:bn-586.o co-586.o x86-mont.o x86-gf2m.o:des-586.o crypt5
my $x86_elf_asm="$x86_asm:elf";
my $x86_64_asm="x86_64cpuid.o:x86_64-gcc.o x86_64-mont.o x86_64-gf2m.o modexp512-x86_64.o::aes-x86_64.o aesni-x86_64.o::md5-x86_64.o:sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o::rc4-x86_64.o:::wp-x86_64.o:cmll-x86_64.o cmll_misc.o:ghash-x86_64.o";
my $x86_64_asm="x86_64cpuid.o:x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o::aes-x86_64.o aesni-x86_64.o::md5-x86_64.o:sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o::rc4-x86_64.o:::wp-x86_64.o:cmll-x86_64.o cmll_misc.o:ghash-x86_64.o";
my $ia64_asm="ia64cpuid.o:bn-ia64.o ia64-mont.o::aes_core.o aes_cbc.o aes-ia64.o::md5-ia64.o:sha1-ia64.o sha256-ia64.o sha512-ia64.o::rc4-ia64.o rc4_skey.o:::::ghash-ia64.o:void";
my $sparcv9_asm="sparcv9cap.o sparccpuid.o:bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o:des_enc-sparc.o fcrypt_b.o:aes_core.o aes_cbc.o aes-sparcv9.o:::sha1-sparcv9.o sha256-sparcv9.o sha512-sparcv9.o:::::::ghash-sparcv9.o:void";
my $sparcv8_asm=":sparcv8.o:des_enc-sparc.o fcrypt_b.o::::::::::::void";
@ -513,9 +513,9 @@ my %table=(
#
# Win64 targets, WIN64I denotes IA-64 and WIN64A - AMD64
"VC-WIN64I","cl:-W3 -Gs0 -Gy -nologo -DOPENSSL_SYSNAME_WIN32 -DWIN32_LEAN_AND_MEAN -DL_ENDIAN -DUNICODE -D_UNICODE -D_CRT_SECURE_NO_DEPRECATE:::WIN64I::SIXTY_FOUR_BIT RC4_CHUNK_LL DES_INT EXPORT_VAR_AS_FN:ia64cpuid.o:ia64.o ia64-mont.o::aes_core.o aes_cbc.o aes-ia64.o::md5-ia64.o:sha1-ia64.o sha256-ia64.o sha512-ia64.o:::::::ghash-ia64.o:ias:win32",
"VC-WIN64A","cl:-W3 -Gs0 -Gy -nologo -DOPENSSL_SYSNAME_WIN32 -DWIN32_LEAN_AND_MEAN -DL_ENDIAN -DUNICODE -D_UNICODE -D_CRT_SECURE_NO_DEPRECATE:::WIN64A::SIXTY_FOUR_BIT RC4_CHUNK_LL DES_INT EXPORT_VAR_AS_FN:x86_64cpuid.o:bn_asm.o x86_64-mont.o::aes-x86_64.o aesni-x86_64.o::md5-x86_64.o:sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o::rc4-x86_64.o:::wp-x86_64.o:cmll-x86_64.o cmll_misc.o:ghash-x86_64.o:auto:win32",
"VC-WIN64A","cl:-W3 -Gs0 -Gy -nologo -DOPENSSL_SYSNAME_WIN32 -DWIN32_LEAN_AND_MEAN -DL_ENDIAN -DUNICODE -D_UNICODE -D_CRT_SECURE_NO_DEPRECATE:::WIN64A::SIXTY_FOUR_BIT RC4_CHUNK_LL DES_INT EXPORT_VAR_AS_FN:".eval{my $asm=$x86_64_asm;$asm=~s/x86_64-gcc\.o/bn_asm.o/;$asm}.":auto:win32",
"debug-VC-WIN64I","cl:-W3 -Gs0 -Gy -Zi -nologo -DOPENSSL_SYSNAME_WIN32 -DWIN32_LEAN_AND_MEAN -DL_ENDIAN -DUNICODE -D_UNICODE -D_CRT_SECURE_NO_DEPRECATE:::WIN64I::SIXTY_FOUR_BIT RC4_CHUNK_LL DES_INT EXPORT_VAR_AS_FN:ia64cpuid.o:ia64.o::aes_core.o aes_cbc.o aes-ia64.o::md5-ia64.o:sha1-ia64.o sha256-ia64.o sha512-ia64.o:::::::ghash-ia64.o:ias:win32",
"debug-VC-WIN64A","cl:-W3 -Gs0 -Gy -Zi -nologo -DOPENSSL_SYSNAME_WIN32 -DWIN32_LEAN_AND_MEAN -DL_ENDIAN -DUNICODE -D_UNICODE -D_CRT_SECURE_NO_DEPRECATE:::WIN64A::SIXTY_FOUR_BIT RC4_CHUNK_LL DES_INT EXPORT_VAR_AS_FN:x86_64cpuid.o:bn_asm.o x86_64-mont.o::aes-x86_64.o::md5-x86_64.o:sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o::rc4-x86_64.o:::wp-x86_64.o:cmll-x86_64.o cmll_misc.o:ghash-x86_64.o:auto:win32",
"debug-VC-WIN64A","cl:-W3 -Gs0 -Gy -Zi -nologo -DOPENSSL_SYSNAME_WIN32 -DWIN32_LEAN_AND_MEAN -DL_ENDIAN -DUNICODE -D_UNICODE -D_CRT_SECURE_NO_DEPRECATE:::WIN64A::SIXTY_FOUR_BIT RC4_CHUNK_LL DES_INT EXPORT_VAR_AS_FN:".eval{my $asm=$x86_64_asm;$asm=~s/x86_64-gcc\.o/bn_asm.o/;$asm}.":auto:win32",
# x86 Win32 target defaults to ANSI API, if you want UNICODE, complement
# 'perl Configure VC-WIN32' with '-DUNICODE -D_UNICODE'
"VC-WIN32","cl:-W3 -Gs0 -GF -Gy -nologo -DOPENSSL_SYSNAME_WIN32 -DWIN32_LEAN_AND_MEAN -DL_ENDIAN -D_CRT_SECURE_NO_DEPRECATE:::WIN32::BN_LLONG RC4_INDEX EXPORT_VAR_AS_FN ${x86_gcc_opts}:${x86_asm}:win32n:win32",
@ -1510,6 +1510,7 @@ $cflags.=" -DOPENSSL_BN_ASM_PART_WORDS" if ($bn_obj =~ /bn-586/);
$cflags.=" -DOPENSSL_IA32_SSE2" if (!$no_sse2 && $bn_obj =~ /86/);
$cflags.=" -DOPENSSL_BN_ASM_MONT" if ($bn_obj =~ /-mont/);
$cflags.=" -DOPENSSL_BN_ASM_MONT5" if ($bn_obj =~ /-mont5/);
$cflags.=" -DOPENSSL_BN_ASM_GF2m" if ($bn_obj =~ /-gf2m/);
if ($fips)

24
TABLE
View file

@ -297,7 +297,7 @@ $sys_id =
$lflags =
$bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL
$cpuid_obj = x86_64cpuid.o
$bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-gf2m.o modexp512-x86_64.o
$bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o
$des_obj =
$aes_obj = aes-x86_64.o aesni-x86_64.o
$bf_obj =
@ -777,7 +777,7 @@ $sys_id = WIN64A
$lflags =
$bn_ops = SIXTY_FOUR_BIT RC4_CHUNK_LL DES_INT EXPORT_VAR_AS_FN
$cpuid_obj = x86_64cpuid.o
$bn_obj = bn_asm.o x86_64-mont.o
$bn_obj = bn_asm.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o
$des_obj =
$aes_obj = aes-x86_64.o aesni-x86_64.o
$bf_obj =
@ -1385,7 +1385,7 @@ $sys_id = MACOSX
$lflags = -Wl,-search_paths_first%
$bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL
$cpuid_obj = x86_64cpuid.o
$bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-gf2m.o modexp512-x86_64.o
$bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o
$des_obj =
$aes_obj = aes-x86_64.o aesni-x86_64.o
$bf_obj =
@ -1545,9 +1545,9 @@ $sys_id = WIN64A
$lflags =
$bn_ops = SIXTY_FOUR_BIT RC4_CHUNK_LL DES_INT EXPORT_VAR_AS_FN
$cpuid_obj = x86_64cpuid.o
$bn_obj = bn_asm.o x86_64-mont.o
$bn_obj = bn_asm.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o
$des_obj =
$aes_obj = aes-x86_64.o
$aes_obj = aes-x86_64.o aesni-x86_64.o
$bf_obj =
$md5_obj = md5-x86_64.o
$sha1_obj = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o
@ -2313,7 +2313,7 @@ $sys_id =
$lflags = -ldl
$bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL
$cpuid_obj = x86_64cpuid.o
$bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-gf2m.o modexp512-x86_64.o
$bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o
$des_obj =
$aes_obj = aes-x86_64.o aesni-x86_64.o
$bf_obj =
@ -2505,7 +2505,7 @@ $sys_id =
$lflags = -ldl
$bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL
$cpuid_obj = x86_64cpuid.o
$bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-gf2m.o modexp512-x86_64.o
$bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o
$des_obj =
$aes_obj = aes-x86_64.o aesni-x86_64.o
$bf_obj =
@ -2569,7 +2569,7 @@ $sys_id =
$lflags = -ldl
$bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL
$cpuid_obj = x86_64cpuid.o
$bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-gf2m.o modexp512-x86_64.o
$bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o
$des_obj =
$aes_obj = aes-x86_64.o aesni-x86_64.o
$bf_obj =
@ -4073,7 +4073,7 @@ $sys_id =
$lflags = -ldl
$bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL
$cpuid_obj = x86_64cpuid.o
$bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-gf2m.o modexp512-x86_64.o
$bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o
$des_obj =
$aes_obj = aes-x86_64.o aesni-x86_64.o
$bf_obj =
@ -4233,7 +4233,7 @@ $sys_id = MINGW64
$lflags = -lws2_32 -lgdi32 -lcrypt32
$bn_ops = SIXTY_FOUR_BIT RC4_CHUNK_LL DES_INT EXPORT_VAR_AS_FN
$cpuid_obj = x86_64cpuid.o
$bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-gf2m.o modexp512-x86_64.o
$bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o
$des_obj =
$aes_obj = aes-x86_64.o aesni-x86_64.o
$bf_obj =
@ -5193,7 +5193,7 @@ $sys_id =
$lflags = -lsocket -lnsl -ldl
$bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL
$cpuid_obj = x86_64cpuid.o
$bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-gf2m.o modexp512-x86_64.o
$bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o
$des_obj =
$aes_obj = aes-x86_64.o aesni-x86_64.o
$bf_obj =
@ -5225,7 +5225,7 @@ $sys_id =
$lflags = -lsocket -lnsl -ldl
$bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL
$cpuid_obj = x86_64cpuid.o
$bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-gf2m.o modexp512-x86_64.o
$bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o
$des_obj =
$aes_obj = aes-x86_64.o aesni-x86_64.o
$bf_obj =

View file

@ -98,6 +98,8 @@ x86_64-gcc.o: asm/x86_64-gcc.c
$(CC) $(CFLAGS) -c -o $@ asm/x86_64-gcc.c
x86_64-mont.s: asm/x86_64-mont.pl
$(PERL) asm/x86_64-mont.pl $(PERLASM_SCHEME) > $@
x86_64-mont5.s: asm/x86_64-mont5.pl
$(PERL) asm/x86_64-mont5.pl $(PERLASM_SCHEME) > $@
x86_64-gf2m.s: asm/x86_64-gf2m.pl
$(PERL) asm/x86_64-gf2m.pl $(PERLASM_SCHEME) > $@
modexp512-x86_64.s: asm/modexp512-x86_64.pl

View file

@ -1,7 +1,7 @@
#!/usr/bin/env perl
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
@ -142,6 +142,7 @@ $code.=<<___;
jne .L1st
add %rax,$hi1
mov ($ap),%rax # ap[0]
adc \$0,%rdx
add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0]
adc \$0,%rdx
@ -161,7 +162,6 @@ $code.=<<___;
.Louter:
mov ($bp,$i,8),$m0 # m0=bp[i]
xor $j,$j # j=0
mov ($ap),%rax # ap[0]
mov $n0,$m1
mov (%rsp),$lo0
mulq $m0 # ap[0]*bp[i]
@ -208,6 +208,7 @@ $code.=<<___;
jne .Linner
add %rax,$hi1
mov ($ap),%rax # ap[0]
adc \$0,%rdx
add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j]
mov (%rsp,$j,8),$lo0

834
crypto/bn/asm/x86_64-mont5.pl Executable file
View file

@ -0,0 +1,834 @@
#!/usr/bin/env perl
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# August 2011.
#
# Companion to x86_64-mont.pl that optimizes cache-timing attack
# countermeasures. The subroutines are produced by replacing bp[i]
# references in their x86_64-mont.pl counterparts with cache-neutral
# references to powers table computed in BN_mod_exp_mont_consttime.
# In addition subroutine that scatters elements of the powers table
# is implemented, so that scatter-/gathering can be tuned without
# bn_exp.c modifications.
$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
open STDOUT,"| $^X $xlate $flavour $output";
# int bn_mul_mont_gather5(
$rp="%rdi"; # BN_ULONG *rp,
$ap="%rsi"; # const BN_ULONG *ap,
$bp="%rdx"; # const BN_ULONG *bp,
$np="%rcx"; # const BN_ULONG *np,
$n0="%r8"; # const BN_ULONG *n0,
$num="%r9"; # int num,
# int idx); # 0 to 2^5-1, "index" in $bp holding
# pre-computed powers of a', interlaced
# in such manner that b[0] is $bp[idx],
# b[1] is [2^5+idx], etc.
$lo0="%r10";
$hi0="%r11";
$hi1="%r13";
$i="%r14";
$j="%r15";
$m0="%rbx";
$m1="%rbp";
$code=<<___;
.text
.globl bn_mul_mont_gather5
.type bn_mul_mont_gather5,\@function,6
.align 64
bn_mul_mont_gather5:
test \$3,${num}d
jnz .Lmul_enter
cmp \$8,${num}d
jb .Lmul_enter
jmp .Lmul4x_enter
.align 16
.Lmul_enter:
mov `($win64?56:8)`(%rsp),%r10d # load 7th argument
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
mov ${num}d,${num}d
lea 2($num),%r11
mov %rsp,%rax
neg %r11
lea (%rsp,%r11,8),%rsp # tp=alloca(8*(num+2))
and \$-1024,%rsp # minimize TLB usage
mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp
.Lmul_body:
mov $bp,%r12 # reassign $bp
___
$bp="%r12";
$STRIDE=2**5*8; # 5 is "window size"
$N=$STRIDE/4; # should match cache line size
$code.=<<___;
mov %r10,%r11
shr \$`log($N/8)/log(2)`,%r10
and \$`$N/8-1`,%r11
not %r10
lea .Lmagic_masks(%rip),%rax
and \$`2**5/($N/8)-1`,%r10 # 5 is "window size"
lea 96($bp,%r11,8),$bp # pointer within 1st cache line
movq 0(%rax,%r10,8),%xmm4 # set of masks denoting which
movq 8(%rax,%r10,8),%xmm5 # cache line contains element
movq 16(%rax,%r10,8),%xmm6 # denoted by 7th argument
movq 24(%rax,%r10,8),%xmm7
movq `0*$STRIDE/4-96`($bp),%xmm0
movq `1*$STRIDE/4-96`($bp),%xmm1
pand %xmm4,%xmm0
movq `2*$STRIDE/4-96`($bp),%xmm2
pand %xmm5,%xmm1
movq `3*$STRIDE/4-96`($bp),%xmm3
pand %xmm6,%xmm2
por %xmm1,%xmm0
pand %xmm7,%xmm3
por %xmm2,%xmm0
lea $STRIDE($bp),$bp
por %xmm3,%xmm0
movq %xmm0,$m0 # m0=bp[0]
mov ($n0),$n0 # pull n0[0] value
mov ($ap),%rax
xor $i,$i # i=0
xor $j,$j # j=0
movq `0*$STRIDE/4-96`($bp),%xmm0
movq `1*$STRIDE/4-96`($bp),%xmm1
pand %xmm4,%xmm0
movq `2*$STRIDE/4-96`($bp),%xmm2
pand %xmm5,%xmm1
mov $n0,$m1
mulq $m0 # ap[0]*bp[0]
mov %rax,$lo0
mov ($np),%rax
movq `3*$STRIDE/4-96`($bp),%xmm3
pand %xmm6,%xmm2
por %xmm1,%xmm0
pand %xmm7,%xmm3
imulq $lo0,$m1 # "tp[0]"*n0
mov %rdx,$hi0
por %xmm2,%xmm0
lea $STRIDE($bp),$bp
por %xmm3,%xmm0
mulq $m1 # np[0]*m1
add %rax,$lo0 # discarded
mov 8($ap),%rax
adc \$0,%rdx
mov %rdx,$hi1
lea 1($j),$j # j++
jmp .L1st_enter
.align 16
.L1st:
add %rax,$hi1
mov ($ap,$j,8),%rax
adc \$0,%rdx
add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0]
mov $lo0,$hi0
adc \$0,%rdx
mov $hi1,-16(%rsp,$j,8) # tp[j-1]
mov %rdx,$hi1
.L1st_enter:
mulq $m0 # ap[j]*bp[0]
add %rax,$hi0
mov ($np,$j,8),%rax
adc \$0,%rdx
lea 1($j),$j # j++
mov %rdx,$lo0
mulq $m1 # np[j]*m1
cmp $num,$j
jne .L1st
movq %xmm0,$m0 # bp[1]
add %rax,$hi1
mov ($ap),%rax # ap[0]
adc \$0,%rdx
add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0]
adc \$0,%rdx
mov $hi1,-16(%rsp,$j,8) # tp[j-1]
mov %rdx,$hi1
mov $lo0,$hi0
xor %rdx,%rdx
add $hi0,$hi1
adc \$0,%rdx
mov $hi1,-8(%rsp,$num,8)
mov %rdx,(%rsp,$num,8) # store upmost overflow bit
lea 1($i),$i # i++
jmp .Louter
.align 16
.Louter:
xor $j,$j # j=0
mov $n0,$m1
mov (%rsp),$lo0
movq `0*$STRIDE/4-96`($bp),%xmm0
movq `1*$STRIDE/4-96`($bp),%xmm1
pand %xmm4,%xmm0
movq `2*$STRIDE/4-96`($bp),%xmm2
pand %xmm5,%xmm1
mulq $m0 # ap[0]*bp[i]
add %rax,$lo0 # ap[0]*bp[i]+tp[0]
mov ($np),%rax
adc \$0,%rdx
movq `3*$STRIDE/4-96`($bp),%xmm3
pand %xmm6,%xmm2
por %xmm1,%xmm0
pand %xmm7,%xmm3
imulq $lo0,$m1 # tp[0]*n0
mov %rdx,$hi0
por %xmm2,%xmm0
lea $STRIDE($bp),$bp
por %xmm3,%xmm0
mulq $m1 # np[0]*m1
add %rax,$lo0 # discarded
mov 8($ap),%rax
adc \$0,%rdx
mov 8(%rsp),$lo0 # tp[1]
mov %rdx,$hi1
lea 1($j),$j # j++
jmp .Linner_enter
.align 16
.Linner:
add %rax,$hi1
mov ($ap,$j,8),%rax
adc \$0,%rdx
add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j]
mov (%rsp,$j,8),$lo0
adc \$0,%rdx
mov $hi1,-16(%rsp,$j,8) # tp[j-1]
mov %rdx,$hi1
.Linner_enter:
mulq $m0 # ap[j]*bp[i]
add %rax,$hi0
mov ($np,$j,8),%rax
adc \$0,%rdx
add $hi0,$lo0 # ap[j]*bp[i]+tp[j]
mov %rdx,$hi0
adc \$0,$hi0
lea 1($j),$j # j++
mulq $m1 # np[j]*m1
cmp $num,$j
jne .Linner
movq %xmm0,$m0 # bp[i+1]
add %rax,$hi1
mov ($ap),%rax # ap[0]
adc \$0,%rdx
add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j]
mov (%rsp,$j,8),$lo0
adc \$0,%rdx
mov $hi1,-16(%rsp,$j,8) # tp[j-1]
mov %rdx,$hi1
xor %rdx,%rdx
add $hi0,$hi1
adc \$0,%rdx
add $lo0,$hi1 # pull upmost overflow bit
adc \$0,%rdx
mov $hi1,-8(%rsp,$num,8)
mov %rdx,(%rsp,$num,8) # store upmost overflow bit
lea 1($i),$i # i++
cmp $num,$i
jl .Louter
xor $i,$i # i=0 and clear CF!
mov (%rsp),%rax # tp[0]
lea (%rsp),$ap # borrow ap for tp
mov $num,$j # j=num
jmp .Lsub
.align 16
.Lsub: sbb ($np,$i,8),%rax
mov %rax,($rp,$i,8) # rp[i]=tp[i]-np[i]
mov 8($ap,$i,8),%rax # tp[i+1]
lea 1($i),$i # i++
dec $j # doesnn't affect CF!
jnz .Lsub
sbb \$0,%rax # handle upmost overflow bit
xor $i,$i
and %rax,$ap
not %rax
mov $rp,$np
and %rax,$np
mov $num,$j # j=num
or $np,$ap # ap=borrow?tp:rp
.align 16
.Lcopy: # copy or in-place refresh
mov ($ap,$i,8),%rax
mov $i,(%rsp,$i,8) # zap temporary vector
mov %rax,($rp,$i,8) # rp[i]=tp[i]
lea 1($i),$i
sub \$1,$j
jnz .Lcopy
mov 8(%rsp,$num,8),%rsi # restore %rsp
mov \$1,%rax
mov (%rsi),%r15
mov 8(%rsi),%r14
mov 16(%rsi),%r13
mov 24(%rsi),%r12
mov 32(%rsi),%rbp
mov 40(%rsi),%rbx
lea 48(%rsi),%rsp
.Lmul_epilogue:
ret
.size bn_mul_mont_gather5,.-bn_mul_mont_gather5
___
{{{
my @A=("%r10","%r11");
my @N=("%r13","%rdi");
$code.=<<___;
.type bn_mul4x_mont_gather5,\@function,6
.align 16
bn_mul4x_mont_gather5:
.Lmul4x_enter:
mov `($win64?56:8)`(%rsp),%r10d # load 7th argument
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
mov ${num}d,${num}d
lea 4($num),%r11
mov %rsp,%rax # !!!!
neg %r11
lea (%rsp,%r11,8),%rsp # tp=alloca(8*(num+4))
and \$-1024,%rsp # minimize TLB usage
mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp
.Lmul4x_body:
mov $rp,16(%rsp,$num,8) # tp[num+2]=$rp
mov %rdx,%r12 # reassign $bp
___
$bp="%r12";
$STRIDE=2**5*8; # 5 is "window size"
$N=$STRIDE/4; # should match cache line size
$code.=<<___;
mov %r10,%r11
shr \$`log($N/8)/log(2)`,%r10
and \$`$N/8-1`,%r11
not %r10
lea .Lmagic_masks(%rip),%rax
and \$`2**5/($N/8)-1`,%r10 # 5 is "window size"
lea 96($bp,%r11,8),$bp # pointer within 1st cache line
movq 0(%rax,%r10,8),%xmm4 # set of masks denoting which
movq 8(%rax,%r10,8),%xmm5 # cache line contains element
movq 16(%rax,%r10,8),%xmm6 # denoted by 7th argument
movq 24(%rax,%r10,8),%xmm7
movq `0*$STRIDE/4-96`($bp),%xmm0
movq `1*$STRIDE/4-96`($bp),%xmm1
pand %xmm4,%xmm0
movq `2*$STRIDE/4-96`($bp),%xmm2
pand %xmm5,%xmm1
movq `3*$STRIDE/4-96`($bp),%xmm3
pand %xmm6,%xmm2
por %xmm1,%xmm0
pand %xmm7,%xmm3
por %xmm2,%xmm0
lea $STRIDE($bp),$bp
por %xmm3,%xmm0
movq %xmm0,$m0 # m0=bp[0]
mov ($n0),$n0 # pull n0[0] value
mov ($ap),%rax
xor $i,$i # i=0
xor $j,$j # j=0
movq `0*$STRIDE/4-96`($bp),%xmm0
movq `1*$STRIDE/4-96`($bp),%xmm1
pand %xmm4,%xmm0
movq `2*$STRIDE/4-96`($bp),%xmm2
pand %xmm5,%xmm1
mov $n0,$m1
mulq $m0 # ap[0]*bp[0]
mov %rax,$A[0]
mov ($np),%rax
movq `3*$STRIDE/4-96`($bp),%xmm3
pand %xmm6,%xmm2
por %xmm1,%xmm0
pand %xmm7,%xmm3
imulq $A[0],$m1 # "tp[0]"*n0
mov %rdx,$A[1]
por %xmm2,%xmm0
lea $STRIDE($bp),$bp
por %xmm3,%xmm0
mulq $m1 # np[0]*m1
add %rax,$A[0] # discarded
mov 8($ap),%rax
adc \$0,%rdx
mov %rdx,$N[1]
mulq $m0
add %rax,$A[1]
mov 8($np),%rax
adc \$0,%rdx
mov %rdx,$A[0]
mulq $m1
add %rax,$N[1]
mov 16($ap),%rax
adc \$0,%rdx
add $A[1],$N[1]
lea 4($j),$j # j++
adc \$0,%rdx
mov $N[1],(%rsp)
mov %rdx,$N[0]
jmp .L1st4x
.align 16
.L1st4x:
mulq $m0 # ap[j]*bp[0]
add %rax,$A[0]
mov -16($np,$j,8),%rax
adc \$0,%rdx
mov %rdx,$A[1]
mulq $m1 # np[j]*m1
add %rax,$N[0]
mov -8($ap,$j,8),%rax
adc \$0,%rdx
add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
adc \$0,%rdx
mov $N[0],-24(%rsp,$j,8) # tp[j-1]
mov %rdx,$N[1]
mulq $m0 # ap[j]*bp[0]
add %rax,$A[1]
mov -8($np,$j,8),%rax
adc \$0,%rdx
mov %rdx,$A[0]
mulq $m1 # np[j]*m1
add %rax,$N[1]
mov ($ap,$j,8),%rax
adc \$0,%rdx
add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
adc \$0,%rdx
mov $N[1],-16(%rsp,$j,8) # tp[j-1]
mov %rdx,$N[0]
mulq $m0 # ap[j]*bp[0]
add %rax,$A[0]
mov ($np,$j,8),%rax
adc \$0,%rdx
mov %rdx,$A[1]
mulq $m1 # np[j]*m1
add %rax,$N[0]
mov 8($ap,$j,8),%rax
adc \$0,%rdx
add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
adc \$0,%rdx
mov $N[0],-8(%rsp,$j,8) # tp[j-1]
mov %rdx,$N[1]
mulq $m0 # ap[j]*bp[0]
add %rax,$A[1]
mov 8($np,$j,8),%rax
adc \$0,%rdx
lea 4($j),$j # j++
mov %rdx,$A[0]
mulq $m1 # np[j]*m1
add %rax,$N[1]
mov -16($ap,$j,8),%rax
adc \$0,%rdx
add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
adc \$0,%rdx
mov $N[1],-32(%rsp,$j,8) # tp[j-1]
mov %rdx,$N[0]
cmp $num,$j
jl .L1st4x
mulq $m0 # ap[j]*bp[0]
add %rax,$A[0]
mov -16($np,$j,8),%rax
adc \$0,%rdx
mov %rdx,$A[1]
mulq $m1 # np[j]*m1
add %rax,$N[0]
mov -8($ap,$j,8),%rax
adc \$0,%rdx
add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
adc \$0,%rdx
mov $N[0],-24(%rsp,$j,8) # tp[j-1]
mov %rdx,$N[1]
mulq $m0 # ap[j]*bp[0]
add %rax,$A[1]
mov -8($np,$j,8),%rax
adc \$0,%rdx
mov %rdx,$A[0]
mulq $m1 # np[j]*m1
add %rax,$N[1]
mov ($ap),%rax # ap[0]
adc \$0,%rdx
add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
adc \$0,%rdx
mov $N[1],-16(%rsp,$j,8) # tp[j-1]
mov %rdx,$N[0]
movq %xmm0,$m0 # bp[1]
xor $N[1],$N[1]
add $A[0],$N[0]
adc \$0,$N[1]
mov $N[0],-8(%rsp,$j,8)
mov $N[1],(%rsp,$j,8) # store upmost overflow bit
lea 1($i),$i # i++
.align 4
.Louter4x:
xor $j,$j # j=0
movq `0*$STRIDE/4-96`($bp),%xmm0
movq `1*$STRIDE/4-96`($bp),%xmm1
pand %xmm4,%xmm0
movq `2*$STRIDE/4-96`($bp),%xmm2
pand %xmm5,%xmm1
mov (%rsp),$A[0]
mov $n0,$m1
mulq $m0 # ap[0]*bp[i]
add %rax,$A[0] # ap[0]*bp[i]+tp[0]
mov ($np),%rax
adc \$0,%rdx
movq `3*$STRIDE/4-96`($bp),%xmm3
pand %xmm6,%xmm2
por %xmm1,%xmm0
pand %xmm7,%xmm3
imulq $A[0],$m1 # tp[0]*n0
mov %rdx,$A[1]
por %xmm2,%xmm0
lea $STRIDE($bp),$bp
por %xmm3,%xmm0
mulq $m1 # np[0]*m1
add %rax,$A[0] # "$N[0]", discarded
mov 8($ap),%rax
adc \$0,%rdx
mov %rdx,$N[1]
mulq $m0 # ap[j]*bp[i]
add %rax,$A[1]
mov 8($np),%rax
adc \$0,%rdx
add 8(%rsp),$A[1] # +tp[1]
adc \$0,%rdx
mov %rdx,$A[0]
mulq $m1 # np[j]*m1
add %rax,$N[1]
mov 16($ap),%rax
adc \$0,%rdx
add $A[1],$N[1] # np[j]*m1+ap[j]*bp[i]+tp[j]
lea 4($j),$j # j+=2
adc \$0,%rdx
mov $N[1],(%rsp) # tp[j-1]
mov %rdx,$N[0]
jmp .Linner4x
.align 16
.Linner4x:
mulq $m0 # ap[j]*bp[i]
add %rax,$A[0]
mov -16($np,$j,8),%rax
adc \$0,%rdx
add -16(%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j]
adc \$0,%rdx
mov %rdx,$A[1]
mulq $m1 # np[j]*m1
add %rax,$N[0]
mov -8($ap,$j,8),%rax
adc \$0,%rdx
add $A[0],$N[0]
adc \$0,%rdx
mov $N[0],-24(%rsp,$j,8) # tp[j-1]
mov %rdx,$N[1]
mulq $m0 # ap[j]*bp[i]
add %rax,$A[1]
mov -8($np,$j,8),%rax
adc \$0,%rdx
add -8(%rsp,$j,8),$A[1]
adc \$0,%rdx
mov %rdx,$A[0]
mulq $m1 # np[j]*m1
add %rax,$N[1]
mov ($ap,$j,8),%rax
adc \$0,%rdx
add $A[1],$N[1]
adc \$0,%rdx
mov $N[1],-16(%rsp,$j,8) # tp[j-1]
mov %rdx,$N[0]
mulq $m0 # ap[j]*bp[i]
add %rax,$A[0]
mov ($np,$j,8),%rax
adc \$0,%rdx
add (%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j]
adc \$0,%rdx
mov %rdx,$A[1]
mulq $m1 # np[j]*m1
add %rax,$N[0]
mov 8($ap,$j,8),%rax
adc \$0,%rdx
add $A[0],$N[0]
adc \$0,%rdx
mov $N[0],-8(%rsp,$j,8) # tp[j-1]
mov %rdx,$N[1]
mulq $m0 # ap[j]*bp[i]
add %rax,$A[1]
mov 8($np,$j,8),%rax
adc \$0,%rdx
add 8(%rsp,$j,8),$A[1]
adc \$0,%rdx
lea 4($j),$j # j++
mov %rdx,$A[0]
mulq $m1 # np[j]*m1
add %rax,$N[1]
mov -16($ap,$j,8),%rax
adc \$0,%rdx
add $A[1],$N[1]
adc \$0,%rdx
mov $N[1],-32(%rsp,$j,8) # tp[j-1]
mov %rdx,$N[0]
cmp $num,$j
jl .Linner4x
mulq $m0 # ap[j]*bp[i]
add %rax,$A[0]
mov -16($np,$j,8),%rax
adc \$0,%rdx
add -16(%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j]
adc \$0,%rdx
mov %rdx,$A[1]
mulq $m1 # np[j]*m1
add %rax,$N[0]
mov -8($ap,$j,8),%rax
adc \$0,%rdx
add $A[0],$N[0]
adc \$0,%rdx
mov $N[0],-24(%rsp,$j,8) # tp[j-1]
mov %rdx,$N[1]
mulq $m0 # ap[j]*bp[i]
add %rax,$A[1]
mov -8($np,$j,8),%rax
adc \$0,%rdx
add -8(%rsp,$j,8),$A[1]
adc \$0,%rdx
lea 1($i),$i # i++
mov %rdx,$A[0]
mulq $m1 # np[j]*m1
add %rax,$N[1]
mov ($ap),%rax # ap[0]
adc \$0,%rdx
add $A[1],$N[1]
adc \$0,%rdx
mov $N[1],-16(%rsp,$j,8) # tp[j-1]
mov %rdx,$N[0]
movq %xmm0,$m0 # bp[i+1]
xor $N[1],$N[1]
add $A[0],$N[0]
adc \$0,$N[1]
add (%rsp,$num,8),$N[0] # pull upmost overflow bit
adc \$0,$N[1]
mov $N[0],-8(%rsp,$j,8)
mov $N[1],(%rsp,$j,8) # store upmost overflow bit
cmp $num,$i
jl .Louter4x
___
{
my @ri=("%rax","%rdx",$m0,$m1);
$code.=<<___;
mov 16(%rsp,$num,8),$rp # restore $rp
mov 0(%rsp),@ri[0] # tp[0]
pxor %xmm0,%xmm0
mov 8(%rsp),@ri[1] # tp[1]
shr \$2,$num # num/=4
lea (%rsp),$ap # borrow ap for tp
xor $i,$i # i=0 and clear CF!
sub 0($np),@ri[0]
mov 16($ap),@ri[2] # tp[2]
mov 24($ap),@ri[3] # tp[3]
sbb 8($np),@ri[1]
lea -1($num),$j # j=num/4-1
jmp .Lsub4x
.align 16
.Lsub4x:
mov @ri[0],0($rp,$i,8) # rp[i]=tp[i]-np[i]
mov @ri[1],8($rp,$i,8) # rp[i]=tp[i]-np[i]
sbb 16($np,$i,8),@ri[2]
mov 32($ap,$i,8),@ri[0] # tp[i+1]
mov 40($ap,$i,8),@ri[1]
sbb 24($np,$i,8),@ri[3]
mov @ri[2],16($rp,$i,8) # rp[i]=tp[i]-np[i]
mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i]
sbb 32($np,$i,8),@ri[0]
mov 48($ap,$i,8),@ri[2]
mov 56($ap,$i,8),@ri[3]
sbb 40($np,$i,8),@ri[1]
lea 4($i),$i # i++
dec $j # doesnn't affect CF!
jnz .Lsub4x
mov @ri[0],0($rp,$i,8) # rp[i]=tp[i]-np[i]
mov 32($ap,$i,8),@ri[0] # load overflow bit
sbb 16($np,$i,8),@ri[2]
mov @ri[1],8($rp,$i,8) # rp[i]=tp[i]-np[i]
sbb 24($np,$i,8),@ri[3]
mov @ri[2],16($rp,$i,8) # rp[i]=tp[i]-np[i]
sbb \$0,@ri[0] # handle upmost overflow bit
mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i]
xor $i,$i # i=0
and @ri[0],$ap
not @ri[0]
mov $rp,$np
and @ri[0],$np
lea -1($num),$j
or $np,$ap # ap=borrow?tp:rp
movdqu ($ap),%xmm1
movdqa %xmm0,(%rsp)
movdqu %xmm1,($rp)
jmp .Lcopy4x
.align 16
.Lcopy4x: # copy or in-place refresh
movdqu 16($ap,$i),%xmm2
movdqu 32($ap,$i),%xmm1
movdqa %xmm0,16(%rsp,$i)
movdqu %xmm2,16($rp,$i)
movdqa %xmm0,32(%rsp,$i)
movdqu %xmm1,32($rp,$i)
lea 32($i),$i
dec $j
jnz .Lcopy4x
shl \$2,$num
movdqu 16($ap,$i),%xmm2
movdqa %xmm0,16(%rsp,$i)
movdqu %xmm2,16($rp,$i)
___
}
$code.=<<___;
mov 8(%rsp,$num,8),%rsi # restore %rsp
mov \$1,%rax
mov (%rsi),%r15
mov 8(%rsi),%r14
mov 16(%rsi),%r13
mov 24(%rsi),%r12
mov 32(%rsi),%rbp
mov 40(%rsi),%rbx
lea 48(%rsi),%rsp
.Lmul4x_epilogue:
ret
.size bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5
___
}}}
{
my ($inp,$num,$tbl,$idx)=$win64?("%rcx","%rdx","%r8", "%r9") : # Win64 order
("%rdi","%rsi","%rdx","%rcx"); # Unix order
$code.=<<___;
.globl bn_scatter5
.type bn_scatter5,\@abi-omnipotent
.align 16
bn_scatter5:
lea ($tbl,$idx,8),$tbl
.Lscatter:
mov ($inp),%rax
lea 8($inp),$inp
mov %rax,($tbl)
lea 32*8($tbl),$tbl
sub \$1,$num
jnz .Lscatter
ret
.size bn_scatter5,.-bn_scatter5
___
}
$code.=<<___;
.align 64
.Lmagic_masks:
.long 0,0, 0,0, 0,0, -1,-1
.long 0,0, 0,0, 0,0, 0,0
.asciz "Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
___
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
print $code;
close STDOUT;

View file

@ -114,6 +114,18 @@
#include "cryptlib.h"
#include "bn_lcl.h"
#include <stdlib.h>
#ifdef _WIN32
# include <malloc.h>
# ifndef alloca
# define alloca _alloca
# endif
#elif defined(__GNUC__)
# ifndef alloca
# define alloca(s) __builtin_alloca((s))
# endif
#endif
/* maximum precomputation table size for *variable* sliding windows */
#define TABLE_SIZE 32
@ -562,7 +574,7 @@ static int MOD_EXP_CTIME_COPY_FROM_PREBUF(BIGNUM *b, int top, unsigned char *buf
/* Given a pointer value, compute the next address that is a cache line multiple. */
#define MOD_EXP_CTIME_ALIGN(x_) \
((unsigned char*)(x_) + (MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH - (((BN_ULONG)(x_)) & (MOD_EXP_CTIME_MIN_CACHE_LINE_MASK))))
((unsigned char*)(x_) + (MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH - (((size_t)(x_)) & (MOD_EXP_CTIME_MIN_CACHE_LINE_MASK))))
/* This variant of BN_mod_exp_mont() uses fixed windows and the special
* precomputation memory layout to limit data-dependency to a minimum
@ -573,17 +585,16 @@ static int MOD_EXP_CTIME_COPY_FROM_PREBUF(BIGNUM *b, int top, unsigned char *buf
int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
const BIGNUM *m, BN_CTX *ctx, BN_MONT_CTX *in_mont)
{
int i,bits,ret=0,idx,window,wvalue;
int i,bits,ret=0,window,wvalue;
int top;
BIGNUM *r;
const BIGNUM *aa;
BN_MONT_CTX *mont=NULL;
int numPowers;
unsigned char *powerbufFree=NULL;
int powerbufLen = 0;
unsigned char *powerbuf=NULL;
BIGNUM *computeTemp=NULL, *am=NULL;
BIGNUM computeTemp, *am=NULL;
bn_check_top(a);
bn_check_top(p);
@ -621,57 +632,9 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
/* Get the window size to use with size of p. */
window = BN_window_bits_for_ctime_exponent_size(bits);
/* Allocate a buffer large enough to hold all of the pre-computed
* powers of a.
*/
numPowers = 1 << window;
powerbufLen = sizeof(m->d[0])*top*numPowers;
if ((powerbufFree=(unsigned char*)OPENSSL_malloc(powerbufLen+MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH)) == NULL)
goto err;
powerbuf = MOD_EXP_CTIME_ALIGN(powerbufFree);
memset(powerbuf, 0, powerbufLen);
/* Initialize the intermediate result. Do this early to save double conversion,
* once each for a^0 and intermediate result.
*/
if (!BN_to_montgomery(r,BN_value_one(),mont,ctx)) goto err;
if (!MOD_EXP_CTIME_COPY_TO_PREBUF(r, top, powerbuf, 0, numPowers)) goto err;
/* Initialize computeTemp as a^1 with montgomery precalcs */
computeTemp = BN_CTX_get(ctx);
am = BN_CTX_get(ctx);
if (computeTemp==NULL || am==NULL) goto err;
if (a->neg || BN_ucmp(a,m) >= 0)
{
if (!BN_mod(am,a,m,ctx))
goto err;
aa= am;
}
else
aa=a;
if (!BN_to_montgomery(am,aa,mont,ctx)) goto err;
if (!BN_copy(computeTemp, am)) goto err;
if (!MOD_EXP_CTIME_COPY_TO_PREBUF(am, top, powerbuf, 1, numPowers)) goto err;
/* If the window size is greater than 1, then calculate
* val[i=2..2^winsize-1]. Powers are computed as a*a^(i-1)
* (even powers could instead be computed as (a^(i/2))^2
* to use the slight performance advantage of sqr over mul).
*/
if (window > 1)
{
for (i=2; i<numPowers; i++)
{
/* Calculate a^i = a^(i-1) * a */
if (!BN_mod_mul_montgomery(computeTemp,am,computeTemp,mont,ctx))
goto err;
if (!MOD_EXP_CTIME_COPY_TO_PREBUF(computeTemp, top, powerbuf, i, numPowers)) goto err;
}
}
#if defined(OPENSSL_BN_ASM_MONT5)
if (window==6 && bits<=1024) window=5; /* ~5% improvement of 2048-bit RSA sign */
#endif
/* Adjust the number of bits up to a multiple of the window size.
* If the exponent length is not a multiple of the window size, then
* this pads the most significant bits with zeros to normalize the
@ -684,28 +647,174 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
* * to be taken in the BN_is_bit_set function.
*/
bits = ((bits+window-1)/window)*window;
idx=bits-1; /* The top bit of the window */
/* Scan the exponent one window at a time starting from the most
/* Allocate a buffer large enough to hold all of the pre-computed
* powers of a, plus computeTemp.
*/
numPowers = 1 << window;
powerbufLen = sizeof(m->d[0])*(top*numPowers +
(top>numPowers?top:numPowers));
if (powerbufLen < 3072)
powerbufFree = alloca(powerbufLen+MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH);
else if ((powerbufFree=(unsigned char*)OPENSSL_malloc(powerbufLen+MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH)) == NULL)
goto err;
powerbuf = MOD_EXP_CTIME_ALIGN(powerbufFree);
memset(powerbuf, 0, powerbufLen);
if (powerbufLen < 3072)
powerbufFree = NULL;
computeTemp.d = (BN_ULONG *)(powerbuf + sizeof(m->d[0])*top*numPowers);
computeTemp.top = computeTemp.dmax = top;
computeTemp.neg = 0;
computeTemp.flags = BN_FLG_STATIC_DATA;
/* Initialize the intermediate result. Do this early to save double conversion,
* once each for a^0 and intermediate result.
*/
if (!BN_to_montgomery(r,BN_value_one(),mont,ctx)) goto err;
/* Initialize computeTemp as a^1 with montgomery precalcs */
am = BN_CTX_get(ctx);
if (am==NULL) goto err;
if (a->neg || BN_ucmp(a,m) >= 0)
{
if (!BN_mod(am,a,m,ctx)) goto err;
if (!BN_to_montgomery(am,am,mont,ctx)) goto err;
}
else if (!BN_to_montgomery(am,a,mont,ctx)) goto err;
if (!BN_copy(&computeTemp, am)) goto err;
if (bn_wexpand(am,top)==NULL || bn_wexpand(r,top)==NULL)
goto err;
#if defined(OPENSSL_BN_ASM_MONT5)
/* This optimization uses ideas from http://eprint.iacr.org/2011/239,
* specifically optimization of cache-timing attack countermeasures
* and pre-computation optimization. */
/* Dedicated window==4 case improves 512-bit RSA sign by ~15%, but as
* 512-bit RSA is hardly relevant, we omit it to spare size... */
if (window==5)
{
void bn_mul_mont_gather5(BN_ULONG *rp,const BN_ULONG *ap,
const void *table,const BN_ULONG *np,
const BN_ULONG *n0,int num,int power);
void bn_scatter5(const BN_ULONG *inp,size_t num,
void *table,size_t power);
BN_ULONG *acc, *np=mont->N.d, *n0=mont->n0;
bn_scatter5(r->d,r->top,powerbuf,0);
bn_scatter5(am->d,am->top,powerbuf,1);
acc = computeTemp.d;
#if 0
for (i=2; i<32; i++)
{
bn_mul_mont_gather5(acc,am->d,powerbuf,np,n0,top,i-1);
bn_scatter5(acc,top,powerbuf,i);
}
#else
/* same as above, but uses squaring for 1/2 of operations */
for (i=2; i<32; i*=2)
{
bn_mul_mont(acc,acc,acc,np,n0,top);
bn_scatter5(acc,top,powerbuf,i);
}
for (i=3; i<8; i+=2)
{
int j;
bn_mul_mont_gather5(acc,am->d,powerbuf,np,n0,top,i-1);
bn_scatter5(acc,top,powerbuf,i);
for (j=2*i; j<32; j*=2)
{
bn_mul_mont(acc,acc,acc,np,n0,top);
bn_scatter5(acc,top,powerbuf,j);
}
}
for (; i<16; i+=2)
{
bn_mul_mont_gather5(acc,am->d,powerbuf,np,n0,top,i-1);
bn_scatter5(acc,top,powerbuf,i);
bn_mul_mont(acc,acc,acc,np,n0,top);
bn_scatter5(acc,top,powerbuf,2*i);
}
for (; i<32; i+=2)
{
bn_mul_mont_gather5(acc,am->d,powerbuf,np,n0,top,i-1);
bn_scatter5(acc,top,powerbuf,i);
}
#endif
acc = r->d;
/* Scan the exponent one window at a time starting from the most
* significant bits.
*/
bits--;
while (bits >= 0)
{
for (wvalue=0, i=0; i<5; i++,bits--)
wvalue = (wvalue<<1)+BN_is_bit_set(p,bits);
bn_mul_mont(acc,acc,acc,np,n0,top);
bn_mul_mont(acc,acc,acc,np,n0,top);
bn_mul_mont(acc,acc,acc,np,n0,top);
bn_mul_mont(acc,acc,acc,np,n0,top);
bn_mul_mont(acc,acc,acc,np,n0,top);
bn_mul_mont_gather5(acc,acc,powerbuf,np,n0,top,wvalue);
}
r->top=top;
bn_correct_top(r);
}
else
#endif
{
if (!MOD_EXP_CTIME_COPY_TO_PREBUF(r, top, powerbuf, 0, numPowers)) goto err;
if (!MOD_EXP_CTIME_COPY_TO_PREBUF(am, top, powerbuf, 1, numPowers)) goto err;
/* If the window size is greater than 1, then calculate
* val[i=2..2^winsize-1]. Powers are computed as a*a^(i-1)
* (even powers could instead be computed as (a^(i/2))^2
* to use the slight performance advantage of sqr over mul).
*/
if (window > 1)
{
for (i=2; i<numPowers; i++)
{
/* Calculate a^i = a^(i-1) * a */
if (!BN_mod_mul_montgomery(&computeTemp,am,&computeTemp,mont,ctx))
goto err;
if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&computeTemp, top, powerbuf, i, numPowers)) goto err;
}
}
/* Scan the exponent one window at a time starting from the most
* significant bits.
*/
while (idx >= 0)
bits--;
while (bits >= 0)
{
wvalue=0; /* The 'value' of the window */
/* Scan the window, squaring the result as we go */
for (i=0; i<window; i++,idx--)
for (i=0; i<window; i++,bits--)
{
if (!BN_mod_mul_montgomery(r,r,r,mont,ctx)) goto err;
wvalue = (wvalue<<1)+BN_is_bit_set(p,idx);
wvalue = (wvalue<<1)+BN_is_bit_set(p,bits);
}
/* Fetch the appropriate pre-computed value from the pre-buf */
if (!MOD_EXP_CTIME_COPY_FROM_PREBUF(computeTemp, top, powerbuf, wvalue, numPowers)) goto err;
if (!MOD_EXP_CTIME_COPY_FROM_PREBUF(&computeTemp, top, powerbuf, wvalue, numPowers)) goto err;
/* Multiply the result into the intermediate result */
if (!BN_mod_mul_montgomery(r,r,computeTemp,mont,ctx)) goto err;
if (!BN_mod_mul_montgomery(r,r,&computeTemp,mont,ctx)) goto err;
}
}
/* Convert the final result from montgomery to standard format */
if (!BN_from_montgomery(rr,r,mont,ctx)) goto err;
@ -715,10 +824,9 @@ err:
if (powerbuf!=NULL)
{
OPENSSL_cleanse(powerbuf,powerbufLen);
OPENSSL_free(powerbufFree);
if (powerbufFree) OPENSSL_free(powerbufFree);
}
if (am!=NULL) BN_clear(am);
if (computeTemp!=NULL) BN_clear(computeTemp);
BN_CTX_end(ctx);
return(ret);
}