crypto/bn/x86_64-mont5.pl: constant-time gather procedure.
At the same time remove minuscule bias in final subtraction. Performance penalty varies from platform to platform, and even with key length. For rsa2048 sign it was observed to be 4% on Sandy Bridge and 7% on Broadwell. CVE-2016-0702 Reviewed-by: Richard Levitte <levitte@openssl.org> Reviewed-by: Rich Salz <rsalz@openssl.org> (cherry picked from master)
This commit is contained in:
parent
08ea966c01
commit
25d14c6c29
3 changed files with 676 additions and 556 deletions
|
@ -775,20 +775,20 @@ bn_sqr8x_mont:
|
|||
# 4096. this is done to allow memory disambiguation logic
|
||||
# do its job.
|
||||
#
|
||||
lea -64(%rsp,$num,4),%r11
|
||||
lea -64(%rsp,$num,2),%r11
|
||||
mov ($n0),$n0 # *n0
|
||||
sub $aptr,%r11
|
||||
and \$4095,%r11
|
||||
cmp %r11,%r10
|
||||
jb .Lsqr8x_sp_alt
|
||||
sub %r11,%rsp # align with $aptr
|
||||
lea -64(%rsp,$num,4),%rsp # alloca(frame+4*$num)
|
||||
lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num)
|
||||
jmp .Lsqr8x_sp_done
|
||||
|
||||
.align 32
|
||||
.Lsqr8x_sp_alt:
|
||||
lea 4096-64(,$num,4),%r10 # 4096-frame-4*$num
|
||||
lea -64(%rsp,$num,4),%rsp # alloca(frame+4*$num)
|
||||
lea 4096-64(,$num,2),%r10 # 4096-frame-2*$num
|
||||
lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num)
|
||||
sub %r10,%r11
|
||||
mov \$0,%r10
|
||||
cmovc %r10,%r11
|
||||
|
@ -798,37 +798,17 @@ bn_sqr8x_mont:
|
|||
mov $num,%r10
|
||||
neg $num
|
||||
|
||||
lea 64(%rsp,$num,2),%r11 # copy of modulus
|
||||
mov $n0, 32(%rsp)
|
||||
mov %rax, 40(%rsp) # save original %rsp
|
||||
.Lsqr8x_body:
|
||||
|
||||
mov $num,$i
|
||||
movq %r11, %xmm2 # save pointer to modulus copy
|
||||
shr \$3+2,$i
|
||||
mov OPENSSL_ia32cap_P+8(%rip),%eax
|
||||
jmp .Lsqr8x_copy_n
|
||||
|
||||
.align 32
|
||||
.Lsqr8x_copy_n:
|
||||
movq 8*0($nptr),%xmm0
|
||||
movq 8*1($nptr),%xmm1
|
||||
movq 8*2($nptr),%xmm3
|
||||
movq 8*3($nptr),%xmm4
|
||||
lea 8*4($nptr),$nptr
|
||||
movdqa %xmm0,16*0(%r11)
|
||||
movdqa %xmm1,16*1(%r11)
|
||||
movdqa %xmm3,16*2(%r11)
|
||||
movdqa %xmm4,16*3(%r11)
|
||||
lea 16*4(%r11),%r11
|
||||
dec $i
|
||||
jnz .Lsqr8x_copy_n
|
||||
|
||||
movq $nptr, %xmm2 # save pointer to modulus
|
||||
pxor %xmm0,%xmm0
|
||||
movq $rptr,%xmm1 # save $rptr
|
||||
movq %r10, %xmm3 # -$num
|
||||
___
|
||||
$code.=<<___ if ($addx);
|
||||
mov OPENSSL_ia32cap_P+8(%rip),%eax
|
||||
and \$0x80100,%eax
|
||||
cmp \$0x80100,%eax
|
||||
jne .Lsqr8x_nox
|
||||
|
@ -837,7 +817,6 @@ $code.=<<___ if ($addx);
|
|||
|
||||
pxor %xmm0,%xmm0
|
||||
lea 48(%rsp),%rax
|
||||
lea 64(%rsp,$num,2),%rdx
|
||||
shr \$3+2,$num
|
||||
mov 40(%rsp),%rsi # restore %rsp
|
||||
jmp .Lsqr8x_zero
|
||||
|
@ -850,7 +829,6 @@ $code.=<<___;
|
|||
|
||||
pxor %xmm0,%xmm0
|
||||
lea 48(%rsp),%rax
|
||||
lea 64(%rsp,$num,2),%rdx
|
||||
shr \$3+2,$num
|
||||
mov 40(%rsp),%rsi # restore %rsp
|
||||
jmp .Lsqr8x_zero
|
||||
|
@ -862,11 +840,6 @@ $code.=<<___;
|
|||
movdqa %xmm0,16*2(%rax)
|
||||
movdqa %xmm0,16*3(%rax)
|
||||
lea 16*4(%rax),%rax
|
||||
movdqa %xmm0,16*0(%rdx) # wipe n
|
||||
movdqa %xmm0,16*1(%rdx)
|
||||
movdqa %xmm0,16*2(%rdx)
|
||||
movdqa %xmm0,16*3(%rdx)
|
||||
lea 16*4(%rdx),%rdx
|
||||
dec $num
|
||||
jnz .Lsqr8x_zero
|
||||
|
||||
|
|
File diff suppressed because it is too large
Load diff
|
@ -788,8 +788,8 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
|
|||
if (window >= 5) {
|
||||
window = 5; /* ~5% improvement for RSA2048 sign, and even
|
||||
* for RSA4096 */
|
||||
if ((top & 7) == 0)
|
||||
powerbufLen += 2 * top * sizeof(m->d[0]);
|
||||
/* reserve space for mont->N.d[] copy */
|
||||
powerbufLen += top * sizeof(mont->N.d[0]);
|
||||
}
|
||||
#endif
|
||||
(void)0;
|
||||
|
@ -1010,7 +1010,7 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
|
|||
const BN_ULONG *not_used, const BN_ULONG *np,
|
||||
const BN_ULONG *n0, int num);
|
||||
|
||||
BN_ULONG *np = mont->N.d, *n0 = mont->n0, *np2;
|
||||
BN_ULONG *n0 = mont->n0, *np;
|
||||
|
||||
/*
|
||||
* BN_to_montgomery can contaminate words above .top [in
|
||||
|
@ -1021,11 +1021,11 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
|
|||
for (i = tmp.top; i < top; i++)
|
||||
tmp.d[i] = 0;
|
||||
|
||||
if (top & 7)
|
||||
np2 = np;
|
||||
else
|
||||
for (np2 = am.d + top, i = 0; i < top; i++)
|
||||
np2[2 * i] = np[i];
|
||||
/*
|
||||
* copy mont->N.d[] to improve cache locality
|
||||
*/
|
||||
for (np = am.d + top, i = 0; i < top; i++)
|
||||
np[i] = mont->N.d[i];
|
||||
|
||||
bn_scatter5(tmp.d, top, powerbuf, 0);
|
||||
bn_scatter5(am.d, am.top, powerbuf, 1);
|
||||
|
@ -1035,7 +1035,7 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
|
|||
# if 0
|
||||
for (i = 3; i < 32; i++) {
|
||||
/* Calculate a^i = a^(i-1) * a */
|
||||
bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np2, n0, top, i - 1);
|
||||
bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np, n0, top, i - 1);
|
||||
bn_scatter5(tmp.d, top, powerbuf, i);
|
||||
}
|
||||
# else
|
||||
|
@ -1046,7 +1046,7 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
|
|||
}
|
||||
for (i = 3; i < 8; i += 2) {
|
||||
int j;
|
||||
bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np2, n0, top, i - 1);
|
||||
bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np, n0, top, i - 1);
|
||||
bn_scatter5(tmp.d, top, powerbuf, i);
|
||||
for (j = 2 * i; j < 32; j *= 2) {
|
||||
bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
|
||||
|
@ -1054,13 +1054,13 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
|
|||
}
|
||||
}
|
||||
for (; i < 16; i += 2) {
|
||||
bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np2, n0, top, i - 1);
|
||||
bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np, n0, top, i - 1);
|
||||
bn_scatter5(tmp.d, top, powerbuf, i);
|
||||
bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
|
||||
bn_scatter5(tmp.d, top, powerbuf, 2 * i);
|
||||
}
|
||||
for (; i < 32; i += 2) {
|
||||
bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np2, n0, top, i - 1);
|
||||
bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np, n0, top, i - 1);
|
||||
bn_scatter5(tmp.d, top, powerbuf, i);
|
||||
}
|
||||
# endif
|
||||
|
@ -1089,11 +1089,11 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
|
|||
while (bits >= 0) {
|
||||
wvalue = bn_get_bits5(p->d, bits - 4);
|
||||
bits -= 5;
|
||||
bn_power5(tmp.d, tmp.d, powerbuf, np2, n0, top, wvalue);
|
||||
bn_power5(tmp.d, tmp.d, powerbuf, np, n0, top, wvalue);
|
||||
}
|
||||
}
|
||||
|
||||
ret = bn_from_montgomery(tmp.d, tmp.d, NULL, np2, n0, top);
|
||||
ret = bn_from_montgomery(tmp.d, tmp.d, NULL, np, n0, top);
|
||||
tmp.top = top;
|
||||
bn_correct_top(&tmp);
|
||||
if (ret) {
|
||||
|
|
Loading…
Reference in a new issue