crypto/bn/x86_64-mont5.pl: constant-time gather procedure.

At the same time remove minuscule bias in final subtraction.
The performance penalty varies from platform to platform, and even with
key length. For rsa2048 sign it was observed to be 4% on Sandy
Bridge and 7% on Broadwell.

CVE-2016-0702

Reviewed-by: Richard Levitte <levitte@openssl.org>
Reviewed-by: Rich Salz <rsalz@openssl.org>
(cherry picked from master)
This commit is contained in:
Andy Polyakov 2016-01-25 23:41:01 +01:00 committed by Matt Caswell
parent 08ea966c01
commit 25d14c6c29
3 changed files with 676 additions and 556 deletions

View file

@ -775,20 +775,20 @@ bn_sqr8x_mont:
# 4096. this is done to allow memory disambiguation logic
# do its job.
#
lea -64(%rsp,$num,4),%r11
lea -64(%rsp,$num,2),%r11
mov ($n0),$n0 # *n0
sub $aptr,%r11
and \$4095,%r11
cmp %r11,%r10
jb .Lsqr8x_sp_alt
sub %r11,%rsp # align with $aptr
lea -64(%rsp,$num,4),%rsp # alloca(frame+4*$num)
lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num)
jmp .Lsqr8x_sp_done
.align 32
.Lsqr8x_sp_alt:
lea 4096-64(,$num,4),%r10 # 4096-frame-4*$num
lea -64(%rsp,$num,4),%rsp # alloca(frame+4*$num)
lea 4096-64(,$num,2),%r10 # 4096-frame-2*$num
lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num)
sub %r10,%r11
mov \$0,%r10
cmovc %r10,%r11
@ -798,37 +798,17 @@ bn_sqr8x_mont:
mov $num,%r10
neg $num
lea 64(%rsp,$num,2),%r11 # copy of modulus
mov $n0, 32(%rsp)
mov %rax, 40(%rsp) # save original %rsp
.Lsqr8x_body:
mov $num,$i
movq %r11, %xmm2 # save pointer to modulus copy
shr \$3+2,$i
mov OPENSSL_ia32cap_P+8(%rip),%eax
jmp .Lsqr8x_copy_n
.align 32
.Lsqr8x_copy_n:
movq 8*0($nptr),%xmm0
movq 8*1($nptr),%xmm1
movq 8*2($nptr),%xmm3
movq 8*3($nptr),%xmm4
lea 8*4($nptr),$nptr
movdqa %xmm0,16*0(%r11)
movdqa %xmm1,16*1(%r11)
movdqa %xmm3,16*2(%r11)
movdqa %xmm4,16*3(%r11)
lea 16*4(%r11),%r11
dec $i
jnz .Lsqr8x_copy_n
movq $nptr, %xmm2 # save pointer to modulus
pxor %xmm0,%xmm0
movq $rptr,%xmm1 # save $rptr
movq %r10, %xmm3 # -$num
___
$code.=<<___ if ($addx);
mov OPENSSL_ia32cap_P+8(%rip),%eax
and \$0x80100,%eax
cmp \$0x80100,%eax
jne .Lsqr8x_nox
@ -837,7 +817,6 @@ $code.=<<___ if ($addx);
pxor %xmm0,%xmm0
lea 48(%rsp),%rax
lea 64(%rsp,$num,2),%rdx
shr \$3+2,$num
mov 40(%rsp),%rsi # restore %rsp
jmp .Lsqr8x_zero
@ -850,7 +829,6 @@ $code.=<<___;
pxor %xmm0,%xmm0
lea 48(%rsp),%rax
lea 64(%rsp,$num,2),%rdx
shr \$3+2,$num
mov 40(%rsp),%rsi # restore %rsp
jmp .Lsqr8x_zero
@ -862,11 +840,6 @@ $code.=<<___;
movdqa %xmm0,16*2(%rax)
movdqa %xmm0,16*3(%rax)
lea 16*4(%rax),%rax
movdqa %xmm0,16*0(%rdx) # wipe n
movdqa %xmm0,16*1(%rdx)
movdqa %xmm0,16*2(%rdx)
movdqa %xmm0,16*3(%rdx)
lea 16*4(%rdx),%rdx
dec $num
jnz .Lsqr8x_zero

File diff suppressed because it is too large Load diff

View file

@ -788,8 +788,8 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
if (window >= 5) {
window = 5; /* ~5% improvement for RSA2048 sign, and even
* for RSA4096 */
if ((top & 7) == 0)
powerbufLen += 2 * top * sizeof(m->d[0]);
/* reserve space for mont->N.d[] copy */
powerbufLen += top * sizeof(mont->N.d[0]);
}
#endif
(void)0;
@ -1010,7 +1010,7 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
const BN_ULONG *not_used, const BN_ULONG *np,
const BN_ULONG *n0, int num);
BN_ULONG *np = mont->N.d, *n0 = mont->n0, *np2;
BN_ULONG *n0 = mont->n0, *np;
/*
* BN_to_montgomery can contaminate words above .top [in
@ -1021,11 +1021,11 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
for (i = tmp.top; i < top; i++)
tmp.d[i] = 0;
if (top & 7)
np2 = np;
else
for (np2 = am.d + top, i = 0; i < top; i++)
np2[2 * i] = np[i];
/*
* copy mont->N.d[] to improve cache locality
*/
for (np = am.d + top, i = 0; i < top; i++)
np[i] = mont->N.d[i];
bn_scatter5(tmp.d, top, powerbuf, 0);
bn_scatter5(am.d, am.top, powerbuf, 1);
@ -1035,7 +1035,7 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
# if 0
for (i = 3; i < 32; i++) {
/* Calculate a^i = a^(i-1) * a */
bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np2, n0, top, i - 1);
bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np, n0, top, i - 1);
bn_scatter5(tmp.d, top, powerbuf, i);
}
# else
@ -1046,7 +1046,7 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
}
for (i = 3; i < 8; i += 2) {
int j;
bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np2, n0, top, i - 1);
bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np, n0, top, i - 1);
bn_scatter5(tmp.d, top, powerbuf, i);
for (j = 2 * i; j < 32; j *= 2) {
bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
@ -1054,13 +1054,13 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
}
}
for (; i < 16; i += 2) {
bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np2, n0, top, i - 1);
bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np, n0, top, i - 1);
bn_scatter5(tmp.d, top, powerbuf, i);
bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
bn_scatter5(tmp.d, top, powerbuf, 2 * i);
}
for (; i < 32; i += 2) {
bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np2, n0, top, i - 1);
bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np, n0, top, i - 1);
bn_scatter5(tmp.d, top, powerbuf, i);
}
# endif
@ -1089,11 +1089,11 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
while (bits >= 0) {
wvalue = bn_get_bits5(p->d, bits - 4);
bits -= 5;
bn_power5(tmp.d, tmp.d, powerbuf, np2, n0, top, wvalue);
bn_power5(tmp.d, tmp.d, powerbuf, np, n0, top, wvalue);
}
}
ret = bn_from_montgomery(tmp.d, tmp.d, NULL, np2, n0, top);
ret = bn_from_montgomery(tmp.d, tmp.d, NULL, np, n0, top);
tmp.top = top;
bn_correct_top(&tmp);
if (ret) {