ec/asm/ecp_nistz256-x86_64.pl: addition to perform stricter reduction.
Addition was not preserving inputs' property of being fully reduced.
Thanks to Brian Smith for reporting this.
Reviewed-by: Rich Salz <rsalz@openssl.org>
(cherry picked from commit b62b2454fa)
This commit is contained in:
parent
1f61e8f07a
commit
e76cf5c06d
2 changed files with 86 additions and 64 deletions
|
@ -128,6 +128,7 @@ ecp_nistz256_mul_by_2:
|
|||
push %r13
|
||||
|
||||
mov 8*0($a_ptr), $a0
|
||||
xor $t4,$t4
|
||||
mov 8*1($a_ptr), $a1
|
||||
add $a0, $a0 # a0:a3+a0:a3
|
||||
mov 8*2($a_ptr), $a2
|
||||
|
@ -138,7 +139,7 @@ ecp_nistz256_mul_by_2:
|
|||
adc $a2, $a2
|
||||
adc $a3, $a3
|
||||
mov $a1, $t1
|
||||
sbb $t4, $t4
|
||||
adc \$0, $t4
|
||||
|
||||
sub 8*0($a_ptr), $a0
|
||||
mov $a2, $t2
|
||||
|
@ -146,14 +147,14 @@ ecp_nistz256_mul_by_2:
|
|||
sbb 8*2($a_ptr), $a2
|
||||
mov $a3, $t3
|
||||
sbb 8*3($a_ptr), $a3
|
||||
test $t4, $t4
|
||||
sbb \$0, $t4
|
||||
|
||||
cmovz $t0, $a0
|
||||
cmovz $t1, $a1
|
||||
cmovb $t0, $a0
|
||||
cmovb $t1, $a1
|
||||
mov $a0, 8*0($r_ptr)
|
||||
cmovz $t2, $a2
|
||||
cmovb $t2, $a2
|
||||
mov $a1, 8*1($r_ptr)
|
||||
cmovz $t3, $a3
|
||||
cmovb $t3, $a3
|
||||
mov $a2, 8*2($r_ptr)
|
||||
mov $a3, 8*3($r_ptr)
|
||||
|
||||
|
@ -250,12 +251,12 @@ ecp_nistz256_mul_by_3:
|
|||
sbb \$0, $a2
|
||||
mov $a3, $t3
|
||||
sbb .Lpoly+8*3(%rip), $a3
|
||||
test $t4, $t4
|
||||
sbb \$0, $t4
|
||||
|
||||
cmovz $t0, $a0
|
||||
cmovz $t1, $a1
|
||||
cmovz $t2, $a2
|
||||
cmovz $t3, $a3
|
||||
cmovb $t0, $a0
|
||||
cmovb $t1, $a1
|
||||
cmovb $t2, $a2
|
||||
cmovb $t3, $a3
|
||||
|
||||
xor $t4, $t4
|
||||
add 8*0($a_ptr), $a0 # a0:a3+=a_ptr[0:3]
|
||||
|
@ -272,14 +273,14 @@ ecp_nistz256_mul_by_3:
|
|||
sbb \$0, $a2
|
||||
mov $a3, $t3
|
||||
sbb .Lpoly+8*3(%rip), $a3
|
||||
test $t4, $t4
|
||||
sbb \$0, $t4
|
||||
|
||||
cmovz $t0, $a0
|
||||
cmovz $t1, $a1
|
||||
cmovb $t0, $a0
|
||||
cmovb $t1, $a1
|
||||
mov $a0, 8*0($r_ptr)
|
||||
cmovz $t2, $a2
|
||||
cmovb $t2, $a2
|
||||
mov $a1, 8*1($r_ptr)
|
||||
cmovz $t3, $a3
|
||||
cmovb $t3, $a3
|
||||
mov $a2, 8*2($r_ptr)
|
||||
mov $a3, 8*3($r_ptr)
|
||||
|
||||
|
@ -318,14 +319,14 @@ ecp_nistz256_add:
|
|||
sbb 8*2($a_ptr), $a2
|
||||
mov $a3, $t3
|
||||
sbb 8*3($a_ptr), $a3
|
||||
test $t4, $t4
|
||||
sbb \$0, $t4
|
||||
|
||||
cmovz $t0, $a0
|
||||
cmovz $t1, $a1
|
||||
cmovb $t0, $a0
|
||||
cmovb $t1, $a1
|
||||
mov $a0, 8*0($r_ptr)
|
||||
cmovz $t2, $a2
|
||||
cmovb $t2, $a2
|
||||
mov $a1, 8*1($r_ptr)
|
||||
cmovz $t3, $a3
|
||||
cmovb $t3, $a3
|
||||
mov $a2, 8*2($r_ptr)
|
||||
mov $a3, 8*3($r_ptr)
|
||||
|
||||
|
@ -1840,13 +1841,14 @@ $code.=<<___;
|
|||
.type __ecp_nistz256_add_toq,\@abi-omnipotent
|
||||
.align 32
|
||||
__ecp_nistz256_add_toq:
|
||||
xor $t4,$t4
|
||||
add 8*0($b_ptr), $a0
|
||||
adc 8*1($b_ptr), $a1
|
||||
mov $a0, $t0
|
||||
adc 8*2($b_ptr), $a2
|
||||
adc 8*3($b_ptr), $a3
|
||||
mov $a1, $t1
|
||||
sbb $t4, $t4
|
||||
adc \$0, $t4
|
||||
|
||||
sub \$-1, $a0
|
||||
mov $a2, $t2
|
||||
|
@ -1854,14 +1856,14 @@ __ecp_nistz256_add_toq:
|
|||
sbb \$0, $a2
|
||||
mov $a3, $t3
|
||||
sbb $poly3, $a3
|
||||
test $t4, $t4
|
||||
sbb \$0, $t4
|
||||
|
||||
cmovz $t0, $a0
|
||||
cmovz $t1, $a1
|
||||
cmovb $t0, $a0
|
||||
cmovb $t1, $a1
|
||||
mov $a0, 8*0($r_ptr)
|
||||
cmovz $t2, $a2
|
||||
cmovb $t2, $a2
|
||||
mov $a1, 8*1($r_ptr)
|
||||
cmovz $t3, $a3
|
||||
cmovb $t3, $a3
|
||||
mov $a2, 8*2($r_ptr)
|
||||
mov $a3, 8*3($r_ptr)
|
||||
|
||||
|
@ -1929,13 +1931,14 @@ __ecp_nistz256_subq:
|
|||
.type __ecp_nistz256_mul_by_2q,\@abi-omnipotent
|
||||
.align 32
|
||||
__ecp_nistz256_mul_by_2q:
|
||||
xor $t4, $t4
|
||||
add $a0, $a0 # a0:a3+a0:a3
|
||||
adc $a1, $a1
|
||||
mov $a0, $t0
|
||||
adc $a2, $a2
|
||||
adc $a3, $a3
|
||||
mov $a1, $t1
|
||||
sbb $t4, $t4
|
||||
adc \$0, $t4
|
||||
|
||||
sub \$-1, $a0
|
||||
mov $a2, $t2
|
||||
|
@ -1943,14 +1946,14 @@ __ecp_nistz256_mul_by_2q:
|
|||
sbb \$0, $a2
|
||||
mov $a3, $t3
|
||||
sbb $poly3, $a3
|
||||
test $t4, $t4
|
||||
sbb \$0, $t4
|
||||
|
||||
cmovz $t0, $a0
|
||||
cmovz $t1, $a1
|
||||
cmovb $t0, $a0
|
||||
cmovb $t1, $a1
|
||||
mov $a0, 8*0($r_ptr)
|
||||
cmovz $t2, $a2
|
||||
cmovb $t2, $a2
|
||||
mov $a1, 8*1($r_ptr)
|
||||
cmovz $t3, $a3
|
||||
cmovb $t3, $a3
|
||||
mov $a2, 8*2($r_ptr)
|
||||
mov $a3, 8*3($r_ptr)
|
||||
|
||||
|
@ -2405,6 +2408,7 @@ $code.=<<___;
|
|||
#lea $Hsqr(%rsp), $r_ptr # 2*U1*H^2
|
||||
#call __ecp_nistz256_mul_by_2 # ecp_nistz256_mul_by_2(Hsqr, U2);
|
||||
|
||||
xor $t4, $t4
|
||||
add $acc0, $acc0 # a0:a3+a0:a3
|
||||
lea $Rsqr(%rsp), $a_ptr
|
||||
adc $acc1, $acc1
|
||||
|
@ -2412,7 +2416,7 @@ $code.=<<___;
|
|||
adc $acc2, $acc2
|
||||
adc $acc3, $acc3
|
||||
mov $acc1, $t1
|
||||
sbb $t4, $t4
|
||||
adc \$0, $t4
|
||||
|
||||
sub \$-1, $acc0
|
||||
mov $acc2, $t2
|
||||
|
@ -2420,15 +2424,15 @@ $code.=<<___;
|
|||
sbb \$0, $acc2
|
||||
mov $acc3, $t3
|
||||
sbb $poly3, $acc3
|
||||
test $t4, $t4
|
||||
sbb \$0, $t4
|
||||
|
||||
cmovz $t0, $acc0
|
||||
cmovb $t0, $acc0
|
||||
mov 8*0($a_ptr), $t0
|
||||
cmovz $t1, $acc1
|
||||
cmovb $t1, $acc1
|
||||
mov 8*1($a_ptr), $t1
|
||||
cmovz $t2, $acc2
|
||||
cmovb $t2, $acc2
|
||||
mov 8*2($a_ptr), $t2
|
||||
cmovz $t3, $acc3
|
||||
cmovb $t3, $acc3
|
||||
mov 8*3($a_ptr), $t3
|
||||
|
||||
call __ecp_nistz256_sub$x # p256_sub(res_x, Rsqr, Hsqr);
|
||||
|
@ -2710,6 +2714,7 @@ $code.=<<___;
|
|||
#lea $Hsqr(%rsp), $r_ptr # 2*U1*H^2
|
||||
#call __ecp_nistz256_mul_by_2 # ecp_nistz256_mul_by_2(Hsqr, U2);
|
||||
|
||||
xor $t4, $t4
|
||||
add $acc0, $acc0 # a0:a3+a0:a3
|
||||
lea $Rsqr(%rsp), $a_ptr
|
||||
adc $acc1, $acc1
|
||||
|
@ -2717,7 +2722,7 @@ $code.=<<___;
|
|||
adc $acc2, $acc2
|
||||
adc $acc3, $acc3
|
||||
mov $acc1, $t1
|
||||
sbb $t4, $t4
|
||||
adc \$0, $t4
|
||||
|
||||
sub \$-1, $acc0
|
||||
mov $acc2, $t2
|
||||
|
@ -2725,15 +2730,15 @@ $code.=<<___;
|
|||
sbb \$0, $acc2
|
||||
mov $acc3, $t3
|
||||
sbb $poly3, $acc3
|
||||
test $t4, $t4
|
||||
sbb \$0, $t4
|
||||
|
||||
cmovz $t0, $acc0
|
||||
cmovb $t0, $acc0
|
||||
mov 8*0($a_ptr), $t0
|
||||
cmovz $t1, $acc1
|
||||
cmovb $t1, $acc1
|
||||
mov 8*1($a_ptr), $t1
|
||||
cmovz $t2, $acc2
|
||||
cmovb $t2, $acc2
|
||||
mov 8*2($a_ptr), $t2
|
||||
cmovz $t3, $acc3
|
||||
cmovb $t3, $acc3
|
||||
mov 8*3($a_ptr), $t3
|
||||
|
||||
call __ecp_nistz256_sub$x # p256_sub(res_x, Rsqr, Hsqr);
|
||||
|
@ -2885,14 +2890,14 @@ __ecp_nistz256_add_tox:
|
|||
sbb \$0, $a2
|
||||
mov $a3, $t3
|
||||
sbb $poly3, $a3
|
||||
sbb \$0, $t4
|
||||
|
||||
bt \$0, $t4
|
||||
cmovnc $t0, $a0
|
||||
cmovnc $t1, $a1
|
||||
cmovb $t0, $a0
|
||||
cmovb $t1, $a1
|
||||
mov $a0, 8*0($r_ptr)
|
||||
cmovnc $t2, $a2
|
||||
cmovb $t2, $a2
|
||||
mov $a1, 8*1($r_ptr)
|
||||
cmovnc $t3, $a3
|
||||
cmovb $t3, $a3
|
||||
mov $a2, 8*2($r_ptr)
|
||||
mov $a3, 8*3($r_ptr)
|
||||
|
||||
|
@ -2980,14 +2985,14 @@ __ecp_nistz256_mul_by_2x:
|
|||
sbb \$0, $a2
|
||||
mov $a3, $t3
|
||||
sbb $poly3, $a3
|
||||
sbb \$0, $t4
|
||||
|
||||
bt \$0, $t4
|
||||
cmovnc $t0, $a0
|
||||
cmovnc $t1, $a1
|
||||
cmovb $t0, $a0
|
||||
cmovb $t1, $a1
|
||||
mov $a0, 8*0($r_ptr)
|
||||
cmovnc $t2, $a2
|
||||
cmovb $t2, $a2
|
||||
mov $a1, 8*1($r_ptr)
|
||||
cmovnc $t3, $a3
|
||||
cmovb $t3, $a3
|
||||
mov $a2, 8*2($r_ptr)
|
||||
mov $a3, 8*3($r_ptr)
|
||||
|
||||
|
|
|
@ -82,19 +82,36 @@ typedef struct ec_pre_comp_st {
|
|||
} EC_PRE_COMP;
|
||||
|
||||
/* Functions implemented in assembly */
|
||||
/* Modular mul by 2: res = 2*a mod P */
|
||||
void ecp_nistz256_mul_by_2(BN_ULONG res[P256_LIMBS],
|
||||
const BN_ULONG a[P256_LIMBS]);
|
||||
/* Modular div by 2: res = a/2 mod P */
|
||||
void ecp_nistz256_div_by_2(BN_ULONG res[P256_LIMBS],
|
||||
const BN_ULONG a[P256_LIMBS]);
|
||||
/* Modular mul by 3: res = 3*a mod P */
|
||||
void ecp_nistz256_mul_by_3(BN_ULONG res[P256_LIMBS],
|
||||
const BN_ULONG a[P256_LIMBS]);
|
||||
/*
|
||||
* Most of below mentioned functions *preserve* the property of inputs
|
||||
* being fully reduced, i.e. being in [0, modulus) range. Simply put if
|
||||
* inputs are fully reduced, then output is too. Note that the reverse is
|
||||
* not true, in the sense that given partially reduced inputs the output can be
|
||||
* either, and is not unlikely to be reduced. And "most" in the first sentence refers to
|
||||
* the fact that given the calculations flow one can tolerate that
|
||||
* addition, 1st function below, produces partially reduced result *if*
|
||||
* multiplications by 2 and 3, which customarily use addition, fully
|
||||
* reduce it. This effectively gives two options: a) addition produces
|
||||
* fully reduced result [as long as inputs are, just like remaining
|
||||
* functions]; b) addition is allowed to produce partially reduced
|
||||
* result, but multiplications by 2 and 3 perform additional reduction
|
||||
* step. Choice between the two can be platform-specific, but it was a)
|
||||
* in all cases so far...
|
||||
*/
|
||||
/* Modular add: res = a+b mod P */
|
||||
void ecp_nistz256_add(BN_ULONG res[P256_LIMBS],
|
||||
const BN_ULONG a[P256_LIMBS],
|
||||
const BN_ULONG b[P256_LIMBS]);
|
||||
/* Modular mul by 2: res = 2*a mod P */
|
||||
void ecp_nistz256_mul_by_2(BN_ULONG res[P256_LIMBS],
|
||||
const BN_ULONG a[P256_LIMBS]);
|
||||
/* Modular mul by 3: res = 3*a mod P */
|
||||
void ecp_nistz256_mul_by_3(BN_ULONG res[P256_LIMBS],
|
||||
const BN_ULONG a[P256_LIMBS]);
|
||||
|
||||
/* Modular div by 2: res = a/2 mod P */
|
||||
void ecp_nistz256_div_by_2(BN_ULONG res[P256_LIMBS],
|
||||
const BN_ULONG a[P256_LIMBS]);
|
||||
/* Modular sub: res = a-b mod P */
|
||||
void ecp_nistz256_sub(BN_ULONG res[P256_LIMBS],
|
||||
const BN_ULONG a[P256_LIMBS],
|
||||
|
|
Loading…
Reference in a new issue