ec/asm/ecp_nistz256-*.pl: addition to perform stricter reduction.
Addition was not preserving inputs' property of being fully reduced. Thanks to Brian Smith for reporting this. Reviewed-by: Rich Salz <rsalz@openssl.org>
This commit is contained in:
parent
b62b2454fa
commit
dfde4219fd
4 changed files with 158 additions and 89 deletions
|
@ -174,10 +174,7 @@ __ecp_nistz256_mul_by_2:
|
|||
adcs $a6,$a6,$a6
|
||||
mov $ff,#0
|
||||
adcs $a7,$a7,$a7
|
||||
#ifdef __thumb2__
|
||||
it cs
|
||||
#endif
|
||||
movcs $ff,#-1 @ $ff = carry ? -1 : 0
|
||||
adc $ff,$ff,#0
|
||||
|
||||
b .Lreduce_by_sub
|
||||
.size __ecp_nistz256_mul_by_2,.-__ecp_nistz256_mul_by_2
|
||||
|
@ -228,35 +225,45 @@ __ecp_nistz256_add:
|
|||
adcs $a6,$a6,$t2
|
||||
mov $ff,#0
|
||||
adcs $a7,$a7,$t3
|
||||
#ifdef __thumb2__
|
||||
it cs
|
||||
#endif
|
||||
movcs $ff,#-1 @ $ff = carry ? -1 : 0, "broadcast" carry
|
||||
adc $ff,$ff,#0
|
||||
ldr lr,[sp],#4 @ pop lr
|
||||
|
||||
.Lreduce_by_sub:
|
||||
|
||||
@ if a+b carries, subtract modulus.
|
||||
@ if a+b >= modulus, subtract modulus.
|
||||
@
|
||||
@ But since comparison implies subtraction, we subtract
|
||||
@ modulus and then add it back if subtraction borrowed.
|
||||
|
||||
subs $a0,$a0,#-1
|
||||
sbcs $a1,$a1,#-1
|
||||
sbcs $a2,$a2,#-1
|
||||
sbcs $a3,$a3,#0
|
||||
sbcs $a4,$a4,#0
|
||||
sbcs $a5,$a5,#0
|
||||
sbcs $a6,$a6,#1
|
||||
sbcs $a7,$a7,#-1
|
||||
sbc $ff,$ff,#0
|
||||
|
||||
@ Note that because mod has special form, i.e. consists of
|
||||
@ 0xffffffff, 1 and 0s, we can conditionally synthesize it by
|
||||
@ using value of broadcasted carry as a whole or extracting
|
||||
@ single bit. Follow $ff register...
|
||||
@ using value of borrow as a whole or extracting single bit.
|
||||
@ Follow $ff register...
|
||||
|
||||
subs $a0,$a0,$ff @ subtract synthesized modulus
|
||||
sbcs $a1,$a1,$ff
|
||||
adds $a0,$a0,$ff @ add synthesized modulus
|
||||
adcs $a1,$a1,$ff
|
||||
str $a0,[$r_ptr,#0]
|
||||
sbcs $a2,$a2,$ff
|
||||
adcs $a2,$a2,$ff
|
||||
str $a1,[$r_ptr,#4]
|
||||
sbcs $a3,$a3,#0
|
||||
adcs $a3,$a3,#0
|
||||
str $a2,[$r_ptr,#8]
|
||||
sbcs $a4,$a4,#0
|
||||
adcs $a4,$a4,#0
|
||||
str $a3,[$r_ptr,#12]
|
||||
sbcs $a5,$a5,#0
|
||||
adcs $a5,$a5,#0
|
||||
str $a4,[$r_ptr,#16]
|
||||
sbcs $a6,$a6,$ff,lsr#31
|
||||
adcs $a6,$a6,$ff,lsr#31
|
||||
str $a5,[$r_ptr,#20]
|
||||
sbcs $a7,$a7,$ff
|
||||
adcs $a7,$a7,$ff
|
||||
str $a6,[$r_ptr,#24]
|
||||
str $a7,[$r_ptr,#28]
|
||||
|
||||
|
@ -304,26 +311,29 @@ __ecp_nistz256_mul_by_3:
|
|||
adcs $a6,$a6,$a6
|
||||
mov $ff,#0
|
||||
adcs $a7,$a7,$a7
|
||||
#ifdef __thumb2__
|
||||
it cs
|
||||
#endif
|
||||
movcs $ff,#-1 @ $ff = carry ? -1 : 0, "broadcast" carry
|
||||
adc $ff,$ff,#0
|
||||
|
||||
subs $a0,$a0,$ff @ subtract synthesized modulus, see
|
||||
@ .Lreduce_by_sub for details, except
|
||||
@ that we don't write anything to
|
||||
@ memory, but keep intermediate
|
||||
@ results in registers...
|
||||
sbcs $a1,$a1,$ff
|
||||
sbcs $a2,$a2,$ff
|
||||
subs $a0,$a0,#-1 @ .Lreduce_by_sub but without stores
|
||||
sbcs $a1,$a1,#-1
|
||||
sbcs $a2,$a2,#-1
|
||||
sbcs $a3,$a3,#0
|
||||
sbcs $a4,$a4,#0
|
||||
ldr $b_ptr,[$a_ptr,#0]
|
||||
sbcs $a5,$a5,#0
|
||||
sbcs $a6,$a6,#1
|
||||
sbcs $a7,$a7,#-1
|
||||
sbc $ff,$ff,#0
|
||||
|
||||
adds $a0,$a0,$ff @ add synthesized modulus
|
||||
adcs $a1,$a1,$ff
|
||||
adcs $a2,$a2,$ff
|
||||
adcs $a3,$a3,#0
|
||||
adcs $a4,$a4,#0
|
||||
ldr $b_ptr,[$a_ptr,#0]
|
||||
adcs $a5,$a5,#0
|
||||
ldr $t1,[$a_ptr,#4]
|
||||
sbcs $a6,$a6,$ff,lsr#31
|
||||
adcs $a6,$a6,$ff,lsr#31
|
||||
ldr $t2,[$a_ptr,#8]
|
||||
sbcs $a7,$a7,$ff
|
||||
adc $a7,$a7,$ff
|
||||
|
||||
ldr $t0,[$a_ptr,#12]
|
||||
adds $a0,$a0,$b_ptr @ 2*a[0:7]+=a[0:7]
|
||||
|
@ -339,10 +349,7 @@ __ecp_nistz256_mul_by_3:
|
|||
adcs $a6,$a6,$t2
|
||||
mov $ff,#0
|
||||
adcs $a7,$a7,$t3
|
||||
#ifdef __thumb2__
|
||||
it cs
|
||||
#endif
|
||||
movcs $ff,#-1 @ $ff = carry ? -1 : 0, "broadcast" carry
|
||||
adc $ff,$ff,#0
|
||||
ldr lr,[sp],#4 @ pop lr
|
||||
|
||||
b .Lreduce_by_sub
|
||||
|
@ -1210,25 +1217,42 @@ __ecp_nistz256_add_self:
|
|||
adcs $a6,$a6,$a6
|
||||
mov $ff,#0
|
||||
adcs $a7,$a7,$a7
|
||||
#ifdef __thumb2__
|
||||
it cs
|
||||
#endif
|
||||
movcs $ff,#-1 @ $ff = carry ? -1 : 0
|
||||
adc $ff,$ff,#0
|
||||
|
||||
subs $a0,$a0,$ff @ subtract synthesized modulus
|
||||
sbcs $a1,$a1,$ff
|
||||
str $a0,[$r_ptr,#0]
|
||||
sbcs $a2,$a2,$ff
|
||||
str $a1,[$r_ptr,#4]
|
||||
@ if a+b >= modulus, subtract modulus.
|
||||
@
|
||||
@ But since comparison implies subtraction, we subtract
|
||||
@ modulus and then add it back if subtraction borrowed.
|
||||
|
||||
subs $a0,$a0,#-1
|
||||
sbcs $a1,$a1,#-1
|
||||
sbcs $a2,$a2,#-1
|
||||
sbcs $a3,$a3,#0
|
||||
str $a2,[$r_ptr,#8]
|
||||
sbcs $a4,$a4,#0
|
||||
str $a3,[$r_ptr,#12]
|
||||
sbcs $a5,$a5,#0
|
||||
sbcs $a6,$a6,#1
|
||||
sbcs $a7,$a7,#-1
|
||||
sbc $ff,$ff,#0
|
||||
|
||||
@ Note that because mod has special form, i.e. consists of
|
||||
@ 0xffffffff, 1 and 0s, we can conditionally synthesize it by
|
||||
@ using value of borrow as a whole or extracting single bit.
|
||||
@ Follow $ff register...
|
||||
|
||||
adds $a0,$a0,$ff @ add synthesized modulus
|
||||
adcs $a1,$a1,$ff
|
||||
str $a0,[$r_ptr,#0]
|
||||
adcs $a2,$a2,$ff
|
||||
str $a1,[$r_ptr,#4]
|
||||
adcs $a3,$a3,#0
|
||||
str $a2,[$r_ptr,#8]
|
||||
adcs $a4,$a4,#0
|
||||
str $a3,[$r_ptr,#12]
|
||||
adcs $a5,$a5,#0
|
||||
str $a4,[$r_ptr,#16]
|
||||
sbcs $a6,$a6,$ff,lsr#31
|
||||
adcs $a6,$a6,$ff,lsr#31
|
||||
str $a5,[$r_ptr,#20]
|
||||
sbcs $a7,$a7,$ff
|
||||
adcs $a7,$a7,$ff
|
||||
str $a6,[$r_ptr,#24]
|
||||
str $a7,[$r_ptr,#28]
|
||||
|
||||
|
|
|
@ -583,14 +583,14 @@ __ecp_nistz256_add:
|
|||
adds $t0,$acc0,#1 // subs $t0,$a0,#-1 // tmp = ret-modulus
|
||||
sbcs $t1,$acc1,$poly1
|
||||
sbcs $t2,$acc2,xzr
|
||||
sbc $t3,$acc3,$poly3
|
||||
cmp $ap,xzr // did addition carry?
|
||||
sbcs $t3,$acc3,$poly3
|
||||
sbcs xzr,$ap,xzr // did subtraction borrow?
|
||||
|
||||
csel $acc0,$acc0,$t0,eq // ret = carry ? ret-modulus : ret
|
||||
csel $acc1,$acc1,$t1,eq
|
||||
csel $acc2,$acc2,$t2,eq
|
||||
csel $acc0,$acc0,$t0,lo // ret = borrow ? ret : ret-modulus
|
||||
csel $acc1,$acc1,$t1,lo
|
||||
csel $acc2,$acc2,$t2,lo
|
||||
stp $acc0,$acc1,[$rp]
|
||||
csel $acc3,$acc3,$t3,eq
|
||||
csel $acc3,$acc3,$t3,lo
|
||||
stp $acc2,$acc3,[$rp,#16]
|
||||
|
||||
ret
|
||||
|
|
|
@ -406,33 +406,44 @@ __ecp_nistz256_add:
|
|||
addccc @acc[5],$t5,@acc[5]
|
||||
addccc @acc[6],$t6,@acc[6]
|
||||
addccc @acc[7],$t7,@acc[7]
|
||||
subc %g0,%g0,$carry ! broadcast carry bit
|
||||
addc %g0,%g0,$carry
|
||||
|
||||
.Lreduce_by_sub:
|
||||
|
||||
! if a+b carries, subtract modulus.
|
||||
! if a+b >= modulus, subtract modulus.
|
||||
!
|
||||
! But since comparison implies subtraction, we subtract
|
||||
! modulus and then add it back if subtraction borrowed.
|
||||
|
||||
subcc @acc[0],-1,@acc[0]
|
||||
subccc @acc[1],-1,@acc[1]
|
||||
subccc @acc[2],-1,@acc[2]
|
||||
subccc @acc[3], 0,@acc[3]
|
||||
subccc @acc[4], 0,@acc[4]
|
||||
subccc @acc[5], 0,@acc[5]
|
||||
subccc @acc[6], 1,@acc[6]
|
||||
subccc @acc[7],-1,@acc[7]
|
||||
subc $carry,0,$carry
|
||||
|
||||
! Note that because mod has special form, i.e. consists of
|
||||
! 0xffffffff, 1 and 0s, we can conditionally synthesize it by
|
||||
! using value of broadcasted borrow and the borrow bit itself.
|
||||
! To minimize dependency chain we first broadcast and then
|
||||
! extract the bit by negating (follow $bi).
|
||||
! using value of borrow and its negative.
|
||||
|
||||
subcc @acc[0],$carry,@acc[0] ! subtract synthesized modulus
|
||||
subccc @acc[1],$carry,@acc[1]
|
||||
addcc @acc[0],$carry,@acc[0] ! add synthesized modulus
|
||||
addccc @acc[1],$carry,@acc[1]
|
||||
neg $carry,$bi
|
||||
st @acc[0],[$rp]
|
||||
subccc @acc[2],$carry,@acc[2]
|
||||
addccc @acc[2],$carry,@acc[2]
|
||||
st @acc[1],[$rp+4]
|
||||
subccc @acc[3],0,@acc[3]
|
||||
addccc @acc[3],0,@acc[3]
|
||||
st @acc[2],[$rp+8]
|
||||
subccc @acc[4],0,@acc[4]
|
||||
addccc @acc[4],0,@acc[4]
|
||||
st @acc[3],[$rp+12]
|
||||
subccc @acc[5],0,@acc[5]
|
||||
addccc @acc[5],0,@acc[5]
|
||||
st @acc[4],[$rp+16]
|
||||
subccc @acc[6],$bi,@acc[6]
|
||||
addccc @acc[6],$bi,@acc[6]
|
||||
st @acc[5],[$rp+20]
|
||||
subc @acc[7],$carry,@acc[7]
|
||||
addc @acc[7],$carry,@acc[7]
|
||||
st @acc[6],[$rp+24]
|
||||
retl
|
||||
st @acc[7],[$rp+28]
|
||||
|
@ -469,7 +480,7 @@ __ecp_nistz256_mul_by_2:
|
|||
addccc @acc[6],@acc[6],@acc[6]
|
||||
addccc @acc[7],@acc[7],@acc[7]
|
||||
b .Lreduce_by_sub
|
||||
subc %g0,%g0,$carry ! broadcast carry bit
|
||||
addc %g0,%g0,$carry
|
||||
.type __ecp_nistz256_mul_by_2,#function
|
||||
.size __ecp_nistz256_mul_by_2,.-__ecp_nistz256_mul_by_2
|
||||
|
||||
|
@ -502,17 +513,27 @@ __ecp_nistz256_mul_by_3:
|
|||
addccc @acc[5],@acc[5],$t5
|
||||
addccc @acc[6],@acc[6],$t6
|
||||
addccc @acc[7],@acc[7],$t7
|
||||
subc %g0,%g0,$carry ! broadcast carry bit
|
||||
addc %g0,%g0,$carry
|
||||
|
||||
subcc $t0,$carry,$t0 ! .Lreduce_by_sub but without stores
|
||||
subcc $t0,-1,$t0 ! .Lreduce_by_sub but without stores
|
||||
subccc $t1,-1,$t1
|
||||
subccc $t2,-1,$t2
|
||||
subccc $t3, 0,$t3
|
||||
subccc $t4, 0,$t4
|
||||
subccc $t5, 0,$t5
|
||||
subccc $t6, 1,$t6
|
||||
subccc $t7,-1,$t7
|
||||
subc $carry,0,$carry
|
||||
|
||||
addcc $t0,$carry,$t0 ! add synthesized modulus
|
||||
addccc $t1,$carry,$t1
|
||||
neg $carry,$bi
|
||||
subccc $t1,$carry,$t1
|
||||
subccc $t2,$carry,$t2
|
||||
subccc $t3,0,$t3
|
||||
subccc $t4,0,$t4
|
||||
subccc $t5,0,$t5
|
||||
subccc $t6,$bi,$t6
|
||||
subc $t7,$carry,$t7
|
||||
addccc $t2,$carry,$t2
|
||||
addccc $t3,0,$t3
|
||||
addccc $t4,0,$t4
|
||||
addccc $t5,0,$t5
|
||||
addccc $t6,$bi,$t6
|
||||
addc $t7,$carry,$t7
|
||||
|
||||
addcc $t0,@acc[0],@acc[0] ! 2*a+a=3*a
|
||||
addccc $t1,@acc[1],@acc[1]
|
||||
|
@ -523,7 +544,7 @@ __ecp_nistz256_mul_by_3:
|
|||
addccc $t6,@acc[6],@acc[6]
|
||||
addccc $t7,@acc[7],@acc[7]
|
||||
b .Lreduce_by_sub
|
||||
subc %g0,%g0,$carry ! broadcast carry bit
|
||||
addc %g0,%g0,$carry
|
||||
.type __ecp_nistz256_mul_by_3,#function
|
||||
.size __ecp_nistz256_mul_by_3,.-__ecp_nistz256_mul_by_3
|
||||
|
||||
|
@ -1662,14 +1683,15 @@ __ecp_nistz256_add_noload_vis3:
|
|||
addcc $acc0,1,$t0 ! add -modulus, i.e. subtract
|
||||
addxccc $acc1,$poly1,$t1
|
||||
addxccc $acc2,$minus1,$t2
|
||||
addxc $acc3,$poly3,$t3
|
||||
addxccc $acc3,$poly3,$t3
|
||||
addxc $acc4,$minus1,$acc4
|
||||
|
||||
movrnz $acc4,$t0,$acc0 ! if a+b carried, ret = ret-mod
|
||||
movrnz $acc4,$t1,$acc1
|
||||
movrz $acc4,$t0,$acc0 ! ret = borrow ? ret : ret-modulus
|
||||
movrz $acc4,$t1,$acc1
|
||||
stx $acc0,[$rp]
|
||||
movrnz $acc4,$t2,$acc2
|
||||
movrz $acc4,$t2,$acc2
|
||||
stx $acc1,[$rp+8]
|
||||
movrnz $acc4,$t3,$acc3
|
||||
movrz $acc4,$t3,$acc3
|
||||
stx $acc2,[$rp+16]
|
||||
retl
|
||||
stx $acc3,[$rp+24]
|
||||
|
|
|
@ -284,18 +284,41 @@ for(1..37) {
|
|||
&mov (&DWP(16,"edi"),"eax");
|
||||
&adc ("ecx",&DWP(24,"ebp"));
|
||||
&mov (&DWP(20,"edi"),"ebx");
|
||||
&mov ("esi",0);
|
||||
&adc ("edx",&DWP(28,"ebp"));
|
||||
&mov (&DWP(24,"edi"),"ecx");
|
||||
&sbb ("esi","esi"); # broadcast carry bit
|
||||
&adc ("esi",0);
|
||||
&mov (&DWP(28,"edi"),"edx");
|
||||
|
||||
# if a+b carries, subtract modulus.
|
||||
# if a+b >= modulus, subtract modulus.
|
||||
#
|
||||
# But since comparison implies subtraction, we subtract modulus
|
||||
# to see if it borrows, and then subtract it for real if
|
||||
# subtraction didn't borrow.
|
||||
|
||||
&mov ("eax",&DWP(0,"edi"));
|
||||
&mov ("ebx",&DWP(4,"edi"));
|
||||
&mov ("ecx",&DWP(8,"edi"));
|
||||
&sub ("eax",-1);
|
||||
&mov ("edx",&DWP(12,"edi"));
|
||||
&sbb ("ebx",-1);
|
||||
&mov ("eax",&DWP(16,"edi"));
|
||||
&sbb ("ecx",-1);
|
||||
&mov ("ebx",&DWP(20,"edi"));
|
||||
&sbb ("edx",0);
|
||||
&mov ("ecx",&DWP(24,"edi"));
|
||||
&sbb ("eax",0);
|
||||
&mov ("edx",&DWP(28,"edi"));
|
||||
&sbb ("ebx",0);
|
||||
&sbb ("ecx",1);
|
||||
&sbb ("edx",-1);
|
||||
&sbb ("esi",0);
|
||||
|
||||
# Note that because mod has special form, i.e. consists of
|
||||
# 0xffffffff, 1 and 0s, we can conditionally synthesize it by
|
||||
# assigning carry bit to one register, %ebp, and its negative
|
||||
# to another, %esi. But we started by calculating %esi...
|
||||
# by using borrow.
|
||||
|
||||
&not ("esi");
|
||||
&mov ("eax",&DWP(0,"edi"));
|
||||
&mov ("ebp","esi");
|
||||
&mov ("ebx",&DWP(4,"edi"));
|
||||
|
|
Loading…
Reference in a new issue