Eliminate conditional final subtraction in Montgomery assembler modules.

This commit is contained in:
Andy Polyakov 2007-06-17 17:10:03 +00:00
parent 55525742f4
commit 7d9cf7c0bb
10 changed files with 273 additions and 272 deletions

View file

@ -258,56 +258,48 @@ bn_mul_mont:
stq $hi1,16($tp)
bne $tj,.Louter
s8addq $num,sp,$ap
mov $rp,$bp
s8addq $num,sp,$tj # &tp[num]
mov $rp,$bp # put rp aside
mov sp,$tp
mov 0,$hi0
bne $hi1,.Lsub
cmpult $nj,$lo1,AT
bne AT,.Lsub
.align 4
.Lcopy: ldq AT,($tp)
lda $tp,8($tp)
stq AT,($rp)
cmpult $tp,$ap,AT
stq zero,-8($tp)
nop
lda $rp,8($rp)
bne AT,.Lcopy
mov 1,v0
br .Lexit
mov sp,$ap
srl $nj,62,AT # boundary condition...
beq AT,.Lcopy # ... is met
mov 0,$hi0 # clear borrow bit
.align 4
.Lsub: ldq $lo0,($tp)
ldq $lo1,($np)
subq $lo0,$lo1,$lo1
lda $tp,8($tp)
lda $np,8($np)
subq $lo0,$lo1,$lo1 # tp[i]-np[i]
cmpult $lo0,$lo1,AT
subq $lo1,$hi0,$lo0
cmpult $lo1,$lo0,$hi0
lda $tp,8($tp)
or $hi0,AT,$hi0
lda $np,8($np)
stq $lo0,($rp)
cmpult $tp,$ap,v0
cmpult $tp,$tj,v0
lda $rp,8($rp)
bne v0,.Lsub
subq $hi1,$hi0,$hi0
subq $hi1,$hi0,$hi0 # handle upmost overflow bit
mov sp,$tp
cmpule $hi1,$hi0,AT
mov $bp,$rp
bne AT,.Lcopy
mov $bp,$rp # restore rp
and sp,$hi0,$ap
bic $bp,$hi0,$bp
bis $bp,$ap,$ap # ap=borrow?tp:rp
.align 4
.Lzap: stq zero,($tp)
cmpult $tp,$ap,AT
.Lcopy: ldq $aj,($ap) # copy or in-place refresh
lda $tp,8($tp)
bne AT,.Lzap
lda $rp,8($rp)
lda $ap,8($ap)
stq zero,-8($tp) # zap tp
cmpult $tp,$tj,AT
stq $aj,-8($rp)
bne AT,.Lcopy
mov 1,v0
.align 4
.Lexit:
.set noreorder
mov fp,sp

View file

@ -61,7 +61,7 @@ bn_mul_mont:
cmp $num,#2
movlt r0,#0
addlt sp,sp,#2*4
blt .Labort
blt .Labrt
stmdb sp!,{r4-r12,lr} @ save 10 registers
@ -160,27 +160,13 @@ bn_mul_mont:
add $num,$num,#4 @ $num to point at &tp[num]
sub $aj,$num,sp @ "original" num value
mov $tp,sp @ "rewind" $tp
mov $ap,$tp @ "borrow" $ap
sub $np,$np,$aj @ "rewind" $np to &np[0]
cmp $nhi,#0 @ upmost carry
bne .Lsub
cmp $nlo,$nj @ tp[num-1]-np[num-1]
bhs .Lsub
.Lcopy: ldr $tj,[$tp]
str sp,[$tp],#4 @ zap tp
str $tj,[$rp],#4
cmp $tp,$num
bne .Lcopy
.Lexit: add sp,$num,#4 @ skip over tp[num+1]
ldmia sp!,{r4-r12,lr} @ restore registers
add sp,sp,#2*4 @ skip over {r0,r2}
mov r0,#1
.Labort:tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
movs $tj,$nj,lsr#30 @ boundary condition...
beq .Lcopy @ ... is met
subs $tj,$tj,$tj @ "clear" carry flag
.Lsub: ldr $tj,[$tp],#4
ldr $nj,[$np],#4
sbcs $tj,$tj,$nj @ tp[j]-np[j]
@ -190,12 +176,24 @@ bn_mul_mont:
sbcs $nhi,$nhi,#0 @ upmost carry
mov $tp,sp @ "rewind" $tp
sub $rp,$rp,$aj @ "rewind" $rp
blo .Lcopy @ tp was less after all
.Lzap: str sp,[$tp],#4
and $ap,$tp,$nhi
bic $np,$rp,$nhi
orr $ap,$ap,$np @ ap=borrow?tp:rp
.Lcopy: ldr $tj,[$ap],#4 @ copy or in-place refresh
str sp,[$tp],#4 @ zap tp
str $tj,[$rp],#4
cmp $tp,$num
bne .Lzap
bal .Lexit
bne .Lcopy
add sp,$num,#4 @ skip over tp[num+1]
ldmia sp!,{r4-r12,lr} @ restore registers
add sp,sp,#2*4 @ skip over {r0,r2}
mov r0,#1
.Labrt: tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
.size bn_mul_mont,.-bn_mul_mont
.asciz "Montgomery multiplication for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
___

View file

@ -265,27 +265,50 @@ bn_mul_mont:
addu $i,8
sltu s7,$i,$num
bnez s7,.Louter
.set noreorder
PTR_ADD $ap,sp,$num
PTR_ADD $tj,sp,$num # &tp[num]
move $tp,sp
move $ap,sp
bnez $hi1,.Lsub
li $hi0,0
sgeu AT,$lo1,$nj
beqz AT,.Lsub
nop
dsrl AT,$nj,62 # boundary condition...
beqz AT,.Lcopy # ... is met
li $hi0,0 # clear borrow bit
.align 4
.Lcopy: ld AT,($tp)
.Lsub: ld $lo0,($tp)
ld $lo1,($np)
PTR_ADD $tp,8
PTR_ADD $np,8
dsubu $lo1,$lo0,$lo1 # tp[i]-np[i]
sgtu AT,$lo1,$lo0
dsubu $lo0,$lo1,$hi0
sgtu $hi0,$lo0,$lo1
sd $lo0,($rp)
or $hi0,AT
sltu AT,$tp,$tj
bnez AT,.Lsub
PTR_ADD $rp,8
dsubu $hi0,$hi1,$hi0 # handle upmost overflow bit
move $tp,sp
PTR_SUB $rp,$num # restore rp
not $hi1,$hi0
and $ap,$hi0,sp
and $bp,$hi1,$rp
or $ap,$ap,$bp # ap=borrow?tp:rp
.align 4
.Lcopy: ld $aj,($ap)
PTR_ADD $ap,8
PTR_ADD $tp,8
sd AT,($rp)
sltu AT,$tp,$ap
sd zero,-8($tp)
sltu AT,$tp,$tj
sd $aj,($rp)
bnez AT,.Lcopy
PTR_ADD $rp,8
.Lexit:
ld s0,0($fp)
ld s1,8($fp)
ld s2,16($fp)
@ -297,34 +320,6 @@ bn_mul_mont:
li v0,1
jr ra
PTR_ADD sp,$fp,64
.align 4
.Lsub: ld $lo0,($tp)
ld $lo1,($np)
dsubu $lo1,$lo0,$lo1
sgtu AT,$lo1,$lo0
dsubu $lo0,$lo1,$hi0
sgtu $hi0,$lo0,$lo1
PTR_ADD $tp,8
or $hi0,AT
PTR_ADD $np,8
sd $lo0,($rp)
sltu AT,$tp,$ap
bnez AT,.Lsub
PTR_ADD $rp,8
dsubu $hi0,$hi1,$hi0
move $tp,sp
sgtu AT,$hi0,$hi1
bnez AT,.Lcopy
PTR_SUB $rp,$num
.align 4
.Lzap: sd zero,($tp)
sltu AT,$tp,$ap
bnez AT,.Lzap
PTR_ADD $tp,8
b .Lexit
nop
.set reorder
END(bn_mul_mont)
.rdata

View file

@ -2,8 +2,9 @@
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. Rights for redistribution and usage in source and binary
# forms are granted according to the OpenSSL license.
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# April 2006
@ -42,6 +43,7 @@ if ($output =~ /32\-mont\.s/) {
$UMULL= "mullw"; # unsigned multiply low
$UMULH= "mulhwu"; # unsigned multiply high
$UCMP= "cmplw"; # unsigned compare
$SHRI= "srwi"; # unsigned shift right by immediate
$PUSH= $ST;
$POP= $LD;
} elsif ($output =~ /64\-mont\.s/) {
@ -62,6 +64,7 @@ if ($output =~ /32\-mont\.s/) {
$UMULL= "mulld"; # unsigned multiply low
$UMULH= "mulhdu"; # unsigned multiply high
$UCMP= "cmpld"; # unsigned compare
$SHRI= "srdi"; # unsigned shift right by immediate
$PUSH= $ST;
$POP= $LD;
} else { die "nonsense $output"; }
@ -264,24 +267,37 @@ Linner:
addi $i,$i,$BNSZ
ble- Louter
$SHRI. $nj,$nj,$BITS-2 ; check boundary condition
addi $num,$num,2 ; restore $num
subfc $j,$j,$j ; j=0 and "clear" XER[CA]
addi $tp,$sp,$FRAME
addi $ap,$sp,$FRAME
mtctr $num
li $j,0
beq Lcopy ; boundary condition is met
subfc. $ovf,$j,$ovf ; sets XER[CA]
bne Lsub
$UCMP $hi1,$nj
bge Lsub
.align 4
Lcopy:
$LDX $tj,$tp,$j
Lsub: $LDX $tj,$tp,$j
$LDX $nj,$np,$j
subfe $aj,$nj,$tj ; tp[j]-np[j]
$STX $aj,$rp,$j
addi $j,$j,$BNSZ
bdnz- Lsub
li $j,0
mtctr $num
subfe $ovf,$j,$ovf ; handle upmost overflow bit
and $ap,$tp,$ovf
andc $np,$rp,$ovf
or $ap,$ap,$np ; ap=borrow?tp:rp
.align 4
Lcopy: ; copy or in-place refresh
$LDX $tj,$ap,$j
$STX $tj,$rp,$j
$STX $j,$tp,$j ; zap at once
addi $j,$j,$BNSZ
bdnz- Lcopy
Lexit:
$POP r14,`4*$SIZE_T`($sp)
$POP r15,`5*$SIZE_T`($sp)
$POP r16,`6*$SIZE_T`($sp)
@ -298,22 +314,7 @@ Lexit:
li r3,1
blr
.long 0
.align 4
Lsub: $LDX $tj,$tp,$j
$LDX $nj,$np,$j
subfe $tj,$nj,$tj ; tp[j]-np[j]
$STX $tj,$rp,$j
addi $j,$j,$BNSZ
bdnz- Lsub
li $j,0
subfe. $ovf,$j,$ovf
mtctr $num
bne Lcopy
.align 4
Lzap: $STX $j,$tp,$j
addi $j,$j,$BNSZ
bdnz- Lzap
b Lexit
.asciz "Montgomery Multiplication for PPC, CRYPTOGAMS by <appro\@fy.chalmers.se>"
___
$code =~ s/\`([^\`]*)\`/eval $1/gem;

View file

@ -176,45 +176,45 @@ bn_mul_mont:
___
undef $bi;
$count=$ap; undef $ap;
$count=$bp; undef $bp;
$code.=<<___;
lg $rp,16+16($fp) # reincarnate rp
la $ap,8($fp)
lgr $j,$num
ltgr $AHI,$AHI
jnz .Lsub # upmost overflow bit is not zero
#slg $NHI,-8($np) # tp[num-1]-np[num-1]
#lg $nhi,-8($np) # buggy assembler
lghi $count,-8 # buggy assembler
slg $NHI,0($count,$np) # buggy assembler
jnle .Lsub # branch if not borrow
lg $nhi,0($count,$np) # buggy assembler
srag $nhi,$nhi,62 # boundary condition...
jz .Lcopy # ... is met
.Lcopy: lg $alo,8($j,$fp)
stg $j,8($j,$fp)
stg $alo,0($j,$rp)
aghi $j,8
jnz .Lcopy
.Lexit:
lmg %r6,%r15,16+48($fp)
lghi %r2,1 # signal "processed"
br %r14
.Lsub: lcgr $count,$num
lcgr $count,$num
sra $count,3 # incidentally clears "borrow"
.Lsubloop:
lg $alo,8($j,$fp)
.Lsub: lg $alo,0($j,$ap)
slbg $alo,0($j,$np)
stg $alo,0($j,$rp)
la $j,8($j)
brct $count,.Lsubloop
brct $count,.Lsub
lghi $ahi,0
slbgr $AHI,$ahi
lgr $j,$num
jle .Lcopy # branch if borrow
slbgr $AHI,$ahi # handle upmost carry
.Lzap: stg $j,8($j,$fp)
ngr $ap,$AHI
lghi $np,-1
xgr $np,$AHI
ngr $np,$rp
ogr $ap,$np # ap=borrow?tp:rp
lgr $j,$num
.Lcopy: lg $alo,0($j,$ap) # copy or in-place refresh
stg $j,8($j,$fp) # zap tp
stg $alo,0($j,$rp)
aghi $j,8
jnz .Lzap
j .Lexit
jnz .Lcopy
lmg %r6,%r15,16+48($fp)
lghi %r2,1 # signal "processed"
br %r14
.size bn_mul_mont,.-bn_mul_mont
.string "Montgomery Multiplication for s390x, CRYPTOGAMS by <appro\@openssl.org>"
___

View file

@ -2,8 +2,9 @@
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. Rights for redistribution and usage in source and binary
# forms are granted according to the OpenSSL license.
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# December 2005
@ -254,44 +255,36 @@ $fname:
.Ltail:
add $np,$num,$np
add $rp,$num,$rp
cmp $car2,0 ! clears %icc.c
bne,pn %icc,.Lsub
mov $tp,$ap
sub %g0,$num,%o7 ! k=-num
cmp $car1,$npj ! compare top-most $tp and $np words
bcs,pt %icc,.Lcopy ! %icc.c is clean if not taken
nop
srl $npj,30,%o0 ! boundary condition...
brz,pn %o0,.Lcopy ! ... is met
subcc %g0,%g0,%g0 ! clear %icc.c
.align 16,0x1000000
.Lsub:
ld [$tp+%o7],%o0
ld [$np+%o7],%o1
subccc %o0,%o1,%o1
subccc %o0,%o1,%o1 ! tp[j]-np[j]
st %o1,[$rp+%o7]
add %o7,4,%o7
brnz %o7,.Lsub
nop
subccc $car2,0,$car2
bcc %icc,.Lzap
subc $car2,0,$car2 ! handle upmost overflow bit
and $tp,$car2,$ap
andn $rp,$car2,$np
or $ap,$np,$ap
sub %g0,$num,%o7
.align 16,0x1000000
.Lcopy:
ld [$tp+%o7],%o0
ld [$ap+%o7],%o0 ! copy or in-place refresh
st %g0,[$tp+%o7] ! zap tp
st %o0,[$rp+%o7]
add %o7,4,%o7
brnz %o7,.Lcopy
nop
ba .Lzap
sub %g0,$num,%o7
.align 32
.Lzap:
st %g0,[$tp+%o7]
add %o7,4,%o7
brnz %o7,.Lzap
nop
mov 1,%i0
ret
restore
@ -609,6 +602,7 @@ $code.=<<___;
add $tp,8,$tp
.type $fname,#function
.size $fname,(.-$fname)
.asciz "Montgomery Multipltication for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
___
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
print $code;

View file

@ -121,7 +121,6 @@ $nhia="%f56"; $nhib="%f58"; $nhic="%f60"; $nhid="%f62";
$ASI_FL16_P=0xD2; # magic ASI value to engage 16-bit FP load
$code=<<___;
.ident "UltraSPARC Montgomery multiply by <appro\@fy.chalmers.se>"
.section ".text",#alloc,#execinstr
.global $fname
@ -799,17 +798,14 @@ $fname:
bnz %icc,.Louter
nop
sub %g0,$num,%o7 ! n=-num
cmp $carry,0 ! clears %icc.c
bne,pn %icc,.Lsub
add $tp,8,$tp ! adjust tp to point at the end
ld [$tp-8],%o0
ld [$np-4],%o1
cmp %o0,%o1 ! compare topmost words
bcs,pt %icc,.Lcopy ! %icc.c is clean if not taken
nop
subcc %g0,%g0,%g0 ! clear %icc.c
add $tp,8,$tp ! adjust tp to point at the end
srl %o1,30,%o1 ! boundary condition...
orn %g0,%g0,%g4
brz,pn %o1,.Lcopy ! ... is met
sub %g0,$num,%o7 ! n=-num
.align 32,0x1000000
.Lsub:
ldx [$tp+%o7],%o0
@ -824,24 +820,30 @@ $fname:
add %o7,8,%o7
brnz,pt %o7,.Lsub
st %o3,[%g1+4]
subccc $carry,0,$carry
bcc,pt %icc,.Lzap
subc $carry,0,%g4
sub %g0,$num,%o7 ! n=-num
.align 16,0x1000000
.align 32,0x1000000
.Lcopy:
ldx [$tp+%o7],%o0
srlx %o0,32,%o1
add $rp,%o7,%g1
ld [%g1+0],%o2
ld [%g1+4],%o3
stx %g0,[$tp+%o7]
and %o0,%g4,%o0
srlx %o0,32,%o1
andn %o2,%g4,%o2
andn %o3,%g4,%o3
or %o2,%o0,%o0
or %o3,%o1,%o1
st %o0,[%g1+0]
add %o7,8,%o7
brnz,pt %o7,.Lcopy
st %o1,[%g1+4]
sub %g0,$num,%o7 ! n=-num
.align 32
.align 32,0x1000000
.Lzap:
stx %g0,[$tp+%o7]
stx %g0,[$ap_l+%o7]
stx %g0,[$ap_h+%o7]
stx %g0,[$np_l+%o7]

View file

@ -77,7 +77,8 @@
# - in terms of absolute performance it delivers approximately as much
# as modern out-of-order 32-bit cores [again, for longer keys].
push(@INC,".","../../perlasm");
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";
&asm_init($ARGV[0],"via-mont.pl");
@ -100,7 +101,7 @@ $sp=&DWP(28,"esp");
# &DWP(64+(4*$num+$pad)*0,"esp") # padded tp[num]
# &DWP(64+(4*$num+$pad)*1,"esp") # padded copy of ap[num]
# &DWP(64+(4*$num+$pad)*2,"esp") # padded copy of bp[num]
# &DWP(64+(4*$num+$pad)*2,"esp") # padded copy of np[num]
# &DWP(64+(4*$num+$pad)*3,"esp") # padded copy of np[num]
# Note that SDK suggests to unconditionally allocate 2K per vector. This
# has quite an impact on performance. It naturally depends on key length,
# but to give an example 1024 bit private RSA key operations suffer >30%
@ -115,7 +116,7 @@ $sp=&DWP(28,"esp");
&jnz (&label("leave")); # num % 4 != 0
&cmp ("ecx",8);
&jb (&label("leave")); # num < 8
&cmp ("ecx",256);
&cmp ("ecx",1024);
&ja (&label("leave")); # num > 1024
&pushf ();
@ -148,74 +149,91 @@ $sp=&DWP(28,"esp");
&lea ("ebp",&DWP(-$pad,"ecx"));
&shr ("ebp",2); # restore original num value in ebp
&add ("ecx",32/4); # (4 vectors + 32 byte scratch)/4
&xor ("eax","eax");
&mov ("ecx","ebp");
&lea ("ecx",&DWP((32+$pad)/4,"ecx"));# padded tp + scratch
&data_byte(0xf3,0xab); # rep stosl, bzero
&mov ("ecx","ebp");
&lea ("edi",&DWP(64+$pad,"esp","ecx",4));# pointer to ap copy
&mov ($A,"edi");
&data_byte(0xf3,0xa5); # rep movsl, memcpy
&mov ("ecx",$pad/4);
&data_byte(0xf3,0xab); # rep stosl, bzero pad
# edi points at the end of padded ap copy...
# edi points at the end of ap copy...
&mov ("ecx","ebp");
&add ("edi",$pad); # skip padding to point at bp copy
&mov ("esi","ebx");
&mov ($B,"edi");
&data_byte(0xf3,0xa5); # rep movsl, memcpy
&mov ("ecx",$pad/4);
&data_byte(0xf3,0xab); # rep stosl, bzero pad
# edi points at the end of padded bp copy...
# edi points at the end of bp copy...
&mov ("ecx","ebp");
&add ("edi",$pad); # skip padding to point at np copy
&mov ("esi","edx");
&mov ($M,"edi");
&data_byte(0xf3,0xa5); # rep movsl, memcpy
&mov ("ecx",$pad/4);
&data_byte(0xf3,0xab); # rep stosl, bzero pad
# edi points at the end of padded np copy...
# let magic happen...
&mov ("ecx","ebp");
&mov ("esi","esp");
&xor ("eax","eax");
&shl ("ecx",5); # convert word counter to bit counter
&align (4);
&data_byte(0xf3,0x0f,0xa6,0xc0);# rep montmul
&mov ("ecx","ebp");
&xor ("edx","edx"); # i=0
&lea ("esi",&DWP(64,"esp")); # tp
# edi still points at the end of np copy...
&xor ("edx","edx"); # i=0
&lea ("esi",&DWP(64,"esp")); # tp
# edi still points at the end of padded np copy...
&mov ("eax",&DWP(-4-$pad,"edi")); # np[num-1]
&neg ("ebp");
&lea ("ebp",&DWP(0,"edi","ebp",4)); # so just "rewind"
&mov ("edi",$rp); # restore rp
&lea ("ebp",&DWP(-$pad,"edi","ebp",4)); # so just "rewind"
&mov ("edi",$rp); # restore rp
&mov ("ebx",&DWP(0,"esi","ecx",4)); # upmost overflow bit
&cmp	("ebx",0);			# clears CF unconditionally
&jnz (&label("sub"));
&mov ("eax",&DWP(-4,"esi","ecx",4));
&cmp ("eax",&DWP(-4,"ebp","ecx",4)); # tp[num-1]-np[num-1]?
&jae (&label("sub")); # if taken CF is cleared
&shr ("eax",30); # boundary condition...
&jz (&label("copy")); # ... is met
&xor ("edx","edx"); # clear CF
&set_label("copy",4);
&mov ("ebx","ecx");
&data_byte(0xf3,0xa5); # rep movsl
&mov ("ecx","ebx");
&jmp (&label("zap"));
&set_label("sub",16);
&set_label("sub",8);
&mov ("eax",&DWP(0,"esi","edx",4));
&sbb ("eax",&DWP(0,"ebp","edx",4));
&mov (&DWP(0,"edi","edx",4),"eax"); # rp[i]=tp[i]-np[i]
&lea ("edx",&DWP(1,"edx")); # i++
&dec ("ecx"); # doesn't affect CF!
&jg (&label("sub"));
&sbb ("ebx",0); # upmost overflow is still there
&mov ("ecx","edx");
&jc (&label("copy"));
&loop (&label("sub")); # doesn't affect CF!
&mov ("eax",&DWP(0,"esi","edx",4)); # upmost overflow bit
&sbb ("eax",0);
&and ("esi","eax");
&not ("eax");
&mov ("ebp","edi");
&and ("ebp","eax");
&or ("esi","ebp"); # tp=carry?tp:rp
&mov ("ecx","edx"); # num
&xor ("edx","edx"); # i=0
&set_label("copy",8);
&mov ("eax",&DWP(0,"esi","edx",4));
&mov (&DWP(64,"esp","edx",4),"ecx"); # zap tp
&mov (&DWP(0,"edi","edx",4),"eax");
&lea ("edx",&DWP(1,"edx")); # i++
&loop (&label("copy"));
&set_label("zap",4);
&mov ("ebp",$sp);
&xor ("eax","eax");
&lea ("ecx",&DWP(64/4+$pad,"","ecx",4));# size of frame divided by 4
&mov ("edi","esp");
&mov ("ecx",64/4);
&mov ("edi","esp"); # zap frame including scratch area
&data_byte(0xf3,0xab); # rep stosl, bzero
# zap copies of ap, bp and np
&lea ("edi",&DWP(64+$pad,"esp","edx",4));# pointer to ap
&lea ("ecx",&DWP(3*$pad/4,"edx","edx",2));
&data_byte(0xf3,0xab); # rep stosl, bzero
&mov ("esp","ebp");
@ -224,4 +242,6 @@ $sp=&DWP(28,"esp");
&set_label("leave");
&function_end($func);
&asciz("Padlock Montgomery Multiplication, CRYPTOGAMS by <appro\@openssl.org>");
&asm_finish();

View file

@ -41,7 +41,7 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
$i="edx";
$j="ecx";
$ap="esi";
$ap="esi"; $tp="esi"; # overlapping variables!!!
$rp="edi"; $bp="edi"; # overlapping variables!!!
$np="ebp";
$num="ebx";
@ -551,41 +551,39 @@ $sbit=$num;
}
&set_label("common_tail",16);
&mov ($np,$_np);
&mov ("esi",&DWP($frame+4,"esp",$num,4));# load upmost overflow bit
&mov ($np,$_np); # load modulus pointer
&mov ($rp,$_rp); # load result pointer
# [$ap and $bp are zapped]
&xor ($i,$i); # i=0
&lea ($tp,&DWP($frame,"esp")); # [$ap and $bp are zapped]
&mov ("eax",&DWP(0,$np,$num,4)); # np[num-1]
&shr ("eax",30); # check for boundary condition
&jz (&label("copy"));
&mov ("eax",&DWP(0,$tp)); # tp[0]
&mov ($j,$num); # j=num-1
&cmp ("esi",0); # clears CF unconditionally
&jnz (&label("sub"));
&mov ("eax",&DWP($frame,"esp",$j,4));
&cmp ("eax",&DWP(0,$np,$j,4)); # tp[num-1]-np[num-1]?
&jae (&label("sub")); # if taken CF is cleared
&set_label("copy",16);
&mov ("eax",&DWP($frame,"esp",$j,4));
&mov (&DWP(0,$rp,$j,4),"eax"); # rp[i]=tp[i]
&mov (&DWP($frame,"esp",$j,4),$j); # zap temporary vector
&dec ($j);
&jge (&label("copy"));
&jmp (&label("exit"));
&xor ($i,$i); # i=0 and clear CF!
&set_label("sub",16);
&mov ("eax",&DWP($frame,"esp",$i,4));
&sbb ("eax",&DWP(0,$np,$i,4));
&mov (&DWP(0,$rp,$i,4),"eax"); # rp[i]=tp[i]-np[i]
&lea ($i,&DWP(1,$i)); # i++
&dec ($j); # doesn't affect CF!
&mov ("eax",&DWP(4,$tp,$i,4)); # tp[i+1]
&lea ($i,&DWP(1,$i)); # i++
&jge (&label("sub"));
&mov ($j,$num); # j=num-1
&sbb ("esi",0); # esi holds upmost overflow bit
&jc (&label("copy"));
&set_label("zap",8);
&mov (&DWP($frame,"esp",$j,4),$i); # zap temporary vector
&dec ($j);
&jge (&label("zap"));
&set_label("exit",8);
&sbb ("eax",0); # handle upmost overflow bit
&and ($tp,"eax");
&not ("eax");
&mov ($np,$rp);
&and ($np,"eax");
&or ($tp,$np); # tp=carry?tp:rp
&set_label("copy",16); # copy or in-place refresh
&mov ("eax",&DWP(0,$tp,$num,4));
&mov (&DWP(0,$rp,$num,4),"eax"); # rp[i]=tp[i]
&mov (&DWP($frame,"esp",$num,4),$j); # zap temporary vector
&dec ($num);
&jge (&label("copy"));
&mov ("esp",$_sp); # pull saved stack pointer
&mov ("eax",1);
&set_label("just_leave");

View file

@ -59,6 +59,7 @@ bn_mul_mont:
neg %rax
lea (%rsp,%rax,8),%rsp # tp=alloca(8*(num+2))
and \$-1024,%rsp # minimize TLB usage
mov %rbp,8(%rsp,$num,8) # tp[num+1]=%rsp
mov %rdx,$bp # $bp reassigned, remember?
@ -166,22 +167,38 @@ bn_mul_mont:
cmp $num,$i
jl .Louter
xor $i,$i # i=0
mov -8($np,$num,8),%rax # np[num-1]
lea (%rsp),$ap # borrow ap for tp
shr \$62,%rax # check for boundary condition
jz .Lcopy
mov ($ap),%rax # tp[0]
lea -1($num),$j # j=num-1
cmp \$0,%rdx # %rdx still holds upmost overflow bit
jnz .Lsub # CF is cleared by compare with 0
mov (%rsp,$j,8),%rax
cmp ($np,$j,8),%rax # tp[num-1]-np[num-1]
jae .Lsub # if taken CF was cleared by above cmp
.align 4
.Lcopy:
mov (%rsp,$j,8),%rax
xor $i,$i # i=0 and clear CF!
jmp .Lsub
.align 16
.Lsub: sbb ($np,$i,8),%rax
mov %rax,($rp,$i,8) # rp[i]=tp[i]-np[i]
dec $j # doesn't affect CF!
mov 8($ap,$i,8),%rax # tp[i+1]
lea 1($i),$i # i++
jge .Lsub
sbb \$0,%rax # handle upmost overflow bit
and %rax,$ap
not %rax
mov $rp,$np
and %rax,$np
lea -1($num),$j
or $np,$ap # ap=borrow?tp:rp
.align 16
.Lcopy: # copy or in-place refresh
mov ($ap,$j,8),%rax
mov %rax,($rp,$j,8) # rp[i]=tp[i]
mov $i,(%rsp,$j,8) # zap temporary vector
dec $j
jge .Lcopy
.align 4
.Lexit:
mov 8(%rsp,$num,8),%rsp # restore %rsp
mov \$1,%rax
pop %r15
@ -191,22 +208,6 @@ bn_mul_mont:
pop %rbp
pop %rbx
ret
.align 16
.Lsub: mov (%rsp,$i,8),%rax
sbb ($np,$i,8),%rax
mov %rax,($rp,$i,8) # rp[i]=tp[i]-np[j]
lea 1($i),$i # i++
dec $j # doesn't affect CF!
jge .Lsub
lea -1($num),$j # j=num-1
sbb \$0,%rdx
jc .Lcopy # tp was less than np
.align 4
.Lzap: mov $i,(%rsp,$j,8) # zap temporary vector
dec $j
jge .Lzap
jmp .Lexit
.size bn_mul_mont,.-bn_mul_mont
.asciz "Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
___