Eliminate conditional final subtraction in Montgomery assembler modules.
This commit is contained in:
parent
55525742f4
commit
7d9cf7c0bb
10 changed files with 273 additions and 272 deletions
|
@ -258,56 +258,48 @@ bn_mul_mont:
|
|||
stq $hi1,16($tp)
|
||||
bne $tj,.Louter
|
||||
|
||||
s8addq $num,sp,$ap
|
||||
mov $rp,$bp
|
||||
s8addq $num,sp,$tj # &tp[num]
|
||||
mov $rp,$bp # put rp aside
|
||||
mov sp,$tp
|
||||
mov 0,$hi0
|
||||
|
||||
bne $hi1,.Lsub
|
||||
cmpult $nj,$lo1,AT
|
||||
bne AT,.Lsub
|
||||
|
||||
.align 4
|
||||
.Lcopy: ldq AT,($tp)
|
||||
lda $tp,8($tp)
|
||||
stq AT,($rp)
|
||||
cmpult $tp,$ap,AT
|
||||
stq zero,-8($tp)
|
||||
nop
|
||||
lda $rp,8($rp)
|
||||
bne AT,.Lcopy
|
||||
mov 1,v0
|
||||
br .Lexit
|
||||
mov sp,$ap
|
||||
srl $nj,62,AT # boundary condition...
|
||||
beq AT,.Lcopy # ... is met
|
||||
mov 0,$hi0 # clear borrow bit
|
||||
|
||||
.align 4
|
||||
.Lsub: ldq $lo0,($tp)
|
||||
ldq $lo1,($np)
|
||||
subq $lo0,$lo1,$lo1
|
||||
lda $tp,8($tp)
|
||||
lda $np,8($np)
|
||||
subq $lo0,$lo1,$lo1 # tp[i]-np[i]
|
||||
cmpult $lo0,$lo1,AT
|
||||
subq $lo1,$hi0,$lo0
|
||||
cmpult $lo1,$lo0,$hi0
|
||||
lda $tp,8($tp)
|
||||
or $hi0,AT,$hi0
|
||||
lda $np,8($np)
|
||||
stq $lo0,($rp)
|
||||
cmpult $tp,$ap,v0
|
||||
cmpult $tp,$tj,v0
|
||||
lda $rp,8($rp)
|
||||
bne v0,.Lsub
|
||||
|
||||
subq $hi1,$hi0,$hi0
|
||||
subq $hi1,$hi0,$hi0 # handle upmost overflow bit
|
||||
mov sp,$tp
|
||||
cmpule $hi1,$hi0,AT
|
||||
mov $bp,$rp
|
||||
bne AT,.Lcopy
|
||||
mov $bp,$rp # restore rp
|
||||
|
||||
and sp,$hi0,$ap
|
||||
bic $bp,$hi0,$bp
|
||||
bis $bp,$ap,$ap # ap=borrow?tp:rp
|
||||
|
||||
.align 4
|
||||
.Lzap: stq zero,($tp)
|
||||
cmpult $tp,$ap,AT
|
||||
.Lcopy: ldq $aj,($ap) # copy or in-place refresh
|
||||
lda $tp,8($tp)
|
||||
bne AT,.Lzap
|
||||
lda $rp,8($rp)
|
||||
lda $ap,8($ap)
|
||||
stq zero,-8($tp) # zap tp
|
||||
cmpult $tp,$tj,AT
|
||||
stq $aj,-8($rp)
|
||||
bne AT,.Lcopy
|
||||
mov 1,v0
|
||||
|
||||
.align 4
|
||||
.Lexit:
|
||||
.set noreorder
|
||||
mov fp,sp
|
||||
|
|
|
@ -61,7 +61,7 @@ bn_mul_mont:
|
|||
cmp $num,#2
|
||||
movlt r0,#0
|
||||
addlt sp,sp,#2*4
|
||||
blt .Labort
|
||||
blt .Labrt
|
||||
|
||||
stmdb sp!,{r4-r12,lr} @ save 10 registers
|
||||
|
||||
|
@ -160,27 +160,13 @@ bn_mul_mont:
|
|||
add $num,$num,#4 @ $num to point at &tp[num]
|
||||
sub $aj,$num,sp @ "original" num value
|
||||
mov $tp,sp @ "rewind" $tp
|
||||
mov $ap,$tp @ "borrow" $ap
|
||||
sub $np,$np,$aj @ "rewind" $np to &np[0]
|
||||
|
||||
cmp $nhi,#0 @ upmost carry
|
||||
bne .Lsub
|
||||
cmp $nlo,$nj @ tp[num-1]-np[num-1]
|
||||
bhs .Lsub
|
||||
|
||||
.Lcopy: ldr $tj,[$tp]
|
||||
str sp,[$tp],#4 @ zap tp
|
||||
str $tj,[$rp],#4
|
||||
cmp $tp,$num
|
||||
bne .Lcopy
|
||||
|
||||
.Lexit: add sp,$num,#4 @ skip over tp[num+1]
|
||||
ldmia sp!,{r4-r12,lr} @ restore registers
|
||||
add sp,sp,#2*4 @ skip over {r0,r2}
|
||||
mov r0,#1
|
||||
.Labort:tst lr,#1
|
||||
moveq pc,lr @ be binary compatible with V4, yet
|
||||
bx lr @ interoperable with Thumb ISA:-)
|
||||
movs $tj,$nj,lsr#30 @ boundary condition...
|
||||
beq .Lcopy @ ... is met
|
||||
|
||||
subs $tj,$tj,$tj @ "clear" carry flag
|
||||
.Lsub: ldr $tj,[$tp],#4
|
||||
ldr $nj,[$np],#4
|
||||
sbcs $tj,$tj,$nj @ tp[j]-np[j]
|
||||
|
@ -190,12 +176,24 @@ bn_mul_mont:
|
|||
sbcs $nhi,$nhi,#0 @ upmost carry
|
||||
mov $tp,sp @ "rewind" $tp
|
||||
sub $rp,$rp,$aj @ "rewind" $rp
|
||||
blo .Lcopy @ tp was less after all
|
||||
|
||||
.Lzap: str sp,[$tp],#4
|
||||
and $ap,$tp,$nhi
|
||||
bic $np,$rp,$nhi
|
||||
orr $ap,$ap,$np @ ap=borrow?tp:rp
|
||||
|
||||
.Lcopy: ldr $tj,[$ap],#4 @ copy or in-place refresh
|
||||
str sp,[$tp],#4 @ zap tp
|
||||
str $tj,[$rp],#4
|
||||
cmp $tp,$num
|
||||
bne .Lzap
|
||||
bal .Lexit
|
||||
bne .Lcopy
|
||||
|
||||
add sp,$num,#4 @ skip over tp[num+1]
|
||||
ldmia sp!,{r4-r12,lr} @ restore registers
|
||||
add sp,sp,#2*4 @ skip over {r0,r2}
|
||||
mov r0,#1
|
||||
.Labrt: tst lr,#1
|
||||
moveq pc,lr @ be binary compatible with V4, yet
|
||||
bx lr @ interoperable with Thumb ISA:-)
|
||||
.size bn_mul_mont,.-bn_mul_mont
|
||||
.asciz "Montgomery multiplication for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
|
||||
___
|
||||
|
|
|
@ -265,27 +265,50 @@ bn_mul_mont:
|
|||
addu $i,8
|
||||
sltu s7,$i,$num
|
||||
bnez s7,.Louter
|
||||
|
||||
|
||||
.set noreorder
|
||||
PTR_ADD $ap,sp,$num
|
||||
PTR_ADD $tj,sp,$num # &tp[num]
|
||||
move $tp,sp
|
||||
move $ap,sp
|
||||
|
||||
bnez $hi1,.Lsub
|
||||
li $hi0,0
|
||||
sgeu AT,$lo1,$nj
|
||||
beqz AT,.Lsub
|
||||
nop
|
||||
dsrl AT,$nj,62 # boundary condition...
|
||||
beqz AT,.Lcopy # ... is met
|
||||
li $hi0,0 # clear borrow bit
|
||||
|
||||
.align 4
|
||||
.Lcopy: ld AT,($tp)
|
||||
.Lsub: ld $lo0,($tp)
|
||||
ld $lo1,($np)
|
||||
PTR_ADD $tp,8
|
||||
PTR_ADD $np,8
|
||||
dsubu $lo1,$lo0,$lo1 # tp[i]-np[i]
|
||||
sgtu AT,$lo1,$lo0
|
||||
dsubu $lo0,$lo1,$hi0
|
||||
sgtu $hi0,$lo0,$lo1
|
||||
sd $lo0,($rp)
|
||||
or $hi0,AT
|
||||
sltu AT,$tp,$tj
|
||||
bnez AT,.Lsub
|
||||
PTR_ADD $rp,8
|
||||
|
||||
dsubu $hi0,$hi1,$hi0 # handle upmost overflow bit
|
||||
move $tp,sp
|
||||
PTR_SUB $rp,$num # restore rp
|
||||
not $hi1,$hi0
|
||||
|
||||
and $ap,$hi0,sp
|
||||
and $bp,$hi1,$rp
|
||||
or $ap,$ap,$bp # ap=borrow?tp:rp
|
||||
|
||||
.align 4
|
||||
.Lcopy: ld $aj,($ap)
|
||||
PTR_ADD $ap,8
|
||||
PTR_ADD $tp,8
|
||||
sd AT,($rp)
|
||||
sltu AT,$tp,$ap
|
||||
sd zero,-8($tp)
|
||||
sltu AT,$tp,$tj
|
||||
sd $aj,($rp)
|
||||
bnez AT,.Lcopy
|
||||
PTR_ADD $rp,8
|
||||
|
||||
.Lexit:
|
||||
ld s0,0($fp)
|
||||
ld s1,8($fp)
|
||||
ld s2,16($fp)
|
||||
|
@ -297,34 +320,6 @@ bn_mul_mont:
|
|||
li v0,1
|
||||
jr ra
|
||||
PTR_ADD sp,$fp,64
|
||||
|
||||
.align 4
|
||||
.Lsub: ld $lo0,($tp)
|
||||
ld $lo1,($np)
|
||||
dsubu $lo1,$lo0,$lo1
|
||||
sgtu AT,$lo1,$lo0
|
||||
dsubu $lo0,$lo1,$hi0
|
||||
sgtu $hi0,$lo0,$lo1
|
||||
PTR_ADD $tp,8
|
||||
or $hi0,AT
|
||||
PTR_ADD $np,8
|
||||
sd $lo0,($rp)
|
||||
sltu AT,$tp,$ap
|
||||
bnez AT,.Lsub
|
||||
PTR_ADD $rp,8
|
||||
|
||||
dsubu $hi0,$hi1,$hi0
|
||||
move $tp,sp
|
||||
sgtu AT,$hi0,$hi1
|
||||
bnez AT,.Lcopy
|
||||
PTR_SUB $rp,$num
|
||||
.align 4
|
||||
.Lzap: sd zero,($tp)
|
||||
sltu AT,$tp,$ap
|
||||
bnez AT,.Lzap
|
||||
PTR_ADD $tp,8
|
||||
b .Lexit
|
||||
nop
|
||||
.set reorder
|
||||
END(bn_mul_mont)
|
||||
.rdata
|
||||
|
|
|
@ -2,8 +2,9 @@
|
|||
|
||||
# ====================================================================
|
||||
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
|
||||
# project. Rights for redistribution and usage in source and binary
|
||||
# forms are granted according to the OpenSSL license.
|
||||
# project. The module is, however, dual licensed under OpenSSL and
|
||||
# CRYPTOGAMS licenses depending on where you obtain it. For further
|
||||
# details see http://www.openssl.org/~appro/cryptogams/.
|
||||
# ====================================================================
|
||||
|
||||
# April 2006
|
||||
|
@ -42,6 +43,7 @@ if ($output =~ /32\-mont\.s/) {
|
|||
$UMULL= "mullw"; # unsigned multiply low
|
||||
$UMULH= "mulhwu"; # unsigned multiply high
|
||||
$UCMP= "cmplw"; # unsigned compare
|
||||
$SHRI= "srwi"; # unsigned shift right by immediate
|
||||
$PUSH= $ST;
|
||||
$POP= $LD;
|
||||
} elsif ($output =~ /64\-mont\.s/) {
|
||||
|
@ -62,6 +64,7 @@ if ($output =~ /32\-mont\.s/) {
|
|||
$UMULL= "mulld"; # unsigned multiply low
|
||||
$UMULH= "mulhdu"; # unsigned multiply high
|
||||
$UCMP= "cmpld"; # unsigned compare
|
||||
$SHRI= "srdi"; # unsigned shift right by immediate
|
||||
$PUSH= $ST;
|
||||
$POP= $LD;
|
||||
} else { die "nonsense $output"; }
|
||||
|
@ -264,24 +267,37 @@ Linner:
|
|||
addi $i,$i,$BNSZ
|
||||
ble- Louter
|
||||
|
||||
$SHRI. $nj,$nj,$BITS-2 ; check boundary condition
|
||||
addi $num,$num,2 ; restore $num
|
||||
subfc $j,$j,$j ; j=0 and "clear" XER[CA]
|
||||
addi $tp,$sp,$FRAME
|
||||
addi $ap,$sp,$FRAME
|
||||
mtctr $num
|
||||
li $j,0
|
||||
beq Lcopy ; boundary condition is met
|
||||
|
||||
subfc. $ovf,$j,$ovf ; sets XER[CA]
|
||||
bne Lsub
|
||||
$UCMP $hi1,$nj
|
||||
bge Lsub
|
||||
.align 4
|
||||
Lcopy:
|
||||
$LDX $tj,$tp,$j
|
||||
Lsub: $LDX $tj,$tp,$j
|
||||
$LDX $nj,$np,$j
|
||||
subfe $aj,$nj,$tj ; tp[j]-np[j]
|
||||
$STX $aj,$rp,$j
|
||||
addi $j,$j,$BNSZ
|
||||
bdnz- Lsub
|
||||
|
||||
li $j,0
|
||||
mtctr $num
|
||||
subfe $ovf,$j,$ovf ; handle upmost overflow bit
|
||||
and $ap,$tp,$ovf
|
||||
andc $np,$rp,$ovf
|
||||
or $ap,$ap,$np ; ap=borrow?tp:rp
|
||||
|
||||
.align 4
|
||||
Lcopy: ; copy or in-place refresh
|
||||
$LDX $tj,$ap,$j
|
||||
$STX $tj,$rp,$j
|
||||
$STX $j,$tp,$j ; zap at once
|
||||
addi $j,$j,$BNSZ
|
||||
bdnz- Lcopy
|
||||
|
||||
Lexit:
|
||||
$POP r14,`4*$SIZE_T`($sp)
|
||||
$POP r15,`5*$SIZE_T`($sp)
|
||||
$POP r16,`6*$SIZE_T`($sp)
|
||||
|
@ -298,22 +314,7 @@ Lexit:
|
|||
li r3,1
|
||||
blr
|
||||
.long 0
|
||||
.align 4
|
||||
Lsub: $LDX $tj,$tp,$j
|
||||
$LDX $nj,$np,$j
|
||||
subfe $tj,$nj,$tj ; tp[j]-np[j]
|
||||
$STX $tj,$rp,$j
|
||||
addi $j,$j,$BNSZ
|
||||
bdnz- Lsub
|
||||
li $j,0
|
||||
subfe. $ovf,$j,$ovf
|
||||
mtctr $num
|
||||
bne Lcopy
|
||||
.align 4
|
||||
Lzap: $STX $j,$tp,$j
|
||||
addi $j,$j,$BNSZ
|
||||
bdnz- Lzap
|
||||
b Lexit
|
||||
.asciz "Montgomery Multiplication for PPC, CRYPTOGAMS by <appro\@fy.chalmers.se>"
|
||||
___
|
||||
|
||||
$code =~ s/\`([^\`]*)\`/eval $1/gem;
|
||||
|
|
|
@ -176,45 +176,45 @@ bn_mul_mont:
|
|||
___
|
||||
|
||||
undef $bi;
|
||||
$count=$ap; undef $ap;
|
||||
$count=$bp; undef $bp;
|
||||
|
||||
$code.=<<___;
|
||||
lg $rp,16+16($fp) # reincarnate rp
|
||||
la $ap,8($fp)
|
||||
lgr $j,$num
|
||||
ltgr $AHI,$AHI
|
||||
jnz .Lsub # upmost overflow bit is not zero
|
||||
#slg $NHI,-8($np) # tp[num-1]-np[num-1]
|
||||
|
||||
#lg $nhi,-8($np) # buggy assembler
|
||||
lghi $count,-8 # buggy assembler
|
||||
slg $NHI,0($count,$np) # buggy assembler
|
||||
jnle .Lsub # branch if not borrow
|
||||
lg $nhi,0($count,$np) # buggy assembler
|
||||
srag $nhi,$nhi,62 # boundary condition...
|
||||
jz .Lcopy # ... is met
|
||||
|
||||
.Lcopy: lg $alo,8($j,$fp)
|
||||
stg $j,8($j,$fp)
|
||||
stg $alo,0($j,$rp)
|
||||
aghi $j,8
|
||||
jnz .Lcopy
|
||||
.Lexit:
|
||||
lmg %r6,%r15,16+48($fp)
|
||||
lghi %r2,1 # signal "processed"
|
||||
br %r14
|
||||
|
||||
.Lsub: lcgr $count,$num
|
||||
lcgr $count,$num
|
||||
sra $count,3 # incidentally clears "borrow"
|
||||
.Lsubloop:
|
||||
lg $alo,8($j,$fp)
|
||||
.Lsub: lg $alo,0($j,$ap)
|
||||
slbg $alo,0($j,$np)
|
||||
stg $alo,0($j,$rp)
|
||||
la $j,8($j)
|
||||
brct $count,.Lsubloop
|
||||
brct $count,.Lsub
|
||||
lghi $ahi,0
|
||||
slbgr $AHI,$ahi
|
||||
lgr $j,$num
|
||||
jle .Lcopy # branch if borrow
|
||||
slbgr $AHI,$ahi # handle upmost carry
|
||||
|
||||
.Lzap: stg $j,8($j,$fp)
|
||||
ngr $ap,$AHI
|
||||
lghi $np,-1
|
||||
xgr $np,$AHI
|
||||
ngr $np,$rp
|
||||
ogr $ap,$np # ap=borrow?tp:rp
|
||||
lgr $j,$num
|
||||
|
||||
.Lcopy: lg $alo,0($j,$ap) # copy or in-place refresh
|
||||
stg $j,8($j,$fp) # zap tp
|
||||
stg $alo,0($j,$rp)
|
||||
aghi $j,8
|
||||
jnz .Lzap
|
||||
j .Lexit
|
||||
jnz .Lcopy
|
||||
|
||||
lmg %r6,%r15,16+48($fp)
|
||||
lghi %r2,1 # signal "processed"
|
||||
br %r14
|
||||
.size bn_mul_mont,.-bn_mul_mont
|
||||
.string "Montgomery Multiplication for s390x, CRYPTOGAMS by <appro\@openssl.org>"
|
||||
___
|
||||
|
|
|
@ -2,8 +2,9 @@
|
|||
|
||||
# ====================================================================
|
||||
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
|
||||
# project. Rights for redistribution and usage in source and binary
|
||||
# forms are granted according to the OpenSSL license.
|
||||
# project. The module is, however, dual licensed under OpenSSL and
|
||||
# CRYPTOGAMS licenses depending on where you obtain it. For further
|
||||
# details see http://www.openssl.org/~appro/cryptogams/.
|
||||
# ====================================================================
|
||||
|
||||
# December 2005
|
||||
|
@ -254,44 +255,36 @@ $fname:
|
|||
.Ltail:
|
||||
add $np,$num,$np
|
||||
add $rp,$num,$rp
|
||||
|
||||
cmp $car2,0 ! clears %icc.c
|
||||
bne,pn %icc,.Lsub
|
||||
mov $tp,$ap
|
||||
sub %g0,$num,%o7 ! k=-num
|
||||
|
||||
cmp $car1,$npj ! compare top-most $tp and $np words
|
||||
bcs,pt %icc,.Lcopy ! %icc.c is clean if not taken
|
||||
nop
|
||||
srl $npj,30,%o0 ! boundary condition...
|
||||
brz,pn %o0,.Lcopy ! ... is met
|
||||
subcc %g0,%g0,%g0 ! clear %icc.c
|
||||
|
||||
.align 16,0x1000000
|
||||
.Lsub:
|
||||
ld [$tp+%o7],%o0
|
||||
ld [$np+%o7],%o1
|
||||
subccc %o0,%o1,%o1
|
||||
subccc %o0,%o1,%o1 ! tp[j]-np[j]
|
||||
st %o1,[$rp+%o7]
|
||||
add %o7,4,%o7
|
||||
brnz %o7,.Lsub
|
||||
nop
|
||||
subccc $car2,0,$car2
|
||||
bcc %icc,.Lzap
|
||||
subc $car2,0,$car2 ! handle upmost overflow bit
|
||||
and $tp,$car2,$ap
|
||||
andn $rp,$car2,$np
|
||||
or $ap,$np,$ap
|
||||
sub %g0,$num,%o7
|
||||
|
||||
.align 16,0x1000000
|
||||
.Lcopy:
|
||||
ld [$tp+%o7],%o0
|
||||
ld [$ap+%o7],%o0 ! copy or in-place refresh
|
||||
st %g0,[$tp+%o7] ! zap tp
|
||||
st %o0,[$rp+%o7]
|
||||
add %o7,4,%o7
|
||||
brnz %o7,.Lcopy
|
||||
nop
|
||||
ba .Lzap
|
||||
sub %g0,$num,%o7
|
||||
|
||||
.align 32
|
||||
.Lzap:
|
||||
st %g0,[$tp+%o7]
|
||||
add %o7,4,%o7
|
||||
brnz %o7,.Lzap
|
||||
nop
|
||||
mov 1,%i0
|
||||
ret
|
||||
restore
|
||||
|
@ -609,6 +602,7 @@ $code.=<<___;
|
|||
add $tp,8,$tp
|
||||
.type $fname,#function
|
||||
.size $fname,(.-$fname)
|
||||
.asciz "Montgomery Multipltication for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
|
||||
___
|
||||
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
|
||||
print $code;
|
||||
|
|
|
@ -121,7 +121,6 @@ $nhia="%f56"; $nhib="%f58"; $nhic="%f60"; $nhid="%f62";
|
|||
$ASI_FL16_P=0xD2; # magic ASI value to engage 16-bit FP load
|
||||
|
||||
$code=<<___;
|
||||
.ident "UltraSPARC Montgomery multiply by <appro\@fy.chalmers.se>"
|
||||
.section ".text",#alloc,#execinstr
|
||||
|
||||
.global $fname
|
||||
|
@ -799,17 +798,14 @@ $fname:
|
|||
bnz %icc,.Louter
|
||||
nop
|
||||
|
||||
sub %g0,$num,%o7 ! n=-num
|
||||
cmp $carry,0 ! clears %icc.c
|
||||
bne,pn %icc,.Lsub
|
||||
add $tp,8,$tp ! adjust tp to point at the end
|
||||
|
||||
ld [$tp-8],%o0
|
||||
ld [$np-4],%o1
|
||||
cmp %o0,%o1 ! compare topmost words
|
||||
bcs,pt %icc,.Lcopy ! %icc.c is clean if not taken
|
||||
nop
|
||||
|
||||
subcc %g0,%g0,%g0 ! clear %icc.c
|
||||
add $tp,8,$tp ! adjust tp to point at the end
|
||||
srl %o1,30,%o1 ! boundary condition...
|
||||
orn %g0,%g0,%g4
|
||||
brz,pn %o1,.Lcopy ! ... is met
|
||||
sub %g0,$num,%o7 ! n=-num
|
||||
|
||||
.align 32,0x1000000
|
||||
.Lsub:
|
||||
ldx [$tp+%o7],%o0
|
||||
|
@ -824,24 +820,30 @@ $fname:
|
|||
add %o7,8,%o7
|
||||
brnz,pt %o7,.Lsub
|
||||
st %o3,[%g1+4]
|
||||
subccc $carry,0,$carry
|
||||
bcc,pt %icc,.Lzap
|
||||
subc $carry,0,%g4
|
||||
sub %g0,$num,%o7 ! n=-num
|
||||
|
||||
.align 16,0x1000000
|
||||
.align 32,0x1000000
|
||||
.Lcopy:
|
||||
ldx [$tp+%o7],%o0
|
||||
srlx %o0,32,%o1
|
||||
add $rp,%o7,%g1
|
||||
ld [%g1+0],%o2
|
||||
ld [%g1+4],%o3
|
||||
stx %g0,[$tp+%o7]
|
||||
and %o0,%g4,%o0
|
||||
srlx %o0,32,%o1
|
||||
andn %o2,%g4,%o2
|
||||
andn %o3,%g4,%o3
|
||||
or %o2,%o0,%o0
|
||||
or %o3,%o1,%o1
|
||||
st %o0,[%g1+0]
|
||||
add %o7,8,%o7
|
||||
brnz,pt %o7,.Lcopy
|
||||
st %o1,[%g1+4]
|
||||
sub %g0,$num,%o7 ! n=-num
|
||||
|
||||
.align 32
|
||||
.align 32,0x1000000
|
||||
.Lzap:
|
||||
stx %g0,[$tp+%o7]
|
||||
stx %g0,[$ap_l+%o7]
|
||||
stx %g0,[$ap_h+%o7]
|
||||
stx %g0,[$np_l+%o7]
|
||||
|
|
|
@ -77,7 +77,8 @@
|
|||
# - in terms of absolute performance it delivers approximately as much
|
||||
# as modern out-of-order 32-bit cores [again, for longer keys].
|
||||
|
||||
push(@INC,".","../../perlasm");
|
||||
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
|
||||
push(@INC,"${dir}","${dir}../../perlasm");
|
||||
require "x86asm.pl";
|
||||
|
||||
&asm_init($ARGV[0],"via-mont.pl");
|
||||
|
@ -100,7 +101,7 @@ $sp=&DWP(28,"esp");
|
|||
# &DWP(64+(4*$num+$pad)*0,"esp") # padded tp[num]
|
||||
# &DWP(64+(4*$num+$pad)*1,"esp") # padded copy of ap[num]
|
||||
# &DWP(64+(4*$num+$pad)*2,"esp") # padded copy of bp[num]
|
||||
# &DWP(64+(4*$num+$pad)*2,"esp") # padded copy of np[num]
|
||||
# &DWP(64+(4*$num+$pad)*3,"esp") # padded copy of np[num]
|
||||
# Note that SDK suggests to unconditionally allocate 2K per vector. This
|
||||
# has quite an impact on performance. It naturally depends on key length,
|
||||
# but to give an example 1024 bit private RSA key operations suffer >30%
|
||||
|
@ -115,7 +116,7 @@ $sp=&DWP(28,"esp");
|
|||
&jnz (&label("leave")); # num % 4 != 0
|
||||
&cmp ("ecx",8);
|
||||
&jb (&label("leave")); # num < 8
|
||||
&cmp ("ecx",256);
|
||||
&cmp ("ecx",1024);
|
||||
&ja (&label("leave")); # num > 1024
|
||||
|
||||
&pushf ();
|
||||
|
@ -148,74 +149,91 @@ $sp=&DWP(28,"esp");
|
|||
&lea ("ebp",&DWP(-$pad,"ecx"));
|
||||
&shr ("ebp",2); # restore original num value in ebp
|
||||
|
||||
&add ("ecx",32/4); # (4 vectors + 32 byte scratch)/4
|
||||
&xor ("eax","eax");
|
||||
|
||||
&mov ("ecx","ebp");
|
||||
&lea ("ecx",&DWP((32+$pad)/4,"ecx"));# padded tp + scratch
|
||||
&data_byte(0xf3,0xab); # rep stosl, bzero
|
||||
|
||||
&mov ("ecx","ebp");
|
||||
&lea ("edi",&DWP(64+$pad,"esp","ecx",4));# pointer to ap copy
|
||||
&mov ($A,"edi");
|
||||
&data_byte(0xf3,0xa5); # rep movsl, memcpy
|
||||
&mov ("ecx",$pad/4);
|
||||
&data_byte(0xf3,0xab); # rep stosl, bzero pad
|
||||
# edi points at the end of padded ap copy...
|
||||
|
||||
# edi points at the end of ap copy...
|
||||
&mov ("ecx","ebp");
|
||||
&add ("edi",$pad); # skip padding to point at bp copy
|
||||
&mov ("esi","ebx");
|
||||
&mov ($B,"edi");
|
||||
&data_byte(0xf3,0xa5); # rep movsl, memcpy
|
||||
&mov ("ecx",$pad/4);
|
||||
&data_byte(0xf3,0xab); # rep stosl, bzero pad
|
||||
# edi points at the end of padded bp copy...
|
||||
|
||||
# edi points at the end of bp copy...
|
||||
&mov ("ecx","ebp");
|
||||
&add ("edi",$pad); # skip padding to point at np copy
|
||||
&mov ("esi","edx");
|
||||
&mov ($M,"edi");
|
||||
&data_byte(0xf3,0xa5); # rep movsl, memcpy
|
||||
&mov ("ecx",$pad/4);
|
||||
&data_byte(0xf3,0xab); # rep stosl, bzero pad
|
||||
# edi points at the end of padded np copy...
|
||||
|
||||
# let magic happen...
|
||||
&mov ("ecx","ebp");
|
||||
&mov ("esi","esp");
|
||||
&xor ("eax","eax");
|
||||
&shl ("ecx",5); # convert word counter to bit counter
|
||||
&align (4);
|
||||
&data_byte(0xf3,0x0f,0xa6,0xc0);# rep montmul
|
||||
|
||||
&mov ("ecx","ebp");
|
||||
&xor ("edx","edx"); # i=0
|
||||
&lea ("esi",&DWP(64,"esp")); # tp
|
||||
# edi still points at the end of np copy...
|
||||
&xor ("edx","edx"); # i=0
|
||||
&lea ("esi",&DWP(64,"esp")); # tp
|
||||
# edi still points at the end of padded np copy...
|
||||
&mov ("eax",&DWP(-4-$pad,"edi")); # np[num-1]
|
||||
&neg ("ebp");
|
||||
&lea ("ebp",&DWP(0,"edi","ebp",4)); # so just "rewind"
|
||||
&mov ("edi",$rp); # restore rp
|
||||
&lea ("ebp",&DWP(-$pad,"edi","ebp",4)); # so just "rewind"
|
||||
&mov ("edi",$rp); # restore rp
|
||||
|
||||
&mov ("ebx",&DWP(0,"esi","ecx",4)); # upmost overflow bit
|
||||
&cmp ("ebx",0); # clears CF unconditionally
|
||||
&jnz (&label("sub"));
|
||||
&mov ("eax",&DWP(-4,"esi","ecx",4));
|
||||
&cmp ("eax",&DWP(-4,"ebp","ecx",4)); # tp[num-1]-np[num-1]?
|
||||
&jae (&label("sub")); # if taken CF is cleared
|
||||
&shr ("eax",30); # boundary condition...
|
||||
&jz (&label("copy")); # ... is met
|
||||
&xor ("edx","edx"); # clear CF
|
||||
|
||||
&set_label("copy",4);
|
||||
&mov ("ebx","ecx");
|
||||
&data_byte(0xf3,0xa5); # rep movsl
|
||||
&mov ("ecx","ebx");
|
||||
&jmp (&label("zap"));
|
||||
|
||||
&set_label("sub",16);
|
||||
&set_label("sub",8);
|
||||
&mov ("eax",&DWP(0,"esi","edx",4));
|
||||
&sbb ("eax",&DWP(0,"ebp","edx",4));
|
||||
&mov (&DWP(0,"edi","edx",4),"eax"); # rp[i]=tp[i]-np[i]
|
||||
&lea ("edx",&DWP(1,"edx")); # i++
|
||||
&dec ("ecx"); # doesn't affect CF!
|
||||
&jg (&label("sub"));
|
||||
&sbb ("ebx",0); # upmost overflow is still there
|
||||
&mov ("ecx","edx");
|
||||
&jc (&label("copy"));
|
||||
&loop (&label("sub")); # doesn't affect CF!
|
||||
|
||||
&mov ("eax",&DWP(0,"esi","edx",4)); # upmost overflow bit
|
||||
&sbb ("eax",0);
|
||||
&and ("esi","eax");
|
||||
&not ("eax");
|
||||
&mov ("ebp","edi");
|
||||
&and ("ebp","eax");
|
||||
&or ("esi","ebp"); # tp=carry?tp:rp
|
||||
|
||||
&mov ("ecx","edx"); # num
|
||||
&xor ("edx","edx"); # i=0
|
||||
|
||||
&set_label("copy",8);
|
||||
&mov ("eax",&DWP(0,"esi","edx",4));
|
||||
&mov (&DWP(64,"esp","edx",4),"ecx"); # zap tp
|
||||
&mov (&DWP(0,"edi","edx",4),"eax");
|
||||
&lea ("edx",&DWP(1,"edx")); # i++
|
||||
&loop (&label("copy"));
|
||||
|
||||
&set_label("zap",4);
|
||||
&mov ("ebp",$sp);
|
||||
&xor ("eax","eax");
|
||||
&lea ("ecx",&DWP(64/4+$pad,"","ecx",4));# size of frame divided by 4
|
||||
&mov ("edi","esp");
|
||||
|
||||
&mov ("ecx",64/4);
|
||||
&mov ("edi","esp"); # zap frame including scratch area
|
||||
&data_byte(0xf3,0xab); # rep stosl, bzero
|
||||
|
||||
# zap copies of ap, bp and np
|
||||
&lea ("edi",&DWP(64+$pad,"esp","edx",4));# pointer to ap
|
||||
&lea ("ecx",&DWP(3*$pad/4,"edx","edx",2));
|
||||
&data_byte(0xf3,0xab); # rep stosl, bzero
|
||||
|
||||
&mov ("esp","ebp");
|
||||
|
@ -224,4 +242,6 @@ $sp=&DWP(28,"esp");
|
|||
&set_label("leave");
|
||||
&function_end($func);
|
||||
|
||||
&asciz("Padlock Montgomery Multiplication, CRYPTOGAMS by <appro\@openssl.org>");
|
||||
|
||||
&asm_finish();
|
||||
|
|
|
@ -41,7 +41,7 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
|
|||
|
||||
$i="edx";
|
||||
$j="ecx";
|
||||
$ap="esi";
|
||||
$ap="esi"; $tp="esi"; # overlapping variables!!!
|
||||
$rp="edi"; $bp="edi"; # overlapping variables!!!
|
||||
$np="ebp";
|
||||
$num="ebx";
|
||||
|
@ -551,41 +551,39 @@ $sbit=$num;
|
|||
}
|
||||
|
||||
&set_label("common_tail",16);
|
||||
&mov ($np,$_np);
|
||||
&mov ("esi",&DWP($frame+4,"esp",$num,4));# load upmost overflow bit
|
||||
&mov ($np,$_np); # load modulus pointer
|
||||
&mov ($rp,$_rp); # load result pointer
|
||||
# [$ap and $bp are zapped]
|
||||
&xor ($i,$i); # i=0
|
||||
&lea ($tp,&DWP($frame,"esp")); # [$ap and $bp are zapped]
|
||||
&mov ("eax",&DWP(0,$np,$num,4)); # np[num-1]
|
||||
&shr ("eax",30); # check for boundary condition
|
||||
&jz (&label("copy"));
|
||||
|
||||
&mov ("eax",&DWP(0,$tp)); # tp[0]
|
||||
&mov ($j,$num); # j=num-1
|
||||
&cmp ("esi",0); # clears CF unconditionally
|
||||
&jnz (&label("sub"));
|
||||
&mov ("eax",&DWP($frame,"esp",$j,4));
|
||||
&cmp ("eax",&DWP(0,$np,$j,4)); # tp[num-1]-np[num-1]?
|
||||
&jae (&label("sub")); # if taken CF is cleared
|
||||
&set_label("copy",16);
|
||||
&mov ("eax",&DWP($frame,"esp",$j,4));
|
||||
&mov (&DWP(0,$rp,$j,4),"eax"); # rp[i]=tp[i]
|
||||
&mov (&DWP($frame,"esp",$j,4),$j); # zap temporary vector
|
||||
&dec ($j);
|
||||
&jge (&label("copy"));
|
||||
&jmp (&label("exit"));
|
||||
&xor ($i,$i); # i=0 and clear CF!
|
||||
|
||||
&set_label("sub",16);
|
||||
&mov ("eax",&DWP($frame,"esp",$i,4));
|
||||
&sbb ("eax",&DWP(0,$np,$i,4));
|
||||
&mov (&DWP(0,$rp,$i,4),"eax"); # rp[i]=tp[i]-np[i]
|
||||
&lea ($i,&DWP(1,$i)); # i++
|
||||
&dec ($j); # doesn't affect CF!
|
||||
&mov ("eax",&DWP(4,$tp,$i,4)); # tp[i+1]
|
||||
&lea ($i,&DWP(1,$i)); # i++
|
||||
&jge (&label("sub"));
|
||||
&mov ($j,$num); # j=num-1
|
||||
&sbb ("esi",0); # esi holds upmost overflow bit
|
||||
&jc (&label("copy"));
|
||||
&set_label("zap",8);
|
||||
&mov (&DWP($frame,"esp",$j,4),$i); # zap temporary vector
|
||||
&dec ($j);
|
||||
&jge (&label("zap"));
|
||||
|
||||
&set_label("exit",8);
|
||||
&sbb ("eax",0); # handle upmost overflow bit
|
||||
&and ($tp,"eax");
|
||||
&not ("eax");
|
||||
&mov ($np,$rp);
|
||||
&and ($np,"eax");
|
||||
&or ($tp,$np); # tp=carry?tp:rp
|
||||
|
||||
&set_label("copy",16); # copy or in-place refresh
|
||||
&mov ("eax",&DWP(0,$tp,$num,4));
|
||||
&mov (&DWP(0,$rp,$num,4),"eax"); # rp[i]=tp[i]
|
||||
&mov (&DWP($frame,"esp",$num,4),$j); # zap temporary vector
|
||||
&dec ($num);
|
||||
&jge (&label("copy"));
|
||||
|
||||
&mov ("esp",$_sp); # pull saved stack pointer
|
||||
&mov ("eax",1);
|
||||
&set_label("just_leave");
|
||||
|
|
|
@ -59,6 +59,7 @@ bn_mul_mont:
|
|||
neg %rax
|
||||
lea (%rsp,%rax,8),%rsp # tp=alloca(8*(num+2))
|
||||
and \$-1024,%rsp # minimize TLB usage
|
||||
|
||||
mov %rbp,8(%rsp,$num,8) # tp[num+1]=%rsp
|
||||
mov %rdx,$bp # $bp reassigned, remember?
|
||||
|
||||
|
@ -166,22 +167,38 @@ bn_mul_mont:
|
|||
cmp $num,$i
|
||||
jl .Louter
|
||||
|
||||
xor $i,$i # i=0
|
||||
mov -8($np,$num,8),%rax # np[num-1]
|
||||
lea (%rsp),$ap # borrow ap for tp
|
||||
shr \$62,%rax # check for boundary condition
|
||||
jz .Lcopy
|
||||
|
||||
mov ($ap),%rax # tp[0]
|
||||
lea -1($num),$j # j=num-1
|
||||
cmp \$0,%rdx # %rdx still holds upmost overflow bit
|
||||
jnz .Lsub # CF is cleared by compare with 0
|
||||
mov (%rsp,$j,8),%rax
|
||||
cmp ($np,$j,8),%rax # tp[num-1]-np[num-1]
|
||||
jae .Lsub # if taken CF was cleared by above cmp
|
||||
.align 4
|
||||
.Lcopy:
|
||||
mov (%rsp,$j,8),%rax
|
||||
xor $i,$i # i=0 and clear CF!
|
||||
jmp .Lsub
|
||||
.align 16
|
||||
.Lsub: sbb ($np,$i,8),%rax
|
||||
mov %rax,($rp,$i,8) # rp[i]=tp[i]-np[i]
|
||||
dec $j # doesn't affect CF!
|
||||
mov 8($ap,$i,8),%rax # tp[i+1]
|
||||
lea 1($i),$i # i++
|
||||
jge .Lsub
|
||||
|
||||
sbb \$0,%rax # handle upmost overflow bit
|
||||
and %rax,$ap
|
||||
not %rax
|
||||
mov $rp,$np
|
||||
and %rax,$np
|
||||
lea -1($num),$j
|
||||
or $np,$ap # ap=borrow?tp:rp
|
||||
.align 16
|
||||
.Lcopy: # copy or in-place refresh
|
||||
mov ($ap,$j,8),%rax
|
||||
mov %rax,($rp,$j,8) # rp[i]=tp[i]
|
||||
mov $i,(%rsp,$j,8) # zap temporary vector
|
||||
dec $j
|
||||
jge .Lcopy
|
||||
.align 4
|
||||
.Lexit:
|
||||
|
||||
mov 8(%rsp,$num,8),%rsp # restore %rsp
|
||||
mov \$1,%rax
|
||||
pop %r15
|
||||
|
@ -191,22 +208,6 @@ bn_mul_mont:
|
|||
pop %rbp
|
||||
pop %rbx
|
||||
ret
|
||||
|
||||
.align 16
|
||||
.Lsub: mov (%rsp,$i,8),%rax
|
||||
sbb ($np,$i,8),%rax
|
||||
mov %rax,($rp,$i,8) # rp[i]=tp[i]-np[i]
|
||||
lea 1($i),$i # i++
|
||||
dec $j # doesn't affect CF!
|
||||
jge .Lsub
|
||||
lea -1($num),$j # j=num-1
|
||||
sbb \$0,%rdx
|
||||
jc .Lcopy # tp was less than np
|
||||
.align 4
|
||||
.Lzap: mov $i,(%rsp,$j,8) # zap temporary vector
|
||||
dec $j
|
||||
jge .Lzap
|
||||
jmp .Lexit
|
||||
.size bn_mul_mont,.-bn_mul_mont
|
||||
.asciz "Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
|
||||
___
|
||||
|
|
Loading…
Reference in a new issue