Eliminate conditional final subtraction in Montgomery assembler modules.

This commit is contained in:
Andy Polyakov 2007-06-17 17:10:03 +00:00
parent 55525742f4
commit 7d9cf7c0bb
10 changed files with 273 additions and 272 deletions

View file

@ -258,56 +258,48 @@ bn_mul_mont:
stq $hi1,16($tp)
bne $tj,.Louter
s8addq $num,sp,$ap
mov $rp,$bp
s8addq $num,sp,$tj # &tp[num]
mov $rp,$bp # put rp aside
mov sp,$tp
mov 0,$hi0
bne $hi1,.Lsub
cmpult $nj,$lo1,AT
bne AT,.Lsub
.align 4
.Lcopy: ldq AT,($tp)
lda $tp,8($tp)
stq AT,($rp)
cmpult $tp,$ap,AT
stq zero,-8($tp)
nop
lda $rp,8($rp)
bne AT,.Lcopy
mov 1,v0
br .Lexit
mov sp,$ap
srl $nj,62,AT # boundary condition...
beq AT,.Lcopy # ... is met
mov 0,$hi0 # clear borrow bit
.align 4
.Lsub: ldq $lo0,($tp)
ldq $lo1,($np)
subq $lo0,$lo1,$lo1
lda $tp,8($tp)
lda $np,8($np)
subq $lo0,$lo1,$lo1 # tp[i]-np[i]
cmpult $lo0,$lo1,AT
subq $lo1,$hi0,$lo0
cmpult $lo1,$lo0,$hi0
lda $tp,8($tp)
or $hi0,AT,$hi0
lda $np,8($np)
stq $lo0,($rp)
cmpult $tp,$ap,v0
cmpult $tp,$tj,v0
lda $rp,8($rp)
bne v0,.Lsub
subq $hi1,$hi0,$hi0
subq $hi1,$hi0,$hi0 # handle upmost overflow bit
mov sp,$tp
cmpule $hi1,$hi0,AT
mov $bp,$rp
bne AT,.Lcopy
mov $bp,$rp # restore rp
and sp,$hi0,$ap
bic $bp,$hi0,$bp
bis $bp,$ap,$ap # ap=borrow?tp:rp
.align 4
.Lzap: stq zero,($tp)
cmpult $tp,$ap,AT
.Lcopy: ldq $aj,($ap) # copy or in-place refresh
lda $tp,8($tp)
bne AT,.Lzap
lda $rp,8($rp)
lda $ap,8($ap)
stq zero,-8($tp) # zap tp
cmpult $tp,$tj,AT
stq $aj,-8($rp)
bne AT,.Lcopy
mov 1,v0
.align 4
.Lexit:
.set noreorder
mov fp,sp

View file

@ -61,7 +61,7 @@ bn_mul_mont:
cmp $num,#2
movlt r0,#0
addlt sp,sp,#2*4
blt .Labort
blt .Labrt
stmdb sp!,{r4-r12,lr} @ save 10 registers
@ -160,27 +160,13 @@ bn_mul_mont:
add $num,$num,#4 @ $num to point at &tp[num]
sub $aj,$num,sp @ "original" num value
mov $tp,sp @ "rewind" $tp
mov $ap,$tp @ "borrow" $ap
sub $np,$np,$aj @ "rewind" $np to &np[0]
cmp $nhi,#0 @ upmost carry
bne .Lsub
cmp $nlo,$nj @ tp[num-1]-np[num-1]
bhs .Lsub
.Lcopy: ldr $tj,[$tp]
str sp,[$tp],#4 @ zap tp
str $tj,[$rp],#4
cmp $tp,$num
bne .Lcopy
.Lexit: add sp,$num,#4 @ skip over tp[num+1]
ldmia sp!,{r4-r12,lr} @ restore registers
add sp,sp,#2*4 @ skip over {r0,r2}
mov r0,#1
.Labort:tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
movs $tj,$nj,lsr#30 @ boundary condition...
beq .Lcopy @ ... is met
subs $tj,$tj,$tj @ "clear" carry flag
.Lsub: ldr $tj,[$tp],#4
ldr $nj,[$np],#4
sbcs $tj,$tj,$nj @ tp[j]-np[j]
@ -190,12 +176,24 @@ bn_mul_mont:
sbcs $nhi,$nhi,#0 @ upmost carry
mov $tp,sp @ "rewind" $tp
sub $rp,$rp,$aj @ "rewind" $rp
blo .Lcopy @ tp was less after all
.Lzap: str sp,[$tp],#4
and $ap,$tp,$nhi
bic $np,$rp,$nhi
orr $ap,$ap,$np @ ap=borrow?tp:rp
.Lcopy: ldr $tj,[$ap],#4 @ copy or in-place refresh
str sp,[$tp],#4 @ zap tp
str $tj,[$rp],#4
cmp $tp,$num
bne .Lzap
bal .Lexit
bne .Lcopy
add sp,$num,#4 @ skip over tp[num+1]
ldmia sp!,{r4-r12,lr} @ restore registers
add sp,sp,#2*4 @ skip over {r0,r2}
mov r0,#1
.Labrt: tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
.size bn_mul_mont,.-bn_mul_mont
.asciz "Montgomery multiplication for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
___

View file

@ -265,27 +265,50 @@ bn_mul_mont:
addu $i,8
sltu s7,$i,$num
bnez s7,.Louter
.set noreorder
PTR_ADD $ap,sp,$num
PTR_ADD $tj,sp,$num # &tp[num]
move $tp,sp
move $ap,sp
bnez $hi1,.Lsub
li $hi0,0
sgeu AT,$lo1,$nj
beqz AT,.Lsub
nop
dsrl AT,$nj,62 # boundary condition...
beqz AT,.Lcopy # ... is met
li $hi0,0 # clear borrow bit
.align 4
.Lcopy: ld AT,($tp)
.Lsub: ld $lo0,($tp)
ld $lo1,($np)
PTR_ADD $tp,8
PTR_ADD $np,8
dsubu $lo1,$lo0,$lo1 # tp[i]-np[i]
sgtu AT,$lo1,$lo0
dsubu $lo0,$lo1,$hi0
sgtu $hi0,$lo0,$lo1
sd $lo0,($rp)
or $hi0,AT
sltu AT,$tp,$tj
bnez AT,.Lsub
PTR_ADD $rp,8
dsubu $hi0,$hi1,$hi0 # handle upmost overflow bit
move $tp,sp
PTR_SUB $rp,$num # restore rp
not $hi1,$hi0
and $ap,$hi0,sp
and $bp,$hi1,$rp
or $ap,$ap,$bp # ap=borrow?tp:rp
.align 4
.Lcopy: ld $aj,($ap)
PTR_ADD $ap,8
PTR_ADD $tp,8
sd AT,($rp)
sltu AT,$tp,$ap
sd zero,-8($tp)
sltu AT,$tp,$tj
sd $aj,($rp)
bnez AT,.Lcopy
PTR_ADD $rp,8
.Lexit:
ld s0,0($fp)
ld s1,8($fp)
ld s2,16($fp)
@ -297,34 +320,6 @@ bn_mul_mont:
li v0,1
jr ra
PTR_ADD sp,$fp,64
.align 4
.Lsub: ld $lo0,($tp)
ld $lo1,($np)
dsubu $lo1,$lo0,$lo1
sgtu AT,$lo1,$lo0
dsubu $lo0,$lo1,$hi0
sgtu $hi0,$lo0,$lo1
PTR_ADD $tp,8
or $hi0,AT
PTR_ADD $np,8
sd $lo0,($rp)
sltu AT,$tp,$ap
bnez AT,.Lsub
PTR_ADD $rp,8
dsubu $hi0,$hi1,$hi0
move $tp,sp
sgtu AT,$hi0,$hi1
bnez AT,.Lcopy
PTR_SUB $rp,$num
.align 4
.Lzap: sd zero,($tp)
sltu AT,$tp,$ap
bnez AT,.Lzap
PTR_ADD $tp,8
b .Lexit
nop
.set reorder
END(bn_mul_mont)
.rdata

View file

@ -2,8 +2,9 @@
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. Rights for redistribution and usage in source and binary
# forms are granted according to the OpenSSL license.
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# April 2006
@ -42,6 +43,7 @@ if ($output =~ /32\-mont\.s/) {
$UMULL= "mullw"; # unsigned multiply low
$UMULH= "mulhwu"; # unsigned multiply high
$UCMP= "cmplw"; # unsigned compare
$SHRI= "srwi"; # unsigned shift right by immediate
$PUSH= $ST;
$POP= $LD;
} elsif ($output =~ /64\-mont\.s/) {
@ -62,6 +64,7 @@ if ($output =~ /32\-mont\.s/) {
$UMULL= "mulld"; # unsigned multiply low
$UMULH= "mulhdu"; # unsigned multiply high
$UCMP= "cmpld"; # unsigned compare
$SHRI= "srdi"; # unsigned shift right by immediate
$PUSH= $ST;
$POP= $LD;
} else { die "nonsense $output"; }
@ -264,24 +267,37 @@ Linner:
addi $i,$i,$BNSZ
ble- Louter
$SHRI. $nj,$nj,$BITS-2 ; check boundary condition
addi $num,$num,2 ; restore $num
subfc $j,$j,$j ; j=0 and "clear" XER[CA]
addi $tp,$sp,$FRAME
addi $ap,$sp,$FRAME
mtctr $num
li $j,0
beq Lcopy ; boundary condition is met
subfc. $ovf,$j,$ovf ; sets XER[CA]
bne Lsub
$UCMP $hi1,$nj
bge Lsub
.align 4
Lcopy:
$LDX $tj,$tp,$j
Lsub: $LDX $tj,$tp,$j
$LDX $nj,$np,$j
subfe $aj,$nj,$tj ; tp[j]-np[j]
$STX $aj,$rp,$j
addi $j,$j,$BNSZ
bdnz- Lsub
li $j,0
mtctr $num
subfe $ovf,$j,$ovf ; handle upmost overflow bit
and $ap,$tp,$ovf
andc $np,$rp,$ovf
or $ap,$ap,$np ; ap=borrow?tp:rp
.align 4
Lcopy: ; copy or in-place refresh
$LDX $tj,$ap,$j
$STX $tj,$rp,$j
$STX $j,$tp,$j ; zap at once
addi $j,$j,$BNSZ
bdnz- Lcopy
Lexit:
$POP r14,`4*$SIZE_T`($sp)
$POP r15,`5*$SIZE_T`($sp)
$POP r16,`6*$SIZE_T`($sp)
@ -298,22 +314,7 @@ Lexit:
li r3,1
blr
.long 0
.align 4
Lsub: $LDX $tj,$tp,$j
$LDX $nj,$np,$j
subfe $tj,$nj,$tj ; tp[j]-np[j]
$STX $tj,$rp,$j
addi $j,$j,$BNSZ
bdnz- Lsub
li $j,0
subfe. $ovf,$j,$ovf
mtctr $num
bne Lcopy
.align 4
Lzap: $STX $j,$tp,$j
addi $j,$j,$BNSZ
bdnz- Lzap
b Lexit
.asciz "Montgomery Multiplication for PPC, CRYPTOGAMS by <appro\@fy.chalmers.se>"
___
$code =~ s/\`([^\`]*)\`/eval $1/gem;

View file

@ -176,45 +176,45 @@ bn_mul_mont:
___
undef $bi;
$count=$ap; undef $ap;
$count=$bp; undef $bp;
$code.=<<___;
lg $rp,16+16($fp) # reincarnate rp
la $ap,8($fp)
lgr $j,$num
ltgr $AHI,$AHI
jnz .Lsub # upmost overflow bit is not zero
#slg $NHI,-8($np) # tp[num-1]-np[num-1]
#lg $nhi,-8($np) # buggy assembler
lghi $count,-8 # buggy assembler
slg $NHI,0($count,$np) # buggy assembler
jnle .Lsub # branch if not borrow
lg $nhi,0($count,$np) # buggy assembler
srag $nhi,$nhi,62 # boundary condition...
jz .Lcopy # ... is met
.Lcopy: lg $alo,8($j,$fp)
stg $j,8($j,$fp)
stg $alo,0($j,$rp)
aghi $j,8
jnz .Lcopy
.Lexit:
lmg %r6,%r15,16+48($fp)
lghi %r2,1 # signal "processed"
br %r14
.Lsub: lcgr $count,$num
lcgr $count,$num
sra $count,3 # incidentally clears "borrow"
.Lsubloop:
lg $alo,8($j,$fp)
.Lsub: lg $alo,0($j,$ap)
slbg $alo,0($j,$np)
stg $alo,0($j,$rp)
la $j,8($j)
brct $count,.Lsubloop
brct $count,.Lsub
lghi $ahi,0
slbgr $AHI,$ahi
lgr $j,$num
jle .Lcopy # branch if borrow
slbgr $AHI,$ahi # handle upmost carry
.Lzap: stg $j,8($j,$fp)
ngr $ap,$AHI
lghi $np,-1
xgr $np,$AHI
ngr $np,$rp
ogr $ap,$np # ap=borrow?tp:rp
lgr $j,$num
.Lcopy: lg $alo,0($j,$ap) # copy or in-place refresh
stg $j,8($j,$fp) # zap tp
stg $alo,0($j,$rp)
aghi $j,8
jnz .Lzap
j .Lexit
jnz .Lcopy
lmg %r6,%r15,16+48($fp)
lghi %r2,1 # signal "processed"
br %r14
.size bn_mul_mont,.-bn_mul_mont
.string "Montgomery Multiplication for s390x, CRYPTOGAMS by <appro\@openssl.org>"
___

View file

@ -2,8 +2,9 @@
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. Rights for redistribution and usage in source and binary
# forms are granted according to the OpenSSL license.
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# December 2005
@ -254,44 +255,36 @@ $fname:
.Ltail:
add $np,$num,$np
add $rp,$num,$rp
cmp $car2,0 ! clears %icc.c
bne,pn %icc,.Lsub
mov $tp,$ap
sub %g0,$num,%o7 ! k=-num
cmp $car1,$npj ! compare top-most $tp and $np words
bcs,pt %icc,.Lcopy ! %icc.c is clean if not taken
nop
srl $npj,30,%o0 ! boundary condition...
brz,pn %o0,.Lcopy ! ... is met
subcc %g0,%g0,%g0 ! clear %icc.c
.align 16,0x1000000
.Lsub:
ld [$tp+%o7],%o0
ld [$np+%o7],%o1
subccc %o0,%o1,%o1
subccc %o0,%o1,%o1 ! tp[j]-np[j]
st %o1,[$rp+%o7]
add %o7,4,%o7
brnz %o7,.Lsub
nop
subccc $car2,0,$car2
bcc %icc,.Lzap
subc $car2,0,$car2 ! handle upmost overflow bit
and $tp,$car2,$ap
andn $rp,$car2,$np
or $ap,$np,$ap
sub %g0,$num,%o7
.align 16,0x1000000
.Lcopy:
ld [$tp+%o7],%o0
ld [$ap+%o7],%o0 ! copy or in-place refresh
st %g0,[$tp+%o7] ! zap tp
st %o0,[$rp+%o7]
add %o7,4,%o7
brnz %o7,.Lcopy
nop
ba .Lzap
sub %g0,$num,%o7
.align 32
.Lzap:
st %g0,[$tp+%o7]
add %o7,4,%o7
brnz %o7,.Lzap
nop
mov 1,%i0
ret
restore
@ -609,6 +602,7 @@ $code.=<<___;
add $tp,8,$tp
.type $fname,#function
.size $fname,(.-$fname)
.asciz "Montgomery Multipltication for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
___
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
print $code;

View file

@ -121,7 +121,6 @@ $nhia="%f56"; $nhib="%f58"; $nhic="%f60"; $nhid="%f62";
$ASI_FL16_P=0xD2; # magic ASI value to engage 16-bit FP load
$code=<<___;
.ident "UltraSPARC Montgomery multiply by <appro\@fy.chalmers.se>"
.section ".text",#alloc,#execinstr
.global $fname
@ -799,17 +798,14 @@ $fname:
bnz %icc,.Louter
nop
sub %g0,$num,%o7 ! n=-num
cmp $carry,0 ! clears %icc.c
bne,pn %icc,.Lsub
add $tp,8,$tp ! adjust tp to point at the end
ld [$tp-8],%o0
ld [$np-4],%o1
cmp %o0,%o1 ! compare topmost words
bcs,pt %icc,.Lcopy ! %icc.c is clean if not taken
nop
subcc %g0,%g0,%g0 ! clear %icc.c
add $tp,8,$tp ! adjust tp to point at the end
srl %o1,30,%o1 ! boundary condition...
orn %g0,%g0,%g4
brz,pn %o1,.Lcopy ! ... is met
sub %g0,$num,%o7 ! n=-num
.align 32,0x1000000
.Lsub:
ldx [$tp+%o7],%o0
@ -824,24 +820,30 @@ $fname:
add %o7,8,%o7
brnz,pt %o7,.Lsub
st %o3,[%g1+4]
subccc $carry,0,$carry
bcc,pt %icc,.Lzap
subc $carry,0,%g4
sub %g0,$num,%o7 ! n=-num
.align 16,0x1000000
.align 32,0x1000000
.Lcopy:
ldx [$tp+%o7],%o0
srlx %o0,32,%o1
add $rp,%o7,%g1
ld [%g1+0],%o2
ld [%g1+4],%o3
stx %g0,[$tp+%o7]
and %o0,%g4,%o0
srlx %o0,32,%o1
andn %o2,%g4,%o2
andn %o3,%g4,%o3
or %o2,%o0,%o0
or %o3,%o1,%o1
st %o0,[%g1+0]
add %o7,8,%o7
brnz,pt %o7,.Lcopy
st %o1,[%g1+4]
sub %g0,$num,%o7 ! n=-num
.align 32
.align 32,0x1000000
.Lzap:
stx %g0,[$tp+%o7]
stx %g0,[$ap_l+%o7]
stx %g0,[$ap_h+%o7]
stx %g0,[$np_l+%o7]

View file

@ -77,7 +77,8 @@
# - in terms of absolute performance it delivers approximately as much
# as modern out-of-order 32-bit cores [again, for longer keys].
push(@INC,".","../../perlasm");
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";
&asm_init($ARGV[0],"via-mont.pl");
@ -100,7 +101,7 @@ $sp=&DWP(28,"esp");
# &DWP(64+(4*$num+$pad)*0,"esp") # padded tp[num]
# &DWP(64+(4*$num+$pad)*1,"esp") # padded copy of ap[num]
# &DWP(64+(4*$num+$pad)*2,"esp") # padded copy of bp[num]
# &DWP(64+(4*$num+$pad)*2,"esp") # padded copy of np[num]
# &DWP(64+(4*$num+$pad)*3,"esp") # padded copy of np[num]
# Note that SDK suggests to unconditionally allocate 2K per vector. This
# has quite an impact on performance. It naturally depends on key length,
# but to give an example 1024 bit private RSA key operations suffer >30%
@ -115,7 +116,7 @@ $sp=&DWP(28,"esp");
&jnz (&label("leave")); # num % 4 != 0
&cmp ("ecx",8);
&jb (&label("leave")); # num < 8
&cmp ("ecx",256);
&cmp ("ecx",1024);
&ja (&label("leave")); # num > 1024
&pushf ();
@ -148,74 +149,91 @@ $sp=&DWP(28,"esp");
&lea ("ebp",&DWP(-$pad,"ecx"));
&shr ("ebp",2); # restore original num value in ebp
&add ("ecx",32/4); # (4 vectors + 32 byte scratch)/4
&xor ("eax","eax");
&mov ("ecx","ebp");
&lea ("ecx",&DWP((32+$pad)/4,"ecx"));# padded tp + scratch
&data_byte(0xf3,0xab); # rep stosl, bzero
&mov ("ecx","ebp");
&lea ("edi",&DWP(64+$pad,"esp","ecx",4));# pointer to ap copy
&mov ($A,"edi");
&data_byte(0xf3,0xa5); # rep movsl, memcpy
&mov ("ecx",$pad/4);
&data_byte(0xf3,0xab); # rep stosl, bzero pad
# edi points at the end of padded ap copy...
# edi points at the end of ap copy...
&mov ("ecx","ebp");
&add ("edi",$pad); # skip padding to point at bp copy
&mov ("esi","ebx");
&mov ($B,"edi");
&data_byte(0xf3,0xa5); # rep movsl, memcpy
&mov ("ecx",$pad/4);
&data_byte(0xf3,0xab); # rep stosl, bzero pad
# edi points at the end of padded bp copy...
# edi points at the end of bp copy...
&mov ("ecx","ebp");
&add ("edi",$pad); # skip padding to point at np copy
&mov ("esi","edx");
&mov ($M,"edi");
&data_byte(0xf3,0xa5); # rep movsl, memcpy
&mov ("ecx",$pad/4);
&data_byte(0xf3,0xab); # rep stosl, bzero pad
# edi points at the end of padded np copy...
# let magic happen...
&mov ("ecx","ebp");
&mov ("esi","esp");
&xor ("eax","eax");
&shl ("ecx",5); # convert word counter to bit counter
&align (4);
&data_byte(0xf3,0x0f,0xa6,0xc0);# rep montmul
&mov ("ecx","ebp");
&xor ("edx","edx"); # i=0
&lea ("esi",&DWP(64,"esp")); # tp
# edi still points at the end of np copy...
&xor ("edx","edx"); # i=0
&lea ("esi",&DWP(64,"esp")); # tp
# edi still points at the end of padded np copy...
&mov ("eax",&DWP(-4-$pad,"edi")); # np[num-1]
&neg ("ebp");
&lea ("ebp",&DWP(0,"edi","ebp",4)); # so just "rewind"
&mov ("edi",$rp); # restore rp
&lea ("ebp",&DWP(-$pad,"edi","ebp",4)); # so just "rewind"
&mov ("edi",$rp); # restore rp
&mov ("ebx",&DWP(0,"esi","ecx",4)); # upmost overflow bit
&cmp	("ebx",0);			# clears CF unconditionally
&jnz (&label("sub"));
&mov ("eax",&DWP(-4,"esi","ecx",4));
&cmp ("eax",&DWP(-4,"ebp","ecx",4)); # tp[num-1]-np[num-1]?
&jae (&label("sub")); # if taken CF is cleared
&shr ("eax",30); # boundary condition...
&jz (&label("copy")); # ... is met
&xor ("edx","edx"); # clear CF
&set_label("copy",4);
&mov ("ebx","ecx");
&data_byte(0xf3,0xa5); # rep movsl
&mov ("ecx","ebx");
&jmp (&label("zap"));
&set_label("sub",16);
&set_label("sub",8);
&mov ("eax",&DWP(0,"esi","edx",4));
&sbb ("eax",&DWP(0,"ebp","edx",4));
&mov (&DWP(0,"edi","edx",4),"eax"); # rp[i]=tp[i]-np[i]
&lea ("edx",&DWP(1,"edx")); # i++
&dec ("ecx"); # doesn't affect CF!
&jg (&label("sub"));
&sbb ("ebx",0); # upmost overflow is still there
&mov ("ecx","edx");
&jc (&label("copy"));
&loop (&label("sub")); # doesn't affect CF!
&mov ("eax",&DWP(0,"esi","edx",4)); # upmost overflow bit
&sbb ("eax",0);
&and ("esi","eax");
&not ("eax");
&mov ("ebp","edi");
&and ("ebp","eax");
&or ("esi","ebp"); # tp=carry?tp:rp
&mov ("ecx","edx"); # num
&xor ("edx","edx"); # i=0
&set_label("copy",8);
&mov ("eax",&DWP(0,"esi","edx",4));
&mov (&DWP(64,"esp","edx",4),"ecx"); # zap tp
&mov (&DWP(0,"edi","edx",4),"eax");
&lea ("edx",&DWP(1,"edx")); # i++
&loop (&label("copy"));
&set_label("zap",4);
&mov ("ebp",$sp);
&xor ("eax","eax");
&lea ("ecx",&DWP(64/4+$pad,"","ecx",4));# size of frame divided by 4
&mov ("edi","esp");
&mov ("ecx",64/4);
&mov ("edi","esp"); # zap frame including scratch area
&data_byte(0xf3,0xab); # rep stosl, bzero
# zap copies of ap, bp and np
&lea ("edi",&DWP(64+$pad,"esp","edx",4));# pointer to ap
&lea ("ecx",&DWP(3*$pad/4,"edx","edx",2));
&data_byte(0xf3,0xab); # rep stosl, bzero
&mov ("esp","ebp");
@ -224,4 +242,6 @@ $sp=&DWP(28,"esp");
&set_label("leave");
&function_end($func);
&asciz("Padlock Montgomery Multiplication, CRYPTOGAMS by <appro\@openssl.org>");
&asm_finish();

View file

@ -41,7 +41,7 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
$i="edx";
$j="ecx";
$ap="esi";
$ap="esi"; $tp="esi"; # overlapping variables!!!
$rp="edi"; $bp="edi"; # overlapping variables!!!
$np="ebp";
$num="ebx";
@ -551,41 +551,39 @@ $sbit=$num;
}
&set_label("common_tail",16);
&mov ($np,$_np);
&mov ("esi",&DWP($frame+4,"esp",$num,4));# load upmost overflow bit
&mov ($np,$_np); # load modulus pointer
&mov ($rp,$_rp); # load result pointer
# [$ap and $bp are zapped]
&xor ($i,$i); # i=0
&lea ($tp,&DWP($frame,"esp")); # [$ap and $bp are zapped]
&mov ("eax",&DWP(0,$np,$num,4)); # np[num-1]
&shr ("eax",30); # check for boundary condition
&jz (&label("copy"));
&mov ("eax",&DWP(0,$tp)); # tp[0]
&mov ($j,$num); # j=num-1
&cmp ("esi",0); # clears CF unconditionally
&jnz (&label("sub"));
&mov ("eax",&DWP($frame,"esp",$j,4));
&cmp ("eax",&DWP(0,$np,$j,4)); # tp[num-1]-np[num-1]?
&jae (&label("sub")); # if taken CF is cleared
&set_label("copy",16);
&mov ("eax",&DWP($frame,"esp",$j,4));
&mov (&DWP(0,$rp,$j,4),"eax"); # rp[i]=tp[i]
&mov (&DWP($frame,"esp",$j,4),$j); # zap temporary vector
&dec ($j);
&jge (&label("copy"));
&jmp (&label("exit"));
&xor ($i,$i); # i=0 and clear CF!
&set_label("sub",16);
&mov ("eax",&DWP($frame,"esp",$i,4));
&sbb ("eax",&DWP(0,$np,$i,4));
&mov (&DWP(0,$rp,$i,4),"eax"); # rp[i]=tp[i]-np[i]
&lea ($i,&DWP(1,$i)); # i++
&dec ($j); # doesn't affect CF!
&mov ("eax",&DWP(4,$tp,$i,4)); # tp[i+1]
&lea ($i,&DWP(1,$i)); # i++
&jge (&label("sub"));
&mov ($j,$num); # j=num-1
&sbb ("esi",0); # esi holds upmost overflow bit
&jc (&label("copy"));
&set_label("zap",8);
&mov (&DWP($frame,"esp",$j,4),$i); # zap temporary vector
&dec ($j);
&jge (&label("zap"));
&set_label("exit",8);
&sbb ("eax",0); # handle upmost overflow bit
&and ($tp,"eax");
&not ("eax");
&mov ($np,$rp);
&and ($np,"eax");
&or ($tp,$np); # tp=carry?tp:rp
&set_label("copy",16); # copy or in-place refresh
&mov ("eax",&DWP(0,$tp,$num,4));
&mov (&DWP(0,$rp,$num,4),"eax"); # rp[i]=tp[i]
&mov (&DWP($frame,"esp",$num,4),$j); # zap temporary vector
&dec ($num);
&jge (&label("copy"));
&mov ("esp",$_sp); # pull saved stack pointer
&mov ("eax",1);
&set_label("just_leave");

View file

@ -59,6 +59,7 @@ bn_mul_mont:
neg %rax
lea (%rsp,%rax,8),%rsp # tp=alloca(8*(num+2))
and \$-1024,%rsp # minimize TLB usage
mov %rbp,8(%rsp,$num,8) # tp[num+1]=%rsp
mov %rdx,$bp # $bp reassigned, remember?
@ -166,22 +167,38 @@ bn_mul_mont:
cmp $num,$i
jl .Louter
xor $i,$i # i=0
mov -8($np,$num,8),%rax # np[num-1]
lea (%rsp),$ap # borrow ap for tp
shr \$62,%rax # check for boundary condition
jz .Lcopy
mov ($ap),%rax # tp[0]
lea -1($num),$j # j=num-1
cmp \$0,%rdx # %rdx still holds upmost overflow bit
jnz .Lsub # CF is cleared by compare with 0
mov (%rsp,$j,8),%rax
cmp ($np,$j,8),%rax # tp[num-1]-np[num-1]
jae .Lsub # if taken CF was cleared by above cmp
.align 4
.Lcopy:
mov (%rsp,$j,8),%rax
xor $i,$i # i=0 and clear CF!
jmp .Lsub
.align 16
.Lsub: sbb ($np,$i,8),%rax
mov %rax,($rp,$i,8) # rp[i]=tp[i]-np[i]
dec $j # doesn't affect CF!
mov 8($ap,$i,8),%rax # tp[i+1]
lea 1($i),$i # i++
jge .Lsub
sbb \$0,%rax # handle upmost overflow bit
and %rax,$ap
not %rax
mov $rp,$np
and %rax,$np
lea -1($num),$j
or $np,$ap # ap=borrow?tp:rp
.align 16
.Lcopy: # copy or in-place refresh
mov ($ap,$j,8),%rax
mov %rax,($rp,$j,8) # rp[i]=tp[i]
mov $i,(%rsp,$j,8) # zap temporary vector
dec $j
jge .Lcopy
.align 4
.Lexit:
mov 8(%rsp,$num,8),%rsp # restore %rsp
mov \$1,%rax
pop %r15
@ -191,22 +208,6 @@ bn_mul_mont:
pop %rbp
pop %rbx
ret
.align 16
.Lsub: mov (%rsp,$i,8),%rax
sbb ($np,$i,8),%rax
mov %rax,($rp,$i,8) # rp[i]=tp[i]-np[j]
lea 1($i),$i # i++
dec $j # doesn't affect CF!
jge .Lsub
lea -1($num),$j # j=num-1
sbb \$0,%rdx
jc .Lcopy # tp was less than np
.align 4
.Lzap: mov $i,(%rsp,$j,8) # zap temporary vector
dec $j
jge .Lzap
jmp .Lexit
.size bn_mul_mont,.-bn_mul_mont
.asciz "Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
___