openssl/crypto/bn/asm/x86-mont.pl

#! /usr/bin/env perl
# Copyright 2005-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# October 2005
#
# This is a "teaser" code, as it can be improved in several ways...
# First of all non-SSE2 path should be implemented (yes, for now it
# performs Montgomery multiplication/convolution only on SSE2-capable
# CPUs such as P4, others fall down to original code). Then inner loop
# can be unrolled and modulo-scheduled to improve ILP and possibly
# moved to 128-bit XMM register bank (though it would require input
# rearrangement and/or increase bus bandwidth utilization). Dedicated
# squaring procedure should give further performance improvement...
# Yet, for being draft, the code improves rsa512 *sign* benchmark by
# 110%(!), rsa1024 one - by 70% and rsa4096 - by 20%:-)

# December 2006
#
# Modulo-scheduling SSE2 loops results in further 15-20% improvement.
# Integer-only code [being equipped with dedicated squaring procedure]
# gives ~40% on rsa512 sign benchmark...

# Locate the directory this script lives in, so that the perlasm support
# module can be found regardless of the current working directory.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";

# The last command line argument names the output file.  STDOUT is
# redirected there; fail loudly if that is not possible instead of
# silently throwing the generated assembly away.
$output = pop;
open STDOUT,">$output" or die "can't open $output: $!";

&asm_init($ARGV[0]);

# The SSE2 code path is generated only when requested on the command line.
$sse2=0;
for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }

&external_label("OPENSSL_ia32cap_P") if ($sse2);

# Prologue of bn_mul_mont.  Going by the argument comments further down,
# the arguments are, in order: rp, ap, bp, np, n0, num.  This part
# rejects short inputs, carves a frame plus the temporary vector tp[] out
# of the stack, touches the new stack pages in order and copies the
# arguments into the frame.
&function_begin("bn_mul_mont");

# Symbolic register names.  Several names share one physical register, so
# only one variable of each overlapping pair can be live at a time.
$i="edx";
$j="ecx";
$ap="esi";	$tp="esi";		# overlapping variables!!!
$rp="edi";	$bp="edi";		# overlapping variables!!!
$np="ebp";
$num="ebx";

# Frame slots, addressed relative to the *new* esp.  tp[] starts at
# offset $frame.  $_n0q is a 64-bit view of the n0 slot and therefore
# overlaps the $_sp slot that follows it.
$_num=&DWP(4*0,"esp");			# stack top layout
$_rp=&DWP(4*1,"esp");
$_ap=&DWP(4*2,"esp");
$_bp=&DWP(4*3,"esp");
$_np=&DWP(4*4,"esp");
$_n0=&DWP(4*5,"esp");	$_n0q=&QWP(4*5,"esp");
$_sp=&DWP(4*6,"esp");
$_bpend=&DWP(4*7,"esp");
$frame=32;				# size of above frame rounded up to 16n

	# eax is zeroed before the early exit, so vectors shorter than 4
	# words leave with eax==0 (presumably the "not handled" return
	# value -- confirm against the caller).
	&xor	("eax","eax");
	&mov	("edi",&wparam(5));	# int num
	&cmp	("edi",4);
	&jl	(&label("just_leave"));

	# NOTE(review): the second lea yields the address of the ap
	# argument slot, not the value of ap, so the cache-window
	# arithmetic below is relative to that slot -- confirm intent.
	&lea	("esi",&wparam(0));	# put aside pointer to argument block
	&lea	("edx",&wparam(1));	# load ap
	&add	("edi",2);		# extra two words on top of tp
	&neg	("edi");
	&lea	("ebp",&DWP(-$frame,"esp","edi",4));	# future alloca($frame+4*(num+2))
	&neg	("edi");

	# minimize cache contention by arranging 2K window between stack
	# pointer and ap argument [np is also position sensitive vector,
	# but it's assumed to be near ap, as it's allocated at ~same
	# time].
	&mov	("eax","ebp");
	&sub	("eax","edx");
	&and	("eax",2047);
	&sub	("ebp","eax");		# this aligns sp and ap modulo 2048

	&xor	("edx","ebp");
	&and	("edx",2048);
	&xor	("edx",2048);
	&sub	("ebp","edx");		# this splits them apart modulo 4096

	&and	("ebp",-64);		# align to cache line

	# An OS-agnostic version of __chkstk.
	#
	# Some OSes (Windows) insist on stack being "wired" to
	# physical memory in strictly sequential manner, i.e. if stack
	# allocation spans two pages, then reference to farmost one can
	# be punishable by SEGV. But page walking can do good even on
	# other OSes, because it guarantees that villain thread hits
	# the guard page before it can make damage to innocent one...
	#
	# ebp holds the final stack pointer.  esp is first moved to ebp
	# plus a whole number of 4K pages, then lowered one page at a
	# time, reading one word at each step, until it reaches ebp.
	&mov	("eax","esp");
	&sub	("eax","ebp");
	&and	("eax",-4096);
	&mov	("edx","esp");		# saved stack pointer!
	&lea	("esp",&DWP(0,"ebp","eax"));
	&mov	("eax",&DWP(0,"esp"));
	&cmp	("esp","ebp");
	&ja	(&label("page_walk"));
	&jmp	(&label("page_walk_done"));

&set_label("page_walk",16);
	&lea	("esp",&DWP(-4096,"esp"));
	&mov	("eax",&DWP(0,"esp"));
	&cmp	("esp","ebp");
	&ja	(&label("page_walk"));
&set_label("page_walk_done");

	################################# load argument block...
	# esi still points at the caller's argument block; it is
	# overwritten last, first with the n0 pointer and then with
	# n0[0] itself.
	&mov	("eax",&DWP(0*4,"esi"));# BN_ULONG *rp
	&mov	("ebx",&DWP(1*4,"esi"));# const BN_ULONG *ap
	&mov	("ecx",&DWP(2*4,"esi"));# const BN_ULONG *bp
	&mov	("ebp",&DWP(3*4,"esi"));# const BN_ULONG *np
	&mov	("esi",&DWP(4*4,"esi"));# const BN_ULONG *n0
	#&mov	("edi",&DWP(5*4,"esi"));# int num

	&mov	("esi",&DWP(0,"esi"));	# pull n0[0]
	&mov	($_rp,"eax");		# ... save a copy of argument block
	&mov	($_ap,"ebx");
	&mov	($_bp,"ecx");
	&mov	($_np,"ebp");
	&mov	($_n0,"esi");
	# edi holds num+2 at this point, hence the -3 to obtain num-1.
	&lea	($num,&DWP(-3,"edi"));	# num=num-1 to assist modulo-scheduling
	#&mov	($_num,$num);		# redundant as $num is not reused
	&mov	($_sp,"edx");		# saved stack pointer!

# SSE2 code path, generated only when -DOPENSSL_IA32_SSE2 was passed to
# the script.  At run time it is entered only if bit 26 of the first word
# of OPENSSL_ia32cap_P is set; otherwise control jumps to "non_sse2" and
# the integer-only code takes over.  On entry $num holds num-1.  The
# path ends by jumping to "common_tail" with the result in tp[].
if($sse2) {
$acc0="mm0";	# mmx register bank layout
$acc1="mm1";
$car0="mm2";
$car1="mm3";
$mul0="mm4";
$mul1="mm5";
$temp="mm6";
$mask="mm7";

	&picmeup("eax","OPENSSL_ia32cap_P");
	&bt	(&DWP(0,"eax"),26);
	&jnc	(&label("non_sse2"));

	&mov	("eax",-1);
	&movd	($mask,"eax");		# mask 32 lower bits

	&mov	($ap,$_ap);		# load input pointers
	&mov	($bp,$_bp);
	&mov	($np,$_np);

	&xor	($i,$i);		# i=0
	&xor	($j,$j);		# j=0

	# First pass (i=0): tp[] is written without being read.  $car0
	# carries the ap[j]*bp[0] chain, $car1 the np[j]*m1 chain, and
	# $mul1 ends up holding m1 = ap[0]*bp[0]*n0.
	&movd	($mul0,&DWP(0,$bp));		# bp[0]
	&movd	($mul1,&DWP(0,$ap));		# ap[0]
	&movd	($car1,&DWP(0,$np));		# np[0]

	&pmuludq($mul1,$mul0);			# ap[0]*bp[0]
	&movq	($car0,$mul1);
	&movq	($acc0,$mul1);			# I wish movd worked for
	&pand	($acc0,$mask);			# inter-register transfers

	&pmuludq($mul1,$_n0q);			# *=n0

	&pmuludq($car1,$mul1);			# "t[0]"*np[0]*n0
	&paddq	($car1,$acc0);

	&movd	($acc1,&DWP(4,$np));		# np[1]
	&movd	($acc0,&DWP(4,$ap));		# ap[1]

	&psrlq	($car0,32);
	&psrlq	($car1,32);

	&inc	($j);				# j++
	# Modulo-scheduled loop: operands for iteration j+1 are loaded
	# while iteration j is being accumulated.
&set_label("1st",16);
	&pmuludq($acc0,$mul0);			# ap[j]*bp[0]
	&pmuludq($acc1,$mul1);			# np[j]*m1
	&paddq	($car0,$acc0);			# +=c0
	&paddq	($car1,$acc1);			# +=c1

	&movq	($acc0,$car0);
	&pand	($acc0,$mask);
	&movd	($acc1,&DWP(4,$np,$j,4));	# np[j+1]
	&paddq	($car1,$acc0);			# +=ap[j]*bp[0];
	&movd	($acc0,&DWP(4,$ap,$j,4));	# ap[j+1]
	&psrlq	($car0,32);
	&movd	(&DWP($frame-4,"esp",$j,4),$car1);	# tp[j-1]=
	&psrlq	($car1,32);

	&lea	($j,&DWP(1,$j));
	&cmp	($j,$num);
	&jl	(&label("1st"));

	# Loop tail: last word, then the two carry chains are merged into
	# the top two words of tp[].
	&pmuludq($acc0,$mul0);			# ap[num-1]*bp[0]
	&pmuludq($acc1,$mul1);			# np[num-1]*m1
	&paddq	($car0,$acc0);			# +=c0
	&paddq	($car1,$acc1);			# +=c1

	&movq	($acc0,$car0);
	&pand	($acc0,$mask);
	&paddq	($car1,$acc0);			# +=ap[num-1]*bp[0];
	&movd	(&DWP($frame-4,"esp",$j,4),$car1);	# tp[num-2]=

	&psrlq	($car0,32);
	&psrlq	($car1,32);

	&paddq	($car1,$car0);
	&movq	(&QWP($frame,"esp",$num,4),$car1);	# tp[num].tp[num-1]

	&inc	($i);				# i++
	# Remaining passes: same structure as above, but the previous
	# tp[] contents are added in as well.
&set_label("outer");
	&xor	($j,$j);			# j=0

	&movd	($mul0,&DWP(0,$bp,$i,4));	# bp[i]
	&movd	($mul1,&DWP(0,$ap));		# ap[0]
	&movd	($temp,&DWP($frame,"esp"));	# tp[0]
	&movd	($car1,&DWP(0,$np));		# np[0]
	&pmuludq($mul1,$mul0);			# ap[0]*bp[i]

	&paddq	($mul1,$temp);			# +=tp[0]
	&movq	($acc0,$mul1);
	&movq	($car0,$mul1);
	&pand	($acc0,$mask);

	&pmuludq($mul1,$_n0q);			# *=n0

	&pmuludq($car1,$mul1);
	&paddq	($car1,$acc0);

	&movd	($temp,&DWP($frame+4,"esp"));	# tp[1]
	&movd	($acc1,&DWP(4,$np));		# np[1]
	&movd	($acc0,&DWP(4,$ap));		# ap[1]

	&psrlq	($car0,32);
	&psrlq	($car1,32);
	&paddq	($car0,$temp);			# +=tp[1]

	&inc	($j);				# j++
	# $num doubles as the inner loop's down-counter here; it is
	# restored from $j once the loop is done.
	&dec	($num);
&set_label("inner");
	&pmuludq($acc0,$mul0);			# ap[j]*bp[i]
	&pmuludq($acc1,$mul1);			# np[j]*m1
	&paddq	($car0,$acc0);			# +=c0
	&paddq	($car1,$acc1);			# +=c1

	&movq	($acc0,$car0);
	&movd	($temp,&DWP($frame+4,"esp",$j,4));# tp[j+1]
	&pand	($acc0,$mask);
	&movd	($acc1,&DWP(4,$np,$j,4));	# np[j+1]
	&paddq	($car1,$acc0);			# +=ap[j]*bp[i]+tp[j]
	&movd	($acc0,&DWP(4,$ap,$j,4));	# ap[j+1]
	&psrlq	($car0,32);
	&movd	(&DWP($frame-4,"esp",$j,4),$car1);# tp[j-1]=
	&psrlq	($car1,32);
	&paddq	($car0,$temp);			# +=tp[j+1]

	# lea leaves the flags alone, so jnz tests the result of dec.
	&dec	($num);
	&lea	($j,&DWP(1,$j));		# j++
	&jnz	(&label("inner"));

	&mov	($num,$j);
	&pmuludq($acc0,$mul0);			# ap[num-1]*bp[i]
	&pmuludq($acc1,$mul1);			# np[num-1]*m1
	&paddq	($car0,$acc0);			# +=c0
	&paddq	($car1,$acc1);			# +=c1

	&movq	($acc0,$car0);
	&pand	($acc0,$mask);
	&paddq	($car1,$acc0);			# +=ap[num-1]*bp[i]+tp[num-1]
	&movd	(&DWP($frame-4,"esp",$j,4),$car1);	# tp[num-2]=
	&psrlq	($car0,32);
	&psrlq	($car1,32);

	&movd	($temp,&DWP($frame+4,"esp",$num,4));	# += tp[num]
	&paddq	($car1,$car0);
	&paddq	($car1,$temp);
	&movq	(&QWP($frame,"esp",$num,4),$car1);	# tp[num].tp[num-1]

	&lea	($i,&DWP(1,$i));		# i++
	&cmp	($i,$num);
	&jle	(&label("outer"));

	&emms	();				# done with mmx bank
	&jmp	(&label("common_tail"));

&set_label("non_sse2",16);
}

# Integer-only code path.  The "if (0)" branch is never generated; it is
# the old bail-out that made the routine return 0 whenever SSE2 was not
# available.  The "else" branch is what is actually emitted.  It holds a
# general multiplication (labels mull/1stmadd/2ndmadd) and a dedicated
# squaring procedure (bn_sqr_mont/sqr/sqradd/3rdmadd), which is used
# only when ap==bp and num is even.  Both finish by jumping to
# "common_tail" with the result in tp[].  On entry $num holds num-1.
if (0) {
	&mov	("esp",$_sp);
	&xor	("eax","eax");	# signal "not fast enough [yet]"
	&jmp	(&label("just_leave"));
	# While the below code provides competitive performance for
	# all key lengths on modern Intel cores, it's still more
	# than 10% slower for 4096-bit key elsewhere:-( "Competitive"
	# means compared to the original integer-only assembler.
	# 512-bit RSA sign is better by ~40%, but that's about all
	# one can say about all CPUs...
} else {
$inp="esi";	# integer path uses these registers differently
$word="edi";
$carry="ebp";

	# Dispatch: $carry becomes zero only if num is even AND ap==bp.
	# Neither lea nor mov alters the flags, so the jz below still
	# tests the result of the "or".
	&mov	($inp,$_ap);
	&lea	($carry,&DWP(1,$num));
	&mov	($word,$_bp);
	&xor	($j,$j);				# j=0
	&mov	("edx",$inp);
	&and	($carry,1);				# see if num is even
	&sub	("edx",$word);				# see if ap==bp
	&lea	("eax",&DWP(4,$word,$num,4));		# &bp[num]
	&or	($carry,"edx");
	&mov	($word,&DWP(0,$word));			# bp[0]
	&jz	(&label("bn_sqr_mont"));
	&mov	($_bpend,"eax");
	&mov	("eax",&DWP(0,$inp));
	&xor	("edx","edx");

	# First pass of the general multiplication: tp[]=ap[]*bp[0].
&set_label("mull",16);
	&mov	($carry,"edx");
	&mul	($word);				# ap[j]*bp[0]
	&add	($carry,"eax");
	&lea	($j,&DWP(1,$j));
	&adc	("edx",0);
	&mov	("eax",&DWP(0,$inp,$j,4));		# ap[j+1]
	&cmp	($j,$num);
	&mov	(&DWP($frame-4,"esp",$j,4),$carry);	# tp[j]=
	&jl	(&label("mull"));

	# The statements indented by an extra space already set up the
	# reduction that follows: $word becomes m=n0*tp[0] and $inp
	# switches to np.
	&mov	($carry,"edx");
	&mul	($word);				# ap[num-1]*bp[0]
	 &mov	($word,$_n0);
	&add	("eax",$carry);
	 &mov	($inp,$_np);
	&adc	("edx",0);
	 &imul	($word,&DWP($frame,"esp"));		# n0*tp[0]

	&mov	(&DWP($frame,"esp",$num,4),"eax");	# tp[num-1]=
	&xor	($j,$j);
	&mov	(&DWP($frame+4,"esp",$num,4),"edx");	# tp[num]=
	&mov	(&DWP($frame+8,"esp",$num,4),$j);	# tp[num+1]=

	# Only the carry out of np[0]*m+tp[0] is kept (in edx); eax is
	# overwritten with np[1] right after the add.
	&mov	("eax",&DWP(0,$inp));			# np[0]
	&mul	($word);				# np[0]*m
	&add	("eax",&DWP($frame,"esp"));		# +=tp[0]
	&mov	("eax",&DWP(4,$inp));			# np[1]
	&adc	("edx",0);
	&inc	($j);

	&jmp	(&label("2ndmadd"));

	# Multiply-and-add for the subsequent words of bp:
	# tp[]+=ap[]*bp[i].
&set_label("1stmadd",16);
	&mov	($carry,"edx");
	&mul	($word);				# ap[j]*bp[i]
	&add	($carry,&DWP($frame,"esp",$j,4));	# +=tp[j]
	&lea	($j,&DWP(1,$j));
	&adc	("edx",0);
	&add	($carry,"eax");
	&mov	("eax",&DWP(0,$inp,$j,4));		# ap[j+1]
	&adc	("edx",0);
	&cmp	($j,$num);
	&mov	(&DWP($frame-4,"esp",$j,4),$carry);	# tp[j]=
	&jl	(&label("1stmadd"));

	&mov	($carry,"edx");
	&mul	($word);				# ap[num-1]*bp[i]
	&add	("eax",&DWP($frame,"esp",$num,4));	# +=tp[num-1]
	 &mov	($word,$_n0);
	&adc	("edx",0);
	 &mov	($inp,$_np);
	&add	($carry,"eax");
	&adc	("edx",0);
	 &imul	($word,&DWP($frame,"esp"));		# n0*tp[0]

	&xor	($j,$j);
	&add	("edx",&DWP($frame+4,"esp",$num,4));	# carry+=tp[num]
	&mov	(&DWP($frame,"esp",$num,4),$carry);	# tp[num-1]=
	&adc	($j,0);
	 &mov	("eax",&DWP(0,$inp));			# np[0]
	&mov	(&DWP($frame+4,"esp",$num,4),"edx");	# tp[num]=
	&mov	(&DWP($frame+8,"esp",$num,4),$j);	# tp[num+1]=

	&mul	($word);				# np[0]*m
	&add	("eax",&DWP($frame,"esp"));		# +=tp[0]
	&mov	("eax",&DWP(4,$inp));			# np[1]
	&adc	("edx",0);
	&mov	($j,1);

	# Reduction step: tp[]=(tp[]+np[]*m) shifted down by one word;
	# note that the store goes to index j-1.
&set_label("2ndmadd",16);
	&mov	($carry,"edx");
	&mul	($word);				# np[j]*m
	&add	($carry,&DWP($frame,"esp",$j,4));	# +=tp[j]
	&lea	($j,&DWP(1,$j));
	&adc	("edx",0);
	&add	($carry,"eax");
	&mov	("eax",&DWP(0,$inp,$j,4));		# np[j+1]
	&adc	("edx",0);
	&cmp	($j,$num);
	&mov	(&DWP($frame-8,"esp",$j,4),$carry);	# tp[j-1]=
	&jl	(&label("2ndmadd"));

	&mov	($carry,"edx");
	&mul	($word);				# np[j]*m
	&add	($carry,&DWP($frame,"esp",$num,4));	# +=tp[num-1]
	&adc	("edx",0);
	&add	($carry,"eax");
	&adc	("edx",0);
	&mov	(&DWP($frame-4,"esp",$num,4),$carry);	# tp[num-2]=

	# $_bp serves as the running pointer into bp[]; the multiplication
	# is complete once it reaches $_bpend (&bp[num]).
	&xor	("eax","eax");
	 &mov	($j,$_bp);				# &bp[i]
	&add	("edx",&DWP($frame+4,"esp",$num,4));	# carry+=tp[num]
	&adc	("eax",&DWP($frame+8,"esp",$num,4));	# +=tp[num+1]
	 &lea	($j,&DWP(4,$j));
	&mov	(&DWP($frame,"esp",$num,4),"edx");	# tp[num-1]=
	 &cmp	($j,$_bpend);
	&mov	(&DWP($frame+4,"esp",$num,4),"eax");	# tp[num]=
	&je	(&label("common_tail"));

	&mov	($word,&DWP(0,$j));			# bp[i+1]
	&mov	($inp,$_ap);
	&mov	($_bp,$j);				# &bp[++i]
	&xor	($j,$j);
	&xor	("edx","edx");
	&mov	("eax",&DWP(0,$inp));
	&jmp	(&label("1stmadd"));

	# Dedicated squaring procedure.  $sbit is an alias for the $num
	# register, which is why num-1 is parked in $_num first.  The
	# $_bp slot is reused as the outer index i.  Cross products are
	# doubled on the fly by shifting left one bit, with $sbit holding
	# the bit that moves from one word into the next.
&set_label("bn_sqr_mont",16);
$sbit=$num;
	&mov	($_num,$num);
	&mov	($_bp,$j);				# i=0

	&mov	("eax",$word);				# ap[0]
	&mul	($word);				# ap[0]*ap[0]
	&mov	(&DWP($frame,"esp"),"eax");		# tp[0]=
	&mov	($sbit,"edx");
	&shr	("edx",1);
	&and	($sbit,1);
	&inc	($j);
&set_label("sqr",16);
	&mov	("eax",&DWP(0,$inp,$j,4));		# ap[j]
	&mov	($carry,"edx");
	&mul	($word);				# ap[j]*ap[0]
	&add	("eax",$carry);
	&lea	($j,&DWP(1,$j));
	&adc	("edx",0);
	&lea	($carry,&DWP(0,$sbit,"eax",2));
	&shr	("eax",31);
	&cmp	($j,$_num);
	&mov	($sbit,"eax");
	&mov	(&DWP($frame-4,"esp",$j,4),$carry);	# tp[j]=
	&jl	(&label("sqr"));

	&mov	("eax",&DWP(0,$inp,$j,4));		# ap[num-1]
	&mov	($carry,"edx");
	&mul	($word);				# ap[num-1]*ap[0]
	&add	("eax",$carry);
	 &mov	($word,$_n0);
	&adc	("edx",0);
	 &mov	($inp,$_np);
	&lea	($carry,&DWP(0,$sbit,"eax",2));
	 &imul	($word,&DWP($frame,"esp"));		# n0*tp[0]
	&shr	("eax",31);
	&mov	(&DWP($frame,"esp",$j,4),$carry);	# tp[num-1]=

	&lea	($carry,&DWP(0,"eax","edx",2));
	 &mov	("eax",&DWP(0,$inp));			# np[0]
	&shr	("edx",31);
	&mov	(&DWP($frame+4,"esp",$j,4),$carry);	# tp[num]=
	&mov	(&DWP($frame+8,"esp",$j,4),"edx");	# tp[num+1]=

	&mul	($word);				# np[0]*m
	&add	("eax",&DWP($frame,"esp"));		# +=tp[0]
	&mov	($num,$j);
	&adc	("edx",0);
	&mov	("eax",&DWP(4,$inp));			# np[1]
	&mov	($j,1);

	# Reduction step of the squaring procedure.  It handles two words
	# of np per iteration (j advances by 2); presumably this is why
	# the squaring path insists on an even num -- confirm.
&set_label("3rdmadd",16);
	&mov	($carry,"edx");
	&mul	($word);				# np[j]*m
	&add	($carry,&DWP($frame,"esp",$j,4));	# +=tp[j]
	&adc	("edx",0);
	&add	($carry,"eax");
	&mov	("eax",&DWP(4,$inp,$j,4));		# np[j+1]
	&adc	("edx",0);
	&mov	(&DWP($frame-4,"esp",$j,4),$carry);	# tp[j-1]=

	&mov	($carry,"edx");
	&mul	($word);				# np[j+1]*m
	&add	($carry,&DWP($frame+4,"esp",$j,4));	# +=tp[j+1]
	&lea	($j,&DWP(2,$j));
	&adc	("edx",0);
	&add	($carry,"eax");
	&mov	("eax",&DWP(0,$inp,$j,4));		# np[j+2]
	&adc	("edx",0);
	&cmp	($j,$num);
	&mov	(&DWP($frame-8,"esp",$j,4),$carry);	# tp[j]=
	&jl	(&label("3rdmadd"));

	&mov	($carry,"edx");
	&mul	($word);				# np[j]*m
	&add	($carry,&DWP($frame,"esp",$num,4));	# +=tp[num-1]
	&adc	("edx",0);
	&add	($carry,"eax");
	&adc	("edx",0);
	&mov	(&DWP($frame-4,"esp",$num,4),$carry);	# tp[num-2]=

	# Squaring is complete once the outer index i (kept in the $_bp
	# slot) equals $num.
	&mov	($j,$_bp);				# i
	&xor	("eax","eax");
	&mov	($inp,$_ap);
	&add	("edx",&DWP($frame+4,"esp",$num,4));	# carry+=tp[num]
	&adc	("eax",&DWP($frame+8,"esp",$num,4));	# +=tp[num+1]
	&mov	(&DWP($frame,"esp",$num,4),"edx");	# tp[num-1]=
	&cmp	($j,$num);
	&mov	(&DWP($frame+4,"esp",$num,4),"eax");	# tp[num]=
	&je	(&label("common_tail"));

	# Next outer iteration: add the square of the next word of ap,
	# then the doubled cross products with the words above it.
	&mov	($word,&DWP(4,$inp,$j,4));		# ap[i]
	&lea	($j,&DWP(1,$j));
	&mov	("eax",$word);
	&mov	($_bp,$j);				# ++i
	&mul	($word);				# ap[i]*ap[i]
	&add	("eax",&DWP($frame,"esp",$j,4));	# +=tp[i]
	&adc	("edx",0);
	&mov	(&DWP($frame,"esp",$j,4),"eax");	# tp[i]=
	&xor	($carry,$carry);
	&cmp	($j,$num);
	&lea	($j,&DWP(1,$j));
	&je	(&label("sqrlast"));

	&mov	($sbit,"edx");				# zaps $num
	&shr	("edx",1);
	&and	($sbit,1);
&set_label("sqradd",16);
	&mov	("eax",&DWP(0,$inp,$j,4));		# ap[j]
	&mov	($carry,"edx");
	&mul	($word);				# ap[j]*ap[i]
	&add	("eax",$carry);
	&lea	($carry,&DWP(0,"eax","eax"));
	&adc	("edx",0);
	&shr	("eax",31);
	&add	($carry,&DWP($frame,"esp",$j,4));	# +=tp[j]
	&lea	($j,&DWP(1,$j));
	&adc	("eax",0);
	&add	($carry,$sbit);
	&adc	("eax",0);
	&cmp	($j,$_num);
	&mov	(&DWP($frame-4,"esp",$j,4),$carry);	# tp[j]=
	&mov	($sbit,"eax");
	&jle	(&label("sqradd"));

	&mov	($carry,"edx");
	&add	("edx","edx");
	&shr	($carry,31);
	&add	("edx",$sbit);
	&adc	($carry,0);
&set_label("sqrlast");
	&mov	($word,$_n0);
	&mov	($inp,$_np);
	&imul	($word,&DWP($frame,"esp"));		# n0*tp[0]

	&add	("edx",&DWP($frame,"esp",$j,4));	# +=tp[num]
	&mov	("eax",&DWP(0,$inp));			# np[0]
	&adc	($carry,0);
	&mov	(&DWP($frame,"esp",$j,4),"edx");	# tp[num]=
	&mov	(&DWP($frame+4,"esp",$j,4),$carry);	# tp[num+1]=

	# $num was zapped by $sbit above; it is rebuilt from $j here
	# before re-entering the reduction loop.
	&mul	($word);				# np[0]*m
	&add	("eax",&DWP($frame,"esp"));		# +=tp[0]
	&lea	($num,&DWP(-1,$j));
	&adc	("edx",0);
	&mov	($j,1);
	&mov	("eax",&DWP(4,$inp));			# np[1]

	&jmp	(&label("3rdmadd"));
}

# Common epilogue shared by the SSE2, integer multiplication and integer
# squaring paths.  On entry $num holds num-1 and tp[] (at $frame off
# esp) holds the unreduced result including its top overflow word.
#
# Step 1 ("sub"): rp[]=tp[]-np[], propagating the borrow through CF.
# Step 2: fold the borrow into the top word, giving eax=all-ones if the
#         subtraction underflowed (tp[] must be kept) or eax=0 if it did
#         not (rp[] already holds the answer).
# Step 3 ("copy"): rp[i]=(tp[i]&eax)|(rp[i]&~eax) for every word, while
#         overwriting tp[i].  Both vectors are always read and rp[] is
#         always written, so the memory access pattern does not depend
#         on the outcome of the subtraction.
# Finally the original stack pointer is restored and 1 is left in eax.
&set_label("common_tail",16);
	&mov	($np,$_np);			# load modulus pointer
	&mov	($rp,$_rp);			# load result pointer
	&lea	($tp,&DWP($frame,"esp"));	# [$ap and $bp are zapped]

	&mov	("eax",&DWP(0,$tp));		# tp[0]
	&mov	($j,$num);			# j=num-1
	&xor	($i,$i);			# i=0 and clear CF!

&set_label("sub",16);
	&sbb	("eax",&DWP(0,$np,$i,4));
	&mov	(&DWP(0,$rp,$i,4),"eax");	# rp[i]=tp[i]-np[i]
	&dec	($j);				# doesn't affect CF!
	&mov	("eax",&DWP(4,$tp,$i,4));	# tp[i+1]
	&lea	($i,&DWP(1,$i));		# i++
	&jge	(&label("sub"));

	&sbb	("eax",0);			# handle upmost overflow bit
	&mov	("edx",-1);			# $i is dead, reuse its register
	&xor	("edx","eax");			# edx=~eax
	&jmp	(&label("copy"));		# skip alignment padding

	# $tp and $np are no longer needed as pointers and serve as data
	# registers below.  $j is -1 here, as the "sub" loop counted it
	# down past zero, so that is the value tp[] is overwritten with.
&set_label("copy",16);				# conditional copy
	&mov	($tp,&DWP($frame,"esp",$num,4));	# tp[i]
	&mov	($np,&DWP(0,$rp,$num,4));	# rp[i]
	&mov	(&DWP($frame,"esp",$num,4),$j);	# zap temporary vector
	&and	($tp,"eax");
	&and	($np,"edx");
	&or	($np,$tp);
	&mov	(&DWP(0,$rp,$num,4),$np);	# rp[i]=borrow?tp[i]:rp[i]
	&dec	($num);
	&jge	(&label("copy"));

	&mov	("esp",$_sp);		# pull saved stack pointer
	&mov	("eax",1);
&set_label("just_leave");
&function_end("bn_mul_mont");

# Identification string embedded in the generated object.
&asciz("Montgomery Multiplication for x86, CRYPTOGAMS by <appro\@openssl.org>");

&asm_finish();

# Buffered write errors (e.g. disk full) only surface when the handle is
# closed; treat them as fatal so a truncated assembly file is never
# mistaken for a good one.
close STDOUT or die "error closing STDOUT: $!";
-												Add OpenSSL copyright to .pl files

Reviewed-by: Richard Levitte <levitte@openssl.org>

											
										
										
											2016-05-21 12:23:39 +00:00
+								#! /usr/bin/env perl
 								# Copyright 2005-2016 The OpenSSL Project Authors. All Rights Reserved.
 								#
 								# Licensed under the OpenSSL license (the "License").  You may not use
 								# this file except in compliance with the License.  You can obtain a copy
 								# in the file LICENSE in the source distribution or at
 								# https://www.openssl.org/source/license.html
-												Throw in bn/asm/x86-mont.pl Montgomery multiplication "teaser".

											
										
										
											2005-10-09 09:53:58 +00:00
 								# ====================================================================
-												Remove email addresses from source code.

Names were not removed.
Some comments were updated.
Replace Andy's address with openssl.org

Reviewed-by: Andy Polyakov <appro@openssl.org>
Reviewed-by: Paul Dale <paul.dale@oracle.com>
(Merged from https://github.com/openssl/openssl/pull/4516)

											
										
										
											2017-10-10 21:55:09 +00:00
+								# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
-												Non-SSE2 path to bn_mul_mont. But it's disabled, because it currently
doesn't give performance improvement.

											
										
										
											2006-11-27 14:59:35 +00:00
+								# project. The module is, however, dual licensed under OpenSSL and
 								# CRYPTOGAMS licenses depending on where you obtain it. For further
 								# details see http://www.openssl.org/~appro/cryptogams/.
-												Throw in bn/asm/x86-mont.pl Montgomery multiplication "teaser".

											
										
										
											2005-10-09 09:53:58 +00:00
+								# ====================================================================
-												Add timestamp to x86-mont.pl.

											
										
										
											2005-10-09 10:26:56 +00:00
+								# October 2005
 								#
-												Throw in bn/asm/x86-mont.pl Montgomery multiplication "teaser".

											
										
										
											2005-10-09 09:53:58 +00:00
+								# This is a "teaser" code, as it can be improved in several ways...
 								# First of all non-SSE2 path should be implemented (yes, for now it
 								# performs Montgomery multiplication/convolution only on SSE2-capable
 								# CPUs such as P4, others fall down to original code). Then inner loop
 								# can be unrolled and modulo-scheduled to improve ILP and possibly
 								# moved to 128-bit XMM register bank (though it would require input
 								# rearrangement and/or increase bus bandwidth utilization). Dedicated
 								# squaring procedure should give further performance improvement...
 								# Yet, for being draft, the code improves rsa512 *sign* benchmark by
 								# 110%(!), rsa1024 one - by 70% and rsa4096 - by 20%:-)
-												x86-mont.pl sse2 tune-up and integer-only squaring procedure.

											
										
										
											2006-12-22 15:28:07 +00:00
+								# December 2006
 								#
 								# Modulo-scheduling SSE2 loops results in further 15-20% improvement.
 								# Integer-only code [being equipped with dedicated squaring procedure]
-												Minor performance improvements to x86-mont.pl.

											
										
										
											2006-12-28 12:43:16 +00:00
+								# gives ~40% on rsa512 sign benchmark...
-												x86-mont.pl sse2 tune-up and integer-only squaring procedure.

											
										
										
											2006-12-22 15:28:07 +00:00
 								$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
 								push(@INC,"${dir}","${dir}../../perlasm");
-												Throw in bn/asm/x86-mont.pl Montgomery multiplication "teaser".

											
										
										
											2005-10-09 09:53:58 +00:00
+								require "x86asm.pl";
-												Unified - adapt the generation of bignum assembler to use GENERATE

This gets rid of the BEGINRAW..ENDRAW sections in crypto/bn/build.info.

This also moves the assembler generating perl scripts to take the
output file name as last command line argument, where necessary.

Reviewed-by: Rich Salz <rsalz@openssl.org>

											
										
										
											2016-03-07 14:41:33 +00:00
+								$output = pop;
 								open STDOUT,">$output";
-												Remove trailing whitespace from some files.

The prevailing style seems to not have trailing whitespace, but a few
lines do. This is mostly in the perlasm files, but a few C files got
them after the reformat. This is the result of:

  find . -name '*.pl' | xargs sed -E -i '' -e 's/( |'$'\t'')*$//'
  find . -name '*.c' | xargs sed -E -i '' -e 's/( |'$'\t'')*$//'
  find . -name '*.h' | xargs sed -E -i '' -e 's/( |'$'\t'')*$//'

Then bn_prime.h was excluded since this is a generated file.

Note mkerr.pl has some changes in a heredoc for some help output, but
other lines there lack trailing whitespace too.

Reviewed-by: Kurt Roeckx <kurt@openssl.org>
Reviewed-by: Matt Caswell <matt@openssl.org>

											
										
										
											2016-10-10 16:01:24 +00:00
-												Remove filename argument to x86 asm_init.

The assembler already knows the actual path to the generated file and,
in other perlasm architectures, is left to manage debug symbols itself.
Notably, in OpenSSL 1.1.x's new build system, which allows a separate
build directory, converting .pl to .s as the scripts currently do result
in the wrong paths.

This also avoids inconsistencies from some of the files using $0 and
some passing in the filename.

Reviewed-by: Richard Levitte <levitte@openssl.org>
Reviewed-by: Andy Polyakov <appro@openssl.org>
(Merged from https://github.com/openssl/openssl/pull/3431)

											
										
										
											2017-05-10 18:24:56 +00:00
+								&asm_init($ARGV[0]);
-												Throw in bn/asm/x86-mont.pl Montgomery multiplication "teaser".

											
										
										
											2005-10-09 09:53:58 +00:00
 								$sse2=0;
 								for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
 								&external_label("OPENSSL_ia32cap_P") if ($sse2);
-												x86-mont.pl sse2 tune-up and integer-only squaring procedure.

											
										
										
											2006-12-22 15:28:07 +00:00
+								&function_begin("bn_mul_mont");
-												Throw in bn/asm/x86-mont.pl Montgomery multiplication "teaser".

											
										
										
											2005-10-09 09:53:58 +00:00
-												Non-SSE2 path to bn_mul_mont. But it's disabled, because it currently
doesn't give performance improvement.

											
										
										
											2006-11-27 14:59:35 +00:00
+								$i="edx";
-												Throw in bn/asm/x86-mont.pl Montgomery multiplication "teaser".

											
										
										
											2005-10-09 09:53:58 +00:00
+								$j="ecx";
-												Eliminate conditional final subtraction in Montgomery assembler modules.

											
										
										
											2007-06-17 17:10:03 +00:00
+								$ap="esi";	$tp="esi";		# overlapping variables!!!
-												Throw in bn/asm/x86-mont.pl Montgomery multiplication "teaser".

											
										
										
											2005-10-09 09:53:58 +00:00
+								$rp="edi";	$bp="edi";		# overlapping variables!!!
-												Non-SSE2 path to bn_mul_mont. But it's disabled, because it currently
doesn't give performance improvement.

											
										
										
											2006-11-27 14:59:35 +00:00
+								$np="ebp";
 								$num="ebx";
-												Throw in bn/asm/x86-mont.pl Montgomery multiplication "teaser".

											
										
										
											2005-10-09 09:53:58 +00:00
-												Minor performance improvements to x86-mont.pl.

											
										
										
											2006-12-28 12:43:16 +00:00
+								$_num=&DWP(4*0,"esp");			# stack top layout
 								$_rp=&DWP(4*1,"esp");
 								$_ap=&DWP(4*2,"esp");
 								$_bp=&DWP(4*3,"esp");
 								$_np=&DWP(4*4,"esp");
-												nasm fixes.

											
										
										
											2007-03-20 08:55:58 +00:00
+								$_n0=&DWP(4*5,"esp");	$_n0q=&QWP(4*5,"esp");
-												Flip saved argument block and tp [required for non-SSE2 path].

											
										
										
											2005-10-14 16:05:21 +00:00
+								$_sp=&DWP(4*6,"esp");
-												Non-SSE2 path to bn_mul_mont. But it's disabled, because it currently
doesn't give performance improvement.

											
										
										
											2006-11-27 14:59:35 +00:00
+								$_bpend=&DWP(4*7,"esp");
-												Flip saved argument block and tp [required for non-SSE2 path].

											
										
										
											2005-10-14 16:05:21 +00:00
+								$frame=32;				# size of above frame rounded up to 16n
-												Throw in bn/asm/x86-mont.pl Montgomery multiplication "teaser".

											
										
										
											2005-10-09 09:53:58 +00:00
-												Non-SSE2 path to bn_mul_mont. But it's disabled, because it currently
doesn't give performance improvement.

											
										
										
											2006-11-27 14:59:35 +00:00
+									&xor	("eax","eax");
 									&mov	("edi",&wparam(5));	# int num
-												x86-mont.pl sse2 tune-up and integer-only squaring procedure.

											
										
										
											2006-12-22 15:28:07 +00:00
+									&cmp	("edi",4);
 									&jl	(&label("just_leave"));
-												Throw in bn/asm/x86-mont.pl Montgomery multiplication "teaser".

											
										
										
											2005-10-09 09:53:58 +00:00
-												Minor performance improvements to x86-mont.pl.

											
										
										
											2006-12-28 12:43:16 +00:00
+									&lea	("esi",&wparam(0));	# put aside pointer to argument block
 									&lea	("edx",&wparam(1));	# load ap
-												Non-SSE2 path to bn_mul_mont. But it's disabled, because it currently
doesn't give performance improvement.

											
										
										
											2006-11-27 14:59:35 +00:00
+									&add	("edi",2);		# extra two words on top of tp
 									&neg	("edi");
-												bn/asm/x86[_64]-mont*.pl: implement slightly alternative page-walking.

Original strategy for page-walking was adjust stack pointer and then
touch pages in order. This kind of asks for double-fault, because
if touch fails, then signal will be delivered to frame above adjusted
stack pointer. But touching pages prior adjusting stack pointer would
upset valgrind. As compromise let's adjust stack pointer in pages,
touching top of the stack. This still asks for double-fault, but at
least prevents corruption of neighbour stack if allocation is to
overstep the guard page.

Also omit predict-non-taken hints as they reportedly trigger illegal
instructions in some VM setups.

Reviewed-by: Richard Levitte <levitte@openssl.org>

											
										
										
											2016-03-16 22:33:53 +00:00
+									&lea	("ebp",&DWP(-$frame,"esp","edi",4));	# future alloca($frame+4*(num+2))
-												Non-SSE2 path to bn_mul_mont. But it's disabled, because it currently
doesn't give performance improvement.

											
										
										
											2006-11-27 14:59:35 +00:00
+									&neg	("edi");
-												Minor performance improvements to x86-mont.pl.

											
										
										
											2006-12-28 12:43:16 +00:00
-												Many spelling fixes/typo's corrected.

Around 138 distinct errors found and fixed; thanks!

Reviewed-by: Kurt Roeckx <kurt@roeckx.be>
Reviewed-by: Tim Hudson <tjh@openssl.org>
Reviewed-by: Rich Salz <rsalz@openssl.org>
(Merged from https://github.com/openssl/openssl/pull/3459)

											
										
										
											2017-11-12 00:03:10 +00:00
+									# minimize cache contention by arranging 2K window between stack
-												Minor performance improvements to x86-mont.pl.

											
										
										
											2006-12-28 12:43:16 +00:00
+									# pointer and ap argument [np is also position sensitive vector,
 									# but it's assumed to be near ap, as it's allocated at ~same
 									# time].
-												bn/asm/x86[_64]-mont*.pl: implement slightly alternative page-walking.

Original strategy for page-walking was adjust stack pointer and then
touch pages in order. This kind of asks for double-fault, because
if touch fails, then signal will be delivered to frame above adjusted
stack pointer. But touching pages prior adjusting stack pointer would
upset valgrind. As compromise let's adjust stack pointer in pages,
touching top of the stack. This still asks for double-fault, but at
least prevents corruption of neighbour stack if allocation is to
overstep the guard page.

Also omit predict-non-taken hints as they reportedly trigger illegal
instructions in some VM setups.

Reviewed-by: Richard Levitte <levitte@openssl.org>

											
										
										
											2016-03-16 22:33:53 +00:00
+									&mov	("eax","ebp");
-												Minor performance improvements to x86-mont.pl.

											
										
										
											2006-12-28 12:43:16 +00:00
+									&sub	("eax","edx");
 									&and	("eax",2047);
-												bn/asm/x86[_64]-mont*.pl: implement slightly alternative page-walking.

Original strategy for page-walking was adjust stack pointer and then
touch pages in order. This kind of asks for double-fault, because
if touch fails, then signal will be delivered to frame above adjusted
stack pointer. But touching pages prior adjusting stack pointer would
upset valgrind. As compromise let's adjust stack pointer in pages,
touching top of the stack. This still asks for double-fault, but at
least prevents corruption of neighbour stack if allocation is to
overstep the guard page.

Also omit predict-non-taken hints as they reportedly trigger illegal
instructions in some VM setups.

Reviewed-by: Richard Levitte <levitte@openssl.org>

											
										
										
											2016-03-16 22:33:53 +00:00
+									&sub	("ebp","eax");		# this aligns sp and ap modulo 2048
-												Minor performance improvements to x86-mont.pl.

											
										
										
											2006-12-28 12:43:16 +00:00
-												bn/asm/x86[_64]-mont*.pl: implement slightly alternative page-walking.

Original strategy for page-walking was adjust stack pointer and then
touch pages in order. This kind of asks for double-fault, because
if touch fails, then signal will be delivered to frame above adjusted
stack pointer. But touching pages prior adjusting stack pointer would
upset valgrind. As compromise let's adjust stack pointer in pages,
touching top of the stack. This still asks for double-fault, but at
least prevents corruption of neighbour stack if allocation is to
overstep the guard page.

Also omit predict-non-taken hints as they reportedly trigger illegal
instructions in some VM setups.

Reviewed-by: Richard Levitte <levitte@openssl.org>

											
										
										
											2016-03-16 22:33:53 +00:00
+									&xor	("edx","ebp");
-												Minor performance improvements to x86-mont.pl.

											
										
										
											2006-12-28 12:43:16 +00:00
+									&and	("edx",2048);
 									&xor	("edx",2048);
-												bn/asm/x86[_64]-mont*.pl: implement slightly alternative page-walking.

Original strategy for page-walking was adjust stack pointer and then
touch pages in order. This kind of asks for double-fault, because
if touch fails, then signal will be delivered to frame above adjusted
stack pointer. But touching pages prior adjusting stack pointer would
upset valgrind. As compromise let's adjust stack pointer in pages,
touching top of the stack. This still asks for double-fault, but at
least prevents corruption of neighbour stack if allocation is to
overstep the guard page.

Also omit predict-non-taken hints as they reportedly trigger illegal
instructions in some VM setups.

Reviewed-by: Richard Levitte <levitte@openssl.org>

											
										
										
											2016-03-16 22:33:53 +00:00
+									&sub	("ebp","edx");		# this splits them apart modulo 4096
-												Minor performance improvements to x86-mont.pl.

											
										
										
											2006-12-28 12:43:16 +00:00
-												bn/asm/x86[_64]-mont*.pl: implement slightly alternative page-walking.

Original strategy for page-walking was adjust stack pointer and then
touch pages in order. This kind of asks for double-fault, because
if touch fails, then signal will be delivered to frame above adjusted
stack pointer. But touching pages prior adjusting stack pointer would
upset valgrind. As compromise let's adjust stack pointer in pages,
touching top of the stack. This still asks for double-fault, but at
least prevents corruption of neighbour stack if allocation is to
overstep the guard page.

Also omit predict-non-taken hints as they reportedly trigger illegal
instructions in some VM setups.

Reviewed-by: Richard Levitte <levitte@openssl.org>

											
										
										
											2016-03-16 22:33:53 +00:00
+									&and	("ebp",-64);		# align to cache line
-												Minor performance improvements to x86-mont.pl.

											
										
										
											2006-12-28 12:43:16 +00:00
-												On Windows, page walking is known as __chkstk.

Reviewed-by: Andy Polyakov <appro@openssl.org>

											
										
										
											2016-03-15 22:04:14 +00:00
+									# An OS-agnostic version of __chkstk.
 									#
-												Explain *cough*-dows

Reviewed-by: Andy Polyakov <appro@openssl.org>

											
										
										
											2016-03-15 20:52:31 +00:00
+									# Some OSes (Windows) insist on stack being "wired" to
-												bn/asm/x86[_64]-mont*.pl: complement alloca with page-walking.

Some OSes, *cough*-dows, insist on stack being "wired" to
physical memory in strictly sequential manner, i.e. if stack
allocation spans two pages, then reference to farmost one can
be punishable by SEGV. But page walking can do good even on
other OSes, because it guarantees that villain thread hits
the guard page before it can make damage to innocent one...

Reviewed-by: Rich Salz <rsalz@openssl.org>

											
										
										
											2016-03-04 10:39:11 +00:00
+									# physical memory in strictly sequential manner, i.e. if stack
 									# allocation spans two pages, then reference to farmost one can
 									# be punishable by SEGV. But page walking can do good even on
 									# other OSes, because it guarantees that villain thread hits
 									# the guard page before it can make damage to innocent one...
-												bn/asm/x86[_64]-mont*.pl: implement slightly alternative page-walking.

Original strategy for page-walking was adjust stack pointer and then
touch pages in order. This kind of asks for double-fault, because
if touch fails, then signal will be delivered to frame above adjusted
stack pointer. But touching pages prior adjusting stack pointer would
upset valgrind. As compromise let's adjust stack pointer in pages,
touching top of the stack. This still asks for double-fault, but at
least prevents corruption of neighbour stack if allocation is to
overstep the guard page.

Also omit predict-non-taken hints as they reportedly trigger illegal
instructions in some VM setups.

Reviewed-by: Richard Levitte <levitte@openssl.org>

											
										
										
											2016-03-16 22:33:53 +00:00
+									&mov	("eax","esp");
 									&sub	("eax","ebp");
-												bn/asm/x86[_64]-mont*.pl: complement alloca with page-walking.

Some OSes, *cough*-dows, insist on stack being "wired" to
physical memory in strictly sequential manner, i.e. if stack
allocation spans two pages, then reference to farmost one can
be punishable by SEGV. But page walking can do good even on
other OSes, because it guarantees that villain thread hits
the guard page before it can make damage to innocent one...

Reviewed-by: Rich Salz <rsalz@openssl.org>

											
										
										
											2016-03-04 10:39:11 +00:00
+									&and	("eax",-4096);
-												bn/asm/x86[_64]-mont*.pl: implement slightly alternative page-walking.

Original strategy for page-walking was adjust stack pointer and then
touch pages in order. This kind of asks for double-fault, because
if touch fails, then signal will be delivered to frame above adjusted
stack pointer. But touching pages prior adjusting stack pointer would
upset valgrind. As compromise let's adjust stack pointer in pages,
touching top of the stack. This still asks for double-fault, but at
least prevents corruption of neighbour stack if allocation is to
overstep the guard page.

Also omit predict-non-taken hints as they reportedly trigger illegal
instructions in some VM setups.

Reviewed-by: Richard Levitte <levitte@openssl.org>

											
										
										
											2016-03-16 22:33:53 +00:00
+									&mov	("edx","esp");		# saved stack pointer!
 									&lea	("esp",&DWP(0,"ebp","eax"));
 									&mov	("eax",&DWP(0,"esp"));
 									&cmp	("esp","ebp");
 									&ja	(&label("page_walk"));
 									&jmp	(&label("page_walk_done"));
 								&set_label("page_walk",16);
 									&lea	("esp",&DWP(-4096,"esp"));
 									&mov	("eax",&DWP(0,"esp"));
 									&cmp	("esp","ebp");
 									&ja	(&label("page_walk"));
 								&set_label("page_walk_done");
-												bn/asm/x86[_64]-mont*.pl: complement alloca with page-walking.

Some OSes, *cough*-dows, insist on stack being "wired" to
physical memory in strictly sequential manner, i.e. if stack
allocation spans two pages, then reference to farmost one can
be punishable by SEGV. But page walking can do good even on
other OSes, because it guarantees that villain thread hits
the guard page before it can make damage to innocent one...

Reviewed-by: Rich Salz <rsalz@openssl.org>

											
										
										
											2016-03-04 10:39:11 +00:00
-												Minor performance improvements to x86-mont.pl.

											
										
										
											2006-12-28 12:43:16 +00:00
+									################################# load argument block...
 									&mov	("eax",&DWP(0*4,"esi"));# BN_ULONG *rp
 									&mov	("ebx",&DWP(1*4,"esi"));# const BN_ULONG *ap
 									&mov	("ecx",&DWP(2*4,"esi"));# const BN_ULONG *bp
-												bn/asm/x86[_64]-mont*.pl: implement slightly alternative page-walking.

Original strategy for page-walking was adjust stack pointer and then
touch pages in order. This kind of asks for double-fault, because
if touch fails, then signal will be delivered to frame above adjusted
stack pointer. But touching pages prior adjusting stack pointer would
upset valgrind. As compromise let's adjust stack pointer in pages,
touching top of the stack. This still asks for double-fault, but at
least prevents corruption of neighbour stack if allocation is to
overstep the guard page.

Also omit predict-non-taken hints as they reportedly trigger illegal
instructions in some VM setups.

Reviewed-by: Richard Levitte <levitte@openssl.org>

											
										
										
											2016-03-16 22:33:53 +00:00
+									&mov	("ebp",&DWP(3*4,"esi"));# const BN_ULONG *np
-												Minor performance improvements to x86-mont.pl.

											
										
										
											2006-12-28 12:43:16 +00:00
+									&mov	("esi",&DWP(4*4,"esi"));# const BN_ULONG *n0
 									#&mov	("edi",&DWP(5*4,"esi"));# int num
-												Throw in bn/asm/x86-mont.pl Montgomery multiplication "teaser".

											
										
										
											2005-10-09 09:53:58 +00:00
-												Change bn_mul_mont declaration and BN_MONT_CTX. Update CHANGES.

											
										
										
											2005-10-22 17:57:18 +00:00
+									&mov	("esi",&DWP(0,"esi"));	# pull n0[0]
-												Throw in bn/asm/x86-mont.pl Montgomery multiplication "teaser".

											
										
										
											2005-10-09 09:53:58 +00:00
+									&mov	($_rp,"eax");		# ... save a copy of argument block
 									&mov	($_ap,"ebx");
 									&mov	($_bp,"ecx");
-												bn/asm/x86[_64]-mont*.pl: implement slightly alternative page-walking.

Original strategy for page-walking was adjust stack pointer and then
touch pages in order. This kind of asks for double-fault, because
if touch fails, then signal will be delivered to frame above adjusted
stack pointer. But touching pages prior adjusting stack pointer would
upset valgrind. As compromise let's adjust stack pointer in pages,
touching top of the stack. This still asks for double-fault, but at
least prevents corruption of neighbour stack if allocation is to
overstep the guard page.

Also omit predict-non-taken hints as they reportedly trigger illegal
instructions in some VM setups.

Reviewed-by: Richard Levitte <levitte@openssl.org>

											
										
										
											2016-03-16 22:33:53 +00:00
+									&mov	($_np,"ebp");
-												Throw in bn/asm/x86-mont.pl Montgomery multiplication "teaser".

											
										
										
											2005-10-09 09:53:58 +00:00
+									&mov	($_n0,"esi");
-												x86-mont.pl sse2 tune-up and integer-only squaring procedure.

											
										
										
											2006-12-22 15:28:07 +00:00
+									&lea	($num,&DWP(-1,"edi"));	# num=num-1 to assist modulo-scheduling
-												Non-SSE2 path to bn_mul_mont. But it's disabled, because it currently
doesn't give performance improvement.

											
										
										
											2006-11-27 14:59:35 +00:00
+									#&mov	($_num,$num);		# redundant as $num is not reused
-												bn/asm/x86[_64]-mont*.pl: implement slightly alternative page-walking.

Original strategy for page-walking was adjust stack pointer and then
touch pages in order. This kind of asks for double-fault, because
if touch fails, then signal will be delivered to frame above adjusted
stack pointer. But touching pages prior adjusting stack pointer would
upset valgrind. As compromise let's adjust stack pointer in pages,
touching top of the stack. This still asks for double-fault, but at
least prevents corruption of neighbour stack if allocation is to
overstep the guard page.

Also omit predict-non-taken hints as they reportedly trigger illegal
instructions in some VM setups.

Reviewed-by: Richard Levitte <levitte@openssl.org>

											
										
										
											2016-03-16 22:33:53 +00:00
+									&mov	($_sp,"edx");		# saved stack pointer!
-												x86-mont.pl sse2 tune-up and integer-only squaring procedure.

											
										
										
											2006-12-22 15:28:07 +00:00
-												Non-SSE2 path to bn_mul_mont. But it's disabled, because it currently
doesn't give performance improvement.

											
										
										
											2006-11-27 14:59:35 +00:00
+								if($sse2) {
 								$acc0="mm0";	# mmx register bank layout
 								$acc1="mm1";
 								$car0="mm2";
 								$car1="mm3";
 								$mul0="mm4";
 								$mul1="mm5";
 								$temp="mm6";
 								$mask="mm7";
 									&picmeup("eax","OPENSSL_ia32cap_P");
 									&bt	(&DWP(0,"eax"),26);
 									&jnc	(&label("non_sse2"));
-												Throw in bn/asm/x86-mont.pl Montgomery multiplication "teaser".

											
										
										
											2005-10-09 09:53:58 +00:00
 									&mov	("eax",-1);
 									&movd	($mask,"eax");		# mask 32 lower bits
 									&mov	($ap,$_ap);		# load input pointers
 									&mov	($bp,$_bp);
 									&mov	($np,$_np);
 									&xor	($i,$i);		# i=0
 									&xor	($j,$j);		# j=0
 									&movd	($mul0,&DWP(0,$bp));		# bp[0]
 									&movd	($mul1,&DWP(0,$ap));		# ap[0]
 									&movd	($car1,&DWP(0,$np));		# np[0]
 									&pmuludq($mul1,$mul0);			# ap[0]*bp[0]
 									&movq	($car0,$mul1);
 									&movq	($acc0,$mul1);			# I wish movd worked for
 									&pand	($acc0,$mask);			# inter-register transfers
-												nasm fixes.

											
										
										
											2007-03-20 08:55:58 +00:00
+									&pmuludq($mul1,$_n0q);			# *=n0
-												Throw in bn/asm/x86-mont.pl Montgomery multiplication "teaser".

											
										
										
											2005-10-09 09:53:58 +00:00
 									&pmuludq($car1,$mul1);			# "t[0]"*np[0]*n0
 									&paddq	($car1,$acc0);
-												x86-mont.pl sse2 tune-up and integer-only squaring procedure.

											
										
										
											2006-12-22 15:28:07 +00:00
+									&movd	($acc1,&DWP(4,$np));		# np[1]
 									&movd	($acc0,&DWP(4,$ap));		# ap[1]
-												Throw in bn/asm/x86-mont.pl Montgomery multiplication "teaser".

											
										
										
											2005-10-09 09:53:58 +00:00
+									&psrlq	($car0,32);
 									&psrlq	($car1,32);
 									&inc	($j);				# j++
-												Minor performance improvements to x86-mont.pl.

											
										
										
											2006-12-28 12:43:16 +00:00
+								&set_label("1st",16);
-												Throw in bn/asm/x86-mont.pl Montgomery multiplication "teaser".

											
										
										
											2005-10-09 09:53:58 +00:00
+									&pmuludq($acc0,$mul0);			# ap[j]*bp[0]
 									&pmuludq($acc1,$mul1);			# np[j]*m1
 									&paddq	($car0,$acc0);			# +=c0
-												x86-mont.pl sse2 tune-up and integer-only squaring procedure.

											
										
										
											2006-12-22 15:28:07 +00:00
+									&paddq	($car1,$acc1);			# +=c1
-												Throw in bn/asm/x86-mont.pl Montgomery multiplication "teaser".

											
										
										
											2005-10-09 09:53:58 +00:00
+									&movq	($acc0,$car0);
 									&pand	($acc0,$mask);
-												x86-mont.pl sse2 tune-up and integer-only squaring procedure.

											
										
										
											2006-12-22 15:28:07 +00:00
+									&movd	($acc1,&DWP(4,$np,$j,4));	# np[j+1]
-												Throw in bn/asm/x86-mont.pl Montgomery multiplication "teaser".

											
										
										
											2005-10-09 09:53:58 +00:00
+									&paddq	($car1,$acc0);			# +=ap[j]*bp[0];
-												x86-mont.pl sse2 tune-up and integer-only squaring procedure.

											
										
										
											2006-12-22 15:28:07 +00:00
+									&movd	($acc0,&DWP(4,$ap,$j,4));	# ap[j+1]
-												Throw in bn/asm/x86-mont.pl Montgomery multiplication "teaser".

											
										
										
											2005-10-09 09:53:58 +00:00
+									&psrlq	($car0,32);
-												x86-mont.pl sse2 tune-up and integer-only squaring procedure.

											
										
										
											2006-12-22 15:28:07 +00:00
+									&movd	(&DWP($frame-4,"esp",$j,4),$car1);	# tp[j-1]=
-												Throw in bn/asm/x86-mont.pl Montgomery multiplication "teaser".

											
										
										
											2005-10-09 09:53:58 +00:00
+									&psrlq	($car1,32);
 									&lea	($j,&DWP(1,$j));
 									&cmp	($j,$num);
 									&jl	(&label("1st"));
-												x86-mont.pl sse2 tune-up and integer-only squaring procedure.

											
										
										
											2006-12-22 15:28:07 +00:00
+									&pmuludq($acc0,$mul0);			# ap[num-1]*bp[0]
 									&pmuludq($acc1,$mul1);			# np[num-1]*m1
 									&paddq	($car0,$acc0);			# +=c0
 									&paddq	($car1,$acc1);			# +=c1
-												Throw in bn/asm/x86-mont.pl Montgomery multiplication "teaser".

											
										
										
											2005-10-09 09:53:58 +00:00
-												x86-mont.pl sse2 tune-up and integer-only squaring procedure.

											
										
										
											2006-12-22 15:28:07 +00:00
+									&movq	($acc0,$car0);
 									&pand	($acc0,$mask);
 									&paddq	($car1,$acc0);			# +=ap[num-1]*bp[0];
 									&movd	(&DWP($frame-4,"esp",$j,4),$car1);	# tp[num-2]=
 									&psrlq	($car0,32);
 									&psrlq	($car1,32);
 									&paddq	($car1,$car0);
-												nasm fixes.

											
										
										
											2007-03-20 08:55:58 +00:00
+									&movq	(&QWP($frame,"esp",$num,4),$car1);	# tp[num].tp[num-1]
-												x86-mont.pl sse2 tune-up and integer-only squaring procedure.

											
										
										
											2006-12-22 15:28:07 +00:00
-												Throw in bn/asm/x86-mont.pl Montgomery multiplication "teaser".

											
										
										
											2005-10-09 09:53:58 +00:00
+									&inc	($i);				# i++
 								&set_label("outer");
 									&xor	($j,$j);			# j=0
 									&movd	($mul0,&DWP(0,$bp,$i,4));	# bp[i]
 									&movd	($mul1,&DWP(0,$ap));		# ap[0]
-												Flip saved argument block and tp [required for non-SSE2 path].

											
										
										
											2005-10-14 16:05:21 +00:00
+									&movd	($temp,&DWP($frame,"esp"));	# tp[0]
 									&movd	($car1,&DWP(0,$np));		# np[0]
-												Throw in bn/asm/x86-mont.pl Montgomery multiplication "teaser".

											
										
										
											2005-10-09 09:53:58 +00:00
+									&pmuludq($mul1,$mul0);			# ap[0]*bp[i]
 									&paddq	($mul1,$temp);			# +=tp[0]
 									&movq	($acc0,$mul1);
 									&movq	($car0,$mul1);
 									&pand	($acc0,$mask);
-												nasm fixes.

											
										
										
											2007-03-20 08:55:58 +00:00
+									&pmuludq($mul1,$_n0q);			# *=n0
-												Throw in bn/asm/x86-mont.pl Montgomery multiplication "teaser".

											
										
										
											2005-10-09 09:53:58 +00:00
 									&pmuludq($car1,$mul1);
 									&paddq	($car1,$acc0);
-												x86-mont.pl sse2 tune-up and integer-only squaring procedure.

											
										
										
											2006-12-22 15:28:07 +00:00
+									&movd	($temp,&DWP($frame+4,"esp"));	# tp[1]
 									&movd	($acc1,&DWP(4,$np));		# np[1]
 									&movd	($acc0,&DWP(4,$ap));		# ap[1]
-												Throw in bn/asm/x86-mont.pl Montgomery multiplication "teaser".

											
										
										
											2005-10-09 09:53:58 +00:00
+									&psrlq	($car0,32);
 									&psrlq	($car1,32);
-												x86-mont.pl sse2 tune-up and integer-only squaring procedure.

											
										
										
											2006-12-22 15:28:07 +00:00
+									&paddq	($car0,$temp);			# +=tp[1]
-												Throw in bn/asm/x86-mont.pl Montgomery multiplication "teaser".

											
										
										
											2005-10-09 09:53:58 +00:00
 									&inc	($j);				# j++
-												x86-mont.pl sse2 tune-up and integer-only squaring procedure.

											
										
										
											2006-12-22 15:28:07 +00:00
+									&dec	($num);
-												Throw in bn/asm/x86-mont.pl Montgomery multiplication "teaser".

											
										
										
											2005-10-09 09:53:58 +00:00
+								&set_label("inner");
 									&pmuludq($acc0,$mul0);			# ap[j]*bp[i]
 									&pmuludq($acc1,$mul1);			# np[j]*m1
 									&paddq	($car0,$acc0);			# +=c0
-												x86-mont.pl sse2 tune-up and integer-only squaring procedure.

											
										
										
											2006-12-22 15:28:07 +00:00
+									&paddq	($car1,$acc1);			# +=c1
-												Throw in bn/asm/x86-mont.pl Montgomery multiplication "teaser".

											
										
										
											2005-10-09 09:53:58 +00:00
+									&movq	($acc0,$car0);
-												x86-mont.pl sse2 tune-up and integer-only squaring procedure.

											
										
										
											2006-12-22 15:28:07 +00:00
+									&movd	($temp,&DWP($frame+4,"esp",$j,4));# tp[j+1]
-												Throw in bn/asm/x86-mont.pl Montgomery multiplication "teaser".

											
										
										
											2005-10-09 09:53:58 +00:00
+									&pand	($acc0,$mask);
-												x86-mont.pl sse2 tune-up and integer-only squaring procedure.

											
										
										
											2006-12-22 15:28:07 +00:00
+									&movd	($acc1,&DWP(4,$np,$j,4));	# np[j+1]
-												Throw in bn/asm/x86-mont.pl Montgomery multiplication "teaser".

											
										
										
											2005-10-09 09:53:58 +00:00
+									&paddq	($car1,$acc0);			# +=ap[j]*bp[i]+tp[j]
-												x86-mont.pl sse2 tune-up and integer-only squaring procedure.

											
										
										
											2006-12-22 15:28:07 +00:00
+									&movd	($acc0,&DWP(4,$ap,$j,4));	# ap[j+1]
-												Throw in bn/asm/x86-mont.pl Montgomery multiplication "teaser".

											
										
										
											2005-10-09 09:53:58 +00:00
+									&psrlq	($car0,32);
-												x86-mont.pl sse2 tune-up and integer-only squaring procedure.

											
										
										
											2006-12-22 15:28:07 +00:00
+									&movd	(&DWP($frame-4,"esp",$j,4),$car1);# tp[j-1]=
-												Throw in bn/asm/x86-mont.pl Montgomery multiplication "teaser".

											
										
										
											2005-10-09 09:53:58 +00:00
+									&psrlq	($car1,32);
-												x86-mont.pl sse2 tune-up and integer-only squaring procedure.

											
										
										
											2006-12-22 15:28:07 +00:00
+									&paddq	($car0,$temp);			# +=tp[j+1]
-												Throw in bn/asm/x86-mont.pl Montgomery multiplication "teaser".

											
										
										
											2005-10-09 09:53:58 +00:00
-												x86-mont.pl sse2 tune-up and integer-only squaring procedure.

											
										
										
											2006-12-22 15:28:07 +00:00
+									&dec	($num);
-												Throw in bn/asm/x86-mont.pl Montgomery multiplication "teaser".

											
										
										
											2005-10-09 09:53:58 +00:00
+									&lea	($j,&DWP(1,$j));		# j++
-												x86-mont.pl sse2 tune-up and integer-only squaring procedure.

											
										
										
											2006-12-22 15:28:07 +00:00
+									&jnz	(&label("inner"));
 									&mov	($num,$j);
 									&pmuludq($acc0,$mul0);			# ap[num-1]*bp[i]
 									&pmuludq($acc1,$mul1);			# np[num-1]*m1
 									&paddq	($car0,$acc0);			# +=c0
 									&paddq	($car1,$acc1);			# +=c1
 									&movq	($acc0,$car0);
 									&pand	($acc0,$mask);
 									&paddq	($car1,$acc0);			# +=ap[num-1]*bp[i]+tp[num-1]
 									&movd	(&DWP($frame-4,"esp",$j,4),$car1);	# tp[num-2]=
 									&psrlq	($car0,32);
 									&psrlq	($car1,32);
-												Throw in bn/asm/x86-mont.pl Montgomery multiplication "teaser".

											
										
										
											2005-10-09 09:53:58 +00:00
-												x86-mont.pl sse2 tune-up and integer-only squaring procedure.

											
										
										
											2006-12-22 15:28:07 +00:00
+									&movd	($temp,&DWP($frame+4,"esp",$num,4));	# += tp[num]
-												Throw in bn/asm/x86-mont.pl Montgomery multiplication "teaser".

											
										
										
											2005-10-09 09:53:58 +00:00
+									&paddq	($car1,$car0);
 									&paddq	($car1,$temp);
-												nasm fixes.

											
										
										
											2007-03-20 08:55:58 +00:00
+									&movq	(&QWP($frame,"esp",$num,4),$car1);	# tp[num].tp[num-1]
-												Throw in bn/asm/x86-mont.pl Montgomery multiplication "teaser".

											
										
										
											2005-10-09 09:53:58 +00:00
 									&lea	($i,&DWP(1,$i));		# i++
 									&cmp	($i,$num);
-												x86-mont.pl sse2 tune-up and integer-only squaring procedure.

											
										
										
											2006-12-22 15:28:07 +00:00
+									&jle	(&label("outer"));
-												Throw in bn/asm/x86-mont.pl Montgomery multiplication "teaser".

											
										
										
											2005-10-09 09:53:58 +00:00
 									&emms	();				# done with mmx bank
-												Non-SSE2 path to bn_mul_mont. But it's disabled, because it currently
doesn't give performance improvement.

											
										
										
											2006-11-27 14:59:35 +00:00
+									&jmp	(&label("common_tail"));
 								&set_label("non_sse2",16);
 								}
-												x86-mont.pl sse2 tune-up and integer-only squaring procedure.

											
										
										
											2006-12-22 15:28:07 +00:00
 								if (0) {
-												Non-SSE2 path to bn_mul_mont. But it's disabled, because it currently
doesn't give performance improvement.

											
										
										
											2006-11-27 14:59:35 +00:00
+									&mov	("esp",$_sp);
 									&xor	("eax","eax");	# signal "not fast enough [yet]"
 									&jmp	(&label("just_leave"));
-												x86-mont.pl sse2 tune-up and integer-only squaring procedure.

											
										
										
											2006-12-22 15:28:07 +00:00
+									# While the below code provides competitive performance for
-												spelling fixes, just comments and readme.

Reviewed-by: Matt Caswell <matt@openssl.org>
Reviewed-by: Rich Salz <rsalz@openssl.org>
(Merged from https://github.com/openssl/openssl/pull/1413)
											
										
										
											2016-08-05 17:56:58 +00:00
+									# all key lengths on modern Intel cores, it's still more
-												Minor performance improvements to x86-mont.pl.

											
										
										
											2006-12-28 12:43:16 +00:00
+									# than 10% slower for 4096-bit key elsewhere:-( "Competitive"
 									# means compared to the original integer-only assembler.
 									# 512-bit RSA sign is better by ~40%, but that's about all
 									# one can say about all CPUs...
-												Non-SSE2 path to bn_mul_mont. But it's disabled, because it currently
doesn't give performance improvement.

											
										
										
											2006-11-27 14:59:35 +00:00
+								} else {
 								$inp="esi";	# integer path uses these registers differently
 								$word="edi";
 								$carry="ebp";
 									&mov	($inp,$_ap);
-												x86-mont.pl sse2 tune-up and integer-only squaring procedure.

											
										
										
											2006-12-22 15:28:07 +00:00
+									&lea	($carry,&DWP(1,$num));
-												Non-SSE2 path to bn_mul_mont. But it's disabled, because it currently
doesn't give performance improvement.

											
										
										
											2006-11-27 14:59:35 +00:00
+									&mov	($word,$_bp);
-												x86-mont.pl sse2 tune-up and integer-only squaring procedure.

											
										
										
											2006-12-22 15:28:07 +00:00
+									&xor	($j,$j);				# j=0
 									&mov	("edx",$inp);
 									&and	($carry,1);				# see if num is even
 									&sub	("edx",$word);				# see if ap==bp
-												Non-SSE2 path to bn_mul_mont. But it's disabled, because it currently
doesn't give performance improvement.

											
										
										
											2006-11-27 14:59:35 +00:00
+									&lea	("eax",&DWP(4,$word,$num,4));		# &bp[num]
-												x86-mont.pl sse2 tune-up and integer-only squaring procedure.

											
										
										
											2006-12-22 15:28:07 +00:00
+									&or	($carry,"edx");
-												Non-SSE2 path to bn_mul_mont. But it's disabled, because it currently
doesn't give performance improvement.

											
										
										
											2006-11-27 14:59:35 +00:00
+									&mov	($word,&DWP(0,$word));			# bp[0]
-												x86-mont.pl sse2 tune-up and integer-only squaring procedure.

											
										
										
											2006-12-22 15:28:07 +00:00
+									&jz	(&label("bn_sqr_mont"));
-												Non-SSE2 path to bn_mul_mont. But it's disabled, because it currently
doesn't give performance improvement.

											
										
										
											2006-11-27 14:59:35 +00:00
+									&mov	($_bpend,"eax");
-												x86-mont.pl sse2 tune-up and integer-only squaring procedure.

											
										
										
											2006-12-22 15:28:07 +00:00
+									&mov	("eax",&DWP(0,$inp));
-												Non-SSE2 path to bn_mul_mont. But it's disabled, because it currently
doesn't give performance improvement.

											
										
										
											2006-11-27 14:59:35 +00:00
+									&xor	("edx","edx");
 								&set_label("mull",16);
 									&mov	($carry,"edx");
 									&mul	($word);				# ap[j]*bp[0]
-												x86-mont.pl sse2 tune-up and integer-only squaring procedure.

											
										
										
											2006-12-22 15:28:07 +00:00
+									&add	($carry,"eax");
-												Non-SSE2 path to bn_mul_mont. But it's disabled, because it currently
doesn't give performance improvement.

											
										
										
											2006-11-27 14:59:35 +00:00
+									&lea	($j,&DWP(1,$j));
 									&adc	("edx",0);
-												x86-mont.pl sse2 tune-up and integer-only squaring procedure.

											
										
										
											2006-12-22 15:28:07 +00:00
+									&mov	("eax",&DWP(0,$inp,$j,4));		# ap[j+1]
-												Non-SSE2 path to bn_mul_mont. But it's disabled, because it currently
doesn't give performance improvement.

											
										
										
											2006-11-27 14:59:35 +00:00
+									&cmp	($j,$num);
-												x86-mont.pl sse2 tune-up and integer-only squaring procedure.

											
										
										
											2006-12-22 15:28:07 +00:00
+									&mov	(&DWP($frame-4,"esp",$j,4),$carry);	# tp[j]=
 									&jl	(&label("mull"));
-												Non-SSE2 path to bn_mul_mont. But it's disabled, because it currently
doesn't give performance improvement.

											
										
										
											2006-11-27 14:59:35 +00:00
 									&mov	($carry,"edx");
 									&mul	($word);				# ap[num-1]*bp[0]
-												x86-mont.pl sse2 tune-up and integer-only squaring procedure.

											
										
										
											2006-12-22 15:28:07 +00:00
+									 &mov	($word,$_n0);
-												Non-SSE2 path to bn_mul_mont. But it's disabled, because it currently
doesn't give performance improvement.

											
										
										
											2006-11-27 14:59:35 +00:00
+									&add	("eax",$carry);
-												x86-mont.pl sse2 tune-up and integer-only squaring procedure.

											
										
										
											2006-12-22 15:28:07 +00:00
+									 &mov	($inp,$_np);
-												Non-SSE2 path to bn_mul_mont. But it's disabled, because it currently
doesn't give performance improvement.

											
										
										
											2006-11-27 14:59:35 +00:00
+									&adc	("edx",0);
-												x86-mont.pl sse2 tune-up and integer-only squaring procedure.

											
										
										
											2006-12-22 15:28:07 +00:00
+									 &imul	($word,&DWP($frame,"esp"));		# n0*tp[0]
-												Non-SSE2 path to bn_mul_mont. But it's disabled, because it currently
doesn't give performance improvement.

											
										
										
											2006-11-27 14:59:35 +00:00
 									&mov	(&DWP($frame,"esp",$num,4),"eax");	# tp[num-1]=
 									&xor	($j,$j);
 									&mov	(&DWP($frame+4,"esp",$num,4),"edx");	# tp[num]=
 									&mov	(&DWP($frame+8,"esp",$num,4),$j);	# tp[num+1]=
 									&mov	("eax",&DWP(0,$inp));			# np[0]
 									&mul	($word);				# np[0]*m
 									&add	("eax",&DWP($frame,"esp"));		# +=tp[0]
-												x86-mont.pl sse2 tune-up and integer-only squaring procedure.

											
										
										
											2006-12-22 15:28:07 +00:00
+									&mov	("eax",&DWP(4,$inp));			# np[1]
-												Non-SSE2 path to bn_mul_mont. But it's disabled, because it currently
doesn't give performance improvement.

											
										
										
											2006-11-27 14:59:35 +00:00
+									&adc	("edx",0);
-												x86-mont.pl sse2 tune-up and integer-only squaring procedure.

											
										
										
											2006-12-22 15:28:07 +00:00
+									&inc	($j);
-												Non-SSE2 path to bn_mul_mont. But it's disabled, because it currently
doesn't give performance improvement.

											
										
										
											2006-11-27 14:59:35 +00:00
 									&jmp	(&label("2ndmadd"));
-												x86-mont.pl sse2 tune-up and integer-only squaring procedure.

											
										
										
											2006-12-22 15:28:07 +00:00
-												Non-SSE2 path to bn_mul_mont. But it's disabled, because it currently
doesn't give performance improvement.

											
										
										
											2006-11-27 14:59:35 +00:00
+								&set_label("1stmadd",16);
 									&mov	($carry,"edx");
 									&mul	($word);				# ap[j]*bp[i]
-												x86-mont.pl sse2 tune-up and integer-only squaring procedure.

											
										
										
											2006-12-22 15:28:07 +00:00
+									&add	($carry,&DWP($frame,"esp",$j,4));	# +=tp[j]
-												Non-SSE2 path to bn_mul_mont. But it's disabled, because it currently
doesn't give performance improvement.

											
										
										
											2006-11-27 14:59:35 +00:00
+									&lea	($j,&DWP(1,$j));
 									&adc	("edx",0);
-												x86-mont.pl sse2 tune-up and integer-only squaring procedure.

											
										
										
											2006-12-22 15:28:07 +00:00
+									&add	($carry,"eax");
 									&mov	("eax",&DWP(0,$inp,$j,4));		# ap[j+1]
-												Non-SSE2 path to bn_mul_mont. But it's disabled, because it currently
doesn't give performance improvement.

											
										
										
											2006-11-27 14:59:35 +00:00
+									&adc	("edx",0);
 									&cmp	($j,$num);
-												x86-mont.pl sse2 tune-up and integer-only squaring procedure.

											
										
										
											2006-12-22 15:28:07 +00:00
+									&mov	(&DWP($frame-4,"esp",$j,4),$carry);	# tp[j]=
 									&jl	(&label("1stmadd"));
-												Non-SSE2 path to bn_mul_mont. But it's disabled, because it currently
doesn't give performance improvement.

											
										
										
											2006-11-27 14:59:35 +00:00
 									&mov	($carry,"edx");
 									&mul	($word);				# ap[num-1]*bp[i]
 									&add	("eax",&DWP($frame,"esp",$num,4));	# +=tp[num-1]
-												x86-mont.pl sse2 tune-up and integer-only squaring procedure.

											
										
										
											2006-12-22 15:28:07 +00:00
+									 &mov	($word,$_n0);
-												Non-SSE2 path to bn_mul_mont. But it's disabled, because it currently
doesn't give performance improvement.

											
										
										
											2006-11-27 14:59:35 +00:00
+									&adc	("edx",0);
-												x86-mont.pl sse2 tune-up and integer-only squaring procedure.

											
										
										
											2006-12-22 15:28:07 +00:00
+									 &mov	($inp,$_np);
 									&add	($carry,"eax");
-												Non-SSE2 path to bn_mul_mont. But it's disabled, because it currently
doesn't give performance improvement.

											
										
										
											2006-11-27 14:59:35 +00:00
+									&adc	("edx",0);
-												x86-mont.pl sse2 tune-up and integer-only squaring procedure.

											
										
										
											2006-12-22 15:28:07 +00:00
+									 &imul	($word,&DWP($frame,"esp"));		# n0*tp[0]
-												Non-SSE2 path to bn_mul_mont. But it's disabled, because it currently
doesn't give performance improvement.

											
										
										
											2006-11-27 14:59:35 +00:00
 									&xor	($j,$j);
 									&add	("edx",&DWP($frame+4,"esp",$num,4));	# carry+=tp[num]
-												x86-mont.pl sse2 tune-up and integer-only squaring procedure.

											
										
										
											2006-12-22 15:28:07 +00:00
+									&mov	(&DWP($frame,"esp",$num,4),$carry);	# tp[num-1]=
-												Non-SSE2 path to bn_mul_mont. But it's disabled, because it currently
doesn't give performance improvement.

											
										
										
											2006-11-27 14:59:35 +00:00
+									&adc	($j,0);
-												x86-mont.pl sse2 tune-up and integer-only squaring procedure.

											
										
										
											2006-12-22 15:28:07 +00:00
+									 &mov	("eax",&DWP(0,$inp));			# np[0]
-												Non-SSE2 path to bn_mul_mont. But it's disabled, because it currently
doesn't give performance improvement.

											
										
										
											2006-11-27 14:59:35 +00:00
+									&mov	(&DWP($frame+4,"esp",$num,4),"edx");	# tp[num]=
 									&mov	(&DWP($frame+8,"esp",$num,4),$j);	# tp[num+1]=
 									&mul	($word);				# np[0]*m
 									&add	("eax",&DWP($frame,"esp"));		# +=tp[0]
-												x86-mont.pl sse2 tune-up and integer-only squaring procedure.

											
										
										
											2006-12-22 15:28:07 +00:00
+									&mov	("eax",&DWP(4,$inp));			# np[1]
-												Non-SSE2 path to bn_mul_mont. But it's disabled, because it currently
doesn't give performance improvement.

											
										
										
											2006-11-27 14:59:35 +00:00
+									&adc	("edx",0);
 									&mov	($j,1);
-												x86-mont.pl sse2 tune-up and integer-only squaring procedure.

											
										
										
											2006-12-22 15:28:07 +00:00
-												Non-SSE2 path to bn_mul_mont. But it's disabled, because it currently
doesn't give performance improvement.

											
										
										
											2006-11-27 14:59:35 +00:00
+								&set_label("2ndmadd",16);
 									&mov	($carry,"edx");
 									&mul	($word);				# np[j]*m
-												x86-mont.pl sse2 tune-up and integer-only squaring procedure.

											
										
										
											2006-12-22 15:28:07 +00:00
+									&add	($carry,&DWP($frame,"esp",$j,4));	# +=tp[j]
-												Non-SSE2 path to bn_mul_mont. But it's disabled, because it currently
doesn't give performance improvement.

											
										
										
											2006-11-27 14:59:35 +00:00
+									&lea	($j,&DWP(1,$j));
 									&adc	("edx",0);
-												x86-mont.pl sse2 tune-up and integer-only squaring procedure.

											
										
										
											2006-12-22 15:28:07 +00:00
+									&add	($carry,"eax");
 									&mov	("eax",&DWP(0,$inp,$j,4));		# np[j+1]
-												Non-SSE2 path to bn_mul_mont. But it's disabled, because it currently
doesn't give performance improvement.

											
										
										
											2006-11-27 14:59:35 +00:00
+									&adc	("edx",0);
 									&cmp	($j,$num);
-												x86-mont.pl sse2 tune-up and integer-only squaring procedure.

											
										
										
											2006-12-22 15:28:07 +00:00
+									&mov	(&DWP($frame-8,"esp",$j,4),$carry);	# tp[j-1]=
 									&jl	(&label("2ndmadd"));
-												Non-SSE2 path to bn_mul_mont. But it's disabled, because it currently
doesn't give performance improvement.

											
										
										
											2006-11-27 14:59:35 +00:00
 									&mov	($carry,"edx");
-												x86-mont.pl sse2 tune-up and integer-only squaring procedure.

											
										
										
											2006-12-22 15:28:07 +00:00
+									&mul	($word);				# np[j]*m
-												Fix for "strange errors" exposed by ccgost engine. The fix is
two extra instructions in sqradd loop at line #503.

											
										
										
											2006-12-27 10:59:51 +00:00
+									&add	($carry,&DWP($frame,"esp",$num,4));	# +=tp[num-1]
-												Non-SSE2 path to bn_mul_mont. But it's disabled, because it currently
doesn't give performance improvement.

											
										
										
											2006-11-27 14:59:35 +00:00
+									&adc	("edx",0);
-												x86-mont.pl sse2 tune-up and integer-only squaring procedure.

											
										
										
											2006-12-22 15:28:07 +00:00
+									&add	($carry,"eax");
-												Non-SSE2 path to bn_mul_mont. But it's disabled, because it currently
doesn't give performance improvement.

											
										
										
											2006-11-27 14:59:35 +00:00
+									&adc	("edx",0);
-												x86-mont.pl sse2 tune-up and integer-only squaring procedure.

											
										
										
											2006-12-22 15:28:07 +00:00
+									&mov	(&DWP($frame-4,"esp",$num,4),$carry);	# tp[num-2]=
-												Non-SSE2 path to bn_mul_mont. But it's disabled, because it currently
doesn't give performance improvement.

											
										
										
											2006-11-27 14:59:35 +00:00
 									&xor	("eax","eax");
-												x86-mont.pl sse2 tune-up and integer-only squaring procedure.

											
										
										
											2006-12-22 15:28:07 +00:00
+									 &mov	($j,$_bp);				# &bp[i]
-												Non-SSE2 path to bn_mul_mont. But it's disabled, because it currently
doesn't give performance improvement.

											
										
										
											2006-11-27 14:59:35 +00:00
+									&add	("edx",&DWP($frame+4,"esp",$num,4));	# carry+=tp[num]
 									&adc	("eax",&DWP($frame+8,"esp",$num,4));	# +=tp[num+1]
-												x86-mont.pl sse2 tune-up and integer-only squaring procedure.

											
										
										
											2006-12-22 15:28:07 +00:00
+									 &lea	($j,&DWP(4,$j));
-												Non-SSE2 path to bn_mul_mont. But it's disabled, because it currently
doesn't give performance improvement.

											
										
										
											2006-11-27 14:59:35 +00:00
+									&mov	(&DWP($frame,"esp",$num,4),"edx");	# tp[num-1]=
-												x86-mont.pl sse2 tune-up and integer-only squaring procedure.

											
										
										
											2006-12-22 15:28:07 +00:00
+									 &cmp	($j,$_bpend);
-												Non-SSE2 path to bn_mul_mont. But it's disabled, because it currently
doesn't give performance improvement.

											
										
										
											2006-11-27 14:59:35 +00:00
+									&mov	(&DWP($frame+4,"esp",$num,4),"eax");	# tp[num]=
-												Fix for "strange errors" exposed by ccgost engine. The fix is
two extra instructions in sqradd loop at line #503.

											
										
										
											2006-12-27 10:59:51 +00:00
+									&je	(&label("common_tail"));
-												x86-mont.pl sse2 tune-up and integer-only squaring procedure.

											
										
										
											2006-12-22 15:28:07 +00:00
-												Fix for "strange errors" exposed by ccgost engine. The fix is
two extra instructions in sqradd loop at line #503.

											
										
										
											2006-12-27 10:59:51 +00:00
+									&mov	($word,&DWP(0,$j));			# bp[i+1]
-												Non-SSE2 path to bn_mul_mont. But it's disabled, because it currently
doesn't give performance improvement.

											
										
										
											2006-11-27 14:59:35 +00:00
+									&mov	($inp,$_ap);
-												x86-mont.pl sse2 tune-up and integer-only squaring procedure.

											
										
										
											2006-12-22 15:28:07 +00:00
+									&mov	($_bp,$j);				# &bp[++i]
-												Non-SSE2 path to bn_mul_mont. But it's disabled, because it currently
doesn't give performance improvement.

											
										
										
											2006-11-27 14:59:35 +00:00
+									&xor	($j,$j);
 									&xor	("edx","edx");
-												x86-mont.pl sse2 tune-up and integer-only squaring procedure.

											
										
										
											2006-12-22 15:28:07 +00:00
+									&mov	("eax",&DWP(0,$inp));
-												Non-SSE2 path to bn_mul_mont. But it's disabled, because it currently
doesn't give performance improvement.

											
										
										
											2006-11-27 14:59:35 +00:00
+									&jmp	(&label("1stmadd"));
-												x86-mont.pl sse2 tune-up and integer-only squaring procedure.

											
										
										
											2006-12-22 15:28:07 +00:00
 								# Dedicated squaring path. The first pass builds tp[] from
 								# ap[0]*ap[0] plus the doubled cross products ap[j]*ap[0].
 								# Doubling is done on the fly: each low product word is
 								# shifted left by one, and the bit shifted out is carried to
 								# the next word in $sbit.
 								&set_label("bn_sqr_mont",16);
 								# Generation-time alias: $sbit names the same register as $num,
 								# so the register stops holding num as soon as $sbit is written.
 								# That is why num is saved to $_num first and the loops below
 								# compare against $_num instead of $num.
 								$sbit=$num;
 									&mov	($_num,$num);
 									&mov	($_bp,$j);				# i=0
 									&mov	("eax",$word);				# ap[0]
 									&mul	($word);				# ap[0]*ap[0]
 									&mov	(&DWP($frame,"esp"),"eax");		# tp[0]=
 									# Split the high word of ap[0]^2: bit 0 goes to $sbit, the
 									# rest (halved) becomes the running carry in edx, so that the
 									# doubling in the loop below restores it.
 									&mov	($sbit,"edx");
 									&shr	("edx",1);
 									&and	($sbit,1);
 									&inc	($j);
 								&set_label("sqr",16);
 									&mov	("eax",&DWP(0,$inp,$j,4));		# ap[j]
 									&mov	($carry,"edx");
 									&mul	($word);				# ap[j]*ap[0]
 									&add	("eax",$carry);
 									# lea is used for j++ because it leaves CF from the add
 									# above intact for the adc that follows.
 									&lea	($j,&DWP(1,$j));
 									&adc	("edx",0);
 									# $carry = 2*eax + $sbit; the bit lost by the doubling is
 									# then extracted from eax and becomes the next $sbit.
 									&lea	($carry,&DWP(0,$sbit,"eax",2));
 									&shr	("eax",31);
 									&cmp	($j,$_num);
 									# The two mov's below do not touch flags, so jl still
 									# tests the cmp above.
 									&mov	($sbit,"eax");
 									&mov	(&DWP($frame-4,"esp",$j,4),$carry);	# tp[j]=
 									&jl	(&label("sqr"));
 									# Last cross product. Instructions indented by one extra
 									# space look like the set-up for the reduction pass
 									# (n0, np, m=n0*tp[0], np[0]) interleaved with the tail of
 									# the squaring pass -- NOTE(review): presumably for
 									# scheduling; confirm.
 									&mov	("eax",&DWP(0,$inp,$j,4));		# ap[num-1]
 									&mov	($carry,"edx");
 									&mul	($word);				# ap[num-1]*ap[0]
 									&add	("eax",$carry);
 									 &mov	($word,$_n0);
 									&adc	("edx",0);
 									 &mov	($inp,$_np);
 									&lea	($carry,&DWP(0,$sbit,"eax",2));
 									 &imul	($word,&DWP($frame,"esp"));		# n0*tp[0]
 									&shr	("eax",31);
 									&mov	(&DWP($frame,"esp",$j,4),$carry);	# tp[num-1]=
 									# Double the high word as well: tp[num] = 2*edx + bit carried
 									# out of the low word, tp[num+1] = bit carried out of edx.
 									&lea	($carry,&DWP(0,"eax","edx",2));
 									 &mov	("eax",&DWP(0,$inp));			# np[0]
 									&shr	("edx",31);
 									&mov	(&DWP($frame+4,"esp",$j,4),$carry);	# tp[num]=
 									&mov	(&DWP($frame+8,"esp",$j,4),"edx");	# tp[num+1]=
 									&mul	($word);				# np[0]*m
 									# Only the carry out of np[0]*m+tp[0] is kept: eax is
 									# overwritten with np[1] right after the adc.
 									&add	("eax",&DWP($frame,"esp"));		# +=tp[0]
 									# Reload $num from $j, since the register was used as $sbit.
 									&mov	($num,$j);
 									&adc	("edx",0);
 									&mov	("eax",&DWP(4,$inp));			# np[1]
 									&mov	($j,1);
 								# Reduction pass: adds m*np[] to tp[] and stores each result
 								# one word lower (tp[j-1]=...). The loop body handles two
 								# words, np[j] and np[j+1], per iteration.
 								&set_label("3rdmadd",16);
 									&mov	($carry,"edx");
 									&mul	($word);				# np[j]*m
 									&add	($carry,&DWP($frame,"esp",$j,4));	# +=tp[j]
 									&adc	("edx",0);
 									&add	($carry,"eax");
 									&mov	("eax",&DWP(4,$inp,$j,4));		# np[j+1]
 									&adc	("edx",0);
 									&mov	(&DWP($frame-4,"esp",$j,4),$carry);	# tp[j-1]=
 									&mov	($carry,"edx");
 									&mul	($word);				# np[j+1]*m
 									&add	($carry,&DWP($frame+4,"esp",$j,4));	# +=tp[j+1]
 									# j+=2 via lea so that CF from the add survives to the adc.
 									&lea	($j,&DWP(2,$j));
 									&adc	("edx",0);
 									&add	($carry,"eax");
 									&mov	("eax",&DWP(0,$inp,$j,4));		# np[j+2]
 									&adc	("edx",0);
 									&cmp	($j,$num);
 									# $j was already advanced by 2, hence the -8 displacement.
 									&mov	(&DWP($frame-8,"esp",$j,4),$carry);	# tp[j]=
 									&jl	(&label("3rdmadd"));
 									# Final word of the reduction pass.
 									&mov	($carry,"edx");
 									&mul	($word);				# np[j]*m
 									&add	($carry,&DWP($frame,"esp",$num,4));	# +=tp[num-1]
 									&adc	("edx",0);
 									&add	($carry,"eax");
 									&adc	("edx",0);
 									&mov	(&DWP($frame-4,"esp",$num,4),$carry);	# tp[num-2]=
 									# Fold the remaining carry into the two top words and move
 									# them down one slot; also reload i and the ap pointer for
 									# the next outer iteration.
 									&mov	($j,$_bp);				# i
 									&xor	("eax","eax");
 									&mov	($inp,$_ap);
 									&add	("edx",&DWP($frame+4,"esp",$num,4));	# carry+=tp[num]
 									&adc	("eax",&DWP($frame+8,"esp",$num,4));	# +=tp[num+1]
 									&mov	(&DWP($frame,"esp",$num,4),"edx");	# tp[num-1]=
 									# Compare i against $num; the mov below leaves the flags
 									# untouched for the conditional jump that follows.
 									&cmp	($j,$num);
 									&mov	(&DWP($frame+4,"esp",$num,4),"eax");	# tp[num]=
-												Fix for "strange errors" exposed by ccgost engine. The fix is
two extra instructions in sqradd loop at line #503.

											
										
										
											2006-12-27 10:59:51 +00:00
+									&je	(&label("common_tail"));
-												x86-mont.pl sse2 tune-up and integer-only squaring procedure.

											
										
										
											2006-12-22 15:28:07 +00:00
 									&mov	($word,&DWP(4,$inp,$j,4));		# ap[i]
 									&lea	($j,&DWP(1,$j));
 									&mov	("eax",$word);
 									&mov	($_bp,$j);				# ++i
 									&mul	($word);				# ap[i]*ap[i]
 									&add	("eax",&DWP($frame,"esp",$j,4));	# +=tp[i]
 									&adc	("edx",0);
 									&mov	(&DWP($frame,"esp",$j,4),"eax");	# tp[i]=
 									&xor	($carry,$carry);
 									&cmp	($j,$num);
 									&lea	($j,&DWP(1,$j));
 									&je	(&label("sqrlast"));
-												Non-SSE2 path to bn_mul_mont. But it's disabled, because it currently
doesn't give performance improvement.

											
										
										
											2006-11-27 14:59:35 +00:00
-												x86-mont.pl sse2 tune-up and integer-only squaring procedure.

											
										
										
											2006-12-22 15:28:07 +00:00
+									&mov	($sbit,"edx");				# zaps $num
 									&shr	("edx",1);
 									&and	($sbit,1);
 								&set_label("sqradd",16);
 									&mov	("eax",&DWP(0,$inp,$j,4));		# ap[j]
 									&mov	($carry,"edx");
 									&mul	($word);				# ap[j]*ap[i]
 									&add	("eax",$carry);
-												Minor performance improvements to x86-mont.pl.

											
										
										
											2006-12-28 12:43:16 +00:00
+									&lea	($carry,&DWP(0,"eax","eax"));
-												x86-mont.pl sse2 tune-up and integer-only squaring procedure.

											
										
										
											2006-12-22 15:28:07 +00:00
+									&adc	("edx",0);
 									&shr	("eax",31);
-												Minor performance improvements to x86-mont.pl.

											
										
										
											2006-12-28 12:43:16 +00:00
+									&add	($carry,&DWP($frame,"esp",$j,4));	# +=tp[j]
 									&lea	($j,&DWP(1,$j));
-												Fix for "strange errors" exposed by ccgost engine. The fix is
two extra instructions in sqradd loop at line #503.

											
										
										
											2006-12-27 10:59:51 +00:00
+									&adc	("eax",0);
-												Minor performance improvements to x86-mont.pl.

											
										
										
											2006-12-28 12:43:16 +00:00
+									&add	($carry,$sbit);
-												x86-mont.pl sse2 tune-up and integer-only squaring procedure.

											
										
										
											2006-12-22 15:28:07 +00:00
+									&adc	("eax",0);
 									&cmp	($j,$_num);
 									&mov	(&DWP($frame-4,"esp",$j,4),$carry);	# tp[j]=
 									&mov	($sbit,"eax");
 									&jle	(&label("sqradd"));
 									&mov	($carry,"edx");
-												x86-mont.pl: fix bug in integer-only squaring path.
PR: 2648

											
										
										
											2011-12-09 14:21:25 +00:00
+									&add	("edx","edx");
-												x86-mont.pl sse2 tune-up and integer-only squaring procedure.

											
										
										
											2006-12-22 15:28:07 +00:00
+									&shr	($carry,31);
-												x86-mont.pl: fix bug in integer-only squaring path.
PR: 2648

											
										
										
											2011-12-09 14:21:25 +00:00
+									&add	("edx",$sbit);
 									&adc	($carry,0);
-												x86-mont.pl sse2 tune-up and integer-only squaring procedure.

											
										
										
											2006-12-22 15:28:07 +00:00
+								&set_label("sqrlast");
 									&mov	($word,$_n0);
 									&mov	($inp,$_np);
 									&imul	($word,&DWP($frame,"esp"));		# n0*tp[0]
 									&add	("edx",&DWP($frame,"esp",$j,4));	# +=tp[num]
 									&mov	("eax",&DWP(0,$inp));			# np[0]
 									&adc	($carry,0);
 									&mov	(&DWP($frame,"esp",$j,4),"edx");	# tp[num]=
 									&mov	(&DWP($frame+4,"esp",$j,4),$carry);	# tp[num+1]=
 									&mul	($word);				# np[0]*m
 									&add	("eax",&DWP($frame,"esp"));		# +=tp[0]
 									&lea	($num,&DWP(-1,$j));
 									&adc	("edx",0);
 									&mov	($j,1);
 									&mov	("eax",&DWP(4,$inp));			# np[1]
 									&jmp	(&label("3rdmadd"));
-												Non-SSE2 path to bn_mul_mont. But it's disabled, because it currently
doesn't give performance improvement.

											
										
										
											2006-11-27 14:59:35 +00:00
+								}
-												Fix for "strange errors" exposed by ccgost engine. The fix is
two extra instructions in sqradd loop at line #503.

											
										
										
											2006-12-27 10:59:51 +00:00
-												Non-SSE2 path to bn_mul_mont. But it's disabled, because it currently
doesn't give performance improvement.

											
										
										
											2006-11-27 14:59:35 +00:00
+								&set_label("common_tail",16);
-												Eliminate conditional final subtraction in Montgomery assembler modules.

											
										
										
											2007-06-17 17:10:03 +00:00
+									&mov	($np,$_np);			# load modulus pointer
-												Throw in bn/asm/x86-mont.pl Montgomery multiplication "teaser".

											
										
										
											2005-10-09 09:53:58 +00:00
+									&mov	($rp,$_rp);			# load result pointer
-												Eliminate conditional final subtraction in Montgomery assembler modules.

											
										
										
											2007-06-17 17:10:03 +00:00
+									&lea	($tp,&DWP($frame,"esp"));	# [$ap and $bp are zapped]
 									&mov	("eax",&DWP(0,$tp));		# tp[0]
-												x86-mont.pl sse2 tune-up and integer-only squaring procedure.

											
										
										
											2006-12-22 15:28:07 +00:00
+									&mov	($j,$num);			# j=num-1
-												Eliminate conditional final subtraction in Montgomery assembler modules.

											
										
										
											2007-06-17 17:10:03 +00:00
+									&xor	($i,$i);			# i=0 and clear CF!
-												Throw in bn/asm/x86-mont.pl Montgomery multiplication "teaser".

											
										
										
											2005-10-09 09:53:58 +00:00
-												Non-SSE2 path to bn_mul_mont. But it's disabled, because it currently
doesn't give performance improvement.

											
										
										
											2006-11-27 14:59:35 +00:00
+								&set_label("sub",16);
-												Throw in bn/asm/x86-mont.pl Montgomery multiplication "teaser".

											
										
										
											2005-10-09 09:53:58 +00:00
+									&sbb	("eax",&DWP(0,$np,$i,4));
 									&mov	(&DWP(0,$rp,$i,4),"eax");	# rp[i]=tp[i]-np[i]
 									&dec	($j);				# doesn't affect CF!
-												Eliminate conditional final subtraction in Montgomery assembler modules.

											
										
										
											2007-06-17 17:10:03 +00:00
+									&mov	("eax",&DWP(4,$tp,$i,4));	# tp[i+1]
 									&lea	($i,&DWP(1,$i));		# i++
-												Throw in bn/asm/x86-mont.pl Montgomery multiplication "teaser".

											
										
										
											2005-10-09 09:53:58 +00:00
+									&jge	(&label("sub"));
-												Eliminate conditional final subtraction in Montgomery assembler modules.

											
										
										
											2007-06-17 17:10:03 +00:00
 									# Branch-free choice of the final result. eax holds the word
 									# fetched by the last tp[i+1] load of the "sub" loop; the
 									# borrow left by that loop is subtracted from it.
 									# NOTE(review): the and/not/and/or sequence below assumes eax
 									# is then either all-zeros or all-ones, so that it works as
 									# a select mask -- confirm against the range of tp[num].
 									&sbb	("eax",0);			# handle upmost overflow bit
 									&and	($tp,"eax");
 									&not	("eax");
 									&mov	($np,$rp);
 									&and	($np,"eax");
 									&or	($tp,$np);			# tp=carry?tp:rp
 								# Walks $num down to 0 (dec/jge), copying from the selected
 								# source into rp[] and overwriting each temporary stack word
 								# with the current contents of $j.
 								&set_label("copy",16);				# copy or in-place refresh
 									&mov	("eax",&DWP(0,$tp,$num,4));
 									&mov	(&DWP(0,$rp,$num,4),"eax");	# rp[i]=tp[i]
 									&mov	(&DWP($frame,"esp",$num,4),$j);	# zap temporary vector
 									&dec	($num);
 									&jge	(&label("copy"));
-												Throw in bn/asm/x86-mont.pl Montgomery multiplication "teaser".

											
										
										
											2005-10-09 09:53:58 +00:00
+									&mov	("esp",$_sp);		# pull saved stack pointer
 									&mov	("eax",1);
-												Non-SSE2 path to bn_mul_mont. But it's disabled, because it currently
doesn't give performance improvement.

											
										
										
											2006-11-27 14:59:35 +00:00
+								&set_label("just_leave");
-												Throw in bn/asm/x86-mont.pl Montgomery multiplication "teaser".

											
										
										
											2005-10-09 09:53:58 +00:00
+								&function_end("bn_mul_mont");
-												x86-mont.pl sse2 tune-up and integer-only squaring procedure.

											
										
										
											2006-12-22 15:28:07 +00:00
+								&asciz("Montgomery Multiplication for x86, CRYPTOGAMS by <appro\@openssl.org>");
-												Throw in bn/asm/x86-mont.pl Montgomery multiplication "teaser".

											
										
										
											2005-10-09 09:53:58 +00:00
+								&asm_finish();
-												Unified - adapt the generation of bignum assembler to use GENERATE

This gets rid of the BEGINRAW..ENDRAW sections in crypto/bn/build.info.

This also moves the assembler generating perl scripts to take the
output file name as last command line argument, where necessary.

Reviewed-by: Rich Salz <rsalz@openssl.org>

											
										
										
											2016-03-07 14:41:33 +00:00
 								close STDOUT;