gcm128.c and assembler modules: change argument order for gcm_ghash_4bit.

ghash-x86*.pl: fix performance numbers for Core2; it turned out that the previous ones were "tainted" by variable clock frequency.
2010-04-14 19:04:51 +00:00 · 2010-04-14 19:04:51 +00:00 · 4f39edbff1
commit 4f39edbff1
parent 8decc967dc
6 changed files with 38 additions and 47 deletions
--- a/crypto/modes/asm/ghash-alpha.pl
+++ b/crypto/modes/asm/ghash-alpha.pl
@ -31,10 +31,10 @@ $Thi1="t5";
 $Tlo1="t6";
 $rem="t7";	# $8
 #################
-$Xi="a0";	# $16
+$Xi="a0";	# $16, input argument block
 $Htbl="a1";
-
-
+$inp="a2";
+$len="a3";
 $nlo="a4";	# $20
 $nhi="a5";
 $Zhi="t8";
@ -314,12 +314,6 @@ $code.=<<___;
 .end	gcm_gmult_4bit
 ___

-# argument block for gcm_ghash_4bit
-$inp="a0";	# $16
-$len="a1";
-$Xi ="a2";
-$Htbl="a3";
-
 $inhi="s0";
 $inlo="s1";

--- a/crypto/modes/asm/ghash-ia64.pl
+++ b/crypto/modes/asm/ghash-ia64.pl
@ -142,13 +142,13 @@ gcm_ghash_4bit:
 	.prologue
 { .mmi;	.save	ar.pfs,prevfs
 	alloc	prevfs=ar.pfs,4,4,0,8
-	$ADDP	inp=15,in0			// &inp[15]
+	$ADDP	inp=15,in2			// &inp[15]
 	mov	rem_4bitp=ip		}
-{ .mmi;	$ADDP	end=in1,in0			// &inp[len]
-	$ADDP	Xi=15,in2			// &Xi[15]
+{ .mmi;	$ADDP	end=in3,in2			// &inp[len]
+	$ADDP	Xi=15,in0			// &Xi[15]
 	.save	ar.lc,prevlc
 	mov	prevlc=ar.lc		};;
-{ .mmi;	$ADDP	Htbl=8,in3			// &Htbl[0].lo
+{ .mmi;	$ADDP	Htbl=8,in1			// &Htbl[0].lo
 	mov	mask0xf0=0xf0
 	.save	pr,prevpr
 	mov	prevpr=pr		}
--- a/crypto/modes/asm/ghash-sparcv9.pl
+++ b/crypto/modes/asm/ghash-sparcv9.pl
@ -54,10 +54,10 @@ $remi="%l5";
 $Htblo="%l6";
 $cnt="%l7";

-$inp="%i0";	# input arguments for gcm_ghash_4bit
-$len="%i1";
-$Xi="%i2";
-$Htbl="%i3";
+$Xi="%i0";	# input argument block
+$Htbl="%i1";
+$inp="%i2";
+$len="%i3";

 $code.=<<___;
 .section	".text",#alloc,#execinstr
@ -208,8 +208,6 @@ gcm_ghash_4bit:
 .size	gcm_ghash_4bit,(.-gcm_ghash_4bit)
 ___

-$Xi="%i0";	# input arguments for gcm_gmult_4bit
-$Htbl="%i1";
 undef $inp;
 undef $len;

--- a/crypto/modes/asm/ghash-x86.pl
+++ b/crypto/modes/asm/ghash-x86.pl
@ -23,7 +23,7 @@
 # PIII		63 /77		16		24
 # P4		96 /122		30		84(***)
 # Opteron	50 /71		21		30
-# Core2		63 /102		19		28
+# Core2		54 /68		13		18
 #
 # (*)	gcc 3.4.x was observed to generate few percent slower code,
 #	which is one of reasons why 2.95.3 results were chosen,
@ -317,12 +317,12 @@ if ($unroll) {

 	&lea	("eax",&DWP(&label("rem_4bit")."-".&label("pic_point"),"eax"));

-	&mov	($inp,&wparam(0));	# load in
-	&mov	($Zlh,&wparam(1));	# load len
-	&mov	($Zhh,&wparam(2));	# load Xi
-	&mov	($Htbl,&wparam(3));	# load Htable
+	&mov	($Zhh,&wparam(0));	# load Xi
+	&mov	($Htbl,&wparam(1));	# load Htable
+	&mov	($inp,&wparam(2));	# load in
+	&mov	($Zlh,&wparam(3));	# load len
 	&add	($Zlh,$inp);
-	&mov	(&wparam(1),$Zlh);	# len to point at the end of input
+	&mov	(&wparam(3),$Zlh);	# len to point at the end of input
 	&stack_push(4+1);		# +1 for stack alignment
 	&mov	($Zll,&DWP(12,$Zhh));	# load Xi[16]
 	&mov	($Zhl,&DWP(4,$Zhh));
@ -344,10 +344,10 @@ if ($unroll) {
 	&mmx_loop("esp","eax");

 	&lea	($inp,&DWP(16,$inp));
-	&cmp	($inp,&wparam(1));
+	&cmp	($inp,&wparam(3));
 	&jb	(&label("mmx_outer_loop"));

-	&mov	($inp,&wparam(2));	# load Xi
+	&mov	($inp,&wparam(0));	# load Xi
 	&emms	();
 	&mov	(&DWP(12,$inp),$Zll);
 	&mov	(&DWP(4,$inp),$Zhl);
@ -359,12 +359,12 @@ if ($unroll) {
    &set_label("x86",16);
    }
 	&stack_push(16+4+1);			# +1 for 64-bit alignment
-	&mov	($inp,&wparam(0));		# load in
-	&mov	("ecx",&wparam(1));		# load len
-	&mov	($Zll,&wparam(2));		# load Xi
-	&mov	($Htbl,&wparam(3));		# load Htable
+	&mov	($Zll,&wparam(0));		# load Xi
+	&mov	($Htbl,&wparam(1));		# load Htable
+	&mov	($inp,&wparam(2));		# load in
+	&mov	("ecx",&wparam(3));		# load len
 	&add	("ecx",$inp);
-	&mov	(&wparam(1),"ecx");
+	&mov	(&wparam(3),"ecx");

 	&mov	($Zhh,&DWP(0,$Zll));		# load Xi[16]
 	&mov	($Zhl,&DWP(4,$Zll));
@ -390,14 +390,14 @@ if ($unroll) {
 		&call	("_x86_gmult_4bit_inner");
 	} else {
 		&x86_loop(0);
-		&mov	($inp,&wparam(0));
+		&mov	($inp,&wparam(2));
 	}
 	&lea	($inp,&DWP(16,$inp));
-	&cmp	($inp,&wparam(1));
-	&mov	(&wparam(0),$inp)	if (!$unroll);
+	&cmp	($inp,&wparam(3));
+	&mov	(&wparam(2),$inp)	if (!$unroll);
 	&jb	(&label("x86_outer_loop"));

-	&mov	($inp,&wparam(2));	# load Xi
+	&mov	($inp,&wparam(0));	# load Xi
 	&mov	(&DWP(12,$inp),$Zll);
 	&mov	(&DWP(8,$inp),$Zlh);
 	&mov	(&DWP(4,$inp),$Zhl);
--- a/crypto/modes/asm/ghash-x86_64.pl
+++ b/crypto/modes/asm/ghash-x86_64.pl
@ -18,7 +18,7 @@
 #		gcc 3.4.x	assembler
 #
 # Opteron	18.5		10.2		+80%
-# Core2		26.0		16.4		+58%
+# Core2		17.5		11.0		+59%

 $flavour = shift;
 $output  = shift;
@ -41,10 +41,10 @@ $Zhi="%r9";
 $tmp="%r10";
 $rem_4bit = "%r11";

-# per-function register layout
 $Xi="%rdi";
 $Htbl="%rsi";

+# per-function register layout
 $cnt="%rcx";
 $rem="%rdx";

@ -159,10 +159,8 @@ ___


 # per-function register layout
-$inp="%rdi";
-$len="%rsi";
-$Xi="%rdx";
-$Htbl="%rcx";
+$inp="%rdx";
+$len="%rcx";

 $cnt="%rbp";
 $rem="%r12";
--- a/crypto/modes/gcm128.c
+++ b/crypto/modes/gcm128.c
@ -339,7 +339,7 @@ static const size_t rem_4bit[16] = {
 	PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560),
 	PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0) };

-static void gcm_gmult_4bit(u64 Xi[2], u128 Htable[16])
+static void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])
 {
 	u128 Z;
 	int cnt = 15;
@ -410,7 +410,8 @@ static void gcm_gmult_4bit(u64 Xi[2], u128 Htable[16])
 * mostly as reference and a placeholder for possible future
 * non-trivial optimization[s]...
 */
-static void gcm_ghash_4bit(const u8 *inp,size_t len,u64 Xi[2], u128 Htable[16])
+static void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],
+				const u8 *inp,size_t len)
 {
    u128 Z;
    int cnt;
@ -479,13 +480,13 @@ static void gcm_ghash_4bit(const u8 *inp,size_t len,u64 Xi[2], u128 Htable[16])
 }
 #endif
 #else
-void gcm_gmult_4bit(u64 Xi[2],u128 Htable[16]);
-void gcm_ghash_4bit(const u8 *inp,size_t len,u64 Xi[2],u128 Htable[16]);
+void gcm_gmult_4bit(u64 Xi[2],const u128 Htable[16]);
+void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
 #endif

 #define GCM_MUL(ctx,Xi)   gcm_gmult_4bit(ctx->Xi.u,ctx->Htable)
 #if defined(GHASH_ASM) || !defined(OPENSSL_SMALL_FOOTPRINT)
-#define GHASH(in,len,ctx) gcm_ghash_4bit(in,len,(ctx)->Xi.u,(ctx)->Htable)
+#define GHASH(in,len,ctx) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len)
 /* GHASH_CHUNK is "stride parameter" missioned to mitigate cache
 * trashing effect. In other words idea is to hash data while it's
 * still in L1 cache after encryption pass... */