openssl/crypto/rc4/asm/rc4-586.pl

#!/usr/local/bin/perl

# At some point it became apparent that the original SSLeay RC4
# assembler implementation performs suboptimally on the latest IA-32
# microarchitectures. After re-tuning, performance has changed as
# follows:
#
# Pentium	+0%
# Pentium III	+17%
# AMD		+52%(*)
# P4		+180%(**)
#
# (*)	This number is actually a trade-off:-) It's possible to
#	achieve	+72%, but at the cost of -48% off PIII performance.
#	In other words code performing further 13% faster on AMD
#	would perform almost 2 times slower on Intel PIII...
#	For reference! This code delivers ~80% of rc4-amd64.pl
#	performance on the same Opteron machine.
# (**)	This number requires compressed key schedule set up by
#	RC4_set_key and therefore doesn't apply to 0.9.7 [option for
#	compressed key schedule is implemented in 0.9.8 and later,
#	see commentary section in rc4_skey.c for further details].
#
#					<appro@fy.chalmers.se>

push(@INC,"perlasm","../../perlasm");
require "x86asm.pl";

# Open the assembler output; the target flavour comes from the command line.
&asm_init($ARGV[0],"rc4-586.pl");

# Register allocation used by the generator subs below (package globals,
# because RC4() and RC4_loop() read them by name):
#   $x,  $y   - the two RC4 state indices
#   $tx, $ty  - table values / scratch
#   $in, $out - input and output pointers
#   $d        - pointer into the key schedule
($x,  $y)   = ("eax", "ebx");
($tx, $ty)  = ("ecx", "edx");
($in, $out) = ("esi", "edi");
$d          = "ebp";

# Emit the single routine of this module, then flush the output.
&RC4("RC4");
&asm_finish();

# Emit one RC4 round, i.e. the code producing one keystream byte.
#
#   $n    - round number inside the current group; appears in the emitted
#           comment and is the byte offset at which this round's result
#           is stored.
#   $p    - position of the round within its group: -1 for the first
#           round, 0 for a middle round, 1 for the last round.
#   $char - 0: round of the 8-byte block loop; the keystream byte is
#              parked at esp+$n and xored with the input later, a dword
#              at a time, by the caller;
#           1: round of the byte-wise tail; the round first tests the end
#              pointer (leaving for "finished" when the input is used up)
#              and then xors and stores one byte itself.
#
# On entry $tx must already hold S[$x]. Every round except the last one of
# a group ($p >= 1) loads S[$x] for its successor.
#
# NOTE(review): the alternating one-space indent looks like the original
# author's marker for instruction pairing; the emit order is
# performance-tuned, so do not reorder these calls casually.
sub RC4_loop
	{
	local($n,$p,$char)=@_;

	&comment("Round $n");

	if ($char)
		{
		if ($p >= 0)
			{
			# Later tail rounds: reload the end pointer saved by the
			# first tail round and stop once $in has reached it.
			 &mov($ty,	&swtmp(2));
			&cmp($ty,	$in);
			 &jbe(&label("finished"));
			&inc($in);
			}
		else
			{
			# First tail round: $ty still holds the block-loop limit
			# (end - 8, see RC4()); add 8 to get the real end pointer,
			# bail out if no byte is left, and keep the end pointer in
			# swtmp(2) for the following rounds.
			&add($ty,	8);
			 &inc($in);
			&cmp($ty,	$in);
			 &jb(&label("finished"));
			&mov(&swtmp(2),	$ty);
			}
		}
	# The load of S[x] for the first round was moved out to the caller:
	# &mov(	$tx,		&DWP(0,$d,$x,4)) if $p < 0;

	&add(	&LB($y),	&LB($tx));	# y = (y + S[x]) mod 256
	&mov(	$ty,		&DWP(0,$d,$y,4));	# ty = S[y]
	 # XXX
	&mov(	&DWP(0,$d,$x,4),$ty);		# S[x] = old S[y]
	 &add(	$ty,		$tx);		# ty = S[x] + S[y] ...
	&mov(	&DWP(0,$d,$y,4),$tx);		# S[y] = old S[x]
	 &and(	$ty,		0xff);		# ... reduced mod 256
	 &inc(	&LB($x));			# NEXT ROUND
	&mov(	$tx,		&DWP(0,$d,$x,4)) if $p < 1; # NEXT ROUND
	 &mov(	$ty,		&DWP(0,$d,$ty,4));	# keystream byte

	if (!$char)
		{
		# In the last round of a block the output pointer is advanced
		# here instead of in the caller ("moved up into last round").
		if ($p >= 1)
			{
			&add(	$out,	8)
			}
		# park the keystream byte in the stack temporaries
		&movb(	&BP($n,"esp","",0),	&LB($ty));
		}
	else
		{
		# $in has already been advanced past the current byte (see the
		# inc above), hence the -1 offset.
		&movb(	&HB($ty),	&BP(-1,$in,"",0));
		 # XXX
		&xorb(&LB($ty),		&HB($ty));
		 # XXX
		&movb(&BP($n,$out,"",0),&LB($ty));
		}
	}


# Emit the complete RC4 routine under the symbol $name.
#
# The generated function reads four stack arguments, in this order:
# key schedule pointer, length in bytes, input pointer, output pointer.
# The x and y indices are loaded from byte offsets 0 and 4 of the key
# schedule and written back there on exit; the table itself starts at
# offset 8.
#
# Three code paths are generated:
#   - "start":    8 bytes per iteration, table indexed with scale 4;
#   - "end":      byte-wise tail of up to 7 rounds, same table layout;
#   - "RC4_CHAR": byte-wide ("compressed") table, one byte per iteration,
#                 selected at run time when the dword at table offset 256
#                 equals -1.
# All paths leave through the common "finished" label.
sub RC4
	{
	local($name)=@_;

	&function_begin_B($name,"");

	# Zero-length request: return right away, before any register has
	# been pushed.
	&mov($ty,&wparam(1));		# len
	&cmp($ty,0);
	&jne(&label("proceed"));
	&ret();
	&set_label("proceed");

	&comment("");

	# Save the registers used below, interleaved with clearing the
	# index registers and fetching the arguments.
	&push("ebp");
	 &push("ebx");
	&push("esi");
	 &xor(	$x,	$x);		# avoid partial register stalls
	&push("edi");
	 &xor(	$y,	$y);		# avoid partial register stalls
	&mov(	$d,	&wparam(0));	# key
	 &mov(	$in,	&wparam(2));

	# current x and y indices, kept in front of the table
	&movb(	&LB($x),	&BP(0,$d,"",1));
	 &movb(	&LB($y),	&BP(4,$d,"",1));

	&mov(	$out,	&wparam(3));
	 &inc(	&LB($x));		# x is kept one step ahead; undone at "finished"

	&stack_push(3);	# 3 temp variables
	 &add(	$d,	8);		# $d now points at the table itself

	# detect compressed schedule, see commentary section in rc4_skey.c...
	# in 0.9.7 context ~50 bytes below RC4_CHAR label remain redundant,
	# as compressed key schedule is set up in 0.9.8 and later.
	&cmp(&DWP(256,$d),-1);
	&je(&label("RC4_CHAR"));

	# $ty = in + len - 8: the last input address at which a full
	# 8-byte block can still start.
	 &lea(	$ty,	&DWP(-8,$ty,$in));

	# Fewer than 8 bytes in total? Then skip the block loop and go
	# straight to the byte-wise tail.

	 &mov(	&swtmp(2),	$ty);	# this is now address to exit at
	&mov(	$tx,	&DWP(0,$d,$x,4));	# S[x] for the first round

	 &cmp(	$ty,	$in);
	&jb(	&label("end")); # less than 8 bytes

	&set_label("start");

	# filling DELAY SLOT
	&add(	$in,	8);

	# Eight rounds; each parks its keystream byte at esp+0..esp+7.
	# The last round ($p == 1) also advances $out by 8 and does not
	# preload S[x].
	&RC4_loop(0,-1,0);
	&RC4_loop(1,0,0);
	&RC4_loop(2,0,0);
	&RC4_loop(3,0,0);
	&RC4_loop(4,0,0);
	&RC4_loop(5,0,0);
	&RC4_loop(6,0,0);
	&RC4_loop(7,1,0);
	
	&comment("apply the cipher text");
	# xor the cipher data with input, two dwords at a time; $in and $out
	# already point past the block, hence the negative offsets.
	# NOTE(review): swtmp(0)/swtmp(1) are presumably the same stack bytes
	# the rounds wrote via esp+0..7 - confirm against x86asm.pl.

	#&add(	$out,	8); #moved up into last round

	&mov(	$tx,	&swtmp(0));
	 &mov(	$ty,	&DWP(-8,$in,"",0));
	&xor(	$tx,	$ty);
	 &mov(	$ty,	&DWP(-4,$in,"",0)); 
	&mov(	&DWP(-8,$out,"",0),	$tx);
	 &mov(	$tx,	&swtmp(1));
	&xor(	$tx,	$ty);
	 &mov(	$ty,	&swtmp(2));	# load end ptr;
	&mov(	&DWP(-4,$out,"",0),	$tx);
	 &mov(	$tx,		&DWP(0,$d,$x,4));	# S[x] for the next round
	&cmp($in,	$ty);
	 &jbe(&label("start"));		# another full block fits

	&set_label("end");

	# Byte-wise tail, at most 7 rounds. The first round ($p == -1)
	# carries extra set-up in RC4_loop(): it turns the block-loop limit
	# still held in $ty into the real end pointer and saves it in
	# swtmp(2). Every round jumps to "finished" once the input is used up.
	&RC4_loop(0,-1,1);
	&RC4_loop(1,0,1);
	&RC4_loop(2,0,1);
	&RC4_loop(3,0,1);
	&RC4_loop(4,0,1);
	&RC4_loop(5,0,1);
	&RC4_loop(6,1,1);

	&jmp(&label("finished"));

	&align(16);
	# this is essentially Intel P4 specific codepath, see rc4_skey.c,
	# and is engaged in 0.9.8 and later context...
	&set_label("RC4_CHAR");

	# swtmp(2) = in + len, the end pointer; $tx = S[x] read as a byte
	&lea	($ty,&DWP(0,$in,$ty));
	&mov	(&swtmp(2),$ty);
	&movz	($tx,&BP(0,$d,$x));

	# strangely enough unrolled loop performs over 20% slower...
	&set_label("RC4_CHAR_loop");
		&add	(&LB($y),&LB($tx));		# y = (y + S[x]) mod 256
		&movz	($ty,&BP(0,$d,$y));		# ty = S[y]
		&movb	(&BP(0,$d,$y),&LB($tx));	# S[y] = old S[x]
		&movb	(&BP(0,$d,$x),&LB($ty));	# S[x] = old S[y]
		&add	(&LB($ty),&LB($tx));		# (S[x] + S[y]) mod 256
		&movz	($ty,&BP(0,$d,$ty));		# keystream byte
		&add	(&LB($x),1);			# x for the next round
		&xorb	(&LB($ty),&BP(0,$in));		# xor with the input byte
		&lea	($in,&BP(1,$in));		# in++
		&movz	($tx,&BP(0,$d,$x));		# S[x] for the next round
		&cmp	($in,&swtmp(2));		# reached the end pointer?
		&movb	(&BP(0,$out),&LB($ty));
		&lea	($out,&BP(1,$out));		# out++ via lea keeps the cmp flags
	&jb	(&label("RC4_CHAR_loop"));

	&set_label("finished");
	# x was incremented ahead of the next round; step it back, release
	# the temporaries and store y and x back in front of the table
	# ($d is key + 8, so -4 and -8 address key + 4 and key + 0).
	&dec(	$x);
	 &stack_pop(3);
	&movb(	&BP(-4,$d,"",0),&LB($y));
	 &movb(	&BP(-8,$d,"",0),&LB($x));

	# NOTE(review): function_end presumably restores the four registers
	# pushed above and returns - confirm against x86asm.pl.
	&function_end($name);
	}
Import of old SSLeay release: SSLeay 0.9.0b 1998-12-21 10:56:39 +00:00			`#!/usr/local/bin/perl`

Final touches to rc4/asm/rc4-596.pl, +52% better performance on AMD core. 2004-11-29 21:12:58 +00:00			`# At some point it became apparent that the original SSLeay RC4`
Add 0.9.7 specific comments to RC4 assembler modules. 2004-11-30 15:46:46 +00:00			`# assembler implementation performs suboptimaly on latest IA-32`
Final touches to rc4/asm/rc4-596.pl, +52% better performance on AMD core. 2004-11-29 21:12:58 +00:00			`# microarchitectures. After re-tuning performance has changed as`
			`# following:`
			`#`
			`# Pentium +0%`
			`# Pentium III +17%`
			`# AMD +52%(*)`
			`# P4 +180%(**)`
			`#`
			`# (*) This number is actually a trade-off:-) It's possible to`
			`# achieve +72%, but at the cost of -48% off PIII performance.`
			`# In other words code performing further 13% faster on AMD`
			`# would perform almost 2 times slower on Intel PIII...`
			`# For reference! This code delivers ~80% of rc4-amd64.pl`
Add 0.9.7 specific comments to RC4 assembler modules. 2004-11-30 15:46:46 +00:00			`# performance on the same Opteron machine.`
Final touches to rc4/asm/rc4-596.pl, +52% better performance on AMD core. 2004-11-29 21:12:58 +00:00			`# (**) This number requires compressed key schedule set up by`
Add 0.9.7 specific comments to RC4 assembler modules. 2004-11-30 15:46:46 +00:00			`# RC4_set_key and therefore doesn't apply to 0.9.7 [option for`
			`# compressed key schedule is implemented in 0.9.8 and later,`
			`# see commentary section in rc4_skey.c for further details].`
			`#`
Final touches to rc4/asm/rc4-596.pl, +52% better performance on AMD core. 2004-11-29 21:12:58 +00:00			`# <appro@fy.chalmers.se>`
Import of old SSLeay release: SSLeay 0.9.0b 1998-12-21 10:56:39 +00:00
			`push(@INC,"perlasm","../../perlasm");`
			`require "x86asm.pl";`

			`&asm_init($ARGV[0],"rc4-586.pl");`

RC4 tune-up for Intel P4 core, both 32- and 64-bit ones. As it's apparently impossible to compose blended code with would perform satisfactory on all x86 and x86_64 cores, an extra RC4_CHAR code-path is introduced and P4 core is detected at run-time. This way we keep original performance on non-P4 implementations and turbo-charge P4 performance by factor of 2.8x (on 32-bit core). 2004-11-21 10:36:25 +00:00			`$x="eax";`
			`$y="ebx";`
			`$tx="ecx";`
			`$ty="edx";`
Import of old SSLeay release: SSLeay 0.9.0b 1998-12-21 10:56:39 +00:00			`$in="esi";`
			`$out="edi";`
			`$d="ebp";`

			`&RC4("RC4");`

			`&asm_finish();`

			`sub RC4_loop`
			`{`
			`local($n,$p,$char)=@_;`

			`&comment("Round $n");`

			`if ($char)`
			`{`
			`if ($p >= 0)`
			`{`
			`&mov($ty, &swtmp(2));`
			`&cmp($ty, $in);`
RC4 tune-up for Intel P4 core, both 32- and 64-bit ones. As it's apparently impossible to compose blended code with would perform satisfactory on all x86 and x86_64 cores, an extra RC4_CHAR code-path is introduced and P4 core is detected at run-time. This way we keep original performance on non-P4 implementations and turbo-charge P4 performance by factor of 2.8x (on 32-bit core). 2004-11-21 10:36:25 +00:00			`&jbe(&label("finished"));`
Import of old SSLeay release: SSLeay 0.9.0b 1998-12-21 10:56:39 +00:00			`&inc($in);`
			`}`
			`else`
			`{`
			`&add($ty, 8);`
			`&inc($in);`
			`&cmp($ty, $in);`
RC4 tune-up for Intel P4 core, both 32- and 64-bit ones. As it's apparently impossible to compose blended code with would perform satisfactory on all x86 and x86_64 cores, an extra RC4_CHAR code-path is introduced and P4 core is detected at run-time. This way we keep original performance on non-P4 implementations and turbo-charge P4 performance by factor of 2.8x (on 32-bit core). 2004-11-21 10:36:25 +00:00			`&jb(&label("finished"));`
Import of old SSLeay release: SSLeay 0.9.0b 1998-12-21 10:56:39 +00:00			`&mov(&swtmp(2), $ty);`
			`}`
			`}`
			`# Moved out`
			`# &mov( $tx, &DWP(0,$d,$x,4)) if $p < 0;`

Final touches to rc4/asm/rc4-596.pl, +52% better performance on AMD core. 2004-11-29 21:12:58 +00:00			`&add( &LB($y), &LB($tx));`
Import of old SSLeay release: SSLeay 0.9.0b 1998-12-21 10:56:39 +00:00			`&mov( $ty, &DWP(0,$d,$y,4));`
			`# XXX`
I've introduced a bug to i386 RC4 assembler, which would emerge with certain mix of calls to RC4 routine not covered by rc4test.c. It's fixed now. In addition this patch inadvertently fixes minor performance problem: in 0.9.7 context P4 was performing 12% slower than the original implementation... 2004-12-01 15:28:18 +00:00			`&mov( &DWP(0,$d,$x,4),$ty);`
Import of old SSLeay release: SSLeay 0.9.0b 1998-12-21 10:56:39 +00:00			`&add( $ty, $tx);`
			`&mov( &DWP(0,$d,$y,4),$tx);`
Final touches to rc4/asm/rc4-596.pl, +52% better performance on AMD core. 2004-11-29 21:12:58 +00:00			`&and( $ty, 0xff);`
I've introduced a bug to i386 RC4 assembler, which would emerge with certain mix of calls to RC4 routine not covered by rc4test.c. It's fixed now. In addition this patch inadvertently fixes minor performance problem: in 0.9.7 context P4 was performing 12% slower than the original implementation... 2004-12-01 15:28:18 +00:00			`&inc( &LB($x)); # NEXT ROUND`
Final touches to rc4/asm/rc4-596.pl, +52% better performance on AMD core. 2004-11-29 21:12:58 +00:00			`&mov( $tx, &DWP(0,$d,$x,4)) if $p < 1; # NEXT ROUND`
			`&mov( $ty, &DWP(0,$d,$ty,4));`
Import of old SSLeay release: SSLeay 0.9.0b 1998-12-21 10:56:39 +00:00
			`if (!$char)`
			`{`
			`#moved up into last round`
			`if ($p >= 1)`
			`{`
			`&add( $out, 8)`
			`}`
			`&movb( &BP($n,"esp","",0), &LB($ty));`
			`}`
			`else`
			`{`
			`# Note in+=8 has occured`
			`&movb( &HB($ty), &BP(-1,$in,"",0));`
			`# XXX`
			`&xorb(&LB($ty), &HB($ty));`
			`# XXX`
			`&movb(&BP($n,$out,"",0),&LB($ty));`
			`}`
			`}`


			`sub RC4`
			`{`
			`local($name)=@_;`

			`&function_begin_B($name,"");`

RC4 tune-up for Intel P4 core, both 32- and 64-bit ones. As it's apparently impossible to compose blended code with would perform satisfactory on all x86 and x86_64 cores, an extra RC4_CHAR code-path is introduced and P4 core is detected at run-time. This way we keep original performance on non-P4 implementations and turbo-charge P4 performance by factor of 2.8x (on 32-bit core). 2004-11-21 10:36:25 +00:00			`&mov($ty,&wparam(1)); # len`
			`&cmp($ty,0);`
			`&jne(&label("proceed"));`
			`&ret();`
			`&set_label("proceed");`

Import of old SSLeay release: SSLeay 0.9.0b 1998-12-21 10:56:39 +00:00			`&comment("");`

			`&push("ebp");`
			`&push("ebx");`
			`&push("esi");`
Final touches to rc4/asm/rc4-596.pl, +52% better performance on AMD core. 2004-11-29 21:12:58 +00:00			`&xor( $x, $x); # avoid partial register stalls`
			`&push("edi");`
			`&xor( $y, $y); # avoid partial register stalls`
RC4 tune-up for Intel P4 core, both 32- and 64-bit ones. As it's apparently impossible to compose blended code with would perform satisfactory on all x86 and x86_64 cores, an extra RC4_CHAR code-path is introduced and P4 core is detected at run-time. This way we keep original performance on non-P4 implementations and turbo-charge P4 performance by factor of 2.8x (on 32-bit core). 2004-11-21 10:36:25 +00:00			`&mov( $d, &wparam(0)); # key`
			`&mov( $in, &wparam(2));`
Import of old SSLeay release: SSLeay 0.9.0b 1998-12-21 10:56:39 +00:00
Final touches to rc4/asm/rc4-596.pl, +52% better performance on AMD core. 2004-11-29 21:12:58 +00:00			`&movb( &LB($x), &BP(0,$d,"",1));`
			`&movb( &LB($y), &BP(4,$d,"",1));`
Import of old SSLeay release: SSLeay 0.9.0b 1998-12-21 10:56:39 +00:00
RC4 tune-up for Intel P4 core, both 32- and 64-bit ones. As it's apparently impossible to compose blended code with would perform satisfactory on all x86 and x86_64 cores, an extra RC4_CHAR code-path is introduced and P4 core is detected at run-time. This way we keep original performance on non-P4 implementations and turbo-charge P4 performance by factor of 2.8x (on 32-bit core). 2004-11-21 10:36:25 +00:00			`&mov( $out, &wparam(3));`
Final touches to rc4/asm/rc4-596.pl, +52% better performance on AMD core. 2004-11-29 21:12:58 +00:00			`&inc( &LB($x));`
Import of old SSLeay release: SSLeay 0.9.0b 1998-12-21 10:56:39 +00:00
			`&stack_push(3); # 3 temp variables`
			`&add( $d, 8);`

RC4 tune-up for Intel P4 core, both 32- and 64-bit ones. As it's apparently impossible to compose blended code with would perform satisfactory on all x86 and x86_64 cores, an extra RC4_CHAR code-path is introduced and P4 core is detected at run-time. This way we keep original performance on non-P4 implementations and turbo-charge P4 performance by factor of 2.8x (on 32-bit core). 2004-11-21 10:36:25 +00:00			`# detect compressed schedule, see commentary section in rc4_skey.c...`
Add 0.9.7 specific comments to RC4 assembler modules. 2004-11-30 15:46:46 +00:00			`# in 0.9.7 context ~50 bytes below RC4_CHAR label remain redundant,`
			`# as compressed key schedule is set up in 0.9.8 and later.`
RC4 tune-up for Intel P4 core, both 32- and 64-bit ones. As it's apparently impossible to compose blended code with would perform satisfactory on all x86 and x86_64 cores, an extra RC4_CHAR code-path is introduced and P4 core is detected at run-time. This way we keep original performance on non-P4 implementations and turbo-charge P4 performance by factor of 2.8x (on 32-bit core). 2004-11-21 10:36:25 +00:00			`&cmp(&DWP(256,$d),-1);`
			`&je(&label("RC4_CHAR"));`

Import of old SSLeay release: SSLeay 0.9.0b 1998-12-21 10:56:39 +00:00			`&lea( $ty, &DWP(-8,$ty,$in));`

			`# check for 0 length input`

			`&mov( &swtmp(2), $ty); # this is now address to exit at`
			`&mov( $tx, &DWP(0,$d,$x,4));`

			`&cmp( $ty, $in);`
RC4 tune-up for Intel P4 core, both 32- and 64-bit ones. As it's apparently impossible to compose blended code with would perform satisfactory on all x86 and x86_64 cores, an extra RC4_CHAR code-path is introduced and P4 core is detected at run-time. This way we keep original performance on non-P4 implementations and turbo-charge P4 performance by factor of 2.8x (on 32-bit core). 2004-11-21 10:36:25 +00:00			`&jb( &label("end")); # less than 8 bytes`
Import of old SSLeay release: SSLeay 0.9.0b 1998-12-21 10:56:39 +00:00
			`&set_label("start");`

			`# filling DELAY SLOT`
			`&add( $in, 8);`

			`&RC4_loop(0,-1,0);`
			`&RC4_loop(1,0,0);`
			`&RC4_loop(2,0,0);`
			`&RC4_loop(3,0,0);`
			`&RC4_loop(4,0,0);`
			`&RC4_loop(5,0,0);`
			`&RC4_loop(6,0,0);`
			`&RC4_loop(7,1,0);`

			`&comment("apply the cipher text");`
			`# xor the cipher data with input`

			`#&add( $out, 8); #moved up into last round`

			`&mov( $tx, &swtmp(0));`
			`&mov( $ty, &DWP(-8,$in,"",0));`
			`&xor( $tx, $ty);`
			`&mov( $ty, &DWP(-4,$in,"",0));`
			`&mov( &DWP(-8,$out,"",0), $tx);`
			`&mov( $tx, &swtmp(1));`
			`&xor( $tx, $ty);`
			`&mov( $ty, &swtmp(2)); # load end ptr;`
			`&mov( &DWP(-4,$out,"",0), $tx);`
			`&mov( $tx, &DWP(0,$d,$x,4));`
			`&cmp($in, $ty);`
RC4 tune-up for Intel P4 core, both 32- and 64-bit ones. As it's apparently impossible to compose blended code with would perform satisfactory on all x86 and x86_64 cores, an extra RC4_CHAR code-path is introduced and P4 core is detected at run-time. This way we keep original performance on non-P4 implementations and turbo-charge P4 performance by factor of 2.8x (on 32-bit core). 2004-11-21 10:36:25 +00:00			`&jbe(&label("start"));`
Import of old SSLeay release: SSLeay 0.9.0b 1998-12-21 10:56:39 +00:00
			`&set_label("end");`

			`# There is quite a bit of extra crap in RC4_loop() for this`
			`# first round`
			`&RC4_loop(0,-1,1);`
			`&RC4_loop(1,0,1);`
			`&RC4_loop(2,0,1);`
			`&RC4_loop(3,0,1);`
			`&RC4_loop(4,0,1);`
			`&RC4_loop(5,0,1);`
			`&RC4_loop(6,1,1);`

RC4 tune-up for Intel P4 core, both 32- and 64-bit ones. As it's apparently impossible to compose blended code with would perform satisfactory on all x86 and x86_64 cores, an extra RC4_CHAR code-path is introduced and P4 core is detected at run-time. This way we keep original performance on non-P4 implementations and turbo-charge P4 performance by factor of 2.8x (on 32-bit core). 2004-11-21 10:36:25 +00:00			`&jmp(&label("finished"));`

			`&align(16);`
Add 0.9.7 specific comments to RC4 assembler modules. 2004-11-30 15:46:46 +00:00			`# this is essentially Intel P4 specific codepath, see rc4_skey.c,`
			`# and is engaged in 0.9.8 and later context...`
RC4 tune-up for Intel P4 core, both 32- and 64-bit ones. As it's apparently impossible to compose blended code with would perform satisfactory on all x86 and x86_64 cores, an extra RC4_CHAR code-path is introduced and P4 core is detected at run-time. This way we keep original performance on non-P4 implementations and turbo-charge P4 performance by factor of 2.8x (on 32-bit core). 2004-11-21 10:36:25 +00:00			`&set_label("RC4_CHAR");`

			`&lea ($ty,&DWP(0,$in,$ty));`
			`&mov (&swtmp(2),$ty);`
+20% performance improvement of P4-specific RC4_CHAR loop. 2005-05-15 22:43:00 +00:00			`&movz ($tx,&BP(0,$d,$x));`
RC4 tune-up for Intel P4 core, both 32- and 64-bit ones. As it's apparently impossible to compose blended code with would perform satisfactory on all x86 and x86_64 cores, an extra RC4_CHAR code-path is introduced and P4 core is detected at run-time. This way we keep original performance on non-P4 implementations and turbo-charge P4 performance by factor of 2.8x (on 32-bit core). 2004-11-21 10:36:25 +00:00
			`# strangely enough unrolled loop performs over 20% slower...`
			`&set_label("RC4_CHAR_loop");`
			`&add (&LB($y),&LB($tx));`
			`&movz ($ty,&BP(0,$d,$y));`
			`&movb (&BP(0,$d,$y),&LB($tx));`
			`&movb (&BP(0,$d,$x),&LB($ty));`
			`&add (&LB($ty),&LB($tx));`
			`&movz ($ty,&BP(0,$d,$ty));`
+20% performance improvement of P4-specific RC4_CHAR loop. 2005-05-15 22:43:00 +00:00			`&add (&LB($x),1);`
RC4 tune-up for Intel P4 core, both 32- and 64-bit ones. As it's apparently impossible to compose blended code with would perform satisfactory on all x86 and x86_64 cores, an extra RC4_CHAR code-path is introduced and P4 core is detected at run-time. This way we keep original performance on non-P4 implementations and turbo-charge P4 performance by factor of 2.8x (on 32-bit core). 2004-11-21 10:36:25 +00:00			`&xorb (&LB($ty),&BP(0,$in));`
+20% performance improvement of P4-specific RC4_CHAR loop. 2005-05-15 22:43:00 +00:00			`&lea ($in,&BP(1,$in));`
			`&movz ($tx,&BP(0,$d,$x));`
RC4 tune-up for Intel P4 core, both 32- and 64-bit ones. As it's apparently impossible to compose blended code with would perform satisfactory on all x86 and x86_64 cores, an extra RC4_CHAR code-path is introduced and P4 core is detected at run-time. This way we keep original performance on non-P4 implementations and turbo-charge P4 performance by factor of 2.8x (on 32-bit core). 2004-11-21 10:36:25 +00:00			`&cmp ($in,&swtmp(2));`
+20% performance improvement of P4-specific RC4_CHAR loop. 2005-05-15 22:43:00 +00:00			`&movb (&BP(0,$out),&LB($ty));`
			`&lea ($out,&BP(1,$out));`
RC4 tune-up for Intel P4 core, both 32- and 64-bit ones. As it's apparently impossible to compose blended code with would perform satisfactory on all x86 and x86_64 cores, an extra RC4_CHAR code-path is introduced and P4 core is detected at run-time. This way we keep original performance on non-P4 implementations and turbo-charge P4 performance by factor of 2.8x (on 32-bit core). 2004-11-21 10:36:25 +00:00			`&jb (&label("RC4_CHAR_loop"));`

Import of old SSLeay release: SSLeay 0.9.0b 1998-12-21 10:56:39 +00:00			`&set_label("finished");`
			`&dec( $x);`
			`&stack_pop(3);`
Final touches to rc4/asm/rc4-596.pl, +52% better performance on AMD core. 2004-11-29 21:12:58 +00:00			`&movb( &BP(-4,$d,"",0),&LB($y));`
Import of old SSLeay release: SSLeay 0.9.0b 1998-12-21 10:56:39 +00:00			`&movb( &BP(-8,$d,"",0),&LB($x));`

			`&function_end($name);`
			`}`