#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. Rights for redistribution and usage in source and binary
# forms are granted according to the OpenSSL license.
# ====================================================================
#
# Unlike 0.9.7f, this code expects RC4_CHAR (an 8-bit S-box array)
# back in the config line! See the commentary section in the
# corresponding script in the development branch for background on
# this option carousel. For those who don't have the energy to figure
# out the gory details, here is the basis in the form of a performance
# matrix relative to the original 0.9.7e C code-base:
#
#		0.9.7e	0.9.7f	this
# AMD64		1x	3.3x	2.4x
# EM64T		1x	0.8x	1.5x
#
# In other words, the idea is to trade about -25% AMD64 performance to
# compensate for the deterioration and gain about +90% on the EM64T
# core. The development branch maintains the best performance for
# either target, i.e. 3.3x for AMD64 and 1.5x for EM64T.
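#
# For reference, here is the textbook RC4 PRGA that the generated
# assembly computes per output byte, sketched in Perl ($x, $y and @S
# stand for the fields of the RC4_CHAR key structure, $in/$out for the
# current input/output byte):
#
#	$x = ($x + 1) & 0xff;
#	$y = ($y + $S[$x]) & 0xff;
#	@S[$x,$y] = @S[$y,$x];			# swap S[x] and S[y]
#	$out = $in ^ $S[($S[$x] + $S[$y]) & 0xff];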
$output=shift;	# output file name from the command line
open STDOUT,">$output" or die "can't open $output: $!";
$dat="%rdi"; # arg1
$len="%rsi"; # arg2
$inp="%rdx"; # arg3
$out="%rcx"; # arg4
@XX=("%r8","%r10");	# index x, current and next iteration
@TX=("%r9","%r11");	# S[x], current and next iteration
$YY="%r12";		# index y
$TY="%r13";		# S[y]
$code=<<___;
.text
.globl	RC4
.type	RC4,\@function
.align	16
RC4:	or	$len,$len	# nothing to do for zero-length input
	jne	.Lentry
	repret			# 2-byte return, expanded at the bottom
.Lentry:
	push	%r12
	push	%r13

	add	\$2,$dat	# skip key->x and key->y, point \$dat at S
	movzb	-2($dat),$XX[0]#d	# load key->x
	movzb	-1($dat),$YY#d		# load key->y

	add	\$1,$XX[0]#b		# x++
	movzb	($dat,$XX[0]),$TX[0]#d	# TX = S[x]
	test	\$-8,$len
	jz	.Lcloop1	# fewer than 8 bytes, byte-at-a-time loop
	push	%rbx
.align	16	# incidentally aligned already
.Lcloop8:
	mov	($inp),%eax	# load 8 input bytes into eax:ebx
	mov	4($inp),%ebx
___
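# The two-element @XX/@TX arrays let consecutive iterations alternate
# register pairs: the S[next x] value preloaded into $TX[1] on one
# iteration is already sitting in $TX[0] on the next, courtesy of the
# push(@TX,shift(@TX)) "rotation" at the bottom of each loop body.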
# unroll 2x4-wise, because 64-bit rotates kill Intel P4...
for ($i=0;$i<4;$i++) {
$code.=<<___;
	add	$TX[0]#b,$YY#b		# y += S[x]
	lea	1($XX[0]),$XX[1]	# next x
	movzb	($dat,$YY),$TY#d	# TY = S[y]
	movzb	$XX[1]#b,$XX[1]#d	# wrap next x to 8 bits
	movzb	($dat,$XX[1]),$TX[1]#d	# preload S[next x]
	movb	$TX[0]#b,($dat,$YY)	# S[y] = old S[x]
	cmp	$XX[1],$YY		# did the store clobber the preload?
	movb	$TY#b,($dat,$XX[0])	# S[x] = old S[y]
	jne	.Lcmov$i		# Intel cmov is sloooow...
	mov	$TX[0],$TX[1]		# yes, patch the preloaded value
.Lcmov$i:
	add	$TX[0]#b,$TY#b		# index of keystream byte, S[x]+S[y]
	xor	($dat,$TY),%al		# xor keystream byte into output word
	ror	\$8,%eax		# rotate next output byte into al
___
push(@TX,shift(@TX)); push(@XX,shift(@XX)); # "rotate" registers
}
for ($i=4;$i<8;$i++) {
$code.=<<___;
	add	$TX[0]#b,$YY#b
	lea	1($XX[0]),$XX[1]
	movzb	($dat,$YY),$TY#d
	movzb	$XX[1]#b,$XX[1]#d
	movzb	($dat,$XX[1]),$TX[1]#d
	movb	$TX[0]#b,($dat,$YY)
	cmp	$XX[1],$YY
	movb	$TY#b,($dat,$XX[0])
	jne	.Lcmov$i		# Intel cmov is sloooow...
	mov	$TX[0],$TX[1]
.Lcmov$i:
	add	$TX[0]#b,$TY#b
	xor	($dat,$TY),%bl
	ror	\$8,%ebx
___
push(@TX,shift(@TX)); push(@XX,shift(@XX)); # "rotate" registers
}
$code.=<<___;
	lea	-8($len),$len		# eight more bytes done
	mov	%eax,($out)		# store the two output words
	lea	8($inp),$inp
	mov	%ebx,4($out)
	lea	8($out),$out

	test	\$-8,$len
	jnz	.Lcloop8
	pop	%rbx
	cmp	\$0,$len
	jne	.Lcloop1		# 1..7 bytes left over
.Lexit:
	sub	\$1,$XX[0]#b		# x runs one step ahead, undo before saving
	movb	$XX[0]#b,-2($dat)	# save key->x
	movb	$YY#b,-1($dat)		# save key->y

	pop	%r13
	pop	%r12
	repret

.align	16
.Lcloop1:
	add	$TX[0]#b,$YY#b		# y += S[x]
	movzb	($dat,$YY),$TY#d	# TY = S[y]
	movb	$TX[0]#b,($dat,$YY)	# swap S[x] and S[y]
	movb	$TY#b,($dat,$XX[0])
	add	$TX[0]#b,$TY#b		# TY = S[x]+S[y] after the swap
	add	\$1,$XX[0]#b		# x++
	movzb	($dat,$TY),$TY#d	# fetch keystream byte
	movzb	($dat,$XX[0]),$TX[0]#d	# TX = S[x] for the next round
	xorb	($inp),$TY#b		# xor keystream with the input byte
	lea	1($inp),$inp
	movb	$TY#b,($out)
	lea	1($out),$out
	sub	\$1,$len
	jnz	.Lcloop1
	jmp	.Lexit
.size	RC4,.-RC4
___
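
# Post-processing: strip the "#" from the size-suffix shorthand used
# above ("%r8#d" becomes "%r8d", "%r12#b" becomes "%r12b"), and expand
# "repret" to the 2-byte "rep ret" sequence, which is reportedly
# friendlier to AMD branch predictors than a bare "ret" at a branch
# target; it is emitted as .byte for assemblers that reject the
# mnemonic.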
$code =~ s/#([bwd])/$1/gm;
$code =~ s/repret/.byte\t0xF3,0xC3/gm;
print $code;