aesni-x86_64.pl: CTR face lift, +25% on Bulldozer.
This commit is contained in:
parent
c3cddeaec8
commit
9282c33596
1 changed files with 248 additions and 210 deletions
|
@ -130,7 +130,7 @@
|
||||||
# Further data for other parallelizable modes:
|
# Further data for other parallelizable modes:
|
||||||
#
|
#
|
||||||
# CBC decrypt 1.16 0.93 0.93
|
# CBC decrypt 1.16 0.93 0.93
|
||||||
# CTR 1.14 0.91 n/a
|
# CTR 1.14 0.91 0.90
|
||||||
#
|
#
|
||||||
# Well, given 3x column it's probably inappropriate to call the limit
|
# Well, given 3x column it's probably inappropriate to call the limit
|
||||||
# asymptotic, if it can be surpassed, isn't it? What happens there?
|
# asymptotic, if it can be surpassed, isn't it? What happens there?
|
||||||
|
@ -160,7 +160,7 @@
|
||||||
######################################################################
|
######################################################################
|
||||||
# For reference, AMD Bulldozer spends 5.77 cycles per byte processed
|
# For reference, AMD Bulldozer spends 5.77 cycles per byte processed
|
||||||
# with 128-bit key in CBC encrypt and 0.76 cycles in CBC decrypt, 0.70
|
# with 128-bit key in CBC encrypt and 0.76 cycles in CBC decrypt, 0.70
|
||||||
# in ECB, 0.94 in CTR, 0.95 in XTS... This means that aes[enc|dec]
|
# in ECB, 0.76 in CTR, 0.95 in XTS... This means that aes[enc|dec]
|
||||||
# instruction latency is 9 cycles and that they can be issued every
|
# instruction latency is 9 cycles and that they can be issued every
|
||||||
# cycle.
|
# cycle.
|
||||||
|
|
||||||
|
@ -1013,286 +1013,321 @@ ___
|
||||||
# does not update *ivec! (see engine/eng_aesni.c for details)
|
# does not update *ivec! (see engine/eng_aesni.c for details)
|
||||||
#
|
#
|
||||||
{
|
{
|
||||||
my $frame_size = 0x20+($win64?160:0);
|
my ($in0,$in1,$in2,$in3,$one,$ivec)=map("%xmm$_",(10..15));
|
||||||
my ($in0,$in1,$in2,$in3)=map("%xmm$_",(8..11));
|
my $len_="%r9";
|
||||||
my ($iv0,$iv1,$ivec)=("%xmm12","%xmm13","%xmm14");
|
|
||||||
my $bswap_mask="%xmm15";
|
|
||||||
|
|
||||||
$code.=<<___;
|
$code.=<<___;
|
||||||
.globl aesni_ctr32_encrypt_blocks
|
.globl aesni_ctr32_encrypt_blocks
|
||||||
.type aesni_ctr32_encrypt_blocks,\@function,5
|
.type aesni_ctr32_encrypt_blocks,\@function,5
|
||||||
.align 16
|
.align 16
|
||||||
aesni_ctr32_encrypt_blocks:
|
aesni_ctr32_encrypt_blocks:
|
||||||
lea (%rsp),%rax
|
|
||||||
push %rbp
|
|
||||||
sub \$$frame_size,%rsp
|
|
||||||
and \$-16,%rsp # Linux kernel stack can be incorrectly seeded
|
|
||||||
___
|
___
|
||||||
$code.=<<___ if ($win64);
|
$code.=<<___ if ($win64);
|
||||||
movaps %xmm6,0x20(%rsp)
|
lea -0xa8(%rsp),%rsp
|
||||||
movaps %xmm7,0x30(%rsp)
|
movaps %xmm6,0x00(%rsp)
|
||||||
movaps %xmm8,0x40(%rsp)
|
movaps %xmm7,0x10(%rsp)
|
||||||
movaps %xmm9,0x50(%rsp)
|
movaps %xmm8,0x20(%rsp)
|
||||||
movaps %xmm10,0x60(%rsp)
|
movaps %xmm9,0x30(%rsp)
|
||||||
movaps %xmm11,0x70(%rsp)
|
movaps %xmm10,0x40(%rsp)
|
||||||
movaps %xmm12,0x80(%rsp)
|
movaps %xmm11,0x50(%rsp)
|
||||||
movaps %xmm13,0x90(%rsp)
|
movaps %xmm12,0x60(%rsp)
|
||||||
movaps %xmm14,0xa0(%rsp)
|
movaps %xmm13,0x70(%rsp)
|
||||||
movaps %xmm15,0xb0(%rsp)
|
movaps %xmm14,0x80(%rsp)
|
||||||
|
movaps %xmm15,0x90(%rsp)
|
||||||
.Lctr32_body:
|
.Lctr32_body:
|
||||||
___
|
___
|
||||||
$code.=<<___;
|
$code.=<<___;
|
||||||
lea -8(%rax),%rbp
|
|
||||||
cmp \$1,$len
|
cmp \$1,$len
|
||||||
je .Lctr32_one_shortcut
|
je .Lctr32_one_shortcut
|
||||||
|
|
||||||
movdqu ($ivp),$ivec
|
movzb 15($ivp),%rax # counter LSB
|
||||||
movdqa .Lbswap_mask(%rip),$bswap_mask
|
mov $len,$len_ # backup $len
|
||||||
xor $rounds,$rounds
|
mov 240($key),$rnds_ # key->rounds
|
||||||
pextrd \$3,$ivec,$rnds_ # pull 32-bit counter
|
|
||||||
pinsrd \$3,$rounds,$ivec # wipe 32-bit counter
|
|
||||||
|
|
||||||
mov 240($key),$rounds # key->rounds
|
|
||||||
bswap $rnds_
|
|
||||||
pxor $iv0,$iv0 # vector of 3 32-bit counters
|
|
||||||
pxor $iv1,$iv1 # vector of 3 32-bit counters
|
|
||||||
pinsrd \$0,$rnds_,$iv0
|
|
||||||
lea 3($rnds_),$key_
|
|
||||||
pinsrd \$0,$key_,$iv1
|
|
||||||
inc $rnds_
|
|
||||||
pinsrd \$1,$rnds_,$iv0
|
|
||||||
inc $key_
|
|
||||||
pinsrd \$1,$key_,$iv1
|
|
||||||
inc $rnds_
|
|
||||||
pinsrd \$2,$rnds_,$iv0
|
|
||||||
inc $key_
|
|
||||||
pinsrd \$2,$key_,$iv1
|
|
||||||
movdqa $iv0,0x00(%rsp)
|
|
||||||
pshufb $bswap_mask,$iv0
|
|
||||||
movdqa $iv1,0x10(%rsp)
|
|
||||||
pshufb $bswap_mask,$iv1
|
|
||||||
|
|
||||||
pshufd \$`3<<6`,$iv0,$inout0 # place counter to upper dword
|
|
||||||
pshufd \$`2<<6`,$iv0,$inout1
|
|
||||||
pshufd \$`1<<6`,$iv0,$inout2
|
|
||||||
cmp \$6,$len
|
|
||||||
jb .Lctr32_tail
|
|
||||||
shr \$1,$rounds
|
|
||||||
mov $key,$key_ # backup $key
|
mov $key,$key_ # backup $key
|
||||||
mov $rounds,$rnds_ # backup $rounds
|
movdqu ($ivp),$ivec
|
||||||
sub \$6,$len
|
neg %rax
|
||||||
jmp .Lctr32_loop6
|
movdqa .Lincrement1(%rip),$one
|
||||||
|
add \$256,%rax # steps to closest overflow
|
||||||
|
|
||||||
|
.Lctr32_grandloop:
|
||||||
|
cmp %rax,$len
|
||||||
|
cmova %rax,$len
|
||||||
|
mov $rnds_,$rounds # restore $rounds
|
||||||
|
sub $len,$len_
|
||||||
|
|
||||||
|
cmp \$8,$len
|
||||||
|
jb .Lctr32_tail
|
||||||
|
|
||||||
.align 16
|
|
||||||
.Lctr32_loop6:
|
|
||||||
pshufd \$`3<<6`,$iv1,$inout3
|
|
||||||
por $ivec,$inout0 # merge counter-less ivec
|
|
||||||
$movkey ($key_),$rndkey0
|
$movkey ($key_),$rndkey0
|
||||||
pshufd \$`2<<6`,$iv1,$inout4
|
shr \$1,$rounds
|
||||||
por $ivec,$inout1
|
shr \$1,$rnds_
|
||||||
$movkey 16($key_),$rndkey1
|
sub \$8,$len
|
||||||
pshufd \$`1<<6`,$iv1,$inout5
|
jmp .Lctr32_loop8
|
||||||
por $ivec,$inout2
|
|
||||||
por $ivec,$inout3
|
|
||||||
xorps $rndkey0,$inout0
|
|
||||||
por $ivec,$inout4
|
|
||||||
por $ivec,$inout5
|
|
||||||
|
|
||||||
# inline _aesni_encrypt6 and interleave last rounds
|
|
||||||
# with own code...
|
|
||||||
|
|
||||||
pxor $rndkey0,$inout1
|
|
||||||
aesenc $rndkey1,$inout0
|
|
||||||
lea 32($key_),$key
|
|
||||||
pxor $rndkey0,$inout2
|
|
||||||
aesenc $rndkey1,$inout1
|
|
||||||
movdqa .Lincrement32(%rip),$iv1
|
|
||||||
pxor $rndkey0,$inout3
|
|
||||||
aesenc $rndkey1,$inout2
|
|
||||||
movdqa (%rsp),$iv0
|
|
||||||
pxor $rndkey0,$inout4
|
|
||||||
aesenc $rndkey1,$inout3
|
|
||||||
pxor $rndkey0,$inout5
|
|
||||||
$movkey ($key),$rndkey0
|
|
||||||
dec $rounds
|
|
||||||
aesenc $rndkey1,$inout4
|
|
||||||
aesenc $rndkey1,$inout5
|
|
||||||
jmp .Lctr32_enc_loop6_enter
|
|
||||||
.align 16
|
.align 16
|
||||||
.Lctr32_enc_loop6:
|
.Lctr32_loop8:
|
||||||
|
$movkey 16($key_),$rndkey1
|
||||||
|
movdqa $rndkey0,$inout0
|
||||||
|
movdqa $rndkey0,$inout1
|
||||||
|
pxor $ivec,$inout0
|
||||||
|
paddb $one,$ivec
|
||||||
|
movdqa $rndkey0,$inout2
|
||||||
aesenc $rndkey1,$inout0
|
aesenc $rndkey1,$inout0
|
||||||
|
pxor $ivec,$inout1
|
||||||
|
paddb $one,$ivec
|
||||||
|
lea 32($key_),$key
|
||||||
|
movdqa $rndkey0,$inout3
|
||||||
aesenc $rndkey1,$inout1
|
aesenc $rndkey1,$inout1
|
||||||
dec $rounds
|
pxor $ivec,$inout2
|
||||||
|
paddb $one,$ivec
|
||||||
|
movdqa $rndkey0,$inout4
|
||||||
aesenc $rndkey1,$inout2
|
aesenc $rndkey1,$inout2
|
||||||
|
pxor $ivec,$inout3
|
||||||
|
paddb $one,$ivec
|
||||||
|
movdqa $rndkey0,$inout5
|
||||||
aesenc $rndkey1,$inout3
|
aesenc $rndkey1,$inout3
|
||||||
|
pxor $ivec,$inout4
|
||||||
|
paddb $one,$ivec
|
||||||
|
movdqa $rndkey0,$inout6
|
||||||
aesenc $rndkey1,$inout4
|
aesenc $rndkey1,$inout4
|
||||||
|
pxor $ivec,$inout5
|
||||||
|
paddb $one,$ivec
|
||||||
|
movdqa $rndkey0,$inout7
|
||||||
aesenc $rndkey1,$inout5
|
aesenc $rndkey1,$inout5
|
||||||
.Lctr32_enc_loop6_enter:
|
pxor $ivec,$inout6
|
||||||
$movkey 16($key),$rndkey1
|
paddb $one,$ivec
|
||||||
aesenc $rndkey0,$inout0
|
|
||||||
aesenc $rndkey0,$inout1
|
|
||||||
lea 32($key),$key
|
|
||||||
aesenc $rndkey0,$inout2
|
|
||||||
aesenc $rndkey0,$inout3
|
|
||||||
aesenc $rndkey0,$inout4
|
|
||||||
aesenc $rndkey0,$inout5
|
|
||||||
$movkey ($key),$rndkey0
|
$movkey ($key),$rndkey0
|
||||||
jnz .Lctr32_enc_loop6
|
aesenc $rndkey1,$inout6
|
||||||
|
pxor $ivec,$inout7
|
||||||
aesenc $rndkey1,$inout0
|
paddb $one,$ivec
|
||||||
paddd $iv1,$iv0 # increment counter vector
|
dec $rounds
|
||||||
aesenc $rndkey1,$inout1
|
aesenc $rndkey1,$inout7
|
||||||
paddd 0x10(%rsp),$iv1
|
$movkey 16($key),$rndkey1
|
||||||
aesenc $rndkey1,$inout2
|
|
||||||
movdqa $iv0,0x00(%rsp) # save counter vector
|
|
||||||
aesenc $rndkey1,$inout3
|
|
||||||
movdqa $iv1,0x10(%rsp)
|
|
||||||
aesenc $rndkey1,$inout4
|
|
||||||
pshufb $bswap_mask,$iv0 # byte swap
|
|
||||||
aesenc $rndkey1,$inout5
|
|
||||||
pshufb $bswap_mask,$iv1
|
|
||||||
|
|
||||||
aesenclast $rndkey0,$inout0
|
|
||||||
movups ($inp),$in0 # load input
|
movups ($inp),$in0 # load input
|
||||||
aesenclast $rndkey0,$inout1
|
|
||||||
movups 0x10($inp),$in1
|
movups 0x10($inp),$in1
|
||||||
aesenclast $rndkey0,$inout2
|
|
||||||
movups 0x20($inp),$in2
|
movups 0x20($inp),$in2
|
||||||
aesenclast $rndkey0,$inout3
|
|
||||||
movups 0x30($inp),$in3
|
movups 0x30($inp),$in3
|
||||||
aesenclast $rndkey0,$inout4
|
|
||||||
movups 0x40($inp),$rndkey1
|
|
||||||
aesenclast $rndkey0,$inout5
|
|
||||||
movups 0x50($inp),$rndkey0
|
|
||||||
lea 0x60($inp),$inp
|
|
||||||
|
|
||||||
xorps $inout0,$in0 # xor
|
call .Lenc_loop8_enter
|
||||||
pshufd \$`3<<6`,$iv0,$inout0
|
|
||||||
xorps $inout1,$in1
|
xorps $in0,$inout0 # xor
|
||||||
pshufd \$`2<<6`,$iv0,$inout1
|
movups 0x40($inp),$in0
|
||||||
movups $in0,($out) # store output
|
xorps $in1,$inout1
|
||||||
xorps $inout2,$in2
|
movups 0x50($inp),$in1
|
||||||
pshufd \$`1<<6`,$iv0,$inout2
|
xorps $in2,$inout2
|
||||||
movups $in1,0x10($out)
|
movups 0x60($inp),$in2
|
||||||
xorps $inout3,$in3
|
xorps $in3,$inout3
|
||||||
movups $in2,0x20($out)
|
movups 0x70($inp),$in3
|
||||||
xorps $inout4,$rndkey1
|
lea 0x80($inp),$inp
|
||||||
movups $in3,0x30($out)
|
xorps $in0,$inout4
|
||||||
xorps $inout5,$rndkey0
|
movups $inout0,($out) # store output
|
||||||
movups $rndkey1,0x40($out)
|
xorps $in1,$inout5
|
||||||
movups $rndkey0,0x50($out)
|
movups $inout1,0x10($out)
|
||||||
lea 0x60($out),$out
|
xorps $in2,$inout6
|
||||||
|
movups $inout2,0x20($out)
|
||||||
|
xorps $in3,$inout7
|
||||||
|
movups $inout3,0x30($out)
|
||||||
|
movups $inout4,0x40($out)
|
||||||
|
movups $inout5,0x50($out)
|
||||||
|
movups $inout6,0x60($out)
|
||||||
|
movups $inout7,0x70($out)
|
||||||
|
lea 0x80($out),$out
|
||||||
|
|
||||||
|
$movkey ($key_),$rndkey0
|
||||||
mov $rnds_,$rounds
|
mov $rnds_,$rounds
|
||||||
sub \$6,$len
|
sub \$8,$len
|
||||||
jnc .Lctr32_loop6
|
jnc .Lctr32_loop8
|
||||||
|
|
||||||
add \$6,$len
|
|
||||||
jz .Lctr32_done
|
|
||||||
mov $key_,$key # restore $key
|
|
||||||
lea 1($rounds,$rounds),$rounds # restore original value
|
lea 1($rounds,$rounds),$rounds # restore original value
|
||||||
|
lea 1($rnds_,$rnds_),$rnds_ # restore original value
|
||||||
|
add \$8,$len
|
||||||
|
jz .Lctr32_done
|
||||||
|
|
||||||
.Lctr32_tail:
|
.Lctr32_tail:
|
||||||
por $ivec,$inout0
|
mov $key_,$key # restore $key
|
||||||
|
movdqa $ivec,$inout0
|
||||||
|
paddb $one,$ivec
|
||||||
movups ($inp),$in0
|
movups ($inp),$in0
|
||||||
cmp \$2,$len
|
cmp \$2,$len
|
||||||
jb .Lctr32_one
|
jb .Lctr32_one
|
||||||
|
|
||||||
por $ivec,$inout1
|
movdqa $ivec,$inout1
|
||||||
|
paddb $one,$ivec
|
||||||
movups 0x10($inp),$in1
|
movups 0x10($inp),$in1
|
||||||
je .Lctr32_two
|
je .Lctr32_two
|
||||||
|
|
||||||
pshufd \$`3<<6`,$iv1,$inout3
|
movdqa $ivec,$inout2
|
||||||
por $ivec,$inout2
|
paddb $one,$ivec
|
||||||
movups 0x20($inp),$in2
|
movups 0x20($inp),$in2
|
||||||
cmp \$4,$len
|
cmp \$4,$len
|
||||||
jb .Lctr32_three
|
jb .Lctr32_three
|
||||||
|
|
||||||
pshufd \$`2<<6`,$iv1,$inout4
|
movdqa $ivec,$inout3
|
||||||
por $ivec,$inout3
|
paddb $one,$ivec
|
||||||
movups 0x30($inp),$in3
|
movups 0x30($inp),$in3
|
||||||
je .Lctr32_four
|
je .Lctr32_four
|
||||||
|
|
||||||
por $ivec,$inout4
|
movdqa $ivec,$inout4
|
||||||
xorps $inout5,$inout5
|
paddb $one,$ivec
|
||||||
|
cmp \$6,$len
|
||||||
|
jb .Lctr32_five
|
||||||
|
|
||||||
call _aesni_encrypt6
|
movdqa $ivec,$inout5
|
||||||
|
paddb $one,$ivec
|
||||||
|
je .Lctr32_six
|
||||||
|
|
||||||
movups 0x40($inp),$rndkey1
|
movdqa $ivec,$inout6
|
||||||
xorps $inout0,$in0
|
paddb $one,$ivec
|
||||||
xorps $inout1,$in1
|
xorps $inout7,$inout7
|
||||||
movups $in0,($out)
|
|
||||||
xorps $inout2,$in2
|
call _aesni_encrypt8
|
||||||
movups $in1,0x10($out)
|
|
||||||
xorps $inout3,$in3
|
xorps $in0,$inout0 # xor
|
||||||
movups $in2,0x20($out)
|
movups 0x40($inp),$in0
|
||||||
xorps $inout4,$rndkey1
|
xorps $in1,$inout1
|
||||||
movups $in3,0x30($out)
|
movups 0x50($inp),$in1
|
||||||
movups $rndkey1,0x40($out)
|
xorps $in2,$inout2
|
||||||
|
movups 0x60($inp),$in2
|
||||||
|
lea 0x70($inp),$inp
|
||||||
|
xorps $in3,$inout3
|
||||||
|
movups $inout0,($out) # store output
|
||||||
|
xorps $in0,$inout4
|
||||||
|
movups $inout1,0x10($out)
|
||||||
|
xorps $in1,$inout5
|
||||||
|
movups $inout2,0x20($out)
|
||||||
|
xorps $in2,$inout6
|
||||||
|
movups $inout3,0x30($out)
|
||||||
|
movups $inout4,0x40($out)
|
||||||
|
movups $inout5,0x50($out)
|
||||||
|
movups $inout6,0x60($out)
|
||||||
|
lea 0x70($out),$out
|
||||||
jmp .Lctr32_done
|
jmp .Lctr32_done
|
||||||
|
|
||||||
.align 16
|
.align 16
|
||||||
.Lctr32_one_shortcut:
|
.Lctr32_one_shortcut:
|
||||||
movups ($ivp),$inout0
|
movups ($ivp),$inout0
|
||||||
|
xor $len_,$len_
|
||||||
movups ($inp),$in0
|
movups ($inp),$in0
|
||||||
mov 240($key),$rounds # key->rounds
|
mov 240($key),$rounds # key->rounds
|
||||||
.Lctr32_one:
|
.Lctr32_one:
|
||||||
___
|
___
|
||||||
&aesni_generate1("enc",$key,$rounds);
|
&aesni_generate1("enc",$key,$rounds);
|
||||||
$code.=<<___;
|
$code.=<<___;
|
||||||
xorps $inout0,$in0
|
xorps $in0,$inout0
|
||||||
movups $in0,($out)
|
lea 0x10($inp),$inp
|
||||||
|
movups $inout0,($out)
|
||||||
|
lea 0x10($out),$out
|
||||||
jmp .Lctr32_done
|
jmp .Lctr32_done
|
||||||
|
|
||||||
.align 16
|
.align 16
|
||||||
.Lctr32_two:
|
.Lctr32_two:
|
||||||
xorps $inout2,$inout2
|
xorps $inout2,$inout2
|
||||||
call _aesni_encrypt3
|
call _aesni_encrypt3
|
||||||
xorps $inout0,$in0
|
xorps $in0,$inout0 # xor
|
||||||
xorps $inout1,$in1
|
lea 0x20($inp),$inp
|
||||||
movups $in0,($out)
|
xorps $in1,$inout1
|
||||||
movups $in1,0x10($out)
|
movups $inout0,($out) # store output
|
||||||
|
movups $inout1,0x10($out)
|
||||||
|
lea 0x20($out),$out
|
||||||
jmp .Lctr32_done
|
jmp .Lctr32_done
|
||||||
|
|
||||||
.align 16
|
.align 16
|
||||||
.Lctr32_three:
|
.Lctr32_three:
|
||||||
call _aesni_encrypt3
|
call _aesni_encrypt3
|
||||||
xorps $inout0,$in0
|
xorps $in0,$inout0 # xor
|
||||||
xorps $inout1,$in1
|
lea 0x30($inp),$inp
|
||||||
movups $in0,($out)
|
xorps $in1,$inout1
|
||||||
xorps $inout2,$in2
|
movups $inout0,($out) # store output
|
||||||
movups $in1,0x10($out)
|
xorps $in2,$inout2
|
||||||
movups $in2,0x20($out)
|
movups $inout1,0x10($out)
|
||||||
|
movups $inout2,0x20($out)
|
||||||
|
lea 0x30($out),$out
|
||||||
jmp .Lctr32_done
|
jmp .Lctr32_done
|
||||||
|
|
||||||
.align 16
|
.align 16
|
||||||
.Lctr32_four:
|
.Lctr32_four:
|
||||||
call _aesni_encrypt4
|
call _aesni_encrypt4
|
||||||
xorps $inout0,$in0
|
xorps $in0,$inout0 # xor
|
||||||
xorps $inout1,$in1
|
lea 0x40($inp),$inp
|
||||||
movups $in0,($out)
|
xorps $in1,$inout1
|
||||||
xorps $inout2,$in2
|
movups $inout0,($out) # store output
|
||||||
movups $in1,0x10($out)
|
xorps $in2,$inout2
|
||||||
xorps $inout3,$in3
|
movups $inout1,0x10($out)
|
||||||
movups $in2,0x20($out)
|
xorps $in3,$inout3
|
||||||
movups $in3,0x30($out)
|
movups $inout2,0x20($out)
|
||||||
|
movups $inout3,0x30($out)
|
||||||
|
lea 0x40($out),$out
|
||||||
|
jmp .Lctr32_done
|
||||||
|
|
||||||
|
.align 16
|
||||||
|
.Lctr32_five:
|
||||||
|
xorps $inout5,$inout5
|
||||||
|
call _aesni_encrypt6
|
||||||
|
xorps $in0,$inout0 # xor
|
||||||
|
movups 0x40($inp),$in0
|
||||||
|
lea 0x50($inp),$inp
|
||||||
|
xorps $in1,$inout1
|
||||||
|
movups $inout0,($out) # store output
|
||||||
|
xorps $in2,$inout2
|
||||||
|
movups $inout1,0x10($out)
|
||||||
|
xorps $in3,$inout3
|
||||||
|
movups $inout2,0x20($out)
|
||||||
|
xorps $in0,$inout4
|
||||||
|
movups $inout3,0x30($out)
|
||||||
|
movups $inout4,0x40($out)
|
||||||
|
lea 0x50($out),$out
|
||||||
|
jmp .Lctr32_done
|
||||||
|
|
||||||
|
.align 16
|
||||||
|
.Lctr32_six:
|
||||||
|
call _aesni_encrypt6
|
||||||
|
xorps $in0,$inout0 # xor
|
||||||
|
movups 0x40($inp),$in0
|
||||||
|
xorps $in1,$inout1
|
||||||
|
movups 0x50($inp),$in1
|
||||||
|
lea 0x60($inp),$inp
|
||||||
|
xorps $in2,$inout2
|
||||||
|
movups $inout0,($out) # store output
|
||||||
|
xorps $in3,$inout3
|
||||||
|
movups $inout1,0x10($out)
|
||||||
|
xorps $in0,$inout4
|
||||||
|
movups $inout2,0x20($out)
|
||||||
|
xorps $in1,$inout5
|
||||||
|
movups $inout3,0x30($out)
|
||||||
|
movups $inout4,0x40($out)
|
||||||
|
movups $inout5,0x50($out)
|
||||||
|
lea 0x60($out),$out
|
||||||
|
|
||||||
.Lctr32_done:
|
.Lctr32_done:
|
||||||
|
test $len_,$len_
|
||||||
|
jz .Lctr32_really_done
|
||||||
|
|
||||||
|
movdqa .Lbswap_mask(%rip),$rndkey1
|
||||||
|
pshufb $rndkey1,$ivec
|
||||||
|
psrldq \$14,$one # 256
|
||||||
|
paddd $one,$ivec
|
||||||
|
pslldq \$14,$one
|
||||||
|
pshufb $rndkey1,$ivec
|
||||||
|
mov $len_,$len
|
||||||
|
mov \$256,%rax
|
||||||
|
jmp .Lctr32_grandloop
|
||||||
|
|
||||||
|
.Lctr32_really_done:
|
||||||
___
|
___
|
||||||
$code.=<<___ if ($win64);
|
$code.=<<___ if ($win64);
|
||||||
movaps 0x20(%rsp),%xmm6
|
movaps 0x00(%rsp),%xmm6
|
||||||
movaps 0x30(%rsp),%xmm7
|
movaps 0x10(%rsp),%xmm7
|
||||||
movaps 0x40(%rsp),%xmm8
|
movaps 0x20(%rsp),%xmm8
|
||||||
movaps 0x50(%rsp),%xmm9
|
movaps 0x30(%rsp),%xmm9
|
||||||
movaps 0x60(%rsp),%xmm10
|
movaps 0x40(%rsp),%xmm10
|
||||||
movaps 0x70(%rsp),%xmm11
|
movaps 0x50(%rsp),%xmm11
|
||||||
movaps 0x80(%rsp),%xmm12
|
movaps 0x60(%rsp),%xmm12
|
||||||
movaps 0x90(%rsp),%xmm13
|
movaps 0x70(%rsp),%xmm13
|
||||||
movaps 0xa0(%rsp),%xmm14
|
movaps 0x80(%rsp),%xmm14
|
||||||
movaps 0xb0(%rsp),%xmm15
|
movaps 0x90(%rsp),%xmm15
|
||||||
|
lea 0xa8(%rsp),%rsp
|
||||||
___
|
___
|
||||||
$code.=<<___;
|
$code.=<<___;
|
||||||
lea (%rbp),%rsp
|
|
||||||
pop %rbp
|
|
||||||
.Lctr32_ret:
|
.Lctr32_ret:
|
||||||
ret
|
ret
|
||||||
.size aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks
|
.size aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks
|
||||||
|
@ -2739,6 +2774,8 @@ $code.=<<___;
|
||||||
.long 1,0,0,0
|
.long 1,0,0,0
|
||||||
.Lxts_magic:
|
.Lxts_magic:
|
||||||
.long 0x87,0,1,0
|
.long 0x87,0,1,0
|
||||||
|
.Lincrement1:
|
||||||
|
.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
|
||||||
|
|
||||||
.asciz "AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>"
|
.asciz "AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>"
|
||||||
.align 64
|
.align 64
|
||||||
|
@ -2843,12 +2880,13 @@ ctr32_se_handler:
|
||||||
cmp %r10,%rbx
|
cmp %r10,%rbx
|
||||||
jae .Lcommon_seh_tail
|
jae .Lcommon_seh_tail
|
||||||
|
|
||||||
lea 0x20(%rax),%rsi # %xmm save area
|
lea (%rax),%rsi # %xmm save area
|
||||||
lea 512($context),%rdi # &context.Xmm6
|
lea 512($context),%rdi # &context.Xmm6
|
||||||
mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
|
mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
|
||||||
.long 0xa548f3fc # cld; rep movsq
|
.long 0xa548f3fc # cld; rep movsq
|
||||||
|
lea 0xa8(%rax),%rax # adjust stack pointer
|
||||||
|
|
||||||
jmp .Lcommon_rbp_tail
|
jmp .Lcommon_seh_tail
|
||||||
.size ctr32_se_handler,.-ctr32_se_handler
|
.size ctr32_se_handler,.-ctr32_se_handler
|
||||||
|
|
||||||
.type xts_se_handler,\@abi-omnipotent
|
.type xts_se_handler,\@abi-omnipotent
|
||||||
|
|
Loading…
Reference in a new issue