aesni-x86_64.pl: optimize CTR even further.

Based on suggestions from Shay Gueron and Vlad Krasnov.
PR: 3021
This commit is contained in:
Andy Polyakov 2013-03-26 14:29:18 +01:00
parent 1da5d3029e
commit 6c79faaa9d

View file

@ -130,7 +130,7 @@
# Further data for other parallelizable modes:
#
# CBC decrypt 1.16 0.93 0.93
# CTR 1.14 0.91 0.86
# CTR 1.14 0.91 0.77
#
# Well, given 3x column it's probably inappropriate to call the limit
# asymptotic, if it can be surpassed, isn't it? What happens there?
@ -160,7 +160,7 @@
######################################################################
# For reference, AMD Bulldozer spends 5.77 cycles per byte processed
# with 128-bit key in CBC encrypt and 0.76 cycles in CBC decrypt, 0.70
# in ECB, 0.73 in CTR, 0.95 in XTS... This means that aes[enc|dec]
# in ECB, 0.71 in CTR, 0.95 in XTS... This means that aes[enc|dec]
# instruction latency is 9 cycles and that they can be issued every
# cycle.
@ -1011,385 +1011,389 @@ ___
# const char *ivec);
#
# Handles only complete blocks, operates on 32-bit counter and
# does not update *ivec! (see engine/eng_aesni.c for details)
# does not update *ivec! (see crypto/modes/ctr128.c for details)
#
# Overhaul based on suggestions from Shay Gueron and Vlad Krasnov,
# http://rt.openssl.org/Ticket/Display.html?id=3021&user=guest&pass=guest.
# The key techniques are full unrolling and modulo-scheduled counter
# calculations, with the zero-round key xor folded into the counter blocks.
{
my ($in0,$in1,$in2,$in3,$one,$ivec)=map("%xmm$_",(10..15));
my $len_="%r9";
my ($in0,$in1,$in2,$in3,$in4,$in5)=map("%xmm$_",(10..15));
my ($key0,$ctr)=("${key_}d","${ivp}d");
my $frame_size = 0x80 + ($win64?160:0);
$code.=<<___;
.globl aesni_ctr32_encrypt_blocks
.type aesni_ctr32_encrypt_blocks,\@function,5
.align 16
aesni_ctr32_encrypt_blocks:
lea (%rsp),%rax
push %rbp
sub \$$frame_size,%rsp
and \$-16,%rsp # Linux kernel stack can be incorrectly seeded
___
$code.=<<___ if ($win64);
lea -0xa8(%rsp),%rsp
movaps %xmm6,0x00(%rsp)
movaps %xmm7,0x10(%rsp)
movaps %xmm8,0x20(%rsp)
movaps %xmm9,0x30(%rsp)
movaps %xmm10,0x40(%rsp)
movaps %xmm11,0x50(%rsp)
movaps %xmm12,0x60(%rsp)
movaps %xmm13,0x70(%rsp)
movaps %xmm14,0x80(%rsp)
movaps %xmm15,0x90(%rsp)
movaps %xmm6,-0xa8(%rax)
movaps %xmm7,-0x98(%rax)
movaps %xmm8,-0x88(%rax)
movaps %xmm9,-0x78(%rax)
movaps %xmm10,-0x68(%rax)
movaps %xmm11,-0x58(%rax)
movaps %xmm12,-0x48(%rax)
movaps %xmm13,-0x38(%rax)
movaps %xmm14,-0x28(%rax)
movaps %xmm15,-0x18(%rax)
.Lctr32_body:
___
$code.=<<___;
lea -8(%rax),%rbp
cmp \$1,$len
je .Lctr32_one_shortcut
movzb 15($ivp),%rax # counter LSB
mov $len,$len_ # backup $len
mov 240($key),$rnds_ # key->rounds
mov $key,$key_ # backup $key
movdqu ($ivp),$ivec
neg %rax
movdqa .Lincrement1(%rip),$one
add \$256,%rax # steps to closest overflow
movdqu ($ivp),$inout0
movdqu ($key),$rndkey0
mov 12($ivp),$ctr # counter LSB
pxor $rndkey0,$inout0
mov 12($key),$key0 # 0-round key LSB
movdqa $inout0,0x00(%rsp) # populate counter block
bswap $ctr
movdqa $inout0,0x10(%rsp)
movdqa $inout0,0x20(%rsp)
movdqa $inout0,0x30(%rsp)
movdqa $inout0,0x40(%rsp)
movdqa $inout0,0x50(%rsp)
movdqa $inout0,0x60(%rsp)
movdqa $inout0,0x70(%rsp)
.Lctr32_grandloop:
cmp %rax,$len
cmova %rax,$len
mov $rnds_,$rounds # restore $rounds
sub $len,$len_
mov 240($key),$rounds # key->rounds
lea 1($ctr),%r9
lea 2($ctr),%r10
bswap %r9d
bswap %r10d
xor $key0,%r9d
xor $key0,%r10d
mov %r9d,0x10+12(%rsp)
lea 3($ctr),%r9
mov %r10d,0x20+12(%rsp)
bswap %r9d
lea 4($ctr),%r10
xor $key0,%r9d
bswap %r10d
mov %r9d,0x30+12(%rsp)
xor $key0,%r10d
lea 5($ctr),%r9
mov %r10d,0x40+12(%rsp)
bswap %r9d
lea 6($ctr),%r10
xor $key0,%r9d
bswap %r10d
mov %r9d,0x50+12(%rsp)
xor $key0,%r10d
lea 7($ctr),%r9
mov %r10d,0x60+12(%rsp)
bswap %r9d
xor $key0,%r9d
mov %r9d,0x70+12(%rsp)
$movkey 0x10($key),$rndkey1
movdqa 0x10(%rsp),$inout1
movdqa 0x20(%rsp),$inout2
movdqa 0x30(%rsp),$inout3
movdqa 0x40(%rsp),$inout4
movdqa 0x50(%rsp),$inout5
cmp \$8,$len
jb .Lctr32_tail
$movkey ($key_),$rndkey0
shr \$1,$rounds
shr \$1,$rnds_
movdqa $rndkey0,$inout0
movdqa $rndkey0,$inout1
movdqa $rndkey0,$inout2
movdqa $rndkey0,$inout3
movdqa $rndkey0,$inout4
movdqa $rndkey0,$inout5
movdqa $rndkey0,$inout6
movdqa $rndkey0,$inout7
$movkey 16($key_),$rndkey1
lea 0x80($key),$key # size optimization
sub \$8,$len
jmp .Lctr32_loop8
.align 16
.align 32
.Lctr32_loop8:
pxor $ivec,$inout0
paddb $one,$ivec
aesenc $rndkey1,$inout0
pxor $ivec,$inout1
paddb $one,$ivec
lea 32($key_),$key
aesenc $rndkey1,$inout1
pxor $ivec,$inout2
paddb $one,$ivec
aesenc $rndkey1,$inout2
pxor $ivec,$inout3
paddb $one,$ivec
aesenc $rndkey1,$inout3
pxor $ivec,$inout4
paddb $one,$ivec
aesenc $rndkey1,$inout4
pxor $ivec,$inout5
paddb $one,$ivec
aesenc $rndkey1,$inout5
pxor $ivec,$inout6
paddb $one,$ivec
$movkey ($key),$rndkey0
aesenc $rndkey1,$inout6
pxor $ivec,$inout7
paddb $one,$ivec
dec $rounds
aesenc $rndkey1,$inout7
$movkey 16($key),$rndkey1
add \$8,$ctr
movdqa 0x60(%rsp),$inout6
aesenc $rndkey1,$inout0
mov $ctr,%r9d
movdqa 0x70(%rsp),$inout7
aesenc $rndkey1,$inout1
bswap %r9d
$movkey 0x20-0x80($key),$rndkey0
aesenc $rndkey1,$inout2
xor $key0,%r9d
aesenc $rndkey1,$inout3
mov %r9d,0x00+12(%rsp)
lea 1($ctr),%r9
aesenc $rndkey1,$inout4
aesenc $rndkey1,$inout5
aesenc $rndkey1,$inout6
aesenc $rndkey1,$inout7
$movkey 0x30-0x80($key),$rndkey1
___
for($i=2;$i<8;$i++) {
my $rndkeyx = ($i&1)?$rndkey1:$rndkey0;
$code.=<<___;
aesenc $rndkeyx,$inout0
aesenc $rndkeyx,$inout1
bswap %r9d
aesenc $rndkeyx,$inout2
xor $key0,%r9d
aesenc $rndkeyx,$inout3
mov %r9d,`0x10*($i-1)`+12(%rsp)
lea $i($ctr),%r9
aesenc $rndkeyx,$inout4
aesenc $rndkeyx,$inout5
aesenc $rndkeyx,$inout6
aesenc $rndkeyx,$inout7
$movkey `0x20+0x10*$i`-0x80($key),$rndkeyx
___
}
$code.=<<___;
aesenc $rndkey0,$inout0
aesenc $rndkey0,$inout1
lea 32($key),$key
bswap %r9d
aesenc $rndkey0,$inout2
movups ($inp),$in0 # load input
xor $key0,%r9d
aesenc $rndkey0,$inout3
movups 0x10($inp),$in1
mov %r9d,0x70+12(%rsp)
aesenc $rndkey0,$inout4
movups 0x20($inp),$in2
aesenc $rndkey0,$inout5
movups 0x30($inp),$in3
aesenc $rndkey0,$inout6
movups 0x40($inp),$one
movdqu 0x00($inp),$in0
aesenc $rndkey0,$inout7
$movkey ($key),$rndkey0
$movkey 0xa0-0x80($key),$rndkey0
cmp \$11,$rounds
jb .Lctr32_enc_done
.Lctr32_enc_loop8:
aesenc $rndkey1,$inout0
aesenc $rndkey1,$inout1
dec $rounds
aesenc $rndkey1,$inout2
aesenc $rndkey1,$inout3
aesenc $rndkey1,$inout4
aesenc $rndkey1,$inout5
aesenc $rndkey1,$inout6
aesenc $rndkey1,$inout7
$movkey 16($key),$rndkey1
$movkey 0xb0-0x80($key),$rndkey1
aesenc $rndkey0,$inout0
aesenc $rndkey0,$inout1
lea 32($key),$key
aesenc $rndkey0,$inout2
aesenc $rndkey0,$inout3
aesenc $rndkey0,$inout4
aesenc $rndkey0,$inout5
aesenc $rndkey0,$inout6
aesenc $rndkey0,$inout7
$movkey ($key),$rndkey0
jnz .Lctr32_enc_loop8
$movkey 0xc0-0x80($key),$rndkey0
je .Lctr32_enc_done
aesenc $rndkey1,$inout0
aesenc $rndkey1,$inout1
aesenc $rndkey1,$inout2
aesenc $rndkey1,$inout3
aesenc $rndkey1,$inout4
aesenc $rndkey1,$inout5
aesenc $rndkey1,$inout6
aesenc $rndkey1,$inout7
$movkey 0xd0-0x80($key),$rndkey1
aesenc $rndkey0,$inout0
aesenc $rndkey0,$inout1
aesenc $rndkey0,$inout2
aesenc $rndkey0,$inout3
aesenc $rndkey0,$inout4
aesenc $rndkey0,$inout5
aesenc $rndkey0,$inout6
aesenc $rndkey0,$inout7
$movkey 0xe0-0x80($key),$rndkey0
.Lctr32_enc_done:
aesenc $rndkey1,$inout0
movdqu 0x10($inp),$in1
pxor $rndkey0,$in0
aesenc $rndkey1,$inout1
movdqu 0x20($inp),$in2
pxor $rndkey0,$in1
aesenc $rndkey1,$inout2
movdqu 0x30($inp),$in3
pxor $rndkey0,$in2
aesenc $rndkey1,$inout3
movdqu 0x40($inp),$in4
pxor $rndkey0,$in3
aesenc $rndkey1,$inout4
pxor $rndkey0,$one
movdqu 0x50($inp),$in5
pxor $rndkey0,$in4
aesenc $rndkey1,$inout5
pxor $rndkey0,$in5
aesenc $rndkey1,$inout6
aesenc $rndkey1,$inout7
movdqu 0x50($inp),$rndkey1
movdqu 0x60($inp),$rndkey1
aesenclast $in0,$inout0
movdqu 0x60($inp),$in0
pxor $rndkey0,$rndkey1
aesenclast $in1,$inout1
movdqu 0x70($inp),$in1
pxor $rndkey0,$in0
aesenclast $in2,$inout2
pxor $rndkey0,$in1
$movkey ($key_),$rndkey0
aesenclast $in3,$inout3
movdqu 0x70($inp),$in0
lea 0x80($inp),$inp
aesenclast $one,$inout4
movdqa .Lincrement1(%rip),$one
aesenclast $rndkey1,$inout5
$movkey 16($key_),$rndkey1
aesenclast $in0,$inout6
aesenclast $in1,$inout7
aesenclast $in1,$inout1
pxor $rndkey0,$in0
movdqa 0x00(%rsp),$in1 # load next counter block
aesenclast $in2,$inout2
movdqa 0x10(%rsp),$in2
aesenclast $in3,$inout3
movdqa 0x20(%rsp),$in3
aesenclast $in4,$inout4
movdqa 0x30(%rsp),$in4
aesenclast $in5,$inout5
movdqa 0x40(%rsp),$in5
aesenclast $rndkey1,$inout6
movdqa 0x50(%rsp),$rndkey0
aesenclast $in0,$inout7
$movkey 0x10-0x80($key),$rndkey1
movups $inout0,($out) # store output
movdqa $rndkey0,$inout0
movdqa $in1,$inout0
movups $inout1,0x10($out)
movdqa $rndkey0,$inout1
movdqa $in2,$inout1
movups $inout2,0x20($out)
movdqa $rndkey0,$inout2
movdqa $in3,$inout2
movups $inout3,0x30($out)
movdqa $rndkey0,$inout3
movdqa $in4,$inout3
movups $inout4,0x40($out)
movdqa $rndkey0,$inout4
movdqa $in5,$inout4
movups $inout5,0x50($out)
movdqa $rndkey0,$inout5
movups $inout6,0x60($out)
movdqa $rndkey0,$inout6
movups $inout7,0x70($out)
movdqa $rndkey0,$inout7
lea 0x80($out),$out
mov $rnds_,$rounds
sub \$8,$len
jnc .Lctr32_loop8
lea 1($rounds,$rounds),$rounds # restore original value
lea 1($rnds_,$rnds_),$rnds_ # restore original value
add \$8,$len
jz .Lctr32_done
lea -0x80($key),$key
.Lctr32_tail:
mov $key_,$key # restore $key
movdqa $ivec,$inout0
paddb $one,$ivec
movups ($inp),$in0
cmp \$2,$len
jb .Lctr32_one
movdqa $ivec,$inout1
paddb $one,$ivec
movups 0x10($inp),$in1
je .Lctr32_two
movdqa $ivec,$inout2
paddb $one,$ivec
movups 0x20($inp),$in2
lea 16($key),$key
cmp \$4,$len
jb .Lctr32_three
jbe .Lctr32_loop4
movdqa $ivec,$inout3
paddb $one,$ivec
movdqa 0x60(%rsp),$inout6
$movkey 16($key),$rndkey0
aesenc $rndkey1,$inout0
lea 16($key),$key
aesenc $rndkey1,$inout1
shr \$1,$rounds
aesenc $rndkey1,$inout2
dec $rounds
aesenc $rndkey1,$inout3
aesenc $rndkey1,$inout4
aesenc $rndkey1,$inout5
aesenc $rndkey1,$inout6
pxor $inout7,$inout7
$movkey 16($key),$rndkey1
call .Lenc_loop8_enter
movups ($inp),$in0
movups 0x10($inp),$in1
movups 0x20($inp),$in2
xorps $in0,$inout0
movups 0x30($inp),$in3
je .Lctr32_four
movdqa $ivec,$inout4
paddb $one,$ivec
xorps $in1,$inout1
movups 0x40($inp),$in0
xorps $in2,$inout2
movups $inout0,($out)
xorps $in3,$inout3
movups $inout1,0x10($out)
xorps $in0,$inout4
movups $inout2,0x20($out)
movups $inout3,0x30($out)
movups $inout4,0x40($out)
cmp \$6,$len
jb .Lctr32_five
jb .Lctr32_done
movdqa $ivec,$inout5
paddb $one,$ivec
je .Lctr32_six
movups 0x50($inp),$in1
xorps $in1,$inout5
movups $inout5,0x50($out)
je .Lctr32_done
movdqa $ivec,$inout6
paddb $one,$ivec
xorps $inout7,$inout7
movups 0x60($inp),$in2
xorps $in2,$inout6
movups $inout6,0x60($out)
jmp .Lctr32_done
call _aesni_encrypt8
.align 32
.Lctr32_loop4:
aesenc $rndkey1,$inout0
lea 16($key),$key
aesenc $rndkey1,$inout1
aesenc $rndkey1,$inout2
aesenc $rndkey1,$inout3
$movkey ($key),$rndkey1
dec $rounds
jnz .Lctr32_loop4
aesenclast $rndkey1,$inout0
aesenclast $rndkey1,$inout1
aesenclast $rndkey1,$inout2
aesenclast $rndkey1,$inout3
xorps $in0,$inout0 # xor
movups 0x40($inp),$in0
xorps $in1,$inout1
movups 0x50($inp),$in1
xorps $in2,$inout2
movups 0x60($inp),$in2
lea 0x70($inp),$inp
xorps $in3,$inout3
movups $inout0,($out) # store output
xorps $in0,$inout4
movups $inout1,0x10($out)
xorps $in1,$inout5
movups $inout2,0x20($out)
xorps $in2,$inout6
movups $inout3,0x30($out)
movups $inout4,0x40($out)
movups $inout5,0x50($out)
movups $inout6,0x60($out)
lea 0x70($out),$out
movups ($inp),$in0
xorps $in0,$inout0
movups $inout0,($out)
cmp \$2,$len
jb .Lctr32_done
movups 0x10($inp),$in1
xorps $in1,$inout1
movups $inout1,0x10($out)
je .Lctr32_done
movups 0x20($inp),$in2
xorps $in2,$inout2
movups $inout2,0x20($out)
cmp \$4,$len
jb .Lctr32_done
movups 0x30($inp),$in3
xorps $in3,$inout3
movups $inout3,0x30($out)
jmp .Lctr32_done
.align 16
.Lctr32_one_shortcut:
movups ($ivp),$inout0
xor $len_,$len_
movups ($inp),$in0
mov 240($key),$rounds # key->rounds
.Lctr32_one:
___
&aesni_generate1("enc",$key,$rounds);
$code.=<<___;
xorps $in0,$inout0
lea 0x10($inp),$inp
movups $inout0,($out)
lea 0x10($out),$out
jmp .Lctr32_done
.align 16
.Lctr32_two:
xorps $inout2,$inout2
call _aesni_encrypt3
xorps $in0,$inout0 # xor
lea 0x20($inp),$inp
xorps $in1,$inout1
movups $inout0,($out) # store output
movups $inout1,0x10($out)
lea 0x20($out),$out
jmp .Lctr32_done
.align 16
.Lctr32_three:
call _aesni_encrypt3
xorps $in0,$inout0 # xor
lea 0x30($inp),$inp
xorps $in1,$inout1
movups $inout0,($out) # store output
xorps $in2,$inout2
movups $inout1,0x10($out)
movups $inout2,0x20($out)
lea 0x30($out),$out
jmp .Lctr32_done
.align 16
.Lctr32_four:
call _aesni_encrypt4
xorps $in0,$inout0 # xor
lea 0x40($inp),$inp
xorps $in1,$inout1
movups $inout0,($out) # store output
xorps $in2,$inout2
movups $inout1,0x10($out)
xorps $in3,$inout3
movups $inout2,0x20($out)
movups $inout3,0x30($out)
lea 0x40($out),$out
jmp .Lctr32_done
.align 16
.Lctr32_five:
xorps $inout5,$inout5
call _aesni_encrypt6
xorps $in0,$inout0 # xor
movups 0x40($inp),$in0
lea 0x50($inp),$inp
xorps $in1,$inout1
movups $inout0,($out) # store output
xorps $in2,$inout2
movups $inout1,0x10($out)
xorps $in3,$inout3
movups $inout2,0x20($out)
xorps $in0,$inout4
movups $inout3,0x30($out)
movups $inout4,0x40($out)
lea 0x50($out),$out
jmp .Lctr32_done
.align 16
.Lctr32_six:
call _aesni_encrypt6
xorps $in0,$inout0 # xor
movups 0x40($inp),$in0
xorps $in1,$inout1
movups 0x50($inp),$in1
lea 0x60($inp),$inp
xorps $in2,$inout2
movups $inout0,($out) # store output
xorps $in3,$inout3
movups $inout1,0x10($out)
xorps $in0,$inout4
movups $inout2,0x20($out)
xorps $in1,$inout5
movups $inout3,0x30($out)
movups $inout4,0x40($out)
movups $inout5,0x50($out)
lea 0x60($out),$out
.Lctr32_done:
test $len_,$len_
jz .Lctr32_really_done
movdqa .Lbswap_mask(%rip),$rndkey1
pshufb $rndkey1,$ivec
psrldq \$14,$one # 256
paddd $one,$ivec
pslldq \$14,$one
pshufb $rndkey1,$ivec
mov $len_,$len
mov \$256,%rax
jmp .Lctr32_grandloop
.Lctr32_really_done:
___
$code.=<<___ if ($win64);
movaps 0x00(%rsp),%xmm6
movaps 0x10(%rsp),%xmm7
movaps 0x20(%rsp),%xmm8
movaps 0x30(%rsp),%xmm9
movaps 0x40(%rsp),%xmm10
movaps 0x50(%rsp),%xmm11
movaps 0x60(%rsp),%xmm12
movaps 0x70(%rsp),%xmm13
movaps 0x80(%rsp),%xmm14
movaps 0x90(%rsp),%xmm15
lea 0xa8(%rsp),%rsp
movaps -0xa0(%rbp),%xmm6
movaps -0x90(%rbp),%xmm7
movaps -0x80(%rbp),%xmm8
movaps -0x70(%rbp),%xmm9
movaps -0x60(%rbp),%xmm10
movaps -0x50(%rbp),%xmm11
movaps -0x40(%rbp),%xmm12
movaps -0x30(%rbp),%xmm13
movaps -0x20(%rbp),%xmm14
movaps -0x10(%rbp),%xmm15
___
$code.=<<___;
.Lctr32_ret:
lea (%rbp),%rsp
pop %rbp
.Lctr32_epilogue:
ret
.size aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks
___
@ -1417,16 +1421,16 @@ aesni_xts_encrypt:
and \$-16,%rsp # Linux kernel stack can be incorrectly seeded
___
$code.=<<___ if ($win64);
movaps %xmm6,0x60(%rsp)
movaps %xmm7,0x70(%rsp)
movaps %xmm8,0x80(%rsp)
movaps %xmm9,0x90(%rsp)
movaps %xmm10,0xa0(%rsp)
movaps %xmm11,0xb0(%rsp)
movaps %xmm12,0xc0(%rsp)
movaps %xmm13,0xd0(%rsp)
movaps %xmm14,0xe0(%rsp)
movaps %xmm15,0xf0(%rsp)
movaps %xmm6,-0xa8(%rax)
movaps %xmm7,-0x98(%rax)
movaps %xmm8,-0x88(%rax)
movaps %xmm9,-0x78(%rax)
movaps %xmm10,-0x68(%rax)
movaps %xmm11,-0x58(%rax)
movaps %xmm12,-0x48(%rax)
movaps %xmm13,-0x38(%rax)
movaps %xmm14,-0x28(%rax)
movaps %xmm15,-0x18(%rax)
.Lxts_enc_body:
___
$code.=<<___;
@ -1782,16 +1786,16 @@ $code.=<<___;
.Lxts_enc_ret:
___
$code.=<<___ if ($win64);
movaps 0x60(%rsp),%xmm6
movaps 0x70(%rsp),%xmm7
movaps 0x80(%rsp),%xmm8
movaps 0x90(%rsp),%xmm9
movaps 0xa0(%rsp),%xmm10
movaps 0xb0(%rsp),%xmm11
movaps 0xc0(%rsp),%xmm12
movaps 0xd0(%rsp),%xmm13
movaps 0xe0(%rsp),%xmm14
movaps 0xf0(%rsp),%xmm15
movaps -0xa0(%rbp),%xmm6
movaps -0x90(%rbp),%xmm7
movaps -0x80(%rbp),%xmm8
movaps -0x70(%rbp),%xmm9
movaps -0x60(%rbp),%xmm10
movaps -0x50(%rbp),%xmm11
movaps -0x40(%rbp),%xmm12
movaps -0x30(%rbp),%xmm13
movaps -0x20(%rbp),%xmm14
movaps -0x10(%rbp),%xmm15
___
$code.=<<___;
lea (%rbp),%rsp
@ -1812,16 +1816,16 @@ aesni_xts_decrypt:
and \$-16,%rsp # Linux kernel stack can be incorrectly seeded
___
$code.=<<___ if ($win64);
movaps %xmm6,0x60(%rsp)
movaps %xmm7,0x70(%rsp)
movaps %xmm8,0x80(%rsp)
movaps %xmm9,0x90(%rsp)
movaps %xmm10,0xa0(%rsp)
movaps %xmm11,0xb0(%rsp)
movaps %xmm12,0xc0(%rsp)
movaps %xmm13,0xd0(%rsp)
movaps %xmm14,0xe0(%rsp)
movaps %xmm15,0xf0(%rsp)
movaps %xmm6,-0xa8(%rax)
movaps %xmm7,-0x98(%rax)
movaps %xmm8,-0x88(%rax)
movaps %xmm9,-0x78(%rax)
movaps %xmm10,-0x68(%rax)
movaps %xmm11,-0x58(%rax)
movaps %xmm12,-0x48(%rax)
movaps %xmm13,-0x38(%rax)
movaps %xmm14,-0x28(%rax)
movaps %xmm15,-0x18(%rax)
.Lxts_dec_body:
___
$code.=<<___;
@ -2213,16 +2217,16 @@ $code.=<<___;
.Lxts_dec_ret:
___
$code.=<<___ if ($win64);
movaps 0x60(%rsp),%xmm6
movaps 0x70(%rsp),%xmm7
movaps 0x80(%rsp),%xmm8
movaps 0x90(%rsp),%xmm9
movaps 0xa0(%rsp),%xmm10
movaps 0xb0(%rsp),%xmm11
movaps 0xc0(%rsp),%xmm12
movaps 0xd0(%rsp),%xmm13
movaps 0xe0(%rsp),%xmm14
movaps 0xf0(%rsp),%xmm15
movaps -0xa0(%rbp),%xmm6
movaps -0x90(%rbp),%xmm7
movaps -0x80(%rbp),%xmm8
movaps -0x70(%rbp),%xmm9
movaps -0x60(%rbp),%xmm10
movaps -0x50(%rbp),%xmm11
movaps -0x40(%rbp),%xmm12
movaps -0x30(%rbp),%xmm13
movaps -0x20(%rbp),%xmm14
movaps -0x10(%rbp),%xmm15
___
$code.=<<___;
lea (%rbp),%rsp
@ -2914,45 +2918,9 @@ ccm64_se_handler:
jmp .Lcommon_seh_tail
.size ccm64_se_handler,.-ccm64_se_handler
.type ctr32_se_handler,\@abi-omnipotent
.type ctr_xts_se_handler,\@abi-omnipotent
.align 16
ctr32_se_handler:
push %rsi
push %rdi
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
pushfq
sub \$64,%rsp
mov 120($context),%rax # pull context->Rax
mov 248($context),%rbx # pull context->Rip
lea .Lctr32_body(%rip),%r10
cmp %r10,%rbx # context->Rip<"prologue" label
jb .Lcommon_seh_tail
mov 152($context),%rax # pull context->Rsp
lea .Lctr32_ret(%rip),%r10
cmp %r10,%rbx
jae .Lcommon_seh_tail
lea (%rax),%rsi # %xmm save area
lea 512($context),%rdi # &context.Xmm6
mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
.long 0xa548f3fc # cld; rep movsq
lea 0xa8(%rax),%rax # adjust stack pointer
jmp .Lcommon_seh_tail
.size ctr32_se_handler,.-ctr32_se_handler
.type xts_se_handler,\@abi-omnipotent
.align 16
xts_se_handler:
ctr_xts_se_handler:
push %rsi
push %rdi
push %rbx
@ -2982,13 +2950,14 @@ xts_se_handler:
cmp %r10,%rbx # context->Rip>=epilogue label
jae .Lcommon_seh_tail
lea 0x60(%rax),%rsi # %xmm save area
mov 160($context),%rax # pull context->Rbp
lea -0xa0(%rax),%rsi # %xmm save area
lea 512($context),%rdi # & context.Xmm6
mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
.long 0xa548f3fc # cld; rep movsq
jmp .Lcommon_rbp_tail
.size xts_se_handler,.-xts_se_handler
.size ctr_xts_se_handler,.-ctr_xts_se_handler
___
$code.=<<___;
.type cbc_se_handler,\@abi-omnipotent
@ -3132,14 +3101,15 @@ $code.=<<___ if ($PREFIX eq "aesni");
.rva .Lccm64_dec_body,.Lccm64_dec_ret # HandlerData[]
.LSEH_info_ctr32:
.byte 9,0,0,0
.rva ctr32_se_handler
.rva ctr_xts_se_handler
.rva .Lctr32_body,.Lctr32_epilogue # HandlerData[]
.LSEH_info_xts_enc:
.byte 9,0,0,0
.rva xts_se_handler
.rva ctr_xts_se_handler
.rva .Lxts_enc_body,.Lxts_enc_epilogue # HandlerData[]
.LSEH_info_xts_dec:
.byte 9,0,0,0
.rva xts_se_handler
.rva ctr_xts_se_handler
.rva .Lxts_dec_body,.Lxts_dec_epilogue # HandlerData[]
___
$code.=<<___;