x86_64 assembly pack: Win64 SEH face-lift.
- harmonize handlers with guidelines and with each other;
- fix some bugs in handlers;
- add missing handlers in chacha and ecp_nistz256 modules.

Reviewed-by: Rich Salz <rsalz@openssl.org>
This commit is contained in:
parent
e1dbf7f431
commit
384e6de4c7
14 changed files with 1267 additions and 615 deletions
|
@ -599,6 +599,7 @@ $code.=<<___;
|
|||
.hidden asm_AES_encrypt
|
||||
asm_AES_encrypt:
|
||||
AES_encrypt:
|
||||
mov %rsp,%rax
|
||||
push %rbx
|
||||
push %rbp
|
||||
push %r12
|
||||
|
@ -607,7 +608,6 @@ AES_encrypt:
|
|||
push %r15
|
||||
|
||||
# allocate frame "above" key schedule
|
||||
mov %rsp,%r10
|
||||
lea -63(%rdx),%rcx # %rdx is key argument
|
||||
and \$-64,%rsp
|
||||
sub %rsp,%rcx
|
||||
|
@ -617,7 +617,7 @@ AES_encrypt:
|
|||
sub \$32,%rsp
|
||||
|
||||
mov %rsi,16(%rsp) # save out
|
||||
mov %r10,24(%rsp) # save real stack pointer
|
||||
mov %rax,24(%rsp) # save original stack pointer
|
||||
.Lenc_prologue:
|
||||
|
||||
mov %rdx,$key
|
||||
|
@ -649,13 +649,13 @@ AES_encrypt:
|
|||
mov $s2,8($out)
|
||||
mov $s3,12($out)
|
||||
|
||||
mov (%rsi),%r15
|
||||
mov 8(%rsi),%r14
|
||||
mov 16(%rsi),%r13
|
||||
mov 24(%rsi),%r12
|
||||
mov 32(%rsi),%rbp
|
||||
mov 40(%rsi),%rbx
|
||||
lea 48(%rsi),%rsp
|
||||
mov -48(%rsi),%r15
|
||||
mov -40(%rsi),%r14
|
||||
mov -32(%rsi),%r13
|
||||
mov -24(%rsi),%r12
|
||||
mov -16(%rsi),%rbp
|
||||
mov -8(%rsi),%rbx
|
||||
lea (%rsi),%rsp
|
||||
.Lenc_epilogue:
|
||||
ret
|
||||
.size AES_encrypt,.-AES_encrypt
|
||||
|
@ -1197,6 +1197,7 @@ $code.=<<___;
|
|||
.hidden asm_AES_decrypt
|
||||
asm_AES_decrypt:
|
||||
AES_decrypt:
|
||||
mov %rsp,%rax
|
||||
push %rbx
|
||||
push %rbp
|
||||
push %r12
|
||||
|
@ -1205,7 +1206,6 @@ AES_decrypt:
|
|||
push %r15
|
||||
|
||||
# allocate frame "above" key schedule
|
||||
mov %rsp,%r10
|
||||
lea -63(%rdx),%rcx # %rdx is key argument
|
||||
and \$-64,%rsp
|
||||
sub %rsp,%rcx
|
||||
|
@ -1215,7 +1215,7 @@ AES_decrypt:
|
|||
sub \$32,%rsp
|
||||
|
||||
mov %rsi,16(%rsp) # save out
|
||||
mov %r10,24(%rsp) # save real stack pointer
|
||||
mov %rax,24(%rsp) # save original stack pointer
|
||||
.Ldec_prologue:
|
||||
|
||||
mov %rdx,$key
|
||||
|
@ -1249,13 +1249,13 @@ AES_decrypt:
|
|||
mov $s2,8($out)
|
||||
mov $s3,12($out)
|
||||
|
||||
mov (%rsi),%r15
|
||||
mov 8(%rsi),%r14
|
||||
mov 16(%rsi),%r13
|
||||
mov 24(%rsi),%r12
|
||||
mov 32(%rsi),%rbp
|
||||
mov 40(%rsi),%rbx
|
||||
lea 48(%rsi),%rsp
|
||||
mov -48(%rsi),%r15
|
||||
mov -40(%rsi),%r14
|
||||
mov -32(%rsi),%r13
|
||||
mov -24(%rsi),%r12
|
||||
mov -16(%rsi),%rbp
|
||||
mov -8(%rsi),%rbx
|
||||
lea (%rsi),%rsp
|
||||
.Ldec_epilogue:
|
||||
ret
|
||||
.size AES_decrypt,.-AES_decrypt
|
||||
|
@ -1675,10 +1675,9 @@ AES_cbc_encrypt:
|
|||
mov %r9d,%r9d # clear upper half of enc
|
||||
|
||||
lea .LAES_Te(%rip),$sbox
|
||||
lea .LAES_Td(%rip),%r10
|
||||
cmp \$0,%r9
|
||||
jne .Lcbc_picked_te
|
||||
lea .LAES_Td(%rip),$sbox
|
||||
.Lcbc_picked_te:
|
||||
cmoveq %r10,$sbox
|
||||
|
||||
mov OPENSSL_ia32cap_P(%rip),%r10d
|
||||
cmp \$$speed_limit,%rdx
|
||||
|
@ -2580,7 +2579,6 @@ block_se_handler:
|
|||
jae .Lin_block_prologue
|
||||
|
||||
mov 24(%rax),%rax # pull saved real stack pointer
|
||||
lea 48(%rax),%rax # adjust...
|
||||
|
||||
mov -8(%rax),%rbx
|
||||
mov -16(%rax),%rbp
|
||||
|
|
|
@ -341,13 +341,13 @@ $code.=<<___;
|
|||
${func}_xop:
|
||||
.Lxop_shortcut:
|
||||
mov `($win64?56:8)`(%rsp),$in0 # load 7th parameter
|
||||
mov %rsp,%rax # copy %rsp
|
||||
push %rbx
|
||||
push %rbp
|
||||
push %r12
|
||||
push %r13
|
||||
push %r14
|
||||
push %r15
|
||||
mov %rsp,%r11 # copy %rsp
|
||||
sub \$`$framesz+$win64*16*10`,%rsp
|
||||
and \$-64,%rsp # align stack frame
|
||||
|
||||
|
@ -363,7 +363,7 @@ ${func}_xop:
|
|||
mov $ivp,$_ivp
|
||||
mov $ctx,$_ctx
|
||||
mov $in0,$_in0
|
||||
mov %r11,$_rsp
|
||||
mov %rax,$_rsp
|
||||
___
|
||||
$code.=<<___ if ($win64);
|
||||
movaps %xmm6,`$framesz+16*0`(%rsp)
|
||||
|
@ -617,13 +617,13 @@ $code.=<<___ if ($win64);
|
|||
movaps `$framesz+16*9`(%rsp),%xmm15
|
||||
___
|
||||
$code.=<<___;
|
||||
mov (%rsi),%r15
|
||||
mov 8(%rsi),%r14
|
||||
mov 16(%rsi),%r13
|
||||
mov 24(%rsi),%r12
|
||||
mov 32(%rsi),%rbp
|
||||
mov 40(%rsi),%rbx
|
||||
lea 48(%rsi),%rsp
|
||||
mov -48(%rsi),%r15
|
||||
mov -40(%rsi),%r14
|
||||
mov -32(%rsi),%r13
|
||||
mov -24(%rsi),%r12
|
||||
mov -16(%rsi),%rbp
|
||||
mov -8(%rsi),%rbx
|
||||
lea (%rsi),%rsp
|
||||
.Lepilogue_xop:
|
||||
ret
|
||||
.size ${func}_xop,.-${func}_xop
|
||||
|
@ -639,13 +639,13 @@ $code.=<<___;
|
|||
${func}_avx:
|
||||
.Lavx_shortcut:
|
||||
mov `($win64?56:8)`(%rsp),$in0 # load 7th parameter
|
||||
mov %rsp,%rax # copy %rsp
|
||||
push %rbx
|
||||
push %rbp
|
||||
push %r12
|
||||
push %r13
|
||||
push %r14
|
||||
push %r15
|
||||
mov %rsp,%r11 # copy %rsp
|
||||
sub \$`$framesz+$win64*16*10`,%rsp
|
||||
and \$-64,%rsp # align stack frame
|
||||
|
||||
|
@ -661,7 +661,7 @@ ${func}_avx:
|
|||
mov $ivp,$_ivp
|
||||
mov $ctx,$_ctx
|
||||
mov $in0,$_in0
|
||||
mov %r11,$_rsp
|
||||
mov %rax,$_rsp
|
||||
___
|
||||
$code.=<<___ if ($win64);
|
||||
movaps %xmm6,`$framesz+16*0`(%rsp)
|
||||
|
@ -868,13 +868,13 @@ $code.=<<___ if ($win64);
|
|||
movaps `$framesz+16*9`(%rsp),%xmm15
|
||||
___
|
||||
$code.=<<___;
|
||||
mov (%rsi),%r15
|
||||
mov 8(%rsi),%r14
|
||||
mov 16(%rsi),%r13
|
||||
mov 24(%rsi),%r12
|
||||
mov 32(%rsi),%rbp
|
||||
mov 40(%rsi),%rbx
|
||||
lea 48(%rsi),%rsp
|
||||
mov -48(%rsi),%r15
|
||||
mov -40(%rsi),%r14
|
||||
mov -32(%rsi),%r13
|
||||
mov -24(%rsi),%r12
|
||||
mov -16(%rsi),%rbp
|
||||
mov -8(%rsi),%rbx
|
||||
lea (%rsi),%rsp
|
||||
.Lepilogue_avx:
|
||||
ret
|
||||
.size ${func}_avx,.-${func}_avx
|
||||
|
@ -935,13 +935,13 @@ $code.=<<___;
|
|||
${func}_avx2:
|
||||
.Lavx2_shortcut:
|
||||
mov `($win64?56:8)`(%rsp),$in0 # load 7th parameter
|
||||
mov %rsp,%rax # copy %rsp
|
||||
push %rbx
|
||||
push %rbp
|
||||
push %r12
|
||||
push %r13
|
||||
push %r14
|
||||
push %r15
|
||||
mov %rsp,%r11 # copy %rsp
|
||||
sub \$`2*$SZ*$rounds+8*8+$win64*16*10`,%rsp
|
||||
and \$-256*$SZ,%rsp # align stack frame
|
||||
add \$`2*$SZ*($rounds-8)`,%rsp
|
||||
|
@ -958,7 +958,7 @@ ${func}_avx2:
|
|||
mov $ivp,$_ivp
|
||||
mov $ctx,$_ctx
|
||||
mov $in0,$_in0
|
||||
mov %r11,$_rsp
|
||||
mov %rax,$_rsp
|
||||
___
|
||||
$code.=<<___ if ($win64);
|
||||
movaps %xmm6,`$framesz+16*0`(%rsp)
|
||||
|
@ -1205,13 +1205,13 @@ $code.=<<___ if ($win64);
|
|||
movaps `$framesz+16*9`(%rsp),%xmm15
|
||||
___
|
||||
$code.=<<___;
|
||||
mov (%rsi),%r15
|
||||
mov 8(%rsi),%r14
|
||||
mov 16(%rsi),%r13
|
||||
mov 24(%rsi),%r12
|
||||
mov 32(%rsi),%rbp
|
||||
mov 40(%rsi),%rbx
|
||||
lea 48(%rsi),%rsp
|
||||
mov -48(%rsi),%r15
|
||||
mov -40(%rsi),%r14
|
||||
mov -32(%rsi),%r13
|
||||
mov -24(%rsi),%r12
|
||||
mov -16(%rsi),%rbp
|
||||
mov -8(%rsi),%rbx
|
||||
lea (%rsi),%rsp
|
||||
.Lepilogue_avx2:
|
||||
ret
|
||||
.size ${func}_avx2,.-${func}_avx2
|
||||
|
@ -1569,7 +1569,6 @@ ___
|
|||
$code.=<<___;
|
||||
mov %rax,%rsi # put aside Rsp
|
||||
mov 16*$SZ+7*8(%rax),%rax # pull $_rsp
|
||||
lea 48(%rax),%rax
|
||||
|
||||
mov -8(%rax),%rbx
|
||||
mov -16(%rax),%rbp
|
||||
|
|
|
@ -1172,7 +1172,7 @@ ___
|
|||
# with zero-round key xor.
|
||||
{
|
||||
my ($in0,$in1,$in2,$in3,$in4,$in5)=map("%xmm$_",(10..15));
|
||||
my ($key0,$ctr)=("${key_}d","${ivp}d");
|
||||
my ($key0,$ctr)=("%ebp","${ivp}d");
|
||||
my $frame_size = 0x80 + ($win64?160:0);
|
||||
|
||||
$code.=<<___;
|
||||
|
@ -1201,26 +1201,25 @@ $code.=<<___;
|
|||
|
||||
.align 16
|
||||
.Lctr32_bulk:
|
||||
lea (%rsp),%rax
|
||||
lea (%rsp),$key_ # use $key_ as frame pointer
|
||||
push %rbp
|
||||
sub \$$frame_size,%rsp
|
||||
and \$-16,%rsp # Linux kernel stack can be incorrectly seeded
|
||||
___
|
||||
$code.=<<___ if ($win64);
|
||||
movaps %xmm6,-0xa8(%rax) # offload everything
|
||||
movaps %xmm7,-0x98(%rax)
|
||||
movaps %xmm8,-0x88(%rax)
|
||||
movaps %xmm9,-0x78(%rax)
|
||||
movaps %xmm10,-0x68(%rax)
|
||||
movaps %xmm11,-0x58(%rax)
|
||||
movaps %xmm12,-0x48(%rax)
|
||||
movaps %xmm13,-0x38(%rax)
|
||||
movaps %xmm14,-0x28(%rax)
|
||||
movaps %xmm15,-0x18(%rax)
|
||||
movaps %xmm6,-0xa8($key_) # offload everything
|
||||
movaps %xmm7,-0x98($key_)
|
||||
movaps %xmm8,-0x88($key_)
|
||||
movaps %xmm9,-0x78($key_)
|
||||
movaps %xmm10,-0x68($key_)
|
||||
movaps %xmm11,-0x58($key_)
|
||||
movaps %xmm12,-0x48($key_)
|
||||
movaps %xmm13,-0x38($key_)
|
||||
movaps %xmm14,-0x28($key_)
|
||||
movaps %xmm15,-0x18($key_)
|
||||
.Lctr32_body:
|
||||
___
|
||||
$code.=<<___;
|
||||
lea -8(%rax),%rbp
|
||||
|
||||
# 8 16-byte words on top of stack are counter values
|
||||
# xor-ed with zero-round key
|
||||
|
@ -1692,26 +1691,26 @@ $code.=<<___ if (!$win64);
|
|||
pxor %xmm15,%xmm15
|
||||
___
|
||||
$code.=<<___ if ($win64);
|
||||
movaps -0xa0(%rbp),%xmm6
|
||||
movaps %xmm0,-0xa0(%rbp) # clear stack
|
||||
movaps -0x90(%rbp),%xmm7
|
||||
movaps %xmm0,-0x90(%rbp)
|
||||
movaps -0x80(%rbp),%xmm8
|
||||
movaps %xmm0,-0x80(%rbp)
|
||||
movaps -0x70(%rbp),%xmm9
|
||||
movaps %xmm0,-0x70(%rbp)
|
||||
movaps -0x60(%rbp),%xmm10
|
||||
movaps %xmm0,-0x60(%rbp)
|
||||
movaps -0x50(%rbp),%xmm11
|
||||
movaps %xmm0,-0x50(%rbp)
|
||||
movaps -0x40(%rbp),%xmm12
|
||||
movaps %xmm0,-0x40(%rbp)
|
||||
movaps -0x30(%rbp),%xmm13
|
||||
movaps %xmm0,-0x30(%rbp)
|
||||
movaps -0x20(%rbp),%xmm14
|
||||
movaps %xmm0,-0x20(%rbp)
|
||||
movaps -0x10(%rbp),%xmm15
|
||||
movaps %xmm0,-0x10(%rbp)
|
||||
movaps -0xa8($key_),%xmm6
|
||||
movaps %xmm0,-0xa8($key_) # clear stack
|
||||
movaps -0x98($key_),%xmm7
|
||||
movaps %xmm0,-0x98($key_)
|
||||
movaps -0x88($key_),%xmm8
|
||||
movaps %xmm0,-0x88($key_)
|
||||
movaps -0x78($key_),%xmm9
|
||||
movaps %xmm0,-0x78($key_)
|
||||
movaps -0x68($key_),%xmm10
|
||||
movaps %xmm0,-0x68($key_)
|
||||
movaps -0x58($key_),%xmm11
|
||||
movaps %xmm0,-0x58($key_)
|
||||
movaps -0x48($key_),%xmm12
|
||||
movaps %xmm0,-0x48($key_)
|
||||
movaps -0x38($key_),%xmm13
|
||||
movaps %xmm0,-0x38($key_)
|
||||
movaps -0x28($key_),%xmm14
|
||||
movaps %xmm0,-0x28($key_)
|
||||
movaps -0x18($key_),%xmm15
|
||||
movaps %xmm0,-0x18($key_)
|
||||
movaps %xmm0,0x00(%rsp)
|
||||
movaps %xmm0,0x10(%rsp)
|
||||
movaps %xmm0,0x20(%rsp)
|
||||
|
@ -1722,8 +1721,8 @@ $code.=<<___ if ($win64);
|
|||
movaps %xmm0,0x70(%rsp)
|
||||
___
|
||||
$code.=<<___;
|
||||
lea (%rbp),%rsp
|
||||
pop %rbp
|
||||
mov -8($key_),%rbp
|
||||
lea ($key_),%rsp
|
||||
.Lctr32_epilogue:
|
||||
ret
|
||||
.size aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks
|
||||
|
@ -1740,32 +1739,32 @@ my @tweak=map("%xmm$_",(10..15));
|
|||
my ($twmask,$twres,$twtmp)=("%xmm8","%xmm9",@tweak[4]);
|
||||
my ($key2,$ivp,$len_)=("%r8","%r9","%r9");
|
||||
my $frame_size = 0x70 + ($win64?160:0);
|
||||
my $key_ = "%rbp"; # override so that we can use %r11 as FP
|
||||
|
||||
$code.=<<___;
|
||||
.globl aesni_xts_encrypt
|
||||
.type aesni_xts_encrypt,\@function,6
|
||||
.align 16
|
||||
aesni_xts_encrypt:
|
||||
lea (%rsp),%rax
|
||||
lea (%rsp),%r11 # frame pointer
|
||||
push %rbp
|
||||
sub \$$frame_size,%rsp
|
||||
and \$-16,%rsp # Linux kernel stack can be incorrectly seeded
|
||||
___
|
||||
$code.=<<___ if ($win64);
|
||||
movaps %xmm6,-0xa8(%rax) # offload everything
|
||||
movaps %xmm7,-0x98(%rax)
|
||||
movaps %xmm8,-0x88(%rax)
|
||||
movaps %xmm9,-0x78(%rax)
|
||||
movaps %xmm10,-0x68(%rax)
|
||||
movaps %xmm11,-0x58(%rax)
|
||||
movaps %xmm12,-0x48(%rax)
|
||||
movaps %xmm13,-0x38(%rax)
|
||||
movaps %xmm14,-0x28(%rax)
|
||||
movaps %xmm15,-0x18(%rax)
|
||||
movaps %xmm6,-0xa8(%r11) # offload everything
|
||||
movaps %xmm7,-0x98(%r11)
|
||||
movaps %xmm8,-0x88(%r11)
|
||||
movaps %xmm9,-0x78(%r11)
|
||||
movaps %xmm10,-0x68(%r11)
|
||||
movaps %xmm11,-0x58(%r11)
|
||||
movaps %xmm12,-0x48(%r11)
|
||||
movaps %xmm13,-0x38(%r11)
|
||||
movaps %xmm14,-0x28(%r11)
|
||||
movaps %xmm15,-0x18(%r11)
|
||||
.Lxts_enc_body:
|
||||
___
|
||||
$code.=<<___;
|
||||
lea -8(%rax),%rbp
|
||||
movups ($ivp),$inout0 # load clear-text tweak
|
||||
mov 240(%r8),$rounds # key2->rounds
|
||||
mov 240($key),$rnds_ # key1->rounds
|
||||
|
@ -2183,26 +2182,26 @@ $code.=<<___ if (!$win64);
|
|||
pxor %xmm15,%xmm15
|
||||
___
|
||||
$code.=<<___ if ($win64);
|
||||
movaps -0xa0(%rbp),%xmm6
|
||||
movaps %xmm0,-0xa0(%rbp) # clear stack
|
||||
movaps -0x90(%rbp),%xmm7
|
||||
movaps %xmm0,-0x90(%rbp)
|
||||
movaps -0x80(%rbp),%xmm8
|
||||
movaps %xmm0,-0x80(%rbp)
|
||||
movaps -0x70(%rbp),%xmm9
|
||||
movaps %xmm0,-0x70(%rbp)
|
||||
movaps -0x60(%rbp),%xmm10
|
||||
movaps %xmm0,-0x60(%rbp)
|
||||
movaps -0x50(%rbp),%xmm11
|
||||
movaps %xmm0,-0x50(%rbp)
|
||||
movaps -0x40(%rbp),%xmm12
|
||||
movaps %xmm0,-0x40(%rbp)
|
||||
movaps -0x30(%rbp),%xmm13
|
||||
movaps %xmm0,-0x30(%rbp)
|
||||
movaps -0x20(%rbp),%xmm14
|
||||
movaps %xmm0,-0x20(%rbp)
|
||||
movaps -0x10(%rbp),%xmm15
|
||||
movaps %xmm0,-0x10(%rbp)
|
||||
movaps -0xa8(%r11),%xmm6
|
||||
movaps %xmm0,-0xa8(%r11) # clear stack
|
||||
movaps -0x98(%r11),%xmm7
|
||||
movaps %xmm0,-0x98(%r11)
|
||||
movaps -0x88(%r11),%xmm8
|
||||
movaps %xmm0,-0x88(%r11)
|
||||
movaps -0x78(%r11),%xmm9
|
||||
movaps %xmm0,-0x78(%r11)
|
||||
movaps -0x68(%r11),%xmm10
|
||||
movaps %xmm0,-0x68(%r11)
|
||||
movaps -0x58(%r11),%xmm11
|
||||
movaps %xmm0,-0x58(%r11)
|
||||
movaps -0x48(%r11),%xmm12
|
||||
movaps %xmm0,-0x48(%r11)
|
||||
movaps -0x38(%r11),%xmm13
|
||||
movaps %xmm0,-0x38(%r11)
|
||||
movaps -0x28(%r11),%xmm14
|
||||
movaps %xmm0,-0x28(%r11)
|
||||
movaps -0x18(%r11),%xmm15
|
||||
movaps %xmm0,-0x18(%r11)
|
||||
movaps %xmm0,0x00(%rsp)
|
||||
movaps %xmm0,0x10(%rsp)
|
||||
movaps %xmm0,0x20(%rsp)
|
||||
|
@ -2212,8 +2211,8 @@ $code.=<<___ if ($win64);
|
|||
movaps %xmm0,0x60(%rsp)
|
||||
___
|
||||
$code.=<<___;
|
||||
lea (%rbp),%rsp
|
||||
pop %rbp
|
||||
mov -8(%r11),%rbp
|
||||
lea (%r11),%rsp
|
||||
.Lxts_enc_epilogue:
|
||||
ret
|
||||
.size aesni_xts_encrypt,.-aesni_xts_encrypt
|
||||
|
@ -2224,26 +2223,25 @@ $code.=<<___;
|
|||
.type aesni_xts_decrypt,\@function,6
|
||||
.align 16
|
||||
aesni_xts_decrypt:
|
||||
lea (%rsp),%rax
|
||||
lea (%rsp),%r11 # frame pointer
|
||||
push %rbp
|
||||
sub \$$frame_size,%rsp
|
||||
and \$-16,%rsp # Linux kernel stack can be incorrectly seeded
|
||||
___
|
||||
$code.=<<___ if ($win64);
|
||||
movaps %xmm6,-0xa8(%rax) # offload everything
|
||||
movaps %xmm7,-0x98(%rax)
|
||||
movaps %xmm8,-0x88(%rax)
|
||||
movaps %xmm9,-0x78(%rax)
|
||||
movaps %xmm10,-0x68(%rax)
|
||||
movaps %xmm11,-0x58(%rax)
|
||||
movaps %xmm12,-0x48(%rax)
|
||||
movaps %xmm13,-0x38(%rax)
|
||||
movaps %xmm14,-0x28(%rax)
|
||||
movaps %xmm15,-0x18(%rax)
|
||||
movaps %xmm6,-0xa8(%r11) # offload everything
|
||||
movaps %xmm7,-0x98(%r11)
|
||||
movaps %xmm8,-0x88(%r11)
|
||||
movaps %xmm9,-0x78(%r11)
|
||||
movaps %xmm10,-0x68(%r11)
|
||||
movaps %xmm11,-0x58(%r11)
|
||||
movaps %xmm12,-0x48(%r11)
|
||||
movaps %xmm13,-0x38(%r11)
|
||||
movaps %xmm14,-0x28(%r11)
|
||||
movaps %xmm15,-0x18(%r11)
|
||||
.Lxts_dec_body:
|
||||
___
|
||||
$code.=<<___;
|
||||
lea -8(%rax),%rbp
|
||||
movups ($ivp),$inout0 # load clear-text tweak
|
||||
mov 240($key2),$rounds # key2->rounds
|
||||
mov 240($key),$rnds_ # key1->rounds
|
||||
|
@ -2687,26 +2685,26 @@ $code.=<<___ if (!$win64);
|
|||
pxor %xmm15,%xmm15
|
||||
___
|
||||
$code.=<<___ if ($win64);
|
||||
movaps -0xa0(%rbp),%xmm6
|
||||
movaps %xmm0,-0xa0(%rbp) # clear stack
|
||||
movaps -0x90(%rbp),%xmm7
|
||||
movaps %xmm0,-0x90(%rbp)
|
||||
movaps -0x80(%rbp),%xmm8
|
||||
movaps %xmm0,-0x80(%rbp)
|
||||
movaps -0x70(%rbp),%xmm9
|
||||
movaps %xmm0,-0x70(%rbp)
|
||||
movaps -0x60(%rbp),%xmm10
|
||||
movaps %xmm0,-0x60(%rbp)
|
||||
movaps -0x50(%rbp),%xmm11
|
||||
movaps %xmm0,-0x50(%rbp)
|
||||
movaps -0x40(%rbp),%xmm12
|
||||
movaps %xmm0,-0x40(%rbp)
|
||||
movaps -0x30(%rbp),%xmm13
|
||||
movaps %xmm0,-0x30(%rbp)
|
||||
movaps -0x20(%rbp),%xmm14
|
||||
movaps %xmm0,-0x20(%rbp)
|
||||
movaps -0x10(%rbp),%xmm15
|
||||
movaps %xmm0,-0x10(%rbp)
|
||||
movaps -0xa8(%r11),%xmm6
|
||||
movaps %xmm0,-0xa8(%r11) # clear stack
|
||||
movaps -0x98(%r11),%xmm7
|
||||
movaps %xmm0,-0x98(%r11)
|
||||
movaps -0x88(%r11),%xmm8
|
||||
movaps %xmm0,-0x88(%r11)
|
||||
movaps -0x78(%r11),%xmm9
|
||||
movaps %xmm0,-0x78(%r11)
|
||||
movaps -0x68(%r11),%xmm10
|
||||
movaps %xmm0,-0x68(%r11)
|
||||
movaps -0x58(%r11),%xmm11
|
||||
movaps %xmm0,-0x58(%r11)
|
||||
movaps -0x48(%r11),%xmm12
|
||||
movaps %xmm0,-0x48(%r11)
|
||||
movaps -0x38(%r11),%xmm13
|
||||
movaps %xmm0,-0x38(%r11)
|
||||
movaps -0x28(%r11),%xmm14
|
||||
movaps %xmm0,-0x28(%r11)
|
||||
movaps -0x18(%r11),%xmm15
|
||||
movaps %xmm0,-0x18(%r11)
|
||||
movaps %xmm0,0x00(%rsp)
|
||||
movaps %xmm0,0x10(%rsp)
|
||||
movaps %xmm0,0x20(%rsp)
|
||||
|
@ -2716,8 +2714,8 @@ $code.=<<___ if ($win64);
|
|||
movaps %xmm0,0x60(%rsp)
|
||||
___
|
||||
$code.=<<___;
|
||||
lea (%rbp),%rsp
|
||||
pop %rbp
|
||||
mov -8(%r11),%rbp
|
||||
lea (%r11),%rsp
|
||||
.Lxts_dec_epilogue:
|
||||
ret
|
||||
.size aesni_xts_decrypt,.-aesni_xts_decrypt
|
||||
|
@ -2943,6 +2941,7 @@ $code.=<<___ if (!$win64);
|
|||
pxor %xmm13,%xmm13
|
||||
pxor %xmm14,%xmm14
|
||||
pxor %xmm15,%xmm15
|
||||
lea 0x28(%rsp),%rax
|
||||
___
|
||||
$code.=<<___ if ($win64);
|
||||
movaps 0x00(%rsp),%xmm6
|
||||
|
@ -2967,14 +2966,14 @@ $code.=<<___ if ($win64);
|
|||
movaps %xmm0,0x90(%rsp)
|
||||
lea 0xa0+0x28(%rsp),%rax
|
||||
.Locb_enc_pop:
|
||||
lea 0xa0(%rsp),%rsp
|
||||
___
|
||||
$code.=<<___;
|
||||
pop %r14
|
||||
pop %r13
|
||||
pop %r12
|
||||
pop %rbp
|
||||
pop %rbx
|
||||
mov -40(%rax),%r14
|
||||
mov -32(%rax),%r13
|
||||
mov -24(%rax),%r12
|
||||
mov -16(%rax),%rbp
|
||||
mov -8(%rax),%rbx
|
||||
lea (%rax),%rsp
|
||||
.Locb_enc_epilogue:
|
||||
ret
|
||||
.size aesni_ocb_encrypt,.-aesni_ocb_encrypt
|
||||
|
@ -3410,6 +3409,7 @@ $code.=<<___ if (!$win64);
|
|||
pxor %xmm13,%xmm13
|
||||
pxor %xmm14,%xmm14
|
||||
pxor %xmm15,%xmm15
|
||||
lea 0x28(%rsp),%rax
|
||||
___
|
||||
$code.=<<___ if ($win64);
|
||||
movaps 0x00(%rsp),%xmm6
|
||||
|
@ -3434,14 +3434,14 @@ $code.=<<___ if ($win64);
|
|||
movaps %xmm0,0x90(%rsp)
|
||||
lea 0xa0+0x28(%rsp),%rax
|
||||
.Locb_dec_pop:
|
||||
lea 0xa0(%rsp),%rsp
|
||||
___
|
||||
$code.=<<___;
|
||||
pop %r14
|
||||
pop %r13
|
||||
pop %r12
|
||||
pop %rbp
|
||||
pop %rbx
|
||||
mov -40(%rax),%r14
|
||||
mov -32(%rax),%r13
|
||||
mov -24(%rax),%r12
|
||||
mov -16(%rax),%rbp
|
||||
mov -8(%rax),%rbx
|
||||
lea (%rax),%rsp
|
||||
.Locb_dec_epilogue:
|
||||
ret
|
||||
.size aesni_ocb_decrypt,.-aesni_ocb_decrypt
|
||||
|
@ -3650,7 +3650,6 @@ ___
|
|||
{
|
||||
my $frame_size = 0x10 + ($win64?0xa0:0); # used in decrypt
|
||||
my ($iv,$in0,$in1,$in2,$in3,$in4)=map("%xmm$_",(10..15));
|
||||
my $inp_=$key_;
|
||||
|
||||
$code.=<<___;
|
||||
.globl ${PREFIX}_cbc_encrypt
|
||||
|
@ -3732,7 +3731,7 @@ $code.=<<___;
|
|||
jmp .Lcbc_ret
|
||||
.align 16
|
||||
.Lcbc_decrypt_bulk:
|
||||
lea (%rsp),%rax
|
||||
lea (%rsp),%r11 # frame pointer
|
||||
push %rbp
|
||||
sub \$$frame_size,%rsp
|
||||
and \$-16,%rsp # Linux kernel stack can be incorrectly seeded
|
||||
|
@ -3750,8 +3749,11 @@ $code.=<<___ if ($win64);
|
|||
movaps %xmm15,0xa0(%rsp)
|
||||
.Lcbc_decrypt_body:
|
||||
___
|
||||
|
||||
my $inp_=$key_="%rbp"; # reassign $key_
|
||||
|
||||
$code.=<<___;
|
||||
lea -8(%rax),%rbp
|
||||
mov $key,$key_ # [re-]backup $key [after reassignment]
|
||||
movups ($ivp),$iv
|
||||
mov $rnds_,$rounds
|
||||
cmp \$0x50,$len
|
||||
|
@ -3791,7 +3793,7 @@ $code.=<<___;
|
|||
pxor $rndkey0,$inout1
|
||||
$movkey 0x10-0x70($key),$rndkey1
|
||||
pxor $rndkey0,$inout2
|
||||
xor $inp_,$inp_
|
||||
mov \$-1,$inp_
|
||||
cmp \$0x70,$len # is there at least 0x60 bytes ahead?
|
||||
pxor $rndkey0,$inout3
|
||||
pxor $rndkey0,$inout4
|
||||
|
@ -3807,8 +3809,8 @@ $code.=<<___;
|
|||
aesdec $rndkey1,$inout4
|
||||
aesdec $rndkey1,$inout5
|
||||
aesdec $rndkey1,$inout6
|
||||
setnc ${inp_}b
|
||||
shl \$7,$inp_
|
||||
adc \$0,$inp_
|
||||
and \$128,$inp_
|
||||
aesdec $rndkey1,$inout7
|
||||
add $inp,$inp_
|
||||
$movkey 0x30-0x70($key),$rndkey1
|
||||
|
@ -4172,8 +4174,8 @@ $code.=<<___ if ($win64);
|
|||
movaps %xmm0,0xa0(%rsp)
|
||||
___
|
||||
$code.=<<___;
|
||||
lea (%rbp),%rsp
|
||||
pop %rbp
|
||||
mov -8(%r11),%rbp
|
||||
lea (%r11),%rsp
|
||||
.Lcbc_ret:
|
||||
ret
|
||||
.size ${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt
|
||||
|
@ -4744,13 +4746,16 @@ ctr_xts_se_handler:
|
|||
cmp %r10,%rbx # context->Rip>=epilogue label
|
||||
jae .Lcommon_seh_tail
|
||||
|
||||
mov 160($context),%rax # pull context->Rbp
|
||||
lea -0xa0(%rax),%rsi # %xmm save area
|
||||
mov 208($context),%rax # pull context->R11
|
||||
|
||||
lea -0xa8(%rax),%rsi # %xmm save area
|
||||
lea 512($context),%rdi # & context.Xmm6
|
||||
mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
|
||||
.long 0xa548f3fc # cld; rep movsq
|
||||
|
||||
jmp .Lcommon_rbp_tail
|
||||
mov -8(%rax),%rbp # restore saved %rbp
|
||||
mov %rbp,160($context) # restore context->Rbp
|
||||
jmp .Lcommon_seh_tail
|
||||
.size ctr_xts_se_handler,.-ctr_xts_se_handler
|
||||
|
||||
.type ocb_se_handler,\@abi-omnipotent
|
||||
|
@ -4834,9 +4839,13 @@ cbc_se_handler:
|
|||
cmp %r10,%rbx # context->Rip<"prologue" label
|
||||
jb .Lcommon_seh_tail
|
||||
|
||||
mov 120($context),%rax # pull context->Rax
|
||||
|
||||
lea .Lcbc_decrypt_body(%rip),%r10
|
||||
cmp %r10,%rbx # context->Rip<cbc_decrypt_body
|
||||
jb .Lrestore_cbc_rax
|
||||
jb .Lcommon_seh_tail
|
||||
|
||||
mov 152($context),%rax # pull context->Rsp
|
||||
|
||||
lea .Lcbc_ret(%rip),%r10
|
||||
cmp %r10,%rbx # context->Rip>="epilogue" label
|
||||
|
@ -4847,15 +4856,10 @@ cbc_se_handler:
|
|||
mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
|
||||
.long 0xa548f3fc # cld; rep movsq
|
||||
|
||||
.Lcommon_rbp_tail:
|
||||
mov 160($context),%rax # pull context->Rbp
|
||||
mov (%rax),%rbp # restore saved %rbp
|
||||
lea 8(%rax),%rax # adjust stack pointer
|
||||
mov %rbp,160($context) # restore context->Rbp
|
||||
jmp .Lcommon_seh_tail
|
||||
mov 208($context),%rax # pull context->R11
|
||||
|
||||
.Lrestore_cbc_rax:
|
||||
mov 120($context),%rax
|
||||
mov -8(%rax),%rbp # restore saved %rbp
|
||||
mov %rbp,160($context) # restore context->Rbp
|
||||
|
||||
.Lcommon_seh_tail:
|
||||
mov 8(%rax),%rdi
|
||||
|
|
|
@ -1334,7 +1334,7 @@ $code.=<<___;
|
|||
cmp %rax, %rbp
|
||||
jb .Lecb_enc_bzero
|
||||
|
||||
lea (%rbp),%rsp # restore %rsp
|
||||
lea 0x78(%rbp),%rax
|
||||
___
|
||||
$code.=<<___ if ($win64);
|
||||
movaps 0x40(%rbp), %xmm6
|
||||
|
@ -1347,17 +1347,17 @@ $code.=<<___ if ($win64);
|
|||
movaps 0xb0(%rbp), %xmm13
|
||||
movaps 0xc0(%rbp), %xmm14
|
||||
movaps 0xd0(%rbp), %xmm15
|
||||
lea 0xa0(%rbp), %rsp
|
||||
lea 0xa0(%rax), %rax
|
||||
.Lecb_enc_tail:
|
||||
___
|
||||
$code.=<<___;
|
||||
mov 0x48(%rsp), %r15
|
||||
mov 0x50(%rsp), %r14
|
||||
mov 0x58(%rsp), %r13
|
||||
mov 0x60(%rsp), %r12
|
||||
mov 0x68(%rsp), %rbx
|
||||
mov 0x70(%rsp), %rax
|
||||
lea 0x78(%rsp), %rsp
|
||||
mov %rax, %rbp
|
||||
mov -48(%rax), %r15
|
||||
mov -40(%rax), %r14
|
||||
mov -32(%rax), %r13
|
||||
mov -24(%rax), %r12
|
||||
mov -16(%rax), %rbx
|
||||
mov -8(%rax), %rbp
|
||||
lea (%rax), %rsp # restore %rsp
|
||||
.Lecb_enc_epilogue:
|
||||
ret
|
||||
.size bsaes_ecb_encrypt_blocks,.-bsaes_ecb_encrypt_blocks
|
||||
|
@ -1536,7 +1536,7 @@ $code.=<<___;
|
|||
cmp %rax, %rbp
|
||||
jb .Lecb_dec_bzero
|
||||
|
||||
lea (%rbp),%rsp # restore %rsp
|
||||
lea 0x78(%rbp),%rax
|
||||
___
|
||||
$code.=<<___ if ($win64);
|
||||
movaps 0x40(%rbp), %xmm6
|
||||
|
@ -1549,17 +1549,17 @@ $code.=<<___ if ($win64);
|
|||
movaps 0xb0(%rbp), %xmm13
|
||||
movaps 0xc0(%rbp), %xmm14
|
||||
movaps 0xd0(%rbp), %xmm15
|
||||
lea 0xa0(%rbp), %rsp
|
||||
lea 0xa0(%rax), %rax
|
||||
.Lecb_dec_tail:
|
||||
___
|
||||
$code.=<<___;
|
||||
mov 0x48(%rsp), %r15
|
||||
mov 0x50(%rsp), %r14
|
||||
mov 0x58(%rsp), %r13
|
||||
mov 0x60(%rsp), %r12
|
||||
mov 0x68(%rsp), %rbx
|
||||
mov 0x70(%rsp), %rax
|
||||
lea 0x78(%rsp), %rsp
|
||||
mov %rax, %rbp
|
||||
mov -48(%rax), %r15
|
||||
mov -40(%rax), %r14
|
||||
mov -32(%rax), %r13
|
||||
mov -24(%rax), %r12
|
||||
mov -16(%rax), %rbx
|
||||
mov -8(%rax), %rbp
|
||||
lea (%rax), %rsp # restore %rsp
|
||||
.Lecb_dec_epilogue:
|
||||
ret
|
||||
.size bsaes_ecb_decrypt_blocks,.-bsaes_ecb_decrypt_blocks
|
||||
|
@ -1826,7 +1826,7 @@ $code.=<<___;
|
|||
cmp %rax, %rbp
|
||||
ja .Lcbc_dec_bzero
|
||||
|
||||
lea (%rbp),%rsp # restore %rsp
|
||||
lea 0x78(%rbp),%rax
|
||||
___
|
||||
$code.=<<___ if ($win64);
|
||||
movaps 0x40(%rbp), %xmm6
|
||||
|
@ -1839,17 +1839,17 @@ $code.=<<___ if ($win64);
|
|||
movaps 0xb0(%rbp), %xmm13
|
||||
movaps 0xc0(%rbp), %xmm14
|
||||
movaps 0xd0(%rbp), %xmm15
|
||||
lea 0xa0(%rbp), %rsp
|
||||
lea 0xa0(%rax), %rax
|
||||
.Lcbc_dec_tail:
|
||||
___
|
||||
$code.=<<___;
|
||||
mov 0x48(%rsp), %r15
|
||||
mov 0x50(%rsp), %r14
|
||||
mov 0x58(%rsp), %r13
|
||||
mov 0x60(%rsp), %r12
|
||||
mov 0x68(%rsp), %rbx
|
||||
mov 0x70(%rsp), %rax
|
||||
lea 0x78(%rsp), %rsp
|
||||
mov %rax, %rbp
|
||||
mov -48(%rax), %r15
|
||||
mov -40(%rax), %r14
|
||||
mov -32(%rax), %r13
|
||||
mov -24(%rax), %r12
|
||||
mov -16(%rax), %rbx
|
||||
mov -8(%rax), %rbp
|
||||
lea (%rax), %rsp # restore %rsp
|
||||
.Lcbc_dec_epilogue:
|
||||
ret
|
||||
.size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
|
||||
|
@ -2058,7 +2058,7 @@ $code.=<<___;
|
|||
cmp %rax, %rbp
|
||||
ja .Lctr_enc_bzero
|
||||
|
||||
lea (%rbp),%rsp # restore %rsp
|
||||
lea 0x78(%rbp),%rax
|
||||
___
|
||||
$code.=<<___ if ($win64);
|
||||
movaps 0x40(%rbp), %xmm6
|
||||
|
@ -2071,17 +2071,17 @@ $code.=<<___ if ($win64);
|
|||
movaps 0xb0(%rbp), %xmm13
|
||||
movaps 0xc0(%rbp), %xmm14
|
||||
movaps 0xd0(%rbp), %xmm15
|
||||
lea 0xa0(%rbp), %rsp
|
||||
lea 0xa0(%rax), %rax
|
||||
.Lctr_enc_tail:
|
||||
___
|
||||
$code.=<<___;
|
||||
mov 0x48(%rsp), %r15
|
||||
mov 0x50(%rsp), %r14
|
||||
mov 0x58(%rsp), %r13
|
||||
mov 0x60(%rsp), %r12
|
||||
mov 0x68(%rsp), %rbx
|
||||
mov 0x70(%rsp), %rax
|
||||
lea 0x78(%rsp), %rsp
|
||||
mov %rax, %rbp
|
||||
mov -48(%rax), %r15
|
||||
mov -40(%rax), %r14
|
||||
mov -32(%rax), %r13
|
||||
mov -24(%rax), %r12
|
||||
mov -16(%rax), %rbx
|
||||
mov -8(%rax), %rbp
|
||||
lea (%rax), %rsp # restore %rsp
|
||||
.Lctr_enc_epilogue:
|
||||
ret
|
||||
.size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
|
||||
|
@ -2448,7 +2448,7 @@ $code.=<<___;
|
|||
cmp %rax, %rbp
|
||||
ja .Lxts_enc_bzero
|
||||
|
||||
lea (%rbp),%rsp # restore %rsp
|
||||
lea 0x78(%rbp),%rax
|
||||
___
|
||||
$code.=<<___ if ($win64);
|
||||
movaps 0x40(%rbp), %xmm6
|
||||
|
@ -2461,17 +2461,17 @@ $code.=<<___ if ($win64);
|
|||
movaps 0xb0(%rbp), %xmm13
|
||||
movaps 0xc0(%rbp), %xmm14
|
||||
movaps 0xd0(%rbp), %xmm15
|
||||
lea 0xa0(%rbp), %rsp
|
||||
lea 0xa0(%rax), %rax
|
||||
.Lxts_enc_tail:
|
||||
___
|
||||
$code.=<<___;
|
||||
mov 0x48(%rsp), %r15
|
||||
mov 0x50(%rsp), %r14
|
||||
mov 0x58(%rsp), %r13
|
||||
mov 0x60(%rsp), %r12
|
||||
mov 0x68(%rsp), %rbx
|
||||
mov 0x70(%rsp), %rax
|
||||
lea 0x78(%rsp), %rsp
|
||||
mov %rax, %rbp
|
||||
mov -48(%rax), %r15
|
||||
mov -40(%rax), %r14
|
||||
mov -32(%rax), %r13
|
||||
mov -24(%rax), %r12
|
||||
mov -16(%rax), %rbx
|
||||
mov -8(%rax), %rbp
|
||||
lea (%rax), %rsp # restore %rsp
|
||||
.Lxts_enc_epilogue:
|
||||
ret
|
||||
.size bsaes_xts_encrypt,.-bsaes_xts_encrypt
|
||||
|
@ -2855,7 +2855,7 @@ $code.=<<___;
|
|||
cmp %rax, %rbp
|
||||
ja .Lxts_dec_bzero
|
||||
|
||||
lea (%rbp),%rsp # restore %rsp
|
||||
lea 0x78(%rbp),%rax
|
||||
___
|
||||
$code.=<<___ if ($win64);
|
||||
movaps 0x40(%rbp), %xmm6
|
||||
|
@ -2868,17 +2868,17 @@ $code.=<<___ if ($win64);
|
|||
movaps 0xb0(%rbp), %xmm13
|
||||
movaps 0xc0(%rbp), %xmm14
|
||||
movaps 0xd0(%rbp), %xmm15
|
||||
lea 0xa0(%rbp), %rsp
|
||||
lea 0xa0(%rax), %rax
|
||||
.Lxts_dec_tail:
|
||||
___
|
||||
$code.=<<___;
|
||||
mov 0x48(%rsp), %r15
|
||||
mov 0x50(%rsp), %r14
|
||||
mov 0x58(%rsp), %r13
|
||||
mov 0x60(%rsp), %r12
|
||||
mov 0x68(%rsp), %rbx
|
||||
mov 0x70(%rsp), %rax
|
||||
lea 0x78(%rsp), %rsp
|
||||
mov %rax, %rbp
|
||||
mov -48(%rax), %r15
|
||||
mov -40(%rax), %r14
|
||||
mov -32(%rax), %r13
|
||||
mov -24(%rax), %r12
|
||||
mov -16(%rax), %rbx
|
||||
mov -8(%rax), %rbp
|
||||
lea (%rax), %rsp # restore %rsp
|
||||
.Lxts_dec_epilogue:
|
||||
ret
|
||||
.size bsaes_xts_decrypt,.-bsaes_xts_decrypt
|
||||
|
@ -2974,31 +2974,34 @@ se_handler:
|
|||
|
||||
mov 0(%r11),%r10d # HandlerData[0]
|
||||
lea (%rsi,%r10),%r10 # prologue label
|
||||
cmp %r10,%rbx # context->Rip<prologue label
|
||||
jb .Lin_prologue
|
||||
|
||||
mov 152($context),%rax # pull context->Rsp
|
||||
cmp %r10,%rbx # context->Rip<=prologue label
|
||||
jbe .Lin_prologue
|
||||
|
||||
mov 4(%r11),%r10d # HandlerData[1]
|
||||
lea (%rsi,%r10),%r10 # epilogue label
|
||||
cmp %r10,%rbx # context->Rip>=epilogue label
|
||||
jae .Lin_prologue
|
||||
|
||||
mov 8(%r11),%r10d # HandlerData[2]
|
||||
lea (%rsi,%r10),%r10 # tail label
|
||||
cmp %r10,%rbx # context->Rip>=tail label
|
||||
jae .Lin_tail
|
||||
|
||||
mov 160($context),%rax # pull context->Rbp
|
||||
|
||||
lea 0x40(%rax),%rsi # %xmm save area
|
||||
lea 512($context),%rdi # &context.Xmm6
|
||||
mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
|
||||
.long 0xa548f3fc # cld; rep movsq
|
||||
lea 0xa0(%rax),%rax # adjust stack pointer
|
||||
lea 0xa0+0x78(%rax),%rax # adjust stack pointer
|
||||
|
||||
mov 0x70(%rax),%rbp
|
||||
mov 0x68(%rax),%rbx
|
||||
mov 0x60(%rax),%r12
|
||||
mov 0x58(%rax),%r13
|
||||
mov 0x50(%rax),%r14
|
||||
mov 0x48(%rax),%r15
|
||||
lea 0x78(%rax),%rax # adjust stack pointer
|
||||
.Lin_tail:
|
||||
mov -48(%rax),%rbp
|
||||
mov -40(%rax),%rbx
|
||||
mov -32(%rax),%r12
|
||||
mov -24(%rax),%r13
|
||||
mov -16(%rax),%r14
|
||||
mov -8(%rax),%r15
|
||||
mov %rbx,144($context) # restore context->Rbx
|
||||
mov %rbp,160($context) # restore context->Rbp
|
||||
mov %r12,216($context) # restore context->R12
|
||||
|
@ -3079,28 +3082,40 @@ $code.=<<___ if ($ecb);
|
|||
.byte 9,0,0,0
|
||||
.rva se_handler
|
||||
.rva .Lecb_enc_body,.Lecb_enc_epilogue # HandlerData[]
|
||||
.rva .Lecb_enc_tail
|
||||
.long 0
|
||||
.Lecb_dec_info:
|
||||
.byte 9,0,0,0
|
||||
.rva se_handler
|
||||
.rva .Lecb_dec_body,.Lecb_dec_epilogue # HandlerData[]
|
||||
.rva .Lecb_dec_tail
|
||||
.long 0
|
||||
___
|
||||
$code.=<<___;
|
||||
.Lcbc_dec_info:
|
||||
.byte 9,0,0,0
|
||||
.rva se_handler
|
||||
.rva .Lcbc_dec_body,.Lcbc_dec_epilogue # HandlerData[]
|
||||
.rva .Lcbc_dec_tail
|
||||
.long 0
|
||||
.Lctr_enc_info:
|
||||
.byte 9,0,0,0
|
||||
.rva se_handler
|
||||
.rva .Lctr_enc_body,.Lctr_enc_epilogue # HandlerData[]
|
||||
.rva .Lctr_enc_tail
|
||||
.long 0
|
||||
.Lxts_enc_info:
|
||||
.byte 9,0,0,0
|
||||
.rva se_handler
|
||||
.rva .Lxts_enc_body,.Lxts_enc_epilogue # HandlerData[]
|
||||
.rva .Lxts_enc_tail
|
||||
.long 0
|
||||
.Lxts_dec_info:
|
||||
.byte 9,0,0,0
|
||||
.rva se_handler
|
||||
.rva .Lxts_dec_body,.Lxts_dec_epilogue # HandlerData[]
|
||||
.rva .Lxts_dec_tail
|
||||
.long 0
|
||||
___
|
||||
}
|
||||
|
||||
|
|
|
@ -1738,11 +1738,11 @@ $code.=<<___ if ($win64);
|
|||
movaps -0x38(%r11),%xmm13
|
||||
movaps -0x28(%r11),%xmm14
|
||||
movaps -0x18(%r11),%xmm15
|
||||
.LSEH_end_rsaz_1024_gather5:
|
||||
___
|
||||
$code.=<<___;
|
||||
lea (%r11),%rsp
|
||||
ret
|
||||
.LSEH_end_rsaz_1024_gather5:
|
||||
.size rsaz_1024_gather5_avx2,.-rsaz_1024_gather5_avx2
|
||||
___
|
||||
}
|
||||
|
|
|
@ -174,8 +174,9 @@ $code.=<<___;
|
|||
.type bn_GF2m_mul_2x2,\@abi-omnipotent
|
||||
.align 16
|
||||
bn_GF2m_mul_2x2:
|
||||
mov OPENSSL_ia32cap_P(%rip),%rax
|
||||
bt \$33,%rax
|
||||
mov %rsp,%rax
|
||||
mov OPENSSL_ia32cap_P(%rip),%r10
|
||||
bt \$33,%r10
|
||||
jnc .Lvanilla_mul_2x2
|
||||
|
||||
movq $a1,%xmm0
|
||||
|
@ -280,6 +281,7 @@ $code.=<<___ if ($win64);
|
|||
___
|
||||
$code.=<<___;
|
||||
lea 8*17(%rsp),%rsp
|
||||
.Lepilogue_mul_2x2:
|
||||
ret
|
||||
.Lend_mul_2x2:
|
||||
.size bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
|
||||
|
@ -312,13 +314,19 @@ se_handler:
|
|||
pushfq
|
||||
sub \$64,%rsp
|
||||
|
||||
mov 152($context),%rax # pull context->Rsp
|
||||
mov 120($context),%rax # pull context->Rax
|
||||
mov 248($context),%rbx # pull context->Rip
|
||||
|
||||
lea .Lbody_mul_2x2(%rip),%r10
|
||||
cmp %r10,%rbx # context->Rip<"prologue" label
|
||||
jb .Lin_prologue
|
||||
|
||||
mov 152($context),%rax # pull context->Rsp
|
||||
|
||||
lea .Lepilogue_mul_2x2(%rip),%r10
|
||||
cmp %r10,%rbx # context->Rip>="epilogue" label
|
||||
jae .Lin_prologue
|
||||
|
||||
mov 8*10(%rax),%r14 # mimic epilogue
|
||||
mov 8*11(%rax),%r13
|
||||
mov 8*12(%rax),%r12
|
||||
|
@ -335,8 +343,9 @@ se_handler:
|
|||
mov %r13,224($context) # restore context->R13
|
||||
mov %r14,232($context) # restore context->R14
|
||||
|
||||
.Lin_prologue:
|
||||
lea 8*17(%rax),%rax
|
||||
|
||||
.Lin_prologue:
|
||||
mov %rax,152($context) # restore context->Rsp
|
||||
|
||||
mov 40($disp),%rdi # disp->ContextRecord
|
||||
|
|
|
@ -695,10 +695,11 @@ ___
|
|||
my @ri=("%rax","%rdx",$m0,$m1);
|
||||
$code.=<<___;
|
||||
mov 16(%rsp,$num,8),$rp # restore $rp
|
||||
lea -4($num),$j
|
||||
mov 0(%rsp),@ri[0] # tp[0]
|
||||
pxor %xmm0,%xmm0
|
||||
mov 8(%rsp),@ri[1] # tp[1]
|
||||
shr \$2,$num # num/=4
|
||||
shr \$2,$j # j=num/4-1
|
||||
lea (%rsp),$ap # borrow ap for tp
|
||||
xor $i,$i # i=0 and clear CF!
|
||||
|
||||
|
@ -706,7 +707,6 @@ $code.=<<___;
|
|||
mov 16($ap),@ri[2] # tp[2]
|
||||
mov 24($ap),@ri[3] # tp[3]
|
||||
sbb 8($np),@ri[1]
|
||||
lea -1($num),$j # j=num/4-1
|
||||
jmp .Lsub4x
|
||||
.align 16
|
||||
.Lsub4x:
|
||||
|
@ -740,8 +740,9 @@ $code.=<<___;
|
|||
not @ri[0]
|
||||
mov $rp,$np
|
||||
and @ri[0],$np
|
||||
lea -1($num),$j
|
||||
lea -4($num),$j
|
||||
or $np,$ap # ap=borrow?tp:rp
|
||||
shr \$2,$j # j=num/4-1
|
||||
|
||||
movdqu ($ap),%xmm1
|
||||
movdqa %xmm0,(%rsp)
|
||||
|
@ -759,7 +760,6 @@ $code.=<<___;
|
|||
dec $j
|
||||
jnz .Lcopy4x
|
||||
|
||||
shl \$2,$num
|
||||
movdqu 16($ap,$i),%xmm2
|
||||
movdqa %xmm0,16(%rsp,$i)
|
||||
movdqu %xmm2,16($rp,$i)
|
||||
|
@ -1401,12 +1401,12 @@ sqr_handler:
|
|||
|
||||
mov 0(%r11),%r10d # HandlerData[0]
|
||||
lea (%rsi,%r10),%r10 # end of prologue label
|
||||
cmp %r10,%rbx # context->Rip<.Lsqr_body
|
||||
cmp %r10,%rbx # context->Rip<.Lsqr_prologue
|
||||
jb .Lcommon_seh_tail
|
||||
|
||||
mov 4(%r11),%r10d # HandlerData[1]
|
||||
lea (%rsi,%r10),%r10 # body label
|
||||
cmp %r10,%rbx # context->Rip>=.Lsqr_epilogue
|
||||
cmp %r10,%rbx # context->Rip<.Lsqr_body
|
||||
jb .Lcommon_pop_regs
|
||||
|
||||
mov 152($context),%rax # pull context->Rsp
|
||||
|
|
|
@ -3669,8 +3669,8 @@ mul_handler:
|
|||
jb .Lcommon_seh_tail
|
||||
|
||||
mov 4(%r11),%r10d # HandlerData[1]
|
||||
lea (%rsi,%r10),%r10 # epilogue label
|
||||
cmp %r10,%rbx # context->Rip>=epilogue label
|
||||
lea (%rsi,%r10),%r10 # beginning of body label
|
||||
cmp %r10,%rbx # context->Rip<body label
|
||||
jb .Lcommon_pop_regs
|
||||
|
||||
mov 152($context),%rax # pull context->Rsp
|
||||
|
|
|
@ -261,6 +261,7 @@ $code.=<<___;
|
|||
push %r14
|
||||
push %r15
|
||||
sub \$64+24,%rsp
|
||||
.Lctr32_body:
|
||||
|
||||
#movdqa .Lsigma(%rip),%xmm0
|
||||
movdqu ($key),%xmm1
|
||||
|
@ -399,13 +400,14 @@ $code.=<<___;
|
|||
jnz .Loop_tail
|
||||
|
||||
.Ldone:
|
||||
add \$64+24,%rsp
|
||||
pop %r15
|
||||
pop %r14
|
||||
pop %r13
|
||||
pop %r12
|
||||
pop %rbp
|
||||
pop %rbx
|
||||
lea 64+24+48(%rsp),%rsi
|
||||
mov -48(%rsi),%r15
|
||||
mov -40(%rsi),%r14
|
||||
mov -32(%rsi),%r13
|
||||
mov -24(%rsi),%r12
|
||||
mov -16(%rsi),%rbp
|
||||
mov -8(%rsi),%rbx
|
||||
lea (%rsi),%rsp
|
||||
.Lno_data:
|
||||
ret
|
||||
.size ChaCha20_ctr32,.-ChaCha20_ctr32
|
||||
|
@ -440,13 +442,14 @@ sub SSSE3ROUND { # critical path is 20 "SIMD ticks" per round
|
|||
&por ($b,$t);
|
||||
}
|
||||
|
||||
my $xframe = $win64 ? 32+32+8 : 24;
|
||||
my $xframe = $win64 ? 32+8 : 8;
|
||||
|
||||
$code.=<<___;
|
||||
.type ChaCha20_ssse3,\@function,5
|
||||
.align 32
|
||||
ChaCha20_ssse3:
|
||||
.LChaCha20_ssse3:
|
||||
mov %rsp,%r9 # frame pointer
|
||||
___
|
||||
$code.=<<___ if ($avx);
|
||||
test \$`1<<(43-32)`,%r10d
|
||||
|
@ -457,18 +460,12 @@ $code.=<<___;
|
|||
ja .LChaCha20_4x # but overall it won't be slower
|
||||
|
||||
.Ldo_sse3_after_all:
|
||||
push %rbx # just to share SEH handler, no pops
|
||||
push %rbp
|
||||
push %r12
|
||||
push %r13
|
||||
push %r14
|
||||
push %r15
|
||||
|
||||
sub \$64+$xframe,%rsp
|
||||
___
|
||||
$code.=<<___ if ($win64);
|
||||
movaps %xmm6,64+32(%rsp)
|
||||
movaps %xmm7,64+48(%rsp)
|
||||
movaps %xmm6,-0x28(%r9)
|
||||
movaps %xmm7,-0x18(%r9)
|
||||
.Lssse3_body:
|
||||
___
|
||||
$code.=<<___;
|
||||
movdqa .Lsigma(%rip),$a
|
||||
|
@ -563,11 +560,12 @@ $code.=<<___;
|
|||
.Ldone_ssse3:
|
||||
___
|
||||
$code.=<<___ if ($win64);
|
||||
movaps 64+32(%rsp),%xmm6
|
||||
movaps 64+48(%rsp),%xmm7
|
||||
movaps -0x28(%r9),%xmm6
|
||||
movaps -0x18(%r9),%xmm7
|
||||
___
|
||||
$code.=<<___;
|
||||
add \$64+$xframe+48,%rsp
|
||||
lea (%r9),%rsp
|
||||
.Lssse3_epilogue:
|
||||
ret
|
||||
.size ChaCha20_ssse3,.-ChaCha20_ssse3
|
||||
___
|
||||
|
@ -704,13 +702,14 @@ my @x=map("\"$_\"",@xx);
|
|||
);
|
||||
}
|
||||
|
||||
my $xframe = $win64 ? 0xa0 : 0;
|
||||
my $xframe = $win64 ? 0xa8 : 8;
|
||||
|
||||
$code.=<<___;
|
||||
.type ChaCha20_4x,\@function,5
|
||||
.align 32
|
||||
ChaCha20_4x:
|
||||
.LChaCha20_4x:
|
||||
mov %rsp,%r9 # frame pointer
|
||||
mov %r10,%r11
|
||||
___
|
||||
$code.=<<___ if ($avx>1);
|
||||
|
@ -727,8 +726,7 @@ $code.=<<___;
|
|||
je .Ldo_sse3_after_all # to detect Atom
|
||||
|
||||
.Lproceed4x:
|
||||
lea -0x78(%rsp),%r11
|
||||
sub \$0x148+$xframe,%rsp
|
||||
sub \$0x140+$xframe,%rsp
|
||||
___
|
||||
################ stack layout
|
||||
# +0x00 SIMD equivalent of @x[8-12]
|
||||
|
@ -739,16 +737,17 @@ ___
|
|||
# ...
|
||||
# +0x140
|
||||
$code.=<<___ if ($win64);
|
||||
movaps %xmm6,-0x30(%r11)
|
||||
movaps %xmm7,-0x20(%r11)
|
||||
movaps %xmm8,-0x10(%r11)
|
||||
movaps %xmm9,0x00(%r11)
|
||||
movaps %xmm10,0x10(%r11)
|
||||
movaps %xmm11,0x20(%r11)
|
||||
movaps %xmm12,0x30(%r11)
|
||||
movaps %xmm13,0x40(%r11)
|
||||
movaps %xmm14,0x50(%r11)
|
||||
movaps %xmm15,0x60(%r11)
|
||||
movaps %xmm6,-0xa8(%r9)
|
||||
movaps %xmm7,-0x98(%r9)
|
||||
movaps %xmm8,-0x88(%r9)
|
||||
movaps %xmm9,-0x78(%r9)
|
||||
movaps %xmm10,-0x68(%r9)
|
||||
movaps %xmm11,-0x58(%r9)
|
||||
movaps %xmm12,-0x48(%r9)
|
||||
movaps %xmm13,-0x38(%r9)
|
||||
movaps %xmm14,-0x28(%r9)
|
||||
movaps %xmm15,-0x18(%r9)
|
||||
.L4x_body:
|
||||
___
|
||||
$code.=<<___;
|
||||
movdqa .Lsigma(%rip),$xa3 # key[0]
|
||||
|
@ -1137,20 +1136,20 @@ $code.=<<___;
|
|||
.Ldone4x:
|
||||
___
|
||||
$code.=<<___ if ($win64);
|
||||
lea 0x140+0x30(%rsp),%r11
|
||||
movaps -0x30(%r11),%xmm6
|
||||
movaps -0x20(%r11),%xmm7
|
||||
movaps -0x10(%r11),%xmm8
|
||||
movaps 0x00(%r11),%xmm9
|
||||
movaps 0x10(%r11),%xmm10
|
||||
movaps 0x20(%r11),%xmm11
|
||||
movaps 0x30(%r11),%xmm12
|
||||
movaps 0x40(%r11),%xmm13
|
||||
movaps 0x50(%r11),%xmm14
|
||||
movaps 0x60(%r11),%xmm15
|
||||
movaps -0xa8(%r9),%xmm6
|
||||
movaps -0x98(%r9),%xmm7
|
||||
movaps -0x88(%r9),%xmm8
|
||||
movaps -0x78(%r9),%xmm9
|
||||
movaps -0x68(%r9),%xmm10
|
||||
movaps -0x58(%r9),%xmm11
|
||||
movaps -0x48(%r9),%xmm12
|
||||
movaps -0x38(%r9),%xmm13
|
||||
movaps -0x28(%r9),%xmm14
|
||||
movaps -0x18(%r9),%xmm15
|
||||
___
|
||||
$code.=<<___;
|
||||
add \$0x148+$xframe,%rsp
|
||||
lea (%r9),%rsp
|
||||
.L4x_epilogue:
|
||||
ret
|
||||
.size ChaCha20_4x,.-ChaCha20_4x
|
||||
___
|
||||
|
@ -1232,15 +1231,15 @@ my @x=map("\"$_\"",@xx);
|
|||
);
|
||||
}
|
||||
|
||||
my $xframe = $win64 ? 0xa0 : 0;
|
||||
my $xframe = $win64 ? 0xa8 : 8;
|
||||
|
||||
$code.=<<___;
|
||||
.type ChaCha20_4xop,\@function,5
|
||||
.align 32
|
||||
ChaCha20_4xop:
|
||||
.LChaCha20_4xop:
|
||||
lea -0x78(%rsp),%r11
|
||||
sub \$0x148+$xframe,%rsp
|
||||
mov %rsp,%r9 # frame pointer
|
||||
sub \$0x140+$xframe,%rsp
|
||||
___
|
||||
################ stack layout
|
||||
# +0x00 SIMD equivalent of @x[8-12]
|
||||
|
@ -1251,16 +1250,17 @@ ___
|
|||
# ...
|
||||
# +0x140
|
||||
$code.=<<___ if ($win64);
|
||||
movaps %xmm6,-0x30(%r11)
|
||||
movaps %xmm7,-0x20(%r11)
|
||||
movaps %xmm8,-0x10(%r11)
|
||||
movaps %xmm9,0x00(%r11)
|
||||
movaps %xmm10,0x10(%r11)
|
||||
movaps %xmm11,0x20(%r11)
|
||||
movaps %xmm12,0x30(%r11)
|
||||
movaps %xmm13,0x40(%r11)
|
||||
movaps %xmm14,0x50(%r11)
|
||||
movaps %xmm15,0x60(%r11)
|
||||
movaps %xmm6,-0xa8(%r9)
|
||||
movaps %xmm7,-0x98(%r9)
|
||||
movaps %xmm8,-0x88(%r9)
|
||||
movaps %xmm9,-0x78(%r9)
|
||||
movaps %xmm10,-0x68(%r9)
|
||||
movaps %xmm11,-0x58(%r9)
|
||||
movaps %xmm12,-0x48(%r9)
|
||||
movaps %xmm13,-0x38(%r9)
|
||||
movaps %xmm14,-0x28(%r9)
|
||||
movaps %xmm15,-0x18(%r9)
|
||||
.L4xop_body:
|
||||
___
|
||||
$code.=<<___;
|
||||
vzeroupper
|
||||
|
@ -1588,20 +1588,20 @@ $code.=<<___;
|
|||
vzeroupper
|
||||
___
|
||||
$code.=<<___ if ($win64);
|
||||
lea 0x140+0x30(%rsp),%r11
|
||||
movaps -0x30(%r11),%xmm6
|
||||
movaps -0x20(%r11),%xmm7
|
||||
movaps -0x10(%r11),%xmm8
|
||||
movaps 0x00(%r11),%xmm9
|
||||
movaps 0x10(%r11),%xmm10
|
||||
movaps 0x20(%r11),%xmm11
|
||||
movaps 0x30(%r11),%xmm12
|
||||
movaps 0x40(%r11),%xmm13
|
||||
movaps 0x50(%r11),%xmm14
|
||||
movaps 0x60(%r11),%xmm15
|
||||
movaps -0xa8(%r9),%xmm6
|
||||
movaps -0x98(%r9),%xmm7
|
||||
movaps -0x88(%r9),%xmm8
|
||||
movaps -0x78(%r9),%xmm9
|
||||
movaps -0x68(%r9),%xmm10
|
||||
movaps -0x58(%r9),%xmm11
|
||||
movaps -0x48(%r9),%xmm12
|
||||
movaps -0x38(%r9),%xmm13
|
||||
movaps -0x28(%r9),%xmm14
|
||||
movaps -0x18(%r9),%xmm15
|
||||
___
|
||||
$code.=<<___;
|
||||
add \$0x148+$xframe,%rsp
|
||||
lea (%r9),%rsp
|
||||
.L4xop_epilogue:
|
||||
ret
|
||||
.size ChaCha20_4xop,.-ChaCha20_4xop
|
||||
___
|
||||
|
@ -1729,33 +1729,32 @@ my @x=map("\"$_\"",@xx);
|
|||
);
|
||||
}
|
||||
|
||||
my $xframe = $win64 ? 0xb0 : 8;
|
||||
my $xframe = $win64 ? 0xa8 : 8;
|
||||
|
||||
$code.=<<___;
|
||||
.type ChaCha20_8x,\@function,5
|
||||
.align 32
|
||||
ChaCha20_8x:
|
||||
.LChaCha20_8x:
|
||||
mov %rsp,%r10
|
||||
mov %rsp,%r9 # frame register
|
||||
sub \$0x280+$xframe,%rsp
|
||||
and \$-32,%rsp
|
||||
___
|
||||
$code.=<<___ if ($win64);
|
||||
lea 0x290+0x30(%rsp),%r11
|
||||
movaps %xmm6,-0x30(%r11)
|
||||
movaps %xmm7,-0x20(%r11)
|
||||
movaps %xmm8,-0x10(%r11)
|
||||
movaps %xmm9,0x00(%r11)
|
||||
movaps %xmm10,0x10(%r11)
|
||||
movaps %xmm11,0x20(%r11)
|
||||
movaps %xmm12,0x30(%r11)
|
||||
movaps %xmm13,0x40(%r11)
|
||||
movaps %xmm14,0x50(%r11)
|
||||
movaps %xmm15,0x60(%r11)
|
||||
movaps %xmm6,-0xa8(%r9)
|
||||
movaps %xmm7,-0x98(%r9)
|
||||
movaps %xmm8,-0x88(%r9)
|
||||
movaps %xmm9,-0x78(%r9)
|
||||
movaps %xmm10,-0x68(%r9)
|
||||
movaps %xmm11,-0x58(%r9)
|
||||
movaps %xmm12,-0x48(%r9)
|
||||
movaps %xmm13,-0x38(%r9)
|
||||
movaps %xmm14,-0x28(%r9)
|
||||
movaps %xmm15,-0x18(%r9)
|
||||
.L8x_body:
|
||||
___
|
||||
$code.=<<___;
|
||||
vzeroupper
|
||||
mov %r10,0x280(%rsp)
|
||||
|
||||
################ stack layout
|
||||
# +0x00 SIMD equivalent of @x[8-12]
|
||||
|
@ -1764,7 +1763,7 @@ $code.=<<___;
|
|||
# ...
|
||||
# +0x200 SIMD counters (with nonce smashed by lanes)
|
||||
# ...
|
||||
# +0x280 saved %rsp
|
||||
# +0x280
|
||||
|
||||
vbroadcasti128 .Lsigma(%rip),$xa3 # key[0]
|
||||
vbroadcasti128 ($key),$xb3 # key[1]
|
||||
|
@ -2230,20 +2229,20 @@ $code.=<<___;
|
|||
vzeroall
|
||||
___
|
||||
$code.=<<___ if ($win64);
|
||||
lea 0x290+0x30(%rsp),%r11
|
||||
movaps -0x30(%r11),%xmm6
|
||||
movaps -0x20(%r11),%xmm7
|
||||
movaps -0x10(%r11),%xmm8
|
||||
movaps 0x00(%r11),%xmm9
|
||||
movaps 0x10(%r11),%xmm10
|
||||
movaps 0x20(%r11),%xmm11
|
||||
movaps 0x30(%r11),%xmm12
|
||||
movaps 0x40(%r11),%xmm13
|
||||
movaps 0x50(%r11),%xmm14
|
||||
movaps 0x60(%r11),%xmm15
|
||||
movaps -0xa8(%r9),%xmm6
|
||||
movaps -0x98(%r9),%xmm7
|
||||
movaps -0x88(%r9),%xmm8
|
||||
movaps -0x78(%r9),%xmm9
|
||||
movaps -0x68(%r9),%xmm10
|
||||
movaps -0x58(%r9),%xmm11
|
||||
movaps -0x48(%r9),%xmm12
|
||||
movaps -0x38(%r9),%xmm13
|
||||
movaps -0x28(%r9),%xmm14
|
||||
movaps -0x18(%r9),%xmm15
|
||||
___
|
||||
$code.=<<___;
|
||||
mov 0x280(%rsp),%rsp
|
||||
lea (%r9),%rsp
|
||||
.L8x_epilogue:
|
||||
ret
|
||||
.size ChaCha20_8x,.-ChaCha20_8x
|
||||
___
|
||||
|
@ -2275,28 +2274,23 @@ sub AVX512ROUND { # critical path is 14 "SIMD ticks" per round
|
|||
&vprold ($b,$b,7);
|
||||
}
|
||||
|
||||
my $xframe = $win64 ? 32+32+8 : 24;
|
||||
my $xframe = $win64 ? 32+8 : 8;
|
||||
|
||||
$code.=<<___;
|
||||
.type ChaCha20_avx512,\@function,5
|
||||
.align 32
|
||||
ChaCha20_avx512:
|
||||
.LChaCha20_avx512:
|
||||
mov %rsp,%r9 # frame pointer
|
||||
cmp \$512,$len
|
||||
ja .LChaCha20_16x
|
||||
|
||||
push %rbx # just to share SEH handler, no pops
|
||||
push %rbp
|
||||
push %r12
|
||||
push %r13
|
||||
push %r14
|
||||
push %r15
|
||||
|
||||
sub \$64+$xframe,%rsp
|
||||
___
|
||||
$code.=<<___ if ($win64);
|
||||
movaps %xmm6,64+32(%rsp)
|
||||
movaps %xmm7,64+48(%rsp)
|
||||
movaps %xmm6,-0x28(%r9)
|
||||
movaps %xmm7,-0x18(%r9)
|
||||
.Lavx512_body:
|
||||
___
|
||||
$code.=<<___;
|
||||
vbroadcasti32x4 .Lsigma(%rip),$a
|
||||
|
@ -2462,11 +2456,12 @@ $code.=<<___;
|
|||
vzeroall
|
||||
___
|
||||
$code.=<<___ if ($win64);
|
||||
movaps 64+32(%rsp),%xmm6
|
||||
movaps 64+48(%rsp),%xmm7
|
||||
movaps -0x28(%r9),%xmm6
|
||||
movaps -0x18(%r9),%xmm7
|
||||
___
|
||||
$code.=<<___;
|
||||
add \$64+$xframe+48,%rsp
|
||||
lea (%r9),%rsp
|
||||
.Lavx512_epilogue:
|
||||
ret
|
||||
.size ChaCha20_avx512,.-ChaCha20_avx512
|
||||
___
|
||||
|
@ -2543,29 +2538,29 @@ my @x=map("\"$_\"",@xx);
|
|||
);
|
||||
}
|
||||
|
||||
my $xframe = $win64 ? 0xb0 : 8;
|
||||
my $xframe = $win64 ? 0xa8 : 8;
|
||||
|
||||
$code.=<<___;
|
||||
.type ChaCha20_16x,\@function,5
|
||||
.align 32
|
||||
ChaCha20_16x:
|
||||
.LChaCha20_16x:
|
||||
mov %rsp,%r11
|
||||
mov %rsp,%r9 # frame register
|
||||
sub \$64+$xframe,%rsp
|
||||
and \$-64,%rsp
|
||||
___
|
||||
$code.=<<___ if ($win64);
|
||||
lea 0x290+0x30(%rsp),%r11
|
||||
movaps %xmm6,-0x30(%r11)
|
||||
movaps %xmm7,-0x20(%r11)
|
||||
movaps %xmm8,-0x10(%r11)
|
||||
movaps %xmm9,0x00(%r11)
|
||||
movaps %xmm10,0x10(%r11)
|
||||
movaps %xmm11,0x20(%r11)
|
||||
movaps %xmm12,0x30(%r11)
|
||||
movaps %xmm13,0x40(%r11)
|
||||
movaps %xmm14,0x50(%r11)
|
||||
movaps %xmm15,0x60(%r11)
|
||||
movaps %xmm6,-0xa8(%r9)
|
||||
movaps %xmm7,-0x98(%r9)
|
||||
movaps %xmm8,-0x88(%r9)
|
||||
movaps %xmm9,-0x78(%r9)
|
||||
movaps %xmm10,-0x68(%r9)
|
||||
movaps %xmm11,-0x58(%r9)
|
||||
movaps %xmm12,-0x48(%r9)
|
||||
movaps %xmm13,-0x38(%r9)
|
||||
movaps %xmm14,-0x28(%r9)
|
||||
movaps %xmm15,-0x18(%r9)
|
||||
.L16x_body:
|
||||
___
|
||||
$code.=<<___;
|
||||
vzeroupper
|
||||
|
@ -2955,25 +2950,275 @@ $code.=<<___;
|
|||
vzeroall
|
||||
___
|
||||
$code.=<<___ if ($win64);
|
||||
lea 0x290+0x30(%rsp),%r11
|
||||
movaps -0x30(%r11),%xmm6
|
||||
movaps -0x20(%r11),%xmm7
|
||||
movaps -0x10(%r11),%xmm8
|
||||
movaps 0x00(%r11),%xmm9
|
||||
movaps 0x10(%r11),%xmm10
|
||||
movaps 0x20(%r11),%xmm11
|
||||
movaps 0x30(%r11),%xmm12
|
||||
movaps 0x40(%r11),%xmm13
|
||||
movaps 0x50(%r11),%xmm14
|
||||
movaps 0x60(%r11),%xmm15
|
||||
movaps -0xa8(%r9),%xmm6
|
||||
movaps -0x98(%r9),%xmm7
|
||||
movaps -0x88(%r9),%xmm8
|
||||
movaps -0x78(%r9),%xmm9
|
||||
movaps -0x68(%r9),%xmm10
|
||||
movaps -0x58(%r9),%xmm11
|
||||
movaps -0x48(%r9),%xmm12
|
||||
movaps -0x38(%r9),%xmm13
|
||||
movaps -0x28(%r9),%xmm14
|
||||
movaps -0x18(%r9),%xmm15
|
||||
___
|
||||
$code.=<<___;
|
||||
mov %r11,%rsp
|
||||
lea (%r9),%rsp
|
||||
.L16x_epilogue:
|
||||
ret
|
||||
.size ChaCha20_16x,.-ChaCha20_16x
|
||||
___
|
||||
}
|
||||
|
||||
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
|
||||
# CONTEXT *context,DISPATCHER_CONTEXT *disp)
|
||||
if ($win64) {
|
||||
$rec="%rcx";
|
||||
$frame="%rdx";
|
||||
$context="%r8";
|
||||
$disp="%r9";
|
||||
|
||||
$code.=<<___;
|
||||
.extern __imp_RtlVirtualUnwind
|
||||
# se_handler: Win64 language-specific exception handler taking
# (rec, frame, context, disp).  It compares context->Rip with
# .Lctr32_body and .Lno_data; for Rip in between it recomputes the top
# of the frame as context->Rsp+64+24+48 and copies the six saved
# non-volatile registers found below it back into the CONTEXT record.
# The .Lcommon_seh_tail part then updates Rsp/Rsi/Rdi in the CONTEXT,
# copies it to disp->ContextRecord, calls RtlVirtualUnwind and returns
# ExceptionContinueSearch.  ssse3_handler and full_handler jump into
# .Lcommon_seh_tail and rely on the epilogue below matching their own
# prologues.
.type se_handler,\@abi-omnipotent
|
||||
.align 16
|
||||
se_handler:
|
||||
push %rsi
|
||||
push %rdi
|
||||
push %rbx
|
||||
push %rbp
|
||||
push %r12
|
||||
push %r13
|
||||
push %r14
|
||||
push %r15
|
||||
pushfq
|
||||
sub \$64,%rsp # room for the stack arguments of RtlVirtualUnwind (32..56 off rsp)
|
||||
|
||||
mov 120($context),%rax # pull context->Rax
|
||||
mov 248($context),%rbx # pull context->Rip
|
||||
|
||||
mov 8($disp),%rsi # disp->ImageBase
|
||||
mov 56($disp),%r11 # disp->HandlerData
|
||||
|
||||
lea .Lctr32_body(%rip),%r10 # end-of-prologue label
|
||||
cmp %r10,%rbx # context->Rip<.Lctr32_body
|
||||
jb .Lcommon_seh_tail
|
||||
|
||||
mov 152($context),%rax # pull context->Rsp
|
||||
|
||||
lea .Lno_data(%rip),%r10 # epilogue label
|
||||
cmp %r10,%rbx # context->Rip>=.Lno_data
|
||||
jae .Lcommon_seh_tail
|
||||
|
||||
lea 64+24+48(%rax),%rax # same offset as the ChaCha20_ctr32 epilogue uses
|
||||
|
||||
mov -8(%rax),%rbx
|
||||
mov -16(%rax),%rbp
|
||||
mov -24(%rax),%r12
|
||||
mov -32(%rax),%r13
|
||||
mov -40(%rax),%r14
|
||||
mov -48(%rax),%r15
|
||||
mov %rbx,144($context) # restore context->Rbx
|
||||
mov %rbp,160($context) # restore context->Rbp
|
||||
mov %r12,216($context) # restore context->R12
|
||||
mov %r13,224($context) # restore context->R13
|
||||
mov %r14,232($context) # restore context->R14
|
||||
mov %r15,240($context) # restore context->R15
|
||||
|
||||
.Lcommon_seh_tail:
|
||||
mov 8(%rax),%rdi # NOTE(review): presumably the slot where the Win64 prologue stored rdi - confirm
|
||||
mov 16(%rax),%rsi # NOTE(review): presumably the slot where the Win64 prologue stored rsi - confirm
|
||||
mov %rax,152($context) # restore context->Rsp
|
||||
mov %rsi,168($context) # restore context->Rsi
|
||||
mov %rdi,176($context) # restore context->Rdi
|
||||
|
||||
mov 40($disp),%rdi # disp->ContextRecord
|
||||
mov $context,%rsi # context
|
||||
mov \$154,%ecx # sizeof(CONTEXT) in quadwords (rep movsq count)
|
||||
.long 0xa548f3fc # cld; rep movsq
|
||||
|
||||
mov $disp,%rsi
|
||||
xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
|
||||
mov 8(%rsi),%rdx # arg2, disp->ImageBase
|
||||
mov 0(%rsi),%r8 # arg3, disp->ControlPc
|
||||
mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
|
||||
mov 40(%rsi),%r10 # disp->ContextRecord
|
||||
lea 56(%rsi),%r11 # &disp->HandlerData
|
||||
lea 24(%rsi),%r12 # &disp->EstablisherFrame
|
||||
mov %r10,32(%rsp) # arg5
|
||||
mov %r11,40(%rsp) # arg6
|
||||
mov %r12,48(%rsp) # arg7
|
||||
mov %rcx,56(%rsp) # arg8, (NULL)
|
||||
call *__imp_RtlVirtualUnwind(%rip)
|
||||
|
||||
mov \$1,%eax # ExceptionContinueSearch
|
||||
add \$64,%rsp
|
||||
popfq
|
||||
pop %r15
|
||||
pop %r14
|
||||
pop %r13
|
||||
pop %r12
|
||||
pop %rbp
|
||||
pop %rbx
|
||||
pop %rdi
|
||||
pop %rsi
|
||||
ret
|
||||
.size se_handler,.-se_handler
|
||||
|
||||
# ssse3_handler: exception handler for the routines that save only xmm6
# and xmm7 relative to a frame pointer kept in r9.  HandlerData[0] and
# HandlerData[1] are image-relative offsets of the end-of-prologue and
# epilogue labels.  For Rip between them it copies the two saved xmm
# registers into the CONTEXT record, then continues at .Lcommon_seh_tail
# with rax holding context->R9.  Outside that range it goes straight to
# .Lcommon_seh_tail.
.type ssse3_handler,\@abi-omnipotent
|
||||
.align 16
|
||||
ssse3_handler:
|
||||
push %rsi
|
||||
push %rdi
|
||||
push %rbx
|
||||
push %rbp
|
||||
push %r12
|
||||
push %r13
|
||||
push %r14
|
||||
push %r15
|
||||
pushfq
|
||||
sub \$64,%rsp
|
||||
|
||||
mov 120($context),%rax # pull context->Rax
|
||||
mov 248($context),%rbx # pull context->Rip
|
||||
|
||||
mov 8($disp),%rsi # disp->ImageBase
|
||||
mov 56($disp),%r11 # disp->HandlerData
|
||||
|
||||
mov 0(%r11),%r10d # HandlerData[0]
|
||||
lea (%rsi,%r10),%r10 # end-of-prologue (body) label
|
||||
cmp %r10,%rbx # context->Rip<body label
|
||||
jb .Lcommon_seh_tail
|
||||
|
||||
mov 192($context),%rax # pull context->R9, the frame pointer set on function entry
|
||||
|
||||
mov 4(%r11),%r10d # HandlerData[1]
|
||||
lea (%rsi,%r10),%r10 # epilogue label
|
||||
cmp %r10,%rbx # context->Rip>=epilogue label
|
||||
jae .Lcommon_seh_tail
|
||||
|
||||
lea -0x28(%rax),%rsi # xmm6 was saved at -0x28 and xmm7 at -0x18 off r9
|
||||
lea 512($context),%rdi # &context.Xmm6
|
||||
mov \$4,%ecx # 4 quadwords = 2 xmm registers
|
||||
.long 0xa548f3fc # cld; rep movsq
|
||||
|
||||
jmp .Lcommon_seh_tail
|
||||
.size ssse3_handler,.-ssse3_handler
|
||||
|
||||
# full_handler: exception handler for the routines that save xmm6 through
# xmm15 relative to a frame pointer kept in r9.  HandlerData[0] and
# HandlerData[1] are image-relative offsets of the end-of-prologue and
# epilogue labels.  For Rip between them it copies the ten saved xmm
# registers into the CONTEXT record, then continues at .Lcommon_seh_tail
# with rax holding context->R9.  Outside that range it goes straight to
# .Lcommon_seh_tail.
.type full_handler,\@abi-omnipotent
|
||||
.align 16
|
||||
full_handler:
|
||||
push %rsi
|
||||
push %rdi
|
||||
push %rbx
|
||||
push %rbp
|
||||
push %r12
|
||||
push %r13
|
||||
push %r14
|
||||
push %r15
|
||||
pushfq
|
||||
sub \$64,%rsp
|
||||
|
||||
mov 120($context),%rax # pull context->Rax
|
||||
mov 248($context),%rbx # pull context->Rip
|
||||
|
||||
mov 8($disp),%rsi # disp->ImageBase
|
||||
mov 56($disp),%r11 # disp->HandlerData
|
||||
|
||||
mov 0(%r11),%r10d # HandlerData[0]
|
||||
lea (%rsi,%r10),%r10 # end-of-prologue (body) label
|
||||
cmp %r10,%rbx # context->Rip<body label
|
||||
jb .Lcommon_seh_tail
|
||||
|
||||
mov 192($context),%rax # pull context->R9, the frame pointer set on function entry
|
||||
|
||||
mov 4(%r11),%r10d # HandlerData[1]
|
||||
lea (%rsi,%r10),%r10 # epilogue label
|
||||
cmp %r10,%rbx # context->Rip>=epilogue label
|
||||
jae .Lcommon_seh_tail
|
||||
|
||||
lea -0xa8(%rax),%rsi # xmm6 was saved at -0xa8 off r9, xmm15 at -0x18
|
||||
lea 512($context),%rdi # &context.Xmm6
|
||||
mov \$20,%ecx # 20 quadwords = 10 xmm registers
|
||||
.long 0xa548f3fc # cld; rep movsq
|
||||
|
||||
jmp .Lcommon_seh_tail
|
||||
.size full_handler,.-full_handler
|
||||
|
||||
.section .pdata
|
||||
.align 4
|
||||
.rva .LSEH_begin_ChaCha20_ctr32
|
||||
.rva .LSEH_end_ChaCha20_ctr32
|
||||
.rva .LSEH_info_ChaCha20_ctr32
|
||||
|
||||
.rva .LSEH_begin_ChaCha20_ssse3
|
||||
.rva .LSEH_end_ChaCha20_ssse3
|
||||
.rva .LSEH_info_ChaCha20_ssse3
|
||||
|
||||
.rva .LSEH_begin_ChaCha20_4x
|
||||
.rva .LSEH_end_ChaCha20_4x
|
||||
.rva .LSEH_info_ChaCha20_4x
|
||||
___
|
||||
$code.=<<___ if ($avx);
|
||||
.rva .LSEH_begin_ChaCha20_4xop
|
||||
.rva .LSEH_end_ChaCha20_4xop
|
||||
.rva .LSEH_info_ChaCha20_4xop
|
||||
___
|
||||
$code.=<<___ if ($avx>1);
|
||||
.rva .LSEH_begin_ChaCha20_8x
|
||||
.rva .LSEH_end_ChaCha20_8x
|
||||
.rva .LSEH_info_ChaCha20_8x
|
||||
___
|
||||
$code.=<<___ if ($avx>2);
|
||||
.rva .LSEH_begin_ChaCha20_avx512
|
||||
.rva .LSEH_end_ChaCha20_avx512
|
||||
.rva .LSEH_info_ChaCha20_avx512
|
||||
|
||||
.rva .LSEH_begin_ChaCha20_16x
|
||||
.rva .LSEH_end_ChaCha20_16x
|
||||
.rva .LSEH_info_ChaCha20_16x
|
||||
___
|
||||
$code.=<<___;
|
||||
.section .xdata
|
||||
.align 8
|
||||
.LSEH_info_ChaCha20_ctr32:
|
||||
.byte 9,0,0,0
|
||||
.rva se_handler
|
||||
|
||||
.LSEH_info_ChaCha20_ssse3:
|
||||
.byte 9,0,0,0
|
||||
.rva ssse3_handler
|
||||
.rva .Lssse3_body,.Lssse3_epilogue
|
||||
|
||||
.LSEH_info_ChaCha20_4x:
|
||||
.byte 9,0,0,0
|
||||
.rva full_handler
|
||||
.rva .L4x_body,.L4x_epilogue
|
||||
___
|
||||
$code.=<<___ if ($avx);
|
||||
.LSEH_info_ChaCha20_4xop:
|
||||
.byte 9,0,0,0
|
||||
.rva full_handler
|
||||
.rva .L4xop_body,.L4xop_epilogue # HandlerData[]
|
||||
___
|
||||
$code.=<<___ if ($avx>1);
|
||||
.LSEH_info_ChaCha20_8x:
|
||||
.byte 9,0,0,0
|
||||
.rva full_handler
|
||||
.rva .L8x_body,.L8x_epilogue # HandlerData[]
|
||||
___
|
||||
$code.=<<___ if ($avx>2);
|
||||
.LSEH_info_ChaCha20_avx512:
|
||||
.byte 9,0,0,0
|
||||
.rva ssse3_handler
|
||||
.rva .Lavx512_body,.Lavx512_epilogue # HandlerData[]
|
||||
|
||||
.LSEH_info_ChaCha20_16x:
|
||||
.byte 9,0,0,0
|
||||
.rva full_handler
|
||||
.rva .L16x_body,.L16x_epilogue # HandlerData[]
|
||||
___
|
||||
}
|
||||
|
||||
foreach (split("\n",$code)) {
|
||||
s/\`([^\`]*)\`/eval $1/ge;
|
||||
|
||||
|
|
|
@ -133,6 +133,7 @@ $code.=<<___;
|
|||
ecp_nistz256_mul_by_2:
|
||||
push %r12
|
||||
push %r13
|
||||
.Lmul_by_2_body:
|
||||
|
||||
mov 8*0($a_ptr), $a0
|
||||
xor $t4,$t4
|
||||
|
@ -165,8 +166,10 @@ ecp_nistz256_mul_by_2:
|
|||
mov $a2, 8*2($r_ptr)
|
||||
mov $a3, 8*3($r_ptr)
|
||||
|
||||
pop %r13
|
||||
pop %r12
|
||||
mov 0(%rsp),%r13
|
||||
mov 8(%rsp),%r12
|
||||
lea 16(%rsp),%rsp
|
||||
.Lmul_by_2_epilogue:
|
||||
ret
|
||||
.size ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2
|
||||
|
||||
|
@ -178,6 +181,7 @@ ecp_nistz256_mul_by_2:
|
|||
ecp_nistz256_div_by_2:
|
||||
push %r12
|
||||
push %r13
|
||||
.Ldiv_by_2_body:
|
||||
|
||||
mov 8*0($a_ptr), $a0
|
||||
mov 8*1($a_ptr), $a1
|
||||
|
@ -225,8 +229,10 @@ ecp_nistz256_div_by_2:
|
|||
mov $a2, 8*2($r_ptr)
|
||||
mov $a3, 8*3($r_ptr)
|
||||
|
||||
pop %r13
|
||||
pop %r12
|
||||
mov 0(%rsp),%r13
|
||||
mov 8(%rsp),%r12
|
||||
lea 16(%rsp),%rsp
|
||||
.Ldiv_by_2_epilogue:
|
||||
ret
|
||||
.size ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2
|
||||
|
||||
|
@ -238,6 +244,7 @@ ecp_nistz256_div_by_2:
|
|||
ecp_nistz256_mul_by_3:
|
||||
push %r12
|
||||
push %r13
|
||||
.Lmul_by_3_body:
|
||||
|
||||
mov 8*0($a_ptr), $a0
|
||||
xor $t4, $t4
|
||||
|
@ -291,8 +298,10 @@ ecp_nistz256_mul_by_3:
|
|||
mov $a2, 8*2($r_ptr)
|
||||
mov $a3, 8*3($r_ptr)
|
||||
|
||||
pop %r13
|
||||
pop %r12
|
||||
mov 0(%rsp),%r13
|
||||
mov 8(%rsp),%r12
|
||||
lea 16(%rsp),%rsp
|
||||
.Lmul_by_3_epilogue:
|
||||
ret
|
||||
.size ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3
|
||||
|
||||
|
@ -304,6 +313,7 @@ ecp_nistz256_mul_by_3:
|
|||
ecp_nistz256_add:
|
||||
push %r12
|
||||
push %r13
|
||||
.Ladd_body:
|
||||
|
||||
mov 8*0($a_ptr), $a0
|
||||
xor $t4, $t4
|
||||
|
@ -337,8 +347,10 @@ ecp_nistz256_add:
|
|||
mov $a2, 8*2($r_ptr)
|
||||
mov $a3, 8*3($r_ptr)
|
||||
|
||||
pop %r13
|
||||
pop %r12
|
||||
mov 0(%rsp),%r13
|
||||
mov 8(%rsp),%r12
|
||||
lea 16(%rsp),%rsp
|
||||
.Ladd_epilogue:
|
||||
ret
|
||||
.size ecp_nistz256_add,.-ecp_nistz256_add
|
||||
|
||||
|
@ -350,6 +362,7 @@ ecp_nistz256_add:
|
|||
ecp_nistz256_sub:
|
||||
push %r12
|
||||
push %r13
|
||||
.Lsub_body:
|
||||
|
||||
mov 8*0($a_ptr), $a0
|
||||
xor $t4, $t4
|
||||
|
@ -383,8 +396,10 @@ ecp_nistz256_sub:
|
|||
mov $a2, 8*2($r_ptr)
|
||||
mov $a3, 8*3($r_ptr)
|
||||
|
||||
pop %r13
|
||||
pop %r12
|
||||
mov 0(%rsp),%r13
|
||||
mov 8(%rsp),%r12
|
||||
lea 16(%rsp),%rsp
|
||||
.Lsub_epilogue:
|
||||
ret
|
||||
.size ecp_nistz256_sub,.-ecp_nistz256_sub
|
||||
|
||||
|
@ -396,6 +411,7 @@ ecp_nistz256_sub:
|
|||
ecp_nistz256_neg:
|
||||
push %r12
|
||||
push %r13
|
||||
.Lneg_body:
|
||||
|
||||
xor $a0, $a0
|
||||
xor $a1, $a1
|
||||
|
@ -429,8 +445,10 @@ ecp_nistz256_neg:
|
|||
mov $a2, 8*2($r_ptr)
|
||||
mov $a3, 8*3($r_ptr)
|
||||
|
||||
pop %r13
|
||||
pop %r12
|
||||
mov 0(%rsp),%r13
|
||||
mov 8(%rsp),%r12
|
||||
lea 16(%rsp),%rsp
|
||||
.Lneg_epilogue:
|
||||
ret
|
||||
.size ecp_nistz256_neg,.-ecp_nistz256_neg
|
||||
___
|
||||
|
@ -483,6 +501,7 @@ $code.=<<___;
|
|||
push %r13
|
||||
push %r14
|
||||
push %r15
|
||||
.Lmul_body:
|
||||
___
|
||||
$code.=<<___ if ($addx);
|
||||
cmp \$0x80100, %ecx
|
||||
|
@ -515,12 +534,14 @@ $code.=<<___ if ($addx);
|
|||
___
|
||||
$code.=<<___;
|
||||
.Lmul_mont_done:
|
||||
pop %r15
|
||||
pop %r14
|
||||
pop %r13
|
||||
pop %r12
|
||||
pop %rbx
|
||||
pop %rbp
|
||||
mov 0(%rsp),%r15
|
||||
mov 8(%rsp),%r14
|
||||
mov 16(%rsp),%r13
|
||||
mov 24(%rsp),%r12
|
||||
mov 32(%rsp),%rbx
|
||||
mov 40(%rsp),%rbp
|
||||
lea 48(%rsp),%rsp
|
||||
.Lmul_epilogue:
|
||||
ret
|
||||
.size ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont
|
||||
|
||||
|
@ -763,6 +784,7 @@ $code.=<<___;
|
|||
push %r13
|
||||
push %r14
|
||||
push %r15
|
||||
.Lsqr_body:
|
||||
___
|
||||
$code.=<<___ if ($addx);
|
||||
cmp \$0x80100, %ecx
|
||||
|
@ -791,12 +813,14 @@ $code.=<<___ if ($addx);
|
|||
___
|
||||
$code.=<<___;
|
||||
.Lsqr_mont_done:
|
||||
pop %r15
|
||||
pop %r14
|
||||
pop %r13
|
||||
pop %r12
|
||||
pop %rbx
|
||||
pop %rbp
|
||||
mov 0(%rsp),%r15
|
||||
mov 8(%rsp),%r14
|
||||
mov 16(%rsp),%r13
|
||||
mov 24(%rsp),%r12
|
||||
mov 32(%rsp),%rbx
|
||||
mov 40(%rsp),%rbp
|
||||
lea 48(%rsp),%rsp
|
||||
.Lsqr_epilogue:
|
||||
ret
|
||||
.size ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont
|
||||
|
||||
|
@ -1284,6 +1308,7 @@ $code.=<<___;
|
|||
ecp_nistz256_from_mont:
|
||||
push %r12
|
||||
push %r13
|
||||
.Lfrom_body:
|
||||
|
||||
mov 8*0($in_ptr), %rax
|
||||
mov .Lpoly+8*3(%rip), $t2
|
||||
|
@ -1364,8 +1389,10 @@ ecp_nistz256_from_mont:
|
|||
mov $acc2, 8*2($r_ptr)
|
||||
mov $acc3, 8*3($r_ptr)
|
||||
|
||||
pop %r13
|
||||
pop %r12
|
||||
mov 0(%rsp),%r13
|
||||
mov 8(%rsp),%r12
|
||||
lea 16(%rsp),%rsp
|
||||
.Lfrom_epilogue:
|
||||
ret
|
||||
.size ecp_nistz256_from_mont,.-ecp_nistz256_from_mont
|
||||
___
|
||||
|
@ -1492,10 +1519,10 @@ $code.=<<___ if ($win64);
|
|||
movaps 0x80(%rsp), %xmm14
|
||||
movaps 0x90(%rsp), %xmm15
|
||||
lea 0xa8(%rsp), %rsp
|
||||
.LSEH_end_ecp_nistz256_gather_w5:
|
||||
___
|
||||
$code.=<<___;
|
||||
ret
|
||||
.LSEH_end_ecp_nistz256_gather_w5:
|
||||
.size ecp_nistz256_gather_w5,.-ecp_nistz256_gather_w5
|
||||
|
||||
################################################################################
|
||||
|
@ -1597,10 +1624,10 @@ $code.=<<___ if ($win64);
|
|||
movaps 0x80(%rsp), %xmm14
|
||||
movaps 0x90(%rsp), %xmm15
|
||||
lea 0xa8(%rsp), %rsp
|
||||
.LSEH_end_ecp_nistz256_gather_w7:
|
||||
___
|
||||
$code.=<<___;
|
||||
ret
|
||||
.LSEH_end_ecp_nistz256_gather_w7:
|
||||
.size ecp_nistz256_gather_w7,.-ecp_nistz256_gather_w7
|
||||
___
|
||||
}
|
||||
|
@ -1621,18 +1648,19 @@ ecp_nistz256_avx2_gather_w5:
|
|||
___
|
||||
$code.=<<___ if ($win64);
|
||||
lea -0x88(%rsp), %rax
|
||||
mov %rsp,%r11
|
||||
.LSEH_begin_ecp_nistz256_avx2_gather_w5:
|
||||
.byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax), %rsp
|
||||
.byte 0xc5,0xf8,0x29,0x70,0xe0 #vmovaps %xmm6, -0x20(%rax)
|
||||
.byte 0xc5,0xf8,0x29,0x78,0xf0 #vmovaps %xmm7, -0x10(%rax)
|
||||
.byte 0xc5,0x78,0x29,0x40,0x00 #vmovaps %xmm8, 0(%rax)
|
||||
.byte 0xc5,0x78,0x29,0x48,0x10 #vmovaps %xmm9, 0x10(%rax)
|
||||
.byte 0xc5,0x78,0x29,0x50,0x20 #vmovaps %xmm10, 0x20(%rax)
|
||||
.byte 0xc5,0x78,0x29,0x58,0x30 #vmovaps %xmm11, 0x30(%rax)
|
||||
.byte 0xc5,0x78,0x29,0x60,0x40 #vmovaps %xmm12, 0x40(%rax)
|
||||
.byte 0xc5,0x78,0x29,0x68,0x50 #vmovaps %xmm13, 0x50(%rax)
|
||||
.byte 0xc5,0x78,0x29,0x70,0x60 #vmovaps %xmm14, 0x60(%rax)
|
||||
.byte 0xc5,0x78,0x29,0x78,0x70 #vmovaps %xmm15, 0x70(%rax)
|
||||
.byte 0x48,0x8d,0x60,0xe0 # lea -0x20(%rax), %rsp
|
||||
.byte 0xc5,0xf8,0x29,0x70,0xe0 # vmovaps %xmm6, -0x20(%rax)
|
||||
.byte 0xc5,0xf8,0x29,0x78,0xf0 # vmovaps %xmm7, -0x10(%rax)
|
||||
.byte 0xc5,0x78,0x29,0x40,0x00 # vmovaps %xmm8, 0(%rax)
|
||||
.byte 0xc5,0x78,0x29,0x48,0x10 # vmovaps %xmm9, 0x10(%rax)
|
||||
.byte 0xc5,0x78,0x29,0x50,0x20 # vmovaps %xmm10, 0x20(%rax)
|
||||
.byte 0xc5,0x78,0x29,0x58,0x30 # vmovaps %xmm11, 0x30(%rax)
|
||||
.byte 0xc5,0x78,0x29,0x60,0x40 # vmovaps %xmm12, 0x40(%rax)
|
||||
.byte 0xc5,0x78,0x29,0x68,0x50 # vmovaps %xmm13, 0x50(%rax)
|
||||
.byte 0xc5,0x78,0x29,0x70,0x60 # vmovaps %xmm14, 0x60(%rax)
|
||||
.byte 0xc5,0x78,0x29,0x78,0x70 # vmovaps %xmm15, 0x70(%rax)
|
||||
___
|
||||
$code.=<<___;
|
||||
vmovdqa .LTwo(%rip), $TWO
|
||||
|
@ -1698,11 +1726,11 @@ $code.=<<___ if ($win64);
|
|||
movaps 0x70(%rsp), %xmm13
|
||||
movaps 0x80(%rsp), %xmm14
|
||||
movaps 0x90(%rsp), %xmm15
|
||||
lea 0xa8(%rsp), %rsp
|
||||
.LSEH_end_ecp_nistz256_avx2_gather_w5:
|
||||
lea (%r11), %rsp
|
||||
___
|
||||
$code.=<<___;
|
||||
ret
|
||||
.LSEH_end_ecp_nistz256_avx2_gather_w5:
|
||||
.size ecp_nistz256_avx2_gather_w5,.-ecp_nistz256_avx2_gather_w5
|
||||
___
|
||||
}
|
||||
|
@ -1725,19 +1753,20 @@ ecp_nistz256_avx2_gather_w7:
|
|||
vzeroupper
|
||||
___
|
||||
$code.=<<___ if ($win64);
|
||||
mov %rsp,%r11
|
||||
lea -0x88(%rsp), %rax
|
||||
.LSEH_begin_ecp_nistz256_avx2_gather_w7:
|
||||
.byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax), %rsp
|
||||
.byte 0xc5,0xf8,0x29,0x70,0xe0 #vmovaps %xmm6, -0x20(%rax)
|
||||
.byte 0xc5,0xf8,0x29,0x78,0xf0 #vmovaps %xmm7, -0x10(%rax)
|
||||
.byte 0xc5,0x78,0x29,0x40,0x00 #vmovaps %xmm8, 0(%rax)
|
||||
.byte 0xc5,0x78,0x29,0x48,0x10 #vmovaps %xmm9, 0x10(%rax)
|
||||
.byte 0xc5,0x78,0x29,0x50,0x20 #vmovaps %xmm10, 0x20(%rax)
|
||||
.byte 0xc5,0x78,0x29,0x58,0x30 #vmovaps %xmm11, 0x30(%rax)
|
||||
.byte 0xc5,0x78,0x29,0x60,0x40 #vmovaps %xmm12, 0x40(%rax)
|
||||
.byte 0xc5,0x78,0x29,0x68,0x50 #vmovaps %xmm13, 0x50(%rax)
|
||||
.byte 0xc5,0x78,0x29,0x70,0x60 #vmovaps %xmm14, 0x60(%rax)
|
||||
.byte 0xc5,0x78,0x29,0x78,0x70 #vmovaps %xmm15, 0x70(%rax)
|
||||
.byte 0x48,0x8d,0x60,0xe0 # lea -0x20(%rax), %rsp
|
||||
.byte 0xc5,0xf8,0x29,0x70,0xe0 # vmovaps %xmm6, -0x20(%rax)
|
||||
.byte 0xc5,0xf8,0x29,0x78,0xf0 # vmovaps %xmm7, -0x10(%rax)
|
||||
.byte 0xc5,0x78,0x29,0x40,0x00 # vmovaps %xmm8, 0(%rax)
|
||||
.byte 0xc5,0x78,0x29,0x48,0x10 # vmovaps %xmm9, 0x10(%rax)
|
||||
.byte 0xc5,0x78,0x29,0x50,0x20 # vmovaps %xmm10, 0x20(%rax)
|
||||
.byte 0xc5,0x78,0x29,0x58,0x30 # vmovaps %xmm11, 0x30(%rax)
|
||||
.byte 0xc5,0x78,0x29,0x60,0x40 # vmovaps %xmm12, 0x40(%rax)
|
||||
.byte 0xc5,0x78,0x29,0x68,0x50 # vmovaps %xmm13, 0x50(%rax)
|
||||
.byte 0xc5,0x78,0x29,0x70,0x60 # vmovaps %xmm14, 0x60(%rax)
|
||||
.byte 0xc5,0x78,0x29,0x78,0x70 # vmovaps %xmm15, 0x70(%rax)
|
||||
___
|
||||
$code.=<<___;
|
||||
vmovdqa .LThree(%rip), $THREE
|
||||
|
@ -1818,11 +1847,11 @@ $code.=<<___ if ($win64);
|
|||
movaps 0x70(%rsp), %xmm13
|
||||
movaps 0x80(%rsp), %xmm14
|
||||
movaps 0x90(%rsp), %xmm15
|
||||
lea 0xa8(%rsp), %rsp
|
||||
.LSEH_end_ecp_nistz256_avx2_gather_w7:
|
||||
lea (%r11), %rsp
|
||||
___
|
||||
$code.=<<___;
|
||||
ret
|
||||
.LSEH_end_ecp_nistz256_avx2_gather_w7:
|
||||
.size ecp_nistz256_avx2_gather_w7,.-ecp_nistz256_avx2_gather_w7
|
||||
___
|
||||
} else {
|
||||
|
@ -2053,6 +2082,7 @@ $code.=<<___;
|
|||
push %r14
|
||||
push %r15
|
||||
sub \$32*5+8, %rsp
|
||||
.Lpoint_double${x}_body:
|
||||
|
||||
.Lpoint_double_shortcut$x:
|
||||
movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$a_ptr.x
|
||||
|
@ -2223,13 +2253,15 @@ $code.=<<___;
|
|||
movq %xmm1, $r_ptr
|
||||
call __ecp_nistz256_sub_from$x # p256_sub(res_y, S, res_y);
|
||||
|
||||
add \$32*5+8, %rsp
|
||||
pop %r15
|
||||
pop %r14
|
||||
pop %r13
|
||||
pop %r12
|
||||
pop %rbx
|
||||
pop %rbp
|
||||
lea 32*5+56(%rsp), %rsi
|
||||
mov -48(%rsi),%r15
|
||||
mov -40(%rsi),%r14
|
||||
mov -32(%rsi),%r13
|
||||
mov -24(%rsi),%r12
|
||||
mov -16(%rsi),%rbx
|
||||
mov -8(%rsi),%rbp
|
||||
lea (%rsi),%rsp
|
||||
.Lpoint_double${x}_epilogue:
|
||||
ret
|
||||
.size ecp_nistz256_point_double$sfx,.-ecp_nistz256_point_double$sfx
|
||||
___
|
||||
|
@ -2283,6 +2315,7 @@ $code.=<<___;
|
|||
push %r14
|
||||
push %r15
|
||||
sub \$32*18+8, %rsp
|
||||
.Lpoint_add${x}_body:
|
||||
|
||||
movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$a_ptr
|
||||
movdqu 0x10($a_ptr), %xmm1
|
||||
|
@ -2591,13 +2624,15 @@ $code.=<<___;
|
|||
movdqu %xmm3, 0x30($r_ptr)
|
||||
|
||||
.Ladd_done$x:
|
||||
add \$32*18+8, %rsp
|
||||
pop %r15
|
||||
pop %r14
|
||||
pop %r13
|
||||
pop %r12
|
||||
pop %rbx
|
||||
pop %rbp
|
||||
lea 32*18+56(%rsp), %rsi
|
||||
mov -48(%rsi),%r15
|
||||
mov -40(%rsi),%r14
|
||||
mov -32(%rsi),%r13
|
||||
mov -24(%rsi),%r12
|
||||
mov -16(%rsi),%rbx
|
||||
mov -8(%rsi),%rbp
|
||||
lea (%rsi),%rsp
|
||||
.Lpoint_add${x}_epilogue:
|
||||
ret
|
||||
.size ecp_nistz256_point_add$sfx,.-ecp_nistz256_point_add$sfx
|
||||
___
|
||||
|
@ -2650,6 +2685,7 @@ $code.=<<___;
|
|||
push %r14
|
||||
push %r15
|
||||
sub \$32*15+8, %rsp
|
||||
.Ladd_affine${x}_body:
|
||||
|
||||
movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$a_ptr
|
||||
mov $b_org, $b_ptr # reassign
|
||||
|
@ -2894,13 +2930,15 @@ $code.=<<___;
|
|||
movdqu %xmm2, 0x20($r_ptr)
|
||||
movdqu %xmm3, 0x30($r_ptr)
|
||||
|
||||
add \$32*15+8, %rsp
|
||||
pop %r15
|
||||
pop %r14
|
||||
pop %r13
|
||||
pop %r12
|
||||
pop %rbx
|
||||
pop %rbp
|
||||
lea 32*15+56(%rsp), %rsi
|
||||
mov -48(%rsi),%r15
|
||||
mov -40(%rsi),%r14
|
||||
mov -32(%rsi),%r13
|
||||
mov -24(%rsi),%r12
|
||||
mov -16(%rsi),%rbx
|
||||
mov -8(%rsi),%rbp
|
||||
lea (%rsi),%rsp
|
||||
.Ladd_affine${x}_epilogue:
|
||||
ret
|
||||
.size ecp_nistz256_point_add_affine$sfx,.-ecp_nistz256_point_add_affine$sfx
|
||||
___
|
||||
|
@ -3052,6 +3090,348 @@ ___
|
|||
}
|
||||
}}}
|
||||
|
||||
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
|
||||
# CONTEXT *context,DISPATCHER_CONTEXT *disp)
|
||||
if ($win64) {
|
||||
$rec="%rcx";
|
||||
$frame="%rdx";
|
||||
$context="%r8";
|
||||
$disp="%r9";
|
||||
|
||||
$code.=<<___;
|
||||
.extern __imp_RtlVirtualUnwind
|
||||
|
||||
# short_handler: Win64 exception handler for the functions whose .xdata
# record names it. It repairs context->R12 and context->R13 and then joins
# the shared tail (.Lcommon_seh_tail, located inside full_handler), which
# also undoes the register pushes and the 64-byte frame set up below.
# HandlerData[0] and HandlerData[1] are image-relative addresses of the
# end-of-prologue label and of the epilogue label of the covered function.
.type short_handler,\@abi-omnipotent
|
||||
.align 16
|
||||
short_handler:
|
||||
push %rsi
|
||||
push %rdi
|
||||
push %rbx
|
||||
push %rbp
|
||||
push %r12
|
||||
push %r13
|
||||
push %r14
|
||||
push %r15
|
||||
pushfq
|
||||
sub \$64,%rsp
|
||||
|
||||
mov 120($context),%rax # pull context->Rax
|
||||
mov 248($context),%rbx # pull context->Rip
|
||||
|
||||
mov 8($disp),%rsi # disp->ImageBase
|
||||
mov 56($disp),%r11 # disp->HandlerData
|
||||
|
||||
mov 0(%r11),%r10d # HandlerData[0]
|
||||
lea (%rsi,%r10),%r10 # end of prologue label
|
||||
cmp %r10,%rbx # context->Rip<end of prologue label
|
||||
# fault before the body label: go to the tail with %rax = context->Rax
|
||||
# NOTE(review): the tail treats %rax as the stack pointer - confirm that
|
||||
# the covered prologues make this value meaningful
|
||||
jb .Lcommon_seh_tail
|
||||
|
||||
mov 152($context),%rax # pull context->Rsp
|
||||
|
||||
mov 4(%r11),%r10d # HandlerData[1]
|
||||
lea (%rsi,%r10),%r10 # epilogue label
|
||||
cmp %r10,%rbx # context->Rip>=epilogue label
|
||||
# fault at or past the epilogue label: nothing to reload, %rax = context->Rsp
|
||||
jae .Lcommon_seh_tail
|
||||
|
||||
# fault inside the body: step over two 8-byte slots and reload from them
|
||||
# (assumes the covered functions push exactly %r12 then %r13 - their
|
||||
# prologues are not visible here, confirm)
|
||||
lea 16(%rax),%rax
|
||||
|
||||
mov -8(%rax),%r12
|
||||
mov -16(%rax),%r13
|
||||
mov %r12,216($context) # restore context->R12
|
||||
mov %r13,224($context) # restore context->R13
|
||||
|
||||
jmp .Lcommon_seh_tail
|
||||
.size short_handler,.-short_handler
|
||||
|
||||
# full_handler: Win64 exception handler for functions that save %rbp, %rbx
# and %r12-%r15 on the stack and then allocate a fixed-size frame.
#
# Register arguments follow the handler prototype quoted above the Win64
# section: rec, frame, context, disp.
#
# HandlerData[] words read from the .xdata record of the faulting function:
#   [0] image-relative address of the end-of-prologue (body) label
#   [1] image-relative address of the epilogue label
#   [2] byte distance from the body stack pointer up to the stack pointer
#       the function was entered with
#
# The covered functions keep %rbp in the slot 8 bytes below the entry stack
# pointer and %rbx 16 bytes below it, followed by %r12-%r15; this is the
# layout their epilogues reload from, and the loads below must mirror it.
#
# Always returns ExceptionContinueSearch (1) after virtually unwinding one
# frame through RtlVirtualUnwind.
.type	full_handler,\@abi-omnipotent
.align	16
full_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp		# 32-byte home area + stack args 5..8 of the call below

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<end of prologue label
	jb	.Lcommon_seh_tail	# still in prologue: use context->Rax as is

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail	# registers already reloaded by the epilogue

	mov	8(%r11),%r10d		# HandlerData[2]
	lea	(%rax,%r10),%rax	# entry stack pointer of the faulting function

	mov	-8(%rax),%rbp		# %rbp occupies the first save slot ...
	mov	-16(%rax),%rbx		# ... and %rbx the second one
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

# Shared tail, also entered from short_handler. On entry %rax holds the
# value to be stored as context->Rsp.
.Lcommon_seh_tail:
	# NOTE(review): %rdi and %rsi are taken from 8 and 16 bytes above the
	# recovered stack pointer, presumably where the generated Win64 entry
	# code parks them - confirm against the perlasm translator.
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT) counted in quadwords
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	full_handler,.-full_handler
|
||||
|
||||
# .pdata: one record of three image-relative addresses per covered function:
# begin label, end label, and the label of its unwind information in .xdata.
# gather_w5 and gather_w7 share one unwind record (gather_wX), as do the two
# avx2 gather variants. The avx2 and the "x"-suffixed point entries are only
# emitted when the corresponding Perl conditions hold.
.section .pdata
|
||||
.align 4
|
||||
.rva .LSEH_begin_ecp_nistz256_mul_by_2
|
||||
.rva .LSEH_end_ecp_nistz256_mul_by_2
|
||||
.rva .LSEH_info_ecp_nistz256_mul_by_2
|
||||
|
||||
.rva .LSEH_begin_ecp_nistz256_div_by_2
|
||||
.rva .LSEH_end_ecp_nistz256_div_by_2
|
||||
.rva .LSEH_info_ecp_nistz256_div_by_2
|
||||
|
||||
.rva .LSEH_begin_ecp_nistz256_mul_by_3
|
||||
.rva .LSEH_end_ecp_nistz256_mul_by_3
|
||||
.rva .LSEH_info_ecp_nistz256_mul_by_3
|
||||
|
||||
.rva .LSEH_begin_ecp_nistz256_add
|
||||
.rva .LSEH_end_ecp_nistz256_add
|
||||
.rva .LSEH_info_ecp_nistz256_add
|
||||
|
||||
.rva .LSEH_begin_ecp_nistz256_sub
|
||||
.rva .LSEH_end_ecp_nistz256_sub
|
||||
.rva .LSEH_info_ecp_nistz256_sub
|
||||
|
||||
.rva .LSEH_begin_ecp_nistz256_neg
|
||||
.rva .LSEH_end_ecp_nistz256_neg
|
||||
.rva .LSEH_info_ecp_nistz256_neg
|
||||
|
||||
.rva .LSEH_begin_ecp_nistz256_to_mont
|
||||
.rva .LSEH_end_ecp_nistz256_to_mont
|
||||
.rva .LSEH_info_ecp_nistz256_to_mont
|
||||
|
||||
.rva .LSEH_begin_ecp_nistz256_mul_mont
|
||||
.rva .LSEH_end_ecp_nistz256_mul_mont
|
||||
.rva .LSEH_info_ecp_nistz256_mul_mont
|
||||
|
||||
.rva .LSEH_begin_ecp_nistz256_sqr_mont
|
||||
.rva .LSEH_end_ecp_nistz256_sqr_mont
|
||||
.rva .LSEH_info_ecp_nistz256_sqr_mont
|
||||
|
||||
.rva .LSEH_begin_ecp_nistz256_from_mont
|
||||
.rva .LSEH_end_ecp_nistz256_from_mont
|
||||
.rva .LSEH_info_ecp_nistz256_from_mont
|
||||
|
||||
.rva .LSEH_begin_ecp_nistz256_gather_w5
|
||||
.rva .LSEH_end_ecp_nistz256_gather_w5
|
||||
.rva .LSEH_info_ecp_nistz256_gather_wX
|
||||
|
||||
.rva .LSEH_begin_ecp_nistz256_gather_w7
|
||||
.rva .LSEH_end_ecp_nistz256_gather_w7
|
||||
.rva .LSEH_info_ecp_nistz256_gather_wX
|
||||
___
|
||||
$code.=<<___ if ($avx>1);
|
||||
.rva .LSEH_begin_ecp_nistz256_avx2_gather_w5
|
||||
.rva .LSEH_end_ecp_nistz256_avx2_gather_w5
|
||||
.rva .LSEH_info_ecp_nistz256_avx2_gather_wX
|
||||
|
||||
.rva .LSEH_begin_ecp_nistz256_avx2_gather_w7
|
||||
.rva .LSEH_end_ecp_nistz256_avx2_gather_w7
|
||||
.rva .LSEH_info_ecp_nistz256_avx2_gather_wX
|
||||
___
|
||||
$code.=<<___;
|
||||
.rva .LSEH_begin_ecp_nistz256_point_double
|
||||
.rva .LSEH_end_ecp_nistz256_point_double
|
||||
.rva .LSEH_info_ecp_nistz256_point_double
|
||||
|
||||
.rva .LSEH_begin_ecp_nistz256_point_add
|
||||
.rva .LSEH_end_ecp_nistz256_point_add
|
||||
.rva .LSEH_info_ecp_nistz256_point_add
|
||||
|
||||
.rva .LSEH_begin_ecp_nistz256_point_add_affine
|
||||
.rva .LSEH_end_ecp_nistz256_point_add_affine
|
||||
.rva .LSEH_info_ecp_nistz256_point_add_affine
|
||||
___
|
||||
$code.=<<___ if ($addx);
|
||||
.rva .LSEH_begin_ecp_nistz256_point_doublex
|
||||
.rva .LSEH_end_ecp_nistz256_point_doublex
|
||||
.rva .LSEH_info_ecp_nistz256_point_doublex
|
||||
|
||||
.rva .LSEH_begin_ecp_nistz256_point_addx
|
||||
.rva .LSEH_end_ecp_nistz256_point_addx
|
||||
.rva .LSEH_info_ecp_nistz256_point_addx
|
||||
|
||||
.rva .LSEH_begin_ecp_nistz256_point_add_affinex
|
||||
.rva .LSEH_end_ecp_nistz256_point_add_affinex
|
||||
.rva .LSEH_info_ecp_nistz256_point_add_affinex
|
||||
___
|
||||
$code.=<<___;
|
||||
|
||||
# .xdata: unwind information referenced from the .pdata records.
#
# Handler-based records start with ".byte 9,0,0,0" (unwind header: version 1
# with the exception-handler flag, zero-length prologue, no unwind codes),
# followed by the handler address and the HandlerData[] words the handler
# reads: [0] end-of-prologue label, [1] epilogue label and, for full_handler
# only, [2] the distance from the body stack pointer to the entry stack
# pointer. The zero after that value is presumably padding that keeps every
# record a multiple of 8 bytes - confirm.
#
# The two gather_wX records carry no handler; they describe the prologue
# with explicit unwind codes instead.
.section .xdata
|
||||
.align 8
|
||||
.LSEH_info_ecp_nistz256_mul_by_2:
|
||||
.byte 9,0,0,0
|
||||
.rva short_handler
|
||||
.rva .Lmul_by_2_body,.Lmul_by_2_epilogue # HandlerData[]
|
||||
.LSEH_info_ecp_nistz256_div_by_2:
|
||||
.byte 9,0,0,0
|
||||
.rva short_handler
|
||||
.rva .Ldiv_by_2_body,.Ldiv_by_2_epilogue # HandlerData[]
|
||||
.LSEH_info_ecp_nistz256_mul_by_3:
|
||||
.byte 9,0,0,0
|
||||
.rva short_handler
|
||||
.rva .Lmul_by_3_body,.Lmul_by_3_epilogue # HandlerData[]
|
||||
.LSEH_info_ecp_nistz256_add:
|
||||
.byte 9,0,0,0
|
||||
.rva short_handler
|
||||
.rva .Ladd_body,.Ladd_epilogue # HandlerData[]
|
||||
.LSEH_info_ecp_nistz256_sub:
|
||||
.byte 9,0,0,0
|
||||
.rva short_handler
|
||||
.rva .Lsub_body,.Lsub_epilogue # HandlerData[]
|
||||
.LSEH_info_ecp_nistz256_neg:
|
||||
.byte 9,0,0,0
|
||||
.rva short_handler
|
||||
.rva .Lneg_body,.Lneg_epilogue # HandlerData[]
|
||||
.LSEH_info_ecp_nistz256_to_mont:
|
||||
.byte 9,0,0,0
|
||||
.rva full_handler
|
||||
.rva .Lmul_body,.Lmul_epilogue # HandlerData[] (same labels as the mul_mont record)
|
||||
.long 48,0
|
||||
.LSEH_info_ecp_nistz256_mul_mont:
|
||||
.byte 9,0,0,0
|
||||
.rva full_handler
|
||||
.rva .Lmul_body,.Lmul_epilogue # HandlerData[]
|
||||
.long 48,0
|
||||
.LSEH_info_ecp_nistz256_sqr_mont:
|
||||
.byte 9,0,0,0
|
||||
.rva full_handler
|
||||
.rva .Lsqr_body,.Lsqr_epilogue # HandlerData[]
|
||||
.long 48,0
|
||||
.LSEH_info_ecp_nistz256_from_mont:
|
||||
.byte 9,0,0,0
|
||||
.rva short_handler
|
||||
.rva .Lfrom_body,.Lfrom_epilogue # HandlerData[]
|
||||
.LSEH_info_ecp_nistz256_gather_wX:
|
||||
.byte 0x01,0x33,0x16,0x00 # header: version 1, no handler, prologue 0x33 bytes, 0x16 code slots, no frame register
|
||||
.byte 0x33,0xf8,0x09,0x00 #movaps 0x90(rsp),xmm15
|
||||
.byte 0x2e,0xe8,0x08,0x00 #movaps 0x80(rsp),xmm14
|
||||
.byte 0x29,0xd8,0x07,0x00 #movaps 0x70(rsp),xmm13
|
||||
.byte 0x24,0xc8,0x06,0x00 #movaps 0x60(rsp),xmm12
|
||||
.byte 0x1f,0xb8,0x05,0x00 #movaps 0x50(rsp),xmm11
|
||||
.byte 0x1a,0xa8,0x04,0x00 #movaps 0x40(rsp),xmm10
|
||||
.byte 0x15,0x98,0x03,0x00 #movaps 0x30(rsp),xmm9
|
||||
.byte 0x10,0x88,0x02,0x00 #movaps 0x20(rsp),xmm8
|
||||
.byte 0x0c,0x78,0x01,0x00 #movaps 0x10(rsp),xmm7
|
||||
.byte 0x08,0x68,0x00,0x00 #movaps 0x00(rsp),xmm6
|
||||
.byte 0x04,0x01,0x15,0x00 #sub rsp,0xa8
|
||||
.align 8
|
||||
___
|
||||
$code.=<<___ if ($avx>1);
|
||||
.LSEH_info_ecp_nistz256_avx2_gather_wX:
|
||||
.byte 0x01,0x36,0x17,0x0b # header: version 1, no handler, prologue 0x36 bytes, 0x17 code slots, frame register 0x0b (r11)
|
||||
.byte 0x36,0xf8,0x09,0x00 # vmovaps 0x90(rsp),xmm15
|
||||
.byte 0x31,0xe8,0x08,0x00 # vmovaps 0x80(rsp),xmm14
|
||||
.byte 0x2c,0xd8,0x07,0x00 # vmovaps 0x70(rsp),xmm13
|
||||
.byte 0x27,0xc8,0x06,0x00 # vmovaps 0x60(rsp),xmm12
|
||||
.byte 0x22,0xb8,0x05,0x00 # vmovaps 0x50(rsp),xmm11
|
||||
.byte 0x1d,0xa8,0x04,0x00 # vmovaps 0x40(rsp),xmm10
|
||||
.byte 0x18,0x98,0x03,0x00 # vmovaps 0x30(rsp),xmm9
|
||||
.byte 0x13,0x88,0x02,0x00 # vmovaps 0x20(rsp),xmm8
|
||||
.byte 0x0e,0x78,0x01,0x00 # vmovaps 0x10(rsp),xmm7
|
||||
.byte 0x09,0x68,0x00,0x00 # vmovaps 0x00(rsp),xmm6
|
||||
.byte 0x04,0x01,0x15,0x00 # sub rsp,0xa8
|
||||
.byte 0x00,0xb3,0x00,0x00 # set_frame r11
|
||||
.align 8
|
||||
___
|
||||
$code.=<<___;
|
||||
.LSEH_info_ecp_nistz256_point_double:
|
||||
.byte 9,0,0,0
|
||||
.rva full_handler
|
||||
.rva .Lpoint_doubleq_body,.Lpoint_doubleq_epilogue # HandlerData[]
|
||||
.long 32*5+56,0
|
||||
.LSEH_info_ecp_nistz256_point_add:
|
||||
.byte 9,0,0,0
|
||||
.rva full_handler
|
||||
.rva .Lpoint_addq_body,.Lpoint_addq_epilogue # HandlerData[]
|
||||
.long 32*18+56,0
|
||||
.LSEH_info_ecp_nistz256_point_add_affine:
|
||||
.byte 9,0,0,0
|
||||
.rva full_handler
|
||||
.rva .Ladd_affineq_body,.Ladd_affineq_epilogue # HandlerData[]
|
||||
.long 32*15+56,0
|
||||
___
|
||||
$code.=<<___ if ($addx);
|
||||
.align 8
|
||||
.LSEH_info_ecp_nistz256_point_doublex:
|
||||
.byte 9,0,0,0
|
||||
.rva full_handler
|
||||
.rva .Lpoint_doublex_body,.Lpoint_doublex_epilogue # HandlerData[]
|
||||
.long 32*5+56,0
|
||||
.LSEH_info_ecp_nistz256_point_addx:
|
||||
.byte 9,0,0,0
|
||||
.rva full_handler
|
||||
.rva .Lpoint_addx_body,.Lpoint_addx_epilogue # HandlerData[]
|
||||
.long 32*18+56,0
|
||||
.LSEH_info_ecp_nistz256_point_add_affinex:
|
||||
.byte 9,0,0,0
|
||||
.rva full_handler
|
||||
.rva .Ladd_affinex_body,.Ladd_affinex_epilogue # HandlerData[]
|
||||
.long 32*15+56,0
|
||||
___
|
||||
}
|
||||
|
||||
########################################################################
|
||||
# Convert ecp_nistz256_table.c to layout expected by ecp_nistz_gather_w7
|
||||
#
|
||||
|
|
|
@ -237,8 +237,12 @@ $code=<<___;
|
|||
.align 16
|
||||
gcm_gmult_4bit:
|
||||
push %rbx
|
||||
push %rbp # %rbp and %r12 are pushed exclusively in
|
||||
push %rbp # %rbp and others are pushed exclusively in
|
||||
push %r12 # order to reuse Win64 exception handler...
|
||||
push %r13
|
||||
push %r14
|
||||
push %r15
|
||||
sub \$280,%rsp
|
||||
.Lgmult_prologue:
|
||||
|
||||
movzb 15($Xi),$Zlo
|
||||
|
@ -249,8 +253,9 @@ $code.=<<___;
|
|||
mov $Zlo,8($Xi)
|
||||
mov $Zhi,($Xi)
|
||||
|
||||
mov 16(%rsp),%rbx
|
||||
lea 24(%rsp),%rsp
|
||||
lea 280+48(%rsp),%rsi
|
||||
mov -8(%rsi),%rbx
|
||||
lea (%rsi),%rsp
|
||||
.Lgmult_epilogue:
|
||||
ret
|
||||
.size gcm_gmult_4bit,.-gcm_gmult_4bit
|
||||
|
@ -400,14 +405,14 @@ $code.=<<___;
|
|||
mov $Zlo,8($Xi)
|
||||
mov $Zhi,($Xi)
|
||||
|
||||
lea 280(%rsp),%rsi
|
||||
mov 0(%rsi),%r15
|
||||
mov 8(%rsi),%r14
|
||||
mov 16(%rsi),%r13
|
||||
mov 24(%rsi),%r12
|
||||
mov 32(%rsi),%rbp
|
||||
mov 40(%rsi),%rbx
|
||||
lea 48(%rsi),%rsp
|
||||
lea 280+48(%rsp),%rsi
|
||||
mov -48(%rsi),%r15
|
||||
mov -40(%rsi),%r14
|
||||
mov -32(%rsi),%r13
|
||||
mov -24(%rsi),%r12
|
||||
mov -16(%rsi),%rbp
|
||||
mov -8(%rsi),%rbx
|
||||
lea 0(%rsi),%rsp
|
||||
.Lghash_epilogue:
|
||||
ret
|
||||
.size gcm_ghash_4bit,.-gcm_ghash_4bit
|
||||
|
@ -1648,14 +1653,20 @@ se_handler:
|
|||
cmp %r10,%rbx # context->Rip>=epilogue label
|
||||
jae .Lin_prologue
|
||||
|
||||
lea 24(%rax),%rax # adjust "rsp"
|
||||
lea 48+280(%rax),%rax # adjust "rsp"
|
||||
|
||||
mov -8(%rax),%rbx
|
||||
mov -16(%rax),%rbp
|
||||
mov -24(%rax),%r12
|
||||
mov -32(%rax),%r13
|
||||
mov -40(%rax),%r14
|
||||
mov -48(%rax),%r15
|
||||
mov %rbx,144($context) # restore context->Rbx
|
||||
mov %rbp,160($context) # restore context->Rbp
|
||||
mov %r12,216($context) # restore context->R12
|
||||
mov %r13,224($context) # restore context->R13
|
||||
mov %r14,232($context) # restore context->R14
|
||||
mov %r15,240($context) # restore context->R15
|
||||
|
||||
.Lin_prologue:
|
||||
mov 8(%rax),%rdi
|
||||
|
|
|
@ -462,7 +462,8 @@ my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp"); # size optimization
|
|||
my @T=("%esi","%edi");
|
||||
my $j=0;
|
||||
my $rx=0;
|
||||
my $K_XX_XX="%r11";
|
||||
my $K_XX_XX="%r14";
|
||||
my $fp="%r11";
|
||||
|
||||
my $_rol=sub { &rol(@_) };
|
||||
my $_ror=sub { &ror(@_) };
|
||||
|
@ -483,7 +484,7 @@ $code.=<<___;
|
|||
.align 16
|
||||
sha1_block_data_order_ssse3:
|
||||
_ssse3_shortcut:
|
||||
mov %rsp,%rax
|
||||
mov %rsp,$fp # frame pointer
|
||||
push %rbx
|
||||
push %rbp
|
||||
push %r12
|
||||
|
@ -492,16 +493,15 @@ _ssse3_shortcut:
|
|||
lea `-64-($win64?6*16:0)`(%rsp),%rsp
|
||||
___
|
||||
$code.=<<___ if ($win64);
|
||||
movaps %xmm6,-40-6*16(%rax)
|
||||
movaps %xmm7,-40-5*16(%rax)
|
||||
movaps %xmm8,-40-4*16(%rax)
|
||||
movaps %xmm9,-40-3*16(%rax)
|
||||
movaps %xmm10,-40-2*16(%rax)
|
||||
movaps %xmm11,-40-1*16(%rax)
|
||||
movaps %xmm6,-40-6*16($fp)
|
||||
movaps %xmm7,-40-5*16($fp)
|
||||
movaps %xmm8,-40-4*16($fp)
|
||||
movaps %xmm9,-40-3*16($fp)
|
||||
movaps %xmm10,-40-2*16($fp)
|
||||
movaps %xmm11,-40-1*16($fp)
|
||||
.Lprologue_ssse3:
|
||||
___
|
||||
$code.=<<___;
|
||||
mov %rax,%r14 # original %rsp
|
||||
and \$-64,%rsp
|
||||
mov %rdi,$ctx # reassigned argument
|
||||
mov %rsi,$inp # reassigned argument
|
||||
|
@ -908,21 +908,20 @@ $code.=<<___;
|
|||
mov $E,16($ctx)
|
||||
___
|
||||
$code.=<<___ if ($win64);
|
||||
movaps -40-6*16(%r14),%xmm6
|
||||
movaps -40-5*16(%r14),%xmm7
|
||||
movaps -40-4*16(%r14),%xmm8
|
||||
movaps -40-3*16(%r14),%xmm9
|
||||
movaps -40-2*16(%r14),%xmm10
|
||||
movaps -40-1*16(%r14),%xmm11
|
||||
movaps -40-6*16($fp),%xmm6
|
||||
movaps -40-5*16($fp),%xmm7
|
||||
movaps -40-4*16($fp),%xmm8
|
||||
movaps -40-3*16($fp),%xmm9
|
||||
movaps -40-2*16($fp),%xmm10
|
||||
movaps -40-1*16($fp),%xmm11
|
||||
___
|
||||
$code.=<<___;
|
||||
lea (%r14),%rsi
|
||||
mov -40(%rsi),%r14
|
||||
mov -32(%rsi),%r13
|
||||
mov -24(%rsi),%r12
|
||||
mov -16(%rsi),%rbp
|
||||
mov -8(%rsi),%rbx
|
||||
lea (%rsi),%rsp
|
||||
mov -40($fp),%r14
|
||||
mov -32($fp),%r13
|
||||
mov -24($fp),%r12
|
||||
mov -16($fp),%rbp
|
||||
mov -8($fp),%rbx
|
||||
lea ($fp),%rsp
|
||||
.Lepilogue_ssse3:
|
||||
ret
|
||||
.size sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3
|
||||
|
@ -945,7 +944,7 @@ $code.=<<___;
|
|||
.align 16
|
||||
sha1_block_data_order_avx:
|
||||
_avx_shortcut:
|
||||
mov %rsp,%rax
|
||||
mov %rsp,$fp
|
||||
push %rbx
|
||||
push %rbp
|
||||
push %r12
|
||||
|
@ -955,16 +954,15 @@ _avx_shortcut:
|
|||
vzeroupper
|
||||
___
|
||||
$code.=<<___ if ($win64);
|
||||
vmovaps %xmm6,-40-6*16(%rax)
|
||||
vmovaps %xmm7,-40-5*16(%rax)
|
||||
vmovaps %xmm8,-40-4*16(%rax)
|
||||
vmovaps %xmm9,-40-3*16(%rax)
|
||||
vmovaps %xmm10,-40-2*16(%rax)
|
||||
vmovaps %xmm11,-40-1*16(%rax)
|
||||
vmovaps %xmm6,-40-6*16($fp)
|
||||
vmovaps %xmm7,-40-5*16($fp)
|
||||
vmovaps %xmm8,-40-4*16($fp)
|
||||
vmovaps %xmm9,-40-3*16($fp)
|
||||
vmovaps %xmm10,-40-2*16($fp)
|
||||
vmovaps %xmm11,-40-1*16($fp)
|
||||
.Lprologue_avx:
|
||||
___
|
||||
$code.=<<___;
|
||||
mov %rax,%r14 # original %rsp
|
||||
and \$-64,%rsp
|
||||
mov %rdi,$ctx # reassigned argument
|
||||
mov %rsi,$inp # reassigned argument
|
||||
|
@ -1272,21 +1270,20 @@ $code.=<<___;
|
|||
mov $E,16($ctx)
|
||||
___
|
||||
$code.=<<___ if ($win64);
|
||||
movaps -40-6*16(%r14),%xmm6
|
||||
movaps -40-5*16(%r14),%xmm7
|
||||
movaps -40-4*16(%r14),%xmm8
|
||||
movaps -40-3*16(%r14),%xmm9
|
||||
movaps -40-2*16(%r14),%xmm10
|
||||
movaps -40-1*16(%r14),%xmm11
|
||||
movaps -40-6*16($fp),%xmm6
|
||||
movaps -40-5*16($fp),%xmm7
|
||||
movaps -40-4*16($fp),%xmm8
|
||||
movaps -40-3*16($fp),%xmm9
|
||||
movaps -40-2*16($fp),%xmm10
|
||||
movaps -40-1*16($fp),%xmm11
|
||||
___
|
||||
$code.=<<___;
|
||||
lea (%r14),%rsi
|
||||
mov -40(%rsi),%r14
|
||||
mov -32(%rsi),%r13
|
||||
mov -24(%rsi),%r12
|
||||
mov -16(%rsi),%rbp
|
||||
mov -8(%rsi),%rbx
|
||||
lea (%rsi),%rsp
|
||||
mov -40($fp),%r14
|
||||
mov -32($fp),%r13
|
||||
mov -24($fp),%r12
|
||||
mov -16($fp),%rbp
|
||||
mov -8($fp),%rbx
|
||||
lea ($fp),%rsp
|
||||
.Lepilogue_avx:
|
||||
ret
|
||||
.size sha1_block_data_order_avx,.-sha1_block_data_order_avx
|
||||
|
@ -1312,7 +1309,7 @@ $code.=<<___;
|
|||
.align 16
|
||||
sha1_block_data_order_avx2:
|
||||
_avx2_shortcut:
|
||||
mov %rsp,%rax
|
||||
mov %rsp,$fp
|
||||
push %rbx
|
||||
push %rbp
|
||||
push %r12
|
||||
|
@ -1322,16 +1319,15 @@ _avx2_shortcut:
|
|||
___
|
||||
$code.=<<___ if ($win64);
|
||||
lea -6*16(%rsp),%rsp
|
||||
vmovaps %xmm6,-40-6*16(%rax)
|
||||
vmovaps %xmm7,-40-5*16(%rax)
|
||||
vmovaps %xmm8,-40-4*16(%rax)
|
||||
vmovaps %xmm9,-40-3*16(%rax)
|
||||
vmovaps %xmm10,-40-2*16(%rax)
|
||||
vmovaps %xmm11,-40-1*16(%rax)
|
||||
vmovaps %xmm6,-40-6*16($fp)
|
||||
vmovaps %xmm7,-40-5*16($fp)
|
||||
vmovaps %xmm8,-40-4*16($fp)
|
||||
vmovaps %xmm9,-40-3*16($fp)
|
||||
vmovaps %xmm10,-40-2*16($fp)
|
||||
vmovaps %xmm11,-40-1*16($fp)
|
||||
.Lprologue_avx2:
|
||||
___
|
||||
$code.=<<___;
|
||||
mov %rax,%r14 # original %rsp
|
||||
mov %rdi,$ctx # reassigned argument
|
||||
mov %rsi,$inp # reassigned argument
|
||||
mov %rdx,$num # reassigned argument
|
||||
|
@ -1751,21 +1747,20 @@ $code.=<<___;
|
|||
vzeroupper
|
||||
___
|
||||
$code.=<<___ if ($win64);
|
||||
movaps -40-6*16(%r14),%xmm6
|
||||
movaps -40-5*16(%r14),%xmm7
|
||||
movaps -40-4*16(%r14),%xmm8
|
||||
movaps -40-3*16(%r14),%xmm9
|
||||
movaps -40-2*16(%r14),%xmm10
|
||||
movaps -40-1*16(%r14),%xmm11
|
||||
movaps -40-6*16($fp),%xmm6
|
||||
movaps -40-5*16($fp),%xmm7
|
||||
movaps -40-4*16($fp),%xmm8
|
||||
movaps -40-3*16($fp),%xmm9
|
||||
movaps -40-2*16($fp),%xmm10
|
||||
movaps -40-1*16($fp),%xmm11
|
||||
___
|
||||
$code.=<<___;
|
||||
lea (%r14),%rsi
|
||||
mov -40(%rsi),%r14
|
||||
mov -32(%rsi),%r13
|
||||
mov -24(%rsi),%r12
|
||||
mov -16(%rsi),%rbp
|
||||
mov -8(%rsi),%rbx
|
||||
lea (%rsi),%rsp
|
||||
mov -40($fp),%r14
|
||||
mov -32($fp),%r13
|
||||
mov -24($fp),%r12
|
||||
mov -16($fp),%rbp
|
||||
mov -8($fp),%rbx
|
||||
lea ($fp),%rsp
|
||||
.Lepilogue_avx2:
|
||||
ret
|
||||
.size sha1_block_data_order_avx2,.-sha1_block_data_order_avx2
|
||||
|
@ -1908,15 +1903,13 @@ ssse3_handler:
|
|||
cmp %r10,%rbx # context->Rip<prologue label
|
||||
jb .Lcommon_seh_tail
|
||||
|
||||
mov 152($context),%rax # pull context->Rsp
|
||||
mov 208($context),%rax # pull context->R11
|
||||
|
||||
mov 4(%r11),%r10d # HandlerData[1]
|
||||
lea (%rsi,%r10),%r10 # epilogue label
|
||||
cmp %r10,%rbx # context->Rip>=epilogue label
|
||||
jae .Lcommon_seh_tail
|
||||
|
||||
mov 232($context),%rax # pull context->R14
|
||||
|
||||
lea -40-6*16(%rax),%rsi
|
||||
lea 512($context),%rdi # &context.Xmm6
|
||||
mov \$12,%ecx
|
||||
|
|
|
@ -301,13 +301,13 @@ $code.=<<___ if ($SZ==4);
|
|||
jnz .Lssse3_shortcut
|
||||
___
|
||||
$code.=<<___;
|
||||
mov %rsp,%rax # copy %rsp
|
||||
push %rbx
|
||||
push %rbp
|
||||
push %r12
|
||||
push %r13
|
||||
push %r14
|
||||
push %r15
|
||||
mov %rsp,%r11 # copy %rsp
|
||||
shl \$4,%rdx # num*16
|
||||
sub \$$framesz,%rsp
|
||||
lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
|
||||
|
@ -315,7 +315,7 @@ $code.=<<___;
|
|||
mov $ctx,$_ctx # save ctx, 1st arg
|
||||
mov $inp,$_inp # save inp, 2nd arh
|
||||
mov %rdx,$_end # save end pointer, "3rd" arg
|
||||
mov %r11,$_rsp # save copy of %rsp
|
||||
mov %rax,$_rsp # save copy of %rsp
|
||||
.Lprologue:
|
||||
|
||||
mov $SZ*0($ctx),$A
|
||||
|
@ -382,13 +382,13 @@ $code.=<<___;
|
|||
jb .Lloop
|
||||
|
||||
mov $_rsp,%rsi
|
||||
mov (%rsi),%r15
|
||||
mov 8(%rsi),%r14
|
||||
mov 16(%rsi),%r13
|
||||
mov 24(%rsi),%r12
|
||||
mov 32(%rsi),%rbp
|
||||
mov 40(%rsi),%rbx
|
||||
lea 48(%rsi),%rsp
|
||||
mov -48(%rsi),%r15
|
||||
mov -40(%rsi),%r14
|
||||
mov -32(%rsi),%r13
|
||||
mov -24(%rsi),%r12
|
||||
mov -16(%rsi),%rbp
|
||||
mov -8(%rsi),%rbx
|
||||
lea (%rsi),%rsp
|
||||
.Lepilogue:
|
||||
ret
|
||||
.size $func,.-$func
|
||||
|
@ -761,13 +761,13 @@ $code.=<<___;
|
|||
.align 64
|
||||
${func}_ssse3:
|
||||
.Lssse3_shortcut:
|
||||
mov %rsp,%rax # copy %rsp
|
||||
push %rbx
|
||||
push %rbp
|
||||
push %r12
|
||||
push %r13
|
||||
push %r14
|
||||
push %r15
|
||||
mov %rsp,%r11 # copy %rsp
|
||||
shl \$4,%rdx # num*16
|
||||
sub \$`$framesz+$win64*16*4`,%rsp
|
||||
lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
|
||||
|
@ -775,7 +775,7 @@ ${func}_ssse3:
|
|||
mov $ctx,$_ctx # save ctx, 1st arg
|
||||
mov $inp,$_inp # save inp, 2nd arh
|
||||
mov %rdx,$_end # save end pointer, "3rd" arg
|
||||
mov %r11,$_rsp # save copy of %rsp
|
||||
mov %rax,$_rsp # save copy of %rsp
|
||||
___
|
||||
$code.=<<___ if ($win64);
|
||||
movaps %xmm6,16*$SZ+32(%rsp)
|
||||
|
@ -1082,13 +1082,13 @@ $code.=<<___ if ($win64);
|
|||
movaps 16*$SZ+80(%rsp),%xmm9
|
||||
___
|
||||
$code.=<<___;
|
||||
mov (%rsi),%r15
|
||||
mov 8(%rsi),%r14
|
||||
mov 16(%rsi),%r13
|
||||
mov 24(%rsi),%r12
|
||||
mov 32(%rsi),%rbp
|
||||
mov 40(%rsi),%rbx
|
||||
lea 48(%rsi),%rsp
|
||||
mov -48(%rsi),%r15
|
||||
mov -40(%rsi),%r14
|
||||
mov -32(%rsi),%r13
|
||||
mov -24(%rsi),%r12
|
||||
mov -16(%rsi),%rbp
|
||||
mov -8(%rsi),%rbx
|
||||
lea (%rsi),%rsp
|
||||
.Lepilogue_ssse3:
|
||||
ret
|
||||
.size ${func}_ssse3,.-${func}_ssse3
|
||||
|
@ -1105,13 +1105,13 @@ $code.=<<___;
|
|||
.align 64
|
||||
${func}_xop:
|
||||
.Lxop_shortcut:
|
||||
mov %rsp,%rax # copy %rsp
|
||||
push %rbx
|
||||
push %rbp
|
||||
push %r12
|
||||
push %r13
|
||||
push %r14
|
||||
push %r15
|
||||
mov %rsp,%r11 # copy %rsp
|
||||
shl \$4,%rdx # num*16
|
||||
sub \$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp
|
||||
lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
|
||||
|
@ -1119,7 +1119,7 @@ ${func}_xop:
|
|||
mov $ctx,$_ctx # save ctx, 1st arg
|
||||
mov $inp,$_inp # save inp, 2nd arh
|
||||
mov %rdx,$_end # save end pointer, "3rd" arg
|
||||
mov %r11,$_rsp # save copy of %rsp
|
||||
mov %rax,$_rsp # save copy of %rsp
|
||||
___
|
||||
$code.=<<___ if ($win64);
|
||||
movaps %xmm6,16*$SZ+32(%rsp)
|
||||
|
@ -1459,13 +1459,13 @@ $code.=<<___ if ($win64 && $SZ>4);
|
|||
movaps 16*$SZ+112(%rsp),%xmm11
|
||||
___
|
||||
$code.=<<___;
|
||||
mov (%rsi),%r15
|
||||
mov 8(%rsi),%r14
|
||||
mov 16(%rsi),%r13
|
||||
mov 24(%rsi),%r12
|
||||
mov 32(%rsi),%rbp
|
||||
mov 40(%rsi),%rbx
|
||||
lea 48(%rsi),%rsp
|
||||
mov -48(%rsi),%r15
|
||||
mov -40(%rsi),%r14
|
||||
mov -32(%rsi),%r13
|
||||
mov -24(%rsi),%r12
|
||||
mov -16(%rsi),%rbp
|
||||
mov -8(%rsi),%rbx
|
||||
lea (%rsi),%rsp
|
||||
.Lepilogue_xop:
|
||||
ret
|
||||
.size ${func}_xop,.-${func}_xop
|
||||
|
@ -1481,13 +1481,13 @@ $code.=<<___;
|
|||
.align 64
|
||||
${func}_avx:
|
||||
.Lavx_shortcut:
|
||||
mov %rsp,%rax # copy %rsp
|
||||
push %rbx
|
||||
push %rbp
|
||||
push %r12
|
||||
push %r13
|
||||
push %r14
|
||||
push %r15
|
||||
mov %rsp,%r11 # copy %rsp
|
||||
shl \$4,%rdx # num*16
|
||||
sub \$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp
|
||||
lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
|
||||
|
@ -1495,7 +1495,7 @@ ${func}_avx:
|
|||
mov $ctx,$_ctx # save ctx, 1st arg
|
||||
mov $inp,$_inp # save inp, 2nd arh
|
||||
mov %rdx,$_end # save end pointer, "3rd" arg
|
||||
mov %r11,$_rsp # save copy of %rsp
|
||||
mov %rax,$_rsp # save copy of %rsp
|
||||
___
|
||||
$code.=<<___ if ($win64);
|
||||
movaps %xmm6,16*$SZ+32(%rsp)
|
||||
|
@ -1767,13 +1767,13 @@ $code.=<<___ if ($win64 && $SZ>4);
|
|||
movaps 16*$SZ+112(%rsp),%xmm11
|
||||
___
|
||||
$code.=<<___;
|
||||
mov (%rsi),%r15
|
||||
mov 8(%rsi),%r14
|
||||
mov 16(%rsi),%r13
|
||||
mov 24(%rsi),%r12
|
||||
mov 32(%rsi),%rbp
|
||||
mov 40(%rsi),%rbx
|
||||
lea 48(%rsi),%rsp
|
||||
mov -48(%rsi),%r15
|
||||
mov -40(%rsi),%r14
|
||||
mov -32(%rsi),%r13
|
||||
mov -24(%rsi),%r12
|
||||
mov -16(%rsi),%rbp
|
||||
mov -8(%rsi),%rbx
|
||||
lea (%rsi),%rsp
|
||||
.Lepilogue_avx:
|
||||
ret
|
||||
.size ${func}_avx,.-${func}_avx
|
||||
|
@ -1832,13 +1832,13 @@ $code.=<<___;
|
|||
.align 64
|
||||
${func}_avx2:
|
||||
.Lavx2_shortcut:
|
||||
mov %rsp,%rax # copy %rsp
|
||||
push %rbx
|
||||
push %rbp
|
||||
push %r12
|
||||
push %r13
|
||||
push %r14
|
||||
push %r15
|
||||
mov %rsp,%r11 # copy %rsp
|
||||
sub \$`2*$SZ*$rounds+4*8+$win64*16*($SZ==4?4:6)`,%rsp
|
||||
shl \$4,%rdx # num*16
|
||||
and \$-256*$SZ,%rsp # align stack frame
|
||||
|
@ -1847,7 +1847,7 @@ ${func}_avx2:
|
|||
mov $ctx,$_ctx # save ctx, 1st arg
|
||||
mov $inp,$_inp # save inp, 2nd arh
|
||||
mov %rdx,$_end # save end pointer, "3rd" arg
|
||||
mov %r11,$_rsp # save copy of %rsp
|
||||
mov %rax,$_rsp # save copy of %rsp
|
||||
___
|
||||
$code.=<<___ if ($win64);
|
||||
movaps %xmm6,16*$SZ+32(%rsp)
|
||||
|
@ -2141,13 +2141,13 @@ $code.=<<___ if ($win64 && $SZ>4);
|
|||
movaps 16*$SZ+112(%rsp),%xmm11
|
||||
___
|
||||
$code.=<<___;
|
||||
mov (%rsi),%r15
|
||||
mov 8(%rsi),%r14
|
||||
mov 16(%rsi),%r13
|
||||
mov 24(%rsi),%r12
|
||||
mov 32(%rsi),%rbp
|
||||
mov 40(%rsi),%rbx
|
||||
lea 48(%rsi),%rsp
|
||||
mov -48(%rsi),%r15
|
||||
mov -40(%rsi),%r14
|
||||
mov -32(%rsi),%r13
|
||||
mov -24(%rsi),%r12
|
||||
mov -16(%rsi),%rbp
|
||||
mov -8(%rsi),%rbx
|
||||
lea (%rsi),%rsp
|
||||
.Lepilogue_avx2:
|
||||
ret
|
||||
.size ${func}_avx2,.-${func}_avx2
|
||||
|
@ -2209,7 +2209,6 @@ ___
|
|||
$code.=<<___;
|
||||
mov %rax,%rsi # put aside Rsp
|
||||
mov 16*$SZ+3*8(%rax),%rax # pull $_rsp
|
||||
lea 48(%rax),%rax
|
||||
|
||||
mov -8(%rax),%rbx
|
||||
mov -16(%rax),%rbp
|
||||
|
|
|
@ -66,6 +66,7 @@ $code=<<___;
|
|||
.type $func,\@function,3
|
||||
.align 16
|
||||
$func:
|
||||
mov %rsp,%rax
|
||||
push %rbx
|
||||
push %rbp
|
||||
push %r12
|
||||
|
@ -73,7 +74,6 @@ $func:
|
|||
push %r14
|
||||
push %r15
|
||||
|
||||
mov %rsp,%r11
|
||||
sub \$128+40,%rsp
|
||||
and \$-64,%rsp
|
||||
|
||||
|
@ -81,7 +81,7 @@ $func:
|
|||
mov %rdi,0(%r10) # save parameter block
|
||||
mov %rsi,8(%r10)
|
||||
mov %rdx,16(%r10)
|
||||
mov %r11,32(%r10) # saved stack pointer
|
||||
mov %rax,32(%r10) # saved stack pointer
|
||||
.Lprologue:
|
||||
|
||||
mov %r10,%rbx
|
||||
|
@ -205,13 +205,13 @@ $code.=<<___;
|
|||
jmp .Louterloop
|
||||
.Lalldone:
|
||||
mov 32(%rbx),%rsi # restore saved pointer
|
||||
mov (%rsi),%r15
|
||||
mov 8(%rsi),%r14
|
||||
mov 16(%rsi),%r13
|
||||
mov 24(%rsi),%r12
|
||||
mov 32(%rsi),%rbp
|
||||
mov 40(%rsi),%rbx
|
||||
lea 48(%rsi),%rsp
|
||||
mov -48(%rsi),%r15
|
||||
mov -40(%rsi),%r14
|
||||
mov -32(%rsi),%r13
|
||||
mov -24(%rsi),%r12
|
||||
mov -16(%rsi),%rbp
|
||||
mov -8(%rsi),%rbx
|
||||
lea (%rsi),%rsp
|
||||
.Lepilogue:
|
||||
ret
|
||||
.size $func,.-$func
|
||||
|
@ -526,7 +526,6 @@ se_handler:
|
|||
jae .Lin_prologue
|
||||
|
||||
mov 128+32(%rax),%rax # pull saved stack pointer
|
||||
lea 48(%rax),%rax
|
||||
|
||||
mov -8(%rax),%rbx
|
||||
mov -16(%rax),%rbp
|
||||
|
|
Loading…
Reference in a new issue