AESNI engine: add counter mode.
This commit is contained in:
parent
fead253986
commit
6c83629bd9
3 changed files with 642 additions and 71 deletions
|
@ -23,7 +23,8 @@ require "x86asm.pl";
|
|||
|
||||
&asm_init($ARGV[0],$0);
|
||||
|
||||
$movekey = eval($RREFIX eq "aseni" ? "*movaps" : "*movups");
|
||||
if ($PREFIX eq "aesni") { $movekey=*movaps; }
|
||||
else { $movekey=*movups; }
|
||||
|
||||
$len="eax";
|
||||
$rounds="ecx";
|
||||
|
@ -41,7 +42,7 @@ $rndkey1="xmm4";
|
|||
$ivec="xmm5";
|
||||
$in0="xmm6";
|
||||
$in1="xmm7"; $inout3="xmm7";
|
||||
|
||||
|
||||
# Inline version of internal aesni_[en|de]crypt1
|
||||
sub aesni_inline_generate1
|
||||
{ my $p=shift;
|
||||
|
@ -104,7 +105,7 @@ sub aesni_generate1 # fully unrolled loop
|
|||
&ret();
|
||||
&function_end_B("_aesni_${p}rypt1");
|
||||
}
|
||||
|
||||
|
||||
# void $PREFIX_encrypt (const void *inp,void *out,const AES_KEY *key);
|
||||
&aesni_generate1("enc") if (!$inline);
|
||||
&function_begin_B("${PREFIX}_encrypt");
|
||||
|
@ -136,7 +137,7 @@ sub aesni_generate1 # fully unrolled loop
|
|||
&movups (&QWP(0,"eax"),$inout0);
|
||||
&ret ();
|
||||
&function_end_B("${PREFIX}_decrypt");
|
||||
|
||||
|
||||
# _aesni_[en|de]crypt[34] are private interfaces, N denotes interleave
|
||||
# factor. Why is a 3x subroutine used in loops? Even though aes[enc|dec]
|
||||
# latency is 6, it turned out that it can be scheduled only every
|
||||
|
@ -229,8 +230,9 @@ sub aesni_generate4
|
|||
&aesni_generate3("dec");
|
||||
&aesni_generate4("enc") if ($PREFIX eq "aesni");
|
||||
&aesni_generate4("dec");
|
||||
|
||||
|
||||
if ($PREFIX eq "aesni") {
|
||||
######################################################################
|
||||
# void aesni_ecb_encrypt (const void *in, void *out,
|
||||
# size_t length, const AES_KEY *key,
|
||||
# int enc);
|
||||
|
@ -249,8 +251,9 @@ if ($PREFIX eq "aesni") {
|
|||
&mov ($rounds_,$rounds); # backup $rounds
|
||||
&jz (&label("ecb_decrypt"));
|
||||
|
||||
&sub ($len,0x40);
|
||||
&cmp ($len,0x40);
|
||||
&jbe (&label("ecb_enc_tail"));
|
||||
&sub ($len,0x40);
|
||||
&jmp (&label("ecb_enc_loop3"));
|
||||
|
||||
&set_label("ecb_enc_loop3",16);
|
||||
|
@ -268,14 +271,13 @@ if ($PREFIX eq "aesni") {
|
|||
&movups (&QWP(-0x10,$out),$inout2);
|
||||
&ja (&label("ecb_enc_loop3"));
|
||||
|
||||
&set_label("ecb_enc_tail");
|
||||
&add ($len,0x40);
|
||||
&jz (&label("ecb_ret"));
|
||||
|
||||
&cmp ($len,0x10);
|
||||
&movups ($inout0,&QWP(0,$inp));
|
||||
&je (&label("ecb_enc_one"));
|
||||
&set_label("ecb_enc_tail");
|
||||
&cmp ($len,0x20);
|
||||
&movups ($inout0,&QWP(0,$inp));
|
||||
&jb (&label("ecb_enc_one"));
|
||||
&movups ($inout1,&QWP(0x10,$inp));
|
||||
&je (&label("ecb_enc_two"));
|
||||
&cmp ($len,0x30);
|
||||
|
@ -309,10 +311,11 @@ if ($PREFIX eq "aesni") {
|
|||
&movups (&QWP(0x10,$out),$inout1);
|
||||
&movups (&QWP(0x20,$out),$inout2);
|
||||
&jmp (&label("ecb_ret"));
|
||||
|
||||
######################################################################
|
||||
&set_label("ecb_decrypt",16);
|
||||
&sub ($len,0x40);
|
||||
&cmp ($len,0x40);
|
||||
&jbe (&label("ecb_dec_tail"));
|
||||
&sub ($len,0x40);
|
||||
&jmp (&label("ecb_dec_loop3"));
|
||||
|
||||
&set_label("ecb_dec_loop3",16);
|
||||
|
@ -330,14 +333,13 @@ if ($PREFIX eq "aesni") {
|
|||
&movups (&QWP(-0x10,$out),$inout2);
|
||||
&ja (&label("ecb_dec_loop3"));
|
||||
|
||||
&set_label("ecb_dec_tail");
|
||||
&add ($len,0x40);
|
||||
&jz (&label("ecb_ret"));
|
||||
|
||||
&cmp ($len,0x10);
|
||||
&movups ($inout0,&QWP(0,$inp));
|
||||
&je (&label("ecb_dec_one"));
|
||||
&set_label("ecb_dec_tail");
|
||||
&cmp ($len,0x20);
|
||||
&movups ($inout0,&QWP(0,$inp));
|
||||
&jb (&label("ecb_dec_one"));
|
||||
&movups ($inout1,&QWP(0x10,$inp));
|
||||
&je (&label("ecb_dec_two"));
|
||||
&cmp ($len,0x30);
|
||||
|
@ -373,8 +375,173 @@ if ($PREFIX eq "aesni") {
|
|||
|
||||
&set_label("ecb_ret");
|
||||
&function_end("aesni_ecb_encrypt");
|
||||
}
|
||||
|
||||
######################################################################
|
||||
# handles only complete blocks, operates on 32-bit counter and
|
||||
# does not update *ivec! (see engine/eng_aesni.c for details)
|
||||
#
|
||||
# void aesni_ctr32_encrypt_blocks (const void *in, void *out,
|
||||
# size_t blocks, const AES_KEY *key,
|
||||
# const char *ivec);
|
||||
&function_begin("aesni_ctr32_encrypt_blocks");
|
||||
&mov ($inp,&wparam(0));
|
||||
&mov ($out,&wparam(1));
|
||||
&mov ($len,&wparam(2));
|
||||
&mov ($key,&wparam(3));
|
||||
&mov ($rounds_,&wparam(4));
|
||||
&mov ($key_,"esp");
|
||||
&sub ("esp",60);
|
||||
&and ("esp",-16); # align stack
|
||||
&mov (&DWP(48,"esp"),$key_);
|
||||
|
||||
&movups ($inout3,&QWP(0,$rounds_)); # load ivec
|
||||
|
||||
# compose byte-swap control mask for pshufb on stack
|
||||
&mov (&DWP(0,"esp"),0x0c0d0e0f);
|
||||
&mov (&DWP(4,"esp"),0x08090a0b);
|
||||
&mov (&DWP(8,"esp"),0x04050607);
|
||||
&mov (&DWP(12,"esp"),0x00010203);
|
||||
|
||||
# compose counter increment vector on stack
|
||||
&mov ($rounds,3);
|
||||
&xor ($key_,$key_);
|
||||
&mov (&DWP(16,"esp"),$rounds);
|
||||
&mov (&DWP(20,"esp"),$rounds);
|
||||
&mov (&DWP(24,"esp"),$rounds);
|
||||
&mov (&DWP(28,"esp"),$key_);
|
||||
|
||||
&pextrd ($rounds_,$inout3,3); # pull 32-bit counter
|
||||
&pinsrd ($inout3,$key_,3); # wipe 32-bit counter
|
||||
|
||||
&mov ($rounds,&DWP(240,$key)); # key->rounds
|
||||
&movaps ($rndkey0,&QWP(0,"esp")); # load byte-swap mask
|
||||
|
||||
# $ivec is vector of 3 32-bit counters
|
||||
&pxor ($ivec,$ivec);
|
||||
&bswap ($rounds_);
|
||||
&pinsrd ($ivec,$rounds_,0);
|
||||
&inc ($rounds_);
|
||||
&pinsrd ($ivec,$rounds_,1);
|
||||
&inc ($rounds_);
|
||||
&pinsrd ($ivec,$rounds_,2);
|
||||
|
||||
&cmp ($len,4);
|
||||
&pshufb ($ivec,$rndkey0); # byte swap
|
||||
&jbe (&label("ctr32_tail"));
|
||||
&movaps (&QWP(32,"esp"),$inout3); # save counter-less ivec
|
||||
&mov ($rounds_,$rounds);
|
||||
&mov ($key_,$key);
|
||||
&sub ($len,4);
|
||||
&jmp (&label("ctr32_loop3"));
|
||||
|
||||
&set_label("ctr32_loop3",16);
|
||||
&pshufd ($inout0,$ivec,3<<6); # place counter to upper dword
|
||||
&pshufd ($inout1,$ivec,2<<6);
|
||||
&pshufd ($inout2,$ivec,1<<6);
|
||||
&por ($inout0,$inout3); # merge counter-less ivec
|
||||
&por ($inout1,$inout3);
|
||||
&por ($inout2,$inout3);
|
||||
|
||||
&call ("_aesni_encrypt3");
|
||||
|
||||
&movaps($rndkey0,&QWP(0,"esp")); # load byte-swap mask
|
||||
&movups ($in0,&QWP(0,$inp));
|
||||
&movups ($in1,&QWP(0x10,$inp));
|
||||
&movups ($rndkey1,&QWP(0x20,$inp));
|
||||
&pshufb($ivec,$rndkey0); # byte swap
|
||||
&paddd ($ivec,&QWP(16,"esp")); # counter increment
|
||||
&pxor ($in0,$inout0);
|
||||
&pxor ($in1,$inout1);
|
||||
&pxor ($rndkey1,$inout2);
|
||||
&movups (&QWP(0,$out),$in0);
|
||||
&movups (&QWP(0x10,$out),$in1);
|
||||
&movups (&QWP(0x20,$out),$rndkey1);
|
||||
&movaps ($inout3,&QWP(32,"esp")); # load counter-less ivec
|
||||
&pshufb($ivec,$rndkey0); # byte swap
|
||||
|
||||
&sub ($len,3);
|
||||
&lea ($inp,&DWP(0x30,$inp));
|
||||
&lea ($out,&DWP(0x30,$out));
|
||||
&mov ($key,$key_);
|
||||
&mov ($rounds,$rounds_);
|
||||
&ja (&label("ctr32_loop3"));
|
||||
|
||||
&add ($len,4);
|
||||
&pextrd ($rounds_,$ivec,1); # might need last counter value
|
||||
&jz (&label("ctr32_ret"));
|
||||
&bswap ($rounds_);
|
||||
|
||||
&set_label("ctr32_tail");
|
||||
&cmp ($len,2);
|
||||
&pshufd ($inout0,$ivec,3<<6);
|
||||
&pshufd ($inout1,$ivec,2<<6);
|
||||
&pshufd ($inout2,$ivec,1<<6);
|
||||
&por ($inout0,$inout3);
|
||||
&jb (&label("ctr32_one"));
|
||||
&por ($inout1,$inout3);
|
||||
&je (&label("ctr32_two"));
|
||||
&cmp ($len,3);
|
||||
&por ($inout2,$inout3);
|
||||
&je (&label("ctr32_three"));
|
||||
|
||||
&inc ($rounds_); # compose last counter value
|
||||
&bswap ($rounds_);
|
||||
&pinsrd ($inout3,$rounds_,3);
|
||||
|
||||
&call ("_aesni_encrypt4");
|
||||
|
||||
&movups ($in0,&QWP(0,$inp));
|
||||
&movups ($rndkey1,&QWP(0x10,$inp));
|
||||
&movups ($rndkey0,&QWP(0x20,$inp));
|
||||
&movups ($ivec,&QWP(0x30,$inp));
|
||||
&pxor ($in0,$inout0);
|
||||
&pxor ($rndkey1,$inout1);
|
||||
&pxor ($rndkey0,$inout2);
|
||||
&pxor ($ivec,$inout3);
|
||||
&movups (&QWP(0,$out),$in0);
|
||||
&movups (&QWP(0x10,$out),$rndkey1);
|
||||
&movups (&QWP(0x20,$out),$rndkey0);
|
||||
&movups (&QWP(0x30,$out),$ivec);
|
||||
&jmp (&label("ctr32_ret"));
|
||||
|
||||
&set_label("ctr32_one",16);
|
||||
if ($inline)
|
||||
{ &aesni_inline_generate1("enc"); }
|
||||
else
|
||||
{ &call ("_aesni_encrypt1"); }
|
||||
&movups ($in0,&QWP(0,$inp));
|
||||
&pxor ($in0,$inout0);
|
||||
&movups (&QWP(0,$out),$in0);
|
||||
&jmp (&label("ctr32_ret"));
|
||||
|
||||
&set_label("ctr32_two",16);
|
||||
&call ("_aesni_encrypt3");
|
||||
&movups ($in0,&QWP(0,$inp));
|
||||
&movups ($in1,&QWP(0x10,$inp));
|
||||
&pxor ($in0,$inout0);
|
||||
&pxor ($in1,$inout1);
|
||||
&movups (&QWP(0,$out),$in0);
|
||||
&movups (&QWP(0x10,$out),$in1);
|
||||
&jmp (&label("ctr32_ret"));
|
||||
|
||||
&set_label("ctr32_three",16);
|
||||
&call ("_aesni_encrypt3");
|
||||
&movups ($in0,&QWP(0,$inp));
|
||||
&movups ($in1,&QWP(0x10,$inp));
|
||||
&movups ($rndkey1,&QWP(0x20,$inp));
|
||||
&pxor ($in0,$inout0);
|
||||
&pxor ($in1,$inout1);
|
||||
&pxor ($rndkey1,$inout2);
|
||||
&movups (&QWP(0,$out),$in0);
|
||||
&movups (&QWP(0x10,$out),$in1);
|
||||
&movups (&QWP(0x20,$out),$rndkey1);
|
||||
|
||||
&set_label("ctr32_ret");
|
||||
&mov ("esp",&DWP(48,"esp"));
|
||||
&function_end("aesni_ctr32_encrypt_blocks");
|
||||
}
|
||||
|
||||
######################################################################
|
||||
# void $PREFIX_cbc_encrypt (const void *inp, void *out,
|
||||
# size_t length, const AES_KEY *key,
|
||||
# unsigned char *ivp,const int enc);
|
||||
|
@ -431,10 +598,11 @@ if ($PREFIX eq "aesni") {
|
|||
&mov ($inp,$out); # $inp and $out are the same
|
||||
&mov ($key,$key_); # restore $key
|
||||
&jmp (&label("cbc_enc_loop"));
|
||||
|
||||
######################################################################
|
||||
&set_label("cbc_decrypt",16);
|
||||
&sub ($len,0x40);
|
||||
&cmp ($len,0x40);
|
||||
&jbe (&label("cbc_dec_tail"));
|
||||
&sub ($len,0x40);
|
||||
&jmp (&label("cbc_dec_loop3"));
|
||||
|
||||
&set_label("cbc_dec_loop3",16);
|
||||
|
@ -458,10 +626,10 @@ if ($PREFIX eq "aesni") {
|
|||
&movups (&QWP(-0x10,$out),$inout2);
|
||||
&ja (&label("cbc_dec_loop3"));
|
||||
|
||||
&set_label("cbc_dec_tail");
|
||||
&add ($len,0x40);
|
||||
&jz (&label("cbc_ret"));
|
||||
|
||||
&set_label("cbc_dec_tail");
|
||||
&movups ($inout0,&QWP(0,$inp));
|
||||
&cmp ($len,0x10);
|
||||
&movaps ($in0,$inout0);
|
||||
|
@ -539,7 +707,8 @@ if ($PREFIX eq "aesni") {
|
|||
&mov ($key_,&wparam(4));
|
||||
&movups (&QWP(0,$key_),$ivec); # output IV
|
||||
&function_end("${PREFIX}_cbc_encrypt");
|
||||
|
||||
|
||||
######################################################################
|
||||
# Mechanical port from aesni-x86_64.pl.
|
||||
#
|
||||
# _aesni_set_encrypt_key is private interface,
|
||||
|
|
|
@ -41,7 +41,7 @@ $inp="%rdi";
|
|||
$out="%rsi";
|
||||
$len="%rdx";
|
||||
$key="%rcx"; # input to and changed by aesni_[en|de]cryptN !!!
|
||||
$ivp="%r8"; # cbc
|
||||
$ivp="%r8"; # cbc, ctr
|
||||
|
||||
$rnds_="%r10d"; # backup copy for $rounds
|
||||
$key_="%r11"; # backup copy for $key
|
||||
|
@ -51,7 +51,7 @@ $inout0="%xmm0"; $inout1="%xmm1";
|
|||
$inout2="%xmm2"; $inout3="%xmm3";
|
||||
$rndkey0="%xmm4"; $rndkey1="%xmm5";
|
||||
|
||||
$iv="%xmm6"; $in0="%xmm7"; # used in CBC decrypt
|
||||
$iv="%xmm6"; $in0="%xmm7"; # used in CBC decrypt, CTR
|
||||
$in1="%xmm8"; $in2="%xmm9";
|
||||
|
||||
# Inline version of internal aesni_[en|de]crypt1.
|
||||
|
@ -214,6 +214,7 @@ ___
|
|||
&aesni_generate4("dec");
|
||||
|
||||
if ($PREFIX eq "aesni") {
|
||||
########################################################################
|
||||
# void aesni_ecb_encrypt (const void *in, void *out,
|
||||
# size_t length, const AES_KEY *key,
|
||||
# int enc);
|
||||
|
@ -232,8 +233,9 @@ aesni_ecb_encrypt:
|
|||
mov $rounds,$rnds_ # backup $rounds
|
||||
jz .Lecb_decrypt
|
||||
#--------------------------- ECB ENCRYPT ------------------------------#
|
||||
sub \$0x40,$len
|
||||
cmp \$0x40,$len
|
||||
jbe .Lecb_enc_tail
|
||||
sub \$0x40,$len
|
||||
jmp .Lecb_enc_loop3
|
||||
.align 16
|
||||
.Lecb_enc_loop3:
|
||||
|
@ -251,14 +253,13 @@ aesni_ecb_encrypt:
|
|||
movups $inout2,-0x10($out)
|
||||
ja .Lecb_enc_loop3
|
||||
|
||||
.Lecb_enc_tail:
|
||||
add \$0x40,$len
|
||||
jz .Lecb_ret
|
||||
|
||||
cmp \$0x10,$len
|
||||
movups ($inp),$inout0
|
||||
je .Lecb_enc_one
|
||||
.Lecb_enc_tail:
|
||||
cmp \$0x20,$len
|
||||
movups ($inp),$inout0
|
||||
jb .Lecb_enc_one
|
||||
movups 0x10($inp),$inout1
|
||||
je .Lecb_enc_two
|
||||
cmp \$0x30,$len
|
||||
|
@ -294,8 +295,9 @@ $code.=<<___;
|
|||
#--------------------------- ECB DECRYPT ------------------------------#
|
||||
.align 16
|
||||
.Lecb_decrypt:
|
||||
sub \$0x40,$len
|
||||
cmp \$0x40,$len
|
||||
jbe .Lecb_dec_tail
|
||||
sub \$0x40,$len
|
||||
jmp .Lecb_dec_loop3
|
||||
.align 16
|
||||
.Lecb_dec_loop3:
|
||||
|
@ -313,14 +315,13 @@ $code.=<<___;
|
|||
movups $inout2,-0x10($out)
|
||||
ja .Lecb_dec_loop3
|
||||
|
||||
.Lecb_dec_tail:
|
||||
add \$0x40,$len
|
||||
jz .Lecb_ret
|
||||
|
||||
cmp \$0x10,$len
|
||||
movups ($inp),$inout0
|
||||
je .Lecb_dec_one
|
||||
.Lecb_dec_tail:
|
||||
cmp \$0x20,$len
|
||||
movups ($inp),$inout0
|
||||
jb .Lecb_dec_one
|
||||
movups 0x10($inp),$inout1
|
||||
je .Lecb_dec_two
|
||||
cmp \$0x30,$len
|
||||
|
@ -357,8 +358,175 @@ $code.=<<___;
|
|||
ret
|
||||
.size aesni_ecb_encrypt,.-aesni_ecb_encrypt
|
||||
___
|
||||
######################################################################
|
||||
# handles only complete blocks, operates on 32-bit counter and
|
||||
# does not update *ivec! (see engine/eng_aesni.c for details)
|
||||
#
|
||||
# void aesni_ctr32_encrypt_blocks (const void *in, void *out,
|
||||
# size_t blocks, const AES_KEY *key,
|
||||
# const char *ivec);
|
||||
$increment="%xmm10";
|
||||
$bswap_mask="%xmm11";
|
||||
|
||||
$code.=<<___;
|
||||
.globl aesni_ctr32_encrypt_blocks
|
||||
.type aesni_ctr32_encrypt_blocks,\@function,5
|
||||
.align 16
|
||||
aesni_ctr32_encrypt_blocks:
|
||||
___
|
||||
$code.=<<___ if ($win64);
|
||||
lea -0x68(%rsp),%rsp
|
||||
movaps %xmm6,(%rsp)
|
||||
movaps %xmm7,0x10(%rsp)
|
||||
movaps %xmm8,0x20(%rsp)
|
||||
movaps %xmm9,0x30(%rsp)
|
||||
movaps %xmm10,0x40(%rsp)
|
||||
movaps %xmm11,0x50(%rsp)
|
||||
|
||||
.Lctr32_body:
|
||||
___
|
||||
$code.=<<___;
|
||||
movups ($ivp),$inout3
|
||||
movaps .Lincrement(%rip),$increment
|
||||
movaps .Lbswap_mask(%rip),$bswap_mask
|
||||
xor $rounds,$rounds
|
||||
pextrd \$3,$inout3,$rnds_ # pull 32-bit counter
|
||||
pinsrd \$3,$rounds,$inout3 # wipe 32-bit counter
|
||||
|
||||
mov 240($key),$rounds # key->rounds
|
||||
pxor $iv,$iv # vector of 3 32-bit counters
|
||||
bswap $rnds_
|
||||
pinsrd \$0,$rnds_,$iv
|
||||
inc $rnds_
|
||||
pinsrd \$1,$rnds_,$iv
|
||||
inc $rnds_
|
||||
pinsrd \$2,$rnds_,$iv
|
||||
|
||||
cmp \$4,$len
|
||||
pshufb $bswap_mask,$iv
|
||||
jbe .Lctr32_tail
|
||||
mov $rounds,$rnds_
|
||||
mov $key,$key_
|
||||
sub \$4,$len
|
||||
jmp .Lctr32_loop3
|
||||
|
||||
.align 16
|
||||
.Lctr32_loop3:
|
||||
pshufd \$`3<<6`,$iv,$inout0 # place counter to upper dword
|
||||
pshufd \$`2<<6`,$iv,$inout1
|
||||
pshufd \$`1<<6`,$iv,$inout2
|
||||
movups ($inp),$in0
|
||||
movups 0x10($inp),$in1
|
||||
movups 0x20($inp),$in2
|
||||
por $inout3,$inout0 # merge counter-less ivec
|
||||
por $inout3,$inout1
|
||||
por $inout3,$inout2
|
||||
pshufb $bswap_mask,$iv
|
||||
|
||||
call _aesni_encrypt3
|
||||
|
||||
paddd $increment,$iv
|
||||
pxor $inout0,$in0
|
||||
pxor $inout1,$in1
|
||||
pxor $inout2,$in2
|
||||
pshufb $bswap_mask,$iv
|
||||
movups $in0,($out)
|
||||
movups $in1,0x10($out)
|
||||
movups $in2,0x20($out)
|
||||
|
||||
sub \$3,$len
|
||||
lea 0x30($inp),$inp
|
||||
lea 0x30($out),$out
|
||||
mov $key_,$key
|
||||
mov $rnds_,$rounds
|
||||
ja .Lctr32_loop3
|
||||
|
||||
add \$4,$len
|
||||
pextrd \$1,$iv,$rnds_ # might need last counter value
|
||||
jz .Lctr32_done
|
||||
bswap $rnds_
|
||||
|
||||
.Lctr32_tail:
|
||||
cmp \$2,$len
|
||||
pshufd \$`3<<6`,$iv,$inout0
|
||||
pshufd \$`2<<6`,$iv,$inout1
|
||||
pshufd \$`1<<6`,$iv,$inout2
|
||||
por $inout3,$inout0
|
||||
movups ($inp),$in0
|
||||
jb .Lctr32_one
|
||||
por $inout3,$inout1
|
||||
movups 0x10($inp),$in1
|
||||
je .Lctr32_two
|
||||
cmp \$3,$len
|
||||
por $inout3,$inout2
|
||||
movups 0x20($inp),$in2
|
||||
je .Lctr32_three
|
||||
|
||||
inc $rnds_ # compose last counter value
|
||||
bswap $rnds_
|
||||
pinsrd \$3,$rnds_,$inout3
|
||||
movups 0x30($inp),$iv
|
||||
|
||||
call _aesni_encrypt4
|
||||
|
||||
pxor $inout0,$in0
|
||||
pxor $inout1,$in1
|
||||
pxor $inout2,$in2
|
||||
pxor $inout3,$iv
|
||||
movups $in0,($out)
|
||||
movups $in1,0x10($out)
|
||||
movups $in2,0x20($out)
|
||||
movups $iv,0x30($out)
|
||||
jmp .Lctr32_done
|
||||
|
||||
.align 16
|
||||
.Lctr32_one:
|
||||
___
|
||||
&aesni_generate1("enc",$key,$rounds);
|
||||
$code.=<<___;
|
||||
pxor $inout0,$in0
|
||||
movups $in0,($out)
|
||||
jmp .Lctr32_done
|
||||
|
||||
.align 16
|
||||
.Lctr32_two:
|
||||
call _aesni_encrypt3
|
||||
pxor $inout0,$in0
|
||||
pxor $inout1,$in1
|
||||
movups $in0,($out)
|
||||
movups $in1,0x10($out)
|
||||
jmp .Lctr32_done
|
||||
|
||||
.align 16
|
||||
.Lctr32_three:
|
||||
call _aesni_encrypt3
|
||||
pxor $inout0,$in0
|
||||
pxor $inout1,$in1
|
||||
pxor $inout2,$in2
|
||||
movups $in0,($out)
|
||||
movups $in1,0x10($out)
|
||||
movups $in2,0x20($out)
|
||||
|
||||
.Lctr32_done:
|
||||
___
|
||||
|
||||
$code.=<<___ if ($win64);
|
||||
movaps (%rsp),%xmm6
|
||||
movaps 0x10(%rsp),%xmm7
|
||||
movaps 0x20(%rsp),%xmm8
|
||||
movaps 0x30(%rsp),%xmm9
|
||||
movaps 0x40(%rsp),%xmm10
|
||||
movaps 0x50(%rsp),%xmm11
|
||||
lea 0x68(%rsp),%rsp
|
||||
___
|
||||
$code.=<<___;
|
||||
.Lctr32_ret:
|
||||
ret
|
||||
.size aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks
|
||||
___
|
||||
}
|
||||
|
||||
########################################################################
|
||||
# void $PREFIX_cbc_encrypt (const void *inp, void *out,
|
||||
# size_t length, const AES_KEY *key,
|
||||
# unsigned char *ivp,const int enc);
|
||||
|
@ -429,9 +597,10 @@ $code.=<<___ if ($win64);
|
|||
___
|
||||
$code.=<<___;
|
||||
movups ($ivp),$iv
|
||||
sub \$0x40,$len
|
||||
cmp \$0x40,$len
|
||||
mov $rnds_,$rounds
|
||||
jbe .Lcbc_dec_tail
|
||||
sub \$0x40,$len
|
||||
jmp .Lcbc_dec_loop3
|
||||
.align 16
|
||||
.Lcbc_dec_loop3:
|
||||
|
@ -456,11 +625,11 @@ $code.=<<___;
|
|||
movups $inout2,-0x10($out)
|
||||
ja .Lcbc_dec_loop3
|
||||
|
||||
.Lcbc_dec_tail:
|
||||
add \$0x40,$len
|
||||
movups $iv,($ivp)
|
||||
jz .Lcbc_dec_ret
|
||||
|
||||
.Lcbc_dec_tail:
|
||||
movups ($inp),$inout0
|
||||
cmp \$0x10,$len
|
||||
movaps $inout0,$in0
|
||||
|
@ -796,6 +965,11 @@ ___
|
|||
}
|
||||
|
||||
$code.=<<___;
|
||||
.align 64
|
||||
.Lbswap_mask:
|
||||
.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
|
||||
.Lincrement:
|
||||
.long 3,3,3,0
|
||||
.asciz "AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>"
|
||||
.align 64
|
||||
___
|
||||
|
@ -810,6 +984,75 @@ $disp="%r9";
|
|||
|
||||
$code.=<<___;
|
||||
.extern __imp_RtlVirtualUnwind
|
||||
___
|
||||
$code.=<<___ if ($PREFIX eq "aesni");
|
||||
.type ecb_se_handler,\@abi-omnipotent
|
||||
.align 16
|
||||
ecb_se_handler:
|
||||
push %rsi
|
||||
push %rdi
|
||||
push %rbx
|
||||
push %rbp
|
||||
push %r12
|
||||
push %r13
|
||||
push %r14
|
||||
push %r15
|
||||
pushfq
|
||||
sub \$64,%rsp
|
||||
|
||||
mov 152($context),%rax # pull context->Rsp
|
||||
mov 8(%rax),%rdi
|
||||
mov 16(%rax),%rsi
|
||||
mov %rsi,168($context) # restore context->Rsi
|
||||
mov %rdi,176($context) # restore context->Rdi
|
||||
|
||||
jmp .Lcommon_seh_exit
|
||||
.size ecb_se_handler,.-ecb_se_handler
|
||||
|
||||
.type ctr32_se_handler,\@abi-omnipotent
|
||||
.align 16
|
||||
ctr32_se_handler:
|
||||
push %rsi
|
||||
push %rdi
|
||||
push %rbx
|
||||
push %rbp
|
||||
push %r12
|
||||
push %r13
|
||||
push %r14
|
||||
push %r15
|
||||
pushfq
|
||||
sub \$64,%rsp
|
||||
|
||||
mov 120($context),%rax # pull context->Rax
|
||||
mov 248($context),%rbx # pull context->Rip
|
||||
|
||||
lea .Lctr32_body(%rip),%r10
|
||||
cmp %r10,%rbx # context->Rip<"prologue" label
|
||||
jb .Lin_ctr32_prologue
|
||||
|
||||
mov 152($context),%rax # pull context->Rsp
|
||||
|
||||
lea .Lctr32_ret(%rip),%r10
|
||||
cmp %r10,%rbx
|
||||
jae .Lin_ctr32_prologue
|
||||
|
||||
lea 0(%rax),%rsi # top of stack
|
||||
lea 512($context),%rdi # &context.Xmm6
|
||||
mov \$12,%ecx # 6*sizeof(%xmm0)/sizeof(%rax)
|
||||
.long 0xa548f3fc # cld; rep movsq
|
||||
lea 0x68(%rax),%rax # adjust stack pointer
|
||||
|
||||
.Lin_ctr32_prologue:
|
||||
mov 8(%rax),%rdi
|
||||
mov 16(%rax),%rsi
|
||||
mov %rax,152($context) # restore context->Rsp
|
||||
mov %rsi,168($context) # restore context->Rsi
|
||||
mov %rdi,176($context) # restore context->Rdi
|
||||
|
||||
jmp .Lcommon_seh_exit
|
||||
.size ctr32_se_handler,.-ctr32_se_handler
|
||||
___
|
||||
$code.=<<___;
|
||||
.type cbc_se_handler,\@abi-omnipotent
|
||||
.align 16
|
||||
cbc_se_handler:
|
||||
|
@ -829,55 +1072,32 @@ cbc_se_handler:
|
|||
|
||||
lea .Lcbc_decrypt(%rip),%r10
|
||||
cmp %r10,%rbx # context->Rip<"prologue" label
|
||||
jb .Lin_prologue
|
||||
jb .Lin_cbc_prologue
|
||||
|
||||
lea .Lcbc_decrypt_body(%rip),%r10
|
||||
cmp %r10,%rbx # context->Rip<cbc_decrypt_body
|
||||
jb .Lrestore_rax
|
||||
jb .Lrestore_cbc_rax
|
||||
|
||||
lea .Lcbc_ret(%rip),%r10
|
||||
cmp %r10,%rbx # context->Rip>="epilogue" label
|
||||
jae .Lin_prologue
|
||||
jae .Lin_cbc_prologue
|
||||
|
||||
lea 0(%rax),%rsi # top of stack
|
||||
lea 512($context),%rdi # &context.Xmm6
|
||||
mov \$8,%ecx # 4*sizeof(%xmm0)/sizeof(%rax)
|
||||
.long 0xa548f3fc # cld; rep movsq
|
||||
lea 0x58(%rax),%rax # adjust stack pointer
|
||||
jmp .Lin_prologue
|
||||
jmp .Lin_cbc_prologue
|
||||
|
||||
.Lrestore_rax:
|
||||
.Lrestore_cbc_rax:
|
||||
mov 120($context),%rax
|
||||
.Lin_prologue:
|
||||
.Lin_cbc_prologue:
|
||||
mov 8(%rax),%rdi
|
||||
mov 16(%rax),%rsi
|
||||
mov %rax,152($context) # restore context->Rsp
|
||||
mov %rsi,168($context) # restore context->Rsi
|
||||
mov %rdi,176($context) # restore context->Rdi
|
||||
|
||||
jmp .Lcommon_seh_exit
|
||||
.size cbc_se_handler,.-cbc_se_handler
|
||||
|
||||
.type ecb_se_handler,\@abi-omnipotent
|
||||
.align 16
|
||||
ecb_se_handler:
|
||||
push %rsi
|
||||
push %rdi
|
||||
push %rbx
|
||||
push %rbp
|
||||
push %r12
|
||||
push %r13
|
||||
push %r14
|
||||
push %r15
|
||||
pushfq
|
||||
sub \$64,%rsp
|
||||
|
||||
mov 152($context),%rax # pull context->Rsp
|
||||
mov 8(%rax),%rdi
|
||||
mov 16(%rax),%rsi
|
||||
mov %rsi,168($context) # restore context->Rsi
|
||||
mov %rdi,176($context) # restore context->Rdi
|
||||
|
||||
.Lcommon_seh_exit:
|
||||
|
||||
mov 40($disp),%rdi # disp->ContextRecord
|
||||
|
@ -915,10 +1135,17 @@ ecb_se_handler:
|
|||
|
||||
.section .pdata
|
||||
.align 4
|
||||
.rva .LSEH_begin_${PREFIX}_ecb_encrypt
|
||||
.rva .LSEH_end_${PREFIX}_ecb_encrypt
|
||||
___
|
||||
$code.=<<___ if ($PREFIX eq "aesni");
|
||||
.rva .LSEH_begin_aesni_ecb_encrypt
|
||||
.rva .LSEH_end_aesni_ecb_encrypt
|
||||
.rva .LSEH_info_ecb
|
||||
|
||||
.rva .LSEH_begin_aesni_ctr32_encrypt_blocks
|
||||
.rva .LSEH_end_aesni_ctr32_encrypt_blocks
|
||||
.rva .LSEH_info_ctr32
|
||||
___
|
||||
$code.=<<___;
|
||||
.rva .LSEH_begin_${PREFIX}_cbc_encrypt
|
||||
.rva .LSEH_end_${PREFIX}_cbc_encrypt
|
||||
.rva .LSEH_info_cbc
|
||||
|
@ -932,9 +1159,16 @@ ecb_se_handler:
|
|||
.rva .LSEH_info_key
|
||||
.section .xdata
|
||||
.align 8
|
||||
___
|
||||
$code.=<<___ if ($PREFIX eq "aesni");
|
||||
.LSEH_info_ecb:
|
||||
.byte 9,0,0,0
|
||||
.rva ecb_se_handler
|
||||
.LSEH_info_ctr32:
|
||||
.byte 9,0,0,0
|
||||
.rva ctr32_se_handler
|
||||
___
|
||||
$code.=<<___;
|
||||
.LSEH_info_cbc:
|
||||
.byte 9,0,0,0
|
||||
.rva cbc_se_handler
|
||||
|
|
|
@ -111,6 +111,35 @@ void ENGINE_load_aesni (void)
|
|||
}
|
||||
|
||||
#ifdef COMPILE_HW_AESNI
|
||||
|
||||
typedef unsigned int u32;
|
||||
typedef unsigned char u8;
|
||||
|
||||
#if defined(__GNUC__) && __GNUC__>=2
|
||||
# define BSWAP4(x) ({ u32 ret=(x); \
|
||||
asm volatile ("bswapl %0" \
|
||||
: "+r"(ret)); ret; })
|
||||
#elif defined(_MSC_VER)
|
||||
# if _MSC_VER>=1300
|
||||
# pragma intrinsic(_byteswap_ulong)
|
||||
# define BSWAP4(x) _byteswap_ulong((u32)(x))
|
||||
# elif defined(_M_IX86)
|
||||
__inline u32 _bswap4(u32 val) {
|
||||
_asm mov eax,val
|
||||
_asm bswap eax
|
||||
}
|
||||
# define BSWAP4(x) _bswap4(x)
|
||||
# endif
|
||||
#endif
|
||||
|
||||
#ifdef BSWAP4
|
||||
#define GETU32(p) BSWAP4(*(const u32 *)(p))
|
||||
#define PUTU32(p,v) *(u32 *)(p) = BSWAP4(v)
|
||||
#else
|
||||
#define GETU32(p) ((u32)(p)[0]<<24|(u32)(p)[1]<<16|(u32)(p)[2]<<8|(u32)(p)[3])
|
||||
#define PUTU32(p,v) ((p)[0]=(u8)((v)>>24),(p)[1]=(u8)((v)>>16),(p)[2]=(u8)((v)>>8),(p)[3]=(u8)(v))
|
||||
#endif
|
||||
|
||||
int aesni_set_encrypt_key(const unsigned char *userKey, int bits,
|
||||
AES_KEY *key);
|
||||
int aesni_set_decrypt_key(const unsigned char *userKey, int bits,
|
||||
|
@ -132,6 +161,12 @@ void aesni_cbc_encrypt(const unsigned char *in,
|
|||
const AES_KEY *key,
|
||||
unsigned char *ivec, int enc);
|
||||
|
||||
void aesni_ctr32_encrypt_blocks(const unsigned char *in,
|
||||
unsigned char *out,
|
||||
size_t blocks,
|
||||
const AES_KEY *key,
|
||||
const unsigned char *ivec);
|
||||
|
||||
/* Function for ENGINE detection and control */
|
||||
static int aesni_init(ENGINE *e);
|
||||
|
||||
|
@ -224,16 +259,19 @@ static int aesni_cipher_nids[] = {
|
|||
NID_aes_128_cbc,
|
||||
NID_aes_128_cfb,
|
||||
NID_aes_128_ofb,
|
||||
NID_aes_128_ctr,
|
||||
|
||||
NID_aes_192_ecb,
|
||||
NID_aes_192_cbc,
|
||||
NID_aes_192_cfb,
|
||||
NID_aes_192_ofb,
|
||||
NID_aes_192_ctr,
|
||||
|
||||
NID_aes_256_ecb,
|
||||
NID_aes_256_cbc,
|
||||
NID_aes_256_cfb,
|
||||
NID_aes_256_ofb,
|
||||
NID_aes_256_ctr,
|
||||
};
|
||||
static int aesni_cipher_nids_num =
|
||||
(sizeof(aesni_cipher_nids)/sizeof(aesni_cipher_nids[0]));
|
||||
|
@ -251,18 +289,28 @@ aesni_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *user_key,
|
|||
int ret;
|
||||
AES_KEY *key = AESNI_ALIGN(ctx->cipher_data);
|
||||
|
||||
if ((ctx->cipher->flags & EVP_CIPH_MODE) == EVP_CIPH_CFB_MODE
|
||||
|| (ctx->cipher->flags & EVP_CIPH_MODE) == EVP_CIPH_OFB_MODE
|
||||
|| enc)
|
||||
ret=aesni_set_encrypt_key(user_key, ctx->key_len * 8, key);
|
||||
else
|
||||
if (((ctx->cipher->flags & EVP_CIPH_MODE) == EVP_CIPH_ECB_MODE
|
||||
|| (ctx->cipher->flags & EVP_CIPH_MODE) == EVP_CIPH_CBC_MODE)
|
||||
&& !enc)
|
||||
ret=aesni_set_decrypt_key(user_key, ctx->key_len * 8, key);
|
||||
else
|
||||
ret=aesni_set_encrypt_key(user_key, ctx->key_len * 8, key);
|
||||
|
||||
if(ret < 0) {
|
||||
EVPerr(EVP_F_AESNI_INIT_KEY,EVP_R_AES_KEY_SETUP_FAILED);
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (ctx->cipher->flags&EVP_CIPH_CUSTOM_IV)
|
||||
{
|
||||
if (iv!=NULL)
|
||||
memcpy (ctx->iv,iv,ctx->cipher->iv_len);
|
||||
else {
|
||||
EVPerr(EVP_F_AESNI_INIT_KEY,EVP_R_AES_IV_SETUP_FAILED);
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
@ -336,6 +384,117 @@ DECLARE_AES_EVP(256,cbc,CBC);
|
|||
DECLARE_AES_EVP(256,cfb,CFB);
|
||||
DECLARE_AES_EVP(256,ofb,OFB);
|
||||
|
||||
static void ctr96_inc(unsigned char *counter) {
|
||||
u32 n=12;
|
||||
u8 c;
|
||||
|
||||
do {
|
||||
--n;
|
||||
c = counter[n];
|
||||
++c;
|
||||
counter[n] = c;
|
||||
if (c) return;
|
||||
} while (n);
|
||||
}
|
||||
|
||||
static int aesni_counter(EVP_CIPHER_CTX *ctx, unsigned char *out,
|
||||
const unsigned char *in, size_t len)
|
||||
{
|
||||
AES_KEY *key = AESNI_ALIGN(ctx->cipher_data);
|
||||
u32 n, ctr32;
|
||||
n = ctx->num;
|
||||
|
||||
while (n && len) {
|
||||
*(out++) = *(in++) ^ ctx->buf[n];
|
||||
--len;
|
||||
n = (n+1) % 16;
|
||||
}
|
||||
|
||||
ctr32 = GETU32(ctx->iv+12);
|
||||
while (len>=16) {
|
||||
size_t blocks = len/16;
|
||||
/*
|
||||
* 1<<24 is just a not-so-small yet not-so-large number...
|
||||
*/
|
||||
if (blocks > (1U<<24)) blocks = (1U<<24);
|
||||
/*
|
||||
* As aesni_ctr32 operates on 32-bit counter, caller
|
||||
* has to handle overflow. 'if' below detects the
|
||||
* overflow, which is then handled by limiting the
|
||||
* amount of blocks to the exact overflow point...
|
||||
*/
|
||||
ctr32 += (u32)blocks;
|
||||
if (ctr32 < blocks) {
|
||||
blocks -= ctr32;
|
||||
ctr32 = 0;
|
||||
}
|
||||
aesni_ctr32_encrypt_blocks(in,out,blocks,key,ctx->iv);
|
||||
/* aesni_ctr32 does not update ctx->iv, caller does: */
|
||||
PUTU32(ctx->iv+12,ctr32);
|
||||
/* ... overflow was detected, propagate carry. */
|
||||
if (ctr32 == 0) ctr96_inc(ctx->iv);
|
||||
blocks *= 16;
|
||||
len -= blocks;
|
||||
out += blocks;
|
||||
in += blocks;
|
||||
}
|
||||
if (len) {
|
||||
aesni_encrypt(ctx->iv,ctx->buf,key);
|
||||
++ctr32;
|
||||
PUTU32(ctx->iv+12,ctr32);
|
||||
if (ctr32 == 0) ctr96_inc(ctx->iv);
|
||||
while (len--) {
|
||||
out[n] = in[n] ^ ctx->buf[n];
|
||||
++n;
|
||||
}
|
||||
}
|
||||
ctx->num = n;
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
static const EVP_CIPHER aesni_128_ctr=
|
||||
{
|
||||
NID_aes_128_ctr,1,16,16,
|
||||
EVP_CIPH_CUSTOM_IV,
|
||||
aesni_init_key,
|
||||
aesni_counter,
|
||||
NULL,
|
||||
sizeof(AESNI_KEY),
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL
|
||||
};
|
||||
|
||||
static const EVP_CIPHER aesni_192_ctr=
|
||||
{
|
||||
NID_aes_192_ctr,1,24,16,
|
||||
EVP_CIPH_CUSTOM_IV,
|
||||
aesni_init_key,
|
||||
aesni_counter,
|
||||
NULL,
|
||||
sizeof(AESNI_KEY),
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL
|
||||
};
|
||||
|
||||
static const EVP_CIPHER aesni_256_ctr=
|
||||
{
|
||||
NID_aes_256_ctr,1,32,16,
|
||||
EVP_CIPH_CUSTOM_IV,
|
||||
aesni_init_key,
|
||||
aesni_counter,
|
||||
NULL,
|
||||
sizeof(AESNI_KEY),
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL
|
||||
};
|
||||
|
||||
static int
|
||||
aesni_ciphers (ENGINE *e, const EVP_CIPHER **cipher,
|
||||
const int **nids, int nid)
|
||||
|
@ -360,6 +519,9 @@ aesni_ciphers (ENGINE *e, const EVP_CIPHER **cipher,
|
|||
case NID_aes_128_ofb:
|
||||
*cipher = &aesni_128_ofb;
|
||||
break;
|
||||
case NID_aes_128_ctr:
|
||||
*cipher = &aesni_128_ctr;
|
||||
break;
|
||||
|
||||
case NID_aes_192_ecb:
|
||||
*cipher = &aesni_192_ecb;
|
||||
|
@ -373,6 +535,9 @@ aesni_ciphers (ENGINE *e, const EVP_CIPHER **cipher,
|
|||
case NID_aes_192_ofb:
|
||||
*cipher = &aesni_192_ofb;
|
||||
break;
|
||||
case NID_aes_192_ctr:
|
||||
*cipher = &aesni_192_ctr;
|
||||
break;
|
||||
|
||||
case NID_aes_256_ecb:
|
||||
*cipher = &aesni_256_ecb;
|
||||
|
@ -386,6 +551,9 @@ aesni_ciphers (ENGINE *e, const EVP_CIPHER **cipher,
|
|||
case NID_aes_256_ofb:
|
||||
*cipher = &aesni_256_ofb;
|
||||
break;
|
||||
case NID_aes_256_ctr:
|
||||
*cipher = &aesni_256_ctr;
|
||||
break;
|
||||
|
||||
default:
|
||||
/* Sorry, we don't support this NID */
|
||||
|
|
Loading…
Reference in a new issue