aes/asm/aesni-*.pl: fix CCM and further optimize it.
modes/ccm128.c: minor branch optimization.
This commit is contained in:
parent
8a8cc84f74
commit
267b481c47
3 changed files with 83 additions and 73 deletions
|
@ -594,6 +594,7 @@ if ($PREFIX eq "aesni") {
|
|||
|
||||
&movdqu ($ivec,&QWP(0,$rounds_)); # load ivec
|
||||
&movdqu ($cmac,&QWP(0,$rounds)); # load cmac
|
||||
&mov ($rounds,&DWP(240,$key));
|
||||
|
||||
# compose byte-swap control mask for pshufb on stack
|
||||
&mov (&DWP(0,"esp"),0x0c0d0e0f);
|
||||
|
@ -602,34 +603,29 @@ if ($PREFIX eq "aesni") {
|
|||
&mov (&DWP(12,"esp"),0x00010203);
|
||||
|
||||
# compose counter increment vector on stack
|
||||
&mov ($rounds,1);
|
||||
&mov ($rounds_,1);
|
||||
&xor ($key_,$key_);
|
||||
&mov (&DWP(16,"esp"),$rounds);
|
||||
&mov (&DWP(16,"esp"),$rounds_);
|
||||
&mov (&DWP(20,"esp"),$key_);
|
||||
&mov (&DWP(24,"esp"),$key_);
|
||||
&mov (&DWP(28,"esp"),$key_);
|
||||
|
||||
&movdqa ($inout3,&QWP(0,"esp"));
|
||||
&pshufb ($ivec,$inout3); # keep iv in reverse order
|
||||
|
||||
&mov ($rounds,&DWP(240,$key));
|
||||
&mov ($key_,$key);
|
||||
&mov ($rounds_,$rounds);
|
||||
&shr ($rounds,1);
|
||||
&lea ($key_,&DWP(0,$key));
|
||||
&movdqa ($inout0,$ivec);
|
||||
&mov ($rounds_,$rounds);
|
||||
&movdqa ($inout3,&QWP(0,"esp"));
|
||||
|
||||
&set_label("ccm64_enc_outer");
|
||||
&movups ($in0,&QWP(0,$inp));
|
||||
&pshufb ($inout0,$inout3);
|
||||
&mov ($key,$key_);
|
||||
&$movekey ($rndkey0,&QWP(0,$key_));
|
||||
&mov ($rounds,$rounds_);
|
||||
&movups ($in0,&QWP(0,$inp));
|
||||
|
||||
&$movekey ($rndkey0,&QWP(0,$key));
|
||||
&shr ($rounds,1);
|
||||
&$movekey ($rndkey1,&QWP(16,$key));
|
||||
&xorps ($in0,$rndkey0);
|
||||
&lea ($key,&DWP(32,$key));
|
||||
&xorps ($inout0,$rndkey0);
|
||||
&xorps ($cmac,$in0); # cmac^=inp
|
||||
&$movekey ($rndkey1,&QWP(16,$key_));
|
||||
&xorps ($rndkey0,$in0);
|
||||
&lea ($key,&DWP(32,$key_));
|
||||
&xorps ($cmac,$rndkey0); # cmac^=inp
|
||||
&$movekey ($rndkey0,&QWP(0,$key));
|
||||
|
||||
&set_label("ccm64_enc2_loop");
|
||||
|
@ -642,18 +638,20 @@ if ($PREFIX eq "aesni") {
|
|||
&aesenc ($cmac,$rndkey0);
|
||||
&$movekey ($rndkey0,&QWP(0,$key));
|
||||
&jnz (&label("ccm64_enc2_loop"));
|
||||
&pshufb ($ivec,$inout3);
|
||||
&aesenc ($inout0,$rndkey1);
|
||||
&aesenc ($cmac,$rndkey1);
|
||||
&paddq ($ivec,&QWP(16,"esp"));
|
||||
&aesenclast ($inout0,$rndkey0);
|
||||
&aesenclast ($cmac,$rndkey0);
|
||||
|
||||
&paddq ($ivec,&QWP(16,"esp"));
|
||||
&dec ($len);
|
||||
&lea ($inp,&DWP(16,$inp));
|
||||
&xorps ($in0,$inout0); # inp^=E(ivec)
|
||||
&movdqa ($inout0,$ivec);
|
||||
&movups (&QWP(0,$out),$in0);
|
||||
&movups (&QWP(0,$out),$in0); # save output
|
||||
&lea ($out,&DWP(16,$out));
|
||||
&pshufb ($ivec,$inout3);
|
||||
&jnz (&label("ccm64_enc_outer"));
|
||||
|
||||
&mov ("esp",&DWP(48,"esp"));
|
||||
|
@ -675,6 +673,7 @@ if ($PREFIX eq "aesni") {
|
|||
|
||||
&movdqu ($ivec,&QWP(0,$rounds_)); # load ivec
|
||||
&movdqu ($cmac,&QWP(0,$rounds)); # load cmac
|
||||
&mov ($rounds,&DWP(240,$key));
|
||||
|
||||
# compose byte-swap control mask for pshufb on stack
|
||||
&mov (&DWP(0,"esp"),0x0c0d0e0f);
|
||||
|
@ -683,46 +682,45 @@ if ($PREFIX eq "aesni") {
|
|||
&mov (&DWP(12,"esp"),0x00010203);
|
||||
|
||||
# compose counter increment vector on stack
|
||||
&mov ($rounds,1);
|
||||
&mov ($rounds_,1);
|
||||
&xor ($key_,$key_);
|
||||
&mov (&DWP(16,"esp"),$rounds);
|
||||
&mov (&DWP(16,"esp"),$rounds_);
|
||||
&mov (&DWP(20,"esp"),$key_);
|
||||
&mov (&DWP(24,"esp"),$key_);
|
||||
&mov (&DWP(28,"esp"),$key_);
|
||||
|
||||
&movdqa ($inout3,&QWP(0,"esp")); # bswap mask
|
||||
&movdqa ($inout0,$ivec);
|
||||
&pshufb ($ivec,$inout3); # keep iv in reverse order
|
||||
|
||||
&mov ($rounds,&DWP(240,$key));
|
||||
&mov ($key_,$key);
|
||||
&mov ($rounds_,$rounds);
|
||||
|
||||
&pshufb ($ivec,$inout3);
|
||||
if ($inline)
|
||||
{ &aesni_inline_generate1("enc"); }
|
||||
else
|
||||
{ &call ("_aesni_encrypt1"); }
|
||||
|
||||
&set_label("ccm64_dec_outer");
|
||||
&paddq ($ivec,&QWP(16,"esp"));
|
||||
&movups ($in0,&QWP(0,$inp)); # load inp
|
||||
&xorps ($in0,$inout0);
|
||||
&movdqa ($inout0,$ivec);
|
||||
&paddq ($ivec,&QWP(16,"esp"));
|
||||
&pshufb ($ivec,$inout3);
|
||||
&lea ($inp,&QWP(16,$inp));
|
||||
&pshufb ($inout0,$inout3);
|
||||
&mov ($key,$key_);
|
||||
&jmp (&label("ccm64_dec_outer"));
|
||||
|
||||
&set_label("ccm64_dec_outer",16);
|
||||
&xorps ($in0,$inout0); # inp ^= E(ivec)
|
||||
&movdqa ($inout0,$ivec);
|
||||
&mov ($rounds,$rounds_);
|
||||
&movups (&QWP(0,$out),$in0);
|
||||
&movups (&QWP(0,$out),$in0); # save output
|
||||
&lea ($out,&DWP(16,$out));
|
||||
|
||||
&sub ($len,1);
|
||||
&jz (&label("ccm64_dec_break"));
|
||||
|
||||
&$movekey ($rndkey0,&QWP(0,$key));
|
||||
&$movekey ($rndkey0,&QWP(0,$key_));
|
||||
&shr ($rounds,1);
|
||||
&$movekey ($rndkey1,&QWP(16,$key));
|
||||
&$movekey ($rndkey1,&QWP(16,$key_));
|
||||
&xorps ($in0,$rndkey0);
|
||||
&lea ($key,&DWP(32,$key));
|
||||
&lea ($key,&DWP(32,$key_));
|
||||
&xorps ($inout0,$rndkey0);
|
||||
&xorps ($cmac,$in0); # cmac^=out
|
||||
&$movekey ($rndkey0,&QWP(0,$key));
|
||||
|
@ -737,13 +735,18 @@ if ($PREFIX eq "aesni") {
|
|||
&aesenc ($cmac,$rndkey0);
|
||||
&$movekey ($rndkey0,&QWP(0,$key));
|
||||
&jnz (&label("ccm64_dec2_loop"));
|
||||
&movups ($in0,&QWP(0,$inp)); # load inp
|
||||
&paddq ($ivec,&QWP(16,"esp"));
|
||||
&aesenc ($inout0,$rndkey1);
|
||||
&aesenc ($cmac,$rndkey1);
|
||||
&pshufb ($ivec,$inout3);
|
||||
&lea ($inp,&QWP(16,$inp));
|
||||
&aesenclast ($inout0,$rndkey0);
|
||||
&aesenclast ($cmac,$rndkey0);
|
||||
&jmp (&label("ccm64_dec_outer"));
|
||||
|
||||
&set_label("ccm64_dec_break",16);
|
||||
&mov ($key,$key_);
|
||||
if ($inline)
|
||||
{ &aesni_inline_generate1("enc",$cmac,$in0); }
|
||||
else
|
||||
|
|
|
@ -821,8 +821,8 @@ ___
|
|||
{
|
||||
my $cmac="%r9"; # 6th argument
|
||||
|
||||
my $increment="%xmm8";
|
||||
my $bswap_mask="%xmm9";
|
||||
my $increment="%xmm6";
|
||||
my $bswap_mask="%xmm7";
|
||||
|
||||
$code.=<<___;
|
||||
.globl aesni_ccm64_encrypt_blocks
|
||||
|
@ -839,30 +839,28 @@ $code.=<<___ if ($win64);
|
|||
.Lccm64_enc_body:
|
||||
___
|
||||
$code.=<<___;
|
||||
mov 240($key),$rounds # key->rounds
|
||||
movdqu ($ivp),$iv
|
||||
movdqu ($cmac),$inout1
|
||||
movdqa .Lincrement64(%rip),$increment
|
||||
movdqa .Lbswap_mask(%rip),$bswap_mask
|
||||
pshufb $bswap_mask,$iv # keep iv in reverse order
|
||||
|
||||
mov 240($key),$rounds # key->rounds
|
||||
mov $key,$key_
|
||||
mov $rounds,$rnds_
|
||||
movdqa $iv,$inout0
|
||||
|
||||
.Lccm64_enc_outer:
|
||||
movups ($inp),$in0 # load inp
|
||||
pshufb $bswap_mask,$inout0
|
||||
mov $key_,$key
|
||||
mov $rnds_,$rounds
|
||||
|
||||
$movkey ($key),$rndkey0
|
||||
shr \$1,$rounds
|
||||
$movkey 16($key),$rndkey1
|
||||
xorps $rndkey0,$in0
|
||||
lea 32($key),$key
|
||||
xorps $rndkey0,$inout0
|
||||
xorps $inout1,$in0 # cmac^=inp
|
||||
lea 0($key),$key_
|
||||
movdqu ($cmac),$inout1
|
||||
movdqa $iv,$inout0
|
||||
mov $rounds,$rnds_
|
||||
jmp .Lccm64_enc_outer
|
||||
.align 16
|
||||
.Lccm64_enc_outer:
|
||||
$movkey ($key_),$rndkey0
|
||||
mov $rnds_,$rounds
|
||||
movups ($inp),$in0 # load inp
|
||||
|
||||
xorps $rndkey0,$inout0 # counter
|
||||
$movkey 16($key_),$rndkey1
|
||||
xorps $in0,$rndkey0
|
||||
lea 32($key_),$key
|
||||
xorps $rndkey0,$inout1 # cmac^=inp
|
||||
$movkey ($key),$rndkey0
|
||||
|
||||
.Lccm64_enc2_loop:
|
||||
|
@ -875,18 +873,20 @@ $code.=<<___;
|
|||
aesenc $rndkey0,$inout1
|
||||
$movkey 0($key),$rndkey0
|
||||
jnz .Lccm64_enc2_loop
|
||||
pshufb $bswap_mask,$iv
|
||||
aesenc $rndkey1,$inout0
|
||||
aesenc $rndkey1,$inout1
|
||||
paddq $increment,$iv
|
||||
aesenclast $rndkey0,$inout0
|
||||
aesenclast $rndkey0,$inout1
|
||||
|
||||
paddq $increment,$iv
|
||||
dec $len
|
||||
lea 16($inp),$inp
|
||||
xorps $inout0,$in0 # inp ^= E(iv)
|
||||
movdqa $iv,$inout0
|
||||
movups $in0,($out) # save output
|
||||
lea 16($out),$out
|
||||
pshufb $bswap_mask,$iv
|
||||
jnz .Lccm64_enc_outer
|
||||
|
||||
movups $inout1,($cmac)
|
||||
|
@ -919,39 +919,40 @@ $code.=<<___ if ($win64);
|
|||
.Lccm64_dec_body:
|
||||
___
|
||||
$code.=<<___;
|
||||
movdqu ($ivp),$iv
|
||||
mov 240($key),$rounds # key->rounds
|
||||
movups ($ivp),$iv
|
||||
movdqu ($cmac),$inout1
|
||||
movdqa .Lincrement64(%rip),$increment
|
||||
movdqa .Lbswap_mask(%rip),$bswap_mask
|
||||
|
||||
mov 240($key),$rounds # key->rounds
|
||||
movdqa $iv,$inout0
|
||||
pshufb $bswap_mask,$iv # keep iv in reverse order
|
||||
movaps $iv,$inout0
|
||||
mov $rounds,$rnds_
|
||||
mov $key,$key_
|
||||
pshufb $bswap_mask,$iv
|
||||
___
|
||||
&aesni_generate1("enc",$key,$rounds);
|
||||
$code.=<<___;
|
||||
.Lccm64_dec_outer:
|
||||
paddq $increment,$iv
|
||||
movups ($inp),$in0 # load inp
|
||||
xorps $inout0,$in0
|
||||
movdqa $iv,$inout0
|
||||
paddq $increment,$iv
|
||||
pshufb $bswap_mask,$iv
|
||||
lea 16($inp),$inp
|
||||
pshufb $bswap_mask,$inout0
|
||||
mov $key_,$key
|
||||
jmp .Lccm64_dec_outer
|
||||
.align 16
|
||||
.Lccm64_dec_outer:
|
||||
xorps $inout0,$in0 # inp ^= E(iv)
|
||||
movdqa $iv,$inout0
|
||||
mov $rnds_,$rounds
|
||||
movups $in0,($out)
|
||||
movups $in0,($out) # save output
|
||||
lea 16($out),$out
|
||||
|
||||
sub \$1,$len
|
||||
jz .Lccm64_dec_break
|
||||
|
||||
$movkey ($key),$rndkey0
|
||||
$movkey ($key_),$rndkey0
|
||||
shr \$1,$rounds
|
||||
$movkey 16($key),$rndkey1
|
||||
$movkey 16($key_),$rndkey1
|
||||
xorps $rndkey0,$in0
|
||||
lea 32($key),$key
|
||||
lea 32($key_),$key
|
||||
xorps $rndkey0,$inout0
|
||||
xorps $in0,$inout1 # cmac^=out
|
||||
$movkey ($key),$rndkey0
|
||||
|
@ -966,15 +967,21 @@ $code.=<<___;
|
|||
aesenc $rndkey0,$inout1
|
||||
$movkey 0($key),$rndkey0
|
||||
jnz .Lccm64_dec2_loop
|
||||
movups ($inp),$in0 # load inp
|
||||
paddq $increment,$iv
|
||||
aesenc $rndkey1,$inout0
|
||||
aesenc $rndkey1,$inout1
|
||||
pshufb $bswap_mask,$iv
|
||||
lea 16($inp),$inp
|
||||
aesenclast $rndkey0,$inout0
|
||||
aesenclast $rndkey0,$inout1
|
||||
jmp .Lccm64_dec_outer
|
||||
|
||||
.align 16
|
||||
.Lccm64_dec_break:
|
||||
#xorps $in0,$inout1 # cmac^=out
|
||||
___
|
||||
&aesni_generate1("enc",$key,$rounds,$inout1);
|
||||
&aesni_generate1("enc",$key_,$rounds,$inout1,$in0);
|
||||
$code.=<<___;
|
||||
movups $inout1,($cmac)
|
||||
___
|
||||
|
|
|
@ -356,10 +356,10 @@ int CRYPTO_ccm128_encrypt_ccm64(CCM128_CONTEXT *ctx,
|
|||
inp += n;
|
||||
out += n;
|
||||
len -= n;
|
||||
if (len) ctr64_add(ctx->nonce.c,n/16);
|
||||
}
|
||||
|
||||
if (len) {
|
||||
if (n) ctr64_add(ctx->nonce.c,n/16);
|
||||
for (i=0; i<len; ++i) ctx->cmac.c[i] ^= inp[i];
|
||||
(*block)(ctx->cmac.c,ctx->cmac.c,key);
|
||||
(*block)(ctx->nonce.c,scratch.c,key);
|
||||
|
@ -409,10 +409,10 @@ int CRYPTO_ccm128_decrypt_ccm64(CCM128_CONTEXT *ctx,
|
|||
inp += n;
|
||||
out += n;
|
||||
len -= n;
|
||||
if (len) ctr64_add(ctx->nonce.c,n/16);
|
||||
}
|
||||
|
||||
if (len) {
|
||||
if (n) ctr64_add(ctx->nonce.c,n/16);
|
||||
(*block)(ctx->nonce.c,scratch.c,key);
|
||||
for (i=0; i<len; ++i)
|
||||
ctx->cmac.c[i] ^= (out[i] = scratch.c[i]^inp[i]);
|
||||
|
|
Loading…
Reference in a new issue