#!/usr/bin/env perl # ==================================================================== # Written by Andy Polyakov for the OpenSSL # project. The module is, however, dual licensed under OpenSSL and # CRYPTOGAMS licenses depending on where you obtain it. For further # details see http://www.openssl.org/~appro/cryptogams/. # ==================================================================== # # This module implements support for Intel AES-NI extension. In # OpenSSL context it's used with Intel engine, but can also be used as # drop-in replacement for crypto/aes/asm/aes-586.pl [see below for # details]. # # Performance. # # To start with see corresponding paragraph in aesni-x86_64.pl... # Instead of filling table similar to one found there I've chosen to # summarize *comparison* results for raw ECB, CTR and CBC benchmarks. # The simplified table below represents 32-bit performance relative # to 64-bit one in every given point. Ratios vary for different # encryption modes, therefore interval values. # # 16-byte 64-byte 256-byte 1-KB 8-KB # 53-67% 67-84% 91-94% 95-98% 97-99.5% # # Lower ratios for smaller block sizes are perfectly understandable, # because function call overhead is higher in 32-bit mode. Largest # 8-KB block performance is virtually same: 32-bit code is less than # 1% slower for ECB, CBC and CCM, and ~3% slower otherwise. # January 2011 # # See aesni-x86_64.pl for details. Unlike x86_64 version this module # interleaves at most 6 aes[enc|dec] instructions, because there are # not enough registers for 8x interleave [which should be optimal for # Sandy Bridge]. Actually, performance results for 6x interleave # factor presented in aesni-x86_64.pl (except for CTR) are for this # module. # April 2011 # # Add aesni_xts_[en|de]crypt. Westmere spends 1.50 cycles processing # one byte out of 8KB with 128-bit key, Sandy Bridge - 1.09. ###################################################################### # Current large-block performance in cycles per byte processed with # 128-bit key (less is better). # # CBC en-/decrypt CTR XTS ECB # Westmere 3.77/1.37 1.37 1.52 1.27 # * Bridge 5.07/0.98 0.99 1.09 0.91 # Haswell 4.44/0.80 0.97 1.03 0.72 # Atom 5.77/3.56 3.67 4.03 3.46 # Bulldozer 5.80/0.98 1.05 1.24 0.93 $PREFIX="aesni"; # if $PREFIX is set to "AES", the script # generates drop-in replacement for # crypto/aes/asm/aes-586.pl:-) $inline=1; # inline _aesni_[en|de]crypt $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; push(@INC,"${dir}","${dir}../../perlasm"); require "x86asm.pl"; &asm_init($ARGV[0],$0); if ($PREFIX eq "aesni") { $movekey=\&movups; } else { $movekey=\&movups; } $len="eax"; $rounds="ecx"; $key="edx"; $inp="esi"; $out="edi"; $rounds_="ebx"; # backup copy for $rounds $key_="ebp"; # backup copy for $key $rndkey0="xmm0"; $rndkey1="xmm1"; $inout0="xmm2"; $inout1="xmm3"; $inout2="xmm4"; $inout3="xmm5"; $in1="xmm5"; $inout4="xmm6"; $in0="xmm6"; $inout5="xmm7"; $ivec="xmm7"; # AESNI extenstion sub aeskeygenassist { my($dst,$src,$imm)=@_; if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/) { &data_byte(0x66,0x0f,0x3a,0xdf,0xc0|($1<<3)|$2,$imm); } } sub aescommon { my($opcodelet,$dst,$src)=@_; if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/) { &data_byte(0x66,0x0f,0x38,$opcodelet,0xc0|($1<<3)|$2);} } sub aesimc { aescommon(0xdb,@_); } sub aesenc { aescommon(0xdc,@_); } sub aesenclast { aescommon(0xdd,@_); } sub aesdec { aescommon(0xde,@_); } sub aesdeclast { aescommon(0xdf,@_); } # Inline version of internal aesni_[en|de]crypt1 { my $sn; sub aesni_inline_generate1 { my ($p,$inout,$ivec)=@_; $inout=$inout0 if (!defined($inout)); $sn++; &$movekey ($rndkey0,&QWP(0,$key)); &$movekey ($rndkey1,&QWP(16,$key)); &xorps ($ivec,$rndkey0) if (defined($ivec)); &lea ($key,&DWP(32,$key)); &xorps ($inout,$ivec) if (defined($ivec)); &xorps ($inout,$rndkey0) if (!defined($ivec)); &set_label("${p}1_loop_$sn"); eval"&aes${p} ($inout,$rndkey1)"; &dec ($rounds); &$movekey ($rndkey1,&QWP(0,$key)); &lea ($key,&DWP(16,$key)); &jnz (&label("${p}1_loop_$sn")); eval"&aes${p}last ($inout,$rndkey1)"; }} sub aesni_generate1 # fully unrolled loop { my ($p,$inout)=@_; $inout=$inout0 if (!defined($inout)); &function_begin_B("_aesni_${p}rypt1"); &movups ($rndkey0,&QWP(0,$key)); &$movekey ($rndkey1,&QWP(0x10,$key)); &xorps ($inout,$rndkey0); &$movekey ($rndkey0,&QWP(0x20,$key)); &lea ($key,&DWP(0x30,$key)); &cmp ($rounds,11); &jb (&label("${p}128")); &lea ($key,&DWP(0x20,$key)); &je (&label("${p}192")); &lea ($key,&DWP(0x20,$key)); eval"&aes${p} ($inout,$rndkey1)"; &$movekey ($rndkey1,&QWP(-0x40,$key)); eval"&aes${p} ($inout,$rndkey0)"; &$movekey ($rndkey0,&QWP(-0x30,$key)); &set_label("${p}192"); eval"&aes${p} ($inout,$rndkey1)"; &$movekey ($rndkey1,&QWP(-0x20,$key)); eval"&aes${p} ($inout,$rndkey0)"; &$movekey ($rndkey0,&QWP(-0x10,$key)); &set_label("${p}128"); eval"&aes${p} ($inout,$rndkey1)"; &$movekey ($rndkey1,&QWP(0,$key)); eval"&aes${p} ($inout,$rndkey0)"; &$movekey ($rndkey0,&QWP(0x10,$key)); eval"&aes${p} ($inout,$rndkey1)"; &$movekey ($rndkey1,&QWP(0x20,$key)); eval"&aes${p} ($inout,$rndkey0)"; &$movekey ($rndkey0,&QWP(0x30,$key)); eval"&aes${p} ($inout,$rndkey1)"; &$movekey ($rndkey1,&QWP(0x40,$key)); eval"&aes${p} ($inout,$rndkey0)"; &$movekey ($rndkey0,&QWP(0x50,$key)); eval"&aes${p} ($inout,$rndkey1)"; &$movekey ($rndkey1,&QWP(0x60,$key)); eval"&aes${p} ($inout,$rndkey0)"; &$movekey ($rndkey0,&QWP(0x70,$key)); eval"&aes${p} ($inout,$rndkey1)"; eval"&aes${p}last ($inout,$rndkey0)"; &ret(); &function_end_B("_aesni_${p}rypt1"); } # void $PREFIX_encrypt (const void *inp,void *out,const AES_KEY *key); &aesni_generate1("enc") if (!$inline); &function_begin_B("${PREFIX}_encrypt"); &mov ("eax",&wparam(0)); &mov ($key,&wparam(2)); &movups ($inout0,&QWP(0,"eax")); &mov ($rounds,&DWP(240,$key)); &mov ("eax",&wparam(1)); if ($inline) { &aesni_inline_generate1("enc"); } else { &call ("_aesni_encrypt1"); } &movups (&QWP(0,"eax"),$inout0); &ret (); &function_end_B("${PREFIX}_encrypt"); # void $PREFIX_decrypt (const void *inp,void *out,const AES_KEY *key); &aesni_generate1("dec") if(!$inline); &function_begin_B("${PREFIX}_decrypt"); &mov ("eax",&wparam(0)); &mov ($key,&wparam(2)); &movups ($inout0,&QWP(0,"eax")); &mov ($rounds,&DWP(240,$key)); &mov ("eax",&wparam(1)); if ($inline) { &aesni_inline_generate1("dec"); } else { &call ("_aesni_decrypt1"); } &movups (&QWP(0,"eax"),$inout0); &ret (); &function_end_B("${PREFIX}_decrypt"); # _aesni_[en|de]cryptN are private interfaces, N denotes interleave # factor. Why 3x subroutine were originally used in loops? Even though # aes[enc|dec] latency was originally 6, it could be scheduled only # every *2nd* cycle. Thus 3x interleave was the one providing optimal # utilization, i.e. when subroutine's throughput is virtually same as # of non-interleaved subroutine [for number of input blocks up to 3]. # This is why it makes no sense to implement 2x subroutine. # aes[enc|dec] latency in next processor generation is 8, but the # instructions can be scheduled every cycle. Optimal interleave for # new processor is therefore 8x, but it's unfeasible to accommodate it # in XMM registers addreassable in 32-bit mode and therefore 6x is # used instead... sub aesni_generate3 { my $p=shift; &function_begin_B("_aesni_${p}rypt3"); &$movekey ($rndkey0,&QWP(0,$key)); &shl ($rounds,4); &$movekey ($rndkey1,&QWP(16,$key)); &xorps ($inout0,$rndkey0); &pxor ($inout1,$rndkey0); &pxor ($inout2,$rndkey0); &$movekey ($rndkey0,&QWP(32,$key)); &lea ($key,&DWP(32,$key,$rounds)); &neg ($rounds); &add ($rounds,16); &set_label("${p}3_loop"); eval"&aes${p} ($inout0,$rndkey1)"; eval"&aes${p} ($inout1,$rndkey1)"; eval"&aes${p} ($inout2,$rndkey1)"; &$movekey ($rndkey1,&QWP(0,$key,$rounds)); &add ($rounds,32); eval"&aes${p} ($inout0,$rndkey0)"; eval"&aes${p} ($inout1,$rndkey0)"; eval"&aes${p} ($inout2,$rndkey0)"; &$movekey ($rndkey0,&QWP(-16,$key,$rounds)); &jnz (&label("${p}3_loop")); eval"&aes${p} ($inout0,$rndkey1)"; eval"&aes${p} ($inout1,$rndkey1)"; eval"&aes${p} ($inout2,$rndkey1)"; eval"&aes${p}last ($inout0,$rndkey0)"; eval"&aes${p}last ($inout1,$rndkey0)"; eval"&aes${p}last ($inout2,$rndkey0)"; &ret(); &function_end_B("_aesni_${p}rypt3"); } # 4x interleave is implemented to improve small block performance, # most notably [and naturally] 4 block by ~30%. One can argue that one # should have implemented 5x as well, but improvement would be <20%, # so it's not worth it... sub aesni_generate4 { my $p=shift; &function_begin_B("_aesni_${p}rypt4"); &$movekey ($rndkey0,&QWP(0,$key)); &$movekey ($rndkey1,&QWP(16,$key)); &shl ($rounds,4); &xorps ($inout0,$rndkey0); &pxor ($inout1,$rndkey0); &pxor ($inout2,$rndkey0); &pxor ($inout3,$rndkey0); &$movekey ($rndkey0,&QWP(32,$key)); &lea ($key,&DWP(32,$key,$rounds)); &neg ($rounds); &data_byte (0x0f,0x1f,0x40,0x00); &add ($rounds,16); &set_label("${p}4_loop"); eval"&aes${p} ($inout0,$rndkey1)"; eval"&aes${p} ($inout1,$rndkey1)"; eval"&aes${p} ($inout2,$rndkey1)"; eval"&aes${p} ($inout3,$rndkey1)"; &$movekey ($rndkey1,&QWP(0,$key,$rounds)); &add ($rounds,32); eval"&aes${p} ($inout0,$rndkey0)"; eval"&aes${p} ($inout1,$rndkey0)"; eval"&aes${p} ($inout2,$rndkey0)"; eval"&aes${p} ($inout3,$rndkey0)"; &$movekey ($rndkey0,&QWP(-16,$key,$rounds)); &jnz (&label("${p}4_loop")); eval"&aes${p} ($inout0,$rndkey1)"; eval"&aes${p} ($inout1,$rndkey1)"; eval"&aes${p} ($inout2,$rndkey1)"; eval"&aes${p} ($inout3,$rndkey1)"; eval"&aes${p}last ($inout0,$rndkey0)"; eval"&aes${p}last ($inout1,$rndkey0)"; eval"&aes${p}last ($inout2,$rndkey0)"; eval"&aes${p}last ($inout3,$rndkey0)"; &ret(); &function_end_B("_aesni_${p}rypt4"); } sub aesni_generate6 { my $p=shift; &function_begin_B("_aesni_${p}rypt6"); &static_label("_aesni_${p}rypt6_enter"); &$movekey ($rndkey0,&QWP(0,$key)); &shl ($rounds,4); &$movekey ($rndkey1,&QWP(16,$key)); &xorps ($inout0,$rndkey0); &pxor ($inout1,$rndkey0); # pxor does better here &pxor ($inout2,$rndkey0); eval"&aes${p} ($inout0,$rndkey1)"; &pxor ($inout3,$rndkey0); &pxor ($inout4,$rndkey0); eval"&aes${p} ($inout1,$rndkey1)"; &lea ($key,&DWP(32,$key,$rounds)); &neg ($rounds); eval"&aes${p} ($inout2,$rndkey1)"; &pxor ($inout5,$rndkey0); &add ($rounds,16); eval"&aes${p} ($inout3,$rndkey1)"; eval"&aes${p} ($inout4,$rndkey1)"; eval"&aes${p} ($inout5,$rndkey1)"; &$movekey ($rndkey0,&QWP(-16,$key,$rounds)); &jmp (&label("_aesni_${p}rypt6_enter")); &set_label("${p}6_loop",16); eval"&aes${p} ($inout0,$rndkey1)"; eval"&aes${p} ($inout1,$rndkey1)"; eval"&aes${p} ($inout2,$rndkey1)"; eval"&aes${p} ($inout3,$rndkey1)"; eval"&aes${p} ($inout4,$rndkey1)"; eval"&aes${p} ($inout5,$rndkey1)"; &set_label("_aesni_${p}rypt6_enter"); &$movekey ($rndkey1,&QWP(0,$key,$rounds)); &add ($rounds,32); eval"&aes${p} ($inout0,$rndkey0)"; eval"&aes${p} ($inout1,$rndkey0)"; eval"&aes${p} ($inout2,$rndkey0)"; eval"&aes${p} ($inout3,$rndkey0)"; eval"&aes${p} ($inout4,$rndkey0)"; eval"&aes${p} ($inout5,$rndkey0)"; &$movekey ($rndkey0,&QWP(-16,$key,$rounds)); &jnz (&label("${p}6_loop")); eval"&aes${p} ($inout0,$rndkey1)"; eval"&aes${p} ($inout1,$rndkey1)"; eval"&aes${p} ($inout2,$rndkey1)"; eval"&aes${p} ($inout3,$rndkey1)"; eval"&aes${p} ($inout4,$rndkey1)"; eval"&aes${p} ($inout5,$rndkey1)"; eval"&aes${p}last ($inout0,$rndkey0)"; eval"&aes${p}last ($inout1,$rndkey0)"; eval"&aes${p}last ($inout2,$rndkey0)"; eval"&aes${p}last ($inout3,$rndkey0)"; eval"&aes${p}last ($inout4,$rndkey0)"; eval"&aes${p}last ($inout5,$rndkey0)"; &ret(); &function_end_B("_aesni_${p}rypt6"); } &aesni_generate3("enc") if ($PREFIX eq "aesni"); &aesni_generate3("dec"); &aesni_generate4("enc") if ($PREFIX eq "aesni"); &aesni_generate4("dec"); &aesni_generate6("enc") if ($PREFIX eq "aesni"); &aesni_generate6("dec"); if ($PREFIX eq "aesni") { ###################################################################### # void aesni_ecb_encrypt (const void *in, void *out, # size_t length, const AES_KEY *key, # int enc); &function_begin("aesni_ecb_encrypt"); &mov ($inp,&wparam(0)); &mov ($out,&wparam(1)); &mov ($len,&wparam(2)); &mov ($key,&wparam(3)); &mov ($rounds_,&wparam(4)); &and ($len,-16); &jz (&label("ecb_ret")); &mov ($rounds,&DWP(240,$key)); &test ($rounds_,$rounds_); &jz (&label("ecb_decrypt")); &mov ($key_,$key); # backup $key &mov ($rounds_,$rounds); # backup $rounds &cmp ($len,0x60); &jb (&label("ecb_enc_tail")); &movdqu ($inout0,&QWP(0,$inp)); &movdqu ($inout1,&QWP(0x10,$inp)); &movdqu ($inout2,&QWP(0x20,$inp)); &movdqu ($inout3,&QWP(0x30,$inp)); &movdqu ($inout4,&QWP(0x40,$inp)); &movdqu ($inout5,&QWP(0x50,$inp)); &lea ($inp,&DWP(0x60,$inp)); &sub ($len,0x60); &jmp (&label("ecb_enc_loop6_enter")); &set_label("ecb_enc_loop6",16); &movups (&QWP(0,$out),$inout0); &movdqu ($inout0,&QWP(0,$inp)); &movups (&QWP(0x10,$out),$inout1); &movdqu ($inout1,&QWP(0x10,$inp)); &movups (&QWP(0x20,$out),$inout2); &movdqu ($inout2,&QWP(0x20,$inp)); &movups (&QWP(0x30,$out),$inout3); &movdqu ($inout3,&QWP(0x30,$inp)); &movups (&QWP(0x40,$out),$inout4); &movdqu ($inout4,&QWP(0x40,$inp)); &movups (&QWP(0x50,$out),$inout5); &lea ($out,&DWP(0x60,$out)); &movdqu ($inout5,&QWP(0x50,$inp)); &lea ($inp,&DWP(0x60,$inp)); &set_label("ecb_enc_loop6_enter"); &call ("_aesni_encrypt6"); &mov ($key,$key_); # restore $key &mov ($rounds,$rounds_); # restore $rounds &sub ($len,0x60); &jnc (&label("ecb_enc_loop6")); &movups (&QWP(0,$out),$inout0); &movups (&QWP(0x10,$out),$inout1); &movups (&QWP(0x20,$out),$inout2); &movups (&QWP(0x30,$out),$inout3); &movups (&QWP(0x40,$out),$inout4); &movups (&QWP(0x50,$out),$inout5); &lea ($out,&DWP(0x60,$out)); &add ($len,0x60); &jz (&label("ecb_ret")); &set_label("ecb_enc_tail"); &movups ($inout0,&QWP(0,$inp)); &cmp ($len,0x20); &jb (&label("ecb_enc_one")); &movups ($inout1,&QWP(0x10,$inp)); &je (&label("ecb_enc_two")); &movups ($inout2,&QWP(0x20,$inp)); &cmp ($len,0x40); &jb (&label("ecb_enc_three")); &movups ($inout3,&QWP(0x30,$inp)); &je (&label("ecb_enc_four")); &movups ($inout4,&QWP(0x40,$inp)); &xorps ($inout5,$inout5); &call ("_aesni_encrypt6"); &movups (&QWP(0,$out),$inout0); &movups (&QWP(0x10,$out),$inout1); &movups (&QWP(0x20,$out),$inout2); &movups (&QWP(0x30,$out),$inout3); &movups (&QWP(0x40,$out),$inout4); jmp (&label("ecb_ret")); &set_label("ecb_enc_one",16); if ($inline) { &aesni_inline_generate1("enc"); } else { &call ("_aesni_encrypt1"); } &movups (&QWP(0,$out),$inout0); &jmp (&label("ecb_ret")); &set_label("ecb_enc_two",16); &xorps ($inout2,$inout2); &call ("_aesni_encrypt3"); &movups (&QWP(0,$out),$inout0); &movups (&QWP(0x10,$out),$inout1); &jmp (&label("ecb_ret")); &set_label("ecb_enc_three",16); &call ("_aesni_encrypt3"); &movups (&QWP(0,$out),$inout0); &movups (&QWP(0x10,$out),$inout1); &movups (&QWP(0x20,$out),$inout2); &jmp (&label("ecb_ret")); &set_label("ecb_enc_four",16); &call ("_aesni_encrypt4"); &movups (&QWP(0,$out),$inout0); &movups (&QWP(0x10,$out),$inout1); &movups (&QWP(0x20,$out),$inout2); &movups (&QWP(0x30,$out),$inout3); &jmp (&label("ecb_ret")); ###################################################################### &set_label("ecb_decrypt",16); &mov ($key_,$key); # backup $key &mov ($rounds_,$rounds); # backup $rounds &cmp ($len,0x60); &jb (&label("ecb_dec_tail")); &movdqu ($inout0,&QWP(0,$inp)); &movdqu ($inout1,&QWP(0x10,$inp)); &movdqu ($inout2,&QWP(0x20,$inp)); &movdqu ($inout3,&QWP(0x30,$inp)); &movdqu ($inout4,&QWP(0x40,$inp)); &movdqu ($inout5,&QWP(0x50,$inp)); &lea ($inp,&DWP(0x60,$inp)); &sub ($len,0x60); &jmp (&label("ecb_dec_loop6_enter")); &set_label("ecb_dec_loop6",16); &movups (&QWP(0,$out),$inout0); &movdqu ($inout0,&QWP(0,$inp)); &movups (&QWP(0x10,$out),$inout1); &movdqu ($inout1,&QWP(0x10,$inp)); &movups (&QWP(0x20,$out),$inout2); &movdqu ($inout2,&QWP(0x20,$inp)); &movups (&QWP(0x30,$out),$inout3); &movdqu ($inout3,&QWP(0x30,$inp)); &movups (&QWP(0x40,$out),$inout4); &movdqu ($inout4,&QWP(0x40,$inp)); &movups (&QWP(0x50,$out),$inout5); &lea ($out,&DWP(0x60,$out)); &movdqu ($inout5,&QWP(0x50,$inp)); &lea ($inp,&DWP(0x60,$inp)); &set_label("ecb_dec_loop6_enter"); &call ("_aesni_decrypt6"); &mov ($key,$key_); # restore $key &mov ($rounds,$rounds_); # restore $rounds &sub ($len,0x60); &jnc (&label("ecb_dec_loop6")); &movups (&QWP(0,$out),$inout0); &movups (&QWP(0x10,$out),$inout1); &movups (&QWP(0x20,$out),$inout2); &movups (&QWP(0x30,$out),$inout3); &movups (&QWP(0x40,$out),$inout4); &movups (&QWP(0x50,$out),$inout5); &lea ($out,&DWP(0x60,$out)); &add ($len,0x60); &jz (&label("ecb_ret")); &set_label("ecb_dec_tail"); &movups ($inout0,&QWP(0,$inp)); &cmp ($len,0x20); &jb (&label("ecb_dec_one")); &movups ($inout1,&QWP(0x10,$inp)); &je (&label("ecb_dec_two")); &movups ($inout2,&QWP(0x20,$inp)); &cmp ($len,0x40); &jb (&label("ecb_dec_three")); &movups ($inout3,&QWP(0x30,$inp)); &je (&label("ecb_dec_four")); &movups ($inout4,&QWP(0x40,$inp)); &xorps ($inout5,$inout5); &call ("_aesni_decrypt6"); &movups (&QWP(0,$out),$inout0); &movups (&QWP(0x10,$out),$inout1); &movups (&QWP(0x20,$out),$inout2); &movups (&QWP(0x30,$out),$inout3); &movups (&QWP(0x40,$out),$inout4); &jmp (&label("ecb_ret")); &set_label("ecb_dec_one",16); if ($inline) { &aesni_inline_generate1("dec"); } else { &call ("_aesni_decrypt1"); } &movups (&QWP(0,$out),$inout0); &jmp (&label("ecb_ret")); &set_label("ecb_dec_two",16); &xorps ($inout2,$inout2); &call ("_aesni_decrypt3"); &movups (&QWP(0,$out),$inout0); &movups (&QWP(0x10,$out),$inout1); &jmp (&label("ecb_ret")); &set_label("ecb_dec_three",16); &call ("_aesni_decrypt3"); &movups (&QWP(0,$out),$inout0); &movups (&QWP(0x10,$out),$inout1); &movups (&QWP(0x20,$out),$inout2); &jmp (&label("ecb_ret")); &set_label("ecb_dec_four",16); &call ("_aesni_decrypt4"); &movups (&QWP(0,$out),$inout0); &movups (&QWP(0x10,$out),$inout1); &movups (&QWP(0x20,$out),$inout2); &movups (&QWP(0x30,$out),$inout3); &set_label("ecb_ret"); &function_end("aesni_ecb_encrypt"); ###################################################################### # void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out, # size_t blocks, const AES_KEY *key, # const char *ivec,char *cmac); # # Handles only complete blocks, operates on 64-bit counter and # does not update *ivec! Nor does it finalize CMAC value # (see engine/eng_aesni.c for details) # { my $cmac=$inout1; &function_begin("aesni_ccm64_encrypt_blocks"); &mov ($inp,&wparam(0)); &mov ($out,&wparam(1)); &mov ($len,&wparam(2)); &mov ($key,&wparam(3)); &mov ($rounds_,&wparam(4)); &mov ($rounds,&wparam(5)); &mov ($key_,"esp"); &sub ("esp",60); &and ("esp",-16); # align stack &mov (&DWP(48,"esp"),$key_); &movdqu ($ivec,&QWP(0,$rounds_)); # load ivec &movdqu ($cmac,&QWP(0,$rounds)); # load cmac &mov ($rounds,&DWP(240,$key)); # compose byte-swap control mask for pshufb on stack &mov (&DWP(0,"esp"),0x0c0d0e0f); &mov (&DWP(4,"esp"),0x08090a0b); &mov (&DWP(8,"esp"),0x04050607); &mov (&DWP(12,"esp"),0x00010203); # compose counter increment vector on stack &mov ($rounds_,1); &xor ($key_,$key_); &mov (&DWP(16,"esp"),$rounds_); &mov (&DWP(20,"esp"),$key_); &mov (&DWP(24,"esp"),$key_); &mov (&DWP(28,"esp"),$key_); &shl ($rounds,4); &mov ($rounds_,16); &lea ($key_,&DWP(0,$key)); &movdqa ($inout3,&QWP(0,"esp")); &movdqa ($inout0,$ivec); &lea ($key,&DWP(32,$key,$rounds)); &sub ($rounds_,$rounds); &pshufb ($ivec,$inout3); &set_label("ccm64_enc_outer"); &$movekey ($rndkey0,&QWP(0,$key_)); &mov ($rounds,$rounds_); &movups ($in0,&QWP(0,$inp)); &xorps ($inout0,$rndkey0); &$movekey ($rndkey1,&QWP(16,$key_)); &xorps ($rndkey0,$in0); &xorps ($cmac,$rndkey0); # cmac^=inp &$movekey ($rndkey0,&QWP(32,$key_)); &set_label("ccm64_enc2_loop"); &aesenc ($inout0,$rndkey1); &aesenc ($cmac,$rndkey1); &$movekey ($rndkey1,&QWP(0,$key,$rounds)); &add ($rounds,32); &aesenc ($inout0,$rndkey0); &aesenc ($cmac,$rndkey0); &$movekey ($rndkey0,&QWP(-16,$key,$rounds)); &jnz (&label("ccm64_enc2_loop")); &aesenc ($inout0,$rndkey1); &aesenc ($cmac,$rndkey1); &paddq ($ivec,&QWP(16,"esp")); &dec ($len); &aesenclast ($inout0,$rndkey0); &aesenclast ($cmac,$rndkey0); &lea ($inp,&DWP(16,$inp)); &xorps ($in0,$inout0); # inp^=E(ivec) &movdqa ($inout0,$ivec); &movups (&QWP(0,$out),$in0); # save output &pshufb ($inout0,$inout3); &lea ($out,&DWP(16,$out)); &jnz (&label("ccm64_enc_outer")); &mov ("esp",&DWP(48,"esp")); &mov ($out,&wparam(5)); &movups (&QWP(0,$out),$cmac); &function_end("aesni_ccm64_encrypt_blocks"); &function_begin("aesni_ccm64_decrypt_blocks"); &mov ($inp,&wparam(0)); &mov ($out,&wparam(1)); &mov ($len,&wparam(2)); &mov ($key,&wparam(3)); &mov ($rounds_,&wparam(4)); &mov ($rounds,&wparam(5)); &mov ($key_,"esp"); &sub ("esp",60); &and ("esp",-16); # align stack &mov (&DWP(48,"esp"),$key_); &movdqu ($ivec,&QWP(0,$rounds_)); # load ivec &movdqu ($cmac,&QWP(0,$rounds)); # load cmac &mov ($rounds,&DWP(240,$key)); # compose byte-swap control mask for pshufb on stack &mov (&DWP(0,"esp"),0x0c0d0e0f); &mov (&DWP(4,"esp"),0x08090a0b); &mov (&DWP(8,"esp"),0x04050607); &mov (&DWP(12,"esp"),0x00010203); # compose counter increment vector on stack &mov ($rounds_,1); &xor ($key_,$key_); &mov (&DWP(16,"esp"),$rounds_); &mov (&DWP(20,"esp"),$key_); &mov (&DWP(24,"esp"),$key_); &mov (&DWP(28,"esp"),$key_); &movdqa ($inout3,&QWP(0,"esp")); # bswap mask &movdqa ($inout0,$ivec); &mov ($key_,$key); &mov ($rounds_,$rounds); &pshufb ($ivec,$inout3); if ($inline) { &aesni_inline_generate1("enc"); } else { &call ("_aesni_encrypt1"); } &shl ($rounds_,4); &mov ($rounds,16); &movups ($in0,&QWP(0,$inp)); # load inp &paddq ($ivec,&QWP(16,"esp")); &lea ($inp,&QWP(16,$inp)); &sub ($rounds,$rounds_); &lea ($key,&DWP(32,$key_,$rounds_)); &mov ($rounds_,$rounds); &jmp (&label("ccm64_dec_outer")); &set_label("ccm64_dec_outer",16); &xorps ($in0,$inout0); # inp ^= E(ivec) &movdqa ($inout0,$ivec); &movups (&QWP(0,$out),$in0); # save output &lea ($out,&DWP(16,$out)); &pshufb ($inout0,$inout3); &sub ($len,1); &jz (&label("ccm64_dec_break")); &$movekey ($rndkey0,&QWP(0,$key_)); &mov ($rounds,$rounds_); &$movekey ($rndkey1,&QWP(16,$key_)); &xorps ($in0,$rndkey0); &xorps ($inout0,$rndkey0); &xorps ($cmac,$in0); # cmac^=out &$movekey ($rndkey0,&QWP(32,$key_)); &set_label("ccm64_dec2_loop"); &aesenc ($inout0,$rndkey1); &aesenc ($cmac,$rndkey1); &$movekey ($rndkey1,&QWP(0,$key,$rounds)); &add ($rounds,32); &aesenc ($inout0,$rndkey0); &aesenc ($cmac,$rndkey0); &$movekey ($rndkey0,&QWP(-16,$key,$rounds)); &jnz (&label("ccm64_dec2_loop")); &movups ($in0,&QWP(0,$inp)); # load inp &paddq ($ivec,&QWP(16,"esp")); &aesenc ($inout0,$rndkey1); &aesenc ($cmac,$rndkey1); &aesenclast ($inout0,$rndkey0); &aesenclast ($cmac,$rndkey0); &lea ($inp,&QWP(16,$inp)); &jmp (&label("ccm64_dec_outer")); &set_label("ccm64_dec_break",16); &mov ($rounds,&DWP(240,$key_)); &mov ($key,$key_); if ($inline) { &aesni_inline_generate1("enc",$cmac,$in0); } else { &call ("_aesni_encrypt1",$cmac); } &mov ("esp",&DWP(48,"esp")); &mov ($out,&wparam(5)); &movups (&QWP(0,$out),$cmac); &function_end("aesni_ccm64_decrypt_blocks"); } ###################################################################### # void aesni_ctr32_encrypt_blocks (const void *in, void *out, # size_t blocks, const AES_KEY *key, # const char *ivec); # # Handles only complete blocks, operates on 32-bit counter and # does not update *ivec! (see crypto/modes/ctr128.c for details) # # stack layout: # 0 pshufb mask # 16 vector addend: 0,6,6,6 # 32 counter-less ivec # 48 1st triplet of counter vector # 64 2nd triplet of counter vector # 80 saved %esp &function_begin("aesni_ctr32_encrypt_blocks"); &mov ($inp,&wparam(0)); &mov ($out,&wparam(1)); &mov ($len,&wparam(2)); &mov ($key,&wparam(3)); &mov ($rounds_,&wparam(4)); &mov ($key_,"esp"); &sub ("esp",88); &and ("esp",-16); # align stack &mov (&DWP(80,"esp"),$key_); &cmp ($len,1); &je (&label("ctr32_one_shortcut")); &movdqu ($inout5,&QWP(0,$rounds_)); # load ivec # compose byte-swap control mask for pshufb on stack &mov (&DWP(0,"esp"),0x0c0d0e0f); &mov (&DWP(4,"esp"),0x08090a0b); &mov (&DWP(8,"esp"),0x04050607); &mov (&DWP(12,"esp"),0x00010203); # compose counter increment vector on stack &mov ($rounds,6); &xor ($key_,$key_); &mov (&DWP(16,"esp"),$rounds); &mov (&DWP(20,"esp"),$rounds); &mov (&DWP(24,"esp"),$rounds); &mov (&DWP(28,"esp"),$key_); &pextrd ($rounds_,$inout5,3); # pull 32-bit counter &pinsrd ($inout5,$key_,3); # wipe 32-bit counter &mov ($rounds,&DWP(240,$key)); # key->rounds # compose 2 vectors of 3x32-bit counters &bswap ($rounds_); &pxor ($rndkey0,$rndkey0); &pxor ($rndkey1,$rndkey1); &movdqa ($inout0,&QWP(0,"esp")); # load byte-swap mask &pinsrd ($rndkey0,$rounds_,0); &lea ($key_,&DWP(3,$rounds_)); &pinsrd ($rndkey1,$key_,0); &inc ($rounds_); &pinsrd ($rndkey0,$rounds_,1); &inc ($key_); &pinsrd ($rndkey1,$key_,1); &inc ($rounds_); &pinsrd ($rndkey0,$rounds_,2); &inc ($key_); &pinsrd ($rndkey1,$key_,2); &movdqa (&QWP(48,"esp"),$rndkey0); # save 1st triplet &pshufb ($rndkey0,$inout0); # byte swap &movdqu ($inout4,&QWP(0,$key)); # key[0] &movdqa (&QWP(64,"esp"),$rndkey1); # save 2nd triplet &pshufb ($rndkey1,$inout0); # byte swap &pshufd ($inout0,$rndkey0,3<<6); # place counter to upper dword &pshufd ($inout1,$rndkey0,2<<6); &cmp ($len,6); &jb (&label("ctr32_tail")); &pxor ($inout5,$inout4); # counter-less ivec^key[0] &shl ($rounds,4); &mov ($rounds_,16); &movdqa (&QWP(32,"esp"),$inout5); # save counter-less ivec^key[0] &mov ($key_,$key); # backup $key &sub ($rounds_,$rounds); # backup twisted $rounds &lea ($key,&DWP(32,$key,$rounds)); &sub ($len,6); &jmp (&label("ctr32_loop6")); &set_label("ctr32_loop6",16); # inlining _aesni_encrypt6's prologue gives ~6% improvement... &pshufd ($inout2,$rndkey0,1<<6); &movdqa ($rndkey0,&QWP(32,"esp")); # pull counter-less ivec &pshufd ($inout3,$rndkey1,3<<6); &pxor ($inout0,$rndkey0); # merge counter-less ivec &pshufd ($inout4,$rndkey1,2<<6); &pxor ($inout1,$rndkey0); &pshufd ($inout5,$rndkey1,1<<6); &$movekey ($rndkey1,&QWP(16,$key_)); &pxor ($inout2,$rndkey0); &pxor ($inout3,$rndkey0); &aesenc ($inout0,$rndkey1); &pxor ($inout4,$rndkey0); &pxor ($inout5,$rndkey0); &aesenc ($inout1,$rndkey1); &$movekey ($rndkey0,&QWP(32,$key_)); &mov ($rounds,$rounds_); &aesenc ($inout2,$rndkey1); &aesenc ($inout3,$rndkey1); &aesenc ($inout4,$rndkey1); &aesenc ($inout5,$rndkey1); &call (&label("_aesni_encrypt6_enter")); &movups ($rndkey1,&QWP(0,$inp)); &movups ($rndkey0,&QWP(0x10,$inp)); &xorps ($inout0,$rndkey1); &movups ($rndkey1,&QWP(0x20,$inp)); &xorps ($inout1,$rndkey0); &movups (&QWP(0,$out),$inout0); &movdqa ($rndkey0,&QWP(16,"esp")); # load increment &xorps ($inout2,$rndkey1); &movdqa ($rndkey1,&QWP(64,"esp")); # load 2nd triplet &movups (&QWP(0x10,$out),$inout1); &movups (&QWP(0x20,$out),$inout2); &paddd ($rndkey1,$rndkey0); # 2nd triplet increment &paddd ($rndkey0,&QWP(48,"esp")); # 1st triplet increment &movdqa ($inout0,&QWP(0,"esp")); # load byte swap mask &movups ($inout1,&QWP(0x30,$inp)); &movups ($inout2,&QWP(0x40,$inp)); &xorps ($inout3,$inout1); &movups ($inout1,&QWP(0x50,$inp)); &lea ($inp,&DWP(0x60,$inp)); &movdqa (&QWP(48,"esp"),$rndkey0); # save 1st triplet &pshufb ($rndkey0,$inout0); # byte swap &xorps ($inout4,$inout2); &movups (&QWP(0x30,$out),$inout3); &xorps ($inout5,$inout1); &movdqa (&QWP(64,"esp"),$rndkey1); # save 2nd triplet &pshufb ($rndkey1,$inout0); # byte swap &movups (&QWP(0x40,$out),$inout4); &pshufd ($inout0,$rndkey0,3<<6); &movups (&QWP(0x50,$out),$inout5); &lea ($out,&DWP(0x60,$out)); &pshufd ($inout1,$rndkey0,2<<6); &sub ($len,6); &jnc (&label("ctr32_loop6")); &add ($len,6); &jz (&label("ctr32_ret")); &movdqu ($inout5,&QWP(0,$key_)); &mov ($key,$key_); &pxor ($inout5,&QWP(32,"esp")); # restore count-less ivec &mov ($rounds,&DWP(240,$key_)); # restore $rounds &set_label("ctr32_tail"); &por ($inout0,$inout5); &cmp ($len,2); &jb (&label("ctr32_one")); &pshufd ($inout2,$rndkey0,1<<6); &por ($inout1,$inout5); &je (&label("ctr32_two")); &pshufd ($inout3,$rndkey1,3<<6); &por ($inout2,$inout5); &cmp ($len,4); &jb (&label("ctr32_three")); &pshufd ($inout4,$rndkey1,2<<6); &por ($inout3,$inout5); &je (&label("ctr32_four")); &por ($inout4,$inout5); &call ("_aesni_encrypt6"); &movups ($rndkey1,&QWP(0,$inp)); &movups ($rndkey0,&QWP(0x10,$inp)); &xorps ($inout0,$rndkey1); &movups ($rndkey1,&QWP(0x20,$inp)); &xorps ($inout1,$rndkey0); &movups ($rndkey0,&QWP(0x30,$inp)); &xorps ($inout2,$rndkey1); &movups ($rndkey1,&QWP(0x40,$inp)); &xorps ($inout3,$rndkey0); &movups (&QWP(0,$out),$inout0); &xorps ($inout4,$rndkey1); &movups (&QWP(0x10,$out),$inout1); &movups (&QWP(0x20,$out),$inout2); &movups (&QWP(0x30,$out),$inout3); &movups (&QWP(0x40,$out),$inout4); &jmp (&label("ctr32_ret")); &set_label("ctr32_one_shortcut",16); &movups ($inout0,&QWP(0,$rounds_)); # load ivec &mov ($rounds,&DWP(240,$key)); &set_label("ctr32_one"); if ($inline) { &aesni_inline_generate1("enc"); } else { &call ("_aesni_encrypt1"); } &movups ($in0,&QWP(0,$inp)); &xorps ($in0,$inout0); &movups (&QWP(0,$out),$in0); &jmp (&label("ctr32_ret")); &set_label("ctr32_two",16); &call ("_aesni_encrypt3"); &movups ($inout3,&QWP(0,$inp)); &movups ($inout4,&QWP(0x10,$inp)); &xorps ($inout0,$inout3); &xorps ($inout1,$inout4); &movups (&QWP(0,$out),$inout0); &movups (&QWP(0x10,$out),$inout1); &jmp (&label("ctr32_ret")); &set_label("ctr32_three",16); &call ("_aesni_encrypt3"); &movups ($inout3,&QWP(0,$inp)); &movups ($inout4,&QWP(0x10,$inp)); &xorps ($inout0,$inout3); &movups ($inout5,&QWP(0x20,$inp)); &xorps ($inout1,$inout4); &movups (&QWP(0,$out),$inout0); &xorps ($inout2,$inout5); &movups (&QWP(0x10,$out),$inout1); &movups (&QWP(0x20,$out),$inout2); &jmp (&label("ctr32_ret")); &set_label("ctr32_four",16); &call ("_aesni_encrypt4"); &movups ($inout4,&QWP(0,$inp)); &movups ($inout5,&QWP(0x10,$inp)); &movups ($rndkey1,&QWP(0x20,$inp)); &xorps ($inout0,$inout4); &movups ($rndkey0,&QWP(0x30,$inp)); &xorps ($inout1,$inout5); &movups (&QWP(0,$out),$inout0); &xorps ($inout2,$rndkey1); &movups (&QWP(0x10,$out),$inout1); &xorps ($inout3,$rndkey0); &movups (&QWP(0x20,$out),$inout2); &movups (&QWP(0x30,$out),$inout3); &set_label("ctr32_ret"); &mov ("esp",&DWP(80,"esp")); &function_end("aesni_ctr32_encrypt_blocks"); ###################################################################### # void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len, # const AES_KEY *key1, const AES_KEY *key2 # const unsigned char iv[16]); # { my ($tweak,$twtmp,$twres,$twmask)=($rndkey1,$rndkey0,$inout0,$inout1); &function_begin("aesni_xts_encrypt"); &mov ($key,&wparam(4)); # key2 &mov ($inp,&wparam(5)); # clear-text tweak &mov ($rounds,&DWP(240,$key)); # key2->rounds &movups ($inout0,&QWP(0,$inp)); if ($inline) { &aesni_inline_generate1("enc"); } else { &call ("_aesni_encrypt1"); } &mov ($inp,&wparam(0)); &mov ($out,&wparam(1)); &mov ($len,&wparam(2)); &mov ($key,&wparam(3)); # key1 &mov ($key_,"esp"); &sub ("esp",16*7+8); &mov ($rounds,&DWP(240,$key)); # key1->rounds &and ("esp",-16); # align stack &mov (&DWP(16*6+0,"esp"),0x87); # compose the magic constant &mov (&DWP(16*6+4,"esp"),0); &mov (&DWP(16*6+8,"esp"),1); &mov (&DWP(16*6+12,"esp"),0); &mov (&DWP(16*7+0,"esp"),$len); # save original $len &mov (&DWP(16*7+4,"esp"),$key_); # save original %esp &movdqa ($tweak,$inout0); &pxor ($twtmp,$twtmp); &movdqa ($twmask,&QWP(6*16,"esp")); # 0x0...010...87 &pcmpgtd($twtmp,$tweak); # broadcast upper bits &and ($len,-16); &mov ($key_,$key); # backup $key &mov ($rounds_,$rounds); # backup $rounds &sub ($len,16*6); &jc (&label("xts_enc_short")); &shl ($rounds,4); &mov ($rounds_,16); &sub ($rounds_,$rounds); &lea ($key,&DWP(32,$key,$rounds)); &jmp (&label("xts_enc_loop6")); &set_label("xts_enc_loop6",16); for ($i=0;$i<4;$i++) { &pshufd ($twres,$twtmp,0x13); &pxor ($twtmp,$twtmp); &movdqa (&QWP(16*$i,"esp"),$tweak); &paddq ($tweak,$tweak); # &psllq($tweak,1); &pand ($twres,$twmask); # isolate carry and residue &pcmpgtd ($twtmp,$tweak); # broadcast upper bits &pxor ($tweak,$twres); } &pshufd ($inout5,$twtmp,0x13); &movdqa (&QWP(16*$i++,"esp"),$tweak); &paddq ($tweak,$tweak); # &psllq($tweak,1); &$movekey ($rndkey0,&QWP(0,$key_)); &pand ($inout5,$twmask); # isolate carry and residue &movups ($inout0,&QWP(0,$inp)); # load input &pxor ($inout5,$tweak); # inline _aesni_encrypt6 prologue and flip xor with tweak and key[0] &mov ($rounds,$rounds_); # restore $rounds &movdqu ($inout1,&QWP(16*1,$inp)); &xorps ($inout0,$rndkey0); # input^=rndkey[0] &movdqu ($inout2,&QWP(16*2,$inp)); &pxor ($inout1,$rndkey0); &movdqu ($inout3,&QWP(16*3,$inp)); &pxor ($inout2,$rndkey0); &movdqu ($inout4,&QWP(16*4,$inp)); &pxor ($inout3,$rndkey0); &movdqu ($rndkey1,&QWP(16*5,$inp)); &pxor ($inout4,$rndkey0); &lea ($inp,&DWP(16*6,$inp)); &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak &movdqa (&QWP(16*$i,"esp"),$inout5); # save last tweak &pxor ($inout5,$rndkey1); &$movekey ($rndkey1,&QWP(16,$key_)); &pxor ($inout1,&QWP(16*1,"esp")); &pxor ($inout2,&QWP(16*2,"esp")); &aesenc ($inout0,$rndkey1); &pxor ($inout3,&QWP(16*3,"esp")); &pxor ($inout4,&QWP(16*4,"esp")); &aesenc ($inout1,$rndkey1); &pxor ($inout5,$rndkey0); &$movekey ($rndkey0,&QWP(32,$key_)); &aesenc ($inout2,$rndkey1); &aesenc ($inout3,$rndkey1); &aesenc ($inout4,$rndkey1); &aesenc ($inout5,$rndkey1); &call (&label("_aesni_encrypt6_enter")); &movdqa ($tweak,&QWP(16*5,"esp")); # last tweak &pxor ($twtmp,$twtmp); &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak &pcmpgtd ($twtmp,$tweak); # broadcast upper bits &xorps ($inout1,&QWP(16*1,"esp")); &movups (&QWP(16*0,$out),$inout0); # write output &xorps ($inout2,&QWP(16*2,"esp")); &movups (&QWP(16*1,$out),$inout1); &xorps ($inout3,&QWP(16*3,"esp")); &movups (&QWP(16*2,$out),$inout2); &xorps ($inout4,&QWP(16*4,"esp")); &movups (&QWP(16*3,$out),$inout3); &xorps ($inout5,$tweak); &movups (&QWP(16*4,$out),$inout4); &pshufd ($twres,$twtmp,0x13); &movups (&QWP(16*5,$out),$inout5); &lea ($out,&DWP(16*6,$out)); &movdqa ($twmask,&QWP(16*6,"esp")); # 0x0...010...87 &pxor ($twtmp,$twtmp); &paddq ($tweak,$tweak); # &psllq($tweak,1); &pand ($twres,$twmask); # isolate carry and residue &pcmpgtd($twtmp,$tweak); # broadcast upper bits &pxor ($tweak,$twres); &sub ($len,16*6); &jnc (&label("xts_enc_loop6")); &mov ($rounds,&DWP(240,$key_)); # restore $rounds &mov ($key,$key_); # restore $key &mov ($rounds_,$rounds); &set_label("xts_enc_short"); &add ($len,16*6); &jz (&label("xts_enc_done6x")); &movdqa ($inout3,$tweak); # put aside previous tweak &cmp ($len,0x20); &jb (&label("xts_enc_one")); &pshufd ($twres,$twtmp,0x13); &pxor ($twtmp,$twtmp); &paddq ($tweak,$tweak); # &psllq($tweak,1); &pand ($twres,$twmask); # isolate carry and residue &pcmpgtd($twtmp,$tweak); # broadcast upper bits &pxor ($tweak,$twres); &je (&label("xts_enc_two")); &pshufd ($twres,$twtmp,0x13); &pxor ($twtmp,$twtmp); &movdqa ($inout4,$tweak); # put aside previous tweak &paddq ($tweak,$tweak); # &psllq($tweak,1); &pand ($twres,$twmask); # isolate carry and residue &pcmpgtd($twtmp,$tweak); # broadcast upper bits &pxor ($tweak,$twres); &cmp ($len,0x40); &jb (&label("xts_enc_three")); &pshufd ($twres,$twtmp,0x13); &pxor ($twtmp,$twtmp); &movdqa ($inout5,$tweak); # put aside previous tweak &paddq ($tweak,$tweak); # &psllq($tweak,1); &pand ($twres,$twmask); # isolate carry and residue &pcmpgtd($twtmp,$tweak); # broadcast upper bits &pxor ($tweak,$twres); &movdqa (&QWP(16*0,"esp"),$inout3); &movdqa (&QWP(16*1,"esp"),$inout4); &je (&label("xts_enc_four")); &movdqa (&QWP(16*2,"esp"),$inout5); &pshufd ($inout5,$twtmp,0x13); &movdqa (&QWP(16*3,"esp"),$tweak); &paddq ($tweak,$tweak); # &psllq($inout0,1); &pand ($inout5,$twmask); # isolate carry and residue &pxor ($inout5,$tweak); &movdqu ($inout0,&QWP(16*0,$inp)); # load input &movdqu ($inout1,&QWP(16*1,$inp)); &movdqu ($inout2,&QWP(16*2,$inp)); &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak &movdqu ($inout3,&QWP(16*3,$inp)); &pxor ($inout1,&QWP(16*1,"esp")); &movdqu ($inout4,&QWP(16*4,$inp)); &pxor ($inout2,&QWP(16*2,"esp")); &lea ($inp,&DWP(16*5,$inp)); &pxor ($inout3,&QWP(16*3,"esp")); &movdqa (&QWP(16*4,"esp"),$inout5); # save last tweak &pxor ($inout4,$inout5); &call ("_aesni_encrypt6"); &movaps ($tweak,&QWP(16*4,"esp")); # last tweak &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak &xorps ($inout1,&QWP(16*1,"esp")); &xorps ($inout2,&QWP(16*2,"esp")); &movups (&QWP(16*0,$out),$inout0); # write output &xorps ($inout3,&QWP(16*3,"esp")); &movups (&QWP(16*1,$out),$inout1); &xorps ($inout4,$tweak); &movups (&QWP(16*2,$out),$inout2); &movups (&QWP(16*3,$out),$inout3); &movups (&QWP(16*4,$out),$inout4); &lea ($out,&DWP(16*5,$out)); &jmp (&label("xts_enc_done")); &set_label("xts_enc_one",16); &movups ($inout0,&QWP(16*0,$inp)); # load input &lea ($inp,&DWP(16*1,$inp)); &xorps ($inout0,$inout3); # input^=tweak if ($inline) { &aesni_inline_generate1("enc"); } else { &call ("_aesni_encrypt1"); } &xorps ($inout0,$inout3); # output^=tweak &movups (&QWP(16*0,$out),$inout0); # write output &lea ($out,&DWP(16*1,$out)); &movdqa ($tweak,$inout3); # last tweak &jmp (&label("xts_enc_done")); &set_label("xts_enc_two",16); &movaps ($inout4,$tweak); # put aside last tweak &movups ($inout0,&QWP(16*0,$inp)); # load input &movups ($inout1,&QWP(16*1,$inp)); &lea ($inp,&DWP(16*2,$inp)); &xorps ($inout0,$inout3); # input^=tweak &xorps ($inout1,$inout4); &xorps ($inout2,$inout2); &call ("_aesni_encrypt3"); &xorps ($inout0,$inout3); # output^=tweak &xorps ($inout1,$inout4); &movups (&QWP(16*0,$out),$inout0); # write output &movups (&QWP(16*1,$out),$inout1); &lea ($out,&DWP(16*2,$out)); &movdqa ($tweak,$inout4); # last tweak &jmp (&label("xts_enc_done")); &set_label("xts_enc_three",16); &movaps ($inout5,$tweak); # put aside last tweak &movups ($inout0,&QWP(16*0,$inp)); # load input &movups ($inout1,&QWP(16*1,$inp)); &movups ($inout2,&QWP(16*2,$inp)); &lea ($inp,&DWP(16*3,$inp)); &xorps ($inout0,$inout3); # input^=tweak &xorps ($inout1,$inout4); &xorps ($inout2,$inout5); &call ("_aesni_encrypt3"); &xorps ($inout0,$inout3); # output^=tweak &xorps ($inout1,$inout4); &xorps ($inout2,$inout5); &movups (&QWP(16*0,$out),$inout0); # write output &movups (&QWP(16*1,$out),$inout1); &movups (&QWP(16*2,$out),$inout2); &lea ($out,&DWP(16*3,$out)); &movdqa ($tweak,$inout5); # last tweak &jmp (&label("xts_enc_done")); &set_label("xts_enc_four",16); &movaps ($inout4,$tweak); # put aside last tweak &movups ($inout0,&QWP(16*0,$inp)); # load input &movups ($inout1,&QWP(16*1,$inp)); &movups ($inout2,&QWP(16*2,$inp)); &xorps ($inout0,&QWP(16*0,"esp")); # input^=tweak &movups ($inout3,&QWP(16*3,$inp)); &lea ($inp,&DWP(16*4,$inp)); &xorps ($inout1,&QWP(16*1,"esp")); &xorps ($inout2,$inout5); &xorps ($inout3,$inout4); &call ("_aesni_encrypt4"); &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak &xorps ($inout1,&QWP(16*1,"esp")); &xorps ($inout2,$inout5); &movups (&QWP(16*0,$out),$inout0); # write output &xorps ($inout3,$inout4); &movups (&QWP(16*1,$out),$inout1); &movups (&QWP(16*2,$out),$inout2); &movups (&QWP(16*3,$out),$inout3); &lea ($out,&DWP(16*4,$out)); &movdqa ($tweak,$inout4); # last tweak &jmp (&label("xts_enc_done")); &set_label("xts_enc_done6x",16); # $tweak is pre-calculated &mov ($len,&DWP(16*7+0,"esp")); # restore original $len &and ($len,15); &jz (&label("xts_enc_ret")); &movdqa ($inout3,$tweak); &mov (&DWP(16*7+0,"esp"),$len); # save $len%16 &jmp (&label("xts_enc_steal")); &set_label("xts_enc_done",16); &mov ($len,&DWP(16*7+0,"esp")); # restore original $len &pxor ($twtmp,$twtmp); &and ($len,15); &jz (&label("xts_enc_ret")); &pcmpgtd($twtmp,$tweak); # broadcast upper bits &mov (&DWP(16*7+0,"esp"),$len); # save $len%16 &pshufd ($inout3,$twtmp,0x13); &paddq ($tweak,$tweak); # &psllq($tweak,1); &pand ($inout3,&QWP(16*6,"esp")); # isolate carry and residue &pxor ($inout3,$tweak); &set_label("xts_enc_steal"); &movz ($rounds,&BP(0,$inp)); &movz ($key,&BP(-16,$out)); &lea ($inp,&DWP(1,$inp)); &mov (&BP(-16,$out),&LB($rounds)); &mov (&BP(0,$out),&LB($key)); &lea ($out,&DWP(1,$out)); &sub ($len,1); &jnz (&label("xts_enc_steal")); &sub ($out,&DWP(16*7+0,"esp")); # rewind $out &mov ($key,$key_); # restore $key &mov ($rounds,$rounds_); # restore $rounds &movups ($inout0,&QWP(-16,$out)); # load input &xorps ($inout0,$inout3); # input^=tweak if ($inline) { &aesni_inline_generate1("enc"); } else { &call ("_aesni_encrypt1"); } &xorps ($inout0,$inout3); # output^=tweak &movups (&QWP(-16,$out),$inout0); # write output &set_label("xts_enc_ret"); &mov ("esp",&DWP(16*7+4,"esp")); # restore %esp &function_end("aesni_xts_encrypt"); &function_begin("aesni_xts_decrypt"); &mov ($key,&wparam(4)); # key2 &mov ($inp,&wparam(5)); # clear-text tweak &mov ($rounds,&DWP(240,$key)); # key2->rounds &movups ($inout0,&QWP(0,$inp)); if ($inline) { &aesni_inline_generate1("enc"); } else { &call ("_aesni_encrypt1"); } &mov ($inp,&wparam(0)); &mov ($out,&wparam(1)); &mov ($len,&wparam(2)); &mov ($key,&wparam(3)); # key1 &mov ($key_,"esp"); &sub ("esp",16*7+8); &and ("esp",-16); # align stack &xor ($rounds_,$rounds_); # if(len%16) len-=16; &test ($len,15); &setnz (&LB($rounds_)); &shl ($rounds_,4); &sub ($len,$rounds_); &mov (&DWP(16*6+0,"esp"),0x87); # compose the magic constant &mov (&DWP(16*6+4,"esp"),0); &mov (&DWP(16*6+8,"esp"),1); &mov (&DWP(16*6+12,"esp"),0); &mov (&DWP(16*7+0,"esp"),$len); # save original $len &mov (&DWP(16*7+4,"esp"),$key_); # save original %esp &mov ($rounds,&DWP(240,$key)); # key1->rounds &mov ($key_,$key); # backup $key &mov ($rounds_,$rounds); # backup $rounds &movdqa ($tweak,$inout0); &pxor ($twtmp,$twtmp); &movdqa ($twmask,&QWP(6*16,"esp")); # 0x0...010...87 &pcmpgtd($twtmp,$tweak); # broadcast upper bits &and ($len,-16); &sub ($len,16*6); &jc (&label("xts_dec_short")); &shl ($rounds,4); &mov ($rounds_,16); &sub ($rounds_,$rounds); &lea ($key,&DWP(32,$key,$rounds)); &jmp (&label("xts_dec_loop6")); &set_label("xts_dec_loop6",16); for ($i=0;$i<4;$i++) { &pshufd ($twres,$twtmp,0x13); &pxor ($twtmp,$twtmp); &movdqa (&QWP(16*$i,"esp"),$tweak); &paddq ($tweak,$tweak); # &psllq($tweak,1); &pand ($twres,$twmask); # isolate carry and residue &pcmpgtd ($twtmp,$tweak); # broadcast upper bits &pxor ($tweak,$twres); } &pshufd ($inout5,$twtmp,0x13); &movdqa (&QWP(16*$i++,"esp"),$tweak); &paddq ($tweak,$tweak); # &psllq($tweak,1); &$movekey ($rndkey0,&QWP(0,$key_)); &pand ($inout5,$twmask); # isolate carry and residue &movups ($inout0,&QWP(0,$inp)); # load input &pxor ($inout5,$tweak); # inline _aesni_encrypt6 prologue and flip xor with tweak and key[0] &mov ($rounds,$rounds_); &movdqu ($inout1,&QWP(16*1,$inp)); &xorps ($inout0,$rndkey0); # input^=rndkey[0] &movdqu ($inout2,&QWP(16*2,$inp)); &pxor ($inout1,$rndkey0); &movdqu ($inout3,&QWP(16*3,$inp)); &pxor ($inout2,$rndkey0); &movdqu ($inout4,&QWP(16*4,$inp)); &pxor ($inout3,$rndkey0); &movdqu ($rndkey1,&QWP(16*5,$inp)); &pxor ($inout4,$rndkey0); &lea ($inp,&DWP(16*6,$inp)); &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak &movdqa (&QWP(16*$i,"esp"),$inout5); # save last tweak &pxor ($inout5,$rndkey1); &$movekey ($rndkey1,&QWP(16,$key_)); &pxor ($inout1,&QWP(16*1,"esp")); &pxor ($inout2,&QWP(16*2,"esp")); &aesdec ($inout0,$rndkey1); &pxor ($inout3,&QWP(16*3,"esp")); &pxor ($inout4,&QWP(16*4,"esp")); &aesdec ($inout1,$rndkey1); &pxor ($inout5,$rndkey0); &$movekey ($rndkey0,&QWP(32,$key_)); &aesdec ($inout2,$rndkey1); &aesdec ($inout3,$rndkey1); &aesdec ($inout4,$rndkey1); &aesdec ($inout5,$rndkey1); &call (&label("_aesni_decrypt6_enter")); &movdqa ($tweak,&QWP(16*5,"esp")); # last tweak &pxor ($twtmp,$twtmp); &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak &pcmpgtd ($twtmp,$tweak); # broadcast upper bits &xorps ($inout1,&QWP(16*1,"esp")); &movups (&QWP(16*0,$out),$inout0); # write output &xorps ($inout2,&QWP(16*2,"esp")); &movups (&QWP(16*1,$out),$inout1); &xorps ($inout3,&QWP(16*3,"esp")); &movups (&QWP(16*2,$out),$inout2); &xorps ($inout4,&QWP(16*4,"esp")); &movups (&QWP(16*3,$out),$inout3); &xorps ($inout5,$tweak); &movups (&QWP(16*4,$out),$inout4); &pshufd ($twres,$twtmp,0x13); &movups (&QWP(16*5,$out),$inout5); &lea ($out,&DWP(16*6,$out)); &movdqa ($twmask,&QWP(16*6,"esp")); # 0x0...010...87 &pxor ($twtmp,$twtmp); &paddq ($tweak,$tweak); # &psllq($tweak,1); &pand ($twres,$twmask); # isolate carry and residue &pcmpgtd($twtmp,$tweak); # broadcast upper bits &pxor ($tweak,$twres); &sub ($len,16*6); &jnc (&label("xts_dec_loop6")); &mov ($rounds,&DWP(240,$key_)); # restore $rounds &mov ($key,$key_); # restore $key &mov ($rounds_,$rounds); &set_label("xts_dec_short"); &add ($len,16*6); &jz (&label("xts_dec_done6x")); &movdqa ($inout3,$tweak); # put aside previous tweak &cmp ($len,0x20); &jb (&label("xts_dec_one")); &pshufd ($twres,$twtmp,0x13); &pxor ($twtmp,$twtmp); &paddq ($tweak,$tweak); # &psllq($tweak,1); &pand ($twres,$twmask); # isolate carry and residue &pcmpgtd($twtmp,$tweak); # broadcast upper bits &pxor ($tweak,$twres); &je (&label("xts_dec_two")); &pshufd ($twres,$twtmp,0x13); &pxor ($twtmp,$twtmp); &movdqa ($inout4,$tweak); # put aside previous tweak &paddq ($tweak,$tweak); # &psllq($tweak,1); &pand ($twres,$twmask); # isolate carry and residue &pcmpgtd($twtmp,$tweak); # broadcast upper bits &pxor ($tweak,$twres); &cmp ($len,0x40); &jb (&label("xts_dec_three")); &pshufd ($twres,$twtmp,0x13); &pxor ($twtmp,$twtmp); &movdqa ($inout5,$tweak); # put aside previous tweak &paddq ($tweak,$tweak); # &psllq($tweak,1); &pand ($twres,$twmask); # isolate carry and residue &pcmpgtd($twtmp,$tweak); # broadcast upper bits &pxor ($tweak,$twres); &movdqa (&QWP(16*0,"esp"),$inout3); &movdqa (&QWP(16*1,"esp"),$inout4); &je (&label("xts_dec_four")); &movdqa (&QWP(16*2,"esp"),$inout5); &pshufd ($inout5,$twtmp,0x13); &movdqa (&QWP(16*3,"esp"),$tweak); &paddq ($tweak,$tweak); # &psllq($inout0,1); &pand ($inout5,$twmask); # isolate carry and residue &pxor ($inout5,$tweak); &movdqu ($inout0,&QWP(16*0,$inp)); # load input &movdqu ($inout1,&QWP(16*1,$inp)); &movdqu ($inout2,&QWP(16*2,$inp)); &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak &movdqu ($inout3,&QWP(16*3,$inp)); &pxor ($inout1,&QWP(16*1,"esp")); &movdqu ($inout4,&QWP(16*4,$inp)); &pxor ($inout2,&QWP(16*2,"esp")); &lea ($inp,&DWP(16*5,$inp)); &pxor ($inout3,&QWP(16*3,"esp")); &movdqa (&QWP(16*4,"esp"),$inout5); # save last tweak &pxor ($inout4,$inout5); &call ("_aesni_decrypt6"); &movaps ($tweak,&QWP(16*4,"esp")); # last tweak &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak &xorps ($inout1,&QWP(16*1,"esp")); &xorps ($inout2,&QWP(16*2,"esp")); &movups (&QWP(16*0,$out),$inout0); # write output &xorps ($inout3,&QWP(16*3,"esp")); &movups (&QWP(16*1,$out),$inout1); &xorps ($inout4,$tweak); &movups (&QWP(16*2,$out),$inout2); &movups (&QWP(16*3,$out),$inout3); &movups (&QWP(16*4,$out),$inout4); &lea ($out,&DWP(16*5,$out)); &jmp (&label("xts_dec_done")); &set_label("xts_dec_one",16); &movups ($inout0,&QWP(16*0,$inp)); # load input &lea ($inp,&DWP(16*1,$inp)); &xorps ($inout0,$inout3); # input^=tweak if ($inline) { &aesni_inline_generate1("dec"); } else { &call ("_aesni_decrypt1"); } &xorps ($inout0,$inout3); # output^=tweak &movups (&QWP(16*0,$out),$inout0); # write output &lea ($out,&DWP(16*1,$out)); &movdqa ($tweak,$inout3); # last tweak &jmp (&label("xts_dec_done")); &set_label("xts_dec_two",16); &movaps ($inout4,$tweak); # put aside last tweak &movups ($inout0,&QWP(16*0,$inp)); # load input &movups ($inout1,&QWP(16*1,$inp)); &lea ($inp,&DWP(16*2,$inp)); &xorps ($inout0,$inout3); # input^=tweak &xorps ($inout1,$inout4); &call ("_aesni_decrypt3"); &xorps ($inout0,$inout3); # output^=tweak &xorps ($inout1,$inout4); &movups (&QWP(16*0,$out),$inout0); # write output &movups (&QWP(16*1,$out),$inout1); &lea ($out,&DWP(16*2,$out)); &movdqa ($tweak,$inout4); # last tweak &jmp (&label("xts_dec_done")); &set_label("xts_dec_three",16); &movaps ($inout5,$tweak); # put aside last tweak &movups ($inout0,&QWP(16*0,$inp)); # load input &movups ($inout1,&QWP(16*1,$inp)); &movups ($inout2,&QWP(16*2,$inp)); &lea ($inp,&DWP(16*3,$inp)); &xorps ($inout0,$inout3); # input^=tweak &xorps ($inout1,$inout4); &xorps ($inout2,$inout5); &call ("_aesni_decrypt3"); &xorps ($inout0,$inout3); # output^=tweak &xorps ($inout1,$inout4); &xorps ($inout2,$inout5); &movups (&QWP(16*0,$out),$inout0); # write output &movups (&QWP(16*1,$out),$inout1); &movups (&QWP(16*2,$out),$inout2); &lea ($out,&DWP(16*3,$out)); &movdqa ($tweak,$inout5); # last tweak &jmp (&label("xts_dec_done")); &set_label("xts_dec_four",16); &movaps ($inout4,$tweak); # put aside last tweak &movups ($inout0,&QWP(16*0,$inp)); # load input &movups ($inout1,&QWP(16*1,$inp)); &movups ($inout2,&QWP(16*2,$inp)); &xorps ($inout0,&QWP(16*0,"esp")); # input^=tweak &movups ($inout3,&QWP(16*3,$inp)); &lea ($inp,&DWP(16*4,$inp)); &xorps ($inout1,&QWP(16*1,"esp")); &xorps ($inout2,$inout5); &xorps ($inout3,$inout4); &call ("_aesni_decrypt4"); &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak &xorps ($inout1,&QWP(16*1,"esp")); &xorps ($inout2,$inout5); &movups (&QWP(16*0,$out),$inout0); # write output &xorps ($inout3,$inout4); &movups (&QWP(16*1,$out),$inout1); &movups (&QWP(16*2,$out),$inout2); &movups (&QWP(16*3,$out),$inout3); &lea ($out,&DWP(16*4,$out)); &movdqa ($tweak,$inout4); # last tweak &jmp (&label("xts_dec_done")); &set_label("xts_dec_done6x",16); # $tweak is pre-calculated &mov ($len,&DWP(16*7+0,"esp")); # restore original $len &and ($len,15); &jz (&label("xts_dec_ret")); &mov (&DWP(16*7+0,"esp"),$len); # save $len%16 &jmp (&label("xts_dec_only_one_more")); &set_label("xts_dec_done",16); &mov ($len,&DWP(16*7+0,"esp")); # restore original $len &pxor ($twtmp,$twtmp); &and ($len,15); &jz (&label("xts_dec_ret")); &pcmpgtd($twtmp,$tweak); # broadcast upper bits &mov (&DWP(16*7+0,"esp"),$len); # save $len%16 &pshufd ($twres,$twtmp,0x13); &pxor ($twtmp,$twtmp); &movdqa ($twmask,&QWP(16*6,"esp")); &paddq ($tweak,$tweak); # &psllq($tweak,1); &pand ($twres,$twmask); # isolate carry and residue &pcmpgtd($twtmp,$tweak); # broadcast upper bits &pxor ($tweak,$twres); &set_label("xts_dec_only_one_more"); &pshufd ($inout3,$twtmp,0x13); &movdqa ($inout4,$tweak); # put aside previous tweak &paddq ($tweak,$tweak); # &psllq($tweak,1); &pand ($inout3,$twmask); # isolate carry and residue &pxor ($inout3,$tweak); &mov ($key,$key_); # restore $key &mov ($rounds,$rounds_); # restore $rounds &movups ($inout0,&QWP(0,$inp)); # load input &xorps ($inout0,$inout3); # input^=tweak if ($inline) { &aesni_inline_generate1("dec"); } else { &call ("_aesni_decrypt1"); } &xorps ($inout0,$inout3); # output^=tweak &movups (&QWP(0,$out),$inout0); # write output &set_label("xts_dec_steal"); &movz ($rounds,&BP(16,$inp)); &movz ($key,&BP(0,$out)); &lea ($inp,&DWP(1,$inp)); &mov (&BP(0,$out),&LB($rounds)); &mov (&BP(16,$out),&LB($key)); &lea ($out,&DWP(1,$out)); &sub ($len,1); &jnz (&label("xts_dec_steal")); &sub ($out,&DWP(16*7+0,"esp")); # rewind $out &mov ($key,$key_); # restore $key &mov ($rounds,$rounds_); # restore $rounds &movups ($inout0,&QWP(0,$out)); # load input &xorps ($inout0,$inout4); # input^=tweak if ($inline) { &aesni_inline_generate1("dec"); } else { &call ("_aesni_decrypt1"); } &xorps ($inout0,$inout4); # output^=tweak &movups (&QWP(0,$out),$inout0); # write output &set_label("xts_dec_ret"); &mov ("esp",&DWP(16*7+4,"esp")); # restore %esp &function_end("aesni_xts_decrypt"); } } ###################################################################### # void $PREFIX_cbc_encrypt (const void *inp, void *out, # size_t length, const AES_KEY *key, # unsigned char *ivp,const int enc); &function_begin("${PREFIX}_cbc_encrypt"); &mov ($inp,&wparam(0)); &mov ($rounds_,"esp"); &mov ($out,&wparam(1)); &sub ($rounds_,24); &mov ($len,&wparam(2)); &and ($rounds_,-16); &mov ($key,&wparam(3)); &mov ($key_,&wparam(4)); &test ($len,$len); &jz (&label("cbc_abort")); &cmp (&wparam(5),0); &xchg ($rounds_,"esp"); # alloca &movups ($ivec,&QWP(0,$key_)); # load IV &mov ($rounds,&DWP(240,$key)); &mov ($key_,$key); # backup $key &mov (&DWP(16,"esp"),$rounds_); # save original %esp &mov ($rounds_,$rounds); # backup $rounds &je (&label("cbc_decrypt")); &movaps ($inout0,$ivec); &cmp ($len,16); &jb (&label("cbc_enc_tail")); &sub ($len,16); &jmp (&label("cbc_enc_loop")); &set_label("cbc_enc_loop",16); &movups ($ivec,&QWP(0,$inp)); # input actually &lea ($inp,&DWP(16,$inp)); if ($inline) { &aesni_inline_generate1("enc",$inout0,$ivec); } else { &xorps($inout0,$ivec); &call("_aesni_encrypt1"); } &mov ($rounds,$rounds_); # restore $rounds &mov ($key,$key_); # restore $key &movups (&QWP(0,$out),$inout0); # store output &lea ($out,&DWP(16,$out)); &sub ($len,16); &jnc (&label("cbc_enc_loop")); &add ($len,16); &jnz (&label("cbc_enc_tail")); &movaps ($ivec,$inout0); &jmp (&label("cbc_ret")); &set_label("cbc_enc_tail"); &mov ("ecx",$len); # zaps $rounds &data_word(0xA4F3F689); # rep movsb &mov ("ecx",16); # zero tail &sub ("ecx",$len); &xor ("eax","eax"); # zaps $len &data_word(0xAAF3F689); # rep stosb &lea ($out,&DWP(-16,$out)); # rewind $out by 1 block &mov ($rounds,$rounds_); # restore $rounds &mov ($inp,$out); # $inp and $out are the same &mov ($key,$key_); # restore $key &jmp (&label("cbc_enc_loop")); ###################################################################### &set_label("cbc_decrypt",16); &cmp ($len,0x50); &jbe (&label("cbc_dec_tail")); &movaps (&QWP(0,"esp"),$ivec); # save IV &sub ($len,0x50); &jmp (&label("cbc_dec_loop6_enter")); &set_label("cbc_dec_loop6",16); &movaps (&QWP(0,"esp"),$rndkey0); # save IV &movups (&QWP(0,$out),$inout5); &lea ($out,&DWP(0x10,$out)); &set_label("cbc_dec_loop6_enter"); &movdqu ($inout0,&QWP(0,$inp)); &movdqu ($inout1,&QWP(0x10,$inp)); &movdqu ($inout2,&QWP(0x20,$inp)); &movdqu ($inout3,&QWP(0x30,$inp)); &movdqu ($inout4,&QWP(0x40,$inp)); &movdqu ($inout5,&QWP(0x50,$inp)); &call ("_aesni_decrypt6"); &movups ($rndkey1,&QWP(0,$inp)); &movups ($rndkey0,&QWP(0x10,$inp)); &xorps ($inout0,&QWP(0,"esp")); # ^=IV &xorps ($inout1,$rndkey1); &movups ($rndkey1,&QWP(0x20,$inp)); &xorps ($inout2,$rndkey0); &movups ($rndkey0,&QWP(0x30,$inp)); &xorps ($inout3,$rndkey1); &movups ($rndkey1,&QWP(0x40,$inp)); &xorps ($inout4,$rndkey0); &movups ($rndkey0,&QWP(0x50,$inp)); # IV &xorps ($inout5,$rndkey1); &movups (&QWP(0,$out),$inout0); &movups (&QWP(0x10,$out),$inout1); &lea ($inp,&DWP(0x60,$inp)); &movups (&QWP(0x20,$out),$inout2); &mov ($rounds,$rounds_); # restore $rounds &movups (&QWP(0x30,$out),$inout3); &mov ($key,$key_); # restore $key &movups (&QWP(0x40,$out),$inout4); &lea ($out,&DWP(0x50,$out)); &sub ($len,0x60); &ja (&label("cbc_dec_loop6")); &movaps ($inout0,$inout5); &movaps ($ivec,$rndkey0); &add ($len,0x50); &jle (&label("cbc_dec_tail_collected")); &movups (&QWP(0,$out),$inout0); &lea ($out,&DWP(0x10,$out)); &set_label("cbc_dec_tail"); &movups ($inout0,&QWP(0,$inp)); &movaps ($in0,$inout0); &cmp ($len,0x10); &jbe (&label("cbc_dec_one")); &movups ($inout1,&QWP(0x10,$inp)); &movaps ($in1,$inout1); &cmp ($len,0x20); &jbe (&label("cbc_dec_two")); &movups ($inout2,&QWP(0x20,$inp)); &cmp ($len,0x30); &jbe (&label("cbc_dec_three")); &movups ($inout3,&QWP(0x30,$inp)); &cmp ($len,0x40); &jbe (&label("cbc_dec_four")); &movups ($inout4,&QWP(0x40,$inp)); &movaps (&QWP(0,"esp"),$ivec); # save IV &movups ($inout0,&QWP(0,$inp)); &xorps ($inout5,$inout5); &call ("_aesni_decrypt6"); &movups ($rndkey1,&QWP(0,$inp)); &movups ($rndkey0,&QWP(0x10,$inp)); &xorps ($inout0,&QWP(0,"esp")); # ^= IV &xorps ($inout1,$rndkey1); &movups ($rndkey1,&QWP(0x20,$inp)); &xorps ($inout2,$rndkey0); &movups ($rndkey0,&QWP(0x30,$inp)); &xorps ($inout3,$rndkey1); &movups ($ivec,&QWP(0x40,$inp)); # IV &xorps ($inout4,$rndkey0); &movups (&QWP(0,$out),$inout0); &movups (&QWP(0x10,$out),$inout1); &movups (&QWP(0x20,$out),$inout2); &movups (&QWP(0x30,$out),$inout3); &lea ($out,&DWP(0x40,$out)); &movaps ($inout0,$inout4); &sub ($len,0x50); &jmp (&label("cbc_dec_tail_collected")); &set_label("cbc_dec_one",16); if ($inline) { &aesni_inline_generate1("dec"); } else { &call ("_aesni_decrypt1"); } &xorps ($inout0,$ivec); &movaps ($ivec,$in0); &sub ($len,0x10); &jmp (&label("cbc_dec_tail_collected")); &set_label("cbc_dec_two",16); &xorps ($inout2,$inout2); &call ("_aesni_decrypt3"); &xorps ($inout0,$ivec); &xorps ($inout1,$in0); &movups (&QWP(0,$out),$inout0); &movaps ($inout0,$inout1); &lea ($out,&DWP(0x10,$out)); &movaps ($ivec,$in1); &sub ($len,0x20); &jmp (&label("cbc_dec_tail_collected")); &set_label("cbc_dec_three",16); &call ("_aesni_decrypt3"); &xorps ($inout0,$ivec); &xorps ($inout1,$in0); &xorps ($inout2,$in1); &movups (&QWP(0,$out),$inout0); &movaps ($inout0,$inout2); &movups (&QWP(0x10,$out),$inout1); &lea ($out,&DWP(0x20,$out)); &movups ($ivec,&QWP(0x20,$inp)); &sub ($len,0x30); &jmp (&label("cbc_dec_tail_collected")); &set_label("cbc_dec_four",16); &call ("_aesni_decrypt4"); &movups ($rndkey1,&QWP(0x10,$inp)); &movups ($rndkey0,&QWP(0x20,$inp)); &xorps ($inout0,$ivec); &movups ($ivec,&QWP(0x30,$inp)); &xorps ($inout1,$in0); &movups (&QWP(0,$out),$inout0); &xorps ($inout2,$rndkey1); &movups (&QWP(0x10,$out),$inout1); &xorps ($inout3,$rndkey0); &movups (&QWP(0x20,$out),$inout2); &lea ($out,&DWP(0x30,$out)); &movaps ($inout0,$inout3); &sub ($len,0x40); &set_label("cbc_dec_tail_collected"); &and ($len,15); &jnz (&label("cbc_dec_tail_partial")); &movups (&QWP(0,$out),$inout0); &jmp (&label("cbc_ret")); &set_label("cbc_dec_tail_partial",16); &movaps (&QWP(0,"esp"),$inout0); &mov ("ecx",16); &mov ($inp,"esp"); &sub ("ecx",$len); &data_word(0xA4F3F689); # rep movsb &set_label("cbc_ret"); &mov ("esp",&DWP(16,"esp")); # pull original %esp &mov ($key_,&wparam(4)); &movups (&QWP(0,$key_),$ivec); # output IV &set_label("cbc_abort"); &function_end("${PREFIX}_cbc_encrypt"); ###################################################################### # Mechanical port from aesni-x86_64.pl. # # _aesni_set_encrypt_key is private interface, # input: # "eax" const unsigned char *userKey # $rounds int bits # $key AES_KEY *key # output: # "eax" return code # $round rounds &function_begin_B("_aesni_set_encrypt_key"); &test ("eax","eax"); &jz (&label("bad_pointer")); &test ($key,$key); &jz (&label("bad_pointer")); &movups ("xmm0",&QWP(0,"eax")); # pull first 128 bits of *userKey &xorps ("xmm4","xmm4"); # low dword of xmm4 is assumed 0 &lea ($key,&DWP(16,$key)); &cmp ($rounds,256); &je (&label("14rounds")); &cmp ($rounds,192); &je (&label("12rounds")); &cmp ($rounds,128); &jne (&label("bad_keybits")); &set_label("10rounds",16); &mov ($rounds,9); &$movekey (&QWP(-16,$key),"xmm0"); # round 0 &aeskeygenassist("xmm1","xmm0",0x01); # round 1 &call (&label("key_128_cold")); &aeskeygenassist("xmm1","xmm0",0x2); # round 2 &call (&label("key_128")); &aeskeygenassist("xmm1","xmm0",0x04); # round 3 &call (&label("key_128")); &aeskeygenassist("xmm1","xmm0",0x08); # round 4 &call (&label("key_128")); &aeskeygenassist("xmm1","xmm0",0x10); # round 5 &call (&label("key_128")); &aeskeygenassist("xmm1","xmm0",0x20); # round 6 &call (&label("key_128")); &aeskeygenassist("xmm1","xmm0",0x40); # round 7 &call (&label("key_128")); &aeskeygenassist("xmm1","xmm0",0x80); # round 8 &call (&label("key_128")); &aeskeygenassist("xmm1","xmm0",0x1b); # round 9 &call (&label("key_128")); &aeskeygenassist("xmm1","xmm0",0x36); # round 10 &call (&label("key_128")); &$movekey (&QWP(0,$key),"xmm0"); &mov (&DWP(80,$key),$rounds); &xor ("eax","eax"); &ret(); &set_label("key_128",16); &$movekey (&QWP(0,$key),"xmm0"); &lea ($key,&DWP(16,$key)); &set_label("key_128_cold"); &shufps ("xmm4","xmm0",0b00010000); &xorps ("xmm0","xmm4"); &shufps ("xmm4","xmm0",0b10001100); &xorps ("xmm0","xmm4"); &shufps ("xmm1","xmm1",0b11111111); # critical path &xorps ("xmm0","xmm1"); &ret(); &set_label("12rounds",16); &movq ("xmm2",&QWP(16,"eax")); # remaining 1/3 of *userKey &mov ($rounds,11); &$movekey (&QWP(-16,$key),"xmm0"); # round 0 &aeskeygenassist("xmm1","xmm2",0x01); # round 1,2 &call (&label("key_192a_cold")); &aeskeygenassist("xmm1","xmm2",0x02); # round 2,3 &call (&label("key_192b")); &aeskeygenassist("xmm1","xmm2",0x04); # round 4,5 &call (&label("key_192a")); &aeskeygenassist("xmm1","xmm2",0x08); # round 5,6 &call (&label("key_192b")); &aeskeygenassist("xmm1","xmm2",0x10); # round 7,8 &call (&label("key_192a")); &aeskeygenassist("xmm1","xmm2",0x20); # round 8,9 &call (&label("key_192b")); &aeskeygenassist("xmm1","xmm2",0x40); # round 10,11 &call (&label("key_192a")); &aeskeygenassist("xmm1","xmm2",0x80); # round 11,12 &call (&label("key_192b")); &$movekey (&QWP(0,$key),"xmm0"); &mov (&DWP(48,$key),$rounds); &xor ("eax","eax"); &ret(); &set_label("key_192a",16); &$movekey (&QWP(0,$key),"xmm0"); &lea ($key,&DWP(16,$key)); &set_label("key_192a_cold",16); &movaps ("xmm5","xmm2"); &set_label("key_192b_warm"); &shufps ("xmm4","xmm0",0b00010000); &movdqa ("xmm3","xmm2"); &xorps ("xmm0","xmm4"); &shufps ("xmm4","xmm0",0b10001100); &pslldq ("xmm3",4); &xorps ("xmm0","xmm4"); &pshufd ("xmm1","xmm1",0b01010101); # critical path &pxor ("xmm2","xmm3"); &pxor ("xmm0","xmm1"); &pshufd ("xmm3","xmm0",0b11111111); &pxor ("xmm2","xmm3"); &ret(); &set_label("key_192b",16); &movaps ("xmm3","xmm0"); &shufps ("xmm5","xmm0",0b01000100); &$movekey (&QWP(0,$key),"xmm5"); &shufps ("xmm3","xmm2",0b01001110); &$movekey (&QWP(16,$key),"xmm3"); &lea ($key,&DWP(32,$key)); &jmp (&label("key_192b_warm")); &set_label("14rounds",16); &movups ("xmm2",&QWP(16,"eax")); # remaining half of *userKey &mov ($rounds,13); &lea ($key,&DWP(16,$key)); &$movekey (&QWP(-32,$key),"xmm0"); # round 0 &$movekey (&QWP(-16,$key),"xmm2"); # round 1 &aeskeygenassist("xmm1","xmm2",0x01); # round 2 &call (&label("key_256a_cold")); &aeskeygenassist("xmm1","xmm0",0x01); # round 3 &call (&label("key_256b")); &aeskeygenassist("xmm1","xmm2",0x02); # round 4 &call (&label("key_256a")); &aeskeygenassist("xmm1","xmm0",0x02); # round 5 &call (&label("key_256b")); &aeskeygenassist("xmm1","xmm2",0x04); # round 6 &call (&label("key_256a")); &aeskeygenassist("xmm1","xmm0",0x04); # round 7 &call (&label("key_256b")); &aeskeygenassist("xmm1","xmm2",0x08); # round 8 &call (&label("key_256a")); &aeskeygenassist("xmm1","xmm0",0x08); # round 9 &call (&label("key_256b")); &aeskeygenassist("xmm1","xmm2",0x10); # round 10 &call (&label("key_256a")); &aeskeygenassist("xmm1","xmm0",0x10); # round 11 &call (&label("key_256b")); &aeskeygenassist("xmm1","xmm2",0x20); # round 12 &call (&label("key_256a")); &aeskeygenassist("xmm1","xmm0",0x20); # round 13 &call (&label("key_256b")); &aeskeygenassist("xmm1","xmm2",0x40); # round 14 &call (&label("key_256a")); &$movekey (&QWP(0,$key),"xmm0"); &mov (&DWP(16,$key),$rounds); &xor ("eax","eax"); &ret(); &set_label("key_256a",16); &$movekey (&QWP(0,$key),"xmm2"); &lea ($key,&DWP(16,$key)); &set_label("key_256a_cold"); &shufps ("xmm4","xmm0",0b00010000); &xorps ("xmm0","xmm4"); &shufps ("xmm4","xmm0",0b10001100); &xorps ("xmm0","xmm4"); &shufps ("xmm1","xmm1",0b11111111); # critical path &xorps ("xmm0","xmm1"); &ret(); &set_label("key_256b",16); &$movekey (&QWP(0,$key),"xmm0"); &lea ($key,&DWP(16,$key)); &shufps ("xmm4","xmm2",0b00010000); &xorps ("xmm2","xmm4"); &shufps ("xmm4","xmm2",0b10001100); &xorps ("xmm2","xmm4"); &shufps ("xmm1","xmm1",0b10101010); # critical path &xorps ("xmm2","xmm1"); &ret(); &set_label("bad_pointer",4); &mov ("eax",-1); &ret (); &set_label("bad_keybits",4); &mov ("eax",-2); &ret (); &function_end_B("_aesni_set_encrypt_key"); # int $PREFIX_set_encrypt_key (const unsigned char *userKey, int bits, # AES_KEY *key) &function_begin_B("${PREFIX}_set_encrypt_key"); &mov ("eax",&wparam(0)); &mov ($rounds,&wparam(1)); &mov ($key,&wparam(2)); &call ("_aesni_set_encrypt_key"); &ret (); &function_end_B("${PREFIX}_set_encrypt_key"); # int $PREFIX_set_decrypt_key (const unsigned char *userKey, int bits, # AES_KEY *key) &function_begin_B("${PREFIX}_set_decrypt_key"); &mov ("eax",&wparam(0)); &mov ($rounds,&wparam(1)); &mov ($key,&wparam(2)); &call ("_aesni_set_encrypt_key"); &mov ($key,&wparam(2)); &shl ($rounds,4); # rounds-1 after _aesni_set_encrypt_key &test ("eax","eax"); &jnz (&label("dec_key_ret")); &lea ("eax",&DWP(16,$key,$rounds)); # end of key schedule &$movekey ("xmm0",&QWP(0,$key)); # just swap &$movekey ("xmm1",&QWP(0,"eax")); &$movekey (&QWP(0,"eax"),"xmm0"); &$movekey (&QWP(0,$key),"xmm1"); &lea ($key,&DWP(16,$key)); &lea ("eax",&DWP(-16,"eax")); &set_label("dec_key_inverse"); &$movekey ("xmm0",&QWP(0,$key)); # swap and inverse &$movekey ("xmm1",&QWP(0,"eax")); &aesimc ("xmm0","xmm0"); &aesimc ("xmm1","xmm1"); &lea ($key,&DWP(16,$key)); &lea ("eax",&DWP(-16,"eax")); &$movekey (&QWP(16,"eax"),"xmm0"); &$movekey (&QWP(-16,$key),"xmm1"); &cmp ("eax",$key); &ja (&label("dec_key_inverse")); &$movekey ("xmm0",&QWP(0,$key)); # inverse middle &aesimc ("xmm0","xmm0"); &$movekey (&QWP(0,$key),"xmm0"); &xor ("eax","eax"); # return success &set_label("dec_key_ret"); &ret (); &function_end_B("${PREFIX}_set_decrypt_key"); &asciz("AES for Intel AES-NI, CRYPTOGAMS by "); &asm_finish();