#!/usr/bin/env perl
# ====================================================================
# Written by Andy Polyakov for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# September 2011
#
# Assembler helpers for Padlock engine. See even e_padlock-x86.pl for
# details.

# Positional arguments: perlasm "flavour" (elf, macosx, mingw64, nasm,
# ...) and output file name. A single argument containing a dot is
# taken to be the output file, with no flavour given.
$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# Win64 ABI is selected by flavour or by an .asm output extension.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate the x86_64-xlate.pl translator either beside this script or in
# the perlasm directory, then pipe everything we print through it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../crypto/perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open STDOUT,"| \"$^X\" $xlate $flavour $output";

$code=".text\n";

# Bytes of look-ahead per mode needed to dodge the Padlock prefetch
# erratum; modes absent from this hash need no workaround.
%PADLOCK_PREFETCH=(ecb=>128, cbc=>64, ctr32=>32);	# prefetch errata
$PADLOCK_CHUNK=512;	# Must be a power of 2 between 32 and 2^20

# Register aliases used throughout the emitted assembly.
$ctx="%rdx";
$out="%rdi";
$inp="%rsi";
$len="%rcx";
$chunk="%rbx";

# First four integer-argument registers per platform calling convention.
($arg1,$arg2,$arg3,$arg4)=$win64?("%rcx","%rdx","%r8", "%r9") :	# Win64 order
                                 ("%rdi","%rsi","%rdx","%rcx");	# Unix order

# Fixed helper routines: capability probe via CPUID (checks for the
# "CentaurHauls" vendor string), key byte-swapping, context-ownership
# tracking around .Lpadlock_saved_context, single-block AES, XSTORE RNG
# access, and the XSHA1/XSHA256/XSHA512 hashing primitives.
$code.=<<___;
.globl	padlock_capability
.type	padlock_capability,\@abi-omnipotent
.align	16
padlock_capability:
	mov	%rbx,%r8
	xor	%eax,%eax
	cpuid
	xor	%eax,%eax
	cmp	\$`"0x".unpack("H*",'tneC')`,%ebx
	jne	.Lnoluck
	cmp	\$`"0x".unpack("H*",'Hrua')`,%edx
	jne	.Lnoluck
	cmp	\$`"0x".unpack("H*",'slua')`,%ecx
	jne	.Lnoluck
	mov	\$0xC0000000,%eax
	cpuid
	mov	%eax,%edx
	xor	%eax,%eax
	cmp	\$0xC0000001,%edx
	jb	.Lnoluck
	mov	\$0xC0000001,%eax
	cpuid
	mov	%edx,%eax
	and	\$0xffffffef,%eax
	or	\$0x10,%eax		# set Nano bit#4
.Lnoluck:
	mov	%r8,%rbx
	ret
.size	padlock_capability,.-padlock_capability

.globl	padlock_key_bswap
.type	padlock_key_bswap,\@abi-omnipotent,0
.align	16
padlock_key_bswap:
	mov	240($arg1),%edx
.Lbswap_loop:
	mov	($arg1),%eax
	bswap	%eax
	mov	%eax,($arg1)
	lea	4($arg1),$arg1
	sub	\$1,%edx
	jnz	.Lbswap_loop
	ret
.size	padlock_key_bswap,.-padlock_key_bswap

.globl	padlock_verify_context
.type	padlock_verify_context,\@abi-omnipotent
.align	16
padlock_verify_context:
	mov	$arg1,$ctx
	pushf
	lea	.Lpadlock_saved_context(%rip),%rax
	call	_padlock_verify_ctx
	lea	8(%rsp),%rsp
	ret
.size	padlock_verify_context,.-padlock_verify_context

.type	_padlock_verify_ctx,\@abi-omnipotent
.align	16
_padlock_verify_ctx:
	mov	8(%rsp),%r8
	bt	\$30,%r8
	jnc	.Lverified
	cmp	(%rax),$ctx
	je	.Lverified
	pushf
	popf
.Lverified:
	mov	$ctx,(%rax)
	ret
.size	_padlock_verify_ctx,.-_padlock_verify_ctx

.globl	padlock_reload_key
.type	padlock_reload_key,\@abi-omnipotent
.align	16
padlock_reload_key:
	pushf
	popf
	ret
.size	padlock_reload_key,.-padlock_reload_key

.globl	padlock_aes_block
.type	padlock_aes_block,\@function,3
.align	16
padlock_aes_block:
	mov	%rbx,%r8
	mov	\$1,$len
	lea	32($ctx),%rbx		# key
	lea	16($ctx),$ctx		# control word
	.byte	0xf3,0x0f,0xa7,0xc8	# rep xcryptecb
	mov	%r8,%rbx
	ret
.size	padlock_aes_block,.-padlock_aes_block

.globl	padlock_xstore
.type	padlock_xstore,\@function,2
.align	16
padlock_xstore:
	mov	%esi,%edx
	.byte	0x0f,0xa7,0xc0		# xstore
	ret
.size	padlock_xstore,.-padlock_xstore

.globl	padlock_sha1_oneshot
.type	padlock_sha1_oneshot,\@function,3
.align	16
padlock_sha1_oneshot:
	mov	%rdx,%rcx
	mov	%rdi,%rdx		# put aside %rdi
	movups	(%rdi),%xmm0		# copy-in context
	sub	\$128+8,%rsp
	mov	16(%rdi),%eax
	movaps	%xmm0,(%rsp)
	mov	%rsp,%rdi
	mov	%eax,16(%rsp)
	xor	%rax,%rax
	.byte	0xf3,0x0f,0xa6,0xc8	# rep xsha1
	movaps	(%rsp),%xmm0
	mov	16(%rsp),%eax
	add	\$128+8,%rsp
	movups	%xmm0,(%rdx)		# copy-out context
	mov	%eax,16(%rdx)
	ret
.size	padlock_sha1_oneshot,.-padlock_sha1_oneshot

.globl	padlock_sha1_blocks
.type	padlock_sha1_blocks,\@function,3
.align	16
padlock_sha1_blocks:
	mov	%rdx,%rcx
	mov	%rdi,%rdx		# put aside %rdi
	movups	(%rdi),%xmm0		# copy-in context
	sub	\$128+8,%rsp
	mov	16(%rdi),%eax
	movaps	%xmm0,(%rsp)
	mov	%rsp,%rdi
	mov	%eax,16(%rsp)
	mov	\$-1,%rax
	.byte	0xf3,0x0f,0xa6,0xc8	# rep xsha1
	movaps	(%rsp),%xmm0
	mov	16(%rsp),%eax
	add	\$128+8,%rsp
	movups	%xmm0,(%rdx)		# copy-out context
	mov	%eax,16(%rdx)
	ret
.size	padlock_sha1_blocks,.-padlock_sha1_blocks

.globl	padlock_sha256_oneshot
.type	padlock_sha256_oneshot,\@function,3
.align	16
padlock_sha256_oneshot:
	mov	%rdx,%rcx
	mov	%rdi,%rdx		# put aside %rdi
	movups	(%rdi),%xmm0		# copy-in context
	sub	\$128+8,%rsp
	movups	16(%rdi),%xmm1
	movaps	%xmm0,(%rsp)
	mov	%rsp,%rdi
	movaps	%xmm1,16(%rsp)
	xor	%rax,%rax
	.byte	0xf3,0x0f,0xa6,0xd0	# rep xsha256
	movaps	(%rsp),%xmm0
	movaps	16(%rsp),%xmm1
	add	\$128+8,%rsp
	movups	%xmm0,(%rdx)		# copy-out context
	movups	%xmm1,16(%rdx)
	ret
.size	padlock_sha256_oneshot,.-padlock_sha256_oneshot

.globl	padlock_sha256_blocks
.type	padlock_sha256_blocks,\@function,3
.align	16
padlock_sha256_blocks:
	mov	%rdx,%rcx
	mov	%rdi,%rdx		# put aside %rdi
	movups	(%rdi),%xmm0		# copy-in context
	sub	\$128+8,%rsp
	movups	16(%rdi),%xmm1
	movaps	%xmm0,(%rsp)
	mov	%rsp,%rdi
	movaps	%xmm1,16(%rsp)
	mov	\$-1,%rax
	.byte	0xf3,0x0f,0xa6,0xd0	# rep xsha256
	movaps	(%rsp),%xmm0
	movaps	16(%rsp),%xmm1
	add	\$128+8,%rsp
	movups	%xmm0,(%rdx)		# copy-out context
	movups	%xmm1,16(%rdx)
	ret
.size	padlock_sha256_blocks,.-padlock_sha256_blocks

.globl	padlock_sha512_blocks
.type	padlock_sha512_blocks,\@function,3
.align	16
padlock_sha512_blocks:
	mov	%rdx,%rcx
	mov	%rdi,%rdx		# put aside %rdi
	movups	(%rdi),%xmm0		# copy-in context
	sub	\$128+8,%rsp
	movups	16(%rdi),%xmm1
	movups	32(%rdi),%xmm2
	movups	48(%rdi),%xmm3
	movaps	%xmm0,(%rsp)
	mov	%rsp,%rdi
	movaps	%xmm1,16(%rsp)
	movaps	%xmm2,32(%rsp)
	movaps	%xmm3,48(%rsp)
	.byte	0xf3,0x0f,0xa6,0xe0	# rep xha512
	movaps	(%rsp),%xmm0
	movaps	16(%rsp),%xmm1
	movaps	32(%rsp),%xmm2
	movaps	48(%rsp),%xmm3
	add	\$128+8,%rsp
	movups	%xmm0,(%rdx)		# copy-out context
	movups	%xmm1,16(%rdx)
	movups	%xmm2,32(%rdx)
	movups	%xmm3,48(%rdx)
	ret
.size	padlock_sha512_blocks,.-padlock_sha512_blocks
___

# Emit padlock_${mode}_encrypt for one xcrypt $opcode.
# NOTE(review): this sub appears truncated in this copy of the file --
# the main processing loops, the aligned path, the sub's closing brace
# and the per-mode generate_mode() invocations are all missing between
# the prefetch-mask computation and the .data stanza below. Recover the
# missing text from the upstream e_padlock-x86_64.pl before relying on
# this file.
sub generate_mode {
my ($mode,$opcode) = @_;
# int padlock_$mode_encrypt(void *out, const void *inp,
#		struct padlock_cipher_data *ctx, size_t len);
$code.=<<___;
.globl	padlock_${mode}_encrypt
.type	padlock_${mode}_encrypt,\@function,4
.align	16
padlock_${mode}_encrypt:
	push	%rbp
	push	%rbx

	xor	%eax,%eax
	test	\$15,$ctx
	jnz	.L${mode}_abort
	test	\$15,$len
	jnz	.L${mode}_abort
	lea	.Lpadlock_saved_context(%rip),%rax
	pushf
	cld
	call	_padlock_verify_ctx
	lea	16($ctx),$ctx		# control word
	xor	%eax,%eax
	xor	%ebx,%ebx
	testl	\$`1<<5`,($ctx)		# align bit in control word
	jnz	.L${mode}_aligned
	test	\$0x0f,$out
	setz	%al			# !out_misaligned
	test	\$0x0f,$inp
	setz	%bl			# !inp_misaligned
	test	%ebx,%eax
	jnz	.L${mode}_aligned
	neg	%rax
	mov	\$$PADLOCK_CHUNK,$chunk
	not	%rax			# out_misaligned?-1:0
	lea	(%rsp),%rbp
	cmp	$chunk,$len
	cmovc	$len,$chunk		# chunk=len>PADLOCK_CHUNK?PADLOCK_CHUNK:len
	and	$chunk,%rax		# out_misaligned?chunk:0
	mov	$len,$chunk
	neg	%rax
	and	\$$PADLOCK_CHUNK-1,$chunk	# chunk%=PADLOCK_CHUNK
	lea	(%rax,%rbp),%rsp
	mov	\$$PADLOCK_CHUNK,%rax
	cmovz	%rax,$chunk			# chunk=chunk?:PADLOCK_CHUNK
___
# ctr32 only: cap the chunk so the 32-bit counter never wraps within
# one xcrypt invocation.
$code.=<<___				if ($mode eq "ctr32");
.L${mode}_reenter:
	mov	-4($ctx),%eax		# pull 32-bit counter
	bswap	%eax
	neg	%eax
	and	\$`$PADLOCK_CHUNK/16-1`,%eax
	mov	\$$PADLOCK_CHUNK,$chunk
	shl	\$4,%eax
	cmovz	$chunk,%rax
	cmp	%rax,$len
	cmova	%rax,$chunk		# don't let counter cross PADLOCK_CHUNK
	cmovbe	$len,$chunk
___
# Prefetch-erratum workaround for the modes listed in %PADLOCK_PREFETCH:
# if the hardware look-ahead would cross a page boundary, adjust.
$code.=<<___				if ($PADLOCK_PREFETCH{$mode});
	cmp	$chunk,$len
	ja	.L${mode}_loop
	mov	$inp,%rax		# check if prefetch crosses page
	cmp	%rsp,%rbp
	cmove	$out,%rax
	add	$len,%rax
	neg	%rax
	and	\$0xfff,%rax		# distance to page boundary
	cmp	\$$PADLOCK_PREFETCH{$mode},%rax
	mov	\$-$PADLOCK_PREFETCH{$mode},%rax
	cmovae	$chunk,%rax		# mask=distance"
.align	16
.data
.align	8
.Lpadlock_saved_context:
.quad	0
___

# NOTE(review): as a consequence of the truncation flagged above, the
# heredoc terminated here swallows the .data stanza and generate_mode is
# never closed; this file will not parse as-is.

# Expand backtick-quoted Perl expressions embedded in the assembly text.
$code =~ s/\`([^\`]*)\`/eval($1)/gem;

print $code;

close STDOUT;