From d64a7232d4c5def7ba9d0b089df71962538d558f Mon Sep 17 00:00:00 2001 From: Andy Polyakov Date: Mon, 27 Apr 2009 05:55:13 +0000 Subject: [PATCH] Intel AES-NI engine. Submitted by: Huang Ying --- Configure | 7 +- TABLE | 86 +-- crypto/aes/Makefile | 4 + crypto/aes/asm/aesni-x86.pl | 663 +++++++++++++++++++++++ crypto/aes/asm/aesni-x86_64.pl | 963 +++++++++++++++++++++++++++++++++ crypto/engine/Makefile | 6 +- crypto/engine/eng_aesni.c | 402 ++++++++++++++ crypto/engine/eng_all.c | 3 + crypto/engine/engine.h | 1 + 9 files changed, 2087 insertions(+), 48 deletions(-) create mode 100644 crypto/aes/asm/aesni-x86.pl create mode 100644 crypto/aes/asm/aesni-x86_64.pl create mode 100644 crypto/engine/eng_aesni.c diff --git a/Configure b/Configure index 89142eb22e..2c28be4fb7 100755 --- a/Configure +++ b/Configure @@ -121,11 +121,11 @@ my $tlib="-lnsl -lsocket"; my $bits1="THIRTY_TWO_BIT "; my $bits2="SIXTY_FOUR_BIT "; -my $x86_asm="x86cpuid.o:bn-586.o co-586.o x86-mont.o:des-586.o crypt586.o:aes-586.o:bf-586.o:md5-586.o:sha1-586.o sha256-586.o sha512-586.o:cast-586.o:rc4-586.o:rmd-586.o:rc5-586.o:wp_block.o wp-mmx.o:cmll-x86.o"; +my $x86_asm="x86cpuid.o:bn-586.o co-586.o x86-mont.o:des-586.o crypt586.o:aes-586.o aesni-x86.o:bf-586.o:md5-586.o:sha1-586.o sha256-586.o sha512-586.o:cast-586.o:rc4-586.o:rmd-586.o:rc5-586.o:wp_block.o wp-mmx.o:cmll-x86.o"; my $x86_elf_asm="$x86_asm:elf"; -my $x86_64_asm="x86_64cpuid.o:x86_64-gcc.o x86_64-mont.o::aes-x86_64.o::md5-x86_64.o:sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o::rc4-x86_64.o:::wp-x86_64.o:cmll-x86_64.o cmll_misc.o"; +my $x86_64_asm="x86_64cpuid.o:x86_64-gcc.o x86_64-mont.o::aes-x86_64.o aesni-x86_64.o::md5-x86_64.o:sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o::rc4-x86_64.o:::wp-x86_64.o:cmll-x86_64.o cmll_misc.o"; my $ia64_asm="ia64cpuid.o:bn-ia64.o::aes_core.o aes_cbc.o aes-ia64.o::md5-ia64.o:sha1-ia64.o sha256-ia64.o sha512-ia64.o::rc4-ia64.o rc4_skey.o:::::void"; my $sparcv9_asm="sparcv9cap.o sparccpuid.o:bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o:des_enc-sparc.o fcrypt_b.o:aes_core.o aes_cbc.o aes-sparcv9.o:::sha1-sparcv9.o sha256-sparcv9.o sha512-sparcv9.o:::::::void"; my $sparcv8_asm=":sparcv8.o:des_enc-sparc.o fcrypt_b.o:::::::::::void"; @@ -485,7 +485,7 @@ my %table=( # # Win64 targets, WIN64I denotes IA-64 and WIN64A - AMD64 "VC-WIN64I","cl:-W3 -Gs0 -Gy -nologo -DOPENSSL_SYSNAME_WIN32 -DWIN32_LEAN_AND_MEAN -DL_ENDIAN -DUNICODE -D_UNICODE -D_CRT_SECURE_NO_DEPRECATE -D_CRT_NONSTDC_NO_DEPRECATE:::WIN64I::SIXTY_FOUR_BIT RC4_CHUNK_LL DES_INT EXPORT_VAR_AS_FN:ia64cpuid.o:ia64.o::aes_core.o aes_cbc.o aes-ia64.o::md5-ia64.o:sha1-ia64.o sha256-ia64.o sha512-ia64.o:::::::ias:win32", -"VC-WIN64A","cl:-W3 -Gs0 -Gy -nologo -DOPENSSL_SYSNAME_WIN32 -DWIN32_LEAN_AND_MEAN -DL_ENDIAN -DUNICODE -D_UNICODE -D_CRT_SECURE_NO_DEPRECATE -D_CRT_NONSTDC_NO_DEPRECATE:::WIN64A::SIXTY_FOUR_BIT RC4_CHUNK_LL DES_INT EXPORT_VAR_AS_FN:x86_64cpuid.o:bn_asm.o x86_64-mont.o::aes-x86_64.o::md5-x86_64.o:sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o::rc4-x86_64.o:::wp-x86_64.o::ml64:win32", +"VC-WIN64A","cl:-W3 -Gs0 -Gy -nologo -DOPENSSL_SYSNAME_WIN32 -DWIN32_LEAN_AND_MEAN -DL_ENDIAN -DUNICODE -D_UNICODE -D_CRT_SECURE_NO_DEPRECATE -D_CRT_NONSTDC_NO_DEPRECATE:::WIN64A::SIXTY_FOUR_BIT RC4_CHUNK_LL DES_INT EXPORT_VAR_AS_FN:x86_64cpuid.o:bn_asm.o x86_64-mont.o::aes-x86_64.o aesni-x86_64.o::md5-x86_64.o:sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o::rc4-x86_64.o:::wp-x86_64.o::ml64:win32", # x86 Win32 target defaults to ANSI API, if you want UNICODE, complement # 'perl Configure 
VC-WIN32' with '-DUNICODE -D_UNICODE' "VC-WIN32","cl:-W3 -WX -Gs0 -GF -Gy -nologo -DOPENSSL_SYSNAME_WIN32 -DWIN32_LEAN_AND_MEAN -DL_ENDIAN -D_CRT_SECURE_NO_DEPRECATE -D_CRT_NONSTDC_NO_DEPRECATE:::WIN32::BN_LLONG RC4_INDEX EXPORT_VAR_AS_FN ${x86_gcc_opts}:${x86_asm}:win32n:win32", @@ -1370,6 +1370,7 @@ if ($rmd160_obj =~ /\.o$/) if ($aes_obj =~ /\.o$/) { $cflags.=" -DAES_ASM"; + $aes_obj =~ s/\s*aesni\-x86\.o// if ($no_sse2); } else { $aes_obj=$aes_enc; diff --git a/TABLE b/TABLE index 6205386776..e4325459e9 100644 --- a/TABLE +++ b/TABLE @@ -228,7 +228,7 @@ $bn_ops = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT $cpuid_obj = x86cpuid.o $bn_obj = bn-586.o co-586.o x86-mont.o $des_obj = des-586.o crypt586.o -$aes_obj = aes-586.o +$aes_obj = aes-586.o aesni-x86.o $bf_obj = bf-586.o $md5_obj = md5-586.o $sha1_obj = sha1-586.o sha256-586.o sha512-586.o @@ -259,7 +259,7 @@ $bn_ops = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT $cpuid_obj = x86cpuid.o $bn_obj = bn-586.o co-586.o x86-mont.o $des_obj = des-586.o crypt586.o -$aes_obj = aes-586.o +$aes_obj = aes-586.o aesni-x86.o $bf_obj = bf-586.o $md5_obj = md5-586.o $sha1_obj = sha1-586.o sha256-586.o sha512-586.o @@ -290,7 +290,7 @@ $bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL $cpuid_obj = x86_64cpuid.o $bn_obj = x86_64-gcc.o x86_64-mont.o $des_obj = -$aes_obj = aes-x86_64.o +$aes_obj = aes-x86_64.o aesni-x86_64.o $bf_obj = $md5_obj = md5-x86_64.o $sha1_obj = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o @@ -321,7 +321,7 @@ $bn_ops = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT $cpuid_obj = x86cpuid.o $bn_obj = bn-586.o co-586.o x86-mont.o $des_obj = des-586.o crypt586.o -$aes_obj = aes-586.o +$aes_obj = aes-586.o aesni-x86.o $bf_obj = bf-586.o $md5_obj = md5-586.o $sha1_obj = sha1-586.o sha256-586.o sha512-586.o @@ -383,7 +383,7 @@ $bn_ops = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT $cpuid_obj = x86cpuid.o $bn_obj = bn-586.o co-586.o x86-mont.o $des_obj = des-586.o crypt586.o -$aes_obj = aes-586.o +$aes_obj = aes-586.o aesni-x86.o $bf_obj = bf-586.o $md5_obj = md5-586.o $sha1_obj = sha1-586.o sha256-586.o sha512-586.o @@ -653,7 +653,7 @@ $multilib = *** VC-WIN32 $cc = cl -$cflags = -W3 -WX -Gs0 -GF -Gy -nologo -DOPENSSL_SYSNAME_WIN32 -DWIN32_LEAN_AND_MEAN -DL_ENDIAN -D_CRT_SECURE_NO_DEPRECATE +$cflags = -W3 -WX -Gs0 -GF -Gy -nologo -DOPENSSL_SYSNAME_WIN32 -DWIN32_LEAN_AND_MEAN -DL_ENDIAN -D_CRT_SECURE_NO_DEPRECATE -D_CRT_NONSTDC_NO_DEPRECATE $unistd = $thread_cflag = $sys_id = WIN32 @@ -662,7 +662,7 @@ $bn_ops = BN_LLONG RC4_INDEX EXPORT_VAR_AS_FN RC4_INDEX MD2_INT $cpuid_obj = x86cpuid.o $bn_obj = bn-586.o co-586.o x86-mont.o $des_obj = des-586.o crypt586.o -$aes_obj = aes-586.o +$aes_obj = aes-586.o aesni-x86.o $bf_obj = bf-586.o $md5_obj = md5-586.o $sha1_obj = sha1-586.o sha256-586.o sha512-586.o @@ -684,7 +684,7 @@ $multilib = *** VC-WIN64A $cc = cl -$cflags = -W3 -Gs0 -Gy -nologo -DOPENSSL_SYSNAME_WIN32 -DWIN32_LEAN_AND_MEAN -DL_ENDIAN -DUNICODE -D_UNICODE -D_CRT_SECURE_NO_DEPRECATE +$cflags = -W3 -Gs0 -Gy -nologo -DOPENSSL_SYSNAME_WIN32 -DWIN32_LEAN_AND_MEAN -DL_ENDIAN -DUNICODE -D_UNICODE -D_CRT_SECURE_NO_DEPRECATE -D_CRT_NONSTDC_NO_DEPRECATE $unistd = $thread_cflag = $sys_id = WIN64A @@ -693,7 +693,7 @@ $bn_ops = SIXTY_FOUR_BIT RC4_CHUNK_LL DES_INT EXPORT_VAR_AS_FN $cpuid_obj = x86_64cpuid.o $bn_obj = bn_asm.o x86_64-mont.o $des_obj = -$aes_obj = aes-x86_64.o +$aes_obj = aes-x86_64.o aesni-x86_64.o $bf_obj = $md5_obj = md5-x86_64.o $sha1_obj = sha1-x86_64.o sha256-x86_64.o 
sha512-x86_64.o @@ -715,7 +715,7 @@ $multilib = *** VC-WIN64I $cc = cl -$cflags = -W3 -Gs0 -Gy -nologo -DOPENSSL_SYSNAME_WIN32 -DWIN32_LEAN_AND_MEAN -DL_ENDIAN -DUNICODE -D_UNICODE -D_CRT_SECURE_NO_DEPRECATE +$cflags = -W3 -Gs0 -Gy -nologo -DOPENSSL_SYSNAME_WIN32 -DWIN32_LEAN_AND_MEAN -DL_ENDIAN -DUNICODE -D_UNICODE -D_CRT_SECURE_NO_DEPRECATE -D_CRT_NONSTDC_NO_DEPRECATE $unistd = $thread_cflag = $sys_id = WIN64I @@ -941,7 +941,7 @@ $bn_ops = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT $cpuid_obj = x86cpuid.o $bn_obj = bn-586.o co-586.o x86-mont.o $des_obj = des-586.o crypt586.o -$aes_obj = aes-586.o +$aes_obj = aes-586.o aesni-x86.o $bf_obj = bf-586.o $md5_obj = md5-586.o $sha1_obj = sha1-586.o sha256-586.o sha512-586.o @@ -972,7 +972,7 @@ $bn_ops = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT $cpuid_obj = x86cpuid.o $bn_obj = bn-586.o co-586.o x86-mont.o $des_obj = des-586.o crypt586.o -$aes_obj = aes-586.o +$aes_obj = aes-586.o aesni-x86.o $bf_obj = bf-586.o $md5_obj = md5-586.o $sha1_obj = sha1-586.o sha256-586.o sha512-586.o @@ -1003,7 +1003,7 @@ $bn_ops = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT $cpuid_obj = x86cpuid.o $bn_obj = bn-586.o co-586.o x86-mont.o $des_obj = des-586.o crypt586.o -$aes_obj = aes-586.o +$aes_obj = aes-586.o aesni-x86.o $bf_obj = bf-586.o $md5_obj = md5-586.o $sha1_obj = sha1-586.o sha256-586.o sha512-586.o @@ -1127,7 +1127,7 @@ $bn_ops = BN_LLONG RC4_INT RC4_CHUNK DES_UNROLL BF_PTR $cpuid_obj = x86cpuid.o $bn_obj = bn-586.o co-586.o x86-mont.o $des_obj = des-586.o crypt586.o -$aes_obj = aes-586.o +$aes_obj = aes-586.o aesni-x86.o $bf_obj = bf-586.o $md5_obj = md5-586.o $sha1_obj = sha1-586.o sha256-586.o sha512-586.o @@ -1220,7 +1220,7 @@ $bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK BF_PTR2 DES_INT DES_UNROL $cpuid_obj = x86_64cpuid.o $bn_obj = x86_64-gcc.o x86_64-mont.o $des_obj = -$aes_obj = aes-x86_64.o +$aes_obj = aes-x86_64.o aesni-x86_64.o $bf_obj = $md5_obj = md5-x86_64.o $sha1_obj = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o @@ -1282,7 +1282,7 @@ $bn_ops = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT $cpuid_obj = x86cpuid.o $bn_obj = bn-586.o co-586.o x86-mont.o $des_obj = des-586.o crypt586.o -$aes_obj = aes-586.o +$aes_obj = aes-586.o aesni-x86.o $bf_obj = bf-586.o $md5_obj = md5-586.o $sha1_obj = sha1-586.o sha256-586.o sha512-586.o @@ -1530,7 +1530,7 @@ $bn_ops = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT $cpuid_obj = x86cpuid.o $bn_obj = bn-586.o co-586.o x86-mont.o $des_obj = des-586.o crypt586.o -$aes_obj = aes-586.o +$aes_obj = aes-586.o aesni-x86.o $bf_obj = bf-586.o $md5_obj = md5-586.o $sha1_obj = sha1-586.o sha256-586.o sha512-586.o @@ -1561,7 +1561,7 @@ $bn_ops = BN_LLONG RC4_INT RC4_CHUNK DES_UNROLL BF_PTR $cpuid_obj = x86cpuid.o $bn_obj = bn-586.o co-586.o x86-mont.o $des_obj = des-586.o crypt586.o -$aes_obj = aes-586.o +$aes_obj = aes-586.o aesni-x86.o $bf_obj = bf-586.o $md5_obj = md5-586.o $sha1_obj = sha1-586.o sha256-586.o sha512-586.o @@ -1685,7 +1685,7 @@ $bn_ops = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT $cpuid_obj = x86cpuid.o $bn_obj = bn-586.o co-586.o x86-mont.o $des_obj = des-586.o crypt586.o -$aes_obj = aes-586.o +$aes_obj = aes-586.o aesni-x86.o $bf_obj = bf-586.o $md5_obj = md5-586.o $sha1_obj = sha1-586.o sha256-586.o sha512-586.o @@ -1716,7 +1716,7 @@ $bn_ops = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT $cpuid_obj = x86cpuid.o $bn_obj = bn-586.o co-586.o x86-mont.o $des_obj = des-586.o crypt586.o -$aes_obj = aes-586.o 
+$aes_obj = aes-586.o aesni-x86.o $bf_obj = bf-586.o $md5_obj = md5-586.o $sha1_obj = sha1-586.o sha256-586.o sha512-586.o @@ -1809,7 +1809,7 @@ $bn_ops = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT $cpuid_obj = x86cpuid.o $bn_obj = bn-586.o co-586.o x86-mont.o $des_obj = des-586.o crypt586.o -$aes_obj = aes-586.o +$aes_obj = aes-586.o aesni-x86.o $bf_obj = bf-586.o $md5_obj = md5-586.o $sha1_obj = sha1-586.o sha256-586.o sha512-586.o @@ -1840,7 +1840,7 @@ $bn_ops = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT $cpuid_obj = x86cpuid.o $bn_obj = bn-586.o co-586.o x86-mont.o $des_obj = des-586.o crypt586.o -$aes_obj = aes-586.o +$aes_obj = aes-586.o aesni-x86.o $bf_obj = bf-586.o $md5_obj = md5-586.o $sha1_obj = sha1-586.o sha256-586.o sha512-586.o @@ -1902,7 +1902,7 @@ $bn_ops = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT $cpuid_obj = x86cpuid.o $bn_obj = bn-586.o co-586.o x86-mont.o $des_obj = des-586.o crypt586.o -$aes_obj = aes-586.o +$aes_obj = aes-586.o aesni-x86.o $bf_obj = bf-586.o $md5_obj = md5-586.o $sha1_obj = sha1-586.o sha256-586.o sha512-586.o @@ -1933,7 +1933,7 @@ $bn_ops = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT $cpuid_obj = x86cpuid.o $bn_obj = bn-586.o co-586.o x86-mont.o $des_obj = des-586.o crypt586.o -$aes_obj = aes-586.o +$aes_obj = aes-586.o aesni-x86.o $bf_obj = bf-586.o $md5_obj = md5-586.o $sha1_obj = sha1-586.o sha256-586.o sha512-586.o @@ -1964,7 +1964,7 @@ $bn_ops = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT $cpuid_obj = x86cpuid.o $bn_obj = bn-586.o co-586.o x86-mont.o $des_obj = des-586.o crypt586.o -$aes_obj = aes-586.o +$aes_obj = aes-586.o aesni-x86.o $bf_obj = bf-586.o $md5_obj = md5-586.o $sha1_obj = sha1-586.o sha256-586.o sha512-586.o @@ -2119,7 +2119,7 @@ $bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHUNK BF_PTR2 DES_INT DES_UNROLL $cpuid_obj = x86_64cpuid.o $bn_obj = x86_64-gcc.o x86_64-mont.o $des_obj = -$aes_obj = aes-x86_64.o +$aes_obj = aes-x86_64.o aesni-x86_64.o $bf_obj = $md5_obj = md5-x86_64.o $sha1_obj = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o @@ -2150,7 +2150,7 @@ $bn_ops = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT $cpuid_obj = x86cpuid.o $bn_obj = bn-586.o co-586.o x86-mont.o $des_obj = des-586.o crypt586.o -$aes_obj = aes-586.o +$aes_obj = aes-586.o aesni-x86.o $bf_obj = bf-586.o $md5_obj = md5-586.o $sha1_obj = sha1-586.o sha256-586.o sha512-586.o @@ -2181,7 +2181,7 @@ $bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHUNK BF_PTR2 DES_INT DES_UNROLL $cpuid_obj = x86_64cpuid.o $bn_obj = x86_64-gcc.o x86_64-mont.o $des_obj = -$aes_obj = aes-x86_64.o +$aes_obj = aes-x86_64.o aesni-x86_64.o $bf_obj = $md5_obj = md5-x86_64.o $sha1_obj = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o @@ -2336,7 +2336,7 @@ $bn_ops = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT $cpuid_obj = x86cpuid.o $bn_obj = bn-586.o co-586.o x86-mont.o $des_obj = des-586.o crypt586.o -$aes_obj = aes-586.o +$aes_obj = aes-586.o aesni-x86.o $bf_obj = bf-586.o $md5_obj = md5-586.o $sha1_obj = sha1-586.o sha256-586.o sha512-586.o @@ -2863,7 +2863,7 @@ $bn_ops = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT $cpuid_obj = x86cpuid.o $bn_obj = bn-586.o co-586.o x86-mont.o $des_obj = des-586.o crypt586.o -$aes_obj = aes-586.o +$aes_obj = aes-586.o aesni-x86.o $bf_obj = bf-586.o $md5_obj = md5-586.o $sha1_obj = sha1-586.o sha256-586.o sha512-586.o @@ -3204,7 +3204,7 @@ $bn_ops = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT $cpuid_obj = x86cpuid.o $bn_obj = bn-586.o co-586.o x86-mont.o $des_obj = 
des-586.o crypt586.o -$aes_obj = aes-586.o +$aes_obj = aes-586.o aesni-x86.o $bf_obj = bf-586.o $md5_obj = md5-586.o $sha1_obj = sha1-586.o sha256-586.o sha512-586.o @@ -3266,7 +3266,7 @@ $bn_ops = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT $cpuid_obj = x86cpuid.o $bn_obj = bn-586.o co-586.o x86-mont.o $des_obj = des-586.o crypt586.o -$aes_obj = aes-586.o +$aes_obj = aes-586.o aesni-x86.o $bf_obj = bf-586.o $md5_obj = md5-586.o $sha1_obj = sha1-586.o sha256-586.o sha512-586.o @@ -3359,7 +3359,7 @@ $bn_ops = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT $cpuid_obj = x86cpuid.o $bn_obj = bn-586.o co-586.o x86-mont.o $des_obj = des-586.o crypt586.o -$aes_obj = aes-586.o +$aes_obj = aes-586.o aesni-x86.o $bf_obj = bf-586.o $md5_obj = md5-586.o $sha1_obj = sha1-586.o sha256-586.o sha512-586.o @@ -3638,7 +3638,7 @@ $bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHUNK BF_PTR2 DES_INT DES_UNROLL $cpuid_obj = x86_64cpuid.o $bn_obj = x86_64-gcc.o x86_64-mont.o $des_obj = -$aes_obj = aes-x86_64.o +$aes_obj = aes-x86_64.o aesni-x86_64.o $bf_obj = $md5_obj = md5-x86_64.o $sha1_obj = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o @@ -3700,7 +3700,7 @@ $bn_ops = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT EXPORT_V $cpuid_obj = x86cpuid.o $bn_obj = bn-586.o co-586.o x86-mont.o $des_obj = des-586.o crypt586.o -$aes_obj = aes-586.o +$aes_obj = aes-586.o aesni-x86.o $bf_obj = bf-586.o $md5_obj = md5-586.o $sha1_obj = sha1-586.o sha256-586.o sha512-586.o @@ -3731,7 +3731,7 @@ $bn_ops = SIXTY_FOUR_BIT RC4_CHUNK_LL DES_INT EXPORT_VAR_AS_FN $cpuid_obj = x86_64cpuid.o $bn_obj = x86_64-gcc.o x86_64-mont.o $des_obj = -$aes_obj = aes-x86_64.o +$aes_obj = aes-x86_64.o aesni-x86_64.o $bf_obj = $md5_obj = md5-x86_64.o $sha1_obj = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o @@ -4320,7 +4320,7 @@ $bn_ops = DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT $cpuid_obj = x86cpuid.o $bn_obj = bn-586.o co-586.o x86-mont.o $des_obj = des-586.o crypt586.o -$aes_obj = aes-586.o +$aes_obj = aes-586.o aesni-x86.o $bf_obj = bf-586.o $md5_obj = md5-586.o $sha1_obj = sha1-586.o sha256-586.o sha512-586.o @@ -4351,7 +4351,7 @@ $bn_ops = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT $cpuid_obj = x86cpuid.o $bn_obj = bn-586.o co-586.o x86-mont.o $des_obj = des-586.o crypt586.o -$aes_obj = aes-586.o +$aes_obj = aes-586.o aesni-x86.o $bf_obj = bf-586.o $md5_obj = md5-586.o $sha1_obj = sha1-586.o sha256-586.o sha512-586.o @@ -4599,7 +4599,7 @@ $bn_ops = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT $cpuid_obj = x86cpuid.o $bn_obj = bn-586.o co-586.o x86-mont.o $des_obj = des-586.o crypt586.o -$aes_obj = aes-586.o +$aes_obj = aes-586.o aesni-x86.o $bf_obj = bf-586.o $md5_obj = md5-586.o $sha1_obj = sha1-586.o sha256-586.o sha512-586.o @@ -4692,7 +4692,7 @@ $bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHUNK BF_PTR2 DES_INT DES_UNROLL $cpuid_obj = x86_64cpuid.o $bn_obj = x86_64-gcc.o x86_64-mont.o $des_obj = -$aes_obj = aes-x86_64.o +$aes_obj = aes-x86_64.o aesni-x86_64.o $bf_obj = $md5_obj = md5-x86_64.o $sha1_obj = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o @@ -4723,7 +4723,7 @@ $bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHUNK BF_PTR2 DES_INT DES_UNROLL $cpuid_obj = x86_64cpuid.o $bn_obj = x86_64-gcc.o x86_64-mont.o $des_obj = -$aes_obj = aes-x86_64.o +$aes_obj = aes-x86_64.o aesni-x86_64.o $bf_obj = $md5_obj = md5-x86_64.o $sha1_obj = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o @@ -4971,7 +4971,7 @@ $bn_ops = BN_LLONG MD2_CHAR RC4_INDEX DES_PTR DES_RISC1 DES_UNROLL $cpuid_obj = x86cpuid.o $bn_obj = bn-586.o co-586.o 
x86-mont.o $des_obj = des-586.o crypt586.o -$aes_obj = aes-586.o +$aes_obj = aes-586.o aesni-x86.o $bf_obj = bf-586.o $md5_obj = md5-586.o $sha1_obj = sha1-586.o sha256-586.o sha512-586.o @@ -5002,7 +5002,7 @@ $bn_ops = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT $cpuid_obj = x86cpuid.o $bn_obj = bn-586.o co-586.o x86-mont.o $des_obj = des-586.o crypt586.o -$aes_obj = aes-586.o +$aes_obj = aes-586.o aesni-x86.o $bf_obj = bf-586.o $md5_obj = md5-586.o $sha1_obj = sha1-586.o sha256-586.o sha512-586.o diff --git a/crypto/aes/Makefile b/crypto/aes/Makefile index c501a43a8f..3517465bd0 100644 --- a/crypto/aes/Makefile +++ b/crypto/aes/Makefile @@ -50,9 +50,13 @@ aes-ia64.s: asm/aes-ia64.S aes-586.s: asm/aes-586.pl ../perlasm/x86asm.pl $(PERL) asm/aes-586.pl $(PERLASM_SCHEME) $(CFLAGS) $(PROCESSOR) > $@ +aesni-x86.s: asm/aesni-x86.pl ../perlasm/x86asm.pl + $(PERL) asm/aesni-x86.pl $(PERLASM_SCHEME) $(CFLAGS) $(PROCESSOR) > $@ aes-x86_64.s: asm/aes-x86_64.pl $(PERL) asm/aes-x86_64.pl $(PERLASM_SCHEME) > $@ +aesni-x86_64.s: asm/aesni-x86_64.pl + $(PERL) asm/aesni-x86_64.pl $(PERLASM_SCHEME) > $@ aes-sparcv9.s: asm/aes-sparcv9.pl $(PERL) asm/aes-sparcv9.pl $(CFLAGS) > $@ diff --git a/crypto/aes/asm/aesni-x86.pl b/crypto/aes/asm/aesni-x86.pl new file mode 100644 index 0000000000..a9339a4c38 --- /dev/null +++ b/crypto/aes/asm/aesni-x86.pl @@ -0,0 +1,663 @@ +#!/usr/bin/env perl + +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# This module implements support for Intel AES-NI extension. In +# OpenSSL context it's used with Intel engine, but can also be used as +# drop-in replacement for crypto/aes/asm/aes-586.pl [see below for +# details]. + +$PREFIX="aesni"; # if $PREFIX is set to "AES", the script + # generates drop-in replacement for + # crypto/aes/asm/aes-586.pl:-) + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +push(@INC,"${dir}","${dir}../../perlasm"); +require "x86asm.pl"; + +&asm_init($ARGV[0],$0); + +$movekey = eval($RREFIX eq "aseni" ? 
"*movaps" : "*movups"); + +$len="eax"; +$rounds="ecx"; +$key="edx"; +$inp="esi"; +$out="edi"; +$rounds_="ebx"; +$key_="ebp"; + +$inout0="xmm0"; +$inout1="xmm1"; +$inout2="xmm2"; +$rndkey0="xmm3"; +$rndkey1="xmm4"; +$ivec="xmm5"; +$in0="xmm6"; +$in1="xmm7"; + +sub _aesni_generate1 # folded loop +{ my $p=shift; + + &function_begin_B("_aesni_${p}rypt1"); + &$movekey ($rndkey0,&QWP(0,$key)); + &$movekey ($rndkey1,&QWP(16,$key)); + &lea ($key,&DWP(16,$key)); + &pxor ($inout0,$rndkey0); + &dec ($rounds); + &set_label("${p}1_loop",16); + eval"&aes${p} ($inout0,$rndkey1)"; + &dec ($rounds); + &lea ($key,&DWP(16,$key)); + &$movekey ($rndkey1,&QWP(0,$key)); + &jnz (&label("${p}1_loop")); + eval"&aes${p}last ($inout0,$rndkey1)"; + &ret(); + &function_end_B("_aesni_${p}rypt1"); +} + +sub aesni_generate1 # fully unrolled loop +{ my $p=shift; + + &function_begin_B("_aesni_${p}rypt1"); + &$movekey ($rndkey0,&QWP(0,$key)); + &$movekey ($rndkey1,&QWP(0x10,$key)); + &cmp ($rounds,12); + &pxor ($inout0,$rndkey0); + &$movekey ($rndkey0,&QWP(0x20,$key)); + &lea ($key,&DWP(0x30,$key)); + &jb (&label("${p}128")); + &lea ($key,&DWP(0x20,$key)); + &je (&label("${p}192")); + &lea ($key,&DWP(0x20,$key)); + eval"&aes${p} ($inout0,$rndkey1)"; + &$movekey ($rndkey1,&QWP(-0x40,$key)); + eval"&aes${p} ($inout0,$rndkey0)"; + &$movekey ($rndkey0,&QWP(-0x30,$key)); + &set_label("${p}192"); + eval"&aes${p} ($inout0,$rndkey1)"; + &$movekey ($rndkey1,&QWP(-0x20,$key)); + eval"&aes${p} ($inout0,$rndkey0)"; + &$movekey ($rndkey0,&QWP(-0x10,$key)); + &set_label("${p}128"); + eval"&aes${p} ($inout0,$rndkey1)"; + &$movekey ($rndkey1,&QWP(0,$key)); + eval"&aes${p} ($inout0,$rndkey0)"; + &$movekey ($rndkey0,&QWP(0x10,$key)); + eval"&aes${p} ($inout0,$rndkey1)"; + &$movekey ($rndkey1,&QWP(0x20,$key)); + eval"&aes${p} ($inout0,$rndkey0)"; + &$movekey ($rndkey0,&QWP(0x30,$key)); + eval"&aes${p} ($inout0,$rndkey1)"; + &$movekey ($rndkey1,&QWP(0x40,$key)); + eval"&aes${p} ($inout0,$rndkey0)"; + &$movekey ($rndkey0,&QWP(0x50,$key)); + eval"&aes${p} ($inout0,$rndkey1)"; + &$movekey ($rndkey1,&QWP(0x60,$key)); + eval"&aes${p} ($inout0,$rndkey0)"; + &$movekey ($rndkey0,&QWP(0x70,$key)); + eval"&aes${p} ($inout0,$rndkey1)"; + eval"&aes${p}last ($inout0,$rndkey0)"; + &ret(); + &function_end_B("_aesni_${p}rypt1"); +} + +&aesni_generate1("enc"); +# void $PREFIX_encrypt (const void *inp,void *out,const AES_KEY *key); +&function_begin_B("${PREFIX}_encrypt"); + &mov ("eax",&wparam(0)); + &mov ($key,&wparam(2)); + &movups ($inout0,&QWP(0,"eax")); + &mov ($rounds,&DWP(240,$key)); + &mov ("eax",&wparam(1)); + &call ("_aesni_encrypt1"); + &movups (&QWP(0,"eax"),$inout0); + &ret (); +&function_end_B("${PREFIX}_encrypt"); + +&aesni_generate1("dec"); +# void $PREFIX_decrypt (const void *inp,void *out,const AES_KEY *key); +&function_begin_B("${PREFIX}_decrypt"); + &mov ("eax",&wparam(0)); + &mov ($key,&wparam(2)); + &movups ($inout0,&QWP(0,"eax")); + &mov ($rounds,&DWP(240,$key)); + &mov ("eax",&wparam(1)); + &call ("_aesni_decrypt1"); + &movups (&QWP(0,"eax"),$inout0); + &ret (); +&function_end_B("${PREFIX}_decrypt"); + +# _aesni_[en|de]crypt3 are private interfaces, 3 denotes interleave +# factor. Why 3x? Even though aes[enc|dec] latency is 6, it turned +# out that it can be scheduled only every *second* cycle. Thus 3x +# interleave is the one providing optimal utilization, i.e. when +# subroutine's throughput is virtually same as of non-interleaved +# subroutine for number of input blocks up to 3. This is why it +# handles even double-block inputs. 
Larger interleave factor would +# perform suboptimally on shorter inputs... + +sub aesni_generate3 +{ my $p=shift; + + &function_begin_B("_aesni_${p}rypt3"); + &$movekey ($rndkey0,&QWP(0,$key)); + &$movekey ($rndkey1,&QWP(16,$key)); + &shr ($rounds,1); + &lea ($key,&DWP(32,$key)); + &pxor ($inout0,$rndkey0); + &pxor ($inout1,$rndkey0); + &dec ($rounds); + &pxor ($inout2,$rndkey0); + &jmp (&label("${p}3_loop")); + &set_label("${p}3_loop",16); + eval"&aes${p} ($inout0,$rndkey1)"; + &$movekey ($rndkey0,&QWP(0,$key)); + eval"&aes${p} ($inout1,$rndkey1)"; + &dec ($rounds); + eval"&aes${p} ($inout2,$rndkey1)"; + &$movekey ($rndkey1,&QWP(16,$key)); + eval"&aes${p} ($inout0,$rndkey0)"; + &lea ($key,&DWP(32,$key)); + eval"&aes${p} ($inout1,$rndkey0)"; + eval"&aes${p} ($inout2,$rndkey0)"; + &jnz (&label("${p}3_loop")); + eval"&aes${p} ($inout0,$rndkey1)"; + &$movekey ($rndkey0,&QWP(0,$key)); + eval"&aes${p} ($inout1,$rndkey1)"; + eval"&aes${p} ($inout2,$rndkey1)"; + eval"&aes${p}last ($inout0,$rndkey0)"; + eval"&aes${p}last ($inout1,$rndkey0)"; + eval"&aes${p}last ($inout2,$rndkey0)"; + &ret(); + &function_end_B("_aesni_${p}rypt3"); +} +&aesni_generate3("enc") if ($PREFIX eq "aesni"); +&aesni_generate3("dec"); + +if ($PREFIX eq "aesni") { +# void aesni_ecb_encrypt (const void *in, void *out, +# size_t length, const AES_KEY *key, +# int enc); + +&function_begin("aesni_ecb_encrypt"); + &mov ($inp,&wparam(0)); + &mov ($out,&wparam(1)); + &mov ($len,&wparam(2)); + &mov ($key,&wparam(3)); + &mov ($rounds,&wparam(4)); + &cmp ($len,16); + &jb (&label("ecb_ret")); + &and ($len,-16); + &test ($rounds,$rounds) + &mov ($rounds,&DWP(240,$key)); + &mov ($key_,$key); # backup $key + &mov ($rounds_,$rounds); # backup $rounds + &jz (&label("ecb_decrypt")); + + &sub ($len,0x30); + &jc (&label("ecb_enc_tail")); + jmp (&label("ecb_enc_loop3")); + +&set_label("ecb_enc_loop3",16); + &movups ($inout0,&QWP(0,$inp)); + &movups ($inout1,&QWP(0x10,$inp)); + &movups ($inout2,&QWP(0x20,$inp)); + &lea ($inp,&DWP(0x30,$inp)); + &call ("_aesni_encrypt3"); + &movups (&QWP(0,$out),$inout0); + &sub ($len,0x30); + &movups (&QWP(0x10,$out),$inout0); + &mov ($key,$key_); # restore $key + &movups (&QWP(0x20,$out),$inout0); + &mov ($rounds,$rounds_); # restore $rounds + &lea ($out,&DWP(0x30,$out)); + &jnc (&label("ecb_enc_loop3")); + +&set_label("ecb_enc_tail"); + &add ($len,0x30); + &jz (&label("ecb_ret")); + + &cmp ($len,0x10); + &movups ($inout0,&QWP(0,$inp)); + je (&label("ecb_enc_one")); + &movups ($inout1,&QWP(0x10,$inp)); + &call ("_aesni_encrypt3"); + &movups (&QWP(0,$out),$inout0); + &movups (&QWP(0x10,$out),$inout1); + jmp (&label("ecb_ret")); + +&set_label("ecb_enc_one",16); + &call ("_aesni_encrypt1"); + &movups (&QWP(0,$out),$inout0); + &jmp (&label("ecb_ret")); + +&set_label("ecb_decrypt",16); + &sub ($len,0x30); + &jc (&label("ecb_dec_tail")); + jmp (&label("ecb_dec_loop3")); + +&set_label("ecb_dec_loop3",16); + &movups ($inout0,&QWP(0,$inp)); + &movups ($inout1,&QWP(0x10,$inp)); + &movups ($inout2,&QWP(0x20,$inp)); + &call ("_aesni_decrypt3"); + &movups (&QWP(0,$out),$inout0); + &sub ($len,0x30); + &lea ($inp,&DWP(0x30,$inp)); + &movups (&QWP(0x10,$out),$inout0); + &mov ($key,$key_); # restore $key + &movups (&QWP(0x20,$out),$inout0); + &mov ($rounds,$rounds_); # restore $rounds + &lea ($out,&DWP(0x30,$out)); + &jnc (&label("ecb_dec_loop3")); + +&set_label("ecb_dec_tail"); + &add ($len,0x30); + &jz (&label("ecb_ret")); + + &cmp ($len,0x10); + &movups ($inout0,&QWP(0,$inp)); + je (&label("ecb_dec_one")); + &movups 
($inout1,&QWP(0x10,$inp)); + &call ("_aesni_decrypt3"); + &movups (&QWP(0,$out),$inout0); + &movups (&QWP(0x10,$out),$inout1); + jmp (&label("ecb_ret")); + +&set_label("ecb_dec_one",16); + &call ("_aesni_decrypt1"); + &movups (&QWP(0,$out),$inout0); + +&set_label("ecb_ret"); +&function_end("aesni_ecb_encrypt"); +} + +# void $PREFIX_cbc_encrypt (const void *inp, void *out, +# size_t length, const AES_KEY *key, +# unsigned char *ivp,const int enc); +&function_begin("${PREFIX}_cbc_encrypt"); + &mov ($inp,&wparam(0)); + &mov ($out,&wparam(1)); + &mov ($len,&wparam(2)); + &mov ($key,&wparam(3)); + &test ($len,$len); + &mov ($key_,&wparam(4)); + &je (&label("cbc_ret")); + + &cmp (&wparam(5),0); + &movups ($ivec,&QWP(0,$key_)); # load IV + &mov ($rounds,&DWP(240,$key)); + &mov ($key_,$key); # backup $key + &mov ($rounds_,$rounds); # backup $rounds + &je (&label("cbc_decrypt")); + + &movaps ($inout0,$ivec); + &cmp ($len,16); + &jb (&label("cbc_enc_tail")); + &sub ($len,16); + &jmp (&label("cbc_enc_loop")); + +&set_label("cbc_enc_loop",16); + &movups ($ivec,&QWP(0,$inp)); + &lea ($inp,&DWP(16,$inp)); + &pxor ($inout0,$ivec); + &call ("_aesni_encrypt1"); + &sub ($len,16); + &mov ($rounds,$rounds_); # restore $rounds + &mov ($key,$key_); # restore $key + &movups (&QWP(0,$out),$inout0); + &lea ($out,&DWP(16,$out)); + &jnc (&label("cbc_enc_loop")); + &add ($len,16); + &jnz (&label("cbc_enc_tail")); + &movaps ($ivec,$inout0); + &jmp (&label("cbc_ret")); + +&set_label("cbc_enc_tail"); + &mov ("ecx",$len); # zaps $rounds + &data_word(0xA4F3F689); # rep movsb + &mov ("ecx",16); # zero tail + &sub ("ecx",$len); + &xor ("eax","eax"); # zaps $len + &data_word(0xAAF3F689); # rep stosb + &lea ($out,&DWP(-16,$out)); # rewind $out by 1 block + &mov ($rounds,$rounds_); # restore $rounds + &mov ($inp,$out); # $inp and $out are the same + &mov ($key,$key_); # restore $key + &jmp (&label("cbc_enc_loop")); + +&set_label("cbc_decrypt",16); + &sub ($len,0x30); + &jc (&label("cbc_dec_tail")); + &jmp (&label("cbc_dec_loop3")); + +&set_label("cbc_dec_loop3",16); + &movups ($inout0,&QWP(0,$inp)); + &movups ($inout1,&QWP(0x10,$inp)); + &movups ($inout2,&QWP(0x20,$inp)); + &movaps ($in0,$inout0); + &movaps ($in1,$inout1); + &call ("_aesni_decrypt3"); + &sub ($len,0x30); + &lea ($inp,&DWP(0x30,$inp)); + &pxor ($inout0,$ivec); + &pxor ($inout1,$in0); + &movups ($ivec,&QWP(0x20,$inp)); + &pxor ($inout2,$in1); + &movups (&QWP(0,$out),$inout0); + &mov ($rounds,$rounds_) # restore $rounds + &movups (&QWP(0x10,$out),$inout1); + &mov ($key,$key_); # restore $key + &movups (&QWP(0x20,$out),$inout2); + &lea ($out,&DWP(0x30,$out)); + &jnc (&label("cbc_dec_loop3")); + +&set_label("cbc_dec_tail"); + &add ($len,0x30); + &jz (&label("cbc_ret")); + + &movups ($inout0,&QWP(0,$inp)); + &cmp ($len,0x10); + &movaps ($in0,$inout0); + &jbe (&label("cbc_dec_one")); + &movups ($inout1,&QWP(0x10,$inp)); + &cmp ($len,0x20); + &movaps ($in1,$inout1); + &jbe (&label("cbc_dec_two")); + &movups ($inout2,&QWP(0x20,$inp)); + &call ("_aesni_decrypt3"); + &pxor ($inout0,$ivec); + &movups ($ivec,&QWP(0x20,$inp)); + &pxor ($inout1,$in0); + &pxor ($inout2,$in1); + &movups (&QWP(0,$out),$inout0); + &movups (&QWP(0x10,$out),$inout1); + &movaps ($inout0,$inout2); + &lea ($out,&DWP(0x20,$out)); + &jmp (&label("cbc_dec_tail_collected")); + +&set_label("cbc_dec_one"); + &call ("_aesni_decrypt1"); + &pxor ($inout0,$ivec); + &movaps ($ivec,$in0); + &jmp (&label("cbc_dec_tail_collected")); + +&set_label("cbc_dec_two"); + &call ("_aesni_decrypt3"); + &pxor 
($inout0,$ivec); + &pxor ($inout1,$in0); + &movups (&QWP(0,$out),$inout0); + &movaps ($inout0,$inout1); + &movaps ($ivec,$in1); + &lea ($out,&DWP(0x10,$out)); + +&set_label("cbc_dec_tail_collected"); + &and ($len,15); + &jnz (&label("cbc_dec_tail_partial")); + &movups (&QWP(0,$out),$inout0); + &jmp (&label("cbc_ret")); + +&set_label("cbc_dec_tail_partial"); + &mov ($key_,"esp"); + &sub ("esp",16); + &and ("esp",-16); + &movaps (&QWP(0,"esp"),$inout0); + &mov ($inp,"esp"); + &mov ("ecx",$len); + &data_word(0xA4F3F689); # rep movsb + &mov ("esp",$key_); + +&set_label("cbc_ret"); + &mov ($key_,&wparam(4)); + &movups (&QWP(0,$key_),$ivec); # output IV +&function_end("${PREFIX}_cbc_encrypt"); + +# Mechanical port from aesni-x86_64.pl. +# +# _aesni_set_encrypt_key is private interface, +# input: +# "eax" const unsigned char *userKey +# $rounds int bits +# $key AES_KEY *key +# output: +# "eax" return code +# $round rounds + +&function_begin_B("_aesni_set_encrypt_key"); + &test ("eax","eax"); + &jz (&label("bad_pointer")); + &test ($key,$key); + &jz (&label("bad_pointer")); + + &movups ("xmm0",&QWP(0,"eax")); # pull first 128 bits of *userKey + &pxor ("xmm4","xmm4"); # low dword of xmm4 is assumed 0 + &lea ($key,&DWP(16,$key)); + &cmp ($rounds,256); + &je (&label("14rounds")); + &cmp ($rounds,192); + &je (&label("12rounds")); + &cmp ($rounds,128); + &jne (&label("bad_keybits")); + +&set_label("10rounds",16); + &mov ($rounds,10); + &$movekey (&QWP(-16,$key),"xmm0"); # round 0 + &aeskeygenassist("xmm1","xmm0",0x01); # round 1 + &call (&label("key_128_cold")); + &aeskeygenassist("xmm1","xmm0",0x2); # round 2 + &call (&label("key_128")); + &aeskeygenassist("xmm1","xmm0",0x04); # round 3 + &call (&label("key_128")); + &aeskeygenassist("xmm1","xmm0",0x08); # round 4 + &call (&label("key_128")); + &aeskeygenassist("xmm1","xmm0",0x10); # round 5 + &call (&label("key_128")); + &aeskeygenassist("xmm1","xmm0",0x20); # round 6 + &call (&label("key_128")); + &aeskeygenassist("xmm1","xmm0",0x40); # round 7 + &call (&label("key_128")); + &aeskeygenassist("xmm1","xmm0",0x80); # round 8 + &call (&label("key_128")); + &aeskeygenassist("xmm1","xmm0",0x1b); # round 9 + &call (&label("key_128")); + &aeskeygenassist("xmm1","xmm0",0x36); # round 10 + &call (&label("key_128")); + &$movekey (&QWP(0,$key),"xmm0"); + &mov (&DWP(80,$key),$rounds); + &xor ("eax","eax"); + &ret(); + +&set_label("key_128",16); + &$movekey (&QWP(0,$key),"xmm0"); + &lea ($key,&DWP(16,$key)); +&set_label("key_128_cold"); + &shufps ("xmm4","xmm0",0b00010000); + &pxor ("xmm0","xmm4"); + &shufps ("xmm4","xmm0",0b10001100,); + &pxor ("xmm0","xmm4"); + &pshufd ("xmm1","xmm1",0b11111111); # critical path + &pxor ("xmm0","xmm1"); + &ret(); + +&set_label("12rounds",16); + &movq ("xmm2",&QWP(16,"eax")); # remaining 1/3 of *userKey + &mov ($rounds,12); + &$movekey (&QWP(-16,$key),"xmm0") # round 0 + &aeskeygenassist("xmm1","xmm2",0x01); # round 1,2 + &call (&label("key_192a_cold")); + &aeskeygenassist("xmm1","xmm2",0x02); # round 2,3 + &call (&label("key_192b")); + &aeskeygenassist("xmm1","xmm2",0x04); # round 4,5 + &call (&label("key_192a")); + &aeskeygenassist("xmm1","xmm2",0x08); # round 5,6 + &call (&label("key_192b")); + &aeskeygenassist("xmm1","xmm2",0x10); # round 7,8 + &call (&label("key_192a")); + &aeskeygenassist("xmm1","xmm2",0x20); # round 8,9 + &call (&label("key_192b")); + &aeskeygenassist("xmm1","xmm2",0x40); # round 10,11 + &call (&label("key_192a")); + &aeskeygenassist("xmm1","xmm2",0x80); # round 11,12 + &call (&label("key_192b")); + 
&$movekey (&QWP(0,$key),"xmm0"); + &mov (&DWP(48,$key),$rounds); + &xor ("eax","eax"); + &ret(); + +&set_label("key_192a",16); + &$movekey (&QWP(0,$key),"xmm0"); + &lea ($key,&DWP(16,$key)); +&set_label("key_192a_cold",16); + &movaps ("xmm5","xmm2"); +&set_label("key_192b_warm"); + &shufps ("xmm4","xmm0",0b00010000); + &movaps ("xmm3","xmm2"); + &pxor ("xmm0","xmm4"); + &shufps ("xmm4","xmm0",0b10001100); + &pslldq ("xmm3",4); + &pxor ("xmm0","xmm4"); + &pshufd ("xmm1","xmm1",0b01010101); # critical path + &pxor ("xmm2","xmm3"); + &pxor ("xmm0","xmm1"); + &pshufd ("xmm3","xmm0",0b11111111); + &pxor ("xmm2","xmm3"); + &ret(); + +&set_label("key_192b",16); + &movaps ("xmm3","xmm0"); + &shufps ("xmm5","xmm0",0b01000100); + &$movekey (&QWP(0,$key),"xmm5"); + &shufps ("xmm3","xmm2",0b01001110); + &$movekey (&QWP(16,$key),"xmm3"); + &lea ($key,&DWP(32,$key)); + &jmp (&label("key_192b_warm")); + +&set_label("14rounds",16); + &movups ("xmm2",&QWP(16,"eax")); # remaining half of *userKey + &mov ($rounds,14); + &lea ($key,&DWP(16,$key)); + &$movekey (&QWP(-32,$key),"xmm0"); # round 0 + &$movekey (&QWP(-16,$key),"xmm2"); # round 1 + &aeskeygenassist("xmm1","xmm2",0x01); # round 2 + &call (&label("key_256a_cold")); + &aeskeygenassist("xmm1","xmm0",0x01); # round 3 + &call (&label("key_256b")); + &aeskeygenassist("xmm1","xmm2",0x02); # round 4 + &call (&label("key_256a")); + &aeskeygenassist("xmm1","xmm0",0x02); # round 5 + &call (&label("key_256b")); + &aeskeygenassist("xmm1","xmm2",0x04); # round 6 + &call (&label("key_256a")); + &aeskeygenassist("xmm1","xmm0",0x04); # round 7 + &call (&label("key_256b")); + &aeskeygenassist("xmm1","xmm2",0x08); # round 8 + &call (&label("key_256a")); + &aeskeygenassist("xmm1","xmm0",0x08); # round 9 + &call (&label("key_256b")); + &aeskeygenassist("xmm1","xmm2",0x10); # round 10 + &call (&label("key_256a")); + &aeskeygenassist("xmm1","xmm0",0x10); # round 11 + &call (&label("key_256b")); + &aeskeygenassist("xmm1","xmm2",0x20); # round 12 + &call (&label("key_256a")); + &aeskeygenassist("xmm1","xmm0",0x20); # round 13 + &call (&label("key_256b")); + &aeskeygenassist("xmm1","xmm2",0x40); # round 14 + &call (&label("key_256a")); + &$movekey (&QWP(0,$key),"xmm0"); + &mov (&DWP(16,$key),$rounds); + &xor ("eax","eax"); + &ret(); + +&set_label("key_256a",16); + &$movekey (&QWP(0,$key),"xmm2"); + &lea ($key,&DWP(16,$key)); +&set_label("key_256a_cold"); + &shufps ("xmm4","xmm0",0b00010000); + &pxor ("xmm0","xmm4"); + &shufps ("xmm4","xmm0",0b10001100); + &pxor ("xmm0","xmm4"); + &pshufd ("xmm1","xmm1",0b11111111); # critical path + &pxor ("xmm0","xmm1"); + &ret(); + +&set_label("key_256b",16); + &$movekey (&QWP(0,$key),"xmm0"); + &lea ($key,&DWP(16,$key)); + + &shufps ("xmm4","xmm2",0b00010000); + &pxor ("xmm2","xmm4"); + &shufps ("xmm4","xmm2",0b10001100); + &pxor ("xmm2","xmm4"); + &pshufd ("xmm1","xmm1",0b10101010); # critical path + &pxor ("xmm2","xmm1"); + &ret(); + +&set_label("bad_pointer",4); + &mov ("eax",-1); + &ret (); +&set_label("bad_keybits",4); + &mov ("eax",-2); + &ret (); +&function_end_B("_aesni_set_encrypt_key"); + +# int $PREFIX_set_encrypt_key (const unsigned char *userKey, int bits, +# AES_KEY *key) +&function_begin_B("${PREFIX}_set_encrypt_key"); + &mov ("eax",&wparam(0)); + &mov ($rounds,&wparam(1)); + &mov ($key,&wparam(2)); + &call ("_aesni_set_encrypt_key"); + &ret (); +&function_end_B("${PREFIX}_set_encrypt_key"); + +# int $PREFIX_set_decrypt_key (const unsigned char *userKey, int bits, +# AES_KEY *key) 
+&function_begin_B("${PREFIX}_set_decrypt_key"); + &mov ("eax",&wparam(0)); + &mov ($rounds,&wparam(1)); + &mov ($key,&wparam(2)); + &call ("_aesni_set_encrypt_key"); + &mov ($key,&wparam(2)); + &shl ($rounds,4) # actually rounds after _aesni_set_encrypt_key + &test ("eax","eax"); + &jnz (&label("dec_key_ret")); + &lea ("eax",&DWP(0,$key,$rounds)); # end of key schedule + + &$movekey ("xmm0",&QWP(0,$key)); # just swap + &$movekey ("xmm1",&QWP(0,"eax")); + &$movekey (&QWP(0,"eax"),"xmm0"); + &$movekey (&QWP(0,$key),"xmm1"); + &lea ($key,&DWP(16,$key)); + &lea ("eax",&DWP(-16,"eax")); + &jmp (&label("dec_key_inverse")); + +&set_label("dec_key_inverse",16); + &$movekey ("xmm0",&QWP(0,$key)); # swap and inverse + &$movekey ("xmm1",&QWP(0,"eax")); + &aesimc ("xmm0","xmm0"); + &aesimc ("xmm1","xmm1"); + &lea ($key,&DWP(16,$key)); + &lea ("eax",&DWP(-16,"eax")); + &cmp ("eax",$key); + &$movekey (&QWP(16,"eax"),"xmm0"); + &$movekey (&QWP(-16,$key),"xmm1"); + &ja (&label("dec_key_inverse")); + + &$movekey ("xmm0",&QWP(0,$key)); # inverse middle + &aesimc ("xmm0","xmm0"); + &$movekey (&QWP(0,$key),"xmm0"); + + &xor ("eax","eax"); # return success +&set_label("dec_key_ret"); + &ret (); +&function_end_B("${PREFIX}_set_decrypt_key"); +&asciz("AES for Intel AES-NI, CRYPTOGAMS by "); + +&asm_finish(); diff --git a/crypto/aes/asm/aesni-x86_64.pl b/crypto/aes/asm/aesni-x86_64.pl new file mode 100644 index 0000000000..4ed3932b75 --- /dev/null +++ b/crypto/aes/asm/aesni-x86_64.pl @@ -0,0 +1,963 @@ +#!/usr/bin/env perl +# +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# This module implements support for Intel AES-NI extension. In +# OpenSSL context it's used with Intel engine, but can also be used as +# drop-in replacement for crypto/aes/asm/aes-x86_64.pl [see below for +# details]. +# +# TODO: +# - Win64 SEH handlers; + +$PREFIX="aesni"; # if $PREFIX is set to "AES", the script + # generates drop-in replacement for + # crypto/aes/asm/aes-x86_64.pl:-) + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +open STDOUT,"| $^X $xlate $flavour $output"; + +$movkey = $PREFIX eq "aesni" ? "movaps" : "movups"; + +$code=".text\n"; + +$rounds="%eax"; # input to and changed by aesni_[en|de]cryptN !!! + +# this is natural argument order for public $PREFIX_*crypt... +$inp="%rdi"; +$out="%rsi"; +# ... and for $PREFIX_[ebc|cbc]_encrypt in particular. +$len="%rdx"; +$key="%rcx"; # input to and changed by aesni_[en|de]cryptN !!! +$ivp="%r8"; # cbc + +$rnds_="%r10d"; # backup copy for $rounds +$key_="%r11"; # backup copy for $key + +# %xmm register layout +$inout0="%xmm0"; $inout1="%xmm1"; +$inout2="%xmm2"; $inout3="%xmm3"; +$inout4="%xmm4"; $inout5="%xmm5"; +$rndkey0="%xmm6"; $rndkey1="%xmm7"; + +$iv="%xmm8"; +$in0="%xmm9"; $in1="%xmm10"; +$in2="%xmm11"; $in3="%xmm12"; +$in4="%xmm13"; $in5="%xmm14"; + +# Inline version of internal aesni_[en|de]crypt1. 
+# +# Why folded loop? Because aes[enc|dec] is slow enough to accommodate +# cycles which take care of loop variables... +{ my $sn; +sub aesni_encrypt1 { +my ($data,$rndkey0,$rndkey1,$key,$rounds)=@_; +++$sn; +$code.=<<___; + $movkey ($key),$rndkey0 + $movkey 16($key),$rndkey1 + lea 16($key),$key + pxor $rndkey0,$data + dec $rounds + jmp .Loop_enc1_$sn +.align 16 +.Loop_enc1_$sn: + aesenc $rndkey1,$data + dec $rounds + lea 16($key),$key + $movkey ($key),$rndkey1 + jnz .Loop_enc1_$sn # loop body is 16 bytes + + aesenclast $rndkey1,$data +___ +}} +{ my $sn; +sub aesni_decrypt1 { +my ($data,$rndkey0,$rndkey1,$key,$rounds)=@_; +++$sn; +$code.=<<___; + $movkey ($key),$rndkey0 + $movkey 16($key),$rndkey1 + lea 16($key),$key + pxor $rndkey0,$data + dec $rounds + jmp .Loop_dec1_$sn +.align 16 +.Loop_dec1_$sn: + aesdec $rndkey1,$data + dec $rounds + lea 16($key),$key + $movkey ($key),$rndkey1 + jnz .Loop_dec1_$sn # loop body is 16 bytes + + aesdeclast $rndkey1,$data +___ +}} + +# void $PREFIX_encrypt (const void *inp,void *out,const AES_KEY *key); +# +$code.=<<___; +.globl ${PREFIX}_encrypt +.type ${PREFIX}_encrypt,\@function,3 +.align 16 +${PREFIX}_encrypt: + movups ($inp),%xmm0 # load input + mov 240(%rdx),$rounds # pull $rounds +___ + &aesni_encrypt1("%xmm0","%xmm1","%xmm2","%rdx",$rounds); +$code.=<<___; + movups %xmm0,(%rsi) # output + ret +.size ${PREFIX}_encrypt,.-${PREFIX}_encrypt +___ + +# void $PREFIX_decrypt (const void *inp,void *out,const AES_KEY *key); +# +$code.=<<___; +.globl ${PREFIX}_decrypt +.type ${PREFIX}_decrypt,\@function,3 +.align 16 +${PREFIX}_decrypt: + movups ($inp),%xmm0 # load input + mov 240(%rdx),$rounds # pull $rounds +___ + &aesni_decrypt1("%xmm0","%xmm1","%xmm2","%rdx",$rounds); +$code.=<<___; + movups %xmm0,($out) # output + ret +.size ${PREFIX}_decrypt, .-${PREFIX}_decrypt +___ + +# _aesni_[en|de]crypt6 are private interfaces, 6 denotes interleave +# factor. Why 6x? Because aes[enc|dec] latency is 6 and 6x interleave +# provides optimal utilization, so that subroutine's throughput is +# virtually same for *any* number [naturally up to 6] of input blocks +# as for non-interleaved subroutine. This is why it handles even +# double-, tripple-, quad- and penta-block inputs. Larger interleave +# factor, e.g. 8x, would perform suboptimally on these shorter inputs... +sub aesni_generate6 { +my $dir=shift; +# As already mentioned it takes in $key and $rounds, which are *not* +# preserved. $inout[0-5] is cipher/clear text... 
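(Side note, not part of the patch.) In C terms the 6x interleave looks like the sketch below: each aesenc result feeds the next round of the same block, so a single block stalls for the instruction's latency, while six independent blocks sharing each round key keep the unit busy. The helper name and the rk/nr layout are the same assumptions as in the earlier sketch:

    #include <immintrin.h>   /* build with -maes */

    /* ECB-encrypt six independent blocks in lockstep: one round-key value
     * serves all six aesenc ops, so the ~6-cycle latency of any single
     * block is hidden behind work on the other five. */
    static void aes_ecb_encrypt6(__m128i b[6], const __m128i *rk, int nr)
    {
        for (int j = 0; j < 6; j++)
            b[j] = _mm_xor_si128(b[j], rk[0]);
        for (int i = 1; i < nr; i++)
            for (int j = 0; j < 6; j++)
                b[j] = _mm_aesenc_si128(b[j], rk[i]);
        for (int j = 0; j < 6; j++)
            b[j] = _mm_aesenclast_si128(b[j], rk[nr]);
    }

This is why the ECB and CBC-decrypt paths below gather up to six blocks before calling _aesni_[en|de]crypt6 and fall back to smaller counts only in the tail.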
+$code.=<<___; +.type _aesni_${dir}rypt6,\@abi-omnipotent +.align 16 +_aesni_${dir}rypt6: + $movkey ($key),$rndkey0 + $movkey 16($key),$rndkey1 + shr \$1,$rounds + lea 32($key),$key + dec $rounds + pxor $rndkey0,$inout0 + pxor $rndkey0,$inout1 + pxor $rndkey0,$inout2 + pxor $rndkey0,$inout3 + pxor $rndkey0,$inout4 + pxor $rndkey0,$inout5 + jmp .L${dir}_loop6 +.align 16 +.L${dir}_loop6: + aes${dir} $rndkey1,$inout0 + $movkey ($key),$rndkey0 + aes${dir} $rndkey1,$inout1 + dec $rounds + aes${dir} $rndkey1,$inout2 + aes${dir} $rndkey1,$inout3 + aes${dir} $rndkey1,$inout4 + aes${dir} $rndkey1,$inout5 + aes${dir} $rndkey0,$inout0 + $movkey 16($key),$rndkey1 + aes${dir} $rndkey0,$inout1 + lea 32($key),$key + aes${dir} $rndkey0,$inout2 + aes${dir} $rndkey0,$inout3 + aes${dir} $rndkey0,$inout4 + aes${dir} $rndkey0,$inout5 + jnz .L${dir}_loop6 + aes${dir} $rndkey1,$inout0 + $movkey ($key),$rndkey0 + aes${dir} $rndkey1,$inout1 + aes${dir} $rndkey1,$inout2 + aes${dir} $rndkey1,$inout3 + aes${dir} $rndkey1,$inout4 + aes${dir} $rndkey1,$inout5 + aes${dir}last $rndkey0,$inout0 + aes${dir}last $rndkey0,$inout1 + aes${dir}last $rndkey0,$inout2 + aes${dir}last $rndkey0,$inout3 + aes${dir}last $rndkey0,$inout4 + aes${dir}last $rndkey0,$inout5 + ret +.size _aesni_${dir}rypt6,.-_aesni_${dir}rypt6 +___ +} +&aesni_generate6("enc"); +&aesni_generate6("dec"); + +if ($PREFIX eq "aesni") { +# void aesni_ecb_encrypt (const void *in, void *out, +# size_t length, const AES_KEY *key, +# int enc); +$code.=<<___; +.globl aesni_ecb_encrypt +.type aesni_ecb_encrypt,\@function,5 +.align 16 +aesni_ecb_encrypt: + cmp \$16,$len # check length + jb .Lecb_abort +___ +$code.=<<___ if ($win64); + lea -0x28(%rsp),%rsp + movaps %xmm6,(%rsp) + movaps %xmm7,16(%rsp) +___ +$code.=<<___; + mov 240($key),$rounds # pull $rounds + and \$-16,$len + mov $key,$key_ # backup $key + test %r8d,%r8d + mov $rounds,$rnds_ # backup $rounds + jz .Lecb_decrypt +#--------------------------- ECB ENCRYPT ------------------------------# + sub \$0x60,$len + jc .Lecb_enc_tail + jmp .Lecb_enc_loop6 +.align 16 +.Lecb_enc_loop6: + movups ($inp),$inout0 + movups 0x10($inp),$inout1 + movups 0x20($inp),$inout2 + movups 0x30($inp),$inout3 + movups 0x40($inp),$inout4 + movups 0x50($inp),$inout5 + call _aesni_encrypt6 + movups $inout0,($out) + sub \$0x60,$len + movups $inout1,0x10($out) + lea 0x60($inp),$inp + movups $inout2,0x20($out) + mov $rnds_,$rounds # restore $rounds + movups $inout3,0x30($out) + mov $key_,$key # restore $key + movups $inout4,0x40($out) + movups $inout5,0x50($out) + lea 0x60($out),$out + jnc .Lecb_enc_loop6 + +.Lecb_enc_tail: + add \$0x60,$len + jz .Lecb_ret + + cmp \$0x10,$len + movups ($inp),$inout0 + je .Lecb_enc_one + cmp \$0x20,$len + movups 0x10($inp),$inout1 + je .Lecb_enc_two + cmp \$0x30,$len + movups 0x20($inp),$inout2 + je .Lecb_enc_three + cmp \$0x40,$len + movups 0x30($inp),$inout3 + je .Lecb_enc_four + movups 0x40($inp),$inout4 + call _aesni_encrypt6 + movups $inout0,($out) + movups $inout1,0x10($out) + movups $inout2,0x20($out) + movups $inout3,0x30($out) + movups $inout4,0x40($out) + jmp .Lecb_ret +.align 16 +.Lecb_enc_one: +___ + &aesni_encrypt1($inout0,$rndkey0,$rndkey1,$key,$rounds); +$code.=<<___; + movups $inout0,($out) + jmp .Lecb_ret +.align 16 +.Lecb_enc_two: + call _aesni_encrypt6 + movups $inout0,($out) + movups $inout1,0x10($out) + jmp .Lecb_ret +.align 16 +.Lecb_enc_three: + call _aesni_encrypt6 + movups $inout0,($out) + movups $inout1,0x10($out) + movups $inout2,0x20($out) + jmp .Lecb_ret +.align 16 
+.Lecb_enc_four: + call _aesni_encrypt6 + movups $inout0,($out) + movups $inout1,0x10($out) + movups $inout2,0x20($out) + movups $inout3,0x30($out) + jmp .Lecb_ret + #--------------------------- ECB DECRYPT ------------------------------# +.align 16 +.Lecb_decrypt: + sub \$0x60,$len + jc .Lecb_dec_tail + jmp .Lecb_dec_loop6 +.align 16 +.Lecb_dec_loop6: + movups ($inp),$inout0 + movups 0x10($inp),$inout1 + movups 0x20($inp),$inout2 + movups 0x30($inp),$inout3 + movups 0x40($inp),$inout4 + movups 0x50($inp),$inout5 + call _aesni_decrypt6 + movups $inout0,($out) + sub \$0x60,$len + movups $inout1,0x10($out) + lea 0x60($inp),$inp + movups $inout2,0x20($out) + mov $rnds_,$rounds # restore $rounds + movups $inout3,0x30($out) + mov $key_,$key # restore $key + movups $inout4,0x40($out) + movups $inout5,0x50($out) + lea 0x60($out),$out + jnc .Lecb_dec_loop6 + +.Lecb_dec_tail: + add \$0x60,$len + jz .Lecb_ret + + cmp \$0x10,$len + movups ($inp),$inout0 + je .Lecb_dec_one + cmp \$0x20,$len + movups 0x10($inp),$inout1 + je .Lecb_dec_two + cmp \$0x30,$len + movups 0x20($inp),$inout2 + je .Lecb_dec_three + cmp \$0x40,$len + movups 0x30($inp),$inout3 + je .Lecb_dec_four + movups 0x40($inp),$inout4 + call _aesni_decrypt6 + movups $inout0,($out) + movups $inout1,0x10($out) + movups $inout2,0x20($out) + movups $inout3,0x30($out) + movups $inout4,0x40($out) + jmp .Lecb_ret +.align 16 +.Lecb_dec_one: +___ + &aesni_decrypt1($inout0,$rndkey0,$rndkey1,$key,$rounds); +$code.=<<___; + movups $inout0,($out) + jmp .Lecb_ret +.align 16 +.Lecb_dec_two: + call _aesni_decrypt6 + movups $inout0,($out) + movups $inout1,0x10($out) + jmp .Lecb_ret +.align 16 +.Lecb_dec_three: + call _aesni_decrypt6 + movups $inout0,($out) + movups $inout1,0x10($out) + movups $inout2,0x20($out) + jmp .Lecb_ret +.align 16 +.Lecb_dec_four: + call _aesni_decrypt6 + movups $inout0,($out) + movups $inout1,0x10($out) + movups $inout2,0x20($out) + movups $inout3,0x30($out) + +.Lecb_ret: +___ +$code.=<<___ if ($win64); + movaps (%rsp),%xmm6 + movaps 0x10(%rsp),%xmm7 + lea 0x28(%rsp),%rsp +___ +$code.=<<___; +.Lecb_abort: + ret +.size aesni_ecb_encrypt,.-aesni_ecb_encrypt +___ +} + +# void $PREFIX_cbc_encrypt (const void *inp, void *out, +# size_t length, const AES_KEY *key, +# unsigned char *ivp,const int enc); +$reserved = $win64?0x90:-0x18; # used in decrypt +$code.=<<___; +.globl ${PREFIX}_cbc_encrypt +.type ${PREFIX}_cbc_encrypt,\@function,6 +.align 16 +${PREFIX}_cbc_encrypt: + test $len,$len # check length + jz .Lcbc_ret + mov 240($key),$rounds # pull $rounds + mov $key,$key_ # backup $key + test %r9d,%r9d + mov $rounds,$rnds_ # backup $rounds + jz .Lcbc_decrypt +#--------------------------- CBC ENCRYPT ------------------------------# + movups ($ivp),%xmm0 # load iv as initial state + cmp \$16,$len + jb .Lcbc_enc_tail + sub \$16,$len + jmp .Lcbc_enc_loop +.align 16 +.Lcbc_enc_loop: + movups ($inp),%xmm2 # load input + lea 16($inp),$inp + pxor %xmm2,%xmm0 +___ + &aesni_encrypt1("%xmm0","%xmm1","%xmm2",$key,$rounds); +$code.=<<___; + movups %xmm0,($out) # store output + sub \$16,$len + lea 16($out),$out + mov $rnds_,$rounds # restore $rounds + mov $key_,$key # restore $key + jnc .Lcbc_enc_loop + add \$16,$len + jnz .Lcbc_enc_tail + movups %xmm0,($ivp) + jmp .Lcbc_ret + +.Lcbc_enc_tail: + mov $len,%rcx # zaps $key + xchg $inp,$out # $inp is %rsi and $out is %rdi now + .long 0x9066A4F3 # rep movsb + mov \$16,%ecx # zero tail + sub $len,%rcx + xor %eax,%eax + .long 0x9066AAF3 # rep stosb + lea -16(%rdi),%rdi # rewind $out by 1 block + mov 
$rnds_,$rounds # restore $rounds + mov %rdi,%rsi # $inp and $out are the same + mov $key_,$key # restore $key + xor $len,$len # len=16 + jmp .Lcbc_enc_loop # one more spin + #--------------------------- CBC DECRYPT ------------------------------# +.align 16 +.Lcbc_decrypt: +___ +$code.=<<___ if ($win64); + lea -0xa8(%rsp),%rsp + movaps %xmm6,(%rsp) + movaps %xmm7,0x10(%rsp) + movaps %xmm8,0x20(%rsp) + movaps %xmm9,0x30(%rsp) + movaps %xmm10,0x40(%rsp) + movaps %xmm11,0x50(%rsp) + movaps %xmm12,0x60(%rsp) + movaps %xmm13,0x70(%rsp) + movaps %xmm14,0x80(%rsp) +___ +$code.=<<___; + movups ($ivp),$iv + sub \$0x60,$len + jc .Lcbc_dec_tail + jmp .Lcbc_dec_loop6 +.align 16 +.Lcbc_dec_loop6: + movups ($inp),$inout0 + movups 0x10($inp),$inout1 + movups 0x20($inp),$inout2 + movups 0x30($inp),$inout3 + movaps $inout0,$in0 + movups 0x40($inp),$inout4 + movaps $inout1,$in1 + movups 0x50($inp),$inout5 + movaps $inout2,$in2 + movaps $inout3,$in3 + movaps $inout4,$in4 + movaps $inout5,$in5 + call _aesni_decrypt6 + pxor $iv,$inout0 + pxor $in0,$inout1 + movups $inout0,($out) + sub \$0x60,$len + pxor $in1,$inout2 + movups $inout1,0x10($out) + lea 0x60($inp),$inp + pxor $in2,$inout3 + movups $inout2,0x20($out) + mov $rnds_,$rounds # restore $rounds + pxor $in3,$inout4 + movups $inout3,0x30($out) + mov $key_,$key # restore $key + pxor $in4,$inout5 + movups $inout4,0x40($out) + movaps $in5,$iv + movups $inout5,0x50($out) + lea 0x60($out),$out + jnc .Lcbc_dec_loop6 + +.Lcbc_dec_tail: + add \$0x60,$len + movups $iv,($ivp) + jz .Lcbc_dec_ret + + movups ($inp),$inout0 + cmp \$0x10,$len + movaps $inout0,$in0 + jbe .Lcbc_dec_one + movups 0x10($inp),$inout1 + cmp \$0x20,$len + movaps $inout1,$in1 + jbe .Lcbc_dec_two + movups 0x20($inp),$inout2 + cmp \$0x30,$len + movaps $inout2,$in2 + jbe .Lcbc_dec_three + movups 0x30($inp),$inout3 + cmp \$0x40,$len + movaps $inout3,$in3 + jbe .Lcbc_dec_four + movups 0x40($inp),$inout4 + cmp \$0x50,$len + movaps $inout4,$in4 + jbe .Lcbc_dec_five + movups 0x50($inp),$inout5 + movaps $inout5,$in5 + call _aesni_decrypt6 + pxor $iv,$inout0 + pxor $in0,$inout1 + movups $inout0,($out) + pxor $in1,$inout2 + movups $inout1,0x10($out) + pxor $in2,$inout3 + movups $inout2,0x20($out) + pxor $in3,$inout4 + movups $inout3,0x30($out) + pxor $in4,$inout5 + movups $inout4,0x40($out) + movaps $in5,$iv + movaps $inout5,$inout0 + lea 0x50($out),$out + jmp .Lcbc_dec_tail_collected +.align 16 +.Lcbc_dec_one: +___ + &aesni_decrypt1($inout0,$rndkey0,$rndkey1,$key,$rounds); +$code.=<<___; + pxor $iv,$inout0 + movaps $in0,$iv + jmp .Lcbc_dec_tail_collected +.align 16 +.Lcbc_dec_two: + call _aesni_decrypt6 + pxor $iv,$inout0 + pxor $in0,$inout1 + movups $inout0,($out) + movaps $in1,$iv + movaps $inout1,$inout0 + lea 0x10($out),$out + jmp .Lcbc_dec_tail_collected +.align 16 +.Lcbc_dec_three: + call _aesni_decrypt6 + pxor $iv,$inout0 + pxor $in0,$inout1 + movups $inout0,($out) + pxor $in1,$inout2 + movups $inout1,0x10($out) + movaps $in2,$iv + movaps $inout2,$inout0 + lea 0x20($out),$out + jmp .Lcbc_dec_tail_collected +.align 16 +.Lcbc_dec_four: + call _aesni_decrypt6 + pxor $iv,$inout0 + pxor $in0,$inout1 + movups $inout0,($out) + pxor $in1,$inout2 + movups $inout1,0x10($out) + pxor $in2,$inout3 + movups $inout2,0x20($out) + movaps $in3,$iv + movaps $inout3,$inout0 + lea 0x30($out),$out + jmp .Lcbc_dec_tail_collected +.align 16 +.Lcbc_dec_five: + call _aesni_decrypt6 + pxor $iv,$inout0 + pxor $in0,$inout1 + movups $inout0,($out) + pxor $in1,$inout2 + movups $inout1,0x10($out) + pxor $in2,$inout3 + movups 
$inout2,0x20($out) + pxor $in3,$inout4 + movups $inout3,0x30($out) + movaps $in4,$iv + movaps $inout4,$inout0 + lea 0x40($out),$out + jmp .Lcbc_dec_tail_collected +.align 16 +.Lcbc_dec_tail_collected: + and \$15,$len + movups $iv,($ivp) + jnz .Lcbc_dec_tail_partial + movups $inout0,($out) + jmp .Lcbc_dec_ret +.Lcbc_dec_tail_partial: + movaps $inout0,$reserved(%rsp) + mov $out,%rdi + mov $len,%rcx + lea $reserved(%rsp),%rsi + .long 0x9066A4F3 # rep movsb + +.Lcbc_dec_ret: +___ +$code.=<<___ if ($win64); + movaps (%rsp),%xmm6 + movaps 0x10(%rsp),%xmm7 + movaps 0x20(%rsp),%xmm8 + movaps 0x30(%rsp),%xmm9 + movaps 0x40(%rsp),%xmm10 + movaps 0x50(%rsp),%xmm11 + movaps 0x60(%rsp),%xmm12 + movaps 0x70(%rsp),%xmm13 + movaps 0x80(%rsp),%xmm14 + lea 0xa8(%rsp),%rsp +___ +$code.=<<___; +.Lcbc_ret: + ret +.size ${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt +___ + +{ +# this is natural argument order for $PREFIX_set_[en|de]crypt_key +my $inp="%rdi"; +my $bits="%esi"; +my $key="%rdx"; + +# int $PREFIX_set_encrypt_key (const unsigned char *userKey, int bits, +# AES_KEY *key) +$code.=<<___; +.globl ${PREFIX}_set_encrypt_key +.type ${PREFIX}_set_encrypt_key,\@function,3 +.align 16 +${PREFIX}_set_encrypt_key: + call _aesni_set_encrypt_key + ret +.size ${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key +___ +# int $PREFIX_set_decrypt_key(const unsigned char *userKey, const int bits, +# AES_KEY *key) +$code.=<<___; +.globl ${PREFIX}_set_decrypt_key +.type ${PREFIX}_set_decrypt_key,\@function,3 +.align 16 +${PREFIX}_set_decrypt_key: + call _aesni_set_encrypt_key + shl \$4,%esi # actually rounds after _aesni_set_encrypt_key + test %eax,%eax + jnz .Ldec_key_ret + lea (%rdx,%rsi),%rsi# points at the end of key schedule + + $movkey (%rdx),%xmm0 # just swap + $movkey (%rsi),%xmm1 + $movkey %xmm0,(%rsi) + $movkey %xmm1,(%rdx) + lea 16(%rdx),%rdx + lea -16(%rsi),%rsi + jmp .Ldec_key_inverse +.align 16 +.Ldec_key_inverse: + $movkey (%rdx),%xmm0 # swap and inverse + $movkey (%rsi),%xmm1 + aesimc %xmm0,%xmm0 + aesimc %xmm1,%xmm1 + lea 16(%rdx),%rdx + lea -16(%rsi),%rsi + cmp %rdx,%rsi + $movkey %xmm0,16(%rsi) + $movkey %xmm1,-16(%rdx) + ja .Ldec_key_inverse + + $movkey (%rdx),%xmm0 # inverse middle + aesimc %xmm0,%xmm0 + $movkey %xmm0,(%rsi) +.Ldec_key_ret: + ret +.size ${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key +___ + +# This is based on submission by +# +# Huang Ying +# Vinodh Gopal +# Kahraman Akdemir +# +# Agressively optimized in respect to aeskeygenassist's critical path +# and is contained in %xmm0-5 to meet Win64 ABI requirement. 
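(Side note, not part of the patch.) The key-expansion trick being referenced: aeskeygenassist produces SubWord/RotWord plus the round constant in its high word, and the rest of the next round key is obtained by XOR-folding the previous key into itself. A hedged C sketch of the AES-128 case, with hypothetical names and the same __m128i round-key layout as before (the rcon argument must be a compile-time constant, hence the macro):

    #include <immintrin.h>   /* build with -maes */

    /* Fold one aeskeygenassist result into the previous AES-128 round key. */
    static __m128i aes128_key_step(__m128i prev, __m128i assist)
    {
        assist = _mm_shuffle_epi32(assist, 0xff);   /* broadcast the high word */
        prev   = _mm_xor_si128(prev, _mm_slli_si128(prev, 4));
        prev   = _mm_xor_si128(prev, _mm_slli_si128(prev, 4));
        prev   = _mm_xor_si128(prev, _mm_slli_si128(prev, 4));
        return _mm_xor_si128(prev, assist);
    }

    /* rcon must be a literal, so wrap the intrinsic in a macro. */
    #define AES128_EXPAND(out, prev, rcon) \
            ((out) = aes128_key_step((prev), _mm_aeskeygenassist_si128((prev), (rcon))))

    /* Expand a 128-bit user key into 11 round keys (hypothetical layout). */
    static void aes128_expand_key(const unsigned char *userKey, __m128i rk[11])
    {
        rk[0] = _mm_loadu_si128((const __m128i *)userKey);
        AES128_EXPAND(rk[1], rk[0], 0x01);  AES128_EXPAND(rk[2],  rk[1], 0x02);
        AES128_EXPAND(rk[3], rk[2], 0x04);  AES128_EXPAND(rk[4],  rk[3], 0x08);
        AES128_EXPAND(rk[5], rk[4], 0x10);  AES128_EXPAND(rk[6],  rk[5], 0x20);
        AES128_EXPAND(rk[7], rk[6], 0x40);  AES128_EXPAND(rk[8],  rk[7], 0x80);
        AES128_EXPAND(rk[9], rk[8], 0x1b);  AES128_EXPAND(rk[10], rk[9], 0x36);
    }

The 192- and 256-bit paths in the patch follow the same idea but carry extra state between steps, which is what the key_192a/key_192b and key_256a/key_256b labels handle.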
+$code.=<<___; +.type _aesni_set_encrypt_key,\@abi-omnipotent +.align 16 +_aesni_set_encrypt_key: + test %rdi,%rdi + jz .Lbad_pointer + test %rdx,%rdx + jz .Lbad_pointer + + movups (%rdi),%xmm0 # pull first 128 bits of *userKey + pxor %xmm4,%xmm4 # low dword of xmm4 is assumed 0 + lea 16(%rdx),%rcx + cmp \$256,%esi + je .L14rounds + cmp \$192,%esi + je .L12rounds + cmp \$128,%esi + jne .Lbad_keybits + +.L10rounds: + mov \$10,%esi # 10 rounds for 128-bit key + $movkey %xmm0,(%rdx) # round 0 + aeskeygenassist \$0x1,%xmm0,%xmm1 # round 1 + call .Lkey_expansion_128_cold + aeskeygenassist \$0x2,%xmm0,%xmm1 # round 2 + call .Lkey_expansion_128 + aeskeygenassist \$0x4,%xmm0,%xmm1 # round 3 + call .Lkey_expansion_128 + aeskeygenassist \$0x8,%xmm0,%xmm1 # round 4 + call .Lkey_expansion_128 + aeskeygenassist \$0x10,%xmm0,%xmm1 # round 5 + call .Lkey_expansion_128 + aeskeygenassist \$0x20,%xmm0,%xmm1 # round 6 + call .Lkey_expansion_128 + aeskeygenassist \$0x40,%xmm0,%xmm1 # round 7 + call .Lkey_expansion_128 + aeskeygenassist \$0x80,%xmm0,%xmm1 # round 8 + call .Lkey_expansion_128 + aeskeygenassist \$0x1b,%xmm0,%xmm1 # round 9 + call .Lkey_expansion_128 + aeskeygenassist \$0x36,%xmm0,%xmm1 # round 10 + call .Lkey_expansion_128 + $movkey %xmm0,(%rcx) + mov %esi,80(%rcx) # 240(%rdx) + xor %eax,%eax + ret + +.align 16 +.Lkey_expansion_128: + $movkey %xmm0,(%rcx) + lea 16(%rcx),%rcx +.Lkey_expansion_128_cold: + shufps \$0b00010000,%xmm0,%xmm4 + pxor %xmm4, %xmm0 + shufps \$0b10001100,%xmm0,%xmm4 + pxor %xmm4, %xmm0 + pshufd \$0b11111111,%xmm1,%xmm1 # critical path + pxor %xmm1,%xmm0 + ret + +.align 16 +.L12rounds: + movq 16(%rdi),%xmm2 # remaining 1/3 of *userKey + mov \$12,%esi # 12 rounds for 192-bit key + $movkey %xmm0,(%rdx) # round 0 + aeskeygenassist \$0x1,%xmm2,%xmm1 # round 1,2 + call .Lkey_expansion_192a_cold + aeskeygenassist \$0x2,%xmm2,%xmm1 # round 2,3 + call .Lkey_expansion_192b + aeskeygenassist \$0x4,%xmm2,%xmm1 # round 4,5 + call .Lkey_expansion_192a + aeskeygenassist \$0x8,%xmm2,%xmm1 # round 5,6 + call .Lkey_expansion_192b + aeskeygenassist \$0x10,%xmm2,%xmm1 # round 7,8 + call .Lkey_expansion_192a + aeskeygenassist \$0x20,%xmm2,%xmm1 # round 8,9 + call .Lkey_expansion_192b + aeskeygenassist \$0x40,%xmm2,%xmm1 # round 10,11 + call .Lkey_expansion_192a + aeskeygenassist \$0x80,%xmm2,%xmm1 # round 11,12 + call .Lkey_expansion_192b + $movkey %xmm0,(%rcx) + mov %esi,48(%rcx) # 240(%rdx) + xor %rax, %rax + ret + +.align 16 +.Lkey_expansion_192a: + $movkey %xmm0,(%rcx) + lea 16(%rcx),%rcx +.Lkey_expansion_192a_cold: + movaps %xmm2, %xmm5 +.Lkey_expansion_192b_warm: + shufps \$0b00010000,%xmm0,%xmm4 + movaps %xmm2,%xmm3 + pxor %xmm4,%xmm0 + shufps \$0b10001100,%xmm0,%xmm4 + pslldq \$4,%xmm3 + pxor %xmm4,%xmm0 + pshufd \$0b01010101,%xmm1,%xmm1 # critical path + pxor %xmm3,%xmm2 + pxor %xmm1,%xmm0 + pshufd \$0b11111111,%xmm0,%xmm3 + pxor %xmm3,%xmm2 + ret + +.align 16 +.Lkey_expansion_192b: + movaps %xmm0,%xmm3 + shufps \$0b01000100,%xmm0,%xmm5 + $movkey %xmm5,(%rcx) + shufps \$0b01001110,%xmm2,%xmm3 + $movkey %xmm3,16(%rcx) + lea 32(%rcx),%rcx + jmp .Lkey_expansion_192b_warm + +.align 16 +.L14rounds: + movups 16(%rdi),%xmm2 # remaining half of *userKey + mov \$14,%esi # 14 rounds for 256-bit key + lea 16(%rcx),%rcx + $movkey %xmm0,(%rdx) # round 0 + $movkey %xmm2,16(%rdx) # round 1 + aeskeygenassist \$0x1,%xmm2,%xmm1 # round 2 + call .Lkey_expansion_256a_cold + aeskeygenassist \$0x1,%xmm0,%xmm1 # round 3 + call .Lkey_expansion_256b + aeskeygenassist \$0x2,%xmm2,%xmm1 # round 4 + call .Lkey_expansion_256a +
aeskeygenassist \$0x2,%xmm0,%xmm1 # round 5 + call .Lkey_expansion_256b + aeskeygenassist \$0x4,%xmm2,%xmm1 # round 6 + call .Lkey_expansion_256a + aeskeygenassist \$0x4,%xmm0,%xmm1 # round 7 + call .Lkey_expansion_256b + aeskeygenassist \$0x8,%xmm2,%xmm1 # round 8 + call .Lkey_expansion_256a + aeskeygenassist \$0x8,%xmm0,%xmm1 # round 9 + call .Lkey_expansion_256b + aeskeygenassist \$0x10,%xmm2,%xmm1 # round 10 + call .Lkey_expansion_256a + aeskeygenassist \$0x10,%xmm0,%xmm1 # round 11 + call .Lkey_expansion_256b + aeskeygenassist \$0x20,%xmm2,%xmm1 # round 12 + call .Lkey_expansion_256a + aeskeygenassist \$0x20,%xmm0,%xmm1 # round 13 + call .Lkey_expansion_256b + aeskeygenassist \$0x40,%xmm2,%xmm1 # round 14 + call .Lkey_expansion_256a + $movkey %xmm0,(%rcx) + mov %esi,16(%rcx) # 240(%rdx) + xor %rax,%rax + ret + +.align 16 +.Lkey_expansion_256a: + $movkey %xmm2,(%rcx) + lea 16(%rcx),%rcx +.Lkey_expansion_256a_cold: + shufps \$0b00010000,%xmm0,%xmm4 + pxor %xmm4,%xmm0 + shufps \$0b10001100,%xmm0,%xmm4 + pxor %xmm4,%xmm0 + pshufd \$0b11111111,%xmm1,%xmm1 # critical path + pxor %xmm1,%xmm0 + ret + +.align 16 +.Lkey_expansion_256b: + $movkey %xmm0,(%rcx) + lea 16(%rcx),%rcx + + shufps \$0b00010000,%xmm2,%xmm4 + pxor %xmm4,%xmm2 + shufps \$0b10001100,%xmm2,%xmm4 + pxor %xmm4,%xmm2 + pshufd \$0b10101010,%xmm1,%xmm1 # critical path + pxor %xmm1,%xmm2 + ret + +.align 16 +.Lbad_pointer: + mov \$-1, %rax + ret +.Lbad_keybits: + mov \$-2, %rax + ret +.size _aesni_set_encrypt_key,.-_aesni_set_encrypt_key +___ +} + +$code.=<<___; +.asciz "AES for Intel AES-NI, CRYPTOGAMS by " +.align 64 +___ + +sub rex { + local *opcode=shift; + my ($dst,$src)=@_; + + if ($dst>=8 || $src>=8) { + $rex=0x40; + $rex|=0x04 if($dst>=8); + $rex|=0x01 if($src>=8); + push @opcode,$rex; + } +} + +sub aesni { + my $line=shift; + my @opcode=(0x66); + + if ($line=~/(aeskeygenassist)\s+\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) { + rex(\@opcode,$4,$3); + push @opcode,0x0f,0x3a,0xdf; + push @opcode,0xc0|($3&7)|(($4&7)<<3); # ModR/M + my $c=$2; + push @opcode,$c=~/^0/?oct($c):$c; + return ".byte\t".join(',',@opcode); + } + elsif ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) { + my %opcodelet = ( + "aesimc" => 0xdb, + "aesenc" => 0xdc, "aesenclast" => 0xdd, + "aesdec" => 0xde, "aesdeclast" => 0xdf + ); + return undef if (!defined($opcodelet{$1})); + rex(\@opcode,$3,$2); + push @opcode,0x0f,0x38,$opcodelet{$1}; + push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M + return ".byte\t".join(',',@opcode); + } + return $line; +} + +$code =~ s/\`([^\`]*)\`/eval($1)/gem; +$code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem; + +print $code; + +close STDOUT; diff --git a/crypto/engine/Makefile b/crypto/engine/Makefile index 9c214824eb..9ec46a87c3 100644 --- a/crypto/engine/Makefile +++ b/crypto/engine/Makefile @@ -21,12 +21,14 @@ LIBSRC= eng_err.c eng_lib.c eng_list.c eng_init.c eng_ctrl.c \ eng_table.c eng_pkey.c eng_fat.c eng_all.c \ tb_rsa.c tb_dsa.c tb_ecdsa.c tb_dh.c tb_ecdh.c tb_rand.c tb_store.c \ tb_cipher.c tb_digest.c tb_pkmeth.c tb_asnmth.c \ - eng_openssl.c eng_cnf.c eng_dyn.c eng_cryptodev.c + eng_openssl.c eng_cnf.c eng_dyn.c eng_cryptodev.c \ + eng_aesni.c LIBOBJ= eng_err.o eng_lib.o eng_list.o eng_init.o eng_ctrl.o \ eng_table.o eng_pkey.o eng_fat.o eng_all.o \ tb_rsa.o tb_dsa.o tb_ecdsa.o tb_dh.o tb_ecdh.o tb_rand.o tb_store.o \ tb_cipher.o tb_digest.o tb_pkmeth.o tb_asnmth.o \ - eng_openssl.o eng_cnf.o eng_dyn.o eng_cryptodev.o + eng_openssl.o eng_cnf.o eng_dyn.o eng_cryptodev.o \ + eng_aesni.o SRC= $(LIBSRC) diff 
--git a/crypto/engine/eng_aesni.c b/crypto/engine/eng_aesni.c new file mode 100644 index 0000000000..6707418614 --- /dev/null +++ b/crypto/engine/eng_aesni.c @@ -0,0 +1,402 @@ +/* + * Support for Intel AES-NI instruction set + * Author: Huang Ying + * + * Intel AES-NI is a new set of Single Instruction Multiple Data + * (SIMD) instructions that are going to be introduced in the next + * generation of Intel processors, as of 2009. These instructions + * enable fast and secure data encryption and decryption, using the + * Advanced Encryption Standard (AES), defined by FIPS Publication + * number 197. The architecture introduces six instructions that + * offer full hardware support for AES. Four of them support high + * performance data encryption and decryption, and the other two + * instructions support the AES key expansion procedure. + * + * The white paper can be downloaded from: + * http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf + * + * This file is based on engines/e_padlock.c + */ + +/* ==================================================================== + * Copyright (c) 1999-2001 The OpenSSL Project. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * 3. All advertising materials mentioning features or use of this + * software must display the following acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)" + * + * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to + * endorse or promote products derived from this software without + * prior written permission. For written permission, please contact + * licensing@OpenSSL.org. + * + * 5. Products derived from this software may not be called "OpenSSL" + * nor may "OpenSSL" appear in their names without prior written + * permission of the OpenSSL Project. + * + * 6. Redistributions of any form whatsoever must retain the following + * acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)" + * + * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY + * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ==================================================================== + * + * This product includes cryptographic software written by Eric Young + * (eay@cryptsoft.com). This product includes software written by Tim + * Hudson (tjh@cryptsoft.com). + * + */ + + +#include + +#if !defined(OPENSSL_NO_HW) && !defined(OPENSSL_NO_HW_AES_NI) && !defined(OPENSSL_NO_AES) + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* AES-NI is available *ONLY* on some x86 CPUs. Not only that it + doesn't exist elsewhere, but it even can't be compiled on other + platforms! */ +#undef COMPILE_HW_AESNI +#if (defined(__x86_64) || defined(__x86_64__) || \ + defined(_M_AMD64) || defined(_M_X64) || \ + defined(OPENSSL_IA32_SSE2)) && !defined(OPENSSL_NO_ASM) +#define COMPILE_HW_AESNI +static ENGINE *ENGINE_aesni (void); +#endif + +void ENGINE_load_aesni (void) +{ +/* On non-x86 CPUs it just returns. */ +#ifdef COMPILE_HW_AESNI + ENGINE *toadd = ENGINE_aesni(); + if (!toadd) + return; + ENGINE_add (toadd); + ENGINE_free (toadd); + ERR_clear_error (); +#endif +} + +#ifdef COMPILE_HW_AESNI +int aesni_set_encrypt_key(const unsigned char *userKey, const int bits, + AES_KEY *key); +int aesni_set_decrypt_key(const unsigned char *userKey, const int bits, + AES_KEY *key); + +void aesni_encrypt(const unsigned char *in, unsigned char *out, + const AES_KEY *key); +void aesni_decrypt(const unsigned char *in, unsigned char *out, + const AES_KEY *key); + +void aesni_ecb_encrypt(const unsigned char *in, + unsigned char *out, + const unsigned long length, + const AES_KEY *key, + const int enc); +void aesni_cbc_encrypt(const unsigned char *in, + unsigned char *out, + const unsigned long length, + const AES_KEY *key, + unsigned char *ivec, const int enc); + +/* Function for ENGINE detection and control */ +static int aesni_init(ENGINE *e); + +/* Cipher Stuff */ +static int aesni_ciphers(ENGINE *e, const EVP_CIPHER **cipher, + const int **nids, int nid); + +#define AESNI_MIN_ALIGN 16 +#define AESNI_ALIGN(x) \ + ((void *)(((unsigned long)(x)+AESNI_MIN_ALIGN-1)&~(AESNI_MIN_ALIGN-1))) + +/* Engine names */ +static const char *aesni_id = "aesni"; +static const char *aesni_name = "Intel AES-NI engine"; + +/* ===== Engine "management" functions ===== */ + +/* Prepare the ENGINE structure for registration */ +static int +aesni_bind_helper(ENGINE *e) +{ + if (!(OPENSSL_ia32cap_P[1] & (1UL << (57-32)))) + return 0; + + /* Register everything or return with an error */ + if (!ENGINE_set_id(e, aesni_id) || + !ENGINE_set_name(e, aesni_name) || + + !ENGINE_set_init_function(e, aesni_init) || + !ENGINE_set_ciphers (e, aesni_ciphers)) + return 0; + + /* Everything looks good */ + return 1; +} + +/* Constructor */ +static ENGINE * +ENGINE_aesni(void) +{ + ENGINE *eng = ENGINE_new(); + + if (!eng) { + return NULL; + } + + if (!aesni_bind_helper(eng)) { + ENGINE_free(eng); + return NULL; + } + + return eng; +} + +/* Check availability of the engine */ +static int +aesni_init(ENGINE *e) +{ + return 1; +} + +#if defined(NID_aes_128_cfb128) && ! defined (NID_aes_128_cfb) +#define NID_aes_128_cfb NID_aes_128_cfb128 +#endif + +#if defined(NID_aes_128_ofb128) && ! defined (NID_aes_128_ofb) +#define NID_aes_128_ofb NID_aes_128_ofb128 +#endif + +#if defined(NID_aes_192_cfb128) && ! defined (NID_aes_192_cfb) +#define NID_aes_192_cfb NID_aes_192_cfb128 +#endif + +#if defined(NID_aes_192_ofb128) && ! 
defined (NID_aes_192_ofb) +#define NID_aes_192_ofb NID_aes_192_ofb128 +#endif + +#if defined(NID_aes_256_cfb128) && ! defined (NID_aes_256_cfb) +#define NID_aes_256_cfb NID_aes_256_cfb128 +#endif + +#if defined(NID_aes_256_ofb128) && ! defined (NID_aes_256_ofb) +#define NID_aes_256_ofb NID_aes_256_ofb128 +#endif + +/* List of supported ciphers. */ +static int aesni_cipher_nids[] = { + NID_aes_128_ecb, + NID_aes_128_cbc, + NID_aes_128_cfb, + NID_aes_128_ofb, + + NID_aes_192_ecb, + NID_aes_192_cbc, + NID_aes_192_cfb, + NID_aes_192_ofb, + + NID_aes_256_ecb, + NID_aes_256_cbc, + NID_aes_256_cfb, + NID_aes_256_ofb, +}; +static int aesni_cipher_nids_num = + (sizeof(aesni_cipher_nids)/sizeof(aesni_cipher_nids[0])); + +typedef struct +{ + AES_KEY ks; + unsigned int _pad1[3]; +} AESNI_KEY; + +static int +aesni_init_key (EVP_CIPHER_CTX *ctx, const unsigned char *user_key, + const unsigned char *iv, int enc) +{ + int ret; + AES_KEY *key = AESNI_ALIGN(ctx->cipher_data); + + if ((ctx->cipher->flags & EVP_CIPH_MODE) == EVP_CIPH_CFB_MODE + || (ctx->cipher->flags & EVP_CIPH_MODE) == EVP_CIPH_OFB_MODE + || enc) + ret=aesni_set_encrypt_key(user_key, ctx->key_len * 8, key); + else + ret=aesni_set_decrypt_key(user_key, ctx->key_len * 8, key); + + if(ret < 0) { + EVPerr(EVP_F_AES_INIT_KEY,EVP_R_AES_KEY_SETUP_FAILED); + return 0; + } + + return 1; +} + +static int aesni_cipher_ecb(EVP_CIPHER_CTX *ctx, unsigned char *out, + const unsigned char *in, size_t inl) +{ AES_KEY *key = AESNI_ALIGN(ctx->cipher_data); + aesni_ecb_encrypt(in, out, inl, key, ctx->encrypt); + return 1; +} +static int aesni_cipher_cbc(EVP_CIPHER_CTX *ctx, unsigned char *out, + const unsigned char *in, size_t inl) +{ AES_KEY *key = AESNI_ALIGN(ctx->cipher_data); + aesni_cbc_encrypt(in, out, inl, key, + ctx->iv, ctx->encrypt); + return 1; +} +static int aesni_cipher_cfb(EVP_CIPHER_CTX *ctx, unsigned char *out, + const unsigned char *in, size_t inl) +{ AES_KEY *key = AESNI_ALIGN(ctx->cipher_data); + CRYPTO_cfb128_encrypt(in, out, inl, key, ctx->iv, + &ctx->num, ctx->encrypt, + aesni_encrypt); + return 1; +} +static int aesni_cipher_ofb(EVP_CIPHER_CTX *ctx, unsigned char *out, + const unsigned char *in, size_t inl) +{ AES_KEY *key = AESNI_ALIGN(ctx->cipher_data); + CRYPTO_ofb128_encrypt(in, out, inl, key, ctx->iv, + &ctx->num, aesni_encrypt); + return 1; +} + +#define AES_BLOCK_SIZE 16 + +#define EVP_CIPHER_block_size_ECB AES_BLOCK_SIZE +#define EVP_CIPHER_block_size_CBC AES_BLOCK_SIZE +#define EVP_CIPHER_block_size_OFB 1 +#define EVP_CIPHER_block_size_CFB 1 + +/* Declaring so many ciphers by hand would be a pain. 
+ Instead introduce a bit of preprocessor magic :-) */ +#define DECLARE_AES_EVP(ksize,lmode,umode) \ +static const EVP_CIPHER aesni_##ksize##_##lmode = { \ + NID_aes_##ksize##_##lmode, \ + EVP_CIPHER_block_size_##umode, \ + ksize / 8, \ + AES_BLOCK_SIZE, \ + 0 | EVP_CIPH_##umode##_MODE, \ + aesni_init_key, \ + aesni_cipher_##lmode, \ + NULL, \ + sizeof(AESNI_KEY), \ + EVP_CIPHER_set_asn1_iv, \ + EVP_CIPHER_get_asn1_iv, \ + NULL, \ + NULL \ +} + +DECLARE_AES_EVP(128,ecb,ECB); +DECLARE_AES_EVP(128,cbc,CBC); +DECLARE_AES_EVP(128,cfb,CFB); +DECLARE_AES_EVP(128,ofb,OFB); + +DECLARE_AES_EVP(192,ecb,ECB); +DECLARE_AES_EVP(192,cbc,CBC); +DECLARE_AES_EVP(192,cfb,CFB); +DECLARE_AES_EVP(192,ofb,OFB); + +DECLARE_AES_EVP(256,ecb,ECB); +DECLARE_AES_EVP(256,cbc,CBC); +DECLARE_AES_EVP(256,cfb,CFB); +DECLARE_AES_EVP(256,ofb,OFB); + +static int +aesni_ciphers (ENGINE *e, const EVP_CIPHER **cipher, + const int **nids, int nid) +{ + /* No specific cipher => return a list of supported nids ... */ + if (!cipher) { + *nids = aesni_cipher_nids; + return aesni_cipher_nids_num; + } + + /* ... or the requested "cipher" otherwise */ + switch (nid) { + case NID_aes_128_ecb: + *cipher = &aesni_128_ecb; + break; + case NID_aes_128_cbc: + *cipher = &aesni_128_cbc; + break; + case NID_aes_128_cfb: + *cipher = &aesni_128_cfb; + break; + case NID_aes_128_ofb: + *cipher = &aesni_128_ofb; + break; + + case NID_aes_192_ecb: + *cipher = &aesni_192_ecb; + break; + case NID_aes_192_cbc: + *cipher = &aesni_192_cbc; + break; + case NID_aes_192_cfb: + *cipher = &aesni_192_cfb; + break; + case NID_aes_192_ofb: + *cipher = &aesni_192_ofb; + break; + + case NID_aes_256_ecb: + *cipher = &aesni_256_ecb; + break; + case NID_aes_256_cbc: + *cipher = &aesni_256_cbc; + break; + case NID_aes_256_cfb: + *cipher = &aesni_256_cfb; + break; + case NID_aes_256_ofb: + *cipher = &aesni_256_ofb; + break; + + default: + /* Sorry, we don't support this NID */ + *cipher = NULL; + return 0; + } + + return 1; +} + +#endif /* COMPILE_HW_AESNI */ +#endif /* !defined(OPENSSL_NO_HW) && !defined(OPENSSL_NO_HW_AESNI) && !defined(OPENSSL_NO_AES) */ diff --git a/crypto/engine/eng_all.c b/crypto/engine/eng_all.c index a6f5499340..623485d3d2 100644 --- a/crypto/engine/eng_all.c +++ b/crypto/engine/eng_all.c @@ -70,6 +70,9 @@ void ENGINE_load_builtin_engines(void) #endif #if defined(__OpenBSD__) || defined(__FreeBSD__) ENGINE_load_cryptodev(); +#endif +#if !defined(OPENSSL_NO_HW) && !defined(OPENSSL_NO_HW_AESNI) + ENGINE_load_aesni(); #endif ENGINE_load_dynamic(); #ifndef OPENSSL_NO_STATIC_ENGINE diff --git a/crypto/engine/engine.h b/crypto/engine/engine.h index 10b5d6787a..9bc8a313a4 100644 --- a/crypto/engine/engine.h +++ b/crypto/engine/engine.h @@ -348,6 +348,7 @@ void ENGINE_load_gost(void); #endif #endif void ENGINE_load_cryptodev(void); +void ENGINE_load_aesni(void); void ENGINE_load_builtin_engines(void); /* Get and set global flags (ENGINE_TABLE_FLAG_***) for the implementation
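The engine itself only registers EVP implementations; to see how it is reached from application code, here is a minimal usage sketch. It is illustrative only and not part of the patch: the function name aesni_cbc_demo is invented, error reporting is trimmed, and it assumes an OpenSSL build of this vintage (public EVP_CIPHER_CTX, engine linked into libcrypto) running on a CPU that advertises AES-NI.

#include <openssl/engine.h>
#include <openssl/evp.h>

int aesni_cbc_demo(const unsigned char key[16], const unsigned char iv[16],
                   const unsigned char *in, int inlen, unsigned char *out)
{
        ENGINE *e;
        EVP_CIPHER_CTX ctx;
        int outl = 0, tmplen = 0;

        ENGINE_load_builtin_engines();  /* now also calls ENGINE_load_aesni()     */
        e = ENGINE_by_id("aesni");      /* NULL if the engine was not registered  */
        if (e == NULL)
                return -1;
        if (!ENGINE_init(e)) {          /* grab a functional reference            */
                ENGINE_free(e);
                return -1;
        }

        EVP_CIPHER_CTX_init(&ctx);
        if (!EVP_EncryptInit_ex(&ctx, EVP_aes_128_cbc(), e, key, iv) ||
            !EVP_EncryptUpdate(&ctx, out, &outl, in, inlen) ||
            !EVP_EncryptFinal_ex(&ctx, out + outl, &tmplen))
                outl = -1;
        else
                outl += tmplen;

        EVP_CIPHER_CTX_cleanup(&ctx);
        ENGINE_finish(e);
        ENGINE_free(e);
        return outl;
}

Alternatively, ENGINE_set_default_ciphers(e) makes the engine the default for all later cipher contexts, so the ENGINE pointer does not have to be passed to EVP_EncryptInit_ex() explicitly.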